- remove superfluous multibyte processing in str_append for UTF-8 encoding

(thanks Paolo Bonzini, #177246)
2006-08-03 13:36:23 +00:00 · 2006-08-03 13:36:23 +00:00 · e7b4da6742
parent b0233900b0
commit e7b4da6742
2 changed files with 118 additions and 4 deletions
--- a/sed-4.1.5-utf8performance.patch
+++ b/sed-4.1.5-utf8performance.patch
@@ -0,0 +1,113 @@
+* looking for bonzini@gnu.org--2004b/sed--stable--4.1--patch-69 to compare with
+* comparing to bonzini@gnu.org--2004b/sed--stable--4.1--patch-69
+M  sed/mbcs.c
+M  sed/sed.h
+M  sed/execute.c
+
+* modified files
+
+--- orig/sed/execute.c
+++ mod/sed/execute.c
+@@ -235,25 +235,26 @@ str_append(to, string, length)
+   to->length = new_length;
+ 
+ #ifdef HAVE_MBRTOWC
+-  if (mb_cur_max == 1)
+-    return;
+-
+-  while (length)
+-    {
+-      int n = MBRLEN (string, length, &to->mbstate);
+  if (mb_cur_max > 1 && !is_utf8)
+    while (length)
+      {
+        size_t n = MBRLEN (string, length, &to->mbstate);
+ 
+-      /* An invalid sequence is treated like a singlebyte character. */
+-      if (n == -1)
+-	{
+-	  memset (&to->mbstate, 0, sizeof (to->mbstate));
+-	  n = 1;
+-	}
+        /* An invalid sequence is treated like a singlebyte character. */
+        if (n == (size_t) -1)
+	  {
+	    memset (&to->mbstate, 0, sizeof (to->mbstate));
+	    n = 1;
+	  }
+ 
+-      if (n > 0)
+-	length -= n;
+-      else
+-	break;
+-    }
+        if (n > 0)
+	  {
+	    string += n;
+	    length -= n;
+	  }
+        else
+	  break;
+      }
+ #endif
+ }
+ 
+
+
+--- orig/sed/mbcs.c
+++ mod/sed/mbcs.c
+@@ -18,7 +18,12 @@
+ #include "sed.h"
+ #include <stdlib.h>
+ 
+#ifdef HAVE_LANGINFO_CODESET
+#include <langinfo.h>
+#endif
+
+ int mb_cur_max;
+bool is_utf8;
+ 
+ #ifdef HAVE_MBRTOWC
+ /* Add a byte to the multibyte character represented by the state
+@@ -47,6 +52,26 @@ int brlen (ch, cur_stat)
+ void
+ initialize_mbcs ()
+ {
+  /* For UTF-8, we know that the encoding is stateless.  */
+  const char *codeset_name;
+
+#ifdef HAVE_LANGINFO_CODESET
+  codeset_name = nl_langinfo (CODESET);
+#else
+  codeset_name = getenv ("LC_ALL");
+  if (codeset_name == NULL || codeset_name[0] == '\0')
+    codeset_name = getenv ("LC_CTYPE");
+  if (codeset_name == NULL || codeset_name[0] == '\0')
+    codeset_name = getenv ("LANG");
+  if (codeset_name == NULL)
+    codeset_name = "";
+  else if (strchr (codeset_name, '.') !=  NULL)
+    codeset_name = strchr (codeset_name, '.') + 1;
+#endif
+
+  is_utf8 = (strcasecmp (codeset_name, "UTF-8") == 0
+	     || strcasecmp (codeset_name, "UTF8") == 0);
+
+ #ifdef HAVE_MBRTOWC
+   mb_cur_max = MB_CUR_MAX;
+ #else
+
+
+--- orig/sed/sed.h
+++ mod/sed/sed.h
+@@ -233,6 +233,7 @@ extern bool use_extended_syntax_p;
+ 
+ /* Declarations for multibyte character sets.  */
+ extern int mb_cur_max;
+extern bool is_utf8;
+ 
+ #ifdef HAVE_MBRTOWC
+ #ifdef HAVE_BTOWC
+
+
+
--- a/sed.spec
+++ b/sed.spec
@@ -10,8 +10,8 @@ License: GPL
 Group: Applications/Text
 Source0: ftp://ftp.gnu.org/pub/gnu/sed/sed-%{version}.tar.gz
 Source1: http://sed.sourceforge.net/sedfaq.txt
-Patch0: sed-4.1.5-bz185374.patch
-Patch1: sed-4.1.5-str_append.patch
+Patch0: sed-4.1.5-utf8performance.patch
+Patch1: sed-4.1.5-bz185374.patch
 Prereq: /sbin/install-info
 Prefix: %{_prefix}
 Buildroot: %{_tmppath}/%{name}-root
@@ -67,8 +67,9 @@ rm -rf ${RPM_BUILD_ROOT}
 %{_mandir}/man*/*

 %changelog
-* Wed Aug  2 2006 Petr Machata <pmachata@redhat.com> - 4.1.5-4
- remove superfluous multibyte processing in str_append (#177246)
+* Thu Aug  3 2006 Petr Machata <pmachata@redhat.com> - 4.1.5-4
+- remove superfluous multibyte processing in str_append for UTF-8
+  encoding (thanks Paolo Bonzini, #177246)

 * Mon Jul 17 2006 Petr Machata <pmachata@redhat.com> - 4.1.5-3
 - use dist tag