From a9aa92da406b87d2fa2c163a09beea139588e559 Mon Sep 17 00:00:00 2001 From: Toshio Kuratomi Date: Fri, 20 Aug 2010 15:07:00 -0400 Subject: [PATCH] - Fix for lone surrogates, utf8 and certain encode error handlers. --- ...one-surrogate-and-utf8-error-handler.patch | 175 ++++++++++++++++++ python3.spec | 12 +- 2 files changed, 186 insertions(+), 1 deletion(-) create mode 100644 python3-r80382-lone-surrogate-and-utf8-error-handler.patch diff --git a/python3-r80382-lone-surrogate-and-utf8-error-handler.patch b/python3-r80382-lone-surrogate-and-utf8-error-handler.patch new file mode 100644 index 0000000..b4b59f2 --- /dev/null +++ b/python3-r80382-lone-surrogate-and-utf8-error-handler.patch @@ -0,0 +1,175 @@ +Index: Objects/unicodeobject.c +=================================================================== +--- Objects/unicodeobject.c (revision 80382) ++++ Objects/unicodeobject.c (revision 80383) +@@ -159,6 +159,12 @@ + const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, + Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); + ++static void raise_encode_exception(PyObject **exceptionObject, ++ const char *encoding, ++ const Py_UNICODE *unicode, Py_ssize_t size, ++ Py_ssize_t startpos, Py_ssize_t endpos, ++ const char *reason); ++ + /* Same for linebreaks */ + static unsigned char ascii_linebreak[] = { + 0, 0, 0, 0, 0, 0, 0, 0, +@@ -2461,61 +2467,88 @@ + /* Encode Latin-1 */ + *p++ = (char)(0xc0 | (ch >> 6)); + *p++ = (char)(0x80 | (ch & 0x3f)); +- } +- else { +- /* Encode UCS2 Unicode ordinals */ +- if (ch < 0x10000) { ++ } else if (0xD800 <= ch && ch <= 0xDFFF) { + #ifndef Py_UNICODE_WIDE +- /* Special case: check for high surrogate */ +- if (0xD800 <= ch && ch <= 0xDBFF && i != size) { +- Py_UCS4 ch2 = s[i]; +- /* Check for low surrogate and combine the two to +- form a UCS4 value */ +- if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { +- ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; +- i++; +- goto encodeUCS4; +- } +- /* Fall through: 
handles isolated high surrogates */ +- } ++ /* Special case: check for high and low surrogate */ ++ if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) { ++ Py_UCS4 ch2 = s[i]; ++ /* Combine the two surrogates to form a UCS4 value */ ++ ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; ++ i++; ++ ++ /* Encode UCS4 Unicode ordinals */ ++ *p++ = (char)(0xf0 | (ch >> 18)); ++ *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); ++ *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); ++ *p++ = (char)(0x80 | (ch & 0x3f)); ++ + #endif +- if (ch >= 0xd800 && ch <= 0xdfff) { +- Py_ssize_t newpos; +- PyObject *rep; +- char *prep; +- int k; +- rep = unicode_encode_call_errorhandler +- (errors, &errorHandler, "utf-8", "surrogates not allowed", +- s, size, &exc, i-1, i, &newpos); +- if (!rep) ++ } else { ++ Py_ssize_t newpos; ++ PyObject *rep; ++ Py_ssize_t repsize, k; ++ rep = unicode_encode_call_errorhandler ++ (errors, &errorHandler, "utf-8", "surrogates not allowed", ++ s, size, &exc, i-1, i, &newpos); ++ if (!rep) ++ goto error; ++ ++ if (PyBytes_Check(rep)) ++ repsize = PyBytes_GET_SIZE(rep); ++ else ++ repsize = PyUnicode_GET_SIZE(rep); ++ ++ if (repsize > 4) { ++ Py_ssize_t offset; ++ ++ if (result == NULL) ++ offset = p - stackbuf; ++ else ++ offset = p - PyBytes_AS_STRING(result); ++ ++ if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { ++ /* integer overflow */ ++ PyErr_NoMemory(); + goto error; +- /* Implementation limitations: only support error handler that return +- bytes, and only support up to four replacement bytes. 
*/ +- if (!PyBytes_Check(rep)) { +- PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes"); +- Py_DECREF(rep); +- goto error; + } +- if (PyBytes_Size(rep) > 4) { +- PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes"); +- Py_DECREF(rep); +- goto error; ++ nallocated += repsize - 4; ++ if (result != NULL) { ++ if (_PyBytes_Resize(&result, nallocated) < 0) ++ goto error; ++ } else { ++ result = PyBytes_FromStringAndSize(NULL, nallocated); ++ if (result == NULL) ++ goto error; ++ Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); + } +- prep = PyBytes_AsString(rep); +- for(k = PyBytes_Size(rep); k > 0; k--) ++ p = PyBytes_AS_STRING(result) + offset; ++ } ++ ++ if (PyBytes_Check(rep)) { ++ char *prep = PyBytes_AS_STRING(rep); ++ for(k = repsize; k > 0; k--) + *p++ = *prep++; +- Py_DECREF(rep); +- continue; +- ++ } else /* rep is unicode */ { ++ Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); ++ Py_UNICODE c; ++ ++ for(k=0; k<repsize; k++) { ++ c = prep[k]; ++ if (0x80 <= c) { ++ raise_encode_exception(&exc, "utf-8", s, size, ++ i-1, i, "surrogates not allowed"); ++ goto error; ++ } ++ *p++ = (char)prep[k]; ++ } + } +- *p++ = (char)(0xe0 | (ch >> 12)); +- *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); +- *p++ = (char)(0x80 | (ch & 0x3f)); +- continue; ++ Py_DECREF(rep); + } +-#ifndef Py_UNICODE_WIDE +- encodeUCS4: +-#endif ++ } else if (ch < 0x10000) { ++ *p++ = (char)(0xe0 | (ch >> 12)); ++ *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); ++ *p++ = (char)(0x80 | (ch & 0x3f)); ++ } else /* ch >= 0x10000 */ { + /* Encode UCS4 Unicode ordinals */ + *p++ = (char)(0xf0 | (ch >> 18)); + *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); +Index: Lib/test/test_codecs.py +=================================================================== +--- Lib/test/test_codecs.py (revision 80382) ++++ Lib/test/test_codecs.py (revision 80383) +@@ -571,6 +571,16 @@ + def test_lone_surrogates(self): + self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8") + self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8") ++ self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"), ++ b'[\\udc80]') ++ self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"), 
++ b'[&#56448;]') ++ self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"), ++ b'[\x80]') ++ self.assertEqual("[\uDC80]".encode("utf-8", "ignore"), ++ b'[]') ++ self.assertEqual("[\uDC80]".encode("utf-8", "replace"), ++ b'[?]') + + def test_surrogatepass_handler(self): + self.assertEquals("abc\ud800def".encode("utf-8", "surrogatepass"), + diff --git a/python3.spec b/python3.spec index 4adf0a2..9ec77d6 100644 --- a/python3.spec +++ b/python3.spec @@ -40,7 +40,7 @@ Summary: Version 3 of the Python programming language aka Python 3000 Name: python3 Version: %{pybasever}.2 -Release: 12%{?dist} +Release: 13%{?dist} License: Python Group: Development/Languages Source: http://python.org/ftp/python/%{version}/Python-%{version}.tar.bz2 @@ -224,6 +224,11 @@ Patch109: python-3.1.2-CVE-2008-5983.patch # Sent upstream as http://bugs.python.org/issue9054 Patch110: python-3.1.2-fix-expat-issue9054.patch +# Fix encoding to utf8 when lone surrogates are present and error handler is +# set to ignore, replace, or others that return a unicode str. +# http://bugs.python.org/issue8092 +Patch111: python3-r80382-lone-surrogate-and-utf8-error-handler.patch + BuildRoot: %{_tmppath}/%{name}-%{version}-root BuildRequires: readline-devel, openssl-devel, gmp-devel BuildRequires: ncurses-devel, gdbm-devel, zlib-devel, expat-devel @@ -384,6 +389,8 @@ rm -r Modules/zlib || exit 1 %patch110 -p0 -b .fix-expat-issue9054 +%patch111 -p0 -b .surrogate-utf8 + # Currently (2010-01-15), http://docs.python.org/library is for 2.6, and there # are many differences between 2.6 and the Python 3 library. # @@ -1064,6 +1071,9 @@ rm -fr %{buildroot} %changelog +* Fri Aug 20 2010 Toshio Kuratomi - 3.1.2-13 +- Fix for lone surrogates, utf8 and certain encode error handlers. + * Fri Jul 2 2010 David Malcolm - 3.1.2-12 - rebuild