# HG changeset patch # User Fredrik Lundh # Date 1257441411 -3600 # Node ID 1342ecf9d9c837ce381d2adf7f4b10d6ed9ed208 # Parent aaf2b609a42ca33569bd66503cde2db45a53b964 Use "replace" decoder behaviour for malformed UTF-8. Also, fixed support for narrow Python builds. diff -r aaf2b609a42ca33569bd66503cde2db45a53b964 -r 1342ecf9d9c837ce381d2adf7f4b10d6ed9ed208 cjson.c --- a/cjson.c Thu Nov 05 14:49:20 2009 +0100 +++ b/cjson.c Thu Nov 05 18:16:51 2009 +0100 @@ -3,7 +3,7 @@ * Author: Dan Pascu * * getdefaultencoding support etc added in 2009 by Fredrik Lundh. - * + * * Fast JSON encoder/decoder implementation for Python * */ @@ -79,6 +79,9 @@ #define STR(x) #x #define STRINGIFY(x) STR(x) +/* used as a replacement for invalid UTF-8 sequences */ +#define UNICODE_REPLACEMENT_CHARACTER 0xfffd + /* ------------------------------ Decoding ----------------------------- */ static PyObject* @@ -562,8 +565,8 @@ static Py_UNICODE utf8_decode(unsigned char *s, int code_length) { - /* Decode a single UTF-8 character. Assumes code_length >= 2. Returns 0 - if data is malformed. */ + /* Decode a single UTF-8 character. Assumes code_length >= 2. + Returns UNICODE_REPLACEMENT_CHARACTER if data is malformed. 
*/ Py_UNICODE ch; @@ -571,33 +574,35 @@ case 2: if ((s[1] & 0xc0) != 0x80) - return 0; + return UNICODE_REPLACEMENT_CHARACTER; ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); if (ch < 0x80) - return 0; + return UNICODE_REPLACEMENT_CHARACTER; break; case 3: if ((s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80) - return 0; + return UNICODE_REPLACEMENT_CHARACTER; ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); if (ch < 0x0800) - return 0; + return UNICODE_REPLACEMENT_CHARACTER; break; +#ifdef Py_UNICODE_WIDE case 4: if ((s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80 || (s[3] & 0xc0) != 0x80) - return 0; + return UNICODE_REPLACEMENT_CHARACTER; ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); if (ch < 0x10000 || ch > 0x10ffff) - return 0; + return UNICODE_REPLACEMENT_CHARACTER; +#endif break; default: - return 0; + return UNICODE_REPLACEMENT_CHARACTER; } return ch; @@ -716,21 +721,20 @@ else if (c == '\b') *p++ = '\\', *p++ = 'b'; else { - /* Decode UTF-8, and output as escaped Unicode. */ + /* Decode UTF-8, and output as escaped Unicode. Invalid + UTF-8 forms are stored as replacement characters. */ Py_UNICODE ch; int n = utf8_code_length[c & 0xff]; - if (n <= 1) { - ch = 0; + if (n == 0) { + ch = UNICODE_REPLACEMENT_CHARACTER; + } else if (n == 1) { + ch = (Py_UNICODE) (unsigned char) c; } else { - ch = utf8_decode(op->ob_sval+i, n); + ch = utf8_decode((unsigned char*) op->ob_sval+i, n); + i += n - 1; } - if (ch == 0) { - /* TODO: or raise exception? 
*/ - *p++ = '?'; - } else if (ch < 0x10000) { - sprintf(p, "\\u%04x", ch & 0xffff); - p += 6; - } else { +#ifdef Py_UNICODE_WIDE + if (ch >= 0x10000) { if (p + 12 > p + PyString_GET_SIZE(v)) { int offset = p - PyString_AS_STRING(v); if (_PyString_Resize(&v, PyString_GET_SIZE(v) + 100)) @@ -739,8 +743,11 @@ } sprintf(p, "\\U%08x", ch & 0xffffffff); p += 10; + continue; } - i += n - 1; +#endif + sprintf(p, "\\u%04x", ch); + p += 6; } } assert(newsize - (p - PyString_AS_STRING(v)) >= 1); @@ -1313,5 +1320,3 @@ PyModule_AddStringConstant(m, "__version__", STRINGIFY(MODULE_VERSION)); } - - diff -r aaf2b609a42ca33569bd66503cde2db45a53b964 -r 1342ecf9d9c837ce381d2adf7f4b10d6ed9ed208 jsontest.py --- a/jsontest.py Thu Nov 05 14:49:20 2009 +0100 +++ b/jsontest.py Thu Nov 05 18:16:51 2009 +0100 @@ -309,6 +309,18 @@ sys.setdefaultencoding("utf-8") self.assertEqual(cjson.encode(campus_utf8), cjson.encode(campus)) + # full BMP roundtrip + for i in range(0xd800) + range(0xe000, 0xffff): + ch1 = unichr(i) + ch2 = cjson.decode(cjson.encode(ch1.encode("utf-8"))) + self.assertEqual(ch1, ch2) # unicode roundtrip + + # replacement handling + for i in range(0x100): + ch1 = chr(i).decode("utf-8", "replace") + ch2 = cjson.decode(cjson.encode(chr(i))) + self.assertEqual(ch1, ch2) # utf-8 error replacement + sys.setdefaultencoding(encoding) def testReadEmptyObjectAtEndOfArray(self): diff -r aaf2b609a42ca33569bd66503cde2db45a53b964 -r 1342ecf9d9c837ce381d2adf7f4b10d6ed9ed208 test_encoding.py --- a/test_encoding.py Thu Nov 05 14:49:20 2009 +0100 +++ b/test_encoding.py Thu Nov 05 18:16:51 2009 +0100 @@ -22,4 +22,19 @@ print cjson.encode([campus]) print cjson.encode([campus_utf8]) +print "roundtrip testing" +for i in range(0xd800) + range(0xe000, 0xffff): + ch1 = unichr(i) + ch2 = cjson.decode(cjson.encode(ch1.encode("utf-8"))) + if ch1 != ch2: + print repr(ch1), "!=", repr(ch2) + +print "invalid characters" +for i in range(0x100): + ch1 = chr(i).decode("utf-8", "replace") + ch2 = 
cjson.decode(cjson.encode(chr(i))) + if ch1 != ch2: + print repr(ch1), "!=", repr(ch2) + +# restore, just to be nice :) sys.setdefaultencoding(encoding)