# HG changeset patch # User Fredrik Lundh # Date 1257427995 -3600 # Node ID 1e4aea265498a78d163f04befc7451b8dcc82251 # Parent c6a248bf2463ddc53a6eb4c50b41cdf83680ffb9 Take sys.getdefaultencoding into account. This version handles ascii, utf-8, and latin-1. Note that this patch is somewhat rough, and will be cleaned up a bit. diff -r c6a248bf2463ddc53a6eb4c50b41cdf83680ffb9 -r 1e4aea265498a78d163f04befc7451b8dcc82251 cjson.c --- a/cjson.c Thu Nov 05 12:46:57 2009 +0100 +++ b/cjson.c Thu Nov 05 14:33:15 2009 +0100 @@ -26,7 +26,8 @@ } JSONEncode; static PyObject* encode_object(JSONEncode *encode, PyObject *object); -static PyObject* encode_string(PyObject *object); +static PyObject* encode_string_latin1(PyObject *object); +static PyObject* encode_string_utf8(PyObject *object); static PyObject* encode_unicode(PyObject *object); static PyObject* encode_tuple(JSONEncode *encode, PyObject *object); static PyObject* encode_list(JSONEncode *encode, PyObject *object); @@ -536,6 +537,72 @@ /* ------------------------------ Encoding ----------------------------- */ +static char +utf8_code_length[256] = { + /* Map UTF-8 encoded prefix byte to sequence length. Zero means + illegal prefix (here 0x80-0xbf, 0xfe and 0xff). Lengths 5 and 6 are listed, but utf8_decode returns 0 for them. See RFC 2279 for details. 
*/ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 +}; + +static Py_UNICODE +utf8_decode(unsigned char *s, int code_length) +{ + /* Decode a single UTF-8 character starting at s; code_length is the total length of the sequence in bytes. Assumes code_length >= 2. Returns 0 + if data is malformed: a trail byte is not of the form 10xxxxxx, the encoding is overlong, the value is above 0x10ffff, or code_length is not 2, 3 or 4. No bounds check is done here; up to s[code_length-1] is read. NOTE(review): the 4-byte case needs more than 16 bits in Py_UNICODE; confirm behaviour on narrow builds. */ + + Py_UNICODE ch; + + switch (code_length) { + + case 2: + if ((s[1] & 0xc0) != 0x80) + return 0; + ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); + if (ch < 0x80) + return 0; + break; + + case 3: + if ((s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80) + return 0; + ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + + (s[2] & 0x3f); + if (ch < 0x0800) + return 0; + break; + + case 4: + if ((s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80 || + (s[3] & 0xc0) != 0x80) + return 0; + ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + + ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); + if (ch < 0x10000 || ch > 0x10ffff) + return 0; + break; + + default: + return 0; + } + + return ch; +} + /* * This function is an almost verbatim copy of PyString_Repr() from * Python's stringobject.c with the following differences: @@ -543,9 +610,10 @@ * - it always quotes the output using double quotes. * - it also quotes \b and \f * - it replaces any non ASCII character hh with \u00hh instead of \xhh + * (i.e. 
it assumes the string is Latin-1) */ static PyObject* -encode_string(PyObject *string) +encode_string_latin1(PyObject *string) { register PyStringObject* op = (PyStringObject*) string; size_t newsize = 2 + 6 * op->ob_size; @@ -603,6 +671,86 @@ } } +/* Same, but used when defaultencoding = utf-8: a byte that is neither printable ASCII nor one of the short escapes is decoded as the start of a UTF-8 sequence and written as \uXXXX (\UXXXXXXXX above 0xffff); a byte that does not start a well-formed sequence is written as '?' and consumed on its own. Returns a new string object, or NULL on failure. */ + +static PyObject* +encode_string_utf8(PyObject *string) +{ + register PyStringObject* op = (PyStringObject*) string; + size_t newsize = 2 + 6 * op->ob_size; + PyObject *v; + + if (op->ob_size > (PY_SSIZE_T_MAX-2)/6) { + PyErr_SetString(PyExc_OverflowError, + "string is too large to make repr"); + return NULL; + } + v = PyString_FromStringAndSize((char *)NULL, newsize); + if (v == NULL) { + return NULL; + } + else { + register Py_ssize_t i; + register unsigned char c; + register char *p; + + p = PyString_AS_STRING(v); + *p++ = '"'; + for (i = 0; i < op->ob_size; i++) { + /* There's at least enough room for a hex escape + and a closing quote. */ + assert(newsize - (p - PyString_AS_STRING(v)) >= 7); + c = op->ob_sval[i]; + if (c == '"' || c == '\\') + *p++ = '\\', *p++ = c; + else if (c >= ' ' && c < 0x7f) + *p++ = c; + else if (c == '\t') + *p++ = '\\', *p++ = 't'; + else if (c == '\n') + *p++ = '\\', *p++ = 'n'; + else if (c == '\r') + *p++ = '\\', *p++ = 'r'; + else if (c == '\f') + *p++ = '\\', *p++ = 'f'; + else if (c == '\b') + *p++ = '\\', *p++ = 'b'; + else { + /* Decode UTF-8, and output as escaped Unicode. A sequence that would run past the end of the string is treated as malformed. */ + Py_UNICODE ch; + int n = utf8_code_length[c & 0xff]; + if (n <= 1 || n > op->ob_size - i) { + ch = 0; + } else { + ch = utf8_decode((unsigned char *) op->ob_sval + i, n); + } + if (ch == 0) { + /* TODO: or raise exception? 
*/ + *p++ = '?'; + } else if (ch < 0x10000) { + sprintf(p, "\\u%04x", (unsigned int) (ch & 0xffff)); + p += 6; + } else { + if (p + 12 > PyString_AS_STRING(v) + PyString_GET_SIZE(v)) { + Py_ssize_t offset = p - PyString_AS_STRING(v); + if (_PyString_Resize(&v, PyString_GET_SIZE(v) + 100)) + return NULL; + p = PyString_AS_STRING(v) + offset; + } + sprintf(p, "\\U%08x", (unsigned int) (ch & 0xffffffff)); + p += 10; + } + if (ch != 0) { i += n - 1; } /* Skip the trail bytes only when a sequence was decoded. For a rejected byte n can be 0, and an unconditional i += n - 1 would step i back so the loop never ends and p runs off the buffer. */ + } + } + assert(newsize - (p - PyString_AS_STRING(v)) >= 1); + *p++ = '"'; + + _PyString_Resize(&v, (Py_ssize_t) (p - PyString_AS_STRING(v))); + return v; + } +} + /* * This function is an almost verbatim copy of unicodeescape_string() from * Python's unicodeobject.c with the following differences: @@ -1011,9 +1159,9 @@ return PyString_FromString("null"); } else if (PyString_Check(object)) { if (encode->utf8) - return encode_string(object); + return encode_string_utf8(object); else - return encode_string(object); + return encode_string_latin1(object); } else if (PyUnicode_Check(object)) { return encode_unicode(object); } else if (PyFloat_Check(object)) { diff -r c6a248bf2463ddc53a6eb4c50b41cdf83680ffb9 -r 1e4aea265498a78d163f04befc7451b8dcc82251 jsonbench.py --- a/jsonbench.py Thu Nov 05 12:46:57 2009 +0100 +++ b/jsonbench.py Thu Nov 05 14:33:15 2009 +0100 @@ -1,5 +1,10 @@ import timeit +import sys +reload(sys) + +sys.setdefaultencoding("utf-8") + testobj1 = ['JSON Test Pattern pass1', {'object with 1 member': ['array with 1 element']}, {}, @@ -53,23 +58,23 @@ 'rosebud'] testobj2 = { - 'name': u'blah blah', - 'address': u'\u53ea'} + 'name': u'blah blah', + 'address': u'\u53ea'} def compare(testobj, reps=10000): - cjson_setup = '\n'.join(['import cjson', 'testobj = ' + repr(testobj)]) - simplejson_setup = '\n'.join(['import json', 'testobj = ' + repr(testobj)]) + cjson_setup = '\n'.join(['import cjson', 'testobj = ' + repr(testobj)]) + simplejson_setup = '\n'.join(['import json', 'testobj = ' + repr(testobj)]) - # make sure syntax is valid, before running either one - exec cjson_setup - exec 
simplejson_setup + # make sure syntax is valid, before running either one + exec cjson_setup + exec simplejson_setup - cjson_t = timeit.Timer('cjson.encode(testobj)', cjson_setup) - print 'cjson=', cjson_t.timeit(reps) + cjson_t = timeit.Timer('cjson.encode(testobj)', cjson_setup) + print 'cjson=', cjson_t.timeit(reps) - # simplejson_t = timeit.Timer('json.dumps(testobj)', simplejson_setup) - # print 'simplejson=', simplejson_t.timeit(reps) + # simplejson_t = timeit.Timer('json.dumps(testobj)', simplejson_setup) + # print 'simplejson=', simplejson_t.timeit(reps) compare(testobj1) compare(testobj2, 100000) diff -r c6a248bf2463ddc53a6eb4c50b41cdf83680ffb9 -r 1e4aea265498a78d163f04befc7451b8dcc82251 test_encoding.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test_encoding.py Thu Nov 05 14:33:15 2009 +0100 @@ -0,0 +1,24 @@ +import cjson + +import sys +reload(sys) + +encoding = sys.getdefaultencoding() +print encoding + +campus = u'\u5927\u5b66\u6821\u56ed' + +# ascii tests: each section below prints the encoding of the unicode string campus, then of the same four characters given as a UTF-8 byte string +sys.setdefaultencoding("ascii") +print cjson.encode([campus]) +print cjson.encode(["\345\244\247\345\255\246\346\240\241\345\233\255"]) + +sys.setdefaultencoding("iso-8859-1") +print cjson.encode([campus]) +print cjson.encode(["\345\244\247\345\255\246\346\240\241\345\233\255"]) + +sys.setdefaultencoding("utf-8") +print cjson.encode([campus]) +print cjson.encode(["\345\244\247\345\255\246\346\240\241\345\233\255"]) + +sys.setdefaultencoding(encoding)