# HG changeset patch # User Fredrik Lundh # Date 1257469909 -3600 # Node ID 3bfa8cf10f8811a23a8dca44c33284b3e1fdeb38 # Parent c20be1d34e7eb5482f2e22041cabd750e60f8e52 Support arbitrary default encodings. Use optimized code paths for ascii, iso-8859-1, latin-1, us-ascii, and utf-8. diff -r c20be1d34e7eb5482f2e22041cabd750e60f8e52 -r 3bfa8cf10f8811a23a8dca44c33284b3e1fdeb38 cjson.c --- a/cjson.c Fri Nov 06 00:37:07 2009 +0100 +++ b/cjson.c Fri Nov 06 02:11:49 2009 +0100 @@ -21,13 +21,15 @@ int all_unicode; // make all output strings unicode if true } JSONData; + typedef struct JSONEncode { - int utf8; // true if byte strings are utf-8 + enum { LATIN1, UTF8, OTHER } encoding; } JSONEncode; static PyObject* encode_object(JSONEncode *encode, PyObject *object); static PyObject* encode_string_latin1(PyObject *object); static PyObject* encode_string_utf8(PyObject *object); +static PyObject* encode_string_other(PyObject *object); static PyObject* encode_unicode(PyObject *object); static PyObject* encode_tuple(JSONEncode *encode, PyObject *object); static PyObject* encode_list(JSONEncode *encode, PyObject *object); @@ -175,6 +177,8 @@ len = ptr - jsondata->ptr - 1; + /* TODO(effbot): DecodeEscape doesn't handle escaped / correctly, + so should be replaced with custom code. */ if (has_unicode || jsondata->all_unicode) object = PyUnicode_DecodeUnicodeEscape(jsondata->ptr+1, len, NULL); else if (string_escape) @@ -676,7 +680,7 @@ } } -/* Same, but used when defaultencoding = utf-8 */ +/* Same, but for UTF-8 */ static PyObject* encode_string_utf8(PyObject *string) @@ -758,6 +762,30 @@ } } +/* Same, but for other default encodings. */ + +static PyObject* +encode_string_other(PyObject *string) +{ + PyObject* unicode_string; + PyObject* result; + + unicode_string = PyUnicode_Decode( + PyString_AS_STRING(string), PyString_GET_SIZE(string), + NULL, /* = use default encoding */ + "replace"); + + if (!unicode_string) + return NULL; + + result = encode_unicode(unicode_string); + + Py_DECREF(unicode_string); + + return result; +} + + /* * This function is an almost verbatim copy of unicodeescape_string() from * Python's unicodeobject.c with the following differences: @@ -1165,10 +1193,14 @@ } else if (object == Py_None) { return PyString_FromString("null"); } else if (PyString_Check(object)) { - if (encode->utf8) + switch (encode->encoding) { + case LATIN1: + return encode_string_latin1(object); + case UTF8: return encode_string_utf8(object); - else - return encode_string_latin1(object); + default: + return encode_string_other(object); + } } else if (PyUnicode_Check(object)) { return encode_unicode(object); } else if (PyInt_Check(object)) { @@ -1214,8 +1246,27 @@ { JSONEncode encode; const char *encoding = PyUnicode_GetDefaultEncoding(); - - encode.utf8 = encoding[0] == 'u' && (strcmp(encoding, "utf-8") == 0); + encode.encoding = OTHER; + switch (*encoding) { + case 'a': + if (strcmp(encoding, "ascii") == 0) + encode.encoding = LATIN1; /* ascii is a subset */ + break; + case 'i': + if (strcmp(encoding, "iso-8859-1") == 0) + encode.encoding = LATIN1; + break; + case 'l': + if (strcmp(encoding, "latin-1") == 0) + encode.encoding = LATIN1; + break; + case 'u': + if (strcmp(encoding, "utf-8") == 0) + encode.encoding = UTF8; + else if (strcmp(encoding, "us-ascii") == 0) + encode.encoding = LATIN1; + break; + } return encode_object(&encode, object); } diff -r c20be1d34e7eb5482f2e22041cabd750e60f8e52 -r 3bfa8cf10f8811a23a8dca44c33284b3e1fdeb38 jsontest.py --- a/jsontest.py Fri Nov 06 00:37:07 2009 +0100 +++ b/jsontest.py Fri Nov 06 02:11:49 2009 +0100 @@ -300,14 +300,24 @@ campus_utf8 = campus.encode("utf-8") campus_raw = campus_utf8.decode("iso-8859-1") + island = u'\xe5 i \xe5a \xe4 e \xf6' + sys.setdefaultencoding("ascii") self.assertEqual(cjson.encode(campus_utf8), cjson.encode(campus_raw)) sys.setdefaultencoding("iso-8859-1") self.assertEqual(cjson.encode(campus_utf8), cjson.encode(campus_raw)) + self.assertEqual(cjson.encode(island.encode("iso-8859-1")), + cjson.encode(island)) + + sys.setdefaultencoding("cp850") + self.assertEqual(cjson.encode(island.encode("cp850")), + cjson.encode(island)) sys.setdefaultencoding("utf-8") self.assertEqual(cjson.encode(campus_utf8), cjson.encode(campus)) + self.assertEqual(cjson.encode(island.encode("utf-8")), + cjson.encode(island)) # full BMP roundtrip for i in range(0xd800) + range(0xe000, 0xffff):