diff --git a/include/rapidjson/encodings.h b/include/rapidjson/encodings.h
index dcf21e4..fd8689d 100644
--- a/include/rapidjson/encodings.h
+++ b/include/rapidjson/encodings.h
@@ -23,6 +23,8 @@ namespace rapidjson {
 concept Encoding {
 	typename Ch;	//! Type of character. A "character" is actually a code unit in unicode's definition.
 
+	enum { supportUnicode = 1 }; // or 0 if not supporting unicode
+
 	//! \brief Encode a Unicode codepoint to an output stream.
 	//! \param os Output stream.
 	//! \param codepoint An unicode codepoint, ranging from 0x0 to 0x10FFFF inclusively.
@@ -78,6 +80,8 @@ template<typename CharType = char>
 struct UTF8 {
 	typedef CharType Ch;
 
+	enum { supportUnicode = 1 };
+
 	template<typename OutputStream>
 	static void Encode(OutputStream& os, unsigned codepoint) {
 		if (codepoint <= 0x7F)
@@ -222,6 +226,8 @@ struct UTF16 {
 	typedef CharType Ch;
 	RAPIDJSON_STATIC_ASSERT(sizeof(Ch) >= 2);
 
+	enum { supportUnicode = 1 };
+
 	template<typename OutputStream>
 	static void Encode(OutputStream& os, unsigned codepoint) {
 		RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2);
@@ -351,6 +357,8 @@ struct UTF32 {
 	typedef CharType Ch;
 	RAPIDJSON_STATIC_ASSERT(sizeof(Ch) >= 4);
 
+	enum { supportUnicode = 1 };
+
 	template<typename OutputStream>
 	static void Encode(OutputStream& os, unsigned codepoint) {
 		RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 4);
@@ -447,6 +455,75 @@ struct UTF32BE : UTF32<CharType> {
 	}
 };
 
+///////////////////////////////////////////////////////////////////////////////
+// ASCII
+
+//! ASCII encoding.
+/*! http://en.wikipedia.org/wiki/ASCII
+	\tparam CharType Code unit for storing 7-bit ASCII data. Default is char.
+	\note implements Encoding concept
+*/
+template<typename CharType = char>
+struct ASCII {
+	typedef CharType Ch;
+
+	enum { supportUnicode = 0 };	// Encode() only accepts code points up to 0x7F
+
+	//! Writes \c codepoint to \c os as a single code unit; asserts that codepoint <= 0x7F.
+	template<typename OutputStream>
+	static void Encode(OutputStream& os, unsigned codepoint) {
+		RAPIDJSON_ASSERT(codepoint <= 0x7F);
+		os.Put(static_cast<Ch>(codepoint & 0xFF));
+	}
+
+	//! Takes one code unit from \c is, narrows it to unsigned char and stores it in \c *codepoint.
+	//! \return true if the stored value is <= 0x7F.
+	template<typename InputStream>
+	static bool Decode(InputStream& is, unsigned* codepoint) {
+		unsigned char c = static_cast<unsigned char>(is.Take());
+		*codepoint = c;
+		return c <= 0x7F;
+	}
+
+	//! Takes one code unit from \c is (held as unsigned char) and puts it to \c os unconditionally.
+	//! \return true if that value is <= 0x7F.
+	template<typename InputStream, typename OutputStream>
+	static bool Validate(InputStream& is, OutputStream& os) {
+		unsigned char c = is.Take();
+		os.Put(c);
+		return c <= 0x7F;
+	}
+
+	//! Looks for no byte order mark: takes one code unit from the one-byte-wide stream and returns it.
+	template<typename InputByteStream>
+	static CharType TakeBOM(InputByteStream& is) {
+		RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+		Ch c = Take(is);
+		return c;
+	}
+
+	//! Takes one code unit from a byte stream whose Ch is statically required to be one byte wide.
+	template<typename InputByteStream>
+	static Ch Take(InputByteStream& is) {
+		RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+		return is.Take();
+	}
+
+	//! Writes nothing; \c os is only referenced to silence the unused-parameter warning.
+	template<typename OutputByteStream>
+	static void PutBOM(OutputByteStream& os) {
+		RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
+		(void)os;
+	}
+
+	//! Puts \c c to a byte stream, cast to that stream's own Ch type.
+	template<typename OutputByteStream>
+	static void Put(OutputByteStream& os, Ch c) {
+		RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
+		os.Put(static_cast<typename OutputByteStream::Ch>(c));
+	}
+};
+
 ///////////////////////////////////////////////////////////////////////////////
 // AutoUTF
 
diff --git a/include/rapidjson/writer.h b/include/rapidjson/writer.h
index f3b7567..73677c1 100644
--- a/include/rapidjson/writer.h
+++ b/include/rapidjson/writer.h
@@ -256,7 +256,39 @@ protected:
 		GenericStringStream<SourceEncoding> is(str);
 		while (is.Tell() < length) {
 			const Ch c = is.Peek();
-			if ((sizeof(Ch) == 1 || (unsigned)c < 256) && escape[(unsigned char)c]) {
+			if (!TargetEncoding::supportUnicode && (unsigned)c >= 0x80) {
+				// Target encoding has no Unicode support: decode the code point and write it as a backslash-u escape
+				unsigned codepoint;
+				if (!SourceEncoding::Decode(is, &codepoint))
+					return false;
+				os_.Put('\\');
+				os_.Put('u');
+				if (codepoint <= 0xD7FF || (codepoint >= 0xE000 && codepoint <= 0xFFFF)) {
+					os_.Put(hexDigits[(codepoint >> 12) & 15]);
+					os_.Put(hexDigits[(codepoint >>  8) & 15]);
+					os_.Put(hexDigits[(codepoint >>  4) & 15]);
+					os_.Put(hexDigits[(codepoint      ) & 15]);
+				}
+				else if (codepoint >= 0x010000 && codepoint <= 0x10FFFF) {
+					// Surrogate pair: the lead escape reuses the prefix put above, the trail escape gets its own
+					unsigned s = codepoint - 0x010000;
+					unsigned lead = (s >> 10) + 0xD800;
+					unsigned trail = (s & 0x3FF) + 0xDC00;
+					os_.Put(hexDigits[(lead >> 12) & 15]);
+					os_.Put(hexDigits[(lead >>  8) & 15]);
+					os_.Put(hexDigits[(lead >>  4) & 15]);
+					os_.Put(hexDigits[(lead      ) & 15]);
+					os_.Put('\\');
+					os_.Put('u');
+					os_.Put(hexDigits[(trail >> 12) & 15]);
+					os_.Put(hexDigits[(trail >>  8) & 15]);
+					os_.Put(hexDigits[(trail >>  4) & 15]);
+					os_.Put(hexDigits[(trail      ) & 15]);
+				}
+				else
+					return false;	// invalid code point
+			}
+			else if ((sizeof(Ch) == 1 || (unsigned)c < 256) && escape[(unsigned char)c]) {
 				is.Take();
 				os_.Put('\\');
 				os_.Put(escape[(unsigned char)c]);
diff --git a/test/unittest/writertest.cpp b/test/unittest/writertest.cpp
index be3d77b..b9a4891 100644
--- a/test/unittest/writertest.cpp
+++ b/test/unittest/writertest.cpp
@@ -113,13 +113,34 @@ TEST(Writer,DoublePrecision) {
 }
 
 TEST(Writer, Transcode) {
+	const char json[] = "{\"hello\":\"world\",\"t\":true,\"f\":false,\"n\":null,\"i\":123,\"pi\":3.1416,\"a\":[1,2,3],\"dollar\":\"\x24\",\"cents\":\"\xC2\xA2\",\"euro\":\"\xE2\x82\xAC\",\"gclef\":\"\xF0\x9D\x84\x9E\"}";
+
 	// UTF8 -> UTF16 -> UTF8
-	StringStream s("{ \"hello\" : \"world\", \"t\" : true , \"f\" : false, \"n\": null, \"i\":123, \"pi\": 3.1416, \"a\":[1, 2, 3], \"dollar\":\"\x24\", \"cents\":\"\xC2\xA2\", \"euro\":\"\xE2\x82\xAC\", \"gclef\":\"\xF0\x9D\x84\x9E\" } ");
-	StringBuffer buffer;
-	Writer<StringBuffer, UTF16<>, UTF8<> > writer(buffer);
-	GenericReader<UTF8<>, UTF16<> > reader;
-	reader.Parse<0>(s, writer);
-	EXPECT_STREQ("{\"hello\":\"world\",\"t\":true,\"f\":false,\"n\":null,\"i\":123,\"pi\":3.1416,\"a\":[1,2,3],\"dollar\":\"\x24\",\"cents\":\"\xC2\xA2\",\"euro\":\"\xE2\x82\xAC\",\"gclef\":\"\xF0\x9D\x84\x9E\"}", buffer.GetString());
+	{
+		StringStream s(json);
+		StringBuffer buffer;
+		Writer<StringBuffer, UTF16<>, UTF8<> > writer(buffer);
+		GenericReader<UTF8<>, UTF16<> > reader;
+		reader.Parse(s, writer);
+		EXPECT_STREQ(json, buffer.GetString());
+	}
+
+	// UTF8 -> UTF8 -> ASCII -> UTF8 -> UTF8
+	{
+		StringStream s(json);
+		StringBuffer buffer;
+		Writer<StringBuffer, UTF8<>, ASCII<> > writer(buffer);
+		Reader reader;
+		reader.Parse(s, writer);
+
+		StringBuffer buffer2;
+		Writer<StringBuffer> writer2(buffer2);
+		GenericReader<ASCII<>, UTF8<> > reader2;
+		StringStream s2(buffer.GetString());
+		reader2.Parse(s2, writer2);
+
+		EXPECT_STREQ(json, buffer2.GetString());
+	}
 }
 
 #include <sstream>