diff --git a/bin/encodings/utf16be.json b/bin/encodings/utf16be.json new file mode 100644 index 0000000..e46dbfb Binary files /dev/null and b/bin/encodings/utf16be.json differ diff --git a/bin/encodings/utf16le.json b/bin/encodings/utf16le.json new file mode 100644 index 0000000..92d5045 Binary files /dev/null and b/bin/encodings/utf16le.json differ diff --git a/bin/encodings/utf32be.json b/bin/encodings/utf32be.json new file mode 100644 index 0000000..9cbb522 Binary files /dev/null and b/bin/encodings/utf32be.json differ diff --git a/bin/encodings/utf32bebom.json b/bin/encodings/utf32bebom.json new file mode 100644 index 0000000..bde6a99 Binary files /dev/null and b/bin/encodings/utf32bebom.json differ diff --git a/bin/encodings/utf32le.json b/bin/encodings/utf32le.json new file mode 100644 index 0000000..b00f290 Binary files /dev/null and b/bin/encodings/utf32le.json differ diff --git a/bin/encodings/utf32lebom.json b/bin/encodings/utf32lebom.json new file mode 100644 index 0000000..d3db39b Binary files /dev/null and b/bin/encodings/utf32lebom.json differ diff --git a/include/rapidjson/encodedstream.h b/include/rapidjson/encodedstream.h index 9ad960b..0b5a499 100644 --- a/include/rapidjson/encodedstream.h +++ b/include/rapidjson/encodedstream.h @@ -12,12 +12,11 @@ public: typedef typename Encoding::Ch Ch; EncodedInputStream(InputStream& is) : is_(is) { - Encoding::TakeBOM(is_); - Read(); + current_ = Encoding::TakeBOM(is_); } Ch Peek() const { return current_; } - Ch Take() { Ch c = current_; Read(); return c; } + Ch Take() { Ch c = current_; current_ = Encoding::Take(is_); return c; } size_t Tell() const { is_.Tell(); } // Not implemented @@ -27,13 +26,37 @@ public: size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } private: - void Read() { - current_ = Encoding::Take(is_); - } InputStream& is_; Ch current_; }; +//! Adapts an output byte stream with a specified encoding. 
+template +class EncodedOutputStream { +public: + typedef typename Encoding::Ch Ch; + + EncodedOutputStream(OutputStream& os, bool putBOM = true) : os_(os) { + if (putBOM) + Encoding::PutBOM(os_); + } + + void Put(Ch c) { Encoding::Put(os_, c); } + void Flush() { os_.Flush(); } + + // Not implemented + Ch Peek() const { RAPIDJSON_ASSERT(false); } + Ch Take() { RAPIDJSON_ASSERT(false); } + size_t Tell() const { RAPIDJSON_ASSERT(false); } + Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } + size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } + +private: + OutputStream& os_; +}; + +#define ENCODINGS_FUNC(x) UTF8::x, UTF16LE::x, UTF16BE::x, UTF32LE::x, UTF32BE::x + template class AutoUTFInputStream { public: @@ -58,33 +81,34 @@ private: friend struct AutoUTF; void TakeBOM(InputStream& is) { -#define TAKE() is.Take() -#define PEEK(x) if ((unsigned char)is.Peek() != x) break +#define ASSUME(x) if ((unsigned char)is.Peek() != x) break; is.Take() switch ((unsigned char)is.Peek()) { - case 0x00: TAKE(); PEEK(0x00); TAKE(); PEEK(0xFE); TAKE(); PEEK(0xFF); type_ = kUTF32BE; return; - case 0xEF: TAKE(); PEEK(0xBB); TAKE(); PEEK(0xBF); TAKE(); type_ = kUTF8; return; - case 0xFE: TAKE(); PEEK(0xFF); TAKE(); type_ = kUTF16BE; return; - case 0xFF: TAKE(); PEEK(0xFE); TAKE(); + case 0x00: is.Take(); ASSUME(0x00); ASSUME(0xFE); ASSUME(0xFF); type_ = kUTF32BE; break; + case 0xEF: is.Take(); ASSUME(0xBB); ASSUME(0xBF); type_ = kUTF8; break; + case 0xFE: is.Take(); ASSUME(0xFF); type_ = kUTF16BE; break; + case 0xFF: is.Take(); ASSUME(0xFE); if (is.Peek() == 0x00) { - TAKE(); PEEK(0x00); TAKE(); type_ = kUTF32LE; return; + is.Take(); ASSUME(0x00); type_ = kUTF32LE; break; } type_ = kUTF16LE; - return; } -#undef TAKE -#undef PEEK +#undef ASSUME + // Runtime check whether the size of the character type is sufficient. It only performs checks with assertions. 
+ switch (type_) { + case kUTF16LE: + case kUTF16BE: + RAPIDJSON_ASSERT(sizeof(Ch) >= 2); + break; + case kUTF32LE: + case kUTF32BE: + RAPIDJSON_ASSERT(sizeof(Ch) >= 4); + break; + } } void Read() { typedef Ch (*TakeFunc)(InputStream& is); - static const TakeFunc f[] = { - UTF8::Take, - UTF16LE::Take, - UTF16BE::Take, - UTF32LE::Take, - UTF32BE::Take, - }; - + static const TakeFunc f[] = { ENCODINGS_FUNC(Take) }; current_ = f[type_](is_); } @@ -93,6 +117,59 @@ private: Ch current_; }; +template +class AutoUTFOutputStream { +public: + typedef CharType Ch; + + AutoUTFOutputStream(OutputStream& os, UTFType type, bool putBOM) : os_(os), type_(type) { + // Runtime check whether the size of the character type is sufficient. It only performs checks with assertions. + switch (type_) { + case kUTF16LE: + case kUTF16BE: + RAPIDJSON_ASSERT(sizeof(Ch) >= 2); + break; + case kUTF32LE: + case kUTF32BE: + RAPIDJSON_ASSERT(sizeof(Ch) >= 4); + break; + } + + if (putBOM) + PutBOM(); + } + + void Put(Ch c) { + typedef void (*PutFunc)(OutputStream&, Ch); + static const PutFunc f[] = { ENCODINGS_FUNC(Put) }; + f[type_](os_, c); + } + + void Flush() { os_.Flush(); } + + // Not implemented + Ch Peek() const { RAPIDJSON_ASSERT(false); } + Ch Take() { RAPIDJSON_ASSERT(false); } + size_t Tell() const { RAPIDJSON_ASSERT(false); } + Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } + size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } + +private: + friend struct AutoUTF; + + void PutBOM() { + typedef void (*PutBOMFunc)(OutputStream&); + static const PutBOMFunc f[] = { ENCODINGS_FUNC(PutBOM) }; + f[type_](os_); + } + + + OutputStream& os_; + UTFType type_; +}; + +#undef ENCODINGS_FUNC + } // namespace rapidjson #endif // RAPIDJSON_FILESTREAM_H_ diff --git a/include/rapidjson/encodings.h b/include/rapidjson/encodings.h index 19ec77c..1442a80 100644 --- a/include/rapidjson/encodings.h +++ b/include/rapidjson/encodings.h @@ -141,19 +141,31 @@ struct UTF8 { } template - static void 
TakeBOM(InputStream& is) { - if ((unsigned char)is.Peek() != 0xEF) return; - is.Take(); - if ((unsigned char)is.Peek() != 0xBB) return; - is.Take(); - if ((unsigned char)is.Peek() != 0xBF) return; - is.Take(); + static CharType TakeBOM(InputStream& is) { + Ch c = Take(is); + if ((unsigned char)c != 0xEFu) return c; + c = is.Take(); + if ((unsigned char)c != 0xBBu) return c; + c = is.Take(); + if ((unsigned char)c != 0xBFu) return c; + c = is.Take(); + return c; } template RAPIDJSON_FORCEINLINE static Ch Take(InputStream& is) { return is.Take(); } + + template + static void PutBOM(OutputStream& os) { + os.Put(0xEFu); os.Put(0xBBu); os.Put(0xBFu); + } + + template + static void Put(OutputStream& os, Ch c) { + os.Put(c); + } }; /////////////////////////////////////////////////////////////////////////////// @@ -217,11 +229,9 @@ struct UTF16 { template struct UTF16LE : UTF16 { template - static void TakeBOM(InputStream& is) { - if ((unsigned char)is.Peek() != 0xFF) return; - is.Take(); - if ((unsigned char)is.Peek() != 0xFE) return; - is.Take(); + static CharType TakeBOM(InputStream& is) { + CharType c = Take(is); + return (unsigned short)c == 0xFEFFu ? Take(is) : c; } template @@ -230,16 +240,25 @@ struct UTF16LE : UTF16 { c |= (unsigned char)is.Take() << 8; return c; } + + template + static void PutBOM(OutputStream& os) { + os.Put(0xFFu); os.Put(0xFEu); + } + + template + static void Put(OutputStream& os, Ch c) { + os.Put(c & 0xFFu); + os.Put((c >> 8) & 0xFFu); + } }; template struct UTF16BE : UTF16 { template - static void TakeBOM(InputStream& is) { - if ((unsigned char)is.Peek() != 0xFE) return; - is.Take(); - if ((unsigned char)is.Peek() != 0xFF) return; - is.Take(); + static CharType TakeBOM(InputStream& is) { + CharType c = Take(is); + return (unsigned short)c == 0xFEFFu ? 
Take(is) : c; } template @@ -248,6 +267,17 @@ struct UTF16BE : UTF16 { c |= (unsigned char)is.Take(); return c; } + + template + static void PutBOM(OutputStream& os) { + os.Put(0xFEu); os.Put(0xFFu); + } + + template + static void Put(OutputStream& os, Ch c) { + os.Put((c >> 8) & 0xFFu); + os.Put(c & 0xFFu); + } }; /////////////////////////////////////////////////////////////////////////////// @@ -286,15 +316,9 @@ struct UTF32 { template struct UTF32LE : UTF32 { template - static void TakeBOM(InputStream& is) { - if ((unsigned char)is.Peek() != 0xFF) return; - is.Take(); - if ((unsigned char)is.Peek() != 0xFE) return; - is.Take(); - if ((unsigned char)is.Peek() != 0x00) return; - is.Take(); - if ((unsigned char)is.Peek() != 0x00) return; - is.Take(); + static CharType TakeBOM(InputStream& is) { + CharType c = Take(is); + return (unsigned)c == 0x0000FEFFu ? Take(is) : c; } template @@ -305,20 +329,27 @@ struct UTF32LE : UTF32 { c |= (unsigned char)is.Take() << 24; return c; } + + template + static void PutBOM(OutputStream& os) { + os.Put(0xFFu); os.Put(0xFEu); os.Put(0x00u); os.Put(0x00u); + } + + template + static void Put(OutputStream& os, Ch c) { + os.Put(c & 0xFFu); + os.Put((c >> 8) & 0xFFu); + os.Put((c >> 16) & 0xFFu); + os.Put((c >> 24) & 0xFFu); + } }; template struct UTF32BE : UTF32 { template - static void TakeBOM(InputStream& is) { - if ((unsigned char)is.Peek() != 0x00) return; - is.Take(); - if ((unsigned char)is.Peek() != 0x00) return; - is.Take(); - if ((unsigned char)is.Peek() != 0xFE) return; - is.Take(); - if ((unsigned char)is.Peek() != 0xFF) return; - is.Take(); + static CharType TakeBOM(InputStream& is) { + CharType c = Take(is); + return (unsigned)c == 0x0000FEFFu ? 
Take(is) : c; } template @@ -329,6 +360,19 @@ struct UTF32BE : UTF32 { c |= (unsigned char)is.Take(); return c; } + + template + static void PutBOM(OutputStream& os) { + os.Put(0x00u); os.Put(0x00u); os.Put(0xFEu); os.Put(0xFFu); + } + + template + static void Put(OutputStream& os, Ch c) { + os.Put((c >> 24) & 0xFFu); + os.Put((c >> 16) & 0xFFu); + os.Put((c >> 8) & 0xFFu); + os.Put(c & 0xFFu); + } }; /////////////////////////////////////////////////////////////////////////////// @@ -347,44 +391,30 @@ template struct AutoUTF { typedef CharType Ch; +#define ENCODINGS_FUNC(x) UTF8::x, UTF16LE::x, UTF16BE::x, UTF32LE::x, UTF32BE::x + template RAPIDJSON_FORCEINLINE static void Encode(OutputStream& os, unsigned codepoint) { typedef void (*EncodeFunc)(OutputStream&, unsigned); - static const EncodeFunc f[] = { - UTF8::Encode, - UTF16::Encode, - UTF16::Encode, - UTF32::Encode, - UTF32::Encode, - }; + static const EncodeFunc f[] = { ENCODINGS_FUNC(Encode) }; (*f[os.type_])(os, codepoint); } template RAPIDJSON_FORCEINLINE static bool Decode(InputStream& is, unsigned* codepoint) { typedef bool (*DecodeFunc)(InputStream&, unsigned*); - static const DecodeFunc f[] = { - UTF8::Decode, - UTF16::Decode, - UTF16::Decode, - UTF32::Decode, - UTF32::Decode, - }; + static const DecodeFunc f[] = { ENCODINGS_FUNC(Decode) }; return (*f[is.type_])(is, codepoint); } template RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) { typedef bool (*ValidateFunc)(InputStream&, unsigned*); - static const ValidateFunc f[] = { - UTF8::Decode, - UTF16::Decode, - UTF16::Decode, - UTF32::Decode, - UTF32::Decode, - }; + static const ValidateFunc f[] = { ENCODINGS_FUNC(Validate) }; return (*f[is.type_])(is, os); } + +#undef ENCODINGS_FUNC }; /////////////////////////////////////////////////////////////////////////////// diff --git a/test/unittest/encodingstest.cpp b/test/unittest/encodingstest.cpp index 6c57444..f6d3416 100644 --- a/test/unittest/encodingstest.cpp +++ 
b/test/unittest/encodingstest.cpp @@ -7,7 +7,16 @@ using namespace rapidjson; class EncodingsTest : public ::testing::Test { public: - FILE* Open(const char* filename) { + virtual void SetUp() { + json_ = ReadFile("utf8.json", true, &length_); + } + + virtual void TearDown() { + free(json_); + } + +protected: + static FILE* Open(const char* filename) { char buffer[1024]; sprintf(buffer, "encodings/%s", filename); FILE *fp = fopen(buffer, "rb"); @@ -18,105 +27,158 @@ public: return fp; } - virtual void SetUp() { - FILE *fp = Open("utf8.json"); - ASSERT_TRUE(fp != 0); + static char *ReadFile(const char* filename, bool appendPath, size_t* outLength) { + FILE *fp = appendPath ? Open(filename) : fopen(filename, "rb"); + + if (!fp) { + *outLength = 0; + return 0; + } fseek(fp, 0, SEEK_END); - length_ = (size_t)ftell(fp); + *outLength = (size_t)ftell(fp); fseek(fp, 0, SEEK_SET); - json_ = (char*)malloc(length_ + 1); - fread(json_, 1, length_, fp); - json_[length_] = '\0'; + char* buffer = (char*)malloc(*outLength + 1); + fread(buffer, 1, *outLength, fp); + buffer[*outLength] = '\0'; + fclose(fp); + return buffer; + } + + template + void TestEncodedInputStream(const char* filename) { + char buffer[16]; + FILE *fp = Open(filename); + ASSERT_TRUE(fp != 0); + FileReadStream fs(fp, buffer, sizeof(buffer)); + EncodedInputStream eis(fs); + StringStream s(json_); + + while (eis.Peek() != '\0') { + unsigned expected, actual; + EXPECT_TRUE(UTF8<>::Decode(s, &expected)); + EXPECT_TRUE(MemoryEncoding::Decode(eis, &actual)); + EXPECT_EQ(expected, actual); + } + EXPECT_EQ('\0', s.Peek()); fclose(fp); } - virtual void TearDown() { - free(json_); + void TestAutoUTFInputStream(const char *filename) { + char buffer[16]; + FILE *fp = Open(filename); + ASSERT_TRUE(fp != 0); + FileReadStream fs(fp, buffer, sizeof(buffer)); + AutoUTFInputStream eis(fs); + StringStream s(json_); + while (eis.Peek() != '\0') { + unsigned expected, actual; + EXPECT_TRUE(UTF8<>::Decode(s, &expected)); + 
EXPECT_TRUE(AutoUTF::Decode(eis, &actual)); + EXPECT_EQ(expected, actual); + } + EXPECT_EQ('\0', s.Peek()); + fclose(fp); + } + + template + void TestEncodedOutputStream(const char* expectedFilename, bool putBOM) { + char filename[L_tmpnam]; + tmpnam(filename); + + FILE *fp = fopen(filename, "wb"); + char buffer[16]; + FileWriteStream os(fp, buffer, sizeof(buffer)); + EncodedOutputStream eos(os, putBOM); + StringStream s(json_); + while (s.Peek() != '\0') { + bool success = Transcoder, MemoryEncoding>::Transcode(s, eos); + EXPECT_TRUE(success); + } + eos.Flush(); + fclose(fp); + EXPECT_TRUE(CompareFile(filename, expectedFilename)); + remove(filename); + } + + bool CompareFile(char * filename, const char* expectedFilename) { + size_t actualLength, expectedLength; + char* actualBuffer = ReadFile(filename, false, &actualLength); + char* expectedBuffer = ReadFile(expectedFilename, true, &expectedLength); + bool ret = (expectedLength == actualLength) && memcmp(expectedBuffer, actualBuffer, actualLength) == 0; + free(actualBuffer); + free(expectedBuffer); + return ret; + } + + void TestAutoUTFOutputStream(UTFType type, bool putBOM, const char *expectedFilename) { + char filename[L_tmpnam]; + tmpnam(filename); + + FILE *fp = fopen(filename, "wb"); + char buffer[16]; + FileWriteStream os(fp, buffer, sizeof(buffer)); + AutoUTFOutputStream eos(os, type, putBOM); + StringStream s(json_); + while (s.Peek() != '\0') { + bool success = Transcoder, AutoUTF>::Transcode(s, eos); + EXPECT_TRUE(success); + } + eos.Flush(); + fclose(fp); + EXPECT_TRUE(CompareFile(filename, expectedFilename)); + remove(filename); } -protected: const char* filename_; char *json_; size_t length_; }; -TEST_F(EncodingsTest, EncodedInputStream_UTF8BOM) { - char buffer[16]; - FILE *fp = Open("utf8bom.json"); - ASSERT_TRUE(fp != 0); - FileReadStream fs(fp, buffer, sizeof(buffer)); - EncodedInputStream, FileReadStream> eis(fs); - StringStream s(json_); - - while (eis.Peek() != '\0') { - unsigned expected, 
actual; - UTF8<>::Decode(s, &expected); - UTF8<>::Decode(eis, &actual); - EXPECT_EQ(expected, actual); - } - EXPECT_EQ('\0', s.Peek()); - fclose(fp); -} - -TEST_F(EncodingsTest, EncodedInputStream_UTF16LEBOM) { - char buffer[16]; - FILE *fp = Open("utf16lebom.json"); - ASSERT_TRUE(fp != 0); - FileReadStream fs(fp, buffer, sizeof(buffer)); - EncodedInputStream, FileReadStream> eis(fs); - StringStream s(json_); - - while (eis.Peek() != '\0') { - unsigned expected, actual; - UTF8<>::Decode(s, &expected); - UTF16<>::Decode(eis, &actual); - EXPECT_EQ(expected, actual); - } - EXPECT_EQ('\0', s.Peek()); - fclose(fp); -} - -TEST_F(EncodingsTest, EncodedInputStream_UTF16BEBOM) { - char buffer[16]; - FILE *fp = Open("utf16bebom.json"); - ASSERT_TRUE(fp != 0); - FileReadStream fs(fp, buffer, sizeof(buffer)); - EncodedInputStream, FileReadStream> eis(fs); - StringStream s(json_); - - while (eis.Peek() != '\0') { - unsigned expected, actual; - UTF8<>::Decode(s, &expected); - UTF16<>::Decode(eis, &actual); - EXPECT_EQ(expected, actual); - } - EXPECT_EQ('\0', s.Peek()); - fclose(fp); +TEST_F(EncodingsTest, EncodedInputStream) { + TestEncodedInputStream, UTF8<>>("utf8.json"); + TestEncodedInputStream, UTF8<>>("utf8bom.json"); + TestEncodedInputStream, UTF16<>>("utf16le.json"); + TestEncodedInputStream, UTF16<>>("utf16lebom.json"); + TestEncodedInputStream, UTF16<>>("utf16be.json"); + TestEncodedInputStream, UTF16<>>("utf16bebom.json"); + TestEncodedInputStream, UTF32<>>("utf32le.json"); + TestEncodedInputStream, UTF32<>>("utf32lebom.json"); + TestEncodedInputStream, UTF32<>>("utf32be.json"); + TestEncodedInputStream, UTF32<>>("utf32bebom.json"); } TEST_F(EncodingsTest, AutoUTFInputStream) { -#define TEST_FILE(filename) \ - { \ - char buffer[16]; \ - FILE *fp = Open(filename); \ - ASSERT_TRUE(fp != 0); \ - FileReadStream fs(fp, buffer, sizeof(buffer)); \ - AutoUTFInputStream eis(fs); \ - StringStream s(json_); \ - while (eis.Peek() != '\0') { \ - unsigned expected, actual; \ - 
UTF8<>::Decode(s, &expected); \ - AutoUTF::Decode(eis, &actual); \ - EXPECT_EQ(expected, actual); \ - } \ - EXPECT_EQ('\0', s.Peek()); \ - fclose(fp); \ - } - - TEST_FILE("utf8.json"); - TEST_FILE("utf8bom.json"); - TEST_FILE("utf16lebom.json"); - TEST_FILE("utf16bebom.json"); -#undef TEST_FILE + TestAutoUTFInputStream("utf8.json"); + TestAutoUTFInputStream("utf8bom.json"); + TestAutoUTFInputStream("utf16lebom.json"); + TestAutoUTFInputStream("utf16bebom.json"); + TestAutoUTFInputStream("utf32lebom.json"); + TestAutoUTFInputStream("utf32bebom.json"); +} + +TEST_F(EncodingsTest, EncodedOutputStream) { + TestEncodedOutputStream, UTF8<>>("utf8.json", false); + TestEncodedOutputStream, UTF8<>>("utf8bom.json", true); + TestEncodedOutputStream, UTF16<>>("utf16le.json", false); + TestEncodedOutputStream, UTF16<>>("utf16lebom.json", true); + TestEncodedOutputStream, UTF16<>>("utf16be.json", false); + TestEncodedOutputStream, UTF16<>>("utf16bebom.json", true); + TestEncodedOutputStream, UTF32<>>("utf32le.json", false); + TestEncodedOutputStream, UTF32<>>("utf32lebom.json", true); + TestEncodedOutputStream, UTF32<>>("utf32be.json", false); + TestEncodedOutputStream, UTF32<>>("utf32bebom.json", true); +} + +TEST_F(EncodingsTest, AutoUTFOutputStream) { + TestAutoUTFOutputStream(kUTF8, false, "utf8.json"); + TestAutoUTFOutputStream(kUTF8, true, "utf8bom.json"); + TestAutoUTFOutputStream(kUTF16LE, false, "utf16le.json"); + TestAutoUTFOutputStream(kUTF16LE, true, "utf16lebom.json"); + TestAutoUTFOutputStream(kUTF16BE, false, "utf16be.json"); + TestAutoUTFOutputStream(kUTF16BE, true, "utf16bebom.json"); + TestAutoUTFOutputStream(kUTF32LE, false, "utf32le.json"); + TestAutoUTFOutputStream(kUTF32LE, true, "utf32lebom.json"); + TestAutoUTFOutputStream(kUTF32BE, false, "utf32be.json"); + TestAutoUTFOutputStream(kUTF32BE, true, "utf32bebom.json"); }