diff --git a/include/rapidjson/encodedstream.h b/include/rapidjson/encodedstream.h index 0b5a499..37daaf6 100644 --- a/include/rapidjson/encodedstream.h +++ b/include/rapidjson/encodedstream.h @@ -63,12 +63,16 @@ public: typedef CharType Ch; AutoUTFInputStream(InputStream& is, UTFType type = kUTF8) : is_(is), type_(type) { - TakeBOM(is); - Read(); + DetectType(is); + static const TakeFunc f[] = { ENCODINGS_FUNC(Take) }; + takeFunc_ = f[type_]; + current_ = takeFunc_(is_); } + UTFType GetType() const { return type_; } + Ch Peek() const { return current_; } - Ch Take() { Ch c = current_; Read(); return c; } + Ch Take() { Ch c = current_; current_ = takeFunc_(is_); return c; } size_t Tell() const { is_.Tell(); } // Not implemented @@ -78,21 +82,47 @@ public: size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } private: - friend struct AutoUTF; + // Detect encoding type with BOM or RFC 4627 + void DetectType(InputStream& is) { + // BOM (Byte Order Mark): + // 00 00 FE FF UTF-32BE + // FF FE 00 00 UTF-32LE + // FE FF UTF-16BE + // FF FE UTF-16LE + // EF BB BF UTF-8 - void TakeBOM(InputStream& is) { -#define ASSUME(x) if ((unsigned char)is.Peek() != x) break; is.Take() - switch ((unsigned char)is.Peek()) { - case 0x00: is.Take(); ASSUME(0x00); ASSUME(0xFE); ASSUME(0xFF); type_ = kUTF32BE; break; - case 0xEF: is.Take(); ASSUME(0xBB); ASSUME(0xBF); type_ = kUTF8; break; - case 0xFE: is.Take(); ASSUME(0xFF); type_ = kUTF16BE; break; - case 0xFF: is.Take(); ASSUME(0xFE); - if (is.Peek() == 0x00) { - is.Take(); ASSUME(0x00); type_ = kUTF32LE; break; - } - type_ = kUTF16LE; + const unsigned char* c = (const unsigned char *)is.Peek4(); + if (!c) + return; + + unsigned bom = c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24); + if (bom == 0xFFFE0000) { type_ = kUTF32BE; is.Take(); is.Take(); is.Take(); is.Take(); goto sizecheck; } + else if (bom == 0x0000FEFF) { type_ = kUTF32LE; is.Take(); is.Take(); is.Take(); is.Take(); goto sizecheck; } + else if ((bom & 0xFFFF) == 0xFFFE) { type_ = kUTF16BE; is.Take(); is.Take(); goto sizecheck; } + else if ((bom & 0xFFFF) == 0xFEFF) { type_ = kUTF16LE; is.Take(); is.Take(); goto sizecheck; } + else if ((bom & 0xFFFFFF) == 0xBFBBEF) { type_ = kUTF8; is.Take(); is.Take(); is.Take(); goto sizecheck; } + + // RFC 4627: Section 3 + // "Since the first two characters of a JSON text will always be ASCII + // characters [RFC0020], it is possible to determine whether an octet + // stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking + // at the pattern of nulls in the first four octets." + // 00 00 00 xx UTF-32BE + // 00 xx 00 xx UTF-16BE + // xx 00 00 00 UTF-32LE + // xx 00 xx 00 UTF-16LE + // xx xx xx xx UTF-8 + + unsigned pattern = (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0); + switch (pattern) { + case 0x08: type_ = kUTF32BE; break; + case 0x0A: type_ = kUTF16BE; break; + case 0x01: type_ = kUTF32LE; break; + case 0x05: type_ = kUTF16LE; break; + case 0x0F: type_ = kUTF8; break; } -#undef ASSUME + + sizecheck: // RUntime check whether the size of character type is sufficient. It only perform checks with assertion. switch (type_) { case kUTF16LE: @@ -106,15 +136,11 @@ private: } } - void Read() { - typedef Ch (*TakeFunc)(InputStream& is); - static const TakeFunc f[] = { ENCODINGS_FUNC(Take) }; - current_ = f[type_](is_); - } - + typedef Ch (*TakeFunc)(InputStream& is); InputStream& is_; UTFType type_; Ch current_; + TakeFunc takeFunc_; }; template @@ -135,14 +161,17 @@ public: break; } + static const PutFunc f[] = { ENCODINGS_FUNC(Put) }; + putFunc_ = f[type_]; + if (putBOM) PutBOM(); } + UTFType GetType() const { return type_; } + void Put(Ch c) { - typedef void (*PutFunc)(OutputStream&, Ch); - static const PutFunc f[] = { ENCODINGS_FUNC(Put) }; - f[type_](os_, c); + putFunc_(os_, c); } void Flush() { os_.Flush(); } @@ -155,17 +184,17 @@ public: size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } private: - friend struct AutoUTF; - void PutBOM() { typedef void (*PutBOMFunc)(OutputStream&); static const PutBOMFunc f[] = { ENCODINGS_FUNC(PutBOM) }; f[type_](os_); } + typedef void (*PutFunc)(OutputStream&, Ch); OutputStream& os_; UTFType type_; + PutFunc putFunc_; }; #undef ENCODINGS_FUNC diff --git a/include/rapidjson/encodings.h b/include/rapidjson/encodings.h index 66f493c..98c9481 100644 --- a/include/rapidjson/encodings.h +++ b/include/rapidjson/encodings.h @@ -99,7 +99,7 @@ struct UTF8 { template RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) { #define COPY() os.Put(c = is.Take()) -#define TRANS(mask) result &= ((GetType(c) & mask) != 0) +#define TRANS(mask) result &= ((GetType((unsigned char)c) & mask) != 0) #define TAIL() COPY(); TRANS(0x70) Ch c; COPY(); @@ -107,7 +107,7 @@ struct UTF8 { return true; bool result = true; - switch (GetType(c)) { + switch (GetType((unsigned char)c)) { case 2: TAIL(); return result; case 3: TAIL(); TAIL(); return result; case 4: COPY(); TRANS(0x50); TAIL(); return result; @@ -397,21 +397,21 @@ struct AutoUTF { RAPIDJSON_FORCEINLINE static void Encode(OutputStream& os, unsigned codepoint) { typedef void (*EncodeFunc)(OutputStream&, unsigned); static const EncodeFunc f[] = { ENCODINGS_FUNC(Encode) }; - (*f[os.type_])(os, codepoint); + (*f[os.GetType()])(os, codepoint); } template RAPIDJSON_FORCEINLINE static bool Decode(InputStream& is, unsigned* codepoint) { typedef bool (*DecodeFunc)(InputStream&, unsigned*); static const DecodeFunc f[] = { ENCODINGS_FUNC(Decode) }; - return (*f[is.type_])(is, codepoint); + return (*f[is.GetType()])(is, codepoint); } template RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) { typedef bool (*ValidateFunc)(InputStream&, unsigned*); static const ValidateFunc f[] = { ENCODINGS_FUNC(Validate) }; - return (*f[is.type_])(is, os); + return (*f[is.GetType()])(is, os); } #undef ENCODINGS_FUNC diff --git a/include/rapidjson/filereadstream.h b/include/rapidjson/filereadstream.h index 06b9c18..41e54dc 100644 --- a/include/rapidjson/filereadstream.h +++ b/include/rapidjson/filereadstream.h @@ -16,6 +16,7 @@ public: FileReadStream(FILE* fp, char* buffer, size_t bufferSize) : fp_(fp), buffer_(buffer), bufferSize_(bufferSize), bufferLast_(0), current_(buffer_), readCount_(0), count_(0), eof_(false) { RAPIDJSON_ASSERT(fp_ != 0); + RAPIDJSON_ASSERT(bufferSize >= 4); Read(); } @@ -29,6 +30,11 @@ public: char* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } size_t PutEnd(char*) { RAPIDJSON_ASSERT(false); return 0; } + // For encoding detection only. + const char* Peek4() const { + return (current_ + 4 <= bufferLast_) ? current_ : 0; + } + private: void Read() { if (current_ < bufferLast_) diff --git a/test/unittest/encodingstest.cpp b/test/unittest/encodingstest.cpp index 322a73e..81efdf8 100644 --- a/test/unittest/encodingstest.cpp +++ b/test/unittest/encodingstest.cpp @@ -151,23 +151,27 @@ TEST_F(EncodingsTest, EncodedInputStream) { TEST_F(EncodingsTest, AutoUTFInputStream) { TestAutoUTFInputStream("utf8.json"); TestAutoUTFInputStream("utf8bom.json"); + TestAutoUTFInputStream("utf16le.json"); TestAutoUTFInputStream("utf16lebom.json"); + TestAutoUTFInputStream("utf16be.json"); TestAutoUTFInputStream("utf16bebom.json"); + TestAutoUTFInputStream("utf32le.json"); TestAutoUTFInputStream("utf32lebom.json"); + TestAutoUTFInputStream("utf32be.json"); TestAutoUTFInputStream("utf32bebom.json"); } TEST_F(EncodingsTest, EncodedOutputStream) { TestEncodedOutputStream, UTF8<> >("utf8.json", false); - TestEncodedOutputStream, UTF8<> >("utf8bom.json", true); + TestEncodedOutputStream, UTF8<> >("utf8bom.json", true); TestEncodedOutputStream, UTF16<> >("utf16le.json", false); - TestEncodedOutputStream, UTF16<> >("utf16lebom.json", true); + TestEncodedOutputStream, UTF16<> >("utf16lebom.json",true); TestEncodedOutputStream, UTF16<> >("utf16be.json", false); - TestEncodedOutputStream, UTF16<> >("utf16bebom.json", true); + TestEncodedOutputStream, UTF16<> >("utf16bebom.json",true); TestEncodedOutputStream, UTF32<> >("utf32le.json", false); - TestEncodedOutputStream, UTF32<> >("utf32lebom.json", true); + TestEncodedOutputStream, UTF32<> >("utf32lebom.json",true); TestEncodedOutputStream, UTF32<> >("utf32be.json", false); - TestEncodedOutputStream, UTF32<> >("utf32bebom.json", true); + TestEncodedOutputStream, UTF32<> >("utf32bebom.json",true); } TEST_F(EncodingsTest, AutoUTFOutputStream) {