diff --git a/include/rapidjson/rapidjson.h b/include/rapidjson/rapidjson.h index 163be6c..716fa1d 100644 --- a/include/rapidjson/rapidjson.h +++ b/include/rapidjson/rapidjson.h @@ -303,11 +303,19 @@ private: concept Encoding { typename Ch; //! Type of character. - //! \brief Encode a Unicode codepoint to a buffer. - //! \param buffer pointer to destination buffer to store the result. It should have sufficient size of encoding one character. + //! \brief Encode a Unicode codepoint to a stream. + //! \param os Output stream. //! \param codepoint An unicode codepoint, ranging from 0x0 to 0x10FFFF inclusively. - //! \returns the pointer to the next character after the encoded data. - static Ch* Encode(Ch *buffer, unsigned codepoint); + template + static void Encode(OutputStream& os, unsigned codepoint); + + //! \brief Validate one Unicode codepoint from an encoded stream. + //! \param is Input stream to obtain codepoint. + //! \param os Output for copying one codepoint. + //! \return true if it is valid. + //! \note This function only validates and copies the codepoint; it does not actually decode it. + template + RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os); }; \endcode */ @@ -317,6 +325,7 @@ concept Encoding { //! UTF-8 encoding. /*! http://en.wikipedia.org/wiki/UTF-8 + http://tools.ietf.org/html/rfc3629 \tparam CharType Type for storing 8-bit UTF-8 data. Default is char. 
\implements Encoding */ @@ -324,67 +333,70 @@ template struct UTF8 { typedef CharType Ch; - static Ch* Encode(Ch *buffer, unsigned codepoint) { + template + static void Encode(OutputStream& os, unsigned codepoint) { if (codepoint <= 0x7F) - *buffer++ = codepoint & 0xFF; + os.Put(codepoint & 0xFF); else if (codepoint <= 0x7FF) { - *buffer++ = 0xC0 | ((codepoint >> 6) & 0xFF); - *buffer++ = 0x80 | ((codepoint & 0x3F)); + os.Put(0xC0 | ((codepoint >> 6) & 0xFF)); + os.Put(0x80 | ((codepoint & 0x3F))); } else if (codepoint <= 0xFFFF) { - *buffer++ = 0xE0 | ((codepoint >> 12) & 0xFF); - *buffer++ = 0x80 | ((codepoint >> 6) & 0x3F); - *buffer++ = 0x80 | (codepoint & 0x3F); + os.Put(0xE0 | ((codepoint >> 12) & 0xFF)); + os.Put(0x80 | ((codepoint >> 6) & 0x3F)); + os.Put(0x80 | (codepoint & 0x3F)); } else { RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); - *buffer++ = 0xF0 | ((codepoint >> 18) & 0xFF); - *buffer++ = 0x80 | ((codepoint >> 12) & 0x3F); - *buffer++ = 0x80 | ((codepoint >> 6) & 0x3F); - *buffer++ = 0x80 | (codepoint & 0x3F); + os.Put(0xF0 | ((codepoint >> 18) & 0xFF)); + os.Put(0x80 | ((codepoint >> 12) & 0x3F)); + os.Put(0x80 | ((codepoint >> 6) & 0x3F)); + os.Put(0x80 | (codepoint & 0x3F)); } - return buffer; } - template - RAPIDJSON_FORCEINLINE static Ch* Validate(Ch *buffer, Stream& s) { -#define X1 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 -#define X5 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 - static const char utf8[256] = { - X1,X1,X1,X1,X1,X1,X1,X1, // 00-7F 1 byte - X5,X5,X5,X5, // 80-BF Continuation - 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // C0-C1: invalid, C2-CF: 2 bytes - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // D0-DF: 2 bytes - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // E0-EF: 3 bytes - 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // F0-F4: 4 bytes + template + RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) { + // http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + static const unsigned char utf8d[] = { + //! 
\todo optimization + // The first part of the table maps bytes to character classes that + // to reduce the size of the transition table and create bitmasks. + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + + // The second part is a transition table that maps a combination + // of a state of the automaton and a character class to a state. + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, }; -#undef X1 -#undef X5 + Ch c; + os.Put(c = is.Take()); + if ((unsigned char) c < 0x80) + return true; -#define TAIL() c = *buffer++ = s.Take(); if ((c & 0xC0) != 0x80) return NULL; + unsigned type = utf8d[(unsigned char)c]; + unsigned state = utf8d[256 + type]; + if (state == 12) + return false; - Ch c = *buffer++ = s.Take(); - if ((unsigned char)c < 0x80u) - return buffer; - - switch(utf8[(unsigned char)c]) { - case 2: - TAIL(); - return buffer; - - case 3: - TAIL(); - TAIL(); - return buffer; - - case 4: - TAIL(); - TAIL(); - TAIL(); - return buffer; - } - return NULL; -#undef TAIL + while (state) { + os.Put(c = is.Take()); + unsigned type = utf8d[(unsigned char)c]; + state = utf8d[256 + state + type]; + if (state == 12) + return false; + }; + return true; } }; @@ -393,6 +405,7 @@ struct UTF8 { //! 
UTF-16 encoding. /*! http://en.wikipedia.org/wiki/UTF-16 + http://tools.ietf.org/html/rfc2781 \tparam CharType Type for storing 16-bit UTF-16 data. Default is wchar_t. C++11 may use char16_t instead. \implements Encoding */ @@ -400,33 +413,32 @@ template struct UTF16 { typedef CharType Ch; - static Ch* Encode(Ch* buffer, unsigned codepoint) { + template + static void Encode(OutputStream& os, unsigned codepoint) { if (codepoint <= 0xFFFF) { RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair - *buffer++ = codepoint; + os.Put(codepoint); } else { RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); unsigned v = codepoint - 0x10000; - *buffer++ = (v >> 10) + 0xD800; - *buffer++ = (v & 0x3FF) + 0xDC00; + os.Put((v >> 10) + 0xD800); + os.Put((v & 0x3FF) + 0xDC00); } - return buffer; } - template - static Ch* Validate(Ch *buffer, Stream& s) { - Ch c = *buffer++ = s.Take(); + template + RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) { + Ch c; + os.Put(c = is.Take()); if (c < 0xD800 || c > 0xDFFF) - ; + return true; - else if (c < 0xDBFF) { + else if (c <= 0xDBFF) { - Ch c = *buffer++ = s.Take(); - if (c < 0xDC00 || c > 0xDFFF) - return NULL; + os.Put(c = is.Take()); + return c >= 0xDC00 && c <= 0xDFFF; } else - return NULL; - return buffer; + return false; } }; @@ -442,16 +454,17 @@ template struct UTF32 { typedef CharType Ch; - static Ch *Encode(Ch* buffer, unsigned codepoint) { + template + static void Encode(OutputStream& os, unsigned codepoint) { RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); - *buffer++ = codepoint; - return buffer; + os.Put(codepoint); } - template - static Ch* Validate(Ch *buffer, Stream& s) { - Ch c = *buffer++ = s.Take(); - return c <= 0x10FFFF ? 
buffer : 0; + template + RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) { + Ch c; + os.Put(c = is.Take()); + return c <= 0x10FFFF; } }; diff --git a/include/rapidjson/reader.h b/include/rapidjson/reader.h index efd092a..43ad7af 100644 --- a/include/rapidjson/reader.h +++ b/include/rapidjson/reader.h @@ -368,6 +368,16 @@ private: return codepoint; } + struct StackStream { + StackStream(internal::Stack& stack) : stack_(stack), length_(0) {} + void Put(Ch c) { + *stack_.template Push() = c; + ++length_; + } + internal::Stack& stack_; + SizeType length_; + }; + // Parse string, handling the prefix and suffix double quotes and escaping. template void ParseString(Stream& stream, Handler& handler) { @@ -391,13 +401,13 @@ private: else len = 0; + StackStream stackStream(stack_); #define RAPIDJSON_PUT(x) \ do { \ if (parseFlags & kParseInsituFlag) \ s.Put(x); \ else { \ - *stack_.template Push() = x; \ - ++len; \ + stackStream.Put(x); \ } \ } while(false) @@ -423,16 +433,10 @@ private: codepoint = (((codepoint - 0xD800) << 10) | (codepoint2 - 0xDC00)) + 0x10000; } - Ch buffer[4]; - SizeType count = SizeType(Encoding::Encode(buffer, codepoint) - &buffer[0]); - if (parseFlags & kParseInsituFlag) - for (SizeType i = 0; i < count; i++) - s.Put(buffer[i]); - else { - memcpy(stack_.template Push(count), buffer, count * sizeof(Ch)); - len += count; - } + Encoding::Encode(s, codepoint); + else + Encoding::Encode(stackStream, codepoint); } else { RAPIDJSON_PARSE_ERROR("Unknown escape character", stream.Tell() - 1); @@ -449,7 +453,7 @@ private: } else { RAPIDJSON_PUT('\0'); - handler.String(stack_.template Pop(len), len - 1, true); + handler.String(stack_.template Pop(stackStream.length_), stackStream.length_ - 1, true); } stream = s; // restore stream return; @@ -463,24 +467,19 @@ private: return; } else if (parseFlags & kParseValidateEncodingFlag) { - Ch buffer[4]; - Ch* end = Encoding::Validate(&buffer[0], s); - if (end == NULL) { - 
RAPIDJSON_PARSE_ERROR("Invalid encoding", s.Tell()); - return; + if (parseFlags & kParseInsituFlag) { + if (!Encoding::Validate(s, s)) { + RAPIDJSON_PARSE_ERROR("Invalid encoding", s.Tell()); + return; + } } - - if (parseFlags & kParseInsituFlag) - for (Ch* p = &buffer[0]; p != end; ++p) - s.Put(*p); - else { - SizeType l = SizeType(end - &buffer[0]); - Ch* q = stack_.template Push(l); - for (Ch* p = &buffer[0]; p != end; ++p) - *q++ = *p; - len += l; + else + { + if (!Encoding::Validate(s, stackStream)) { + RAPIDJSON_PARSE_ERROR("Invalid encoding", s.Tell()); + return; + } } - } else { RAPIDJSON_PUT(s.Take()); // Normal character, just copy