From a8d631fbc2ffd26d353a46c50bcdc9e2ee456776 Mon Sep 17 00:00:00 2001 From: "miloyip@gmail.com" Date: Mon, 28 Nov 2011 09:30:32 +0000 Subject: [PATCH] Added Transcoder for converting Encoding during parsing. git-svn-id: https://rapidjson.googlecode.com/svn/trunk@39 c5894555-1306-4e8d-425f-1f6f381ee07c --- include/rapidjson/document.h | 33 ++++++---- include/rapidjson/rapidjson.h | 112 +++++++++++++++++++++++++++------- include/rapidjson/reader.h | 24 ++++---- test/unittest/readertest.cpp | 19 ++++-- 4 files changed, 139 insertions(+), 49 deletions(-) diff --git a/include/rapidjson/document.h b/include/rapidjson/document.h index d70756a..22d8311 100644 --- a/include/rapidjson/document.h +++ b/include/rapidjson/document.h @@ -697,11 +697,11 @@ public: \param stream Input stream to be parsed. \return The document itself for fluent API. */ - template - GenericDocument& ParseStream(Stream& stream) { + template + GenericDocument& ParseStream(InputStream& is) { ValueType::SetNull(); // Remove existing root if exist - GenericReader reader; - if (reader.Parse(stream, *this)) { + GenericReader reader; + if (reader.Parse(is, *this)) { RAPIDJSON_ASSERT(stack_.GetSize() == sizeof(ValueType)); // Got one and only one root object RawAssign(*stack_.template Pop(1)); parseError_ = 0; @@ -720,21 +720,31 @@ public: \param str Mutable zero-terminated string to be parsed. \return The document itself for fluent API. */ - template + template GenericDocument& ParseInsitu(Ch* str) { GenericInsituStringStream s(str); - return ParseStream(s); + return ParseStream(s); + } + + template + GenericDocument& ParseInsitu(Ch* str) { + return ParseInsitu(str); } //! Parse JSON text from a read-only string. /*! \tparam parseFlags Combination of ParseFlag (must not contain kParseInsituFlag). \param str Read-only zero-terminated string to be parsed. */ - template + template GenericDocument& Parse(const Ch* str) { RAPIDJSON_ASSERT(!(parseFlags & kParseInsituFlag)); - GenericStringStream s(str); - return ParseStream(s); + GenericStringStream s(str); + return ParseStream(s); + } + + template + GenericDocument& Parse(const Ch* str) { + return Parse(str); } //! Whether a parse error was occured in the last parsing. @@ -752,8 +762,8 @@ public: //! Get the capacity of stack in bytes. size_t GetStackCapacity() const { return stack_.GetCapacity(); } -private: - friend class GenericReader; // for Reader to call the following private handler functions +//private: + //friend class GenericReader; // for Reader to call the following private handler functions // Implementation of Handler void Null() { new (stack_.template Push()) ValueType(); } @@ -785,6 +795,7 @@ private: stack_.template Top()->SetArrayRaw(elements, elementCount, GetAllocator()); } +private: void ClearStack() { if (Allocator::kNeedFree) while (stack_.GetSize() > 0) // Here assumes all elements in stack array are GenericValue (Member is actually 2 GenericValue objects) diff --git a/include/rapidjson/rapidjson.h b/include/rapidjson/rapidjson.h index 19997e7..c646dce 100644 --- a/include/rapidjson/rapidjson.h +++ b/include/rapidjson/rapidjson.h @@ -355,8 +355,62 @@ struct UTF8 { } } + template + static bool Decode(InputStream& is, unsigned* codepoint) { +#define COPY() c = is.Take(); *codepoint = (*codepoint << 6) | ((unsigned char)c & 0x3Fu) +#define TRANS(mask) result &= ((GetType(c) & mask) != 0) +#define TAIL() COPY(); TRANS(0x70) + Ch c = is.Take(); + if (!(c & 0x80)) { + *codepoint = (unsigned char)c; + return true; + } + + unsigned char type = GetType(c); + *codepoint = (0xFF >> type) & (unsigned char)c; + bool result = true; + switch (type) { + case 2: TAIL(); return result; + case 3: TAIL(); TAIL(); return result; + case 4: COPY(); TRANS(0x50); TAIL(); return result; + case 5: COPY(); TRANS(0x10); COPY(); TAIL(); return result; + case 6: TAIL(); TAIL(); TAIL(); return result; + case 10: COPY(); TRANS(0x20); TAIL(); return result; + case 11: COPY(); TRANS(0x60); TAIL(); return result; + default: return false; + } +#undef COPY +#undef TRANS +#undef TAIL + } + template RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) { +#define COPY() os.Put(c = is.Take()) +#define TRANS(mask) result &= ((GetType(c) & mask) != 0) +#define TAIL() COPY(); TRANS(0x70) + Ch c; + COPY(); + if (!(c & 0x80)) + return true; + + bool result = true; + switch (GetType(c)) { + case 2: TAIL(); return result; + case 3: TAIL(); TAIL(); return result; + case 4: COPY(); TRANS(0x50); TAIL(); return result; + case 5: COPY(); TRANS(0x10); COPY(); TAIL(); return result; + case 6: TAIL(); TAIL(); TAIL(); return result; + case 10: COPY(); TRANS(0x20); TAIL(); return result; + case 11: COPY(); TRANS(0x60); TAIL(); return result; + default: return false; + } +#undef COPY +#undef TRANS +#undef TAIL + } + + RAPIDJSON_FORCEINLINE static unsigned char GetType(unsigned char c) { // Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ // With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types. static const unsigned char type[] = { @@ -371,28 +425,7 @@ struct UTF8 { 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, }; -#define COPY() os.Put(c = is.Take()) -#define TRANS(mask) result &= ((type[(unsigned char)c] & mask) != 0) -#define TAIL() COPY(); TRANS(0x70) - Ch c; - COPY(); - if (!(c & 0x80)) - return true; - - bool result = true; - switch (type[(unsigned char)c]) { - case 2: TAIL(); return result; - case 3: TAIL(); TAIL(); return result; - case 4: COPY(); TRANS(0x50); TAIL(); return result; - case 5: COPY(); TRANS(0x10); COPY(); TAIL(); return result; - case 6: TAIL(); TAIL(); TAIL(); return result; - case 10: COPY(); TRANS(0x20); TAIL(); return result; - case 11: COPY(); TRANS(0x60); TAIL(); return result; - default: return false; - } -#undef COPY -#undef TRANS -#undef TAIL + return type[c]; } }; @@ -464,6 +497,41 @@ struct UTF32 { } }; +/////////////////////////////////////////////////////////////////////////////// +// Transcoder + +template +struct Transcoder { + template + static bool Transcode(InputStream& is, OutputStream& os) { + unsigned codepoint; + if (!SourceEncoding::Decode(is, &codepoint)) + return false; + TargetEncoding::Encode(os, codepoint); + return true; + } + + template + static bool Validate(InputStream& is, OutputStream& os) { + return Transcode(is, os); + } +}; + +//! Specialization of Transcoder with same source and target encoding. +template +struct Transcoder { + template + static bool Transcode(InputStream& is, OutputStream& os) { + os.Put(is.Take()); + return true; + } + + template + static bool Validate(InputStream& is, OutputStream& os) { + return Encoding::Validate(is, os); + } +}; + /////////////////////////////////////////////////////////////////////////////// // Stream diff --git a/include/rapidjson/reader.h b/include/rapidjson/reader.h index b5625d8..62ce232 100644 --- a/include/rapidjson/reader.h +++ b/include/rapidjson/reader.h @@ -189,10 +189,10 @@ template<> inline void SkipWhitespace(StringStream& stream) { \tparam Encoding Encoding of both the stream and the parse output. \tparam Allocator Allocator type for stack. */ -template > +template > class GenericReader { public: - typedef typename Encoding::Ch Ch; + typedef typename SourceEncoding::Ch Ch; //! Constructor. /*! \param allocator Optional allocator for allocating stack memory. (Only use for non-destructive parsing) @@ -365,8 +365,8 @@ private: struct StackStream { StackStream(internal::Stack& stack) : stack_(stack), length_(0) {} - void Put(Ch c) { - *stack_.template Push() = c; + void Put(typename TargetEncoding::Ch c) { + *stack_.template Push() = c; ++length_; } internal::Stack& stack_; @@ -382,12 +382,12 @@ private: ParseStringToStream(s, s); size_t length = s.PutEnd(head) - 1; RAPIDJSON_ASSERT(length <= 0xFFFFFFFF); - handler.String(head, SizeType(length), false); + handler.String((typename TargetEncoding::Ch*)head, SizeType(length), false); } else { StackStream stackStream(stack_); ParseStringToStream(s, stackStream); - handler.String(stack_.template Pop(stackStream.length_), stackStream.length_ - 1, true); + handler.String(stack_.template Pop(stackStream.length_), stackStream.length_ - 1, true); } stream = s; // Restore stream } @@ -427,7 +427,7 @@ private: RAPIDJSON_PARSE_ERROR("The second \\u in surrogate pair is invalid", input.Tell() - 2); codepoint = (((codepoint - 0xD800) << 10) | (codepoint2 - 0xDC00)) + 0x10000; } - Encoding::Encode(output, codepoint); + TargetEncoding::Encode(output, codepoint); } else RAPIDJSON_PARSE_ERROR("Unknown escape character", input.Tell() - 1); @@ -441,12 +441,12 @@ private: RAPIDJSON_PARSE_ERROR("lacks ending quotation before the end of string", input.Tell() - 1); else if ((unsigned)c < 0x20) // RFC 4627: unescaped = %x20-21 / %x23-5B / %x5D-10FFFF RAPIDJSON_PARSE_ERROR("Incorrect unescaped character in string", input.Tell() - 1); - else if (parseFlags & kParseValidateEncodingFlag) { - if (!Encoding::Validate(input, output)) + else { + if (parseFlags & kParseValidateEncodingFlag ? + !Transcoder::Validate(input, output) : + !Transcoder::Transcode(input, output)) RAPIDJSON_PARSE_ERROR("Invalid encoding", input.Tell()); } - else - output.Put(input.Take()); // Normal character, just copy } } @@ -632,7 +632,7 @@ private: }; // class GenericReader //! Reader with UTF8 encoding and default allocator. -typedef GenericReader > Reader; +typedef GenericReader, UTF8<> > Reader; } // namespace rapidjson diff --git a/test/unittest/readertest.cpp b/test/unittest/readertest.cpp index c849122..beddd2d 100644 --- a/test/unittest/readertest.cpp +++ b/test/unittest/readertest.cpp @@ -203,14 +203,14 @@ TEST(Reader, ParseString) { Encoding::Ch* buffer = StrDup(x); \ GenericInsituStringStream is(buffer); \ ParseStringHandler h; \ - GenericReader reader; \ + GenericReader reader; \ reader.ParseString(is, h); \ EXPECT_EQ(0, StrCmp(e, h.str_)); \ EXPECT_EQ(StrLen(e), h.length_); \ free(buffer); \ GenericStringStream s(x); \ ParseStringHandler h2; \ - GenericReader reader2; \ + GenericReader reader2; \ reader2.ParseString<0>(s, h2); \ EXPECT_EQ(0, StrCmp(e, h2.str_)); \ EXPECT_EQ(StrLen(e), h2.length_); \ @@ -277,6 +277,17 @@ TEST(Reader, ParseString) { } } +TEST(Reader, ParseString_Transcoding) { + const char* x = "\"Hello\""; + const wchar_t* e = L"Hello"; + GenericStringStream > is(x); + GenericReader, UTF16<> > reader; + ParseStringHandler > h; + reader.ParseString<0>(is, h); + EXPECT_EQ(0, StrCmp::Ch>(e, h.str_)); + EXPECT_EQ(StrLen(e), h.length_); +} + TEST(Reader, ParseString_NonDestructive) { StringStream s("\"Hello\\nWorld\""); ParseStringHandler > h; @@ -403,7 +414,7 @@ TEST(Reader, ParseArray_Error) { strncpy(buffer, str, 1000); \ InsituStringStream s(buffer); \ BaseReaderHandler<> h; \ - GenericReader, CrtAllocator> reader; \ + GenericReader, UTF8<>, CrtAllocator> reader; \ EXPECT_FALSE(reader.Parse<0>(s, h)); \ } @@ -507,7 +518,7 @@ TEST(Reader, ParseObject_Error) { strncpy(buffer, str, 1000); \ InsituStringStream s(buffer); \ BaseReaderHandler<> h; \ - GenericReader, CrtAllocator> reader; \ + GenericReader, UTF8<>, CrtAllocator> reader; \ EXPECT_FALSE(reader.Parse<0>(s, h)); \ }