Added Transcoder for converting Encoding during parsing.

git-svn-id: https://rapidjson.googlecode.com/svn/trunk@39 c5894555-1306-4e8d-425f-1f6f381ee07c
This commit is contained in:
miloyip@gmail.com 2011-11-28 09:30:32 +00:00
parent bdf6da641e
commit a8d631fbc2
4 changed files with 139 additions and 49 deletions

View File

@ -697,11 +697,11 @@ public:
\param stream Input stream to be parsed. \param stream Input stream to be parsed.
\return The document itself for fluent API. \return The document itself for fluent API.
*/ */
template <unsigned parseFlags, typename Stream> template <unsigned parseFlags, typename SourceEncoding, typename InputStream>
GenericDocument& ParseStream(Stream& stream) { GenericDocument& ParseStream(InputStream& is) {
ValueType::SetNull(); // Remove existing root if exist ValueType::SetNull(); // Remove existing root if exist
GenericReader<Encoding> reader; GenericReader<SourceEncoding, Encoding> reader;
if (reader.Parse<parseFlags>(stream, *this)) { if (reader.Parse<parseFlags>(is, *this)) {
RAPIDJSON_ASSERT(stack_.GetSize() == sizeof(ValueType)); // Got one and only one root object RAPIDJSON_ASSERT(stack_.GetSize() == sizeof(ValueType)); // Got one and only one root object
RawAssign(*stack_.template Pop<ValueType>(1)); RawAssign(*stack_.template Pop<ValueType>(1));
parseError_ = 0; parseError_ = 0;
@ -720,21 +720,31 @@ public:
\param str Mutable zero-terminated string to be parsed. \param str Mutable zero-terminated string to be parsed.
\return The document itself for fluent API. \return The document itself for fluent API.
*/ */
template <unsigned parseFlags> template <unsigned parseFlags, typename SourceEncoding>
GenericDocument& ParseInsitu(Ch* str) { GenericDocument& ParseInsitu(Ch* str) {
GenericInsituStringStream<Encoding> s(str); GenericInsituStringStream<Encoding> s(str);
return ParseStream<parseFlags | kParseInsituFlag>(s); return ParseStream<parseFlags | kParseInsituFlag, SourceEncoding>(s);
}
template <unsigned parseFlags>
GenericDocument& ParseInsitu(Ch* str) {
return ParseInsitu<parseFlags, Encoding>(str);
} }
//! Parse JSON text from a read-only string. //! Parse JSON text from a read-only string.
/*! \tparam parseFlags Combination of ParseFlag (must not contain kParseInsituFlag). /*! \tparam parseFlags Combination of ParseFlag (must not contain kParseInsituFlag).
\param str Read-only zero-terminated string to be parsed. \param str Read-only zero-terminated string to be parsed.
*/ */
template <unsigned parseFlags> template <unsigned parseFlags, typename SourceEncoding>
GenericDocument& Parse(const Ch* str) { GenericDocument& Parse(const Ch* str) {
RAPIDJSON_ASSERT(!(parseFlags & kParseInsituFlag)); RAPIDJSON_ASSERT(!(parseFlags & kParseInsituFlag));
GenericStringStream<Encoding> s(str); GenericStringStream<SourceEncoding> s(str);
return ParseStream<parseFlags>(s); return ParseStream<parseFlags, SourceEncoding>(s);
}
template <unsigned parseFlags>
GenericDocument& Parse(const Ch* str) {
return Parse<parseFlags, Encoding>(str);
} }
//! Whether a parse error has occurred in the last parsing. //! Whether a parse error has occurred in the last parsing.
@ -752,8 +762,8 @@ public:
//! Get the capacity of stack in bytes. //! Get the capacity of stack in bytes.
size_t GetStackCapacity() const { return stack_.GetCapacity(); } size_t GetStackCapacity() const { return stack_.GetCapacity(); }
private: //private:
friend class GenericReader<Encoding>; // for Reader to call the following private handler functions //friend class GenericReader<Encoding>; // for Reader to call the following private handler functions
// Implementation of Handler // Implementation of Handler
void Null() { new (stack_.template Push<ValueType>()) ValueType(); } void Null() { new (stack_.template Push<ValueType>()) ValueType(); }
@ -785,6 +795,7 @@ private:
stack_.template Top<ValueType>()->SetArrayRaw(elements, elementCount, GetAllocator()); stack_.template Top<ValueType>()->SetArrayRaw(elements, elementCount, GetAllocator());
} }
private:
void ClearStack() { void ClearStack() {
if (Allocator::kNeedFree) if (Allocator::kNeedFree)
while (stack_.GetSize() > 0) // Here assumes all elements in stack array are GenericValue (Member is actually 2 GenericValue objects) while (stack_.GetSize() > 0) // Here assumes all elements in stack array are GenericValue (Member is actually 2 GenericValue objects)

View File

@ -355,8 +355,62 @@ struct UTF8 {
} }
} }
//! Decode one UTF-8 sequence from the input stream into a Unicode codepoint.
/*! The lead byte and every continuation byte that the lead byte calls for are
    consumed, even if one of them turns out to be invalid.
    \tparam InputStream Stream type providing Take(), which returns the next code unit.
    \param is Input stream positioned at the first byte of a sequence.
    \param codepoint Receives the decoded codepoint; only meaningful when true is returned.
    \return true if the consumed bytes form a well-formed UTF-8 sequence, false otherwise.
*/
template <typename InputStream>
static bool Decode(InputStream& is, unsigned* codepoint) {
// COPY: take the next byte and append its low 6 bits to the codepoint.
#define COPY() c = is.Take(); *codepoint = (*codepoint << 6) | ((unsigned char)c & 0x3Fu)
// TRANS: the byte just taken must belong to one of the types selected by mask.
#define TRANS(mask) result &= ((GetType(c) & mask) != 0)
// TAIL: take one continuation byte of any kind (types 0x10, 0x20 or 0x40).
#define TAIL() COPY(); TRANS(0x70)
    Ch c = is.Take();
    if (!(c & 0x80)) {
        // Single-byte (ASCII) sequence.
        *codepoint = (unsigned char)c;
        return true;
    }

    unsigned char type = GetType(c);
    // For a valid lead byte, (0xFF >> type) masks off the length-marker bits.
    // Stray continuation bytes are typed 0x20 / 0x40: shifting an int by that
    // much is undefined, and they are rejected by the default case below anyway.
    *codepoint = (type >= 32) ? 0u : ((0xFFu >> type) & (unsigned char)c);
    bool result = true;
    switch (type) {
    case 2:  TAIL(); return result;                                // C2..DF
    case 3:  TAIL(); TAIL(); return result;                        // E1..EC, EE, EF
    case 4:  COPY(); TRANS(0x50); TAIL(); return result;           // ED: second byte 80..9F
    case 5:  COPY(); TRANS(0x10); TAIL(); TAIL(); return result;   // F4: second byte 80..8F
    case 6:  TAIL(); TAIL(); TAIL(); return result;                // F1..F3
    case 10: COPY(); TRANS(0x20); TAIL(); return result;           // E0: second byte A0..BF
    case 11: COPY(); TRANS(0x60); TAIL(); TAIL(); return result;   // F0: second byte 90..BF
    default: return false;                                         // not a valid lead byte
    }
#undef COPY
#undef TRANS
#undef TAIL
}
//! Copy one UTF-8 sequence from input to output while checking that it is well-formed.
/*! Every byte taken from the input is written to the output, including bytes
    that fail validation.
    \tparam InputStream Stream type providing Take().
    \tparam OutputStream Stream type providing Put().
    \param is Input stream positioned at the first byte of a sequence.
    \param os Output stream receiving the copied bytes.
    \return true if the copied bytes form a well-formed UTF-8 sequence, false otherwise.
*/
template <typename InputStream, typename OutputStream>
RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
// COPY: move the next byte from input to output, remembering it in c.
#define COPY() os.Put(c = is.Take())
// TRANS: the byte just copied must belong to one of the types selected by mask.
#define TRANS(mask) result &= ((GetType(c) & mask) != 0)
// TAIL: copy one continuation byte of any kind (types 0x10, 0x20 or 0x40).
#define TAIL() COPY(); TRANS(0x70)
    Ch c;
    COPY();
    if (!(c & 0x80))
        return true;    // Single-byte (ASCII) sequence

    bool result = true;
    switch (GetType(c)) {
    case 2:  TAIL(); return result;                                // C2..DF
    case 3:  TAIL(); TAIL(); return result;                        // E1..EC, EE, EF
    case 4:  COPY(); TRANS(0x50); TAIL(); return result;           // ED: second byte 80..9F
    case 5:  COPY(); TRANS(0x10); TAIL(); TAIL(); return result;   // F4: second byte 80..8F
    case 6:  TAIL(); TAIL(); TAIL(); return result;                // F1..F3
    case 10: COPY(); TRANS(0x20); TAIL(); return result;           // E0: second byte A0..BF
    case 11: COPY(); TRANS(0x60); TAIL(); TAIL(); return result;   // F0: second byte 90..BF
    default: return false;                                         // not a valid lead byte
    }
#undef COPY
#undef TRANS
#undef TAIL
}
RAPIDJSON_FORCEINLINE static unsigned char GetType(unsigned char c) {
// Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ // Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
// With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types. // With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types.
static const unsigned char type[] = { static const unsigned char type[] = {
@ -371,28 +425,7 @@ struct UTF8 {
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
}; };
#define COPY() os.Put(c = is.Take()) return type[c];
#define TRANS(mask) result &= ((type[(unsigned char)c] & mask) != 0)
#define TAIL() COPY(); TRANS(0x70)
Ch c;
COPY();
if (!(c & 0x80))
return true;
bool result = true;
switch (type[(unsigned char)c]) {
case 2: TAIL(); return result;
case 3: TAIL(); TAIL(); return result;
case 4: COPY(); TRANS(0x50); TAIL(); return result;
case 5: COPY(); TRANS(0x10); COPY(); TAIL(); return result;
case 6: TAIL(); TAIL(); TAIL(); return result;
case 10: COPY(); TRANS(0x20); TAIL(); return result;
case 11: COPY(); TRANS(0x60); TAIL(); return result;
default: return false;
}
#undef COPY
#undef TRANS
#undef TAIL
} }
}; };
@ -464,6 +497,41 @@ struct UTF32 {
} }
}; };
///////////////////////////////////////////////////////////////////////////////
// Transcoder
//! Converts text from one encoding to another, one codepoint at a time.
/*! \tparam SourceEncoding Encoding of the input; must provide static Decode(is, unsigned*).
    \tparam TargetEncoding Encoding of the output; must provide static Encode(os, unsigned).
*/
template<typename SourceEncoding, typename TargetEncoding>
struct Transcoder {
    //! Take one codepoint from the input stream and write it in the target encoding.
    /*! \return false if decoding failed, in which case nothing is written to the output. */
    template<typename InputStream, typename OutputStream>
    static bool Transcode(InputStream& is, OutputStream& os) {
        unsigned decoded;
        const bool ok = SourceEncoding::Decode(is, &decoded);
        if (ok)
            TargetEncoding::Encode(os, decoded);
        return ok;
    }

    //! Validating variant; identical to Transcode() for differing encodings.
    template<typename InputStream, typename OutputStream>
    static bool Validate(InputStream& is, OutputStream& os) {
        return Transcoder::Transcode(is, os);
    }
};

//! Specialization of Transcoder with same source and target encoding.
/*! No conversion is needed, so code units are passed through unchanged. */
template<typename Encoding>
struct Transcoder<Encoding, Encoding> {
    //! Copy a single code unit from input to output without inspecting it.
    /*! \return Always true. */
    template<typename InputStream, typename OutputStream>
    static bool Transcode(InputStream& is, OutputStream& os) {
        os.Put(is.Take());
        return true;
    }

    //! Delegate to the encoding's own Validate().
    template<typename InputStream, typename OutputStream>
    static bool Validate(InputStream& is, OutputStream& os) {
        return Encoding::Validate(is, os);
    }
};
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Stream // Stream

View File

@ -189,10 +189,10 @@ template<> inline void SkipWhitespace(StringStream& stream) {
\tparam Encoding Encoding of both the stream and the parse output. \tparam Encoding Encoding of both the stream and the parse output.
\tparam Allocator Allocator type for stack. \tparam Allocator Allocator type for stack.
*/ */
template <typename Encoding, typename Allocator = MemoryPoolAllocator<> > template <typename SourceEncoding, typename TargetEncoding, typename Allocator = MemoryPoolAllocator<> >
class GenericReader { class GenericReader {
public: public:
typedef typename Encoding::Ch Ch; typedef typename SourceEncoding::Ch Ch;
//! Constructor. //! Constructor.
/*! \param allocator Optional allocator for allocating stack memory. (Only use for non-destructive parsing) /*! \param allocator Optional allocator for allocating stack memory. (Only use for non-destructive parsing)
@ -365,8 +365,8 @@ private:
struct StackStream { struct StackStream {
StackStream(internal::Stack<Allocator>& stack) : stack_(stack), length_(0) {} StackStream(internal::Stack<Allocator>& stack) : stack_(stack), length_(0) {}
void Put(Ch c) { void Put(typename TargetEncoding::Ch c) {
*stack_.template Push<Ch>() = c; *stack_.template Push<typename TargetEncoding::Ch>() = c;
++length_; ++length_;
} }
internal::Stack<Allocator>& stack_; internal::Stack<Allocator>& stack_;
@ -382,12 +382,12 @@ private:
ParseStringToStream<parseFlags>(s, s); ParseStringToStream<parseFlags>(s, s);
size_t length = s.PutEnd(head) - 1; size_t length = s.PutEnd(head) - 1;
RAPIDJSON_ASSERT(length <= 0xFFFFFFFF); RAPIDJSON_ASSERT(length <= 0xFFFFFFFF);
handler.String(head, SizeType(length), false); handler.String((typename TargetEncoding::Ch*)head, SizeType(length), false);
} }
else { else {
StackStream stackStream(stack_); StackStream stackStream(stack_);
ParseStringToStream<parseFlags>(s, stackStream); ParseStringToStream<parseFlags>(s, stackStream);
handler.String(stack_.template Pop<Ch>(stackStream.length_), stackStream.length_ - 1, true); handler.String(stack_.template Pop<typename TargetEncoding::Ch>(stackStream.length_), stackStream.length_ - 1, true);
} }
stream = s; // Restore stream stream = s; // Restore stream
} }
@ -427,7 +427,7 @@ private:
RAPIDJSON_PARSE_ERROR("The second \\u in surrogate pair is invalid", input.Tell() - 2); RAPIDJSON_PARSE_ERROR("The second \\u in surrogate pair is invalid", input.Tell() - 2);
codepoint = (((codepoint - 0xD800) << 10) | (codepoint2 - 0xDC00)) + 0x10000; codepoint = (((codepoint - 0xD800) << 10) | (codepoint2 - 0xDC00)) + 0x10000;
} }
Encoding::Encode(output, codepoint); TargetEncoding::Encode(output, codepoint);
} }
else else
RAPIDJSON_PARSE_ERROR("Unknown escape character", input.Tell() - 1); RAPIDJSON_PARSE_ERROR("Unknown escape character", input.Tell() - 1);
@ -441,12 +441,12 @@ private:
RAPIDJSON_PARSE_ERROR("lacks ending quotation before the end of string", input.Tell() - 1); RAPIDJSON_PARSE_ERROR("lacks ending quotation before the end of string", input.Tell() - 1);
else if ((unsigned)c < 0x20) // RFC 4627: unescaped = %x20-21 / %x23-5B / %x5D-10FFFF else if ((unsigned)c < 0x20) // RFC 4627: unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
RAPIDJSON_PARSE_ERROR("Incorrect unescaped character in string", input.Tell() - 1); RAPIDJSON_PARSE_ERROR("Incorrect unescaped character in string", input.Tell() - 1);
else if (parseFlags & kParseValidateEncodingFlag) { else {
if (!Encoding::Validate(input, output)) if (parseFlags & kParseValidateEncodingFlag ?
!Transcoder<SourceEncoding, TargetEncoding>::Validate(input, output) :
!Transcoder<SourceEncoding, TargetEncoding>::Transcode(input, output))
RAPIDJSON_PARSE_ERROR("Invalid encoding", input.Tell()); RAPIDJSON_PARSE_ERROR("Invalid encoding", input.Tell());
} }
else
output.Put(input.Take()); // Normal character, just copy
} }
} }
@ -632,7 +632,7 @@ private:
}; // class GenericReader }; // class GenericReader
//! Reader with UTF8 encoding and default allocator. //! Reader with UTF8 encoding and default allocator.
typedef GenericReader<UTF8<> > Reader; typedef GenericReader<UTF8<>, UTF8<> > Reader;
} // namespace rapidjson } // namespace rapidjson

View File

@ -203,14 +203,14 @@ TEST(Reader, ParseString) {
Encoding::Ch* buffer = StrDup(x); \ Encoding::Ch* buffer = StrDup(x); \
GenericInsituStringStream<Encoding> is(buffer); \ GenericInsituStringStream<Encoding> is(buffer); \
ParseStringHandler<Encoding> h; \ ParseStringHandler<Encoding> h; \
GenericReader<Encoding> reader; \ GenericReader<Encoding, Encoding> reader; \
reader.ParseString<kParseInsituFlag | kParseValidateEncodingFlag>(is, h); \ reader.ParseString<kParseInsituFlag | kParseValidateEncodingFlag>(is, h); \
EXPECT_EQ(0, StrCmp<Encoding::Ch>(e, h.str_)); \ EXPECT_EQ(0, StrCmp<Encoding::Ch>(e, h.str_)); \
EXPECT_EQ(StrLen(e), h.length_); \ EXPECT_EQ(StrLen(e), h.length_); \
free(buffer); \ free(buffer); \
GenericStringStream<Encoding> s(x); \ GenericStringStream<Encoding> s(x); \
ParseStringHandler<Encoding> h2; \ ParseStringHandler<Encoding> h2; \
GenericReader<Encoding> reader2; \ GenericReader<Encoding, Encoding> reader2; \
reader2.ParseString<0>(s, h2); \ reader2.ParseString<0>(s, h2); \
EXPECT_EQ(0, StrCmp<Encoding::Ch>(e, h2.str_)); \ EXPECT_EQ(0, StrCmp<Encoding::Ch>(e, h2.str_)); \
EXPECT_EQ(StrLen(e), h2.length_); \ EXPECT_EQ(StrLen(e), h2.length_); \
@ -277,6 +277,17 @@ TEST(Reader, ParseString) {
} }
} }
// Parses a UTF-8 JSON string with a UTF8 -> UTF16 reader and checks the string handed to the handler.
TEST(Reader, ParseString_Transcoding) {
    const char* json = "\"Hello\"";
    const wchar_t* expected = L"Hello";

    GenericStringStream<UTF8<> > input(json);
    ParseStringHandler<UTF16<> > handler;
    GenericReader<UTF8<>, UTF16<> > reader;
    reader.ParseString<0>(input, handler);

    EXPECT_EQ(0, StrCmp<UTF16<>::Ch>(expected, handler.str_));
    EXPECT_EQ(StrLen(expected), handler.length_);
}
TEST(Reader, ParseString_NonDestructive) { TEST(Reader, ParseString_NonDestructive) {
StringStream s("\"Hello\\nWorld\""); StringStream s("\"Hello\\nWorld\"");
ParseStringHandler<UTF8<> > h; ParseStringHandler<UTF8<> > h;
@ -403,7 +414,7 @@ TEST(Reader, ParseArray_Error) {
strncpy(buffer, str, 1000); \ strncpy(buffer, str, 1000); \
InsituStringStream s(buffer); \ InsituStringStream s(buffer); \
BaseReaderHandler<> h; \ BaseReaderHandler<> h; \
GenericReader<UTF8<>, CrtAllocator> reader; \ GenericReader<UTF8<>, UTF8<>, CrtAllocator> reader; \
EXPECT_FALSE(reader.Parse<0>(s, h)); \ EXPECT_FALSE(reader.Parse<0>(s, h)); \
} }
@ -507,7 +518,7 @@ TEST(Reader, ParseObject_Error) {
strncpy(buffer, str, 1000); \ strncpy(buffer, str, 1000); \
InsituStringStream s(buffer); \ InsituStringStream s(buffer); \
BaseReaderHandler<> h; \ BaseReaderHandler<> h; \
GenericReader<UTF8<>, CrtAllocator> reader; \ GenericReader<UTF8<>, UTF8<>, CrtAllocator> reader; \
EXPECT_FALSE(reader.Parse<0>(s, h)); \ EXPECT_FALSE(reader.Parse<0>(s, h)); \
} }