Refactor encoding concept to use stream for encoding and validation.
git-svn-id: https://rapidjson.googlecode.com/svn/trunk@32 c5894555-1306-4e8d-425f-1f6f381ee07c
This commit is contained in:
parent
b9d932ac9a
commit
25eeff24f3
@ -303,11 +303,19 @@ private:
|
|||||||
concept Encoding {
|
concept Encoding {
|
||||||
typename Ch; //! Type of character.
|
typename Ch; //! Type of character.
|
||||||
|
|
||||||
//! \brief Encode a Unicode codepoint to a buffer.
|
//! \brief Encode a Unicode codepoint to a stream.
|
||||||
//! \param buffer pointer to destination buffer to store the result. It should have sufficient size of encoding one character.
|
//! \param os Output stream.
|
||||||
//! \param codepoint A Unicode codepoint, ranging from 0x0 to 0x10FFFF inclusive.
|
//! \param codepoint A Unicode codepoint, ranging from 0x0 to 0x10FFFF inclusive.
|
||||||
//! \returns the pointer to the next character after the encoded data.
|
template<typename OutputStream>
|
||||||
static Ch* Encode(Ch *buffer, unsigned codepoint);
|
static void Encode(OutputStream& os, unsigned codepoint) {
|
||||||
|
|
||||||
|
//! \brief Validate one Unicode codepoint from an encoded stream.
|
||||||
|
//! \param is Input stream from which the codepoint is read.
|
||||||
|
//! \param os Output stream to which the codepoint is copied.
|
||||||
|
//! \return true if it is valid.
|
||||||
|
//! \note This function only validates and copies the codepoint; it does not actually decode it.
|
||||||
|
template <typename InputStream, typename OutputStream>
|
||||||
|
RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
|
||||||
};
|
};
|
||||||
\endcode
|
\endcode
|
||||||
*/
|
*/
|
||||||
@ -317,6 +325,7 @@ concept Encoding {
|
|||||||
|
|
||||||
//! UTF-8 encoding.
|
//! UTF-8 encoding.
|
||||||
/*! http://en.wikipedia.org/wiki/UTF-8
|
/*! http://en.wikipedia.org/wiki/UTF-8
|
||||||
|
http://tools.ietf.org/html/rfc3629
|
||||||
\tparam CharType Type for storing 8-bit UTF-8 data. Default is char.
|
\tparam CharType Type for storing 8-bit UTF-8 data. Default is char.
|
||||||
\implements Encoding
|
\implements Encoding
|
||||||
*/
|
*/
|
||||||
@ -324,67 +333,70 @@ template<typename CharType = char>
|
|||||||
struct UTF8 {
|
struct UTF8 {
|
||||||
typedef CharType Ch;
|
typedef CharType Ch;
|
||||||
|
|
||||||
static Ch* Encode(Ch *buffer, unsigned codepoint) {
|
template<typename OutputStream>
|
||||||
|
static void Encode(OutputStream& os, unsigned codepoint) {
|
||||||
if (codepoint <= 0x7F)
|
if (codepoint <= 0x7F)
|
||||||
*buffer++ = codepoint & 0xFF;
|
os.Put(codepoint & 0xFF);
|
||||||
else if (codepoint <= 0x7FF) {
|
else if (codepoint <= 0x7FF) {
|
||||||
*buffer++ = 0xC0 | ((codepoint >> 6) & 0xFF);
|
os.Put(0xC0 | ((codepoint >> 6) & 0xFF));
|
||||||
*buffer++ = 0x80 | ((codepoint & 0x3F));
|
os.Put(0x80 | ((codepoint & 0x3F)));
|
||||||
}
|
}
|
||||||
else if (codepoint <= 0xFFFF) {
|
else if (codepoint <= 0xFFFF) {
|
||||||
*buffer++ = 0xE0 | ((codepoint >> 12) & 0xFF);
|
os.Put(0xE0 | ((codepoint >> 12) & 0xFF));
|
||||||
*buffer++ = 0x80 | ((codepoint >> 6) & 0x3F);
|
os.Put(0x80 | ((codepoint >> 6) & 0x3F));
|
||||||
*buffer++ = 0x80 | (codepoint & 0x3F);
|
os.Put(0x80 | (codepoint & 0x3F));
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
|
RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
|
||||||
*buffer++ = 0xF0 | ((codepoint >> 18) & 0xFF);
|
os.Put(0xF0 | ((codepoint >> 18) & 0xFF));
|
||||||
*buffer++ = 0x80 | ((codepoint >> 12) & 0x3F);
|
os.Put(0x80 | ((codepoint >> 12) & 0x3F));
|
||||||
*buffer++ = 0x80 | ((codepoint >> 6) & 0x3F);
|
os.Put(0x80 | ((codepoint >> 6) & 0x3F));
|
||||||
*buffer++ = 0x80 | (codepoint & 0x3F);
|
os.Put(0x80 | (codepoint & 0x3F));
|
||||||
}
|
}
|
||||||
return buffer;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename Stream>
|
template <typename InputStream, typename OutputStream>
|
||||||
RAPIDJSON_FORCEINLINE static Ch* Validate(Ch *buffer, Stream& s) {
|
RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
|
||||||
#define X1 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
|
// http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
|
||||||
#define X5 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
|
static const unsigned char utf8d[] = {
|
||||||
static const char utf8[256] = {
|
//! \todo optimization
|
||||||
X1,X1,X1,X1,X1,X1,X1,X1, // 00-7F 1 byte
|
// The first part of the table maps bytes to character classes that
|
||||||
X5,X5,X5,X5, // 80-BF Continuation
|
// to reduce the size of the transition table and create bitmasks.
|
||||||
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // C0-C1: invalid, C2-CF: 2 bytes
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // D0-DF: 2 bytes
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // E0-EF: 3 bytes
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // F0-F4: 4 bytes
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
|
||||||
|
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
|
||||||
|
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||||
|
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
|
||||||
|
|
||||||
|
// The second part is a transition table that maps a combination
|
||||||
|
// of a state of the automaton and a character class to a state.
|
||||||
|
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
|
||||||
|
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
|
||||||
|
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
|
||||||
|
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
|
||||||
|
12,36,12,12,12,12,12,12,12,12,12,12,
|
||||||
};
|
};
|
||||||
#undef X1
|
Ch c;
|
||||||
#undef X5
|
os.Put(c = is.Take());
|
||||||
|
if ((unsigned char) c <= 0x80)
|
||||||
|
return true;
|
||||||
|
|
||||||
#define TAIL() c = *buffer++ = s.Take(); if ((c & 0xC0) != 0x80) return NULL;
|
unsigned type = utf8d[(unsigned char)c];
|
||||||
|
unsigned state = utf8d[256 + type];
|
||||||
|
if (state == 12)
|
||||||
|
return false;
|
||||||
|
|
||||||
Ch c = *buffer++ = s.Take();
|
while (state) {
|
||||||
if ((unsigned char)c < 0x80u)
|
os.Put(c = is.Take());
|
||||||
return buffer;
|
unsigned type = utf8d[(unsigned char)c];
|
||||||
|
state = utf8d[256 + state + type];
|
||||||
switch(utf8[(unsigned char)c]) {
|
if (state == 12)
|
||||||
case 2:
|
return false;
|
||||||
TAIL();
|
};
|
||||||
return buffer;
|
return true;
|
||||||
|
|
||||||
case 3:
|
|
||||||
TAIL();
|
|
||||||
TAIL();
|
|
||||||
return buffer;
|
|
||||||
|
|
||||||
case 4:
|
|
||||||
TAIL();
|
|
||||||
TAIL();
|
|
||||||
TAIL();
|
|
||||||
return buffer;
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
#undef TAIL
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -393,6 +405,7 @@ struct UTF8 {
|
|||||||
|
|
||||||
//! UTF-16 encoding.
|
//! UTF-16 encoding.
|
||||||
/*! http://en.wikipedia.org/wiki/UTF-16
|
/*! http://en.wikipedia.org/wiki/UTF-16
|
||||||
|
http://tools.ietf.org/html/rfc2781
|
||||||
\tparam CharType Type for storing 16-bit UTF-16 data. Default is wchar_t. C++11 may use char16_t instead.
|
\tparam CharType Type for storing 16-bit UTF-16 data. Default is wchar_t. C++11 may use char16_t instead.
|
||||||
\implements Encoding
|
\implements Encoding
|
||||||
*/
|
*/
|
||||||
@ -400,33 +413,32 @@ template<typename CharType = wchar_t>
|
|||||||
struct UTF16 {
|
struct UTF16 {
|
||||||
typedef CharType Ch;
|
typedef CharType Ch;
|
||||||
|
|
||||||
static Ch* Encode(Ch* buffer, unsigned codepoint) {
|
template<typename OutputStream>
|
||||||
|
static void Encode(OutputStream& os, unsigned codepoint) {
|
||||||
if (codepoint <= 0xFFFF) {
|
if (codepoint <= 0xFFFF) {
|
||||||
RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair
|
RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair
|
||||||
*buffer++ = codepoint;
|
os.Put(codepoint);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
|
RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
|
||||||
unsigned v = codepoint - 0x10000;
|
unsigned v = codepoint - 0x10000;
|
||||||
*buffer++ = (v >> 10) + 0xD800;
|
os.Put((v >> 10) + 0xD800);
|
||||||
*buffer++ = (v & 0x3FF) + 0xDC00;
|
os.Put((v & 0x3FF) + 0xDC00);
|
||||||
}
|
}
|
||||||
return buffer;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename Stream>
|
template <typename InputStream, typename OutputStream>
|
||||||
static Ch* Validate(Ch *buffer, Stream& s) {
|
RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
|
||||||
Ch c = *buffer++ = s.Take();
|
Ch c;
|
||||||
|
os.Put(c = is.Take());
|
||||||
if (c < 0xD800 || c > 0xDFFF)
|
if (c < 0xD800 || c > 0xDFFF)
|
||||||
;
|
return true;
|
||||||
else if (c < 0xDBFF) {
|
else if (c < 0xDBFF) {
|
||||||
Ch c = *buffer++ = s.Take();
|
os.Put(c = is.Take());
|
||||||
if (c < 0xDC00 || c > 0xDFFF)
|
return c >= 0xDC00 && c <= 0xDFFF;
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
return NULL;
|
return false;
|
||||||
return buffer;
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -442,16 +454,17 @@ template<typename CharType = unsigned>
|
|||||||
struct UTF32 {
|
struct UTF32 {
|
||||||
typedef CharType Ch;
|
typedef CharType Ch;
|
||||||
|
|
||||||
static Ch *Encode(Ch* buffer, unsigned codepoint) {
|
template<typename OutputStream>
|
||||||
|
static void Encode(OutputStream& os, unsigned codepoint) {
|
||||||
RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
|
RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
|
||||||
*buffer++ = codepoint;
|
os.Put(codepoint);
|
||||||
return buffer;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename Stream>
|
template <typename InputStream, typename OutputStream>
|
||||||
static Ch* Validate(Ch *buffer, Stream& s) {
|
RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
|
||||||
Ch c = *buffer++ = s.Take();
|
Ch c;
|
||||||
return c <= 0x10FFFF ? buffer : 0;
|
os.Put(c = is.Take());
|
||||||
|
return c <= 0x10FFFF;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -368,6 +368,16 @@ private:
|
|||||||
return codepoint;
|
return codepoint;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct StackStream {
|
||||||
|
StackStream(internal::Stack<Allocator>& stack) : stack_(stack), length_(0) {}
|
||||||
|
void Put(Ch c) {
|
||||||
|
*stack_.template Push<Ch>() = c;
|
||||||
|
++length_;
|
||||||
|
}
|
||||||
|
internal::Stack<Allocator>& stack_;
|
||||||
|
SizeType length_;
|
||||||
|
};
|
||||||
|
|
||||||
// Parse string, handling the prefix and suffix double quotes and escaping.
|
// Parse string, handling the prefix and suffix double quotes and escaping.
|
||||||
template<unsigned parseFlags, typename Stream, typename Handler>
|
template<unsigned parseFlags, typename Stream, typename Handler>
|
||||||
void ParseString(Stream& stream, Handler& handler) {
|
void ParseString(Stream& stream, Handler& handler) {
|
||||||
@ -391,13 +401,13 @@ private:
|
|||||||
else
|
else
|
||||||
len = 0;
|
len = 0;
|
||||||
|
|
||||||
|
StackStream stackStream(stack_);
|
||||||
#define RAPIDJSON_PUT(x) \
|
#define RAPIDJSON_PUT(x) \
|
||||||
do { \
|
do { \
|
||||||
if (parseFlags & kParseInsituFlag) \
|
if (parseFlags & kParseInsituFlag) \
|
||||||
s.Put(x); \
|
s.Put(x); \
|
||||||
else { \
|
else { \
|
||||||
*stack_.template Push<Ch>() = x; \
|
stackStream.Put(x); \
|
||||||
++len; \
|
|
||||||
} \
|
} \
|
||||||
} while(false)
|
} while(false)
|
||||||
|
|
||||||
@ -423,16 +433,10 @@ private:
|
|||||||
codepoint = (((codepoint - 0xD800) << 10) | (codepoint2 - 0xDC00)) + 0x10000;
|
codepoint = (((codepoint - 0xD800) << 10) | (codepoint2 - 0xDC00)) + 0x10000;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ch buffer[4];
|
|
||||||
SizeType count = SizeType(Encoding::Encode(buffer, codepoint) - &buffer[0]);
|
|
||||||
|
|
||||||
if (parseFlags & kParseInsituFlag)
|
if (parseFlags & kParseInsituFlag)
|
||||||
for (SizeType i = 0; i < count; i++)
|
Encoding::Encode(s, codepoint);
|
||||||
s.Put(buffer[i]);
|
else
|
||||||
else {
|
Encoding::Encode(stackStream, codepoint);
|
||||||
memcpy(stack_.template Push<Ch>(count), buffer, count * sizeof(Ch));
|
|
||||||
len += count;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
RAPIDJSON_PARSE_ERROR("Unknown escape character", stream.Tell() - 1);
|
RAPIDJSON_PARSE_ERROR("Unknown escape character", stream.Tell() - 1);
|
||||||
@ -449,7 +453,7 @@ private:
|
|||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
RAPIDJSON_PUT('\0');
|
RAPIDJSON_PUT('\0');
|
||||||
handler.String(stack_.template Pop<Ch>(len), len - 1, true);
|
handler.String(stack_.template Pop<Ch>(stackStream.length_), stackStream.length_ - 1, true);
|
||||||
}
|
}
|
||||||
stream = s; // restore stream
|
stream = s; // restore stream
|
||||||
return;
|
return;
|
||||||
@ -463,24 +467,19 @@ private:
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
else if (parseFlags & kParseValidateEncodingFlag) {
|
else if (parseFlags & kParseValidateEncodingFlag) {
|
||||||
Ch buffer[4];
|
if (parseFlags & kParseInsituFlag) {
|
||||||
Ch* end = Encoding::Validate(&buffer[0], s);
|
if (!Encoding::Validate(s, s)) {
|
||||||
if (end == NULL) {
|
RAPIDJSON_PARSE_ERROR("Invalid encoding", s.Tell());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (!Encoding::Validate(s, stackStream)) {
|
||||||
RAPIDJSON_PARSE_ERROR("Invalid encoding", s.Tell());
|
RAPIDJSON_PARSE_ERROR("Invalid encoding", s.Tell());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (parseFlags & kParseInsituFlag)
|
|
||||||
for (Ch* p = &buffer[0]; p != end; ++p)
|
|
||||||
s.Put(*p);
|
|
||||||
else {
|
|
||||||
SizeType l = SizeType(end - &buffer[0]);
|
|
||||||
Ch* q = stack_.template Push<Ch>(l);
|
|
||||||
for (Ch* p = &buffer[0]; p != end; ++p)
|
|
||||||
*q++ = *p;
|
|
||||||
len += l;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
RAPIDJSON_PUT(s.Take()); // Normal character, just copy
|
RAPIDJSON_PUT(s.Take()); // Normal character, just copy
|
||||||
|
Loading…
x
Reference in New Issue
Block a user