Rewrite UTF8::Validate() to obtain better performance.

git-svn-id: https://rapidjson.googlecode.com/svn/trunk@35 c5894555-1306-4e8d-425f-1f6f381ee07c
This commit is contained in:
miloyip@gmail.com 2011-11-23 17:59:30 +00:00
parent 827de60fb8
commit a45bcbba7b
3 changed files with 38 additions and 31 deletions

View File

@ -357,46 +357,41 @@ struct UTF8 {
template <typename InputStream, typename OutputStream> template <typename InputStream, typename OutputStream>
RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) { RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
// http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ // Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
static const unsigned char utf8d[] = { // With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types.
//! \todo optimization static const unsigned char type[] = {
// The first part of the table maps bytes to character classes
// to reduce the size of the transition table and create bitmasks.
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,
0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
// The second part is a transition table that maps a combination
// of a state of the automaton and a character class to a state.
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12,
}; };
#define COPY() os.Put(c = is.Take())
#define TRANS(mask) if (!(type[(unsigned char)c] & mask)) return false
#define TAIL() COPY(); TRANS(0x70)
Ch c; Ch c;
os.Put(c = is.Take()); COPY();
if ((unsigned char) c < 0x80) if (!(c & 0x80))
return true; return true;
unsigned type = utf8d[(unsigned char)c]; switch (type[(unsigned char)c]) {
unsigned state = utf8d[256 + type]; case 2: TAIL(); return true;
if (state == 12) case 3: TAIL(); TAIL(); return true;
return false; case 4: COPY(); TRANS(0x50); TAIL(); return true;
case 5: COPY(); TRANS(0x10); COPY(); TAIL(); return true;
while (state) { case 6: TAIL(); TAIL(); TAIL(); return true;
os.Put(c = is.Take()); case 10: COPY(); TRANS(0x20); TAIL(); return true;
unsigned type = utf8d[(unsigned char)c]; case 11: COPY(); TRANS(0x60); TAIL(); return true;
state = utf8d[256 + state + type]; default: return false;
if (state == 12) }
return false; #undef COPY
}; #undef TRANS
return true; #undef TAIL
} }
}; };

View File

@ -31,7 +31,7 @@ struct GenericStringBuffer {
return stack_.template Bottom<Ch>(); return stack_.template Bottom<Ch>();
} }
size_t Size() const { return stack_.Size(); } size_t GetSize() const { return stack_.GetSize(); }
static const size_t kDefaultCapacity = 256; static const size_t kDefaultCapacity = 256;
mutable internal::Stack<Allocator> stack_; mutable internal::Stack<Allocator> stack_;

View File

@ -232,6 +232,18 @@ TEST_F(RapidJson, SIMD_SUFFIX(Whitespace)) {
} }
} }
TEST_F(RapidJson, UTF8_Validate) {
StringBuffer os(0, length_ + 1);
for (int i = 0; i < kTrialCount; i++) {
StringStream is(json_);
os.Clear();
while (is.Peek() != '\0')
UTF8<>::Validate(is, os);
EXPECT_EQ(length_, os.GetSize());
}
}
// Deprecated. // Deprecated.
//TEST_F(RapidJson, FileStream_Read) { //TEST_F(RapidJson, FileStream_Read) {
// for (int i = 0; i < kTrialCount; i++) { // for (int i = 0; i < kTrialCount; i++) {