Rewrite UTF8::Validate() to obtain better performance.
git-svn-id: https://rapidjson.googlecode.com/svn/trunk@35 c5894555-1306-4e8d-425f-1f6f381ee07c
This commit is contained in:
parent
827de60fb8
commit
a45bcbba7b
@ -357,46 +357,41 @@ struct UTF8 {
|
|||||||
|
|
||||||
template <typename InputStream, typename OutputStream>
|
template <typename InputStream, typename OutputStream>
|
||||||
RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
|
RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
|
||||||
// http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
|
// Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
|
||||||
static const unsigned char utf8d[] = {
|
// With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types.
|
||||||
//! \todo optimization
|
static const unsigned char type[] = {
|
||||||
// The first part of the table maps bytes to character classes that
|
|
||||||
// to reduce the size of the transition table and create bitmasks.
|
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
|
0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,
|
||||||
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
|
0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,
|
||||||
|
0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
|
||||||
|
0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
|
||||||
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||||
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
|
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
|
||||||
|
|
||||||
// The second part is a transition table that maps a combination
|
|
||||||
// of a state of the automaton and a character class to a state.
|
|
||||||
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
|
|
||||||
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
|
|
||||||
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
|
|
||||||
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
|
|
||||||
12,36,12,12,12,12,12,12,12,12,12,12,
|
|
||||||
};
|
};
|
||||||
|
#define COPY() os.Put(c = is.Take())
|
||||||
|
#define TRANS(mask) if (!(type[(unsigned char)c] & mask)) return false
|
||||||
|
#define TAIL() COPY(); TRANS(0x70)
|
||||||
Ch c;
|
Ch c;
|
||||||
os.Put(c = is.Take());
|
COPY();
|
||||||
if ((unsigned char) c < 0x80)
|
if (!(c & 0x80))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
unsigned type = utf8d[(unsigned char)c];
|
switch (type[(unsigned char)c]) {
|
||||||
unsigned state = utf8d[256 + type];
|
case 2: TAIL(); return true;
|
||||||
if (state == 12)
|
case 3: TAIL(); TAIL(); return true;
|
||||||
return false;
|
case 4: COPY(); TRANS(0x50); TAIL(); return true;
|
||||||
|
// Lead byte 0xF4 (type 5): 2nd byte must be 0x80..0x8F (mask 0x10); the 3rd and 4th bytes must each be a continuation byte, so both go through TAIL().
case 5: COPY(); TRANS(0x10); TAIL(); TAIL(); return true;
|
||||||
while (state) {
|
case 6: TAIL(); TAIL(); TAIL(); return true;
|
||||||
os.Put(c = is.Take());
|
case 10: COPY(); TRANS(0x20); TAIL(); return true;
|
||||||
unsigned type = utf8d[(unsigned char)c];
|
case 11: COPY(); TRANS(0x60); TAIL(); return true;
|
||||||
state = utf8d[256 + state + type];
|
default: return false;
|
||||||
if (state == 12)
|
}
|
||||||
return false;
|
#undef COPY
|
||||||
};
|
#undef TRANS
|
||||||
return true;
|
#undef TAIL
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -31,7 +31,7 @@ struct GenericStringBuffer {
|
|||||||
return stack_.template Bottom<Ch>();
|
return stack_.template Bottom<Ch>();
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t Size() const { return stack_.Size(); }
|
size_t GetSize() const { return stack_.GetSize(); }
|
||||||
|
|
||||||
static const size_t kDefaultCapacity = 256;
|
static const size_t kDefaultCapacity = 256;
|
||||||
mutable internal::Stack<Allocator> stack_;
|
mutable internal::Stack<Allocator> stack_;
|
||||||
|
@ -232,6 +232,18 @@ TEST_F(RapidJson, SIMD_SUFFIX(Whitespace)) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(RapidJson, UTF8_Validate) {
|
||||||
|
StringBuffer os(0, length_ + 1);
|
||||||
|
|
||||||
|
for (int i = 0; i < kTrialCount; i++) {
|
||||||
|
StringStream is(json_);
|
||||||
|
os.Clear();
|
||||||
|
while (is.Peek() != '\0')
|
||||||
|
UTF8<>::Validate(is, os);
|
||||||
|
EXPECT_EQ(length_, os.GetSize());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Deprecated.
|
// Deprecated.
|
||||||
//TEST_F(RapidJson, FileStream_Read) {
|
//TEST_F(RapidJson, FileStream_Read) {
|
||||||
// for (int i = 0; i < kTrialCount; i++) {
|
// for (int i = 0; i < kTrialCount; i++) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user