From a45bcbba7b095877e896858872242d3ea8fcc5a6 Mon Sep 17 00:00:00 2001 From: "miloyip@gmail.com" Date: Wed, 23 Nov 2011 17:59:30 +0000 Subject: [PATCH] Rewrite UTF8::Validate() to obtain better performance. git-svn-id: https://rapidjson.googlecode.com/svn/trunk@35 c5894555-1306-4e8d-425f-1f6f381ee07c --- include/rapidjson/rapidjson.h | 55 +++++++++++++++----------------- include/rapidjson/stringbuffer.h | 2 +- test/perftest/rapidjsontest.cpp | 12 +++++++ 3 files changed, 38 insertions(+), 31 deletions(-) diff --git a/include/rapidjson/rapidjson.h b/include/rapidjson/rapidjson.h index 21d860c..bb4fd0c 100644 --- a/include/rapidjson/rapidjson.h +++ b/include/rapidjson/rapidjson.h @@ -357,46 +357,41 @@ struct UTF8 { template RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) { - // http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ - static const unsigned char utf8d[] = { - //! \todo optimization - // The first part of the table maps bytes to character classes that - // to reduce the size of the transition table and create bitmasks. + // Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + // With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types. 
+ static const unsigned char type[] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10, + 0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40, + 0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20, + 0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20, 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, - - // The second part is a transition table that maps a combination - // of a state of the automaton and a character class to a state. 
- 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, - 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, - 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, - 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, - 12,36,12,12,12,12,12,12,12,12,12,12, }; +#define COPY() os.Put(c = is.Take()) +#define TRANS(mask) if (!(type[(unsigned char)c] & mask)) return false +#define TAIL() COPY(); TRANS(0x70) Ch c; - os.Put(c = is.Take()); - if ((unsigned char) c < 0x80) + COPY(); + if (!(c & 0x80)) return true; - unsigned type = utf8d[(unsigned char)c]; - unsigned state = utf8d[256 + type]; - if (state == 12) - return false; - - while (state) { - os.Put(c = is.Take()); - unsigned type = utf8d[(unsigned char)c]; - state = utf8d[256 + state + type]; - if (state == 12) - return false; - }; - return true; + switch (type[(unsigned char)c]) { + case 2: TAIL(); return true; // C2..DF: one continuation byte + case 3: TAIL(); TAIL(); return true; // E1..EC, EE..EF: two continuation bytes + case 4: COPY(); TRANS(0x50); TAIL(); return true; // ED: 2nd byte 80..9F (rejects surrogates), then one more + case 5: COPY(); TRANS(0x10); TAIL(); TAIL(); return true; // F4: 2nd byte 80..8F (caps at U+10FFFF), then two validated tails + case 6: TAIL(); TAIL(); TAIL(); return true; // F1..F3: three continuation bytes + case 10: COPY(); TRANS(0x20); TAIL(); return true; // E0: 2nd byte A0..BF (rejects overlongs), then one more + case 11: COPY(); TRANS(0x60); TAIL(); TAIL(); return true; // F0: 2nd byte 90..BF (rejects overlongs), then two validated tails + default: return false; + } +#undef COPY +#undef TRANS +#undef TAIL } }; diff --git a/include/rapidjson/stringbuffer.h b/include/rapidjson/stringbuffer.h index 319bb44..506effd 100644 --- a/include/rapidjson/stringbuffer.h +++ b/include/rapidjson/stringbuffer.h @@ -31,7 +31,7 @@ struct GenericStringBuffer { return stack_.template Bottom(); } - size_t Size() const { return stack_.Size(); } + size_t GetSize() const { return stack_.GetSize(); } static const size_t kDefaultCapacity = 256; mutable internal::Stack stack_; diff --git a/test/perftest/rapidjsontest.cpp b/test/perftest/rapidjsontest.cpp index 5042791..ae2af33 100644 --- a/test/perftest/rapidjsontest.cpp +++ b/test/perftest/rapidjsontest.cpp @@ -232,6 +232,18 @@ 
TEST_F(RapidJson, SIMD_SUFFIX(Whitespace)) { } } +TEST_F(RapidJson, UTF8_Validate) { + StringBuffer os(0, length_ + 1); + + for (int i = 0; i < kTrialCount; i++) { + StringStream is(json_); + os.Clear(); + while (is.Peek() != '\0') + UTF8<>::Validate(is, os); + EXPECT_EQ(length_, os.GetSize()); + } +} + // Depreciated. //TEST_F(RapidJson, FileStream_Read) { // for (int i = 0; i < kTrialCount; i++) {