diff --git a/include/rapidjson/encodings.h b/include/rapidjson/encodings.h index 98c9481..4e745ee 100644 --- a/include/rapidjson/encodings.h +++ b/include/rapidjson/encodings.h @@ -70,7 +70,7 @@ struct UTF8 { template RAPIDJSON_FORCEINLINE static bool Decode(InputStream& is, unsigned* codepoint) { #define COPY() c = is.Take(); *codepoint = (*codepoint << 6) | ((unsigned char)c & 0x3Fu) -#define TRANS(mask) result &= ((GetType((unsigned char)c) & mask) != 0) +#define TRANS(mask) result &= ((GetRange((unsigned char)c) & mask) != 0) #define TAIL() COPY(); TRANS(0x70) Ch c = is.Take(); if (!(c & 0x80)) { @@ -78,17 +78,17 @@ struct UTF8 { return true; } - unsigned char type = GetType((unsigned char)c); + unsigned char type = GetRange((unsigned char)c); *codepoint = (0xFF >> type) & (unsigned char)c; bool result = true; switch (type) { case 2: TAIL(); return result; case 3: TAIL(); TAIL(); return result; case 4: COPY(); TRANS(0x50); TAIL(); return result; - case 5: COPY(); TRANS(0x10); COPY(); TAIL(); return result; + case 5: COPY(); TRANS(0x10); TAIL(); TAIL(); return result; case 6: TAIL(); TAIL(); TAIL(); return result; case 10: COPY(); TRANS(0x20); TAIL(); return result; - case 11: COPY(); TRANS(0x60); TAIL(); return result; + case 11: COPY(); TRANS(0x60); TAIL(); TAIL(); return result; default: return false; } #undef COPY @@ -99,7 +99,7 @@ struct UTF8 { template RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) { #define COPY() os.Put(c = is.Take()) -#define TRANS(mask) result &= ((GetType((unsigned char)c) & mask) != 0) +#define TRANS(mask) result &= ((GetRange((unsigned char)c) & mask) != 0) #define TAIL() COPY(); TRANS(0x70) Ch c; COPY(); @@ -107,14 +107,14 @@ struct UTF8 { return true; bool result = true; - switch (GetType((unsigned char)c)) { + switch (GetRange((unsigned char)c)) { case 2: TAIL(); return result; case 3: TAIL(); TAIL(); return result; case 4: COPY(); TRANS(0x50); TAIL(); return result; - case 5: COPY(); TRANS(0x10); COPY(); TAIL(); return result; + case 5: COPY(); TRANS(0x10); TAIL(); TAIL(); return result; case 6: TAIL(); TAIL(); TAIL(); return result; case 10: COPY(); TRANS(0x20); TAIL(); return result; - case 11: COPY(); TRANS(0x60); TAIL(); return result; + case 11: COPY(); TRANS(0x60); TAIL(); TAIL(); return result; default: return false; } #undef COPY @@ -122,7 +122,7 @@ struct UTF8 { #undef TAIL } - RAPIDJSON_FORCEINLINE static unsigned char GetType(unsigned char c) { + RAPIDJSON_FORCEINLINE static unsigned char GetRange(unsigned char c) { // Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ // With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types. static const unsigned char type[] = { @@ -202,7 +202,7 @@ struct UTF16 { *codepoint = c; return true; } - else if (c < 0xDBFF) { + else if (c <= 0xDBFF) { *codepoint = (c & 0x3FF) << 10; c = is.Take(); *codepoint |= (c & 0x3FF); @@ -218,7 +218,7 @@ struct UTF16 { os.Put(c = is.Take()); if (c < 0xD800 || c > 0xDFFF) return true; - else if (c < 0xDBFF) { + else if (c <= 0xDBFF) { os.Put(c = is.Take()); return c >= 0xDC00 && c <= 0xDFFF; } diff --git a/include/rapidjson/stringbuffer.h b/include/rapidjson/stringbuffer.h index 506effd..9a5efd7 100644 --- a/include/rapidjson/stringbuffer.h +++ b/include/rapidjson/stringbuffer.h @@ -23,7 +23,7 @@ struct GenericStringBuffer { void Clear() { stack_.Clear(); } - const char* GetString() const { + const Ch* GetString() const { // Push and pop a null terminator. This is safe. *stack_.template Push() = '\0'; stack_.template Pop(1); diff --git a/test/unittest/encodedstreamtest.cpp b/test/unittest/encodedstreamtest.cpp new file mode 100644 index 0000000..a1bda32 --- /dev/null +++ b/test/unittest/encodedstreamtest.cpp @@ -0,0 +1,189 @@ +#include "unittest.h" +#include "rapidjson/filereadstream.h" +#include "rapidjson/filewritestream.h" +#include "rapidjson/encodedstream.h" +#include "rapidjson/stringbuffer.h" + +using namespace rapidjson; + +class EncodingsTest : public ::testing::Test { +public: + virtual void SetUp() { + json_ = ReadFile("utf8.json", true, &length_); + } + + virtual void TearDown() { + free(json_); + } + +protected: + static FILE* Open(const char* filename) { + char buffer[1024]; + sprintf(buffer, "encodings/%s", filename); + FILE *fp = fopen(buffer, "rb"); + if (!fp) { + sprintf(buffer, "../../bin/encodings/%s", filename); + fp = fopen(buffer, "rb"); + } + return fp; + } + + static char *ReadFile(const char* filename, bool appendPath, size_t* outLength) { + FILE *fp = appendPath ? Open(filename) : fopen(filename, "rb"); + + if (!fp) { + *outLength = 0; + return 0; + } + + fseek(fp, 0, SEEK_END); + *outLength = (size_t)ftell(fp); + fseek(fp, 0, SEEK_SET); + char* buffer = (char*)malloc(*outLength + 1); + fread(buffer, 1, *outLength, fp); + buffer[*outLength] = '\0'; + fclose(fp); + return buffer; + } + + template + void TestEncodedInputStream(const char* filename) { + char buffer[16]; + FILE *fp = Open(filename); + ASSERT_TRUE(fp != 0); + FileReadStream fs(fp, buffer, sizeof(buffer)); + EncodedInputStream eis(fs); + StringStream s(json_); + + while (eis.Peek() != '\0') { + unsigned expected, actual; + EXPECT_TRUE(UTF8<>::Decode(s, &expected)); + EXPECT_TRUE(MemoryEncoding::Decode(eis, &actual)); + EXPECT_EQ(expected, actual); + } + EXPECT_EQ('\0', s.Peek()); + fclose(fp); + } + + void TestAutoUTFInputStream(const char *filename) { + char buffer[16]; + FILE *fp = Open(filename); + ASSERT_TRUE(fp != 0); + FileReadStream fs(fp, buffer, sizeof(buffer)); + AutoUTFInputStream eis(fs); + StringStream s(json_); + while (eis.Peek() != '\0') { + unsigned expected, actual; + EXPECT_TRUE(UTF8<>::Decode(s, &expected)); + EXPECT_TRUE(AutoUTF::Decode(eis, &actual)); + EXPECT_EQ(expected, actual); + } + EXPECT_EQ('\0', s.Peek()); + fclose(fp); + } + + template + void TestEncodedOutputStream(const char* expectedFilename, bool putBOM) { + char filename[L_tmpnam]; + tmpnam(filename); + + FILE *fp = fopen(filename, "wb"); + char buffer[16]; + FileWriteStream os(fp, buffer, sizeof(buffer)); + EncodedOutputStream eos(os, putBOM); + StringStream s(json_); + while (s.Peek() != '\0') { + bool success = Transcoder, MemoryEncoding>::Transcode(s, eos); + EXPECT_TRUE(success); + } + eos.Flush(); + fclose(fp); + EXPECT_TRUE(CompareFile(filename, expectedFilename)); + remove(filename); + } + + bool CompareFile(char * filename, const char* expectedFilename) { + size_t actualLength, expectedLength; + char* actualBuffer = ReadFile(filename, false, &actualLength); + char* expectedBuffer = ReadFile(expectedFilename, true, &expectedLength); + bool ret = (expectedLength == actualLength) && memcmp(expectedBuffer, actualBuffer, actualLength) == 0; + free(actualBuffer); + free(expectedBuffer); + return ret; + } + + void TestAutoUTFOutputStream(UTFType type, bool putBOM, const char *expectedFilename) { + char filename[L_tmpnam]; + tmpnam(filename); + + FILE *fp = fopen(filename, "wb"); + char buffer[16]; + FileWriteStream os(fp, buffer, sizeof(buffer)); + AutoUTFOutputStream eos(os, type, putBOM); + StringStream s(json_); + while (s.Peek() != '\0') { + bool success = Transcoder, AutoUTF >::Transcode(s, eos); + EXPECT_TRUE(success); + } + eos.Flush(); + fclose(fp); + EXPECT_TRUE(CompareFile(filename, expectedFilename)); + remove(filename); + } + + const char* filename_; + char *json_; + size_t length_; +}; + +TEST_F(EncodingsTest, EncodedInputStream) { + TestEncodedInputStream, UTF8<> >("utf8.json"); + TestEncodedInputStream, UTF8<> >("utf8bom.json"); + TestEncodedInputStream, UTF16<> >("utf16le.json"); + TestEncodedInputStream, UTF16<> >("utf16lebom.json"); + TestEncodedInputStream, UTF16<> >("utf16be.json"); + TestEncodedInputStream, UTF16<> >("utf16bebom.json"); + TestEncodedInputStream, UTF32<> >("utf32le.json"); + TestEncodedInputStream, UTF32<> >("utf32lebom.json"); + TestEncodedInputStream, UTF32<> >("utf32be.json"); + TestEncodedInputStream, UTF32<> >("utf32bebom.json"); +} + +TEST_F(EncodingsTest, AutoUTFInputStream) { + TestAutoUTFInputStream("utf8.json"); + TestAutoUTFInputStream("utf8bom.json"); + TestAutoUTFInputStream("utf16le.json"); + TestAutoUTFInputStream("utf16lebom.json"); + TestAutoUTFInputStream("utf16be.json"); + TestAutoUTFInputStream("utf16bebom.json"); + TestAutoUTFInputStream("utf32le.json"); + TestAutoUTFInputStream("utf32lebom.json"); + TestAutoUTFInputStream("utf32be.json"); + TestAutoUTFInputStream("utf32bebom.json"); +} + +TEST_F(EncodingsTest, EncodedOutputStream) { + TestEncodedOutputStream, UTF8<> >("utf8.json", false); + TestEncodedOutputStream, UTF8<> >("utf8bom.json", true); + TestEncodedOutputStream, UTF16<> >("utf16le.json", false); + TestEncodedOutputStream, UTF16<> >("utf16lebom.json",true); + TestEncodedOutputStream, UTF16<> >("utf16be.json", false); + TestEncodedOutputStream, UTF16<> >("utf16bebom.json",true); + TestEncodedOutputStream, UTF32<> >("utf32le.json", false); + TestEncodedOutputStream, UTF32<> >("utf32lebom.json",true); + TestEncodedOutputStream, UTF32<> >("utf32be.json", false); + TestEncodedOutputStream, UTF32<> >("utf32bebom.json",true); +} + +TEST_F(EncodingsTest, AutoUTFOutputStream) { + TestAutoUTFOutputStream(kUTF8, false, "utf8.json"); + TestAutoUTFOutputStream(kUTF8, true, "utf8bom.json"); + TestAutoUTFOutputStream(kUTF16LE, false, "utf16le.json"); + TestAutoUTFOutputStream(kUTF16LE, true, "utf16lebom.json"); + TestAutoUTFOutputStream(kUTF16BE, false, "utf16be.json"); + TestAutoUTFOutputStream(kUTF16BE, true, "utf16bebom.json"); + TestAutoUTFOutputStream(kUTF32LE, false, "utf32le.json"); + TestAutoUTFOutputStream(kUTF32LE, true, "utf32lebom.json"); + TestAutoUTFOutputStream(kUTF32BE, false, "utf32be.json"); + TestAutoUTFOutputStream(kUTF32BE, true, "utf32bebom.json"); +} diff --git a/test/unittest/encodingstest.cpp b/test/unittest/encodingstest.cpp index 81efdf8..1c3260f 100644 --- a/test/unittest/encodingstest.cpp +++ b/test/unittest/encodingstest.cpp @@ -2,187 +2,411 @@ #include "rapidjson/filereadstream.h" #include "rapidjson/filewritestream.h" #include "rapidjson/encodedstream.h" +#include "rapidjson/stringbuffer.h" using namespace rapidjson; -class EncodingsTest : public ::testing::Test { -public: - virtual void SetUp() { - json_ = ReadFile("utf8.json", true, &length_); - } +// Verification of encoders/decoders with Hoehrmann's UTF8 decoder - virtual void TearDown() { - free(json_); - } - -protected: - static FILE* Open(const char* filename) { - char buffer[1024]; - sprintf(buffer, "encodings/%s", filename); - FILE *fp = fopen(buffer, "rb"); - if (!fp) { - sprintf(buffer, "../../bin/encodings/%s", filename); - fp = fopen(buffer, "rb"); - } - return fp; - } - - static char *ReadFile(const char* filename, bool appendPath, size_t* outLength) { - FILE *fp = appendPath ? Open(filename) : fopen(filename, "rb"); - - if (!fp) { - *outLength = 0; - return 0; - } - - fseek(fp, 0, SEEK_END); - *outLength = (size_t)ftell(fp); - fseek(fp, 0, SEEK_SET); - char* buffer = (char*)malloc(*outLength + 1); - fread(buffer, 1, *outLength, fp); - buffer[*outLength] = '\0'; - fclose(fp); - return buffer; - } - - template - void TestEncodedInputStream(const char* filename) { - char buffer[16]; - FILE *fp = Open(filename); - ASSERT_TRUE(fp != 0); - FileReadStream fs(fp, buffer, sizeof(buffer)); - EncodedInputStream eis(fs); - StringStream s(json_); - - while (eis.Peek() != '\0') { - unsigned expected, actual; - EXPECT_TRUE(UTF8<>::Decode(s, &expected)); - EXPECT_TRUE(MemoryEncoding::Decode(eis, &actual)); - EXPECT_EQ(expected, actual); - } - EXPECT_EQ('\0', s.Peek()); - fclose(fp); - } - - void TestAutoUTFInputStream(const char *filename) { - char buffer[16]; - FILE *fp = Open(filename); - ASSERT_TRUE(fp != 0); - FileReadStream fs(fp, buffer, sizeof(buffer)); - AutoUTFInputStream eis(fs); - StringStream s(json_); - while (eis.Peek() != '\0') { - unsigned expected, actual; - EXPECT_TRUE(UTF8<>::Decode(s, &expected)); - EXPECT_TRUE(AutoUTF::Decode(eis, &actual)); - EXPECT_EQ(expected, actual); - } - EXPECT_EQ('\0', s.Peek()); - fclose(fp); - } - - template - void TestEncodedOutputStream(const char* expectedFilename, bool putBOM) { - char filename[L_tmpnam]; - tmpnam(filename); - - FILE *fp = fopen(filename, "wb"); - char buffer[16]; - FileWriteStream os(fp, buffer, sizeof(buffer)); - EncodedOutputStream eos(os, putBOM); - StringStream s(json_); - while (s.Peek() != '\0') { - bool success = Transcoder, MemoryEncoding>::Transcode(s, eos); - EXPECT_TRUE(success); - } - eos.Flush(); - fclose(fp); - EXPECT_TRUE(CompareFile(filename, expectedFilename)); - remove(filename); - } - - bool CompareFile(char * filename, const char* expectedFilename) { - size_t actualLength, expectedLength; - char* actualBuffer = ReadFile(filename, false, &actualLength); - char* expectedBuffer = ReadFile(expectedFilename, true, &expectedLength); - bool ret = (expectedLength == actualLength) && memcmp(expectedBuffer, actualBuffer, actualLength) == 0; - free(actualBuffer); - free(expectedBuffer); - return ret; - } - - void TestAutoUTFOutputStream(UTFType type, bool putBOM, const char *expectedFilename) { - char filename[L_tmpnam]; - tmpnam(filename); - - FILE *fp = fopen(filename, "wb"); - char buffer[16]; - FileWriteStream os(fp, buffer, sizeof(buffer)); - AutoUTFOutputStream eos(os, type, putBOM); - StringStream s(json_); - while (s.Peek() != '\0') { - bool success = Transcoder, AutoUTF >::Transcode(s, eos); - EXPECT_TRUE(success); - } - eos.Flush(); - fclose(fp); - EXPECT_TRUE(CompareFile(filename, expectedFilename)); - remove(filename); - } - - const char* filename_; - char *json_; - size_t length_; +// http://www.unicode.org/Public/UNIDATA/Blocks.txt +static const unsigned kCodepointRanges[] = { + 0x0000, 0x007F, // Basic Latin + 0x0080, 0x00FF, // Latin-1 Supplement + 0x0100, 0x017F, // Latin Extended-A + 0x0180, 0x024F, // Latin Extended-B + 0x0250, 0x02AF, // IPA Extensions + 0x02B0, 0x02FF, // Spacing Modifier Letters + 0x0300, 0x036F, // Combining Diacritical Marks + 0x0370, 0x03FF, // Greek and Coptic + 0x0400, 0x04FF, // Cyrillic + 0x0500, 0x052F, // Cyrillic Supplement + 0x0530, 0x058F, // Armenian + 0x0590, 0x05FF, // Hebrew + 0x0600, 0x06FF, // Arabic + 0x0700, 0x074F, // Syriac + 0x0750, 0x077F, // Arabic Supplement + 0x0780, 0x07BF, // Thaana + 0x07C0, 0x07FF, // NKo + 0x0800, 0x083F, // Samaritan + 0x0840, 0x085F, // Mandaic + 0x0900, 0x097F, // Devanagari + 0x0980, 0x09FF, // Bengali + 0x0A00, 0x0A7F, // Gurmukhi + 0x0A80, 0x0AFF, // Gujarati + 0x0B00, 0x0B7F, // Oriya + 0x0B80, 0x0BFF, // Tamil + 0x0C00, 0x0C7F, // Telugu + 0x0C80, 0x0CFF, // Kannada + 0x0D00, 0x0D7F, // Malayalam + 0x0D80, 0x0DFF, // Sinhala + 0x0E00, 0x0E7F, // Thai + 0x0E80, 0x0EFF, // Lao + 0x0F00, 0x0FFF, // Tibetan + 0x1000, 0x109F, // Myanmar + 0x10A0, 0x10FF, // Georgian + 0x1100, 0x11FF, // Hangul Jamo + 0x1200, 0x137F, // Ethiopic + 0x1380, 0x139F, // Ethiopic Supplement + 0x13A0, 0x13FF, // Cherokee + 0x1400, 0x167F, // Unified Canadian Aboriginal Syllabics + 0x1680, 0x169F, // Ogham + 0x16A0, 0x16FF, // Runic + 0x1700, 0x171F, // Tagalog + 0x1720, 0x173F, // Hanunoo + 0x1740, 0x175F, // Buhid + 0x1760, 0x177F, // Tagbanwa + 0x1780, 0x17FF, // Khmer + 0x1800, 0x18AF, // Mongolian + 0x18B0, 0x18FF, // Unified Canadian Aboriginal Syllabics Extended + 0x1900, 0x194F, // Limbu + 0x1950, 0x197F, // Tai Le + 0x1980, 0x19DF, // New Tai Lue + 0x19E0, 0x19FF, // Khmer Symbols + 0x1A00, 0x1A1F, // Buginese + 0x1A20, 0x1AAF, // Tai Tham + 0x1B00, 0x1B7F, // Balinese + 0x1B80, 0x1BBF, // Sundanese + 0x1BC0, 0x1BFF, // Batak + 0x1C00, 0x1C4F, // Lepcha + 0x1C50, 0x1C7F, // Ol Chiki + 0x1CD0, 0x1CFF, // Vedic Extensions + 0x1D00, 0x1D7F, // Phonetic Extensions + 0x1D80, 0x1DBF, // Phonetic Extensions Supplement + 0x1DC0, 0x1DFF, // Combining Diacritical Marks Supplement + 0x1E00, 0x1EFF, // Latin Extended Additional + 0x1F00, 0x1FFF, // Greek Extended + 0x2000, 0x206F, // General Punctuation + 0x2070, 0x209F, // Superscripts and Subscripts + 0x20A0, 0x20CF, // Currency Symbols + 0x20D0, 0x20FF, // Combining Diacritical Marks for Symbols + 0x2100, 0x214F, // Letterlike Symbols + 0x2150, 0x218F, // Number Forms + 0x2190, 0x21FF, // Arrows + 0x2200, 0x22FF, // Mathematical Operators + 0x2300, 0x23FF, // Miscellaneous Technical + 0x2400, 0x243F, // Control Pictures + 0x2440, 0x245F, // Optical Character Recognition + 0x2460, 0x24FF, // Enclosed Alphanumerics + 0x2500, 0x257F, // Box Drawing + 0x2580, 0x259F, // Block Elements + 0x25A0, 0x25FF, // Geometric Shapes + 0x2600, 0x26FF, // Miscellaneous Symbols + 0x2700, 0x27BF, // Dingbats + 0x27C0, 0x27EF, // Miscellaneous Mathematical Symbols-A + 0x27F0, 0x27FF, // Supplemental Arrows-A + 0x2800, 0x28FF, // Braille Patterns + 0x2900, 0x297F, // Supplemental Arrows-B + 0x2980, 0x29FF, // Miscellaneous Mathematical Symbols-B + 0x2A00, 0x2AFF, // Supplemental Mathematical Operators + 0x2B00, 0x2BFF, // Miscellaneous Symbols and Arrows + 0x2C00, 0x2C5F, // Glagolitic + 0x2C60, 0x2C7F, // Latin Extended-C + 0x2C80, 0x2CFF, // Coptic + 0x2D00, 0x2D2F, // Georgian Supplement + 0x2D30, 0x2D7F, // Tifinagh + 0x2D80, 0x2DDF, // Ethiopic Extended + 0x2DE0, 0x2DFF, // Cyrillic Extended-A + 0x2E00, 0x2E7F, // Supplemental Punctuation + 0x2E80, 0x2EFF, // CJK Radicals Supplement + 0x2F00, 0x2FDF, // Kangxi Radicals + 0x2FF0, 0x2FFF, // Ideographic Description Characters + 0x3000, 0x303F, // CJK Symbols and Punctuation + 0x3040, 0x309F, // Hiragana + 0x30A0, 0x30FF, // Katakana + 0x3100, 0x312F, // Bopomofo + 0x3130, 0x318F, // Hangul Compatibility Jamo + 0x3190, 0x319F, // Kanbun + 0x31A0, 0x31BF, // Bopomofo Extended + 0x31C0, 0x31EF, // CJK Strokes + 0x31F0, 0x31FF, // Katakana Phonetic Extensions + 0x3200, 0x32FF, // Enclosed CJK Letters and Months + 0x3300, 0x33FF, // CJK Compatibility + 0x3400, 0x4DBF, // CJK Unified Ideographs Extension A + 0x4DC0, 0x4DFF, // Yijing Hexagram Symbols + 0x4E00, 0x9FFF, // CJK Unified Ideographs + 0xA000, 0xA48F, // Yi Syllables + 0xA490, 0xA4CF, // Yi Radicals + 0xA4D0, 0xA4FF, // Lisu + 0xA500, 0xA63F, // Vai + 0xA640, 0xA69F, // Cyrillic Extended-B + 0xA6A0, 0xA6FF, // Bamum + 0xA700, 0xA71F, // Modifier Tone Letters + 0xA720, 0xA7FF, // Latin Extended-D + 0xA800, 0xA82F, // Syloti Nagri + 0xA830, 0xA83F, // Common Indic Number Forms + 0xA840, 0xA87F, // Phags-pa + 0xA880, 0xA8DF, // Saurashtra + 0xA8E0, 0xA8FF, // Devanagari Extended + 0xA900, 0xA92F, // Kayah Li + 0xA930, 0xA95F, // Rejang + 0xA960, 0xA97F, // Hangul Jamo Extended-A + 0xA980, 0xA9DF, // Javanese + 0xAA00, 0xAA5F, // Cham + 0xAA60, 0xAA7F, // Myanmar Extended-A + 0xAA80, 0xAADF, // Tai Viet + 0xAB00, 0xAB2F, // Ethiopic Extended-A + 0xABC0, 0xABFF, // Meetei Mayek + 0xAC00, 0xD7AF, // Hangul Syllables + 0xD7B0, 0xD7FF, // Hangul Jamo Extended-B + //0xD800, 0xDB7F, // High Surrogates + //0xDB80, 0xDBFF, // High Private Use Surrogates + //0xDC00, 0xDFFF, // Low Surrogates + 0xE000, 0xF8FF, // Private Use Area + 0xF900, 0xFAFF, // CJK Compatibility Ideographs + 0xFB00, 0xFB4F, // Alphabetic Presentation Forms + 0xFB50, 0xFDFF, // Arabic Presentation Forms-A + 0xFE00, 0xFE0F, // Variation Selectors + 0xFE10, 0xFE1F, // Vertical Forms + 0xFE20, 0xFE2F, // Combining Half Marks + 0xFE30, 0xFE4F, // CJK Compatibility Forms + 0xFE50, 0xFE6F, // Small Form Variants + 0xFE70, 0xFEFF, // Arabic Presentation Forms-B + 0xFF00, 0xFFEF, // Halfwidth and Fullwidth Forms + 0xFFF0, 0xFFFF, // Specials + 0x10000, 0x1007F, // Linear B Syllabary + 0x10080, 0x100FF, // Linear B Ideograms + 0x10100, 0x1013F, // Aegean Numbers + 0x10140, 0x1018F, // Ancient Greek Numbers + 0x10190, 0x101CF, // Ancient Symbols + 0x101D0, 0x101FF, // Phaistos Disc + 0x10280, 0x1029F, // Lycian + 0x102A0, 0x102DF, // Carian + 0x10300, 0x1032F, // Old Italic + 0x10330, 0x1034F, // Gothic + 0x10380, 0x1039F, // Ugaritic + 0x103A0, 0x103DF, // Old Persian + 0x10400, 0x1044F, // Deseret + 0x10450, 0x1047F, // Shavian + 0x10480, 0x104AF, // Osmanya + 0x10800, 0x1083F, // Cypriot Syllabary + 0x10840, 0x1085F, // Imperial Aramaic + 0x10900, 0x1091F, // Phoenician + 0x10920, 0x1093F, // Lydian + 0x10A00, 0x10A5F, // Kharoshthi + 0x10A60, 0x10A7F, // Old South Arabian + 0x10B00, 0x10B3F, // Avestan + 0x10B40, 0x10B5F, // Inscriptional Parthian + 0x10B60, 0x10B7F, // Inscriptional Pahlavi + 0x10C00, 0x10C4F, // Old Turkic + 0x10E60, 0x10E7F, // Rumi Numeral Symbols + 0x11000, 0x1107F, // Brahmi + 0x11080, 0x110CF, // Kaithi + 0x12000, 0x123FF, // Cuneiform + 0x12400, 0x1247F, // Cuneiform Numbers and Punctuation + 0x13000, 0x1342F, // Egyptian Hieroglyphs + 0x16800, 0x16A3F, // Bamum Supplement + 0x1B000, 0x1B0FF, // Kana Supplement + 0x1D000, 0x1D0FF, // Byzantine Musical Symbols + 0x1D100, 0x1D1FF, // Musical Symbols + 0x1D200, 0x1D24F, // Ancient Greek Musical Notation + 0x1D300, 0x1D35F, // Tai Xuan Jing Symbols + 0x1D360, 0x1D37F, // Counting Rod Numerals + 0x1D400, 0x1D7FF, // Mathematical Alphanumeric Symbols + 0x1F000, 0x1F02F, // Mahjong Tiles + 0x1F030, 0x1F09F, // Domino Tiles + 0x1F0A0, 0x1F0FF, // Playing Cards + 0x1F100, 0x1F1FF, // Enclosed Alphanumeric Supplement + 0x1F200, 0x1F2FF, // Enclosed Ideographic Supplement + 0x1F300, 0x1F5FF, // Miscellaneous Symbols And Pictographs + 0x1F600, 0x1F64F, // Emoticons + 0x1F680, 0x1F6FF, // Transport And Map Symbols + 0x1F700, 0x1F77F, // Alchemical Symbols + 0x20000, 0x2A6DF, // CJK Unified Ideographs Extension B + 0x2A700, 0x2B73F, // CJK Unified Ideographs Extension C + 0x2B740, 0x2B81F, // CJK Unified Ideographs Extension D + 0x2F800, 0x2FA1F, // CJK Compatibility Ideographs Supplement + 0xE0000, 0xE007F, // Tags + 0xE0100, 0xE01EF, // Variation Selectors Supplement + 0xF0000, 0xFFFFF, // Supplementary Private Use Area-A + 0x100000, 0x10FFFF, // Supplementary Private Use Area-B + 0xFFFFFFFF }; -TEST_F(EncodingsTest, EncodedInputStream) { - TestEncodedInputStream, UTF8<> >("utf8.json"); - TestEncodedInputStream, UTF8<> >("utf8bom.json"); - TestEncodedInputStream, UTF16<> >("utf16le.json"); - TestEncodedInputStream, UTF16<> >("utf16lebom.json"); - TestEncodedInputStream, UTF16<> >("utf16be.json"); - TestEncodedInputStream, UTF16<> >("utf16bebom.json"); - TestEncodedInputStream, UTF32<> >("utf32le.json"); - TestEncodedInputStream, UTF32<> >("utf32lebom.json"); - TestEncodedInputStream, UTF32<> >("utf32be.json"); - TestEncodedInputStream, UTF32<> >("utf32bebom.json"); +// Copyright (c) 2008-2010 Bjoern Hoehrmann +// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. + +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 12 + +static const unsigned char utf8d[] = { + // The first part of the table maps bytes to character classes that + // to reduce the size of the transition table and create bitmasks. + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + + // The second part is a transition table that maps a combination + // of a state of the automaton and a character class to a state. + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, +}; + +static unsigned inline decode(unsigned* state, unsigned* codep, unsigned byte) { + unsigned type = utf8d[byte]; + + *codep = (*state != UTF8_ACCEPT) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = utf8d[256 + *state + type]; + return *state; } -TEST_F(EncodingsTest, AutoUTFInputStream) { - TestAutoUTFInputStream("utf8.json"); - TestAutoUTFInputStream("utf8bom.json"); - TestAutoUTFInputStream("utf16le.json"); - TestAutoUTFInputStream("utf16lebom.json"); - TestAutoUTFInputStream("utf16be.json"); - TestAutoUTFInputStream("utf16bebom.json"); - TestAutoUTFInputStream("utf32le.json"); - TestAutoUTFInputStream("utf32lebom.json"); - TestAutoUTFInputStream("utf32be.json"); - TestAutoUTFInputStream("utf32bebom.json"); +static bool IsUTF8(unsigned char* s) { + unsigned codepoint, state = 0; + + while (*s) + decode(&state, &codepoint, *s++); + + return state == UTF8_ACCEPT; } -TEST_F(EncodingsTest, EncodedOutputStream) { - TestEncodedOutputStream, UTF8<> >("utf8.json", false); - TestEncodedOutputStream, UTF8<> >("utf8bom.json", true); - TestEncodedOutputStream, UTF16<> >("utf16le.json", false); - TestEncodedOutputStream, UTF16<> >("utf16lebom.json",true); - TestEncodedOutputStream, UTF16<> >("utf16be.json", false); - TestEncodedOutputStream, UTF16<> >("utf16bebom.json",true); - TestEncodedOutputStream, UTF32<> >("utf32le.json", false); - TestEncodedOutputStream, UTF32<> >("utf32lebom.json",true); - TestEncodedOutputStream, UTF32<> >("utf32be.json", false); - TestEncodedOutputStream, UTF32<> >("utf32bebom.json",true); +TEST(EncodingsTest, UTF8) { + StringBuffer os, os2; + for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) { + for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) { + os.Clear(); + UTF8<>::Encode(os, codepoint); + const char* encodedStr = os.GetString(); + + // Decode with Hoehrmann + { + unsigned decodedCodepoint; + unsigned state = 0; + + unsigned decodedCount = 0; + for (const char* s = encodedStr; *s; ++s) + if (!decode(&state, &decodedCodepoint, (unsigned char)*s)) { + EXPECT_EQ(codepoint, decodedCodepoint); + decodedCount++; + } + + if (*encodedStr) // This decoder cannot handle U+0000 + EXPECT_EQ(1, decodedCount); // Should only contain one code point + + EXPECT_EQ(UTF8_ACCEPT, state); + if (UTF8_ACCEPT != state) + std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl; + } + + // Decode + { + StringStream is(encodedStr); + unsigned decodedCodepoint; + bool result = UTF8<>::Decode(is, &decodedCodepoint); + EXPECT_TRUE(result); + EXPECT_EQ(codepoint, decodedCodepoint); + if (!result || codepoint != decodedCodepoint) + std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl; + } + + // Validate + { + StringStream is(encodedStr); + os2.Clear(); + bool result = UTF8<>::Validate(is, os2); + EXPECT_TRUE(result); + EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString())); + } + } + } } -TEST_F(EncodingsTest, AutoUTFOutputStream) { - TestAutoUTFOutputStream(kUTF8, false, "utf8.json"); - TestAutoUTFOutputStream(kUTF8, true, "utf8bom.json"); - TestAutoUTFOutputStream(kUTF16LE, false, "utf16le.json"); - TestAutoUTFOutputStream(kUTF16LE, true, "utf16lebom.json"); - TestAutoUTFOutputStream(kUTF16BE, false, "utf16be.json"); - TestAutoUTFOutputStream(kUTF16BE, true, "utf16bebom.json"); - TestAutoUTFOutputStream(kUTF32LE, false, "utf32le.json"); - TestAutoUTFOutputStream(kUTF32LE, true, "utf32lebom.json"); - TestAutoUTFOutputStream(kUTF32BE, false, "utf32be.json"); - TestAutoUTFOutputStream(kUTF32BE, true, "utf32bebom.json"); +TEST(EncodingsTest, UTF16) { + GenericStringBuffer > os, os2; + GenericStringBuffer > utf8os; + for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) { + for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) { + os.Clear(); + UTF16<>::Encode(os, codepoint); + const UTF16<>::Ch* encodedStr = os.GetString(); + + // Encode with Hoehrmann's code + if (codepoint != 0) // cannot handle U+0000 + { + // encode with UTF8<> first + utf8os.Clear(); + UTF8<>::Encode(utf8os, codepoint); + + // transcode from UTF8 to UTF16 with Hoehrmann's code + unsigned decodedCodepoint; + unsigned state = 0; + UTF16<>::Ch buffer[3], *p = &buffer[0]; + for (const char* s = utf8os.GetString(); *s; ++s) { + if (!decode(&state, &decodedCodepoint, (unsigned char)*s)) + break; + } + + if (codepoint <= 0xFFFF) + *p++ = decodedCodepoint; + else { + // Encode code points above U+FFFF as surrogate pair. + *p++ = 0xD7C0 + (decodedCodepoint >> 10); + *p++ = 0xDC00 + (decodedCodepoint & 0x3FF); + } + *p++ = '\0'; + + EXPECT_EQ(0, StrCmp(buffer, encodedStr)); + } + + // Decode + { + GenericStringStream > is(encodedStr); + unsigned decodedCodepoint; + bool result = UTF16<>::Decode(is, &decodedCodepoint); + EXPECT_TRUE(result); + EXPECT_EQ(codepoint, decodedCodepoint); + if (!result || codepoint != decodedCodepoint) + std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl; + } + + // Validate + { + GenericStringStream > is(encodedStr); + os2.Clear(); + bool result = UTF16<>::Validate(is, os2); + EXPECT_TRUE(result); + EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString())); + } + } + } +} + +TEST(EncodingsTest, UTF32) { + GenericStringBuffer > os, os2; + for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) { + for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) { + os.Clear(); + UTF32<>::Encode(os, codepoint); + const UTF32<>::Ch* encodedStr = os.GetString(); + + // Decode + { + GenericStringStream > is(encodedStr); + unsigned decodedCodepoint; + bool result = UTF32<>::Decode(is, &decodedCodepoint); + EXPECT_TRUE(result); + EXPECT_EQ(codepoint, decodedCodepoint); + if (!result || codepoint != decodedCodepoint) + std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl; + } + + // Validate + { + GenericStringStream > is(encodedStr); + os2.Clear(); + bool result = UTF32<>::Validate(is, os2); + EXPECT_TRUE(result); + EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString())); + } + } + } }