From da4fd6794c8709667137d667525e5005a091adbb Mon Sep 17 00:00:00 2001 From: Alejandro Martinez Date: Wed, 22 Mar 2017 10:19:54 +0000 Subject: [PATCH 1/3] Fixed bug on space hexadecimal encoding --- include/rapidjson/reader.h | 12 ++++++------ include/rapidjson/writer.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/rapidjson/reader.h b/include/rapidjson/reader.h index 00ab6a5..6ba3f17 100644 --- a/include/rapidjson/reader.h +++ b/include/rapidjson/reader.h @@ -990,7 +990,7 @@ private: // The rest of string using SIMD static const char dquote[16] = { '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"' }; static const char bslash[16] = { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' }; - static const char space[16] = { 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19 }; + static const char space[16] = { 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F }; const __m128i dq = _mm_loadu_si128(reinterpret_cast(&dquote[0])); const __m128i bs = _mm_loadu_si128(reinterpret_cast(&bslash[0])); const __m128i sp = _mm_loadu_si128(reinterpret_cast(&space[0])); @@ -999,7 +999,7 @@ private: const __m128i s = _mm_load_si128(reinterpret_cast(p)); const __m128i t1 = _mm_cmpeq_epi8(s, dq); const __m128i t2 = _mm_cmpeq_epi8(s, bs); - const __m128i t3 = _mm_cmpeq_epi8(_mm_max_epu8(s, sp), sp); // s < 0x20 <=> max(s, 0x19) == 0x19 + const __m128i t3 = _mm_cmpeq_epi8(_mm_max_epu8(s, sp), sp); // s < 0x20 <=> max(s, 0x1F) == 0x1F const __m128i x = _mm_or_si128(_mm_or_si128(t1, t2), t3); unsigned short r = static_cast(_mm_movemask_epi8(x)); if (RAPIDJSON_UNLIKELY(r != 0)) { // some of characters is escaped @@ -1053,7 +1053,7 @@ private: // The rest of string using SIMD static const char dquote[16] = { '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"' }; static const char bslash[16] = { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' }; - static const char space[16] = { 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19 }; + static const char space[16] = { 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F }; const __m128i dq = _mm_loadu_si128(reinterpret_cast(&dquote[0])); const __m128i bs = _mm_loadu_si128(reinterpret_cast(&bslash[0])); const __m128i sp = _mm_loadu_si128(reinterpret_cast(&space[0])); @@ -1062,7 +1062,7 @@ private: const __m128i s = _mm_load_si128(reinterpret_cast(p)); const __m128i t1 = _mm_cmpeq_epi8(s, dq); const __m128i t2 = _mm_cmpeq_epi8(s, bs); - const __m128i t3 = _mm_cmpeq_epi8(_mm_max_epu8(s, sp), sp); // s < 0x20 <=> max(s, 0x19) == 0x19 + const __m128i t3 = _mm_cmpeq_epi8(_mm_max_epu8(s, sp), sp); // s < 0x20 <=> max(s, 0x1F) == 0x1F const __m128i x = _mm_or_si128(_mm_or_si128(t1, t2), t3); unsigned short r = static_cast(_mm_movemask_epi8(x)); if (RAPIDJSON_UNLIKELY(r != 0)) { // some of characters is escaped @@ -1101,7 +1101,7 @@ private: // The rest of string using SIMD static const char dquote[16] = { '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"' }; static const char bslash[16] = { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' }; - static const char space[16] = { 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19 }; + static const char space[16] = { 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F }; const __m128i dq = _mm_loadu_si128(reinterpret_cast(&dquote[0])); const __m128i bs = _mm_loadu_si128(reinterpret_cast(&bslash[0])); const __m128i sp = _mm_loadu_si128(reinterpret_cast(&space[0])); @@ -1110,7 +1110,7 @@ private: const __m128i s = _mm_load_si128(reinterpret_cast(p)); const __m128i t1 = _mm_cmpeq_epi8(s, dq); const __m128i t2 = _mm_cmpeq_epi8(s, bs); - const __m128i t3 = _mm_cmpeq_epi8(_mm_max_epu8(s, sp), sp); // s < 0x20 <=> max(s, 0x19) == 0x19 + const __m128i t3 = _mm_cmpeq_epi8(_mm_max_epu8(s, sp), sp); // s < 0x20 <=> max(s, 0x1F) == 0x1F const __m128i x = _mm_or_si128(_mm_or_si128(t1, t2), t3); unsigned short r = static_cast(_mm_movemask_epi8(x)); if (RAPIDJSON_UNLIKELY(r != 0)) { // some of characters is escaped diff --git a/include/rapidjson/writer.h b/include/rapidjson/writer.h index cb7afd5..219da5e 100644 --- a/include/rapidjson/writer.h +++ b/include/rapidjson/writer.h @@ -585,7 +585,7 @@ inline bool Writer::ScanWriteUnescapedString(StringStream& is, siz // The rest of string using SIMD static const char dquote[16] = { '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"' }; static const char bslash[16] = { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' }; - static const char space[16] = { 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19 }; + static const char space[16] = { 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F }; const __m128i dq = _mm_loadu_si128(reinterpret_cast(&dquote[0])); const __m128i bs = _mm_loadu_si128(reinterpret_cast(&bslash[0])); const __m128i sp = _mm_loadu_si128(reinterpret_cast(&space[0])); @@ -594,7 +594,7 @@ inline bool Writer::ScanWriteUnescapedString(StringStream& is, siz const __m128i s = _mm_load_si128(reinterpret_cast(p)); const __m128i t1 = _mm_cmpeq_epi8(s, dq); const __m128i t2 = _mm_cmpeq_epi8(s, bs); - const __m128i t3 = _mm_cmpeq_epi8(_mm_max_epu8(s, sp), sp); // s < 0x20 <=> max(s, 0x19) == 0x19 + const __m128i t3 = _mm_cmpeq_epi8(_mm_max_epu8(s, sp), sp); // s < 0x20 <=> max(s, 0x1F) == 0x1F const __m128i x = _mm_or_si128(_mm_or_si128(t1, t2), t3); unsigned short r = static_cast(_mm_movemask_epi8(x)); if (RAPIDJSON_UNLIKELY(r != 0)) { // some of characters is escaped From 3c6e2cf0309c4e146c8ed18fd16096136fecbf00 Mon Sep 17 00:00:00 2001 From: Alejandro Martinez Date: Thu, 23 Mar 2017 10:14:17 +0000 Subject: [PATCH 2/3] Added unittests for invalid ascii control chars --- test/unittest/readertest.cpp | 2 ++ test/unittest/writertest.cpp | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/test/unittest/readertest.cpp b/test/unittest/readertest.cpp index 3555f11..5078f52 100644 --- a/test/unittest/readertest.cpp +++ b/test/unittest/readertest.cpp @@ -725,6 +725,8 @@ TEST(Reader, ParseString_Error) { // Malform ASCII sequence TEST_STRINGENCODING_ERROR(ASCII<>, UTF8<>, char, ARRAY('[', '\"', char(0x80u), '\"', ']', '\0')); + TEST_STRINGENCODING_ERROR(ASCII<>, UTF8<>, char, ARRAY('[', '\"', char(0x01u), '\"', ']', '\0')); + TEST_STRINGENCODING_ERROR(ASCII<>, UTF8<>, char, ARRAY('[', '\"', char(0x1Cu), '\"', ']', '\0')); #undef ARRAY #undef TEST_STRINGARRAY_ERROR diff --git a/test/unittest/writertest.cpp b/test/unittest/writertest.cpp index e630bb9..bc28e02 100644 --- a/test/unittest/writertest.cpp +++ b/test/unittest/writertest.cpp @@ -401,6 +401,16 @@ TEST(Writer, InvalidEncoding) { static const UTF32<>::Ch s[] = { 0x110000, 0 }; // Out of U+0000 to U+10FFFF EXPECT_FALSE(writer.String(s)); } + + // Fail in decoding invalid ASCII control bytes + { + GenericStringBuffer > buffer; + Writer >, UTF8<>, UTF16<> > writer(buffer); + writer.StartArray(); + EXPECT_FALSE(writer.String("\x01")); + EXPECT_FALSE(writer.String("\x1C")); + writer.EndArray(); + } } TEST(Writer, ValidateEncoding) { From 85500e8c8f1eabe10bcf7944e71fbb2dbcc893de Mon Sep 17 00:00:00 2001 From: Alejandro Martinez Date: Fri, 24 Mar 2017 13:37:23 +0000 Subject: [PATCH 3/3] Changed error code for invalid special ascii chars, fixed writer tests --- include/rapidjson/reader.h | 2 +- test/unittest/writertest.cpp | 14 +++----------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/include/rapidjson/reader.h b/include/rapidjson/reader.h index 6ba3f17..ccc025e 100644 --- a/include/rapidjson/reader.h +++ b/include/rapidjson/reader.h @@ -955,7 +955,7 @@ private: if (c == '\0') RAPIDJSON_PARSE_ERROR(kParseErrorStringMissQuotationMark, is.Tell()); else - RAPIDJSON_PARSE_ERROR(kParseErrorStringEscapeInvalid, is.Tell()); + RAPIDJSON_PARSE_ERROR(kParseErrorStringInvalidEncoding, is.Tell()); } else { size_t offset = is.Tell(); diff --git a/test/unittest/writertest.cpp b/test/unittest/writertest.cpp index bc28e02..b190c6c 100644 --- a/test/unittest/writertest.cpp +++ b/test/unittest/writertest.cpp @@ -401,16 +401,6 @@ TEST(Writer, InvalidEncoding) { static const UTF32<>::Ch s[] = { 0x110000, 0 }; // Out of U+0000 to U+10FFFF EXPECT_FALSE(writer.String(s)); } - - // Fail in decoding invalid ASCII control bytes - { - GenericStringBuffer > buffer; - Writer >, UTF8<>, UTF16<> > writer(buffer); - writer.StartArray(); - EXPECT_FALSE(writer.String("\x01")); - EXPECT_FALSE(writer.String("\x1C")); - writer.EndArray(); - } } TEST(Writer, ValidateEncoding) { @@ -422,8 +412,10 @@ TEST(Writer, ValidateEncoding) { EXPECT_TRUE(writer.String("\xC2\xA2")); // Cents sign U+00A2 EXPECT_TRUE(writer.String("\xE2\x82\xAC")); // Euro sign U+20AC EXPECT_TRUE(writer.String("\xF0\x9D\x84\x9E")); // G clef sign U+1D11E + EXPECT_TRUE(writer.String("\x01")); // SOH control U+0001 + EXPECT_TRUE(writer.String("\x1B")); // Escape control U+001B writer.EndArray(); - EXPECT_STREQ("[\"\x24\",\"\xC2\xA2\",\"\xE2\x82\xAC\",\"\xF0\x9D\x84\x9E\"]", buffer.GetString()); + EXPECT_STREQ("[\"\x24\",\"\xC2\xA2\",\"\xE2\x82\xAC\",\"\xF0\x9D\x84\x9E\",\"\\u0001\",\"\\u001B\"]", buffer.GetString()); } // Fail in decoding invalid UTF-8 sequence http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt