From 77ce51209f82c9f0d221388e7a4242ff4f668912 Mon Sep 17 00:00:00 2001 From: "miloyip@gmail.com" Date: Wed, 23 Nov 2011 16:13:32 +0000 Subject: [PATCH] Added test cases for UTF8 validation. Fixed a bug in validation. git-svn-id: https://rapidjson.googlecode.com/svn/trunk@33 c5894555-1306-4e8d-425f-1f6f381ee07c --- include/rapidjson/rapidjson.h | 2 +- test/unittest/readertest.cpp | 85 +++++++++++++++++++++++++++-------- 2 files changed, 67 insertions(+), 20 deletions(-) diff --git a/include/rapidjson/rapidjson.h b/include/rapidjson/rapidjson.h index 716fa1d..21d860c 100644 --- a/include/rapidjson/rapidjson.h +++ b/include/rapidjson/rapidjson.h @@ -381,7 +381,7 @@ struct UTF8 { }; Ch c; os.Put(c = is.Take()); - if ((unsigned char) c <= 0x80) + if ((unsigned char) c < 0x80) return true; unsigned type = utf8d[(unsigned char)c]; diff --git a/test/unittest/readertest.cpp b/test/unittest/readertest.cpp index fba29a7..c849122 100644 --- a/test/unittest/readertest.cpp +++ b/test/unittest/readertest.cpp @@ -286,35 +286,82 @@ TEST(Reader, ParseString_NonDestructive) { EXPECT_EQ(11, h.length_); } -TEST(Reader, ParseString_Error) { -#define TEST_STRING_ERROR(str) \ - { \ - char buffer[1001]; \ - strncpy(buffer, str, 1000); \ - InsituStringStream s(buffer); \ - BaseReaderHandler<> h; \ - Reader reader; \ - EXPECT_FALSE(reader.Parse(s, h)); \ - } +bool TestString(const char* str) { + StringStream s(str); + BaseReaderHandler<> h; + Reader reader; + return reader.Parse(s, h); +} +TEST(Reader, ParseString_Error) { #define ARRAY(...) { __VA_ARGS__ } #define TEST_STRINGARRAY_ERROR(Encoding, array) \ { \ static const Encoding::Ch e[] = array; \ - TEST_STRING_ERROR(e); \ + EXPECT_FALSE(TestString(e)); \ } - TEST_STRING_ERROR("[\"\\a\"]"); // Unknown escape character - TEST_STRING_ERROR("[\"\\uABCG\"]"); // Incorrect hex digit after \\u escape - TEST_STRING_ERROR("[\"\\uD800X\"]"); // Missing the second \\u in surrogate pair - TEST_STRING_ERROR("[\"\\uD800\\uFFFF\"]"); // The second \\u in surrogate pair is invalid - TEST_STRING_ERROR("[\"Test]"); // lacks ending quotation before the end of string - TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', 0x80u, ']')); // Incorrect UTF8 sequence - TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', 0xC0u, 0x40, ']')); // Incorrect UTF8 sequence + EXPECT_FALSE(TestString("[\"\\a\"]")); // Unknown escape character + EXPECT_FALSE(TestString("[\"\\uABCG\"]")); // Incorrect hex digit after \\u escape + EXPECT_FALSE(TestString("[\"\\uD800X\"]")); // Missing the second \\u in surrogate pair + EXPECT_FALSE(TestString("[\"\\uD800\\uFFFF\"]")); // The second \\u in surrogate pair is invalid + EXPECT_FALSE(TestString("[\"Test]")); // lacks ending quotation before the end of string + + // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt + + // 3 Malformed sequences + + // 3.1 Unexpected continuation bytes + { + char e[] = { '[', '\"', 0, '\"', ']', '\0' }; + for (unsigned char c = 0x80u; c <= 0xBFu; c++) { + e[2] = c; + bool b; + EXPECT_FALSE(b = TestString(e)); + if (b) + std::cout << (unsigned)(unsigned char)c << std::endl; + } + } + + // 3.2 Lonely start characters, 3.5 Impossible bytes + { + char e[] = { '[', '\"', 0, ' ', '\"', ']', '\0' }; + for (unsigned c = 0xC0u; c <= 0xFFu; c++) { + e[2] = (char)c; + EXPECT_FALSE(TestString(e)); + } + } + + // 4 Overlong sequences + + // 4.1 Examples of an overlong ASCII character + TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', '\"', 0xC0u, 0xAFu, '\"', ']', '\0')); + TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', '\"', 0xE0u, 0x80u, 0xAFu, '\"', ']', '\0')); + TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', '\"', 0xF0u, 0x80u, 0x80u, 0xAFu, '\"', ']', '\0')); + + // 4.2 Maximum overlong sequences + TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', '\"', 0xC1u, 0xBFu, '\"', ']', '\0')); + TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', '\"', 0xE0u, 0x9Fu, 0xBFu, '\"', ']', '\0')); + TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', '\"', 0xF0u, 0x8Fu, 0xBFu, 0xBFu, '\"', ']', '\0')); + + // 4.3 Overlong representation of the NUL character + TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', '\"', 0xC0u, 0x80u, '\"', ']', '\0')); + TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', '\"', 0xE0u, 0x80u, 0x80u, '\"', ']', '\0')); + TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', '\"', 0xF0u, 0x80u, 0x80u, 0x80u, '\"', ']', '\0')); + + // 5 Illegal code positions + + // 5.1 Single UTF-16 surrogates + TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', '\"', 0xEDu, 0xA0u, 0x80u, '\"', ']', '\0')); + TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', '\"', 0xEDu, 0xADu, 0xBFu, '\"', ']', '\0')); + TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', '\"', 0xEDu, 0xAEu, 0x80u, '\"', ']', '\0')); + TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', '\"', 0xEDu, 0xAFu, 0xBFu, '\"', ']', '\0')); + TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', '\"', 0xEDu, 0xB0u, 0x80u, '\"', ']', '\0')); + TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', '\"', 0xEDu, 0xBEu, 0x80u, '\"', ']', '\0')); + TEST_STRINGARRAY_ERROR(UTF8<>, ARRAY('[', '\"', 0xEDu, 0xBFu, 0xBFu, '\"', ']', '\0')); #undef ARRAY #undef TEST_STRINGARRAY_ERROR -#undef TEST_STRING_ERROR } template