From ed73d7bdb4cae1bac509ec85b4fbc4de4ac4b558 Mon Sep 17 00:00:00 2001 From: Lars Klein Date: Sat, 4 Jul 2020 14:34:09 +0200 Subject: [PATCH 1/2] Improve surrogate handling Report a single low surrogate as kParseErrorStringUnicodeSurrogateInvalid. --- include/rapidjson/reader.h | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/include/rapidjson/reader.h b/include/rapidjson/reader.h index 0f85032..30e45e1 100644 --- a/include/rapidjson/reader.h +++ b/include/rapidjson/reader.h @@ -1023,15 +1023,23 @@ private: is.Take(); unsigned codepoint = ParseHex4(is, escapeOffset); RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; - if (RAPIDJSON_UNLIKELY(codepoint >= 0xD800 && codepoint <= 0xDBFF)) { - // Handle UTF-16 surrogate pair - if (RAPIDJSON_UNLIKELY(!Consume(is, '\\') || !Consume(is, 'u'))) + if (RAPIDJSON_UNLIKELY(codepoint >= 0xD800 && codepoint <= 0xDFFF)) { + // high surrogate, check if followed by valid low surrogate + if (RAPIDJSON_LIKELY(codepoint <= 0xDBFF)) { + // Handle UTF-16 surrogate pair + if (RAPIDJSON_UNLIKELY(!Consume(is, '\\') || !Consume(is, 'u'))) + RAPIDJSON_PARSE_ERROR(kParseErrorStringUnicodeSurrogateInvalid, escapeOffset); + unsigned codepoint2 = ParseHex4(is, escapeOffset); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; + if (RAPIDJSON_UNLIKELY(codepoint2 < 0xDC00 || codepoint2 > 0xDFFF)) + RAPIDJSON_PARSE_ERROR(kParseErrorStringUnicodeSurrogateInvalid, escapeOffset); + codepoint = (((codepoint - 0xD800) << 10) | (codepoint2 - 0xDC00)) + 0x10000; + } + // single low surrogate + else + { RAPIDJSON_PARSE_ERROR(kParseErrorStringUnicodeSurrogateInvalid, escapeOffset); - unsigned codepoint2 = ParseHex4(is, escapeOffset); - RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; - if (RAPIDJSON_UNLIKELY(codepoint2 < 0xDC00 || codepoint2 > 0xDFFF)) - RAPIDJSON_PARSE_ERROR(kParseErrorStringUnicodeSurrogateInvalid, escapeOffset); - codepoint = (((codepoint - 0xD800) << 10) | (codepoint2 - 0xDC00)) + 0x10000; + } } TEncoding::Encode(os, codepoint); } From 6694c996b9e5a5e44d9f7cea1d619cae86384981 Mon Sep 17 00:00:00 2001 From: Lars Klein Date: Sat, 4 Jul 2020 14:48:39 +0200 Subject: [PATCH 2/2] Add test case for low surrogate handling --- test/unittest/readertest.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/unittest/readertest.cpp b/test/unittest/readertest.cpp index 2795766..2a4a626 100644 --- a/test/unittest/readertest.cpp +++ b/test/unittest/readertest.cpp @@ -944,6 +944,9 @@ TEST(Reader, ParseString_Error) { TEST_STRING_ERROR(kParseErrorStringUnicodeSurrogateInvalid, "[\"\\uD800X\"]", 2u, 8u); TEST_STRING_ERROR(kParseErrorStringUnicodeSurrogateInvalid, "[\"\\uD800\\uFFFF\"]", 2u, 14u); + // Single low surrogate pair in string is invalid. + TEST_STRING_ERROR(kParseErrorStringUnicodeSurrogateInvalid, "[\"\\udc4d\"]", 2u, 8u); + // Missing a closing quotation mark in string. TEST_STRING_ERROR(kParseErrorStringMissQuotationMark, "[\"Test]", 7u, 7u);