From ed73d7bdb4cae1bac509ec85b4fbc4de4ac4b558 Mon Sep 17 00:00:00 2001 From: Lars Klein Date: Sat, 4 Jul 2020 14:34:09 +0200 Subject: [PATCH] Improve surrogate handling Report a single low surrogate as kParseErrorStringUnicodeSurrogateInvalid. --- include/rapidjson/reader.h | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/include/rapidjson/reader.h b/include/rapidjson/reader.h index 0f85032..30e45e1 100644 --- a/include/rapidjson/reader.h +++ b/include/rapidjson/reader.h @@ -1023,15 +1023,23 @@ private: is.Take(); unsigned codepoint = ParseHex4(is, escapeOffset); RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; - if (RAPIDJSON_UNLIKELY(codepoint >= 0xD800 && codepoint <= 0xDBFF)) { - // Handle UTF-16 surrogate pair - if (RAPIDJSON_UNLIKELY(!Consume(is, '\\') || !Consume(is, 'u'))) + if (RAPIDJSON_UNLIKELY(codepoint >= 0xD800 && codepoint <= 0xDFFF)) { + // high surrogate, check if followed by valid low surrogate + if (RAPIDJSON_LIKELY(codepoint <= 0xDBFF)) { + // Handle UTF-16 surrogate pair + if (RAPIDJSON_UNLIKELY(!Consume(is, '\\') || !Consume(is, 'u'))) + RAPIDJSON_PARSE_ERROR(kParseErrorStringUnicodeSurrogateInvalid, escapeOffset); + unsigned codepoint2 = ParseHex4(is, escapeOffset); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; + if (RAPIDJSON_UNLIKELY(codepoint2 < 0xDC00 || codepoint2 > 0xDFFF)) + RAPIDJSON_PARSE_ERROR(kParseErrorStringUnicodeSurrogateInvalid, escapeOffset); + codepoint = (((codepoint - 0xD800) << 10) | (codepoint2 - 0xDC00)) + 0x10000; + } + // single low surrogate + else + { RAPIDJSON_PARSE_ERROR(kParseErrorStringUnicodeSurrogateInvalid, escapeOffset); - unsigned codepoint2 = ParseHex4(is, escapeOffset); - RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; - if (RAPIDJSON_UNLIKELY(codepoint2 < 0xDC00 || codepoint2 > 0xDFFF)) - RAPIDJSON_PARSE_ERROR(kParseErrorStringUnicodeSurrogateInvalid, escapeOffset); - codepoint = (((codepoint - 0xD800) << 10) | (codepoint2 - 0xDC00)) + 0x10000; + } } TEncoding::Encode(os, codepoint); }