diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h
index 0d85885..15b6f8f 100644
--- a/include/rapidjson/internal/regex.h
+++ b/include/rapidjson/internal/regex.h
@@ -46,6 +46,7 @@ static const SizeType kRegexInvalidRange = ~SizeType(0);
     - \c [a-z0-9_] Character class combination
     - \c [^abc] Negated character classes
     - \c [^a-c] Negated character class range
+    - \c [\\b] Backspace (U+0008)
     - \c \\| \\\\ ... Escape characters
     - \c \\f Form feed (U+000C)
     - \c \\n Line feed (U+000A)
@@ -265,26 +266,8 @@ private:
                 case '\\': // Escape character
                     if (!Encoding::Decode(is, &codepoint) || codepoint == 0)
                         return; // Expect an escape character
-                    switch (codepoint) {
-                        case '|':
-                        case '(':
-                        case ')':
-                        case '?':
-                        case '*':
-                        case '+':
-                        case '.':
-                        case '[':
-                        case ']':
-                        case '\\':
-                            break; // use the codepoint as is
-                        case 'f': codepoint = 0x000C; break;
-                        case 'n': codepoint = 0x000A; break;
-                        case 'r': codepoint = 0x000D; break;
-                        case 't': codepoint = 0x0009; break;
-                        case 'v': codepoint = 0x000B; break;
-                        default:
-                            return; // Unsupported escape character
-                    }
+                    if (!CharacterEscape(codepoint, &codepoint))
+                        return; // Unsupported escape character
                     // fall through to default

                 default: // Pattern character
@@ -414,9 +397,16 @@ private:
         SizeType current = kRegexInvalidRange;
         unsigned codepoint;
         while (Encoding::Decode(is, &codepoint) && codepoint != 0) {
-            if (isBegin && codepoint == '^')
-                negate = true;
-            else if (codepoint == ']') {
+            if (isBegin) {
+                isBegin = false;
+                if (codepoint == '^') {
+                    negate = true;
+                    continue;
+                }
+            }
+
+            switch (codepoint) {
+            case ']':
                 if (step == 2) { // Add trailing '-'
                     SizeType r = NewRange('-');
                     RAPIDJSON_ASSERT(current != kRegexInvalidRange);
@@ -426,8 +416,17 @@ private:
                     GetRange(start).start |= kRangeNegationFlag;
                 *range = start;
                 return true;
-            }
-            else {
+
+            case '\\':
+                if (!Encoding::Decode(is, &codepoint) || codepoint == 0)
+                    return false; // Expect an escape character
+                if (codepoint == 'b')
+                    codepoint = 0x0008; // Escape backspace character
+                else if (!CharacterEscape(codepoint, &codepoint))
+                    return false;
+                // fall through to default
+
+            default:
                 switch (step) {
                 case 1:
                     if (codepoint == '-') {
@@ -454,7 +453,6 @@ private:
                     step = 0;
                 }
             }
-            isBegin = false;
         }
         return false;
     }
@@ -466,6 +464,36 @@ private:
         return rangeCount_++;
     }

+    //! Map the character that follows a backslash to the code point it denotes.
+    /*!
+        \param codepoint Character read immediately after the backslash.
+        \param escapedCodepoint Receives the resulting code point. It may point at the
+            caller's own \c codepoint variable, because the input is passed by value.
+        \return false when \c codepoint is not a supported escape; nothing is written then.
+    */
+    bool CharacterEscape(unsigned codepoint, unsigned* escapedCodepoint) {
+        switch (codepoint) {
+            case '|':
+            case '(':
+            case ')':
+            case '?':
+            case '*':
+            case '+':
+            case '.':
+            case '[':
+            case ']':
+            case '\\':
+                *escapedCodepoint = codepoint; return true;
+            case 'f': *escapedCodepoint = 0x000C; return true;
+            case 'n': *escapedCodepoint = 0x000A; return true;
+            case 'r': *escapedCodepoint = 0x000D; return true;
+            case 't': *escapedCodepoint = 0x0009; return true;
+            case 'v': *escapedCodepoint = 0x000B; return true;
+            default:
+                return false; // Unsupported escape character
+        }
+    }
+
     Stack states_;
     Stack ranges_;
     SizeType root_;
diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp
index 23acb46..b5fd56e 100644
--- a/test/unittest/regextest.cpp
+++ b/test/unittest/regextest.cpp
@@ -328,10 +328,10 @@ TEST(Regex, CharacterRange8) {
 }

 TEST(Regex, Escape) {
-    const char* s = "\\|\\(\\)\\?\\*\\+\\.\\[\\]\\\\\\f\\n\\r\\t\\v";
+    const char* s = "\\|\\(\\)\\?\\*\\+\\.\\[\\]\\\\\\f\\n\\r\\t\\v[\\b][\\[][\\]]";
     Regex re(s);
     ASSERT_TRUE(re.IsValid());
-    EXPECT_TRUE(re.Match("|()?*+.[]\\\x0C\n\r\t\x0B"));
+    EXPECT_TRUE(re.Match("|()?*+.[]\\\x0C\n\r\t\x0B\b[]"));
     EXPECT_FALSE(re.Match(s)); // Not escaping
 }
