From 92285bed44ee028f5c3392924f8dee9c9eb66b64 Mon Sep 17 00:00:00 2001 From: miloyip Date: Wed, 27 May 2015 09:37:55 +0800 Subject: [PATCH] Add escape characters and control characters --- include/rapidjson/internal/regex.h | 33 +++++++++++++++++++++++++++++- test/unittest/regextest.cpp | 8 ++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index 4d31180..0d85885 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -46,6 +46,12 @@ static const SizeType kRegexInvalidRange = ~SizeType(0); - \c [a-z0-9_] Character class combination - \c [^abc] Negated character classes - \c [^a-c] Negated character class range + - \c \\| \\\\ ... Escape characters + - \c \\f Form feed (U+000C) + - \c \\n Line feed (U+000A) + - \c \\r Carriage return (U+000D) + - \c \\t Tab (U+0009) + - \c \\v Vertical tab (U+000B) */ template class GenericRegex { @@ -256,7 +262,32 @@ private: ImplicitConcatenation(atomCountStack, operatorStack); break; - default: + case '\\': // Escape character + if (!Encoding::Decode(is, &codepoint) || codepoint == 0) + return; // Expect an escape character + switch (codepoint) { + case '|': + case '(': + case ')': + case '?': + case '*': + case '+': + case '.': + case '[': + case ']': + case '\\': + break; // use the codepoint as is + case 'f': codepoint = 0x000C; break; + case 'n': codepoint = 0x000A; break; + case 'r': codepoint = 0x000D; break; + case 't': codepoint = 0x0009; break; + case 'v': codepoint = 0x000B; break; + default: + return; // Unsupported escape character + } + // fall through to default + + default: // Pattern character PushOperand(operandStack, codepoint); ImplicitConcatenation(atomCountStack, operatorStack); } diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp index 8818117..23acb46 100644 --- a/test/unittest/regextest.cpp +++ b/test/unittest/regextest.cpp @@ -327,4 +327,12 @@ TEST(Regex, CharacterRange8) { EXPECT_FALSE(re.Match("!")); } +TEST(Regex, Escape) { + const char* s = "\\|\\(\\)\\?\\*\\+\\.\\[\\]\\\\\\f\\n\\r\\t\\v"; + Regex re(s); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("|()?*+.[]\\\x0C\n\r\t\x0B")); + EXPECT_FALSE(re.Match(s)); // Not escaping +} + #undef EURO