diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h
index 7ec925f..26d1098 100644
--- a/include/rapidjson/internal/regex.h
+++ b/include/rapidjson/internal/regex.h
@@ -39,6 +39,9 @@ static const SizeType kRegexInvalidRange = ~SizeType(0);
     - \c a?     Zero or one
     - \c a*     Zero or more
     - \c a+     One or more
+    - \c a{3}   Exactly 3 times
+    - \c a{3,}  At least 3 times
+    - \c a{3,5} 3 to 5 times
     - \c (ab)*  Grouping
     - \c .      Any character
     - \c [abc]  Character classes
@@ -266,6 +269,29 @@ private:
                         return;
                     break;
 
+                case '{':
+                    {
+                        // Counted quantifier {n}, {n,} or {n,m}; m == 0 stands for "no upper bound"
+                        unsigned n, m;
+                        if (!ParseUnsigned(ds, &n) || n == 0)
+                            return;
+
+                        if (ds.Peek() == ',') {
+                            ds.Take();
+                            if (ds.Peek() == '}')
+                                m = 0;
+                            else if (!ParseUnsigned(ds, &m) || m < n)
+                                return;
+                        }
+                        else
+                            m = n;
+
+                        if (!EvalQuantifier(operandStack, n, m) || ds.Peek() != '}')
+                            return;
+                        ds.Take();
+                    }
+                    break;
+
                 case '.':
                     PushOperand(operandStack, kAnyCharacterClass);
                     ImplicitConcatenation(atomCountStack, operatorStack);
@@ -406,6 +432,80 @@ private:
         }
     }
 
+    // Expands the top operand for a counted quantifier: {n} (m == n), {n,} (m == 0) or {n,m}.
+    // Returns false when the operand stack holds no complete Frag to repeat.
+    bool EvalQuantifier(Stack<Allocator>& operandStack, unsigned n, unsigned m) {
+        RAPIDJSON_ASSERT(n > 0);
+        RAPIDJSON_ASSERT(m == 0 || n <= m);  // m == 0 means infinity
+        if (operandStack.GetSize() < sizeof(Frag))
+            return false;
+
+        for (unsigned i = 0; i < n - 1; i++)        // a{3} -> a a a
+            CloneTopOperand(operandStack);
+
+        if (m == 0)
+            Eval(operandStack, kOneOrMore);         // a{3,} -> a a a+
+        else if (m > n) {
+            CloneTopOperand(operandStack);          // a{3,5} -> a a a a
+            Eval(operandStack, kZeroOrOne);         // a{3,5} -> a a a a?
+            for (unsigned i = n; i < m - 1; i++)
+                CloneTopOperand(operandStack);      // a{3,5} -> a a a a? a?
+            for (unsigned i = n; i < m; i++)
+                Eval(operandStack, kConcatenation); // a{3,5} -> a a aa?a?
+        }
+
+        for (unsigned i = 0; i < n - 1; i++)
+            Eval(operandStack, kConcatenation);     // a{3} -> aaa, a{3,} -> aaa+, a{3,5} -> aaaa?a?
+
+        return true;
+    }
+
+    static SizeType Min(SizeType a, SizeType b) { return a < b ? a : b; }
+
+    // Follows out/out1 links that point to lower indices and returns the smallest state index reached.
+    SizeType GetMinStateIndex(SizeType index) {
+        State& s = GetState(index);
+        if (s.out != kRegexInvalidState && s.out < index)
+            index = Min(index, GetMinStateIndex(s.out));
+        if (s.out1 != kRegexInvalidState && s.out1 < index)
+            index = Min(index, GetMinStateIndex(s.out1));
+        return index;
+    }
+
+    // Appends a copy of the top operand's states and pushes a Frag referring to the copy.
+    void CloneTopOperand(Stack<Allocator>& operandStack) {
+        // Copy the Frag by value: a pointer into operandStack could dangle if the Push<Frag>() below grows its storage.
+        const Frag src = *operandStack.template Top<Frag>();
+        SizeType minIndex = GetMinStateIndex(src.start);
+        SizeType count = stateCount_ - minIndex; // Assumes top operand contains states in [min, stateCount_)
+        State* s = states_.template Push<State>(count);
+        memcpy(s, &GetState(minIndex), count * sizeof(State));
+        for (SizeType j = 0; j < count; j++) {      // Rebase links so the copy refers to its own states
+            if (s[j].out != kRegexInvalidState)
+                s[j].out += count;
+            if (s[j].out1 != kRegexInvalidState)
+                s[j].out1 += count;
+        }
+        *operandStack.template Push<Frag>() = Frag(src.start + count, src.out + count);
+        stateCount_ += count;
+    }
+
+    // Parses a run of decimal digits into *u. Returns false if the value would exceed 4294967295 (2^32 - 1).
+    // An empty run yields 0 and returns true; callers reject 0 where a count is required.
+    template <typename InputStream>
+    bool ParseUnsigned(DecodedStream<InputStream>& ds, unsigned* u) {
+        unsigned r = 0;
+        while (ds.Peek() >= '0' && ds.Peek() <= '9') {
+            // Appending a digit overflows once r exceeds 429496729,
+            // or when r equals 429496729 and the digit is greater than 5.
+            if (r > 429496729 || (r == 429496729 && ds.Peek() > '5'))
+                return false; // overflow
+            r = r * 10 + (ds.Take() - '0');
+        }
+        *u = r;
+        return true;
+    }
+
     template <typename InputStream>
     bool ParseRange(DecodedStream<InputStream>& ds, SizeType* range) {
         bool isBegin = true;
@@ -495,6 +595,8 @@ private:
             case '.':
             case '[':
             case ']':
+            case '{':
+            case '}':
             case '\\':
                 *escapedCodepoint = codepoint; return true;
             case 'f': *escapedCodepoint = 0x000C; return true;
diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp
index b5fd56e..05acc99 100644
--- a/test/unittest/regextest.cpp
+++ b/test/unittest/regextest.cpp
@@ -220,6 +220,117 @@ TEST(Regex, OneOrMore4) {
     EXPECT_FALSE(re.Match("ab"));
 }
 
+TEST(Regex, QuantifierExact1) {
+    Regex re("ab{3}c");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("abbbc"));
+    EXPECT_FALSE(re.Match("ac"));
+    EXPECT_FALSE(re.Match("abc"));
+    EXPECT_FALSE(re.Match("abbc"));
+    EXPECT_FALSE(re.Match("abbbbc"));
+}
+
+TEST(Regex, QuantifierExact2) {
+    Regex re("a(bc){3}d");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("abcbcbcd"));
+    EXPECT_FALSE(re.Match("ad"));
+    EXPECT_FALSE(re.Match("abcd"));
+    EXPECT_FALSE(re.Match("abcbcd"));
+    EXPECT_FALSE(re.Match("abcbcbcbcd"));
+}
+
+TEST(Regex, QuantifierExact3) {
+    Regex re("a(b|c){3}d");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("abbbd"));
+    EXPECT_TRUE(re.Match("acccd"));
+    EXPECT_TRUE(re.Match("abcbd"));
+    EXPECT_FALSE(re.Match("ad"));
+    EXPECT_FALSE(re.Match("abbd"));
+    EXPECT_FALSE(re.Match("accccd"));
+    EXPECT_FALSE(re.Match("abbbbd"));
+}
+
+TEST(Regex, QuantifierMin1) {
+    Regex re("ab{3,}c");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("abbbc"));
+    EXPECT_TRUE(re.Match("abbbbc"));
+    EXPECT_TRUE(re.Match("abbbbbc"));
+    EXPECT_FALSE(re.Match("ac"));
+    EXPECT_FALSE(re.Match("abc"));
+    EXPECT_FALSE(re.Match("abbc"));
+}
+
+TEST(Regex, QuantifierMin2) {
+    Regex re("a(bc){3,}d");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("abcbcbcd"));
+    EXPECT_TRUE(re.Match("abcbcbcbcd"));
+    EXPECT_FALSE(re.Match("ad"));
+    EXPECT_FALSE(re.Match("abcd"));
+    EXPECT_FALSE(re.Match("abcbcd"));
+}
+
+TEST(Regex, QuantifierMin3) {
+    Regex re("a(b|c){3,}d");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("abbbd"));
+    EXPECT_TRUE(re.Match("acccd"));
+    EXPECT_TRUE(re.Match("abcbd"));
+    EXPECT_TRUE(re.Match("accccd"));
+    EXPECT_TRUE(re.Match("abbbbd"));
+    EXPECT_FALSE(re.Match("ad"));
+    EXPECT_FALSE(re.Match("abbd"));
+}
+
+TEST(Regex, QuantifierMinMax1) {
+    Regex re("ab{3,5}c");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("abbbc"));
+    EXPECT_TRUE(re.Match("abbbbc"));
+    EXPECT_TRUE(re.Match("abbbbbc"));
+    EXPECT_FALSE(re.Match("ac"));
+    EXPECT_FALSE(re.Match("abc"));
+    EXPECT_FALSE(re.Match("abbc"));
+    EXPECT_FALSE(re.Match("abbbbbbc"));
+}
+
+TEST(Regex, QuantifierMinMax2) {
+    Regex re("a(bc){3,5}d");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("abcbcbcd"));
+    EXPECT_TRUE(re.Match("abcbcbcbcd"));
+    EXPECT_TRUE(re.Match("abcbcbcbcbcd"));
+    EXPECT_FALSE(re.Match("ad"));
+    EXPECT_FALSE(re.Match("abcd"));
+    EXPECT_FALSE(re.Match("abcbcd"));
+    EXPECT_FALSE(re.Match("abcbcbcbcbcbcd"));
+}
+
+TEST(Regex, QuantifierMinMax3) {
+    Regex re("a(b|c){3,5}d");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("abbbd"));
+    EXPECT_TRUE(re.Match("acccd"));
+    EXPECT_TRUE(re.Match("abcbd"));
+    EXPECT_TRUE(re.Match("accccd"));
+    EXPECT_TRUE(re.Match("abbbbd"));
+    EXPECT_TRUE(re.Match("acccccd"));
+    EXPECT_TRUE(re.Match("abbbbbd"));
+    EXPECT_FALSE(re.Match("ad"));
+    EXPECT_FALSE(re.Match("abbd"));
+    EXPECT_FALSE(re.Match("accccccd"));
+    EXPECT_FALSE(re.Match("abbbbbbd"));
+}
+
+TEST(Regex, QuantifierOverflow) {
+    // 9999999990 > 2^32 - 1: the count must be rejected instead of silently wrapping around
+    Regex re("a{9999999990}");
+    EXPECT_FALSE(re.IsValid());
+}
+
 #define EURO "\xE2\x82\xAC" // "\xE2\x82\xAC" is UTF-8 sequence of Euro sign U+20AC
 
 TEST(Regex, Unicode) {
@@ -328,10 +439,10 @@ TEST(Regex, CharacterRange8) {
 }
 
 TEST(Regex, Escape) {
-    const char* s = "\\|\\(\\)\\?\\*\\+\\.\\[\\]\\\\\\f\\n\\r\\t\\v[\\b][\\[][\\]]";
+    const char* s = "\\|\\(\\)\\?\\*\\+\\.\\[\\]\\{\\}\\\\\\f\\n\\r\\t\\v[\\b][\\[][\\]]";
     Regex re(s);
     ASSERT_TRUE(re.IsValid());
-    EXPECT_TRUE(re.Match("|()?*+.[]\\\x0C\n\r\t\x0B\b[]"));
+    EXPECT_TRUE(re.Match("|()?*+.[]{}\\\x0C\n\r\t\x0B\b[]"));
     EXPECT_FALSE(re.Match(s)); // Not escaping
 }
 