Add the '?', '*' and '+' quantifiers to rapidjson::internal::Regex.

* regex.h: three new Operator values (kZeroOrOne, kZeroOrMore, kOneOrMore)
  are pushed by Parse() and evaluated immediately by Eval(); each case pops
  one Frag and creates one extra state that refers back to the fragment's
  start.
* regex.h: Match() now clears stateSet and `next` at the top of every input
  step instead of after the current/next swap.
* regex.h: the state-dump debug code is kept under `#if 0` instead of being
  commented out.
* regextest.cpp: existing tests renamed to CamelCase; ZeroOrOne*, ZeroOrMore*
  and OneOrMore* cases added.

NOTE(review): leading whitespace and blank context lines below are laid out
for 4-space indentation and the hunk line counts; confirm against the target
tree before applying.

diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h
index f757cfe..c19adb1 100644
--- a/include/rapidjson/internal/regex.h
+++ b/include/rapidjson/internal/regex.h
@@ -54,11 +54,12 @@ public:
         const size_t stateSetSize = (stateCount_ + 31) / 32 * 4;
         unsigned* stateSet = static_cast<unsigned*>(allocator.Malloc(stateSetSize));
         std::memset(stateSet, 0, stateSetSize);
-
         AddState(stateSet, *current, root_);
 
         unsigned codepoint;
         while (!current->Empty() && Encoding::Decode(is, &codepoint) && codepoint != 0) {
+            std::memset(stateSet, 0, stateSetSize);
+            next->Clear();
             for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) {
                 const State& sr = GetState(*s);
                 // if (sr.out != kRegexInvalidState)
@@ -70,8 +71,6 @@ public:
             Stack<Allocator>* temp = current;
             current = next;
             next = temp;
-            std::memset(stateSet, 0, stateSetSize);
-            next->Clear();
             // printf("\n");
         }
 
@@ -91,9 +90,12 @@ public:
 
 private:
     enum Operator {
+        kZeroOrOne,
+        kZeroOrMore,
+        kOneOrMore,
         kConcatenation,
         kAlternation,
-        kLeftParenthesis,
+        kLeftParenthesis
     };
 
     struct State {
@@ -193,6 +195,24 @@ private:
                     ImplicitConcatenation(atomCountStack, operatorStack);
                     break;
 
+                case '?':
+                    *operatorStack.template Push<Operator>() = kZeroOrOne;
+                    if (!Eval(operandStack, operatorStack))
+                        return;
+                    break;
+
+                case '*':
+                    *operatorStack.template Push<Operator>() = kZeroOrMore;
+                    if (!Eval(operandStack, operatorStack))
+                        return;
+                    break;
+
+                case '+':
+                    *operatorStack.template Push<Operator>() = kOneOrMore;
+                    if (!Eval(operandStack, operatorStack))
+                        return;
+                    break;
+
                 default:
                     SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint);
                     *operandStack.template Push<Frag>() = Frag(s, s);
@@ -209,16 +229,19 @@ private:
             Frag* e = operandStack.template Pop<Frag>(1);
             Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0));
             root_ = e->start;
-            // printf("root: %d\n", root_);
-            // for (SizeType i = 0; i < stateCount_ ; i++) {
-            //     State& s = GetState(i);
-            //     printf("[%2d] out: %2d out1: %2d c: '%c'\n", i, s.out, s.out1, (char)s.codepoint);
-            // }
-            // printf("\n");
+#if 0
+            printf("root: %d\n", root_);
+            for (SizeType i = 0; i < stateCount_ ; i++) {
+                State& s = GetState(i);
+                printf("[%2d] out: %2d out1: %2d c: '%c'\n", i, s.out, s.out1, (char)s.codepoint);
+            }
+            printf("\n");
+#endif
         }
     }
 
     bool Eval(Stack<Allocator>& operandStack, Stack<Allocator>& operatorStack) {
+        // printf("Eval %c\n", "?*+.|("[*operatorStack.template Top<Operator>()]);
         switch (*operatorStack.template Pop<Operator>(1)) {
             case kConcatenation:
                 if (operandStack.GetSize() >= sizeof(Frag) * 2) {
@@ -240,6 +263,35 @@ private:
                 }
                 return false;
 
+            case kZeroOrOne:
+                if (operandStack.GetSize() >= sizeof(Frag)) {
+                    Frag e = *operandStack.template Pop<Frag>(1);
+                    SizeType s = NewState(kRegexInvalidState, e.start, 0);
+                    *operandStack.template Push<Frag>() = Frag(s, Append(e.out, s));
+                    return true;
+                }
+                return false;
+
+            case kZeroOrMore:
+                if (operandStack.GetSize() >= sizeof(Frag)) {
+                    Frag e = *operandStack.template Pop<Frag>(1);
+                    SizeType s = NewState(kRegexInvalidState, e.start, 0);
+                    Patch(e.out, s);
+                    *operandStack.template Push<Frag>() = Frag(s, s);
+                    return true;
+                }
+                return false;
+
+            case kOneOrMore:
+                if (operandStack.GetSize() >= sizeof(Frag)) {
+                    Frag e = *operandStack.template Pop<Frag>(1);
+                    SizeType s = NewState(kRegexInvalidState, e.start, 0);
+                    Patch(e.out, s);
+                    *operandStack.template Push<Frag>() = Frag(e.start, s);
+                    return true;
+                }
+                return false;
+
             default:
                 return false;
         }
diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp
index 658bbc2..1a1bffa 100644
--- a/test/unittest/regextest.cpp
+++ b/test/unittest/regextest.cpp
@@ -17,7 +17,7 @@
 
 using namespace rapidjson::internal;
 
-TEST(Regex, concatenation) {
+TEST(Regex, Concatenation) {
     Regex re("abc");
     ASSERT_TRUE(re.IsValid());
     EXPECT_TRUE(re.Match("abc"));
@@ -28,7 +28,7 @@ TEST(Regex, concatenation) {
     EXPECT_FALSE(re.Match("abcd"));
 }
 
-TEST(Regex, split1) {
+TEST(Regex, Alternation1) {
     Regex re("abab|abbb");
     ASSERT_TRUE(re.IsValid());
     EXPECT_TRUE(re.Match("abab"));
@@ -40,7 +40,7 @@ TEST(Regex, split1) {
     EXPECT_FALSE(re.Match("abbbb"));
 }
 
-TEST(Regex, split2) {
+TEST(Regex, Alternation2) {
     Regex re("a|b|c");
     ASSERT_TRUE(re.IsValid());
     EXPECT_TRUE(re.Match("a"));
@@ -51,7 +51,7 @@ TEST(Regex, split2) {
     EXPECT_FALSE(re.Match("ab"));
 }
 
-TEST(Regex, parenthesis1) {
+TEST(Regex, Parenthesis1) {
     Regex re("(ab)c");
     ASSERT_TRUE(re.IsValid());
     EXPECT_TRUE(re.Match("abc"));
@@ -62,7 +62,7 @@ TEST(Regex, parenthesis1) {
     EXPECT_FALSE(re.Match("abcd"));
 }
 
-TEST(Regex, parenthesis2) {
+TEST(Regex, Parenthesis2) {
     Regex re("a(bc)");
     ASSERT_TRUE(re.IsValid());
     EXPECT_TRUE(re.Match("abc"));
@@ -73,7 +73,7 @@ TEST(Regex, parenthesis2) {
     EXPECT_FALSE(re.Match("abcd"));
 }
 
-TEST(Regex, parenthesis3) {
+TEST(Regex, Parenthesis3) {
     Regex re("(a|b)(c|d)");
     ASSERT_TRUE(re.IsValid());
     EXPECT_TRUE(re.Match("ac"));
@@ -84,3 +84,138 @@ TEST(Regex, parenthesis3) {
     EXPECT_FALSE(re.Match("ab"));
     EXPECT_FALSE(re.Match("cd"));
 }
+
+TEST(Regex, ZeroOrOne1) {
+    Regex re("a?");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match(""));
+    EXPECT_TRUE(re.Match("a"));
+    EXPECT_FALSE(re.Match("aa"));
+}
+
+TEST(Regex, ZeroOrOne2) {
+    Regex re("a?b");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("b"));
+    EXPECT_TRUE(re.Match("ab"));
+    EXPECT_FALSE(re.Match("a"));
+    EXPECT_FALSE(re.Match("aa"));
+    EXPECT_FALSE(re.Match("bb"));
+    EXPECT_FALSE(re.Match("ba"));
+}
+
+TEST(Regex, ZeroOrOne3) {
+    Regex re("ab?");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("a"));
+    EXPECT_TRUE(re.Match("ab"));
+    EXPECT_FALSE(re.Match("b"));
+    EXPECT_FALSE(re.Match("aa"));
+    EXPECT_FALSE(re.Match("bb"));
+    EXPECT_FALSE(re.Match("ba"));
+}
+
+TEST(Regex, ZeroOrOne4) {
+    Regex re("a?b?");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match(""));
+    EXPECT_TRUE(re.Match("a"));
+    EXPECT_TRUE(re.Match("b"));
+    EXPECT_TRUE(re.Match("ab"));
+    EXPECT_FALSE(re.Match("aa"));
+    EXPECT_FALSE(re.Match("bb"));
+    EXPECT_FALSE(re.Match("ba"));
+    EXPECT_FALSE(re.Match("abc"));
+}
+
+TEST(Regex, ZeroOrOne5) {
+    Regex re("a(ab)?b");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("ab"));
+    EXPECT_TRUE(re.Match("aabb"));
+    EXPECT_FALSE(re.Match("aab"));
+    EXPECT_FALSE(re.Match("abb"));
+}
+
+TEST(Regex, ZeroOrMore1) {
+    Regex re("a*");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match(""));
+    EXPECT_TRUE(re.Match("a"));
+    EXPECT_TRUE(re.Match("aa"));
+    EXPECT_FALSE(re.Match("b"));
+    EXPECT_FALSE(re.Match("ab"));
+}
+
+TEST(Regex, ZeroOrMore2) {
+    Regex re("a*b");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("b"));
+    EXPECT_TRUE(re.Match("ab"));
+    EXPECT_TRUE(re.Match("aab"));
+    EXPECT_FALSE(re.Match(""));
+    EXPECT_FALSE(re.Match("bb"));
+}
+
+TEST(Regex, ZeroOrMore3) {
+    Regex re("a*b*");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match(""));
+    EXPECT_TRUE(re.Match("a"));
+    EXPECT_TRUE(re.Match("aa"));
+    EXPECT_TRUE(re.Match("b"));
+    EXPECT_TRUE(re.Match("bb"));
+    EXPECT_TRUE(re.Match("ab"));
+    EXPECT_TRUE(re.Match("aabb"));
+    EXPECT_FALSE(re.Match("ba"));
+}
+
+TEST(Regex, ZeroOrMore4) {
+    Regex re("a(ab)*b");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("ab"));
+    EXPECT_TRUE(re.Match("aabb"));
+    EXPECT_TRUE(re.Match("aababb"));
+    EXPECT_FALSE(re.Match(""));
+    EXPECT_FALSE(re.Match("aa"));
+}
+
+TEST(Regex, OneOrMore1) {
+    Regex re("a+");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("a"));
+    EXPECT_TRUE(re.Match("aa"));
+    EXPECT_FALSE(re.Match(""));
+    EXPECT_FALSE(re.Match("b"));
+    EXPECT_FALSE(re.Match("ab"));
+}
+
+TEST(Regex, OneOrMore2) {
+    Regex re("a+b");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("ab"));
+    EXPECT_TRUE(re.Match("aab"));
+    EXPECT_FALSE(re.Match(""));
+    EXPECT_FALSE(re.Match("b"));
+}
+
+TEST(Regex, OneOrMore3) {
+    Regex re("a+b+");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("ab"));
+    EXPECT_TRUE(re.Match("aab"));
+    EXPECT_TRUE(re.Match("abb"));
+    EXPECT_TRUE(re.Match("aabb"));
+    EXPECT_FALSE(re.Match(""));
+    EXPECT_FALSE(re.Match("b"));
+    EXPECT_FALSE(re.Match("ba"));
+}
+
+TEST(Regex, OneOrMore4) {
+    Regex re("a(ab)+b");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("aabb"));
+    EXPECT_TRUE(re.Match("aababb"));
+    EXPECT_FALSE(re.Match(""));
+    EXPECT_FALSE(re.Match("ab"));
+}