diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h
index b61aaaa..f757cfe 100644
--- a/include/rapidjson/internal/regex.h
+++ b/include/rapidjson/internal/regex.h
@@ -90,6 +90,13 @@ public:
     }
 
 private:
+    //! Operators kept on the operator stack. Order matters: Parse() compares them with '<'.
+    enum Operator {
+        kConcatenation,
+        kAlternation,
+        kLeftParenthesis
+    };
+
     struct State {
         SizeType out;     //!< Equals to kInvalid for match
         SizeType out1;    //!< Equals to non-kInvalid for split
@@ -155,52 +162,94 @@ private:
     void Parse(InputStream& is) {
         Allocator allocator;
         Stack<Allocator> operandStack(&allocator, 256);     // Frag
-        Stack<Allocator> operatorStack(&allocator, 256);    // char
+        Stack<Allocator> operatorStack(&allocator, 256);    // Operator
+        Stack<Allocator> atomCountStack(&allocator, 256);   // unsigned (Atom per parenthesis)
+
+        *atomCountStack.template Push<unsigned>() = 0;
 
         unsigned codepoint;
-        bool previousOperand = false;
         while (Encoding::Decode(is, &codepoint) && codepoint != 0) {
             switch (codepoint) {
                 case '|':
-                    *operatorStack.template Push<char>() = '|';
-                    previousOperand = false;
+                    // Reduce pending concatenations first; kConcatenation sorts below kAlternation.
+                    while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() < kAlternation)
+                        if (!Eval(operandStack, operatorStack))
+                            return;
+                    *operatorStack.template Push<Operator>() = kAlternation;
+                    *atomCountStack.template Top<unsigned>() = 0; // right-hand side starts with no atom
+                    break;
+
+                case '(':
+                    *operatorStack.template Push<Operator>() = kLeftParenthesis;
+                    *atomCountStack.template Push<unsigned>() = 0;
+                    break;
+
+                case ')':
+                    while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() != kLeftParenthesis)
+                        if (!Eval(operandStack, operatorStack))
+                            return;
+                    if (operatorStack.Empty()) // no matching '('
+                        return;
+                    operatorStack.template Pop<Operator>(1);
+                    atomCountStack.template Pop<unsigned>(1);
+                    ImplicitConcatenation(atomCountStack, operatorStack); // the closed group counts as one atom
                     break;
 
                 default:
                     SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint);
-                    // concatenation with previous operand
-                    if (previousOperand) {
-                        Frag* e = operandStack.template Top<Frag>();
-                        Patch(e->out, s);
-                        e->out = s;
-                    }
-                    else
-                        *operandStack.template Push<Frag>() = Frag(s, s);
-                    previousOperand = true;
+                    *operandStack.template Push<Frag>() = Frag(s, s);
+                    ImplicitConcatenation(atomCountStack, operatorStack);
             }
         }
 
-        while (!operatorStack.Empty()) {
-            switch (*operatorStack.template Pop<char>(1)) {
-                case '|':
-                    {
-                        Frag e2 = *operandStack.template Pop<Frag>(1);
-                        Frag e1 = *operandStack.template Pop<Frag>(1);
-                        SizeType s = NewState(e1.start, e2.start, 0);
-                        *operandStack.template Push<Frag>() = Frag(s, Append(e1.out, e2.out));
-                    }
-                    break;
-            }
-        }
+        while (!operatorStack.Empty())
+            if (!Eval(operandStack, operatorStack))
+                return;
 
         // Link the operand to matching state.
         if (operandStack.GetSize() == sizeof(Frag)) {
             Frag* e = operandStack.template Pop<Frag>(1);
             Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0));
             root_ = e->start;
         }
     }
 
+    //! Pops one operator and applies it to the two topmost fragments of the operand stack.
+    /*! \return false for kLeftParenthesis or when fewer than two fragments are available. */
+    bool Eval(Stack<Allocator>& operandStack, Stack<Allocator>& operatorStack) {
+        switch (*operatorStack.template Pop<Operator>(1)) {
+            case kConcatenation:
+                if (operandStack.GetSize() >= sizeof(Frag) * 2) {
+                    Frag e2 = *operandStack.template Pop<Frag>(1);
+                    Frag e1 = *operandStack.template Pop<Frag>(1);
+                    Patch(e1.out, e2.start);
+                    *operandStack.template Push<Frag>() = Frag(e1.start, e2.out);
+                    return true;
+                }
+                return false;
+
+            case kAlternation:
+                if (operandStack.GetSize() >= sizeof(Frag) * 2) {
+                    Frag e2 = *operandStack.template Pop<Frag>(1);
+                    Frag e1 = *operandStack.template Pop<Frag>(1);
+                    SizeType s = NewState(e1.start, e2.start, 0);
+                    *operandStack.template Push<Frag>() = Frag(s, Append(e1.out, e2.out));
+                    return true;
+                }
+                return false;
+
+            default:
+                return false;
+        }
+    }
+
+    //! Pushes kConcatenation when the current group already holds an atom, then increments that group's atom count.
+    void ImplicitConcatenation(Stack<Allocator>& atomCountStack, Stack<Allocator>& operatorStack) {
+        if (*atomCountStack.template Top<unsigned>())
+            *operatorStack.template Push<Operator>() = kConcatenation;
+        (*atomCountStack.template Top<unsigned>())++;
+    }
+
     Stack<Allocator> states_;
     SizeType root_;
     SizeType stateCount_;
diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp
index 7c67c0f..658bbc2 100644
--- a/test/unittest/regextest.cpp
+++ b/test/unittest/regextest.cpp
@@ -19,6 +19,7 @@ using namespace rapidjson::internal;
 
 TEST(Regex, concatenation) {
     Regex re("abc");
+    ASSERT_TRUE(re.IsValid());
     EXPECT_TRUE(re.Match("abc"));
     EXPECT_FALSE(re.Match(""));
     EXPECT_FALSE(re.Match("a"));
@@ -27,24 +28,59 @@ TEST(Regex, concatenation) {
     EXPECT_FALSE(re.Match("abcd"));
 }
 
-TEST(Regex, split) {
-    {
-        Regex re("abab|abbb");
-        EXPECT_TRUE(re.Match("abab"));
-        EXPECT_TRUE(re.Match("abbb"));
-        EXPECT_FALSE(re.Match(""));
-        EXPECT_FALSE(re.Match("ab"));
-        EXPECT_FALSE(re.Match("ababa"));
-        EXPECT_FALSE(re.Match("abb"));
-        EXPECT_FALSE(re.Match("abbbb"));
-    }
-    {
-        Regex re("a|b|c");
-        EXPECT_TRUE(re.Match("a"));
-        EXPECT_TRUE(re.Match("b"));
-        EXPECT_TRUE(re.Match("c"));
-        EXPECT_FALSE(re.Match(""));
-        EXPECT_FALSE(re.Match("aa"));
-        EXPECT_FALSE(re.Match("ab"));
-    }
+TEST(Regex, split1) {
+    Regex re("abab|abbb");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("abab"));
+    EXPECT_TRUE(re.Match("abbb"));
+    EXPECT_FALSE(re.Match(""));
+    EXPECT_FALSE(re.Match("ab"));
+    EXPECT_FALSE(re.Match("ababa"));
+    EXPECT_FALSE(re.Match("abb"));
+    EXPECT_FALSE(re.Match("abbbb"));
+}
+
+TEST(Regex, split2) {
+    Regex re("a|b|c");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("a"));
+    EXPECT_TRUE(re.Match("b"));
+    EXPECT_TRUE(re.Match("c"));
+    EXPECT_FALSE(re.Match(""));
+    EXPECT_FALSE(re.Match("aa"));
+    EXPECT_FALSE(re.Match("ab"));
+}
+
+TEST(Regex, parenthesis1) {
+    Regex re("(ab)c");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("abc"));
+    EXPECT_FALSE(re.Match(""));
+    EXPECT_FALSE(re.Match("a"));
+    EXPECT_FALSE(re.Match("b"));
+    EXPECT_FALSE(re.Match("ab"));
+    EXPECT_FALSE(re.Match("abcd"));
+}
+
+TEST(Regex, parenthesis2) {
+    Regex re("a(bc)");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("abc"));
+    EXPECT_FALSE(re.Match(""));
+    EXPECT_FALSE(re.Match("a"));
+    EXPECT_FALSE(re.Match("b"));
+    EXPECT_FALSE(re.Match("ab"));
+    EXPECT_FALSE(re.Match("abcd"));
+}
+
+TEST(Regex, parenthesis3) {
+    Regex re("(a|b)(c|d)");
+    ASSERT_TRUE(re.IsValid());
+    EXPECT_TRUE(re.Match("ac"));
+    EXPECT_TRUE(re.Match("ad"));
+    EXPECT_TRUE(re.Match("bc"));
+    EXPECT_TRUE(re.Match("bd"));
+    EXPECT_FALSE(re.Match(""));
+    EXPECT_FALSE(re.Match("ab"));
+    EXPECT_FALSE(re.Match("cd"));
 }