Add parenthesis support in regex
This commit is contained in:
parent
0bef29a5f6
commit
05c79891d1
@ -90,6 +90,12 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
//! Operators kept on the parser's operator stack.
/*! The enumerator order is significant: Parse() compares values with `<`
    (e.g. `top < kAlternation`) to decide which pending operators must be
    evaluated first, so a smaller value binds tighter.
*/
enum Operator {
    kConcatenation = 0,     //!< Implicit operator between two adjacent atoms
    kAlternation = 1,       //!< '|'
    kLeftParenthesis = 2    //!< '(' marker; never evaluated, only popped by ')'
};
|
||||||
|
|
||||||
struct State {
|
struct State {
|
||||||
SizeType out; //!< Equals to kInvalid for match
|
SizeType out; //!< Equals to kInvalid for match
|
||||||
SizeType out1; //!< Equals to non-kInvalid for split
|
SizeType out1; //!< Equals to non-kInvalid for split
|
||||||
@ -155,52 +161,96 @@ private:
|
|||||||
//! Compiles the pattern read from \c is into states and sets \c root_.
/*! Three stacks drive the parse:
    - operandStack holds Frag values (partially built fragments),
    - operatorStack holds pending Operator values,
    - atomCountStack holds, per open parenthesis level, how many atoms have
      been seen since the level started or since its last '|'.

    On any syntax error (operand underflow in Eval(), unmatched parenthesis)
    the function returns early and \c root_ is left unassigned by this function.
    NOTE(review): presumably callers detect that through IsValid() - confirm.
*/
void Parse(InputStream& is) {
    Allocator allocator;
    Stack<Allocator> operandStack(&allocator, 256);     // Frag
    Stack<Allocator> operatorStack(&allocator, 256);    // Operator
    Stack<Allocator> atomCountStack(&allocator, 256);   // unsigned (Atom per parenthesis)

    // Atom counter for the outermost (non-parenthesized) level.
    *atomCountStack.template Push<unsigned>() = 0;

    unsigned codepoint;
    while (Encoding::Decode(is, &codepoint) && codepoint != 0) {
        switch (codepoint) {
            case '|':
                // Evaluate everything that binds tighter than alternation first.
                while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() < kAlternation)
                    if (!Eval(operandStack, operatorStack))
                        return;
                *operatorStack.template Push<Operator>() = kAlternation;
                // A new alternative starts: its first atom must not be concatenated
                // to the previous alternative.
                *atomCountStack.template Top<unsigned>() = 0;
                break;

            case '(':
                *operatorStack.template Push<Operator>() = kLeftParenthesis;
                *atomCountStack.template Push<unsigned>() = 0;
                break;

            case ')':
                // Reduce the group down to its opening parenthesis.
                while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() != kLeftParenthesis)
                    if (!Eval(operandStack, operatorStack))
                        return;
                if (operatorStack.Empty())  // no matching '('
                    return;
                operatorStack.template Pop<Operator>(1);
                atomCountStack.template Pop<unsigned>(1);
                // The whole group counts as one atom of the enclosing level.
                ImplicitConcatenation(atomCountStack, operatorStack);
                break;

            default: {
                // Any other codepoint is a literal: a single state is its own fragment.
                SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint);
                *operandStack.template Push<Frag>() = Frag(s, s);
                ImplicitConcatenation(atomCountStack, operatorStack);
                break;
            }
        }
    }

    // Evaluate whatever is still pending; a leftover '(' makes Eval() fail.
    while (!operatorStack.Empty())
        if (!Eval(operandStack, operatorStack))
            return;

    // Link the operand to matching state.
    if (operandStack.GetSize() == sizeof(Frag)) {
        Frag* e = operandStack.template Pop<Frag>(1);
        Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0));
        root_ = e->start;
    }
}
|
||||||
|
|
||||||
|
//! Pops one operator and applies it to the two topmost fragments.
/*! \param operandStack  Stack of Frag; on success two are popped and the combined one is pushed.
    \param operatorStack Stack of Operator; must not be empty. Its top is always popped,
                         even when evaluation fails.
    \return true on success; false if fewer than two fragments are available or the
            operator cannot be evaluated (e.g. kLeftParenthesis, i.e. an unmatched '(').
*/
bool Eval(Stack<Allocator>& operandStack, Stack<Allocator>& operatorStack) {
    const Operator op = *operatorStack.template Pop<Operator>(1);

    // Only the two binary operators can be evaluated.
    if (op != kConcatenation && op != kAlternation)
        return false;

    if (operandStack.GetSize() < sizeof(Frag) * 2)
        return false;

    // Operands come off in reverse order: right-hand side first.
    Frag e2 = *operandStack.template Pop<Frag>(1);
    Frag e1 = *operandStack.template Pop<Frag>(1);

    if (op == kConcatenation) {
        // Route e1's outgoing transitions into e2 (presumably what Patch does - confirm).
        Patch(e1.out, e2.start);
        *operandStack.template Push<Frag>() = Frag(e1.start, e2.out);
    }
    else {
        // New state with both out and out1 set, i.e. a split into either alternative.
        SizeType s = NewState(e1.start, e2.start, 0);
        *operandStack.template Push<Frag>() = Frag(s, Append(e1.out, e2.out));
    }
    return true;
}
|
||||||
|
|
||||||
|
//! Registers one more atom at the current parenthesis level.
/*! If an atom has already been counted at this level, a kConcatenation operator
    is pushed so the new atom gets joined to what precedes it. The level's atom
    counter (top of \c atomCountStack, which must not be empty) is then incremented.
*/
void ImplicitConcatenation(Stack<Allocator>& atomCountStack, Stack<Allocator>& operatorStack) {
    unsigned* atomCount = atomCountStack.template Top<unsigned>();
    if (*atomCount != 0)
        *operatorStack.template Push<Operator>() = kConcatenation;
    ++*atomCount;
}
|
||||||
|
|
||||||
Stack<Allocator> states_;
|
Stack<Allocator> states_;
|
||||||
SizeType root_;
|
SizeType root_;
|
||||||
SizeType stateCount_;
|
SizeType stateCount_;
|
||||||
|
@ -19,6 +19,7 @@ using namespace rapidjson::internal;
|
|||||||
|
|
||||||
TEST(Regex, concatenation) {
|
TEST(Regex, concatenation) {
|
||||||
Regex re("abc");
|
Regex re("abc");
|
||||||
|
ASSERT_TRUE(re.IsValid());
|
||||||
EXPECT_TRUE(re.Match("abc"));
|
EXPECT_TRUE(re.Match("abc"));
|
||||||
EXPECT_FALSE(re.Match(""));
|
EXPECT_FALSE(re.Match(""));
|
||||||
EXPECT_FALSE(re.Match("a"));
|
EXPECT_FALSE(re.Match("a"));
|
||||||
@ -27,9 +28,9 @@ TEST(Regex, concatenation) {
|
|||||||
EXPECT_FALSE(re.Match("abcd"));
|
EXPECT_FALSE(re.Match("abcd"));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(Regex, split) {
|
TEST(Regex, split1) {
|
||||||
{
|
|
||||||
Regex re("abab|abbb");
|
Regex re("abab|abbb");
|
||||||
|
ASSERT_TRUE(re.IsValid());
|
||||||
EXPECT_TRUE(re.Match("abab"));
|
EXPECT_TRUE(re.Match("abab"));
|
||||||
EXPECT_TRUE(re.Match("abbb"));
|
EXPECT_TRUE(re.Match("abbb"));
|
||||||
EXPECT_FALSE(re.Match(""));
|
EXPECT_FALSE(re.Match(""));
|
||||||
@ -37,14 +38,49 @@ TEST(Regex, split) {
|
|||||||
EXPECT_FALSE(re.Match("ababa"));
|
EXPECT_FALSE(re.Match("ababa"));
|
||||||
EXPECT_FALSE(re.Match("abb"));
|
EXPECT_FALSE(re.Match("abb"));
|
||||||
EXPECT_FALSE(re.Match("abbbb"));
|
EXPECT_FALSE(re.Match("abbbb"));
|
||||||
}
|
}
|
||||||
{
|
|
||||||
|
// Chained alternation of single characters: exactly one of a, b, c must match.
TEST(Regex, split2) {
    Regex re("a|b|c");
    ASSERT_TRUE(re.IsValid());
    EXPECT_TRUE(re.Match("a"));
    EXPECT_TRUE(re.Match("b"));
    EXPECT_TRUE(re.Match("c"));
    EXPECT_FALSE(re.Match(""));
    EXPECT_FALSE(re.Match("aa"));
    EXPECT_FALSE(re.Match("ab"));
}
|
||||||
|
|
||||||
|
// A leading group followed by a literal behaves like plain concatenation.
TEST(Regex, parenthesis1) {
    Regex re("(ab)c");
    ASSERT_TRUE(re.IsValid());
    EXPECT_TRUE(re.Match("abc"));
    EXPECT_FALSE(re.Match(""));
    EXPECT_FALSE(re.Match("a"));
    EXPECT_FALSE(re.Match("b"));
    EXPECT_FALSE(re.Match("ab"));
    EXPECT_FALSE(re.Match("abcd"));
}
|
||||||
|
|
||||||
|
// A literal followed by a trailing group behaves like plain concatenation.
TEST(Regex, parenthesis2) {
    Regex re("a(bc)");
    ASSERT_TRUE(re.IsValid());
    EXPECT_TRUE(re.Match("abc"));
    EXPECT_FALSE(re.Match(""));
    EXPECT_FALSE(re.Match("a"));
    EXPECT_FALSE(re.Match("b"));
    EXPECT_FALSE(re.Match("ab"));
    EXPECT_FALSE(re.Match("abcd"));
}
|
||||||
|
|
||||||
|
// Two adjacent groups, each an alternation: one choice from each, in order.
TEST(Regex, parenthesis3) {
    Regex re("(a|b)(c|d)");
    ASSERT_TRUE(re.IsValid());
    EXPECT_TRUE(re.Match("ac"));
    EXPECT_TRUE(re.Match("ad"));
    EXPECT_TRUE(re.Match("bc"));
    EXPECT_TRUE(re.Match("bd"));
    EXPECT_FALSE(re.Match(""));
    EXPECT_FALSE(re.Match("ab"));
    EXPECT_FALSE(re.Match("cd"));
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user