diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index a91cadd..4d31180 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -29,23 +29,30 @@ namespace internal { // GenericRegex static const SizeType kRegexInvalidState = ~SizeType(0); //!< Represents an invalid index in GenericRegex::State::out, out1 +static const SizeType kRegexInvalidRange = ~SizeType(0); -//! Regular expression engine. +//! Regular expression engine with subset of ECMAscript grammar. /*! Supported regular expression syntax: - - \c ab Concatenation - - \c a|b Alternation - - \c a? Zero or one - - \c a* Zero or more - - \c a+ One or more - - \c (ab)* Parenthesis grouping + - \c ab Concatenation + - \c a|b Alternation + - \c a? Zero or one + - \c a* Zero or more + - \c a+ One or more + - \c (ab)* Grouping + - \c . Any character + - \c [abc] Character classes + - \c [a-c] Character class range + - \c [a-z0-9_] Character class combination + - \c [^abc] Negated character classes + - \c [^a-c] Negated character class range */ template class GenericRegex { public: typedef typename Encoding::Ch Ch; - GenericRegex(const Ch* source, Allocator* allocator = 0) : states_(allocator, 256), root_(kRegexInvalidState), stateCount_() { + GenericRegex(const Ch* source, Allocator* allocator = 0) : states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(),rangeCount_() { StringStream is(source); Parse(is); } @@ -76,8 +83,12 @@ public: next->Clear(); for (const SizeType* s = current->template Bottom(); s != current->template End(); ++s) { const State& sr = GetState(*s); - if (sr.codepoint == kAnyCharacterClass || sr.codepoint == codepoint) + if (sr.codepoint == codepoint || + sr.codepoint == kAnyCharacterClass || + (sr.codepoint == kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint))) + { AddState(stateSet, *next, sr.out); + } } Stack* temp = current; current = next; @@ -109,10 +120,19 @@ private: }; static const unsigned kAnyCharacterClass = 0xFFFFFFFF; //!< For '.' + static const unsigned kRangeCharacterClass = 0xFFFFFFFE; + static const unsigned kRangeNegationFlag = 0x80000000; + + struct Range { + unsigned start; // + unsigned end; + SizeType next; + }; struct State { SizeType out; //!< Equals to kInvalid for matching state SizeType out1; //!< Equals to non-kInvalid for split + SizeType rangeStart; unsigned codepoint; }; @@ -132,6 +152,16 @@ private: return states_.template Bottom()[index]; } + Range& GetRange(SizeType index) { + RAPIDJSON_ASSERT(index < rangeCount_); + return ranges_.template Bottom()[index]; + } + + const Range& GetRange(SizeType index) const { + RAPIDJSON_ASSERT(index < rangeCount_); + return ranges_.template Bottom()[index]; + } + void AddState(unsigned* stateSet, Stack& l, SizeType index) const { if (index == kRegexInvalidState) return; @@ -147,34 +177,17 @@ private: } } - SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) { - State* s = states_.template Push(); - s->out = out; - s->out1 = out1; - s->codepoint = codepoint; - return stateCount_++; - } - - SizeType Append(SizeType l1, SizeType l2) { - SizeType old = l1; - while (GetState(l1).out != kRegexInvalidState) - l1 = GetState(l1).out; - GetState(l1).out = l2; - return old; - } - - void Patch(SizeType l, SizeType s) { - for (SizeType next; l != kRegexInvalidState; l = next) { - next = GetState(l).out; - GetState(l).out = s; + bool MatchRange(SizeType rangeIndex, unsigned codepoint) const { + bool yes = (GetRange(rangeIndex).start & kRangeNegationFlag) == 0; + while (rangeIndex != kRegexInvalidRange) { + const Range& r = GetRange(rangeIndex); + if (codepoint >= (r.start & ~kRangeNegationFlag) && codepoint <= r.end) + return yes; + rangeIndex = r.next; } + return !yes; } - void PushOperand(Stack& operandStack, unsigned codepoint) { - SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint); - *operandStack.template Push() = Frag(s, s); - } - template void Parse(InputStream& is) { Allocator allocator; @@ -231,6 +244,18 @@ private: ImplicitConcatenation(atomCountStack, operatorStack); break; + case '[': + { + SizeType range; + if (!ParseRange(is, &range)) + return; + SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, kRangeCharacterClass); + GetState(s).rangeStart = range; + *operandStack.template Push() = Frag(s, s); + } + ImplicitConcatenation(atomCountStack, operatorStack); + break; + default: PushOperand(operandStack, codepoint); ImplicitConcatenation(atomCountStack, operatorStack); @@ -258,6 +283,41 @@ private: } } + SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) { + State* s = states_.template Push(); + s->out = out; + s->out1 = out1; + s->codepoint = codepoint; + s->rangeStart = kRegexInvalidRange; + return stateCount_++; + } + + void PushOperand(Stack& operandStack, unsigned codepoint) { + SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint); + *operandStack.template Push() = Frag(s, s); + } + + void ImplicitConcatenation(Stack& atomCountStack, Stack& operatorStack) { + if (*atomCountStack.template Top()) + *operatorStack.template Push() = kConcatenation; + (*atomCountStack.template Top())++; + } + + SizeType Append(SizeType l1, SizeType l2) { + SizeType old = l1; + while (GetState(l1).out != kRegexInvalidState) + l1 = GetState(l1).out; + GetState(l1).out = l2; + return old; + } + + void Patch(SizeType l, SizeType s) { + for (SizeType next; l != kRegexInvalidState; l = next) { + next = GetState(l).out; + GetState(l).out = s; + } + } + bool Eval(Stack& operandStack, Operator op) { switch (op) { case kConcatenation: @@ -314,15 +374,72 @@ private: } } - void ImplicitConcatenation(Stack& atomCountStack, Stack& operatorStack) { - if (*atomCountStack.template Top()) - *operatorStack.template Push() = kConcatenation; - (*atomCountStack.template Top())++; + template + bool ParseRange(InputStream& is, SizeType* range) { + bool isBegin = true; + bool negate = false; + int step = 0; + SizeType start = kRegexInvalidRange; + SizeType current = kRegexInvalidRange; + unsigned codepoint; + while (Encoding::Decode(is, &codepoint) && codepoint != 0) { + if (isBegin && codepoint == '^') + negate = true; + else if (codepoint == ']') { + if (step == 2) { // Add trailing '-' + SizeType r = NewRange('-'); + RAPIDJSON_ASSERT(current != kRegexInvalidRange); + GetRange(current).next = r; + } + if (negate) + GetRange(start).start |= kRangeNegationFlag; + *range = start; + return true; + } + else { + switch (step) { + case 1: + if (codepoint == '-') { + step++; + break; + } + // fall through to step 0 for other characters + + case 0: + { + SizeType r = NewRange(codepoint); + if (current != kRegexInvalidRange) + GetRange(current).next = r; + if (start == kRegexInvalidRange) + start = r; + current = r; + } + step = 1; + break; + + default: + RAPIDJSON_ASSERT(step == 2); + GetRange(current).end = codepoint; + step = 0; + } + } + isBegin = false; + } + return false; + } + + SizeType NewRange(unsigned codepoint) { + Range* r = ranges_.template Push(); + r->start = r->end = codepoint; + r->next = kRegexInvalidRange; + return rangeCount_++; } Stack states_; + Stack ranges_; SizeType root_; SizeType stateCount_; + SizeType rangeCount_; }; typedef GenericRegex > Regex; diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp index 52735cb..8818117 100644 --- a/test/unittest/regextest.cpp +++ b/test/unittest/regextest.cpp @@ -241,4 +241,90 @@ TEST(Regex, AnyCharacter) { EXPECT_FALSE(re.Match("aa")); } +TEST(Regex, CharacterRange1) { + Regex re("[abc]"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("a")); + EXPECT_TRUE(re.Match("b")); + EXPECT_TRUE(re.Match("c")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("`")); + EXPECT_FALSE(re.Match("d")); + EXPECT_FALSE(re.Match("aa")); +} + +TEST(Regex, CharacterRange2) { + Regex re("[^abc]"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("`")); + EXPECT_TRUE(re.Match("d")); + EXPECT_FALSE(re.Match("a")); + EXPECT_FALSE(re.Match("b")); + EXPECT_FALSE(re.Match("c")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("aa")); +} + +TEST(Regex, CharacterRange3) { + Regex re("[a-c]"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("a")); + EXPECT_TRUE(re.Match("b")); + EXPECT_TRUE(re.Match("c")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("`")); + EXPECT_FALSE(re.Match("d")); + EXPECT_FALSE(re.Match("aa")); +} + +TEST(Regex, CharacterRange4) { + Regex re("[^a-c]"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("`")); + EXPECT_TRUE(re.Match("d")); + EXPECT_FALSE(re.Match("a")); + EXPECT_FALSE(re.Match("b")); + EXPECT_FALSE(re.Match("c")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("aa")); +} + +TEST(Regex, CharacterRange5) { + Regex re("[-]"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("-")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("a")); +} + +TEST(Regex, CharacterRange6) { + Regex re("[a-]"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("a")); + EXPECT_TRUE(re.Match("-")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("`")); + EXPECT_FALSE(re.Match("b")); +} + +TEST(Regex, CharacterRange7) { + Regex re("[-a]"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("a")); + EXPECT_TRUE(re.Match("-")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("`")); + EXPECT_FALSE(re.Match("b")); +} + +TEST(Regex, CharacterRange8) { + Regex re("[a-zA-Z0-9]*"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("Milo")); + EXPECT_TRUE(re.Match("MT19937")); + EXPECT_TRUE(re.Match("43")); + EXPECT_FALSE(re.Match("a_b")); + EXPECT_FALSE(re.Match("!")); +} + #undef EURO