From 06853b89b07d02708d80721f169462fe29f07295 Mon Sep 17 00:00:00 2001 From: miloyip Date: Mon, 25 May 2015 22:51:03 +0800 Subject: [PATCH] Add any character (.) to regex --- include/rapidjson/internal/regex.h | 17 ++++++++++++++--- test/unittest/regextest.cpp | 16 ++++++++++++++-- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index 184da81..a91cadd 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -76,7 +76,7 @@ public: next->Clear(); for (const SizeType* s = current->template Bottom(); s != current->template End(); ++s) { const State& sr = GetState(*s); - if (sr.codepoint == codepoint) + if (sr.codepoint == kAnyCharacterClass || sr.codepoint == codepoint) AddState(stateSet, *next, sr.out); } Stack* temp = current; @@ -108,6 +108,8 @@ private: kLeftParenthesis }; + static const unsigned kAnyCharacterClass = 0xFFFFFFFF; //!< For '.' + struct State { SizeType out; //!< Equals to kInvalid for matching state SizeType out1; //!< Equals to non-kInvalid for split @@ -168,6 +170,11 @@ private: } } + void PushOperand(Stack& operandStack, unsigned codepoint) { + SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint); + *operandStack.template Push() = Frag(s, s); + } + template void Parse(InputStream& is) { Allocator allocator; @@ -219,9 +226,13 @@ private: return; break; + case '.': + PushOperand(operandStack, kAnyCharacterClass); + ImplicitConcatenation(atomCountStack, operatorStack); + break; + default: - SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint); - *operandStack.template Push() = Frag(s, s); + PushOperand(operandStack, codepoint); ImplicitConcatenation(atomCountStack, operatorStack); } } diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp index 979e230..52735cb 100644 --- a/test/unittest/regextest.cpp +++ b/test/unittest/regextest.cpp @@ -220,13 +220,25 @@ TEST(Regex, OneOrMore4) { EXPECT_FALSE(re.Match("ab")); } -TEST(Regex, Unicode) { #define EURO "\xE2\x82\xAC" // "\xE2\x82\xAC" is UTF-8 sequence of Euro sign U+20AC + +TEST(Regex, Unicode) { Regex re("a" EURO "+b"); ASSERT_TRUE(re.IsValid()); EXPECT_TRUE(re.Match("a" EURO "b")); EXPECT_TRUE(re.Match("a" EURO EURO "b")); EXPECT_FALSE(re.Match("a?b")); EXPECT_FALSE(re.Match("a" EURO "\xAC" "b")); // unaware of UTF-8 will match -#undef EURO } + +TEST(Regex, AnyCharacter) { + Regex re("."); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("a")); + EXPECT_TRUE(re.Match("b")); + EXPECT_TRUE(re.Match(EURO)); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("aa")); +} + +#undef EURO