From 32295665739115f5149123bc5a4fe1de21786d18 Mon Sep 17 00:00:00 2001 From: Milo Yip Date: Wed, 20 May 2015 13:33:14 +0800 Subject: [PATCH 01/26] Add multiple SkipWhitespace perftest --- test/perftest/rapidjsontest.cpp | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/test/perftest/rapidjsontest.cpp b/test/perftest/rapidjsontest.cpp index 05ecf6e..9be966a 100644 --- a/test/perftest/rapidjsontest.cpp +++ b/test/perftest/rapidjsontest.cpp @@ -298,11 +298,28 @@ TEST_F(RapidJson, internal_Pow10) { EXPECT_GT(sum, 0.0); } -TEST_F(RapidJson, SIMD_SUFFIX(Whitespace)) { +TEST_F(RapidJson, SkipWhitespace_Basic) { for (size_t i = 0; i < kTrialCount; i++) { - Document doc; - ASSERT_TRUE(doc.Parse(whitespace_).IsArray()); - } + rapidjson::StringStream s(whitespace_); + while (s.Peek() == ' ' || s.Peek() == '\n' || s.Peek() == '\r' || s.Peek() == '\t') + s.Take(); + ASSERT_EQ('[', s.Peek()); + } +} + +TEST_F(RapidJson, SIMD_SUFFIX(SkipWhitespace)) { + for (size_t i = 0; i < kTrialCount; i++) { + rapidjson::StringStream s(whitespace_); + rapidjson::SkipWhitespace(s); + ASSERT_EQ('[', s.Peek()); + } +} + +TEST_F(RapidJson, SkipWhitespace_strspn) { + for (size_t i = 0; i < kTrialCount; i++) { + const char* s = whitespace_ + std::strspn(whitespace_, " \t\r\n"); + ASSERT_EQ('[', *s); + } } TEST_F(RapidJson, UTF8_Validate) { From f688b2b15279bc8face7f4d319fd545b129a3315 Mon Sep 17 00:00:00 2001 From: miloyip Date: Thu, 21 May 2015 13:25:37 +0800 Subject: [PATCH 02/26] Improve coverage of Pointer --- test/unittest/pointertest.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/unittest/pointertest.cpp b/test/unittest/pointertest.cpp index 0e050c0..312683a 100644 --- a/test/unittest/pointertest.cpp +++ b/test/unittest/pointertest.cpp @@ -873,7 +873,9 @@ TEST(Pointer, Erase) { d.Parse(kJson); EXPECT_FALSE(Pointer("").Erase(d)); + EXPECT_FALSE(Pointer("/nonexist").Erase(d)); EXPECT_FALSE(Pointer("/foo/nonexist").Erase(d)); + EXPECT_FALSE(Pointer("/foo/0/nonexist").Erase(d)); EXPECT_TRUE(Pointer("/foo/0").Erase(d)); EXPECT_EQ(1u, d["foo"].Size()); EXPECT_STREQ("baz", d["foo"][0].GetString()); @@ -881,6 +883,24 @@ TEST(Pointer, Erase) { EXPECT_TRUE(d["foo"].Empty()); EXPECT_TRUE(Pointer("/foo").Erase(d)); EXPECT_TRUE(Pointer("/foo").Get(d) == 0); + + Pointer("/a/0/b/0").Create(d); + + EXPECT_TRUE(Pointer("/a/0/b/0").Get(d) != 0); + EXPECT_TRUE(Pointer("/a/0/b/0").Erase(d)); + EXPECT_TRUE(Pointer("/a/0/b/0").Get(d) == 0); + + EXPECT_TRUE(Pointer("/a/0/b").Get(d) != 0); + EXPECT_TRUE(Pointer("/a/0/b").Erase(d)); + EXPECT_TRUE(Pointer("/a/0/b").Get(d) == 0); + + EXPECT_TRUE(Pointer("/a/0").Get(d) != 0); + EXPECT_TRUE(Pointer("/a/0").Erase(d)); + EXPECT_TRUE(Pointer("/a/0").Get(d) == 0); + + EXPECT_TRUE(Pointer("/a").Get(d) != 0); + EXPECT_TRUE(Pointer("/a").Erase(d)); + EXPECT_TRUE(Pointer("/a").Get(d) == 0); } TEST(Pointer, CreateValueByPointer) { From 1a570c342dad0927503b35782263da006ef93351 Mon Sep 17 00:00:00 2001 From: miloyip Date: Thu, 21 May 2015 16:00:32 +0800 Subject: [PATCH 03/26] Fix the undefined behaviour when negating the minimum value integers in Reader --- include/rapidjson/reader.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/rapidjson/reader.h b/include/rapidjson/reader.h index be0d9fb..8989e38 100644 --- a/include/rapidjson/reader.h +++ b/include/rapidjson/reader.h @@ -967,13 +967,13 @@ private: else { if (use64bit) { if (minus) - cont = handler.Int64(-(int64_t)i64); + cont = handler.Int64(static_cast(~i64 + 1)); else cont = handler.Uint64(i64); } else { if (minus) - cont = handler.Int(-(int)i); + cont = handler.Int(static_cast(~i + 1)); else cont = handler.Uint(i); } From 6e1d10ec6b1454737015ca2b018816185092efc4 Mon Sep 17 00:00:00 2001 From: miloyip Date: Thu, 21 May 2015 16:12:33 +0800 Subject: [PATCH 04/26] Add GenericValue::EraseMember(string types) APIs --- include/rapidjson/document.h | 25 +++++++++++++++++++++++++ test/unittest/valuetest.cpp | 18 ++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/include/rapidjson/document.h b/include/rapidjson/document.h index ca80963..889cdfa 100644 --- a/include/rapidjson/document.h +++ b/include/rapidjson/document.h @@ -1213,6 +1213,31 @@ public: return pos; } + //! Erase a member in object by its name. + /*! \param name Name of member to be removed. + \return Whether the member existed. + \note Linear time complexity. + */ + bool EraseMember(const Ch* name) { + GenericValue n(StringRef(name)); + return EraseMember(n); + } + +#if RAPIDJSON_HAS_STDSTRING + bool EraseMember(const std::basic_string& name) { return EraseMember(GenericValue(StringRef(name))); } +#endif + + template + bool EraseMember(const GenericValue& name) { + MemberIterator m = FindMember(name); + if (m != MemberEnd()) { + EraseMember(m); + return true; + } + else + return false; + } + //@} //!@name Array diff --git a/test/unittest/valuetest.cpp b/test/unittest/valuetest.cpp index 5e142e1..f14669a 100644 --- a/test/unittest/valuetest.cpp +++ b/test/unittest/valuetest.cpp @@ -1182,6 +1182,24 @@ TEST(Value, Object) { EXPECT_TRUE(z.IsObject()); } +TEST(Value, EraseMember_String) { + Value::AllocatorType allocator; + Value x(kObjectType); + x.AddMember("A", "Apple", allocator); + x.AddMember("B", "Banana", allocator); + + EXPECT_TRUE(x.EraseMember("B")); + EXPECT_FALSE(x.HasMember("B")); + + EXPECT_FALSE(x.EraseMember("nonexist")); + + GenericValue, CrtAllocator> othername("A"); + EXPECT_TRUE(x.EraseMember(othername)); + EXPECT_FALSE(x.HasMember("A")); + + EXPECT_TRUE(x.MemberBegin() == x.MemberEnd()); +} + TEST(Value, BigNestedArray) { MemoryPoolAllocator<> allocator; Value x(kArrayType); From a2d09f0a03cf58f62f90c0af1416a59b0ca5cf50 Mon Sep 17 00:00:00 2001 From: miloyip Date: Thu, 21 May 2015 16:13:02 +0800 Subject: [PATCH 05/26] Refactor GenericPointer::Erase() --- include/rapidjson/pointer.h | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/include/rapidjson/pointer.h b/include/rapidjson/pointer.h index 13e688e..5d2aa8d 100644 --- a/include/rapidjson/pointer.h +++ b/include/rapidjson/pointer.h @@ -686,34 +686,37 @@ public: ValueType* v = &root; const Token* last = tokens_ + (tokenCount_ - 1); - for (const Token *t = tokens_; t != tokens_ + tokenCount_; ++t) { + for (const Token *t = tokens_; t != last; ++t) { switch (v->GetType()) { case kObjectType: { typename ValueType::MemberIterator m = v->FindMember(GenericStringRef(t->name, t->length)); if (m == v->MemberEnd()) return false; - if (t == last) { - v->EraseMember(m); - return true; - } v = &m->value; } break; case kArrayType: if (t->index == kPointerInvalidIndex || t->index >= v->Size()) return false; - if (t == last) { - v->Erase(v->Begin() + t->index); - return true; - } v = &((*v)[t->index]); break; default: return false; } } - return false; + + switch (v->GetType()) { + case kObjectType: + return v->EraseMember(GenericStringRef(last->name, last->length)); + case kArrayType: + if (last->index == kPointerInvalidIndex || last->index >= v->Size()) + return false; + v->Erase(v->Begin() + last->index); + return true; + default: + return false; + } } private: From 7ddaa80e461f8069fc80a28254ffe40804d51451 Mon Sep 17 00:00:00 2001 From: miloyip Date: Thu, 21 May 2015 16:45:39 +0800 Subject: [PATCH 06/26] Improve coverage of GenericPointer::Erase() --- test/unittest/pointertest.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/unittest/pointertest.cpp b/test/unittest/pointertest.cpp index 312683a..0f78450 100644 --- a/test/unittest/pointertest.cpp +++ b/test/unittest/pointertest.cpp @@ -875,7 +875,10 @@ TEST(Pointer, Erase) { EXPECT_FALSE(Pointer("").Erase(d)); EXPECT_FALSE(Pointer("/nonexist").Erase(d)); EXPECT_FALSE(Pointer("/foo/nonexist").Erase(d)); + EXPECT_FALSE(Pointer("/foo/nonexist/nonexist").Erase(d)); EXPECT_FALSE(Pointer("/foo/0/nonexist").Erase(d)); + EXPECT_FALSE(Pointer("/foo/0/nonexist/nonexist").Erase(d)); + EXPECT_FALSE(Pointer("/foo/2/nonexist").Erase(d)); EXPECT_TRUE(Pointer("/foo/0").Erase(d)); EXPECT_EQ(1u, d["foo"].Size()); EXPECT_STREQ("baz", d["foo"][0].GetString()); From c8c8ad47c372dc724cbabcbaf5c62aee4618afeb Mon Sep 17 00:00:00 2001 From: miloyip Date: Thu, 21 May 2015 17:02:27 +0800 Subject: [PATCH 07/26] Further improve coverage of GenericPointer::Erase() --- test/unittest/pointertest.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/unittest/pointertest.cpp b/test/unittest/pointertest.cpp index 0f78450..7ec3f72 100644 --- a/test/unittest/pointertest.cpp +++ b/test/unittest/pointertest.cpp @@ -874,6 +874,7 @@ TEST(Pointer, Erase) { EXPECT_FALSE(Pointer("").Erase(d)); EXPECT_FALSE(Pointer("/nonexist").Erase(d)); + EXPECT_FALSE(Pointer("/nonexist/nonexist").Erase(d)); EXPECT_FALSE(Pointer("/foo/nonexist").Erase(d)); EXPECT_FALSE(Pointer("/foo/nonexist/nonexist").Erase(d)); EXPECT_FALSE(Pointer("/foo/0/nonexist").Erase(d)); From 0bef29a5f649637fd9a51bb758aaa62c57c12920 Mon Sep 17 00:00:00 2001 From: miloyip Date: Sun, 24 May 2015 21:23:39 +0800 Subject: [PATCH 08/26] Initial reggae implementation with only concatenation and alternation --- include/rapidjson/internal/regex.h | 214 +++++++++++++++++++++++++++++ include/rapidjson/internal/stack.h | 15 ++ test/unittest/CMakeLists.txt | 1 + test/unittest/regextest.cpp | 50 +++++++ 4 files changed, 280 insertions(+) create mode 100644 include/rapidjson/internal/regex.h create mode 100644 test/unittest/regextest.cpp diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h new file mode 100644 index 0000000..b61aaaa --- /dev/null +++ b/include/rapidjson/internal/regex.h @@ -0,0 +1,214 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef RAPIDJSON_INTERNAL_REGEX_H_ +#define RAPIDJSON_INTERNAL_REGEX_H_ + +#include "../rapidjson.h" +#include "stack.h" + +RAPIDJSON_NAMESPACE_BEGIN +namespace internal { + +/////////////////////////////////////////////////////////////////////////////// +// GenericRegex + +static const SizeType kRegexInvalidState = ~SizeType(0); //!< Represents an invalid index in GenericRegex::State::out, out1 + +template +class GenericRegex { +public: + typedef typename Encoding::Ch Ch; + + GenericRegex(const Ch* source, Allocator* allocator = 0) : states_(allocator, 256), root_(kRegexInvalidState), stateCount_() { + StringStream is(source); + Parse(is); + } + + ~GenericRegex() { + } + + bool IsValid() const { + return root_ != kRegexInvalidState; + } + + template + bool Match(InputStream& is) const { + RAPIDJSON_ASSERT(IsValid()); + Allocator allocator; + Stack state0(&allocator, stateCount_ * sizeof(SizeType)); + Stack state1(&allocator, stateCount_ * sizeof(SizeType)); + Stack *current = &state0, *next = &state1; + + const size_t stateSetSize = (stateCount_ + 31) / 32 * 4; + unsigned* stateSet = static_cast(allocator.Malloc(stateSetSize)); + std::memset(stateSet, 0, stateSetSize); + + AddState(stateSet, *current, root_); + + unsigned codepoint; + while (!current->Empty() && Encoding::Decode(is, &codepoint) && codepoint != 0) { + for (const SizeType* s = current->template Bottom(); s != current->template End(); ++s) { + const State& sr = GetState(*s); + // if (sr.out != kRegexInvalidState) + // printf("%c matches %c\n", (char)sr.codepoint, (char)codepoint); + + if (sr.out != kRegexInvalidState && sr.codepoint == codepoint) + AddState(stateSet, *next, sr.out); + } + Stack* temp = current; + current = next; + next = temp; + std::memset(stateSet, 0, stateSetSize); + next->Clear(); + // printf("\n"); + } + + Allocator::Free(stateSet); + + for (const SizeType* s = current->template Bottom(); s != current->template End(); ++s) + if (GetState(*s).out == kRegexInvalidState) + return true; + + return false; + } + + bool Match(const Ch* s) { + StringStream is(s); + return Match(is); + } + +private: + struct State { + SizeType out; //!< Equals to kInvalid for match + SizeType out1; //!< Equals to non-kInvalid for split + unsigned codepoint; + }; + + struct Frag { + Frag(SizeType s, SizeType o) : start(s), out(o) {} + SizeType start; + SizeType out; //!< link-list of all output states + }; + + State& GetState(SizeType index) { + RAPIDJSON_ASSERT(index < stateCount_); + return states_.template Bottom()[index]; + } + + const State& GetState(SizeType index) const { + RAPIDJSON_ASSERT(index < stateCount_); + return states_.template Bottom()[index]; + } + + void AddState(unsigned* stateSet, Stack& l, SizeType index) const { + if (index == kRegexInvalidState) + return; + + const State& s = GetState(index); + if (s.out1 != kRegexInvalidState) { // Split + AddState(stateSet, l, s.out); + AddState(stateSet, l, s.out1); + } + else if (!(stateSet[index >> 5] & (1 << (index & 31)))) { + stateSet[index >> 5] |= (1 << (index & 31)); + *l.template Push() = index; + } + } + + SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) { + State* s = states_.template Push(); + s->out = out; + s->out1 = out1; + s->codepoint = codepoint; + return stateCount_++; + } + + SizeType Append(SizeType l1, SizeType l2) { + SizeType old = l1; + while (GetState(l1).out != kRegexInvalidState) + l1 = GetState(l1).out; + GetState(l1).out = l2; + return old; + } + + void Patch(SizeType l, SizeType s) { + SizeType next; + for (; l != kRegexInvalidState; l = next) { + next = GetState(l).out; + GetState(l).out = s; + } + } + + template + void Parse(InputStream& is) { + Allocator allocator; + Stack operandStack(&allocator, 256); // Frag + Stack operatorStack(&allocator, 256); // char + + unsigned codepoint; + bool previousOperand = false; + while (Encoding::Decode(is, &codepoint) && codepoint != 0) { + switch (codepoint) { + case '|': + *operatorStack.template Push() = '|'; + previousOperand = false; + break; + + default: + SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint); + // concatenation with previous operand + if (previousOperand) { + Frag* e = operandStack.template Top(); + Patch(e->out, s); + e->out = s; + } + else + *operandStack.template Push() = Frag(s, s); + previousOperand = true; + } + } + + while (!operatorStack.Empty()) { + switch (*operatorStack.template Pop(1)) { + case '|': + { + Frag e2 = *operandStack.template Pop(1); + Frag e1 = *operandStack.template Pop(1); + SizeType s = NewState(e1.start, e2.start, 0); + *operandStack.template Push() = Frag(s, Append(e1.out, e2.out)); + } + break; + } + } + + // Link the operand to matching state. + if (operandStack.GetSize() == sizeof(Frag)) { + Frag* e = operandStack.template Pop(1); + Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0)); + root_ = e->start; + } + } + + Stack states_; + SizeType root_; + SizeType stateCount_; +}; + +typedef GenericRegex > Regex; + +} // namespace internal +RAPIDJSON_NAMESPACE_END + +#endif // RAPIDJSON_INTERNAL_REGEX_H_ diff --git a/include/rapidjson/internal/stack.h b/include/rapidjson/internal/stack.h index bb31cc0..f911588 100644 --- a/include/rapidjson/internal/stack.h +++ b/include/rapidjson/internal/stack.h @@ -121,9 +121,24 @@ public: return reinterpret_cast(stackTop_ - sizeof(T)); } + template + const T* Top() const { + RAPIDJSON_ASSERT(GetSize() >= sizeof(T)); + return reinterpret_cast(stackTop_ - sizeof(T)); + } + + template + T* End() { return reinterpret_cast(stackTop_); } + + template + const T* End() const { return reinterpret_cast(stackTop_); } + template T* Bottom() { return (T*)stack_; } + template + const T* Bottom() const { return (T*)stack_; } + Allocator& GetAllocator() { return *allocator_; } bool Empty() const { return stackTop_ == stack_; } size_t GetSize() const { return static_cast(stackTop_ - stack_); } diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index fb95b8e..d1734b4 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -11,6 +11,7 @@ set(UNITTEST_SOURCES pointertest.cpp prettywritertest.cpp readertest.cpp + regextest.cpp simdtest.cpp stringbuffertest.cpp strtodtest.cpp diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp new file mode 100644 index 0000000..7c67c0f --- /dev/null +++ b/test/unittest/regextest.cpp @@ -0,0 +1,50 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "unittest.h" +#include "rapidjson/internal/regex.h" + +using namespace rapidjson::internal; + +TEST(Regex, concatenation) { + Regex re("abc"); + EXPECT_TRUE(re.Match("abc")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("a")); + EXPECT_FALSE(re.Match("b")); + EXPECT_FALSE(re.Match("ab")); + EXPECT_FALSE(re.Match("abcd")); +} + +TEST(Regex, split) { + { + Regex re("abab|abbb"); + EXPECT_TRUE(re.Match("abab")); + EXPECT_TRUE(re.Match("abbb")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("ab")); + EXPECT_FALSE(re.Match("ababa")); + EXPECT_FALSE(re.Match("abb")); + EXPECT_FALSE(re.Match("abbbb")); + } + { + Regex re("a|b|c"); + EXPECT_TRUE(re.Match("a")); + EXPECT_TRUE(re.Match("b")); + EXPECT_TRUE(re.Match("c")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("aa")); + EXPECT_FALSE(re.Match("ab")); + } +} From 05c79891d113c07fe3e968e75aa14cc840358b68 Mon Sep 17 00:00:00 2001 From: miloyip Date: Mon, 25 May 2015 09:14:51 +0800 Subject: [PATCH 09/26] Add parenthesis support in regex --- include/rapidjson/internal/regex.h | 100 +++++++++++++++++++++-------- test/unittest/regextest.cpp | 76 ++++++++++++++++------ 2 files changed, 131 insertions(+), 45 deletions(-) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index b61aaaa..f757cfe 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -90,6 +90,12 @@ public: } private: + enum Operator { + kConcatenation, + kAlternation, + kLeftParenthesis, + }; + struct State { SizeType out; //!< Equals to kInvalid for match SizeType out1; //!< Equals to non-kInvalid for split @@ -155,52 +161,96 @@ private: void Parse(InputStream& is) { Allocator allocator; Stack operandStack(&allocator, 256); // Frag - Stack operatorStack(&allocator, 256); // char + Stack operatorStack(&allocator, 256); // Operator + Stack atomCountStack(&allocator, 256); // unsigned (Atom per parenthesis) + + *atomCountStack.template Push() = 0; unsigned codepoint; - bool previousOperand = false; while (Encoding::Decode(is, &codepoint) && codepoint != 0) { switch (codepoint) { case '|': - *operatorStack.template Push() = '|'; - previousOperand = false; + while (!operatorStack.Empty() && *operatorStack.template Top() < kAlternation) + if (!Eval(operandStack, operatorStack)) + return; + *operatorStack.template Push() = kAlternation; + *atomCountStack.template Top() = 0; + break; + + case '(': + *operatorStack.template Push() = kLeftParenthesis; + *atomCountStack.template Push() = 0; + break; + + case ')': + while (!operatorStack.Empty() && *operatorStack.template Top() != kLeftParenthesis) + if (!Eval(operandStack, operatorStack)) + return; + if (operatorStack.Empty()) + return; + operatorStack.template Pop(1); + atomCountStack.template Pop(1); + ImplicitConcatenation(atomCountStack, operatorStack); break; default: SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint); - // concatenation with previous operand - if (previousOperand) { - Frag* e = operandStack.template Top(); - Patch(e->out, s); - e->out = s; - } - else - *operandStack.template Push() = Frag(s, s); - previousOperand = true; + *operandStack.template Push() = Frag(s, s); + ImplicitConcatenation(atomCountStack, operatorStack); } } - while (!operatorStack.Empty()) { - switch (*operatorStack.template Pop(1)) { - case '|': - { - Frag e2 = *operandStack.template Pop(1); - Frag e1 = *operandStack.template Pop(1); - SizeType s = NewState(e1.start, e2.start, 0); - *operandStack.template Push() = Frag(s, Append(e1.out, e2.out)); - } - break; - } - } + while (!operatorStack.Empty()) + if (!Eval(operandStack, operatorStack)) + return; // Link the operand to matching state. if (operandStack.GetSize() == sizeof(Frag)) { Frag* e = operandStack.template Pop(1); Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0)); root_ = e->start; + // printf("root: %d\n", root_); + // for (SizeType i = 0; i < stateCount_ ; i++) { + // State& s = GetState(i); + // printf("[%2d] out: %2d out1: %2d c: '%c'\n", i, s.out, s.out1, (char)s.codepoint); + // } + // printf("\n"); } } + bool Eval(Stack& operandStack, Stack& operatorStack) { + switch (*operatorStack.template Pop(1)) { + case kConcatenation: + if (operandStack.GetSize() >= sizeof(Frag) * 2) { + Frag e2 = *operandStack.template Pop(1); + Frag e1 = *operandStack.template Pop(1); + Patch(e1.out, e2.start); + *operandStack.template Push() = Frag(e1.start, e2.out); + return true; + } + return false; + + case kAlternation: + if (operandStack.GetSize() >= sizeof(Frag) * 2) { + Frag e2 = *operandStack.template Pop(1); + Frag e1 = *operandStack.template Pop(1); + SizeType s = NewState(e1.start, e2.start, 0); + *operandStack.template Push() = Frag(s, Append(e1.out, e2.out)); + return true; + } + return false; + + default: + return false; + } + } + + void ImplicitConcatenation(Stack& atomCountStack, Stack& operatorStack) { + if (*atomCountStack.template Top()) + *operatorStack.template Push() = kConcatenation; + (*atomCountStack.template Top())++; + } + Stack states_; SizeType root_; SizeType stateCount_; diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp index 7c67c0f..658bbc2 100644 --- a/test/unittest/regextest.cpp +++ b/test/unittest/regextest.cpp @@ -19,6 +19,7 @@ using namespace rapidjson::internal; TEST(Regex, concatenation) { Regex re("abc"); + ASSERT_TRUE(re.IsValid()); EXPECT_TRUE(re.Match("abc")); EXPECT_FALSE(re.Match("")); EXPECT_FALSE(re.Match("a")); @@ -27,24 +28,59 @@ TEST(Regex, concatenation) { EXPECT_FALSE(re.Match("abcd")); } -TEST(Regex, split) { - { - Regex re("abab|abbb"); - EXPECT_TRUE(re.Match("abab")); - EXPECT_TRUE(re.Match("abbb")); - EXPECT_FALSE(re.Match("")); - EXPECT_FALSE(re.Match("ab")); - EXPECT_FALSE(re.Match("ababa")); - EXPECT_FALSE(re.Match("abb")); - EXPECT_FALSE(re.Match("abbbb")); - } - { - Regex re("a|b|c"); - EXPECT_TRUE(re.Match("a")); - EXPECT_TRUE(re.Match("b")); - EXPECT_TRUE(re.Match("c")); - EXPECT_FALSE(re.Match("")); - EXPECT_FALSE(re.Match("aa")); - EXPECT_FALSE(re.Match("ab")); - } +TEST(Regex, split1) { + Regex re("abab|abbb"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("abab")); + EXPECT_TRUE(re.Match("abbb")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("ab")); + EXPECT_FALSE(re.Match("ababa")); + EXPECT_FALSE(re.Match("abb")); + EXPECT_FALSE(re.Match("abbbb")); +} + +TEST(Regex, split2) { + Regex re("a|b|c"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("a")); + EXPECT_TRUE(re.Match("b")); + EXPECT_TRUE(re.Match("c")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("aa")); + EXPECT_FALSE(re.Match("ab")); +} + +TEST(Regex, parenthesis1) { + Regex re("(ab)c"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("abc")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("a")); + EXPECT_FALSE(re.Match("b")); + EXPECT_FALSE(re.Match("ab")); + EXPECT_FALSE(re.Match("abcd")); +} + +TEST(Regex, parenthesis2) { + Regex re("a(bc)"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("abc")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("a")); + EXPECT_FALSE(re.Match("b")); + EXPECT_FALSE(re.Match("ab")); + EXPECT_FALSE(re.Match("abcd")); +} + +TEST(Regex, parenthesis3) { + Regex re("(a|b)(c|d)"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("ac")); + EXPECT_TRUE(re.Match("ad")); + EXPECT_TRUE(re.Match("bc")); + EXPECT_TRUE(re.Match("bd")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("ab")); + EXPECT_FALSE(re.Match("cd")); } From a386934288f3bb537c12a257aae298294ecfe1d2 Mon Sep 17 00:00:00 2001 From: miloyip Date: Mon, 25 May 2015 19:34:47 +0800 Subject: [PATCH 10/26] Add ?*+ to regex --- include/rapidjson/internal/regex.h | 72 ++++++++++++-- test/unittest/regextest.cpp | 147 +++++++++++++++++++++++++++-- 2 files changed, 203 insertions(+), 16 deletions(-) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index f757cfe..c19adb1 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -54,11 +54,12 @@ public: const size_t stateSetSize = (stateCount_ + 31) / 32 * 4; unsigned* stateSet = static_cast(allocator.Malloc(stateSetSize)); std::memset(stateSet, 0, stateSetSize); - AddState(stateSet, *current, root_); unsigned codepoint; while (!current->Empty() && Encoding::Decode(is, &codepoint) && codepoint != 0) { + std::memset(stateSet, 0, stateSetSize); + next->Clear(); for (const SizeType* s = current->template Bottom(); s != current->template End(); ++s) { const State& sr = GetState(*s); // if (sr.out != kRegexInvalidState) @@ -70,8 +71,6 @@ public: Stack* temp = current; current = next; next = temp; - std::memset(stateSet, 0, stateSetSize); - next->Clear(); // printf("\n"); } @@ -91,9 +90,12 @@ public: private: enum Operator { + kZeroOrOne, + kZeroOrMore, + kOneOrMore, kConcatenation, kAlternation, - kLeftParenthesis, + kLeftParenthesis }; struct State { @@ -193,6 +195,24 @@ private: ImplicitConcatenation(atomCountStack, operatorStack); break; + case '?': + *operatorStack.template Push() = kZeroOrOne; + if (!Eval(operandStack, operatorStack)) + return; + break; + + case '*': + *operatorStack.template Push() = kZeroOrMore; + if (!Eval(operandStack, operatorStack)) + return; + break; + + case '+': + *operatorStack.template Push() = kOneOrMore; + if (!Eval(operandStack, operatorStack)) + return; + break; + default: SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint); *operandStack.template Push() = Frag(s, s); @@ -209,16 +229,19 @@ private: Frag* e = operandStack.template Pop(1); Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0)); root_ = e->start; - // printf("root: %d\n", root_); - // for (SizeType i = 0; i < stateCount_ ; i++) { - // State& s = GetState(i); - // printf("[%2d] out: %2d out1: %2d c: '%c'\n", i, s.out, s.out1, (char)s.codepoint); - // } - // printf("\n"); +#if 0 + printf("root: %d\n", root_); + for (SizeType i = 0; i < stateCount_ ; i++) { + State& s = GetState(i); + printf("[%2d] out: %2d out1: %2d c: '%c'\n", i, s.out, s.out1, (char)s.codepoint); + } + printf("\n"); +#endif } } bool Eval(Stack& operandStack, Stack& operatorStack) { + // printf("Eval %c\n", "?*+.|("[*operatorStack.template Top()]); switch (*operatorStack.template Pop(1)) { case kConcatenation: if (operandStack.GetSize() >= sizeof(Frag) * 2) { @@ -240,6 +263,35 @@ private: } return false; + case kZeroOrOne: + if (operandStack.GetSize() >= sizeof(Frag)) { + Frag e = *operandStack.template Pop(1); + SizeType s = NewState(kRegexInvalidState, e.start, 0); + *operandStack.template Push() = Frag(s, Append(e.out, s)); + return true; + } + return false; + + case kZeroOrMore: + if (operandStack.GetSize() >= sizeof(Frag)) { + Frag e = *operandStack.template Pop(1); + SizeType s = NewState(kRegexInvalidState, e.start, 0); + Patch(e.out, s); + *operandStack.template Push() = Frag(s, s); + return true; + } + return false; + + case kOneOrMore: + if (operandStack.GetSize() >= sizeof(Frag)) { + Frag e = *operandStack.template Pop(1); + SizeType s = NewState(kRegexInvalidState, e.start, 0); + Patch(e.out, s); + *operandStack.template Push() = Frag(e.start, s); + return true; + } + return false; + default: return false; } diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp index 658bbc2..1a1bffa 100644 --- a/test/unittest/regextest.cpp +++ b/test/unittest/regextest.cpp @@ -17,7 +17,7 @@ using namespace rapidjson::internal; -TEST(Regex, concatenation) { +TEST(Regex, Concatenation) { Regex re("abc"); ASSERT_TRUE(re.IsValid()); EXPECT_TRUE(re.Match("abc")); @@ -28,7 +28,7 @@ TEST(Regex, concatenation) { EXPECT_FALSE(re.Match("abcd")); } -TEST(Regex, split1) { +TEST(Regex, Alternation1) { Regex re("abab|abbb"); ASSERT_TRUE(re.IsValid()); EXPECT_TRUE(re.Match("abab")); @@ -40,7 +40,7 @@ TEST(Regex, split1) { EXPECT_FALSE(re.Match("abbbb")); } -TEST(Regex, split2) { +TEST(Regex, Alternation2) { Regex re("a|b|c"); ASSERT_TRUE(re.IsValid()); EXPECT_TRUE(re.Match("a")); @@ -51,7 +51,7 @@ TEST(Regex, split2) { EXPECT_FALSE(re.Match("ab")); } -TEST(Regex, parenthesis1) { +TEST(Regex, Parenthesis1) { Regex re("(ab)c"); ASSERT_TRUE(re.IsValid()); EXPECT_TRUE(re.Match("abc")); @@ -62,7 +62,7 @@ TEST(Regex, parenthesis1) { EXPECT_FALSE(re.Match("abcd")); } -TEST(Regex, parenthesis2) { +TEST(Regex, Parenthesis2) { Regex re("a(bc)"); ASSERT_TRUE(re.IsValid()); EXPECT_TRUE(re.Match("abc")); @@ -73,7 +73,7 @@ TEST(Regex, parenthesis2) { EXPECT_FALSE(re.Match("abcd")); } -TEST(Regex, parenthesis3) { +TEST(Regex, Parenthesis3) { Regex re("(a|b)(c|d)"); ASSERT_TRUE(re.IsValid()); EXPECT_TRUE(re.Match("ac")); @@ -84,3 +84,138 @@ TEST(Regex, parenthesis3) { EXPECT_FALSE(re.Match("ab")); EXPECT_FALSE(re.Match("cd")); } + +TEST(Regex, ZeroOrOne1) { + Regex re("a?"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("")); + EXPECT_TRUE(re.Match("a")); + EXPECT_FALSE(re.Match("aa")); +} + +TEST(Regex, ZeroOrOne2) { + Regex re("a?b"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("b")); + EXPECT_TRUE(re.Match("ab")); + EXPECT_FALSE(re.Match("a")); + EXPECT_FALSE(re.Match("aa")); + EXPECT_FALSE(re.Match("bb")); + EXPECT_FALSE(re.Match("ba")); +} + +TEST(Regex, ZeroOrOne3) { + Regex re("ab?"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("a")); + EXPECT_TRUE(re.Match("ab")); + EXPECT_FALSE(re.Match("b")); + EXPECT_FALSE(re.Match("aa")); + EXPECT_FALSE(re.Match("bb")); + EXPECT_FALSE(re.Match("ba")); +} + +TEST(Regex, ZeroOrOne4) { + Regex re("a?b?"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("")); + EXPECT_TRUE(re.Match("a")); + EXPECT_TRUE(re.Match("b")); + EXPECT_TRUE(re.Match("ab")); + EXPECT_FALSE(re.Match("aa")); + EXPECT_FALSE(re.Match("bb")); + EXPECT_FALSE(re.Match("ba")); + EXPECT_FALSE(re.Match("abc")); +} + +TEST(Regex, ZeroOrOne5) { + Regex re("a(ab)?b"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("ab")); + EXPECT_TRUE(re.Match("aabb")); + EXPECT_FALSE(re.Match("aab")); + EXPECT_FALSE(re.Match("abb")); +} + +TEST(Regex, ZeroOrMore1) { + Regex re("a*"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("")); + EXPECT_TRUE(re.Match("a")); + EXPECT_TRUE(re.Match("aa")); + EXPECT_FALSE(re.Match("b")); + EXPECT_FALSE(re.Match("ab")); +} + +TEST(Regex, ZeroOrMore2) { + Regex re("a*b"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("b")); + EXPECT_TRUE(re.Match("ab")); + EXPECT_TRUE(re.Match("aab")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("bb")); +} + +TEST(Regex, ZeroOrMore3) { + Regex re("a*b*"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("")); + EXPECT_TRUE(re.Match("a")); + EXPECT_TRUE(re.Match("aa")); + EXPECT_TRUE(re.Match("b")); + EXPECT_TRUE(re.Match("bb")); + EXPECT_TRUE(re.Match("ab")); + EXPECT_TRUE(re.Match("aabb")); + EXPECT_FALSE(re.Match("ba")); +} + +TEST(Regex, ZeroOrMore4) { + Regex re("a(ab)*b"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("ab")); + EXPECT_TRUE(re.Match("aabb")); + EXPECT_TRUE(re.Match("aababb")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("aa")); +} + +TEST(Regex, OneOrMore1) { + Regex re("a+"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("a")); + EXPECT_TRUE(re.Match("aa")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("b")); + EXPECT_FALSE(re.Match("ab")); +} + +TEST(Regex, OneOrMore2) { + Regex re("a+b"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("ab")); + EXPECT_TRUE(re.Match("aab")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("b")); +} + +TEST(Regex, OneOrMore3) { + Regex re("a+b+"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("ab")); + EXPECT_TRUE(re.Match("aab")); + EXPECT_TRUE(re.Match("abb")); + EXPECT_TRUE(re.Match("aabb")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("b")); + EXPECT_FALSE(re.Match("ba")); +} + +TEST(Regex, OneOrMore4) { + Regex re("a(ab)+b"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("aabb")); + EXPECT_TRUE(re.Match("aababb")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("ab")); +} From 994b0dfea2ea32f68e78add1f23466ee534ddf50 Mon Sep 17 00:00:00 2001 From: miloyip Date: Mon, 25 May 2015 19:42:23 +0800 Subject: [PATCH 11/26] Clean up regex --- include/rapidjson/internal/regex.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index c19adb1..7ee99b6 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -18,6 +18,10 @@ #include "../rapidjson.h" #include "stack.h" +#ifndef RAPIDJSON_REGEX_VERBOSE +#define RAPIDJSON_REGEX_VERBOSE 0 +#endif + RAPIDJSON_NAMESPACE_BEGIN namespace internal { @@ -62,16 +66,12 @@ public: next->Clear(); for (const SizeType* s = current->template Bottom(); s != current->template End(); ++s) { const State& sr = GetState(*s); - // if (sr.out != kRegexInvalidState) - // printf("%c matches %c\n", (char)sr.codepoint, (char)codepoint); - - if (sr.out != kRegexInvalidState && sr.codepoint == codepoint) + if (sr.codepoint == codepoint) AddState(stateSet, *next, sr.out); } Stack* temp = current; current = next; next = temp; - // printf("\n"); } Allocator::Free(stateSet); @@ -99,7 +99,7 @@ private: }; struct State { - SizeType out; //!< Equals to kInvalid for match + SizeType out; //!< Equals to kInvalid for matching state SizeType out1; //!< Equals to non-kInvalid for split unsigned codepoint; }; @@ -229,7 +229,7 @@ private: Frag* e = operandStack.template Pop(1); Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0)); root_ = e->start; -#if 0 +#if RAPIDJSON_REGEX_VERBOSE printf("root: %d\n", root_); for (SizeType i = 0; i < stateCount_ ; i++) { State& s = GetState(i); From 328b0d8afc88050ddcbe73677d632c4f39dfacfb Mon Sep 17 00:00:00 2001 From: miloyip Date: Mon, 25 May 2015 19:49:07 +0800 Subject: [PATCH 12/26] Minor refactor regex --- include/rapidjson/internal/regex.h | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index 7ee99b6..dace328 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -152,8 +152,7 @@ private: } void Patch(SizeType l, SizeType s) { - SizeType next; - for (; l != kRegexInvalidState; l = next) { + for (SizeType next; l != kRegexInvalidState; l = next) { next = GetState(l).out; GetState(l).out = s; } @@ -173,7 +172,7 @@ private: switch (codepoint) { case '|': while (!operatorStack.Empty() && *operatorStack.template Top() < kAlternation) - if (!Eval(operandStack, operatorStack)) + if (!Eval(operandStack, *operatorStack.template Pop(1))) return; *operatorStack.template Push() = kAlternation; *atomCountStack.template Top() = 0; @@ -186,7 +185,7 @@ private: case ')': while (!operatorStack.Empty() && *operatorStack.template Top() != kLeftParenthesis) - if (!Eval(operandStack, operatorStack)) + if (!Eval(operandStack, *operatorStack.template Pop(1))) return; if (operatorStack.Empty()) return; @@ -196,20 +195,17 @@ private: break; case '?': - *operatorStack.template Push() = kZeroOrOne; - if (!Eval(operandStack, operatorStack)) + if (!Eval(operandStack, kZeroOrOne)) return; break; case '*': - *operatorStack.template Push() = kZeroOrMore; - if (!Eval(operandStack, operatorStack)) + if (!Eval(operandStack, kZeroOrMore)) return; break; case '+': - *operatorStack.template Push() = kOneOrMore; - if (!Eval(operandStack, operatorStack)) + if (!Eval(operandStack, kOneOrMore)) return; break; @@ -221,7 +217,7 @@ private: } while (!operatorStack.Empty()) - if (!Eval(operandStack, operatorStack)) + if (!Eval(operandStack, *operatorStack.template Pop(1))) return; // Link the operand to matching state. @@ -229,6 +225,7 @@ private: Frag* e = operandStack.template Pop(1); Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0)); root_ = e->start; + #if RAPIDJSON_REGEX_VERBOSE printf("root: %d\n", root_); for (SizeType i = 0; i < stateCount_ ; i++) { @@ -240,9 +237,8 @@ private: } } - bool Eval(Stack& operandStack, Stack& operatorStack) { - // printf("Eval %c\n", "?*+.|("[*operatorStack.template Top()]); - switch (*operatorStack.template Pop(1)) { + bool Eval(Stack& operandStack, Operator op) { + switch (op) { case kConcatenation: if (operandStack.GetSize() >= sizeof(Frag) * 2) { Frag e2 = *operandStack.template Pop(1); From 0934803ae1e5b4aa52f837beff20183ca6bec6c0 Mon Sep 17 00:00:00 2001 From: miloyip Date: Mon, 25 May 2015 21:57:46 +0800 Subject: [PATCH 13/26] Add Unicode regex test --- test/unittest/regextest.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp index 1a1bffa..979e230 100644 --- a/test/unittest/regextest.cpp +++ b/test/unittest/regextest.cpp @@ -219,3 +219,14 @@ TEST(Regex, OneOrMore4) { EXPECT_FALSE(re.Match("")); EXPECT_FALSE(re.Match("ab")); } + +TEST(Regex, Unicode) { +#define EURO "\xE2\x82\xAC" // "\xE2\x82\xAC" is UTF-8 sequence of Euro sign U+20AC + Regex re("a" EURO "+b"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("a" EURO "b")); + EXPECT_TRUE(re.Match("a" EURO EURO "b")); + EXPECT_FALSE(re.Match("a?b")); + EXPECT_FALSE(re.Match("a" EURO "\xAC" "b")); // unaware of UTF-8 will match +#undef EURO +} From 3c9ceb32a5c805d3e6ac5b0dda0185777206dee8 Mon Sep 17 00:00:00 2001 From: miloyip Date: Mon, 25 May 2015 22:09:32 +0800 Subject: [PATCH 14/26] Add doxygen notes for regex --- include/rapidjson/internal/regex.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index dace328..184da81 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -30,6 +30,16 @@ namespace internal { static const SizeType kRegexInvalidState = ~SizeType(0); //!< Represents an invalid index in GenericRegex::State::out, out1 +//! Regular expression engine. +/*! + Supported regular expression syntax: + - \c ab Concatenation + - \c a|b Alternation + - \c a? Zero or one + - \c a* Zero or more + - \c a+ One or more + - \c (ab)* Parenthesis grouping +*/ template class GenericRegex { public: From 06853b89b07d02708d80721f169462fe29f07295 Mon Sep 17 00:00:00 2001 From: miloyip Date: Mon, 25 May 2015 22:51:03 +0800 Subject: [PATCH 15/26] Add any character (.) to regex --- include/rapidjson/internal/regex.h | 17 ++++++++++++++--- test/unittest/regextest.cpp | 16 ++++++++++++++-- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index 184da81..a91cadd 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -76,7 +76,7 @@ public: next->Clear(); for (const SizeType* s = current->template Bottom(); s != current->template End(); ++s) { const State& sr = GetState(*s); - if (sr.codepoint == codepoint) + if (sr.codepoint == kAnyCharacterClass || sr.codepoint == codepoint) AddState(stateSet, *next, sr.out); } Stack* temp = current; @@ -108,6 +108,8 @@ private: kLeftParenthesis }; + static const unsigned kAnyCharacterClass = 0xFFFFFFFF; //!< For '.' + struct State { SizeType out; //!< Equals to kInvalid for matching state SizeType out1; //!< Equals to non-kInvalid for split @@ -168,6 +170,11 @@ private: } } + void PushOperand(Stack& operandStack, unsigned codepoint) { + SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint); + *operandStack.template Push() = Frag(s, s); + } + template void Parse(InputStream& is) { Allocator allocator; @@ -219,9 +226,13 @@ private: return; break; + case '.': + PushOperand(operandStack, kAnyCharacterClass); + ImplicitConcatenation(atomCountStack, operatorStack); + break; + default: - SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint); - *operandStack.template Push() = Frag(s, s); + PushOperand(operandStack, codepoint); ImplicitConcatenation(atomCountStack, operatorStack); } } diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp index 979e230..52735cb 100644 --- a/test/unittest/regextest.cpp +++ b/test/unittest/regextest.cpp @@ -220,13 +220,25 @@ TEST(Regex, OneOrMore4) { EXPECT_FALSE(re.Match("ab")); } -TEST(Regex, Unicode) { #define EURO "\xE2\x82\xAC" // "\xE2\x82\xAC" is UTF-8 sequence of Euro sign U+20AC + +TEST(Regex, Unicode) { Regex re("a" EURO "+b"); ASSERT_TRUE(re.IsValid()); EXPECT_TRUE(re.Match("a" EURO "b")); EXPECT_TRUE(re.Match("a" EURO EURO "b")); EXPECT_FALSE(re.Match("a?b")); EXPECT_FALSE(re.Match("a" EURO "\xAC" "b")); // unaware of UTF-8 will match -#undef EURO } + +TEST(Regex, AnyCharacter) { + Regex re("."); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("a")); + EXPECT_TRUE(re.Match("b")); + EXPECT_TRUE(re.Match(EURO)); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("aa")); +} + +#undef EURO From 1784afe5f72f1d95ab98dc3a15a59f2295ecdd30 Mon Sep 17 00:00:00 2001 From: miloyip Date: Tue, 26 May 2015 00:40:23 +0800 Subject: [PATCH 16/26] Add character class to regex --- include/rapidjson/internal/regex.h | 193 +++++++++++++++++++++++------ test/unittest/regextest.cpp | 86 +++++++++++++ 2 files changed, 241 insertions(+), 38 deletions(-) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index a91cadd..4d31180 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -29,23 +29,30 @@ namespace internal { // GenericRegex static const SizeType kRegexInvalidState = ~SizeType(0); //!< Represents an invalid index in GenericRegex::State::out, out1 +static const SizeType kRegexInvalidRange = ~SizeType(0); -//! Regular expression engine. +//! Regular expression engine with subset of ECMAscript grammar. /*! Supported regular expression syntax: - - \c ab Concatenation - - \c a|b Alternation - - \c a? Zero or one - - \c a* Zero or more - - \c a+ One or more - - \c (ab)* Parenthesis grouping + - \c ab Concatenation + - \c a|b Alternation + - \c a? Zero or one + - \c a* Zero or more + - \c a+ One or more + - \c (ab)* Grouping + - \c . Any character + - \c [abc] Character classes + - \c [a-c] Character class range + - \c [a-z0-9_] Character class combination + - \c [^abc] Negated character classes + - \c [^a-c] Negated character class range */ template class GenericRegex { public: typedef typename Encoding::Ch Ch; - GenericRegex(const Ch* source, Allocator* allocator = 0) : states_(allocator, 256), root_(kRegexInvalidState), stateCount_() { + GenericRegex(const Ch* source, Allocator* allocator = 0) : states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(),rangeCount_() { StringStream is(source); Parse(is); } @@ -76,8 +83,12 @@ public: next->Clear(); for (const SizeType* s = current->template Bottom(); s != current->template End(); ++s) { const State& sr = GetState(*s); - if (sr.codepoint == kAnyCharacterClass || sr.codepoint == codepoint) + if (sr.codepoint == codepoint || + sr.codepoint == kAnyCharacterClass || + (sr.codepoint == kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint))) + { AddState(stateSet, *next, sr.out); + } } Stack* temp = current; current = next; @@ -109,10 +120,19 @@ private: }; static const unsigned kAnyCharacterClass = 0xFFFFFFFF; //!< For '.' + static const unsigned kRangeCharacterClass = 0xFFFFFFFE; + static const unsigned kRangeNegationFlag = 0x80000000; + + struct Range { + unsigned start; // + unsigned end; + SizeType next; + }; struct State { SizeType out; //!< Equals to kInvalid for matching state SizeType out1; //!< Equals to non-kInvalid for split + SizeType rangeStart; unsigned codepoint; }; @@ -132,6 +152,16 @@ private: return states_.template Bottom()[index]; } + Range& GetRange(SizeType index) { + RAPIDJSON_ASSERT(index < rangeCount_); + return ranges_.template Bottom()[index]; + } + + const Range& GetRange(SizeType index) const { + RAPIDJSON_ASSERT(index < rangeCount_); + return ranges_.template Bottom()[index]; + } + void AddState(unsigned* stateSet, Stack& l, SizeType index) const { if (index == kRegexInvalidState) return; @@ -147,34 +177,17 @@ private: } } - SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) { - State* s = states_.template Push(); - s->out = out; - s->out1 = out1; - s->codepoint = codepoint; - return stateCount_++; - } - - SizeType Append(SizeType l1, SizeType l2) { - SizeType old = l1; - while (GetState(l1).out != kRegexInvalidState) - l1 = GetState(l1).out; - GetState(l1).out = l2; - return old; - } - - void Patch(SizeType l, SizeType s) { - for (SizeType next; l != kRegexInvalidState; l = next) { - next = GetState(l).out; - GetState(l).out = s; + bool MatchRange(SizeType rangeIndex, unsigned codepoint) const { + bool yes = (GetRange(rangeIndex).start & kRangeNegationFlag) == 0; + while (rangeIndex != kRegexInvalidRange) { + const Range& r = GetRange(rangeIndex); + if (codepoint >= (r.start & ~kRangeNegationFlag) && codepoint <= r.end) + return yes; + rangeIndex = r.next; } + return !yes; } - void PushOperand(Stack& operandStack, unsigned codepoint) { - SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint); - *operandStack.template Push() = Frag(s, s); - } - template void Parse(InputStream& is) { Allocator allocator; @@ -231,6 +244,18 @@ private: ImplicitConcatenation(atomCountStack, operatorStack); break; + case '[': + { + SizeType range; + if (!ParseRange(is, &range)) + return; + SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, kRangeCharacterClass); + GetState(s).rangeStart = range; + *operandStack.template Push() = Frag(s, s); + } + ImplicitConcatenation(atomCountStack, operatorStack); + break; + default: PushOperand(operandStack, codepoint); ImplicitConcatenation(atomCountStack, operatorStack); @@ -258,6 +283,41 @@ private: } } + SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) { + State* s = states_.template Push(); + s->out = out; + s->out1 = out1; + s->codepoint = codepoint; + s->rangeStart = kRegexInvalidRange; + return stateCount_++; + } + + void PushOperand(Stack& operandStack, unsigned codepoint) { + SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint); + *operandStack.template Push() = Frag(s, s); + } + + void ImplicitConcatenation(Stack& atomCountStack, Stack& operatorStack) { + if (*atomCountStack.template Top()) + *operatorStack.template Push() = kConcatenation; + (*atomCountStack.template Top())++; + } + + SizeType Append(SizeType l1, SizeType l2) { + SizeType old = l1; + while (GetState(l1).out != kRegexInvalidState) + l1 = GetState(l1).out; + GetState(l1).out = l2; + return old; + } + + void Patch(SizeType l, SizeType s) { + for (SizeType next; l != kRegexInvalidState; l = next) { + next = GetState(l).out; + GetState(l).out = s; + } + } + bool Eval(Stack& operandStack, Operator op) { switch (op) { case kConcatenation: @@ -314,15 +374,72 @@ private: } } - void ImplicitConcatenation(Stack& atomCountStack, Stack& operatorStack) { - if (*atomCountStack.template Top()) - *operatorStack.template Push() = kConcatenation; - (*atomCountStack.template Top())++; + template + bool ParseRange(InputStream& is, SizeType* range) { + bool isBegin = true; + bool negate = false; + int step = 0; + SizeType start = kRegexInvalidRange; + SizeType current = kRegexInvalidRange; + unsigned codepoint; + while (Encoding::Decode(is, &codepoint) && codepoint != 0) { + if (isBegin && codepoint == '^') + negate = true; + else if (codepoint == ']') { + if (step == 2) { // Add trailing '-' + SizeType r = NewRange('-'); + RAPIDJSON_ASSERT(current != kRegexInvalidRange); + GetRange(current).next = r; + } + if (negate) + GetRange(start).start |= kRangeNegationFlag; + *range = start; + return true; + } + else { + switch (step) { + case 1: + if (codepoint == '-') { + step++; + break; + } + // fall through to step 0 for other characters + + case 0: + { + SizeType r = NewRange(codepoint); + if (current != kRegexInvalidRange) + GetRange(current).next = r; + if (start == kRegexInvalidRange) + start = r; + current = r; + } + step = 1; + break; + + default: + RAPIDJSON_ASSERT(step == 2); + GetRange(current).end = codepoint; + step = 0; + } + } + isBegin = false; + } + return false; + } + + SizeType NewRange(unsigned codepoint) { + Range* r = ranges_.template Push(); + r->start = r->end = codepoint; + r->next = kRegexInvalidRange; + return rangeCount_++; } Stack states_; + Stack ranges_; SizeType root_; SizeType stateCount_; + SizeType rangeCount_; }; typedef GenericRegex > Regex; diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp index 52735cb..8818117 100644 --- a/test/unittest/regextest.cpp +++ b/test/unittest/regextest.cpp @@ -241,4 +241,90 @@ TEST(Regex, AnyCharacter) { EXPECT_FALSE(re.Match("aa")); } +TEST(Regex, CharacterRange1) { + Regex re("[abc]"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("a")); + EXPECT_TRUE(re.Match("b")); + EXPECT_TRUE(re.Match("c")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("`")); + EXPECT_FALSE(re.Match("d")); + EXPECT_FALSE(re.Match("aa")); +} + +TEST(Regex, CharacterRange2) { + Regex re("[^abc]"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("`")); + EXPECT_TRUE(re.Match("d")); + EXPECT_FALSE(re.Match("a")); + EXPECT_FALSE(re.Match("b")); + EXPECT_FALSE(re.Match("c")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("aa")); +} + +TEST(Regex, CharacterRange3) { + Regex re("[a-c]"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("a")); + EXPECT_TRUE(re.Match("b")); + EXPECT_TRUE(re.Match("c")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("`")); + EXPECT_FALSE(re.Match("d")); + EXPECT_FALSE(re.Match("aa")); +} + +TEST(Regex, CharacterRange4) { + Regex re("[^a-c]"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("`")); + EXPECT_TRUE(re.Match("d")); + EXPECT_FALSE(re.Match("a")); + EXPECT_FALSE(re.Match("b")); + EXPECT_FALSE(re.Match("c")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("aa")); +} + +TEST(Regex, CharacterRange5) { + Regex re("[-]"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("-")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("a")); +} + +TEST(Regex, CharacterRange6) { + Regex re("[a-]"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("a")); + EXPECT_TRUE(re.Match("-")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("`")); + EXPECT_FALSE(re.Match("b")); +} + +TEST(Regex, CharacterRange7) { + Regex re("[-a]"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("a")); + EXPECT_TRUE(re.Match("-")); + EXPECT_FALSE(re.Match("")); + EXPECT_FALSE(re.Match("`")); + EXPECT_FALSE(re.Match("b")); +} + +TEST(Regex, CharacterRange8) { + Regex re("[a-zA-Z0-9]*"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("Milo")); + EXPECT_TRUE(re.Match("MT19937")); + EXPECT_TRUE(re.Match("43")); + EXPECT_FALSE(re.Match("a_b")); + EXPECT_FALSE(re.Match("!")); +} + #undef EURO From 92285bed44ee028f5c3392924f8dee9c9eb66b64 Mon Sep 17 00:00:00 2001 From: miloyip Date: Wed, 27 May 2015 09:37:55 +0800 Subject: [PATCH 17/26] Add escape characters and control characters --- include/rapidjson/internal/regex.h | 33 +++++++++++++++++++++++++++++- test/unittest/regextest.cpp | 8 ++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index 4d31180..0d85885 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -46,6 +46,12 @@ static const SizeType kRegexInvalidRange = ~SizeType(0); - \c [a-z0-9_] Character class combination - \c [^abc] Negated character classes - \c [^a-c] Negated character class range + - \c \\| \\\\ ... Escape characters + - \c \\f Form feed (U+000C) + - \c \\n Line feed (U+000A) + - \c \\r Carriage return (U+000D) + - \c \\t Tab (U+0009) + - \c \\v Vertical tab (U+000B) */ template class GenericRegex { @@ -256,7 +262,32 @@ private: ImplicitConcatenation(atomCountStack, operatorStack); break; - default: + case '\\': // Escape character + if (!Encoding::Decode(is, &codepoint) || codepoint == 0) + return; // Expect an escape character + switch (codepoint) { + case '|': + case '(': + case ')': + case '?': + case '*': + case '+': + case '.': + case '[': + case ']': + case '\\': + break; // use the codepoint as is + case 'f': codepoint = 0x000C; break; + case 'n': codepoint = 0x000A; break; + case 'r': codepoint = 0x000D; break; + case 't': codepoint = 0x0009; break; + case 'v': codepoint = 0x000B; break; + default: + return; // Unsupported escape character + } + // fall through to default + + default: // Pattern character PushOperand(operandStack, codepoint); ImplicitConcatenation(atomCountStack, operatorStack); } diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp index 8818117..23acb46 100644 --- a/test/unittest/regextest.cpp +++ b/test/unittest/regextest.cpp @@ -327,4 +327,12 @@ TEST(Regex, CharacterRange8) { EXPECT_FALSE(re.Match("!")); } +TEST(Regex, Escape) { + const char* s = "\\|\\(\\)\\?\\*\\+\\.\\[\\]\\\\\\f\\n\\r\\t\\v"; + Regex re(s); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("|()?*+.[]\\\x0C\n\r\t\x0B")); + EXPECT_FALSE(re.Match(s)); // Not escaping +} + #undef EURO From 0dffe875517e07a3cfc9a1b446ee93de8d9de094 Mon Sep 17 00:00:00 2001 From: miloyip Date: Wed, 27 May 2015 09:56:06 +0800 Subject: [PATCH 18/26] Add character class escapes --- include/rapidjson/internal/regex.h | 73 +++++++++++++++++++----------- test/unittest/regextest.cpp | 4 +- 2 files changed, 49 insertions(+), 28 deletions(-) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index 0d85885..15b6f8f 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -46,6 +46,7 @@ static const SizeType kRegexInvalidRange = ~SizeType(0); - \c [a-z0-9_] Character class combination - \c [^abc] Negated character classes - \c [^a-c] Negated character class range + - \c [\b] Backspace (U+0008) - \c \\| \\\\ ... Escape characters - \c \\f Form feed (U+000C) - \c \\n Line feed (U+000A) @@ -265,26 +266,8 @@ private: case '\\': // Escape character if (!Encoding::Decode(is, &codepoint) || codepoint == 0) return; // Expect an escape character - switch (codepoint) { - case '|': - case '(': - case ')': - case '?': - case '*': - case '+': - case '.': - case '[': - case ']': - case '\\': - break; // use the codepoint as is - case 'f': codepoint = 0x000C; break; - case 'n': codepoint = 0x000A; break; - case 'r': codepoint = 0x000D; break; - case 't': codepoint = 0x0009; break; - case 'v': codepoint = 0x000B; break; - default: - return; // Unsupported escape character - } + if (!CharacterEscape(codepoint, &codepoint)) + return; // Unsupported escape character // fall through to default default: // Pattern character @@ -414,9 +397,16 @@ private: SizeType current = kRegexInvalidRange; unsigned codepoint; while (Encoding::Decode(is, &codepoint) && codepoint != 0) { - if (isBegin && codepoint == '^') - negate = true; - else if (codepoint == ']') { + if (isBegin) { + isBegin = false; + if (codepoint == '^') { + negate = true; + continue; + } + } + + switch (codepoint) { + case ']': if (step == 2) { // Add trailing '-' SizeType r = NewRange('-'); RAPIDJSON_ASSERT(current != kRegexInvalidRange); @@ -426,8 +416,17 @@ private: GetRange(start).start |= kRangeNegationFlag; *range = start; return true; - } - else { + + case '\\': + if (!Encoding::Decode(is, &codepoint) || codepoint == 0) + return false; // Expect an escape character + if (codepoint == 'b') + codepoint = 0x0008; // Escape backspace character + else if (!CharacterEscape(codepoint, &codepoint)) + return false; + // fall through to default + + default: switch (step) { case 1: if (codepoint == '-') { @@ -454,7 +453,6 @@ private: step = 0; } } - isBegin = false; } return false; } @@ -466,6 +464,29 @@ private: return rangeCount_++; } + bool CharacterEscape(unsigned codepoint, unsigned* escapedCodepoint) { + switch (codepoint) { + case '|': + case '(': + case ')': + case '?': + case '*': + case '+': + case '.': + case '[': + case ']': + case '\\': + *escapedCodepoint = codepoint; return true; + case 'f': *escapedCodepoint = 0x000C; return true; + case 'n': *escapedCodepoint = 0x000A; return true; + case 'r': *escapedCodepoint = 0x000D; return true; + case 't': *escapedCodepoint = 0x0009; return true; + case 'v': *escapedCodepoint = 0x000B; return true; + default: + return false; // Unsupported escape character + } + } + Stack states_; Stack ranges_; SizeType root_; diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp index 23acb46..b5fd56e 100644 --- a/test/unittest/regextest.cpp +++ b/test/unittest/regextest.cpp @@ -328,10 +328,10 @@ TEST(Regex, CharacterRange8) { } TEST(Regex, Escape) { - const char* s = "\\|\\(\\)\\?\\*\\+\\.\\[\\]\\\\\\f\\n\\r\\t\\v"; + const char* s = "\\|\\(\\)\\?\\*\\+\\.\\[\\]\\\\\\f\\n\\r\\t\\v[\\b][\\[][\\]]"; Regex re(s); ASSERT_TRUE(re.IsValid()); - EXPECT_TRUE(re.Match("|()?*+.[]\\\x0C\n\r\t\x0B")); + EXPECT_TRUE(re.Match("|()?*+.[]\\\x0C\n\r\t\x0B\b[]")); EXPECT_FALSE(re.Match(s)); // Not escaping } From 51bb7631f49561e20207366c9d1edc2b04a4ea9d Mon Sep 17 00:00:00 2001 From: miloyip Date: Wed, 27 May 2015 14:25:00 +0800 Subject: [PATCH 19/26] Refactor regex with DecodedStream with one look-ahead character --- include/rapidjson/internal/regex.h | 56 ++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index 15b6f8f..7ec925f 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -60,8 +60,9 @@ public: typedef typename Encoding::Ch Ch; GenericRegex(const Ch* source, Allocator* allocator = 0) : states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(),rangeCount_() { - StringStream is(source); - Parse(is); + StringStream ss(source); + DecodedStream ds(ss); + Parse(ds); } ~GenericRegex() { @@ -74,6 +75,8 @@ public: template bool Match(InputStream& is) const { RAPIDJSON_ASSERT(IsValid()); + DecodedStream ds(is); + Allocator allocator; Stack state0(&allocator, stateCount_ * sizeof(SizeType)); Stack state1(&allocator, stateCount_ * sizeof(SizeType)); @@ -85,7 +88,7 @@ public: AddState(stateSet, *current, root_); unsigned codepoint; - while (!current->Empty() && Encoding::Decode(is, &codepoint) && codepoint != 0) { + while (!current->Empty() && (codepoint = ds.Take()) != 0) { std::memset(stateSet, 0, stateSetSize); next->Clear(); for (const SizeType* s = current->template Bottom(); s != current->template End(); ++s) { @@ -149,6 +152,23 @@ private: SizeType out; //!< link-list of all output states }; + template + class DecodedStream { + public: + DecodedStream(SourceStream& ss) : ss_(ss) { Decode(); } + unsigned Peek() { return codepoint_; } + unsigned Take() { unsigned c = codepoint_; Decode(); return c; } + + private: + void Decode() { + if (!Encoding::Decode(ss_, &codepoint_)) + codepoint_ = 0; + } + + SourceStream& ss_; + unsigned codepoint_; + }; + State& GetState(SizeType index) { RAPIDJSON_ASSERT(index < stateCount_); return states_.template Bottom()[index]; @@ -196,7 +216,7 @@ private: } template - void Parse(InputStream& is) { + void Parse(DecodedStream& ds) { Allocator allocator; Stack operandStack(&allocator, 256); // Frag Stack operatorStack(&allocator, 256); // Operator @@ -205,8 +225,8 @@ private: *atomCountStack.template Push() = 0; unsigned codepoint; - while (Encoding::Decode(is, &codepoint) && codepoint != 0) { - switch (codepoint) { + while (ds.Peek() != 0) { + switch (codepoint = ds.Take()) { case '|': while (!operatorStack.Empty() && *operatorStack.template Top() < kAlternation) if (!Eval(operandStack, *operatorStack.template Pop(1))) @@ -254,7 +274,7 @@ private: case '[': { SizeType range; - if (!ParseRange(is, &range)) + if (!ParseRange(ds, &range)) return; SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, kRangeCharacterClass); GetState(s).rangeStart = range; @@ -264,9 +284,7 @@ private: break; case '\\': // Escape character - if (!Encoding::Decode(is, &codepoint) || codepoint == 0) - return; // Expect an escape character - if (!CharacterEscape(codepoint, &codepoint)) + if (!CharacterEscape(ds, &codepoint)) return; // Unsupported escape character // fall through to default @@ -389,14 +407,14 @@ private: } template - bool ParseRange(InputStream& is, SizeType* range) { + bool ParseRange(DecodedStream& ds, SizeType* range) { bool isBegin = true; bool negate = false; int step = 0; SizeType start = kRegexInvalidRange; SizeType current = kRegexInvalidRange; unsigned codepoint; - while (Encoding::Decode(is, &codepoint) && codepoint != 0) { + while ((codepoint = ds.Take()) != 0) { if (isBegin) { isBegin = false; if (codepoint == '^') { @@ -418,11 +436,11 @@ private: return true; case '\\': - if (!Encoding::Decode(is, &codepoint) || codepoint == 0) - return false; // Expect an escape character - if (codepoint == 'b') + if (ds.Peek() == 'b') { + ds.Take(); codepoint = 0x0008; // Escape backspace character - else if (!CharacterEscape(codepoint, &codepoint)) + } + else if (!CharacterEscape(ds, &codepoint)) return false; // fall through to default @@ -464,8 +482,10 @@ private: return rangeCount_++; } - bool CharacterEscape(unsigned codepoint, unsigned* escapedCodepoint) { - switch (codepoint) { + template + bool CharacterEscape(DecodedStream& ds, unsigned* escapedCodepoint) { + unsigned codepoint; + switch (codepoint = ds.Take()) { case '|': case '(': case ')': From fa7dc1c439cbb316343e8991d41a9bedf585e5a6 Mon Sep 17 00:00:00 2001 From: miloyip Date: Wed, 27 May 2015 23:39:22 +0800 Subject: [PATCH 20/26] Add numbered quantifier --- include/rapidjson/internal/regex.h | 92 ++++++++++++++++++++++++ test/unittest/regextest.cpp | 109 ++++++++++++++++++++++++++++- 2 files changed, 199 insertions(+), 2 deletions(-) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index 7ec925f..26d1098 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -39,6 +39,9 @@ static const SizeType kRegexInvalidRange = ~SizeType(0); - \c a? Zero or one - \c a* Zero or more - \c a+ One or more + - \c a{3} Exactly 3 times + - \c a{3,} At least 3 times + - \c a{3,5} 3 to 5 times - \c (ab)* Grouping - \c . Any character - \c [abc] Character classes @@ -266,6 +269,28 @@ private: return; break; + case '{': + { + unsigned n, m; + if (!ParseUnsigned(ds, &n) || n == 0) + return; + + if (ds.Peek() == ',') { + ds.Take(); + if (ds.Peek() == '}') + m = 0; + else if (!ParseUnsigned(ds, &m) || m < n) + return; + } + else + m = n; + + if (!EvalQuantifier(operandStack, n, m) || ds.Peek() != '}') + return; + ds.Take(); + } + break; + case '.': PushOperand(operandStack, kAnyCharacterClass); ImplicitConcatenation(atomCountStack, operatorStack); @@ -406,6 +431,71 @@ private: } } + bool EvalQuantifier(Stack& operandStack, unsigned n, unsigned m) { + RAPIDJSON_ASSERT(n > 0); + RAPIDJSON_ASSERT(m == 0 || n <= m); // m == 0 means infinity + if (operandStack.GetSize() < sizeof(Frag)) + return false; + + for (unsigned i = 0; i < n - 1; i++) // a{3} -> a a a + CloneTopOperand(operandStack); + + if (m == 0) + Eval(operandStack, kOneOrMore); // a{3,} -> a a a+ + else if (m > n) { + CloneTopOperand(operandStack); // a{3,5} -> a a a a + Eval(operandStack, kZeroOrOne); // a{3,5} -> a a a a? + for (unsigned i = n; i < m - 1; i++) + CloneTopOperand(operandStack); // a{3,5} -> a a a a? a? + for (unsigned i = n; i < m; i++) + Eval(operandStack, kConcatenation); // a{3,5} -> a a aa?a? + } + + for (unsigned i = 0; i < n - 1; i++) + Eval(operandStack, kConcatenation); // a{3} -> aaa, a{3,} -> aaa+, a{3.5} -> aaaa?a? + + return true; + } + + static SizeType Min(SizeType a, SizeType b) { return a < b ? a : b; } + + SizeType GetMinStateIndex(SizeType index) { + State& s = GetState(index); + if (s.out != kRegexInvalidState && s.out < index) + index = Min(index, GetMinStateIndex(s.out)); + if (s.out1 != kRegexInvalidState && s.out1 < index) + index = Min(index, GetMinStateIndex(s.out1)); + return index; + } + + void CloneTopOperand(Stack& operandStack) { + const Frag *src = operandStack.template Top(); + SizeType minIndex = GetMinStateIndex(src->start); + SizeType count = stateCount_ - minIndex; // Assumes top operand contains states in [min, stateCount_) + State* s = states_.template Push(count); + memcpy(s, &GetState(minIndex), count * sizeof(State)); + for (SizeType j = 0; j < count; j++) { + if (s[j].out != kRegexInvalidState) + s[j].out += count; + if (s[j].out1 != kRegexInvalidState) + s[j].out1 += count; + } + *operandStack.template Push() = Frag(src->start + count, src->out + count); + stateCount_ += count; + } + + template + bool ParseUnsigned(DecodedStream& ds, unsigned* u) { + unsigned r = 0; + while (ds.Peek() >= '0' && ds.Peek() <= '9') { + if (r >= 429496729 && ds.Peek() > '5') // 2^32 - 1 = 4294967295 + return false; // overflow + r = r * 10 + (ds.Take() - '0'); + } + *u = r; + return true; + } + template bool ParseRange(DecodedStream& ds, SizeType* range) { bool isBegin = true; @@ -495,6 +585,8 @@ private: case '.': case '[': case ']': + case '{': + case '}': case '\\': *escapedCodepoint = codepoint; return true; case 'f': *escapedCodepoint = 0x000C; return true; diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp index b5fd56e..05acc99 100644 --- a/test/unittest/regextest.cpp +++ b/test/unittest/regextest.cpp @@ -220,6 +220,111 @@ TEST(Regex, OneOrMore4) { EXPECT_FALSE(re.Match("ab")); } +TEST(Regex, QuantifierExact1) { + Regex re("ab{3}c"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("abbbc")); + EXPECT_FALSE(re.Match("ac")); + EXPECT_FALSE(re.Match("abc")); + EXPECT_FALSE(re.Match("abbc")); + EXPECT_FALSE(re.Match("abbbbc")); +} + +TEST(Regex, QuantifierExact2) { + Regex re("a(bc){3}d"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("abcbcbcd")); + EXPECT_FALSE(re.Match("ad")); + EXPECT_FALSE(re.Match("abcd")); + EXPECT_FALSE(re.Match("abcbcd")); + EXPECT_FALSE(re.Match("abcbcbcbcd")); +} + +TEST(Regex, QuantifierExact3) { + Regex re("a(b|c){3}d"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("abbbd")); + EXPECT_TRUE(re.Match("acccd")); + EXPECT_TRUE(re.Match("abcbd")); + EXPECT_FALSE(re.Match("ad")); + EXPECT_FALSE(re.Match("abbd")); + EXPECT_FALSE(re.Match("accccd")); + EXPECT_FALSE(re.Match("abbbbd")); +} + +TEST(Regex, QuantifierMin1) { + Regex re("ab{3,}c"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("abbbc")); + EXPECT_TRUE(re.Match("abbbbc")); + EXPECT_TRUE(re.Match("abbbbbc")); + EXPECT_FALSE(re.Match("ac")); + EXPECT_FALSE(re.Match("abc")); + EXPECT_FALSE(re.Match("abbc")); +} + +TEST(Regex, QuantifierMin2) { + Regex re("a(bc){3,}d"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("abcbcbcd")); + EXPECT_TRUE(re.Match("abcbcbcbcd")); + EXPECT_FALSE(re.Match("ad")); + EXPECT_FALSE(re.Match("abcd")); + EXPECT_FALSE(re.Match("abcbcd")); +} + +TEST(Regex, QuantifierMin3) { + Regex re("a(b|c){3,}d"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("abbbd")); + EXPECT_TRUE(re.Match("acccd")); + EXPECT_TRUE(re.Match("abcbd")); + EXPECT_TRUE(re.Match("accccd")); + EXPECT_TRUE(re.Match("abbbbd")); + EXPECT_FALSE(re.Match("ad")); + EXPECT_FALSE(re.Match("abbd")); +} + +TEST(Regex, QuantifierMinMax1) { + Regex re("ab{3,5}c"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("abbbc")); + EXPECT_TRUE(re.Match("abbbbc")); + EXPECT_TRUE(re.Match("abbbbbc")); + EXPECT_FALSE(re.Match("ac")); + EXPECT_FALSE(re.Match("abc")); + EXPECT_FALSE(re.Match("abbc")); + EXPECT_FALSE(re.Match("abbbbbbc")); +} + +TEST(Regex, QuantifierMinMax2) { + Regex re("a(bc){3,5}d"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("abcbcbcd")); + EXPECT_TRUE(re.Match("abcbcbcbcd")); + EXPECT_TRUE(re.Match("abcbcbcbcbcd")); + EXPECT_FALSE(re.Match("ad")); + EXPECT_FALSE(re.Match("abcd")); + EXPECT_FALSE(re.Match("abcbcd")); + EXPECT_FALSE(re.Match("abcbcbcbcbcbcd")); +} + +TEST(Regex, QuantifierMinMax3) { + Regex re("a(b|c){3,5}d"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Match("abbbd")); + EXPECT_TRUE(re.Match("acccd")); + EXPECT_TRUE(re.Match("abcbd")); + EXPECT_TRUE(re.Match("accccd")); + EXPECT_TRUE(re.Match("abbbbd")); + EXPECT_TRUE(re.Match("acccccd")); + EXPECT_TRUE(re.Match("abbbbbd")); + EXPECT_FALSE(re.Match("ad")); + EXPECT_FALSE(re.Match("abbd")); + EXPECT_FALSE(re.Match("accccccd")); + EXPECT_FALSE(re.Match("abbbbbbd")); +} + #define EURO "\xE2\x82\xAC" // "\xE2\x82\xAC" is UTF-8 sequence of Euro sign U+20AC TEST(Regex, Unicode) { @@ -328,10 +433,10 @@ TEST(Regex, CharacterRange8) { } TEST(Regex, Escape) { - const char* s = "\\|\\(\\)\\?\\*\\+\\.\\[\\]\\\\\\f\\n\\r\\t\\v[\\b][\\[][\\]]"; + const char* s = "\\|\\(\\)\\?\\*\\+\\.\\[\\]\\{\\}\\\\\\f\\n\\r\\t\\v[\\b][\\[][\\]]"; Regex re(s); ASSERT_TRUE(re.IsValid()); - EXPECT_TRUE(re.Match("|()?*+.[]\\\x0C\n\r\t\x0B\b[]")); + EXPECT_TRUE(re.Match("|()?*+.[]{}\\\x0C\n\r\t\x0B\b[]")); EXPECT_FALSE(re.Match(s)); // Not escaping } From 56b205264c42ae0fe30d4c93758a9ae9ba970563 Mon Sep 17 00:00:00 2001 From: miloyip Date: Thu, 28 May 2015 00:05:05 +0800 Subject: [PATCH 21/26] Refactor to store minIndex in Frag of regex --- include/rapidjson/internal/regex.h | 34 ++++++++++++------------------ 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index 26d1098..bc47f95 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -150,9 +150,10 @@ private: }; struct Frag { - Frag(SizeType s, SizeType o) : start(s), out(o) {} + Frag(SizeType s, SizeType o, SizeType m) : start(s), out(o), minIndex(m) {} SizeType start; SizeType out; //!< link-list of all output states + SizeType minIndex; }; template @@ -303,7 +304,7 @@ private: return; SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, kRangeCharacterClass); GetState(s).rangeStart = range; - *operandStack.template Push() = Frag(s, s); + *operandStack.template Push() = Frag(s, s, s); } ImplicitConcatenation(atomCountStack, operatorStack); break; @@ -351,7 +352,7 @@ private: void PushOperand(Stack& operandStack, unsigned codepoint) { SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint); - *operandStack.template Push() = Frag(s, s); + *operandStack.template Push() = Frag(s, s, s); } void ImplicitConcatenation(Stack& atomCountStack, Stack& operatorStack) { @@ -382,7 +383,7 @@ private: Frag e2 = *operandStack.template Pop(1); Frag e1 = *operandStack.template Pop(1); Patch(e1.out, e2.start); - *operandStack.template Push() = Frag(e1.start, e2.out); + *operandStack.template Push() = Frag(e1.start, e2.out, Min(e1.minIndex, e2.minIndex)); return true; } return false; @@ -392,7 +393,7 @@ private: Frag e2 = *operandStack.template Pop(1); Frag e1 = *operandStack.template Pop(1); SizeType s = NewState(e1.start, e2.start, 0); - *operandStack.template Push() = Frag(s, Append(e1.out, e2.out)); + *operandStack.template Push() = Frag(s, Append(e1.out, e2.out), Min(e1.minIndex, e2.minIndex)); return true; } return false; @@ -401,7 +402,7 @@ private: if (operandStack.GetSize() >= sizeof(Frag)) { Frag e = *operandStack.template Pop(1); SizeType s = NewState(kRegexInvalidState, e.start, 0); - *operandStack.template Push() = Frag(s, Append(e.out, s)); + *operandStack.template Push() = Frag(s, Append(e.out, s), e.minIndex); return true; } return false; @@ -411,7 +412,7 @@ private: Frag e = *operandStack.template Pop(1); SizeType s = NewState(kRegexInvalidState, e.start, 0); Patch(e.out, s); - *operandStack.template Push() = Frag(s, s); + *operandStack.template Push() = Frag(s, s, e.minIndex); return true; } return false; @@ -421,7 +422,7 @@ private: Frag e = *operandStack.template Pop(1); SizeType s = NewState(kRegexInvalidState, e.start, 0); Patch(e.out, s); - *operandStack.template Push() = Frag(e.start, s); + *operandStack.template Push() = Frag(e.start, s, e.minIndex); return true; } return false; @@ -459,28 +460,19 @@ private: static SizeType Min(SizeType a, SizeType b) { return a < b ? a : b; } - SizeType GetMinStateIndex(SizeType index) { - State& s = GetState(index); - if (s.out != kRegexInvalidState && s.out < index) - index = Min(index, GetMinStateIndex(s.out)); - if (s.out1 != kRegexInvalidState && s.out1 < index) - index = Min(index, GetMinStateIndex(s.out1)); - return index; - } - void CloneTopOperand(Stack& operandStack) { const Frag *src = operandStack.template Top(); - SizeType minIndex = GetMinStateIndex(src->start); - SizeType count = stateCount_ - minIndex; // Assumes top operand contains states in [min, stateCount_) + SizeType minIndex = minIndex; + SizeType count = stateCount_ - src->minIndex; // Assumes top operand contains states in [src->minIndex, stateCount_) State* s = states_.template Push(count); - memcpy(s, &GetState(minIndex), count * sizeof(State)); + memcpy(s, &GetState(src->minIndex), count * sizeof(State)); for (SizeType j = 0; j < count; j++) { if (s[j].out != kRegexInvalidState) s[j].out += count; if (s[j].out1 != kRegexInvalidState) s[j].out1 += count; } - *operandStack.template Push() = Frag(src->start + count, src->out + count); + *operandStack.template Push() = Frag(src->start + count, src->out + count, src->minIndex + count); stateCount_ += count; } From 960bc0eabd1c7ffc919fdd8862e820903bec2745 Mon Sep 17 00:00:00 2001 From: miloyip Date: Thu, 28 May 2015 10:10:38 +0800 Subject: [PATCH 22/26] Fix gcc warning --- include/rapidjson/internal/regex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index bc47f95..1aff9e2 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -159,7 +159,7 @@ private: template class DecodedStream { public: - DecodedStream(SourceStream& ss) : ss_(ss) { Decode(); } + DecodedStream(SourceStream& ss) : ss_(ss), codepoint_() { Decode(); } unsigned Peek() { return codepoint_; } unsigned Take() { unsigned c = codepoint_; Decode(); return c; } From a5ac3b5dbc6c9d9406c89cdea911887be66ce0e0 Mon Sep 17 00:00:00 2001 From: miloyip Date: Thu, 28 May 2015 10:44:52 +0800 Subject: [PATCH 23/26] Remove an unused line of code --- include/rapidjson/internal/regex.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index 1aff9e2..4127f9c 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -462,7 +462,6 @@ private: void CloneTopOperand(Stack& operandStack) { const Frag *src = operandStack.template Top(); - SizeType minIndex = minIndex; SizeType count = stateCount_ - src->minIndex; // Assumes top operand contains states in [src->minIndex, stateCount_) State* s = states_.template Push(count); memcpy(s, &GetState(src->minIndex), count * sizeof(State)); From 3eb19ceaf9131e5936070096a6a185bd4b887a8b Mon Sep 17 00:00:00 2001 From: Milo Yip Date: Fri, 29 May 2015 15:23:28 +0800 Subject: [PATCH 24/26] Add Search(), ^ and $ assertions to regex --- include/rapidjson/internal/regex.h | 165 +++++++++++++++++------------ test/unittest/regextest.cpp | 58 +++++++++- 2 files changed, 155 insertions(+), 68 deletions(-) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index 4127f9c..5d483bf 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -62,7 +62,7 @@ class GenericRegex { public: typedef typename Encoding::Ch Ch; - GenericRegex(const Ch* source, Allocator* allocator = 0) : states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(),rangeCount_() { + GenericRegex(const Ch* source, Allocator* allocator = 0) : states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(), anchorBegin_(), anchorEnd_() { StringStream ss(source); DecodedStream ds(ss); Parse(ds); @@ -77,51 +77,24 @@ public: template bool Match(InputStream& is) const { - RAPIDJSON_ASSERT(IsValid()); - DecodedStream ds(is); - - Allocator allocator; - Stack state0(&allocator, stateCount_ * sizeof(SizeType)); - Stack state1(&allocator, stateCount_ * sizeof(SizeType)); - Stack *current = &state0, *next = &state1; - - const size_t stateSetSize = (stateCount_ + 31) / 32 * 4; - unsigned* stateSet = static_cast(allocator.Malloc(stateSetSize)); - std::memset(stateSet, 0, stateSetSize); - AddState(stateSet, *current, root_); - - unsigned codepoint; - while (!current->Empty() && (codepoint = ds.Take()) != 0) { - std::memset(stateSet, 0, stateSetSize); - next->Clear(); - for (const SizeType* s = current->template Bottom(); s != current->template End(); ++s) { - const State& sr = GetState(*s); - if (sr.codepoint == codepoint || - sr.codepoint == kAnyCharacterClass || - (sr.codepoint == kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint))) - { - AddState(stateSet, *next, sr.out); - } - } - Stack* temp = current; - current = next; - next = temp; - } - - Allocator::Free(stateSet); - - for (const SizeType* s = current->template Bottom(); s != current->template End(); ++s) - if (GetState(*s).out == kRegexInvalidState) - return true; - - return false; + return SearchWithAnchoring(is, true, true); } - bool Match(const Ch* s) { + bool Match(const Ch* s) const { StringStream is(s); return Match(is); } + template + bool Search(InputStream& is) const { + return SearchWithAnchoring(is, anchorBegin_, anchorEnd_); + } + + bool Search(const Ch* s) const { + StringStream is(s); + return Search(is); + } + private: enum Operator { kZeroOrOne, @@ -193,32 +166,6 @@ private: return ranges_.template Bottom()[index]; } - void AddState(unsigned* stateSet, Stack& l, SizeType index) const { - if (index == kRegexInvalidState) - return; - - const State& s = GetState(index); - if (s.out1 != kRegexInvalidState) { // Split - AddState(stateSet, l, s.out); - AddState(stateSet, l, s.out1); - } - else if (!(stateSet[index >> 5] & (1 << (index & 31)))) { - stateSet[index >> 5] |= (1 << (index & 31)); - *l.template Push() = index; - } - } - - bool MatchRange(SizeType rangeIndex, unsigned codepoint) const { - bool yes = (GetRange(rangeIndex).start & kRangeNegationFlag) == 0; - while (rangeIndex != kRegexInvalidRange) { - const Range& r = GetRange(rangeIndex); - if (codepoint >= (r.start & ~kRangeNegationFlag) && codepoint <= r.end) - return yes; - rangeIndex = r.next; - } - return !yes; - } - template void Parse(DecodedStream& ds) { Allocator allocator; @@ -231,6 +178,14 @@ private: unsigned codepoint; while (ds.Peek() != 0) { switch (codepoint = ds.Take()) { + case '^': + anchorBegin_ = true; + break; + + case '$': + anchorEnd_ = true; + break; + case '|': while (!operatorStack.Empty() && *operatorStack.template Top() < kAlternation) if (!Eval(operandStack, *operatorStack.template Pop(1))) @@ -567,6 +522,8 @@ private: bool CharacterEscape(DecodedStream& ds, unsigned* escapedCodepoint) { unsigned codepoint; switch (codepoint = ds.Take()) { + case '^': + case '$': case '|': case '(': case ')': @@ -590,11 +547,87 @@ private: } } + template + bool SearchWithAnchoring(InputStream& is, bool anchorBegin, bool anchorEnd) const { + RAPIDJSON_ASSERT(IsValid()); + DecodedStream ds(is); + + Allocator allocator; + Stack state0(&allocator, stateCount_ * sizeof(SizeType)); + Stack state1(&allocator, stateCount_ * sizeof(SizeType)); + Stack *current = &state0, *next = &state1; + + const size_t stateSetSize = (stateCount_ + 31) / 32 * 4; + unsigned* stateSet = static_cast(allocator.Malloc(stateSetSize)); + std::memset(stateSet, 0, stateSetSize); + + bool matched = false; + matched = AddState(stateSet, *current, root_); + + unsigned codepoint; + while (!current->Empty() && (codepoint = ds.Take()) != 0) { + std::memset(stateSet, 0, stateSetSize); + next->Clear(); + matched = false; + for (const SizeType* s = current->template Bottom(); s != current->template End(); ++s) { + const State& sr = GetState(*s); + if (sr.codepoint == codepoint || + sr.codepoint == kAnyCharacterClass || + (sr.codepoint == kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint))) + { + matched = AddState(stateSet, *next, sr.out) || matched; + if (!anchorEnd && matched) + goto exit; + } + if (!anchorBegin) + AddState(stateSet, *next, root_); + } + Stack* temp = current; + current = next; + next = temp; + } + + exit: + Allocator::Free(stateSet); + return matched; + } + + // Return whether the added states is a match state + bool AddState(unsigned* stateSet, Stack& l, SizeType index) const { + if (index == kRegexInvalidState) + return true; + + const State& s = GetState(index); + if (s.out1 != kRegexInvalidState) { // Split + bool matched = AddState(stateSet, l, s.out); + matched = AddState(stateSet, l, s.out1) || matched; + return matched; + } + else if (!(stateSet[index >> 5] & (1 << (index & 31)))) { + stateSet[index >> 5] |= (1 << (index & 31)); + *l.template Push() = index; + return GetState(index).out == kRegexInvalidState; + } + } + + bool MatchRange(SizeType rangeIndex, unsigned codepoint) const { + bool yes = (GetRange(rangeIndex).start & kRangeNegationFlag) == 0; + while (rangeIndex != kRegexInvalidRange) { + const Range& r = GetRange(rangeIndex); + if (codepoint >= (r.start & ~kRangeNegationFlag) && codepoint <= r.end) + return yes; + rangeIndex = r.next; + } + return !yes; + } + Stack states_; Stack ranges_; SizeType root_; SizeType stateCount_; SizeType rangeCount_; + bool anchorBegin_; + bool anchorEnd_; }; typedef GenericRegex > Regex; diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp index 05acc99..37a88ff 100644 --- a/test/unittest/regextest.cpp +++ b/test/unittest/regextest.cpp @@ -432,11 +432,65 @@ TEST(Regex, CharacterRange8) { EXPECT_FALSE(re.Match("!")); } +TEST(Regex, Search) { + Regex re("abc"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Search("abc")); + EXPECT_TRUE(re.Search("_abc")); + EXPECT_TRUE(re.Search("abc_")); + EXPECT_TRUE(re.Search("_abc_")); + EXPECT_TRUE(re.Search("__abc__")); + EXPECT_TRUE(re.Search("abcabc")); + EXPECT_FALSE(re.Search("a")); + EXPECT_FALSE(re.Search("ab")); + EXPECT_FALSE(re.Search("bc")); + EXPECT_FALSE(re.Search("cba")); +} + +TEST(Regex, Search_BeginAnchor) { + Regex re("^abc"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Search("abc")); + EXPECT_TRUE(re.Search("abc_")); + EXPECT_TRUE(re.Search("abcabc")); + EXPECT_FALSE(re.Search("_abc")); + EXPECT_FALSE(re.Search("_abc_")); + EXPECT_FALSE(re.Search("a")); + EXPECT_FALSE(re.Search("ab")); + EXPECT_FALSE(re.Search("bc")); + EXPECT_FALSE(re.Search("cba")); +} + +TEST(Regex, Search_EndAnchor) { + Regex re("abc$"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Search("abc")); + EXPECT_TRUE(re.Search("_abc")); + EXPECT_TRUE(re.Search("abcabc")); + EXPECT_FALSE(re.Search("abc_")); + EXPECT_FALSE(re.Search("_abc_")); + EXPECT_FALSE(re.Search("a")); + EXPECT_FALSE(re.Search("ab")); + EXPECT_FALSE(re.Search("bc")); + EXPECT_FALSE(re.Search("cba")); +} + +TEST(Regex, Search_BothAnchor) { + Regex re("^abc$"); + ASSERT_TRUE(re.IsValid()); + EXPECT_TRUE(re.Search("abc")); + EXPECT_FALSE(re.Search("")); + EXPECT_FALSE(re.Search("a")); + EXPECT_FALSE(re.Search("b")); + EXPECT_FALSE(re.Search("ab")); + EXPECT_FALSE(re.Search("abcd")); +} + TEST(Regex, Escape) { - const char* s = "\\|\\(\\)\\?\\*\\+\\.\\[\\]\\{\\}\\\\\\f\\n\\r\\t\\v[\\b][\\[][\\]]"; + const char* s = "\\^\\$\\|\\(\\)\\?\\*\\+\\.\\[\\]\\{\\}\\\\\\f\\n\\r\\t\\v[\\b][\\[][\\]]"; Regex re(s); ASSERT_TRUE(re.IsValid()); - EXPECT_TRUE(re.Match("|()?*+.[]{}\\\x0C\n\r\t\x0B\b[]")); + EXPECT_TRUE(re.Match("^$|()?*+.[]{}\\\x0C\n\r\t\x0B\b[]")); EXPECT_FALSE(re.Match(s)); // Not escaping } From c0e7c8304b61d539302d28ad428d5e4da41ec7c7 Mon Sep 17 00:00:00 2001 From: Milo Yip Date: Fri, 29 May 2015 16:02:14 +0800 Subject: [PATCH 25/26] Fix a bug and add document in regex --- include/rapidjson/internal/regex.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index 5d483bf..056535e 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -42,7 +42,9 @@ static const SizeType kRegexInvalidRange = ~SizeType(0); - \c a{3} Exactly 3 times - \c a{3,} At least 3 times - \c a{3,5} 3 to 5 times - - \c (ab)* Grouping + - \c (ab) Grouping + - \c ^a At the beginning + - \c a$ At the end - \c . Any character - \c [abc] Character classes - \c [a-c] Character class range @@ -606,8 +608,8 @@ private: else if (!(stateSet[index >> 5] & (1 << (index & 31)))) { stateSet[index >> 5] |= (1 << (index & 31)); *l.template Push() = index; - return GetState(index).out == kRegexInvalidState; } + return GetState(index).out == kRegexInvalidState; } bool MatchRange(SizeType rangeIndex, unsigned codepoint) const { From a8feeb4c3ef7198ab8883a754f2780657a4b2267 Mon Sep 17 00:00:00 2001 From: Milo Yip Date: Fri, 29 May 2015 17:42:08 +0800 Subject: [PATCH 26/26] Add invalid regex tests and fix a bug --- include/rapidjson/internal/regex.h | 2 ++ test/unittest/regextest.cpp | 36 ++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h index 056535e..fcf2600 100644 --- a/include/rapidjson/internal/regex.h +++ b/include/rapidjson/internal/regex.h @@ -463,6 +463,8 @@ private: switch (codepoint) { case ']': + if (start == kRegexInvalidRange) + return false; // Error: nothing inside [] if (step == 2) { // Add trailing '-' SizeType r = NewRange('-'); RAPIDJSON_ASSERT(current != kRegexInvalidRange); diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp index 37a88ff..65105fa 100644 --- a/test/unittest/regextest.cpp +++ b/test/unittest/regextest.cpp @@ -494,4 +494,40 @@ TEST(Regex, Escape) { EXPECT_FALSE(re.Match(s)); // Not escaping } +TEST(Regex, Invalid) { +#define TEST_INVALID(s) \ + {\ + Regex re(s);\ + EXPECT_FALSE(re.IsValid());\ + } + + TEST_INVALID("a|"); + TEST_INVALID("()"); + TEST_INVALID(")"); + TEST_INVALID("(a))"); + TEST_INVALID("(a|)"); + TEST_INVALID("(a||b)"); + TEST_INVALID("(|b)"); + TEST_INVALID("?"); + TEST_INVALID("*"); + TEST_INVALID("+"); + TEST_INVALID("{"); + TEST_INVALID("{}"); + TEST_INVALID("a{a}"); + TEST_INVALID("a{0}"); + TEST_INVALID("a{-1}"); + TEST_INVALID("a{}"); + TEST_INVALID("a{0,}"); + TEST_INVALID("a{,0}"); + TEST_INVALID("a{1,0}"); + TEST_INVALID("a{-1,0}"); + TEST_INVALID("a{-1,1}"); + TEST_INVALID("[]"); + TEST_INVALID("[^]"); + TEST_INVALID("[\\a]"); + TEST_INVALID("\\a"); + +#undef TEST_INVALID +} + #undef EURO