From 0bef29a5f649637fd9a51bb758aaa62c57c12920 Mon Sep 17 00:00:00 2001
From: miloyip
Date: Sun, 24 May 2015 21:23:39 +0800
Subject: [PATCH] Initial reggae implementation with only concatenation and alternation

---
 include/rapidjson/internal/regex.h | 276 +++++++++++++++++++++++++++++
 include/rapidjson/internal/stack.h |  15 ++
 test/unittest/CMakeLists.txt       |   1 +
 test/unittest/regextest.cpp        |  50 +++++++
 4 files changed, 342 insertions(+)
 create mode 100644 include/rapidjson/internal/regex.h
 create mode 100644 test/unittest/regextest.cpp

diff --git a/include/rapidjson/internal/regex.h b/include/rapidjson/internal/regex.h
new file mode 100644
index 0000000..b61aaaa
--- /dev/null
+++ b/include/rapidjson/internal/regex.h
@@ -0,0 +1,276 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_INTERNAL_REGEX_H_
+#define RAPIDJSON_INTERNAL_REGEX_H_
+
+#include "../rapidjson.h"
+#include "stack.h"
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericRegex
+
+static const SizeType kRegexInvalidState = ~SizeType(0);  //!< Represents an invalid index in GenericRegex::State::out, out1
+
+//! Minimal regular expression matcher: literal code points, concatenation and alternation ('|').
+/*!
+    The pattern is compiled into a table of states when the object is constructed.
+    Match() then reads the input once while tracking the set of active states.
+    A pattern that is empty, has a '|' without an operand on both sides, or cannot
+    be decoded by Encoding::Decode() leaves the object invalid; check IsValid() first.
+
+    \tparam Encoding Encoding of the pattern and of the input to be matched.
+    \tparam Allocator Allocator for the state table and for temporary memory.
+*/
+template <typename Encoding, typename Allocator = CrtAllocator>
+class GenericRegex {
+public:
+    typedef typename Encoding::Ch Ch;
+
+    //! Compiles a pattern.
+    /*!
+        \param source Null-terminated pattern.
+        \param allocator Allocator handed to the state table; defaults to null.
+    */
+    GenericRegex(const Ch* source, Allocator* allocator = 0) : states_(allocator, 256), root_(kRegexInvalidState), stateCount_() {
+        GenericStringStream<Encoding> is(source);
+        Parse(is);
+    }
+
+    ~GenericRegex() {
+    }
+
+    //! Tells whether the pattern was compiled successfully.
+    bool IsValid() const {
+        return root_ != kRegexInvalidState;
+    }
+
+    //! Tests whether the whole input matches the pattern.
+    /*!
+        The input is read up to its terminating zero code point and has to be
+        consumed entirely; a matching prefix is not enough.
+
+        \tparam InputStream Stream type accepted by Encoding::Decode().
+        \param is Stream to read the input from.
+        \return true on a match; false otherwise, including for input that fails to decode.
+        \pre IsValid()
+    */
+    template <typename InputStream>
+    bool Match(InputStream& is) const {
+        RAPIDJSON_ASSERT(IsValid());
+        Allocator allocator;
+        Stack<Allocator> state0(&allocator, stateCount_ * sizeof(SizeType));
+        Stack<Allocator> state1(&allocator, stateCount_ * sizeof(SizeType));
+        Stack<Allocator> *current = &state0, *next = &state1;
+
+        // One bit per state; keeps duplicates out of the set that is being filled.
+        const size_t stateSetSize = (stateCount_ + 31) / 32 * 4;
+        unsigned* stateSet = static_cast<unsigned*>(allocator.Malloc(stateSetSize));
+        std::memset(stateSet, 0, stateSetSize);
+
+        AddState(stateSet, *current, root_);
+
+        unsigned codepoint = 0;
+        bool decoded = true;
+        while (!current->Empty() && (decoded = Encoding::Decode(is, &codepoint)) && codepoint != 0) {
+            // The bits still describe the current set; reset them before filling the next one.
+            std::memset(stateSet, 0, stateSetSize);
+            next->Clear();
+            for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) {
+                const State& sr = GetState(*s);
+                if (sr.out != kRegexInvalidState && sr.codepoint == codepoint)
+                    AddState(stateSet, *next, sr.out);
+            }
+            Stack<Allocator>* temp = current;
+            current = next;
+            next = temp;
+        }
+
+        Allocator::Free(stateSet);
+
+        // Input that fails to decode never matches, even if the part before it did.
+        if (!decoded)
+            return false;
+
+        // Accept only if the matching state, the one without a successor, is active.
+        for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s)
+            if (GetState(*s).out == kRegexInvalidState)
+                return true;
+
+        return false;
+    }
+
+    //! Tests whether a null-terminated string matches the pattern as a whole.
+    bool Match(const Ch* s) const {
+        GenericStringStream<Encoding> is(s);
+        return Match(is);
+    }
+
+private:
+    //! Entry of the state table.
+    struct State {
+        SizeType out;       //!< Next state; kRegexInvalidState in the matching state
+        SizeType out1;      //!< Second branch of a split state; kRegexInvalidState otherwise
+        unsigned codepoint; //!< Code point consumed on the way to out; not read for split and matching states
+    };
+
+    //! Sub-expression under construction.
+    struct Frag {
+        Frag(SizeType s, SizeType o) : start(s), out(o) {}
+        SizeType start;     //!< Entry state of the fragment
+        SizeType out;       //!< Head of the list, chained through State::out, of states still lacking a successor
+    };
+
+    State& GetState(SizeType index) {
+        RAPIDJSON_ASSERT(index < stateCount_);
+        return states_.template Bottom<State>()[index];
+    }
+
+    const State& GetState(SizeType index) const {
+        RAPIDJSON_ASSERT(index < stateCount_);
+        return states_.template Bottom<State>()[index];
+    }
+
+    //! Adds a state to a state set; a split state is replaced by its two branches.
+    /*!
+        \param stateSet Bitmap of the states already present in \c l.
+        \param l Stack of state indices that receives the state.
+        \param index State to add; kRegexInvalidState is ignored.
+    */
+    void AddState(unsigned* stateSet, Stack<Allocator>& l, SizeType index) const {
+        if (index == kRegexInvalidState)
+            return;
+
+        const State& s = GetState(index);
+        if (s.out1 != kRegexInvalidState) { // Split
+            AddState(stateSet, l, s.out);
+            AddState(stateSet, l, s.out1);
+        }
+        else {
+            // Unsigned literal: shifting a signed 1 into bit 31 is not portable.
+            const unsigned mask = 1u << (index & 31);
+            if (!(stateSet[index >> 5] & mask)) {
+                stateSet[index >> 5] |= mask;
+                *l.template Push<SizeType>() = index;
+            }
+        }
+    }
+
+    //! Appends a state to the table and returns its index.
+    SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) {
+        State* s = states_.template Push<State>();
+        s->out = out;
+        s->out1 = out1;
+        s->codepoint = codepoint;
+        return stateCount_++;
+    }
+
+    //! Joins two lists of pending states and returns the head of the joined list.
+    SizeType Append(SizeType l1, SizeType l2) {
+        SizeType old = l1;
+        while (GetState(l1).out != kRegexInvalidState)
+            l1 = GetState(l1).out;
+        GetState(l1).out = l2;
+        return old;
+    }
+
+    //! Makes every state of the pending list \c l continue to state \c s.
+    void Patch(SizeType l, SizeType s) {
+        SizeType next;
+        for (; l != kRegexInvalidState; l = next) {
+            next = GetState(l).out;
+            GetState(l).out = s;
+        }
+    }
+
+    //! Compiles the pattern read from \c is; sets root_ only on success.
+    /*!
+        Alternation is applied after the whole pattern has been read, so
+        concatenation binds tighter than '|'.
+    */
+    template <typename InputStream>
+    void Parse(InputStream& is) {
+        Allocator allocator;
+        Stack<Allocator> operandStack(&allocator, 256);     // Frag
+        Stack<Allocator> operatorStack(&allocator, 256);    // char
+
+        unsigned codepoint = 0;
+        bool decoded;
+        bool previousOperand = false;
+        while ((decoded = Encoding::Decode(is, &codepoint)) && codepoint != 0) {
+            switch (codepoint) {
+            case '|':
+                *operatorStack.template Push<char>() = '|';
+                previousOperand = false;
+                break;
+
+            default:
+                {
+                    SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint);
+                    // concatenation with previous operand
+                    if (previousOperand) {
+                        Frag* e = operandStack.template Top<Frag>();
+                        Patch(e->out, s);
+                        e->out = s;
+                    }
+                    else
+                        *operandStack.template Push<Frag>() = Frag(s, s);
+                    previousOperand = true;
+                }
+                break;
+            }
+        }
+
+        // A pattern that fails to decode is rejected instead of being cut short.
+        if (!decoded)
+            return;
+
+        while (!operatorStack.Empty()) {
+            switch (*operatorStack.template Pop<char>(1)) {
+            case '|':
+                {
+                    // Each '|' needs two operands; "a|", "|a" and "a||b" lack one.
+                    if (operandStack.GetSize() < 2 * sizeof(Frag))
+                        return;
+                    Frag e2 = *operandStack.template Pop<Frag>(1);
+                    Frag e1 = *operandStack.template Pop<Frag>(1);
+                    SizeType s = NewState(e1.start, e2.start, 0);
+                    *operandStack.template Push<Frag>() = Frag(s, Append(e1.out, e2.out));
+                }
+                break;
+            }
+        }
+
+        // Link the operand to matching state.
+        if (operandStack.GetSize() == sizeof(Frag)) {
+            Frag* e = operandStack.template Pop<Frag>(1);
+            Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0));
+            root_ = e->start;
+        }
+    }
+
+    Stack<Allocator> states_;   //!< State table, indexed by state number
+    SizeType root_;             //!< Entry state; kRegexInvalidState while the pattern is not compiled
+    SizeType stateCount_;       //!< Number of states stored in states_
+};
+
+typedef GenericRegex<UTF8<> > Regex;
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_INTERNAL_REGEX_H_
diff --git a/include/rapidjson/internal/stack.h b/include/rapidjson/internal/stack.h
index bb31cc0..f911588 100644
--- a/include/rapidjson/internal/stack.h
+++ b/include/rapidjson/internal/stack.h
@@ -121,9 +121,24 @@ public:
         return reinterpret_cast<T*>(stackTop_ - sizeof(T));
     }
 
+    template<typename T>
+    const T* Top() const {
+        RAPIDJSON_ASSERT(GetSize() >= sizeof(T));
+        return reinterpret_cast<T*>(stackTop_ - sizeof(T));
+    }
+
+    template<typename T>
+    T* End() { return reinterpret_cast<T*>(stackTop_); }
+
+    template<typename T>
+    const T* End() const { return reinterpret_cast<T*>(stackTop_); }
+
     template<typename T>
     T* Bottom() { return (T*)stack_; }
 
+    template<typename T>
+    const T* Bottom() const { return (T*)stack_; }
+
     Allocator& GetAllocator() { return *allocator_; }
     bool Empty() const { return stackTop_ == stack_; }
     size_t GetSize() const { return static_cast<size_t>(stackTop_ - stack_); }
diff --git a/test/unittest/CMakeLists.txt 
b/test/unittest/CMakeLists.txt index fb95b8e..d1734b4 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -11,6 +11,7 @@ set(UNITTEST_SOURCES pointertest.cpp prettywritertest.cpp readertest.cpp + regextest.cpp simdtest.cpp stringbuffertest.cpp strtodtest.cpp diff --git a/test/unittest/regextest.cpp b/test/unittest/regextest.cpp new file mode 100644 index 0000000..7c67c0f --- /dev/null +++ b/test/unittest/regextest.cpp @@ -0,0 +1,50 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+
+#include "unittest.h"
+#include "rapidjson/internal/regex.h"
+
+using namespace rapidjson::internal;
+
+// A literal sequence matches the identical string only: no prefix, no extension.
+TEST(Regex, concatenation) {
+    Regex abc("abc");
+    EXPECT_TRUE(abc.Match("abc"));
+    const char* const rejected[] = { "", "a", "b", "ab", "abcd" };
+    for (size_t i = 0; i < sizeof(rejected) / sizeof(rejected[0]); ++i)
+        EXPECT_FALSE(abc.Match(rejected[i])) << rejected[i];
+}
+
+// '|' chooses between whole sequences; the chosen one has to match in full.
+TEST(Regex, split) {
+    // Two alternatives sharing a prefix.
+    {
+        Regex twoWords("abab|abbb");
+        EXPECT_TRUE(twoWords.Match("abab"));
+        EXPECT_TRUE(twoWords.Match("abbb"));
+        const char* const rejected[] = { "", "ab", "ababa", "abb", "abbbb" };
+        for (size_t i = 0; i < sizeof(rejected) / sizeof(rejected[0]); ++i)
+            EXPECT_FALSE(twoWords.Match(rejected[i])) << rejected[i];
+    }
+    // Chained alternation of single code points.
+    {
+        Regex oneLetter("a|b|c");
+        const char* const accepted[] = { "a", "b", "c" };
+        for (size_t i = 0; i < sizeof(accepted) / sizeof(accepted[0]); ++i)
+            EXPECT_TRUE(oneLetter.Match(accepted[i])) << accepted[i];
+        const char* const rejected[] = { "", "aa", "ab" };
+        for (size_t i = 0; i < sizeof(rejected) / sizeof(rejected[0]); ++i)
+            EXPECT_FALSE(oneLetter.Match(rejected[i])) << rejected[i];
+    }
+}