From 2291258bb8adb87e5da30ed2b12fa9929d0e76f8 Mon Sep 17 00:00:00 2001 From: Alejandro Martinez Date: Tue, 11 Apr 2017 02:02:15 +0000 Subject: [PATCH] Added ARM-Neon support for SIMD.SkipWhitespace* Change-Id: Iaf210d029758723a7eeb7f28fc10cab7467889a9 Signed-off-by: Jun He --- doc/faq.md | 2 +- doc/faq.zh-cn.md | 2 +- doc/internals.md | 7 +- doc/internals.zh-cn.md | 7 +- include/rapidjson/rapidjson.h | 18 ++- include/rapidjson/reader.h | 264 +++++++++++++++++++++++++++++++- include/rapidjson/writer.h | 72 ++++++++- test/perftest/perftest.h | 3 + test/perftest/rapidjsontest.cpp | 2 + test/unittest/simdtest.cpp | 4 + 10 files changed, 365 insertions(+), 16 deletions(-) diff --git a/doc/faq.md b/doc/faq.md index 1b0541c..4946cfe 100644 --- a/doc/faq.md +++ b/doc/faq.md @@ -256,7 +256,7 @@ Alternatively, if we don't want to explicitly refer to the root value of `addres 3. What is SIMD? How it is applied in RapidJSON? - [SIMD](http://en.wikipedia.org/wiki/SIMD) instructions can perform parallel computation in modern CPUs. RapidJSON support Intel's SSE2/SSE4.2 to accelerate whitespace skipping. This improves performance of parsing indent formatted JSON. Define `RAPIDJSON_SSE2` or `RAPIDJSON_SSE42` macro to enable this feature. However, running the executable on a machine without such instruction set support will make it crash. + [SIMD](http://en.wikipedia.org/wiki/SIMD) instructions can perform parallel computation in modern CPUs. RapidJSON support Intel's SSE2/SSE4.2 and ARM's Neon to accelerate whitespace/tabspace/carriage-return/line-feed skipping. This improves performance of parsing indent formatted JSON. Define `RAPIDJSON_SSE2`, `RAPIDJSON_SSE42` or `RAPIDJSON_NEON` macro to enable this feature. However, running the executable on a machine without such instruction set support will make it crash. 4. Does it consume a lot of memory? diff --git a/doc/faq.zh-cn.md b/doc/faq.zh-cn.md index f12d830..307b02f 100644 --- a/doc/faq.zh-cn.md +++ b/doc/faq.zh-cn.md @@ -257,7 +257,7 @@ 3. 什是是 SIMD?它如何用于 RapidJSON? - [SIMD](http://en.wikipedia.org/wiki/SIMD) 指令可以在现代 CPU 中执行并行运算。RapidJSON 支持了 Intel 的 SSE2/SSE4.2 去加速跳过空白字符。在解析含缩进的 JSON 时,这能提升性能。只要定义名为 `RAPIDJSON_SSE2` 或 `RAPIDJSON_SSE42` 的宏,就能启动这个功能。然而,若在不支持这些指令集的机器上执行这些可执行文件,会导致崩溃。 + [SIMD](http://en.wikipedia.org/wiki/SIMD) 指令可以在现代 CPU 中执行并行运算。RapidJSON 支持使用 Intel 的 SSE2/SSE4.2 和 ARM 的 Neon 来加速对空白符、制表符、回车符和换行符的过滤处理。在解析含缩进的 JSON 时,这能提升性能。只要定义名为 `RAPIDJSON_SSE2` ,`RAPIDJSON_SSE42` 或 `RAPIDJSON_NEON` 的宏,就能启动这个功能。然而,若在不支持这些指令集的机器上执行这些可执行文件,会导致崩溃。 4. 它会消耗许多内存么? diff --git a/doc/internals.md b/doc/internals.md index 49802a0..2fff2d9 100644 --- a/doc/internals.md +++ b/doc/internals.md @@ -183,17 +183,20 @@ void SkipWhitespace(InputStream& s) { However, this requires 4 comparisons and a few branching for each character. This was found to be a hot spot. -To accelerate this process, SIMD was applied to compare 16 characters with 4 white spaces for each iteration. Currently RapidJSON only supports SSE2 and SSE4.2 instructions for this. And it is only activated for UTF-8 memory streams, including string stream or *in situ* parsing. +To accelerate this process, SIMD was applied to compare 16 characters with 4 white spaces for each iteration. Currently RapidJSON supports SSE2, SSE4.2 and ARM Neon instructions for this. And it is only activated for UTF-8 memory streams, including string stream or *in situ* parsing. -To enable this optimization, need to define `RAPIDJSON_SSE2` or `RAPIDJSON_SSE42` before including `rapidjson.h`. Some compilers can detect the setting, as in `perftest.h`: +To enable this optimization, need to define `RAPIDJSON_SSE2`, `RAPIDJSON_SSE42` or `RAPIDJSON_NEON` before including `rapidjson.h`. Some compilers can detect the setting, as in `perftest.h`: ~~~cpp // __SSE2__ and __SSE4_2__ are recognized by gcc, clang, and the Intel compiler. // We use -march=native with gmake to enable -msse2 and -msse4.2, if supported. +// Likewise, __ARM_NEON is used to detect Neon. #if defined(__SSE4_2__) # define RAPIDJSON_SSE42 #elif defined(__SSE2__) # define RAPIDJSON_SSE2 +#elif defined(__ARM_NEON) +# define RAPIDJSON_NEON #endif ~~~ diff --git a/doc/internals.zh-cn.md b/doc/internals.zh-cn.md index ec57959..0c8bc06 100644 --- a/doc/internals.zh-cn.md +++ b/doc/internals.zh-cn.md @@ -183,17 +183,20 @@ void SkipWhitespace(InputStream& s) { 但是,这需要对每个字符进行4次比较以及一些分支。这被发现是一个热点。 -为了加速这一处理,RapidJSON 使用 SIMD 来在一次迭代中比较16个字符和4个空格。目前 RapidJSON 只支持 SSE2 和 SSE4.2 指令。同时它也只会对 UTF-8 内存流启用,包括字符串流或 *原位* 解析。 +为了加速这一处理,RapidJSON 使用 SIMD 来在一次迭代中比较16个字符和4个空格。目前 RapidJSON 支持 SSE2 , SSE4.2 和 ARM Neon 指令。同时它也只会对 UTF-8 内存流启用,包括字符串流或 *原位* 解析。 -你可以通过在包含 `rapidjson.h` 之前定义 `RAPIDJSON_SSE2` 或 `RAPIDJSON_SSE42` 来启用这个优化。一些编译器可以检测这个设置,如 `perftest.h`: +你可以通过在包含 `rapidjson.h` 之前定义 `RAPIDJSON_SSE2` , `RAPIDJSON_SSE42` 或 `RAPIDJSON_NEON` 来启用这个优化。一些编译器可以检测这个设置,如 `perftest.h`: ~~~cpp // __SSE2__ 和 __SSE4_2__ 可被 gcc、clang 和 Intel 编译器识别: // 如果支持的话,我们在 gmake 中使用了 -march=native 来启用 -msse2 和 -msse4.2 +// 同样的, __ARM_NEON 被用于识别Neon #if defined(__SSE4_2__) # define RAPIDJSON_SSE42 #elif defined(__SSE2__) # define RAPIDJSON_SSE2 +#elif defined(__ARM_NEON) +# define RAPIDJSON_NEON #endif ~~~ diff --git a/include/rapidjson/rapidjson.h b/include/rapidjson/rapidjson.h index f41bb20..57ab851 100644 --- a/include/rapidjson/rapidjson.h +++ b/include/rapidjson/rapidjson.h @@ -325,17 +325,17 @@ #endif /////////////////////////////////////////////////////////////////////////////// -// RAPIDJSON_SSE2/RAPIDJSON_SSE42/RAPIDJSON_SIMD +// RAPIDJSON_SSE2/RAPIDJSON_SSE42/RAPIDJSON_NEON/RAPIDJSON_SIMD /*! \def RAPIDJSON_SIMD \ingroup RAPIDJSON_CONFIG - \brief Enable SSE2/SSE4.2 optimization. + \brief Enable SSE2/SSE4.2/Neon optimization. RapidJSON supports optimized implementations for some parsing operations - based on the SSE2 or SSE4.2 SIMD extensions on modern Intel-compatible - processors. + based on the SSE2, SSE4.2 or NEon SIMD extensions on modern Intel + or ARM compatible processors. - To enable these optimizations, two different symbols can be defined; + To enable these optimizations, three different symbols can be defined; \code // Enable SSE2 optimization. #define RAPIDJSON_SSE2 @@ -344,13 +344,17 @@ #define RAPIDJSON_SSE42 \endcode - \c RAPIDJSON_SSE42 takes precedence, if both are defined. + // Enable ARM Neon optimization. + #define RAPIDJSON_NEON + \endcode + + \c RAPIDJSON_SSE42 takes precedence over SSE2, if both are defined. If any of these symbols is defined, RapidJSON defines the macro \c RAPIDJSON_SIMD to indicate the availability of the optimized code. */ #if defined(RAPIDJSON_SSE2) || defined(RAPIDJSON_SSE42) \ - || defined(RAPIDJSON_DOXYGEN_RUNNING) + || defined(RAPIDJSON_NEON) || defined(RAPIDJSON_DOXYGEN_RUNNING) #define RAPIDJSON_SIMD #endif diff --git a/include/rapidjson/reader.h b/include/rapidjson/reader.h index ccc025e..120c311 100644 --- a/include/rapidjson/reader.h +++ b/include/rapidjson/reader.h @@ -33,6 +33,8 @@ #include #elif defined(RAPIDJSON_SSE2) #include +#elif defined(RAPIDJSON_NEON) +#include #endif #ifdef _MSC_VER @@ -411,7 +413,92 @@ inline const char *SkipWhitespace_SIMD(const char* p, const char* end) { return SkipWhitespace(p, end); } -#endif // RAPIDJSON_SSE2 +#elif defined(RAPIDJSON_NEON) + +//! Skip whitespace with ARM Neon instructions, testing 16 8-byte characters at once. +inline const char *SkipWhitespace_SIMD(const char* p) { + // Fast return for single non-whitespace + if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') + ++p; + else + return p; + + // 16-byte align to the next boundary + const char* nextAligned = reinterpret_cast((reinterpret_cast(p) + 15) & static_cast(~15)); + while (p != nextAligned) + if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') + ++p; + else + return p; + + const uint8x16_t w0 = vmovq_n_u8(' '); + const uint8x16_t w1 = vmovq_n_u8('\n'); + const uint8x16_t w2 = vmovq_n_u8('\r'); + const uint8x16_t w3 = vmovq_n_u8('\t'); + + for (;; p += 16) { + const uint8x16_t s = vld1q_u8(reinterpret_cast(p)); + uint8x16_t x = vceqq_u8(s, w0); + x = vorrq_u8(x, vceqq_u8(s, w1)); + x = vorrq_u8(x, vceqq_u8(s, w2)); + x = vorrq_u8(x, vceqq_u8(s, w3)); + + x = vmvnq_u8(x); // Negate + x = vrev64q_u8(x); // Rev in 64 + uint64_t low = vgetq_lane_u64(reinterpret_cast(x), 0); // extract + uint64_t high = vgetq_lane_u64(reinterpret_cast(x), 1); // extract + + if (low == 0) { + if (high != 0) { + int lz =__builtin_clzll(high);; + return p + 8 + (lz >> 3); + } + } else { + int lz = __builtin_clzll(low);; + return p + (lz >> 3); + } + } +} + +inline const char *SkipWhitespace_SIMD(const char* p, const char* end) { + // Fast return for single non-whitespace + if (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')) + ++p; + else + return p; + + const uint8x16_t w0 = vmovq_n_u8(' '); + const uint8x16_t w1 = vmovq_n_u8('\n'); + const uint8x16_t w2 = vmovq_n_u8('\r'); + const uint8x16_t w3 = vmovq_n_u8('\t'); + + for (; p <= end - 16; p += 16) { + const uint8x16_t s = vld1q_u8(reinterpret_cast(p)); + uint8x16_t x = vceqq_u8(s, w0); + x = vorrq_u8(x, vceqq_u8(s, w1)); + x = vorrq_u8(x, vceqq_u8(s, w2)); + x = vorrq_u8(x, vceqq_u8(s, w3)); + + x = vmvnq_u8(x); // Negate + x = vrev64q_u8(x); // Rev in 64 + uint64_t low = vgetq_lane_u64(reinterpret_cast(x), 0); // extract + uint64_t high = vgetq_lane_u64(reinterpret_cast(x), 1); // extract + + if (low == 0) { + if (high != 0) { + int lz = __builtin_clzll(high); + return p + 8 + (lz >> 3); + } + } else { + int lz = __builtin_clzll(low); + return p + (lz >> 3); + } + } + + return SkipWhitespace(p, end); +} + +#endif // RAPIDJSON_NEON #ifdef RAPIDJSON_SIMD //! Template function specialization for InsituStringStream @@ -1129,7 +1216,180 @@ private: is.src_ = is.dst_ = p; } -#endif +#elif defined(RAPIDJSON_NEON) + // StringStream -> StackStream + static RAPIDJSON_FORCEINLINE void ScanCopyUnescapedString(StringStream& is, StackStream& os) { + const char* p = is.src_; + + // Scan one by one until alignment (unaligned load may cross page boundary and cause crash) + const char* nextAligned = reinterpret_cast((reinterpret_cast(p) + 15) & static_cast(~15)); + while (p != nextAligned) + if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast(*p) < 0x20)) { + is.src_ = p; + return; + } + else + os.Put(*p++); + + // The rest of string using SIMD + const uint8x16_t s0 = vmovq_n_u8('"'); + const uint8x16_t s1 = vmovq_n_u8('\\'); + const uint8x16_t s2 = vmovq_n_u8('\b'); + const uint8x16_t s3 = vmovq_n_u8(32); + + for (;; p += 16) { + const uint8x16_t s = vld1q_u8(reinterpret_cast(p)); + uint8x16_t x = vceqq_u8(s, s0); + x = vorrq_u8(x, vceqq_u8(s, s1)); + x = vorrq_u8(x, vceqq_u8(s, s2)); + x = vorrq_u8(x, vcltq_u8(s, s3)); + + x = vrev64q_u8(x); // Rev in 64 + uint64_t low = vgetq_lane_u64(reinterpret_cast(x), 0); // extract + uint64_t high = vgetq_lane_u64(reinterpret_cast(x), 1); // extract + + SizeType length = 0; + bool escaped = false; + if (low == 0) { + if (high != 0) { + unsigned lz = (unsigned)__builtin_clzll(high);; + length = 8 + (lz >> 3); + escaped = true; + } + } else { + unsigned lz = (unsigned)__builtin_clzll(low);; + length = lz >> 3; + escaped = true; + } + if (RAPIDJSON_UNLIKELY(escaped)) { // some of characters is escaped + if (length != 0) { + char* q = reinterpret_cast(os.Push(length)); + for (size_t i = 0; i < length; i++) + q[i] = p[i]; + + p += length; + } + break; + } + vst1q_u8(reinterpret_cast(os.Push(16)), s); + } + + is.src_ = p; + } + + // InsituStringStream -> InsituStringStream + static RAPIDJSON_FORCEINLINE void ScanCopyUnescapedString(InsituStringStream& is, InsituStringStream& os) { + RAPIDJSON_ASSERT(&is == &os); + (void)os; + + if (is.src_ == is.dst_) { + SkipUnescapedString(is); + return; + } + + char* p = is.src_; + char *q = is.dst_; + + // Scan one by one until alignment (unaligned load may cross page boundary and cause crash) + const char* nextAligned = reinterpret_cast((reinterpret_cast(p) + 15) & static_cast(~15)); + while (p != nextAligned) + if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast(*p) < 0x20)) { + is.src_ = p; + is.dst_ = q; + return; + } + else + *q++ = *p++; + + // The rest of string using SIMD + const uint8x16_t s0 = vmovq_n_u8('"'); + const uint8x16_t s1 = vmovq_n_u8('\\'); + const uint8x16_t s2 = vmovq_n_u8('\b'); + const uint8x16_t s3 = vmovq_n_u8(32); + + for (;; p += 16, q += 16) { + const uint8x16_t s = vld1q_u8(reinterpret_cast(p)); + uint8x16_t x = vceqq_u8(s, s0); + x = vorrq_u8(x, vceqq_u8(s, s1)); + x = vorrq_u8(x, vceqq_u8(s, s2)); + x = vorrq_u8(x, vcltq_u8(s, s3)); + + x = vrev64q_u8(x); // Rev in 64 + uint64_t low = vgetq_lane_u64(reinterpret_cast(x), 0); // extract + uint64_t high = vgetq_lane_u64(reinterpret_cast(x), 1); // extract + + SizeType length = 0; + bool escaped = false; + if (low == 0) { + if (high != 0) { + unsigned lz = (unsigned)__builtin_clzll(high); + length = 8 + (lz >> 3); + escaped = true; + } + } else { + unsigned lz = (unsigned)__builtin_clzll(low); + length = lz >> 3; + escaped = true; + } + if (RAPIDJSON_UNLIKELY(escaped)) { // some of characters is escaped + for (const char* pend = p + length; p != pend; ) { + *q++ = *p++; + } + break; + } + vst1q_u8(reinterpret_cast(q), s); + } + + is.src_ = p; + is.dst_ = q; + } + + // When read/write pointers are the same for insitu stream, just skip unescaped characters + static RAPIDJSON_FORCEINLINE void SkipUnescapedString(InsituStringStream& is) { + RAPIDJSON_ASSERT(is.src_ == is.dst_); + char* p = is.src_; + + // Scan one by one until alignment (unaligned load may cross page boundary and cause crash) + const char* nextAligned = reinterpret_cast((reinterpret_cast(p) + 15) & static_cast(~15)); + for (; p != nextAligned; p++) + if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast(*p) < 0x20)) { + is.src_ = is.dst_ = p; + return; + } + + // The rest of string using SIMD + const uint8x16_t s0 = vmovq_n_u8('"'); + const uint8x16_t s1 = vmovq_n_u8('\\'); + const uint8x16_t s2 = vmovq_n_u8('\b'); + const uint8x16_t s3 = vmovq_n_u8(32); + + for (;; p += 16) { + const uint8x16_t s = vld1q_u8(reinterpret_cast(p)); + uint8x16_t x = vceqq_u8(s, s0); + x = vorrq_u8(x, vceqq_u8(s, s1)); + x = vorrq_u8(x, vceqq_u8(s, s2)); + x = vorrq_u8(x, vcltq_u8(s, s3)); + + x = vrev64q_u8(x); // Rev in 64 + uint64_t low = vgetq_lane_u64(reinterpret_cast(x), 0); // extract + uint64_t high = vgetq_lane_u64(reinterpret_cast(x), 1); // extract + + if (low == 0) { + if (high != 0) { + int lz = __builtin_clzll(high); + p += 8 + (lz >> 3); + break; + } + } else { + int lz = __builtin_clzll(low); + p += lz >> 3; + break; + } + } + + is.src_ = is.dst_ = p; + } +#endif // RAPIDJSON_NEON template class NumberStream; diff --git a/include/rapidjson/writer.h b/include/rapidjson/writer.h index 219da5e..61cd070 100644 --- a/include/rapidjson/writer.h +++ b/include/rapidjson/writer.h @@ -32,6 +32,8 @@ #include #elif defined(RAPIDJSON_SSE2) #include +#elif defined(RAPIDJSON_NEON) +#include #endif #ifdef _MSC_VER @@ -619,7 +621,75 @@ inline bool Writer::ScanWriteUnescapedString(StringStream& is, siz is.src_ = p; return RAPIDJSON_LIKELY(is.Tell() < length); } -#endif // defined(RAPIDJSON_SSE2) || defined(RAPIDJSON_SSE42) +#elif defined(RAPIDJSON_NEON) +template<> +inline bool Writer::ScanWriteUnescapedString(StringStream& is, size_t length) { + if (length < 16) + return RAPIDJSON_LIKELY(is.Tell() < length); + + if (!RAPIDJSON_LIKELY(is.Tell() < length)) + return false; + + const char* p = is.src_; + const char* end = is.head_ + length; + const char* nextAligned = reinterpret_cast((reinterpret_cast(p) + 15) & static_cast(~15)); + const char* endAligned = reinterpret_cast(reinterpret_cast(end) & static_cast(~15)); + if (nextAligned > end) + return true; + + while (p != nextAligned) + if (*p < 0x20 || *p == '\"' || *p == '\\') { + is.src_ = p; + return RAPIDJSON_LIKELY(is.Tell() < length); + } + else + os_->PutUnsafe(*p++); + + // The rest of string using SIMD + const uint8x16_t s0 = vmovq_n_u8('"'); + const uint8x16_t s1 = vmovq_n_u8('\\'); + const uint8x16_t s2 = vmovq_n_u8('\b'); + const uint8x16_t s3 = vmovq_n_u8(32); + + for (; p != endAligned; p += 16) { + const uint8x16_t s = vld1q_u8(reinterpret_cast(p)); + uint8x16_t x = vceqq_u8(s, s0); + x = vorrq_u8(x, vceqq_u8(s, s1)); + x = vorrq_u8(x, vceqq_u8(s, s2)); + x = vorrq_u8(x, vcltq_u8(s, s3)); + + x = vrev64q_u8(x); // Rev in 64 + uint64_t low = vgetq_lane_u64(reinterpret_cast(x), 0); // extract + uint64_t high = vgetq_lane_u64(reinterpret_cast(x), 1); // extract + + SizeType len = 0; + bool escaped = false; + if (low == 0) { + if (high != 0) { + unsigned lz = (unsigned)__builtin_clzll(high); + len = 8 + (lz >> 3); + escaped = true; + } + } else { + unsigned lz = (unsigned)__builtin_clzll(low); + len = lz >> 3; + escaped = true; + } + if (RAPIDJSON_UNLIKELY(escaped)) { // some of characters is escaped + char* q = reinterpret_cast(os_->PushUnsafe(len)); + for (size_t i = 0; i < len; i++) + q[i] = p[i]; + + p += len; + break; + } + vst1q_u8(reinterpret_cast(os_->PushUnsafe(16)), s); + } + + is.src_ = p; + return RAPIDJSON_LIKELY(is.Tell() < length); +} +#endif // RAPIDJSON_NEON RAPIDJSON_NAMESPACE_END diff --git a/test/perftest/perftest.h b/test/perftest/perftest.h index b098e41..953f95d 100644 --- a/test/perftest/perftest.h +++ b/test/perftest/perftest.h @@ -24,10 +24,13 @@ // __SSE2__ and __SSE4_2__ are recognized by gcc, clang, and the Intel compiler. // We use -march=native with gmake to enable -msse2 and -msse4.2, if supported. +// Likewise, __ARM_NEON is used to detect Neon. #if defined(__SSE4_2__) # define RAPIDJSON_SSE42 #elif defined(__SSE2__) # define RAPIDJSON_SSE2 +#elif defined(__ARM_NEON) +# define RAPIDJSON_NEON #endif #define RAPIDJSON_HAS_STDSTRING 1 diff --git a/test/perftest/rapidjsontest.cpp b/test/perftest/rapidjsontest.cpp index f14e702..a11a557 100644 --- a/test/perftest/rapidjsontest.cpp +++ b/test/perftest/rapidjsontest.cpp @@ -28,6 +28,8 @@ #define SIMD_SUFFIX(name) name##_SSE2 #elif defined(RAPIDJSON_SSE42) #define SIMD_SUFFIX(name) name##_SSE42 +#elif defined(RAPIDJSON_NEON) +#define SIMD_SUFFIX(name) name##_NEON #else #define SIMD_SUFFIX(name) name #endif diff --git a/test/unittest/simdtest.cpp b/test/unittest/simdtest.cpp index b01b559..7b58cd0 100644 --- a/test/unittest/simdtest.cpp +++ b/test/unittest/simdtest.cpp @@ -21,6 +21,8 @@ # define RAPIDJSON_SSE42 #elif defined(__SSE2__) # define RAPIDJSON_SSE2 +#elif defined(__ARM_NEON) +# define RAPIDJSON_NEON #endif #define RAPIDJSON_NAMESPACE rapidjson_simd @@ -41,6 +43,8 @@ using namespace rapidjson_simd; #define SIMD_SUFFIX(name) name##_SSE2 #elif defined(RAPIDJSON_SSE42) #define SIMD_SUFFIX(name) name##_SSE42 +#elif defined(RAPIDJSON_NEON) +#define SIMD_SUFFIX(name) name##_NEON #else #define SIMD_SUFFIX(name) name #endif