diff --git a/include/rapidjson/reader.h b/include/rapidjson/reader.h index cd67040..701255f 100644 --- a/include/rapidjson/reader.h +++ b/include/rapidjson/reader.h @@ -242,57 +242,34 @@ void SkipWhitespace(InputStream& is) { #ifdef RAPIDJSON_SSE42 //! Skip whitespace with SSE 4.2 pcmpistrm instruction, testing 16 8-byte characters at once. inline const char *SkipWhitespace_SIMD(const char* p) { - static const char whitespace[16] = " \n\r\t"; - static const char whitespaces[4][17] = { - " ", - "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", - "\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r", - "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"}; + // Fast return for single non-whitespace + if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') + ++p; + else + return p; - // 16-byte align to the lower boundary - const char* ap = reinterpret_cast(reinterpret_cast(p) & ~15); + // 16-byte align to the next boundary + const char* nextAligned = reinterpret_cast((reinterpret_cast(p) + 15) & ~15); + while (p != nextAligned) + if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') + ++p; + else + return p; - // Test first unaligned characters - // Cannot make use of _mm_cmpistrm() because it stops when encounters '\0' before p - if (ap != p) { - const __m128i w0 = _mm_loadu_si128((const __m128i *)&whitespaces[0][0]); - const __m128i w1 = _mm_loadu_si128((const __m128i *)&whitespaces[1][0]); - const __m128i w2 = _mm_loadu_si128((const __m128i *)&whitespaces[2][0]); - const __m128i w3 = _mm_loadu_si128((const __m128i *)&whitespaces[3][0]); + // The rest of string using SIMD + static const char whitespace[16] = " \n\r\t"; + const __m128i w = _mm_loadu_si128((const __m128i *)&whitespace[0]); - unsigned char shift = reinterpret_cast(p) & 15; - const __m128i s = _mm_load_si128(reinterpret_cast(ap)); - __m128i x = _mm_cmpeq_epi8(s, w0); - x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1)); - x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2)); - x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w3)); - unsigned short r = (unsigned short)~_mm_movemask_epi8(x); - r = r >> shift << shift; // Clear results before p - if (r != 0) { -#ifdef _MSC_VER // Find the index of first non-whitespace - unsigned long offset; - _BitScanForward(&offset, r); - return ap + offset; -#else - return ap + __builtin_ffs(r) - 1; -#endif - } - ap += 16; - } - - const __m128i w = _mm_loadu_si128((const __m128i *)&whitespace[0]); - - // The rest of string - for (;; ap += 16) { - const __m128i s = _mm_load_si128((const __m128i *)ap); + for (;; p += 16) { + const __m128i s = _mm_load_si128((const __m128i *)p); const unsigned r = _mm_cvtsi128_si32(_mm_cmpistrm(w, s, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK | _SIDD_NEGATIVE_POLARITY)); if (r != 0) { // some of characters is non-whitespace #ifdef _MSC_VER // Find the index of first non-whitespace unsigned long offset; _BitScanForward(&offset, r); - return ap + offset; + return p + offset; #else - return ap + __builtin_ffs(r) - 1; + return p + __builtin_ffs(r) - 1; #endif } } @@ -302,45 +279,34 @@ inline const char *SkipWhitespace_SIMD(const char* p) { //! Skip whitespace with SSE2 instructions, testing 16 8-byte characters at once. inline const char *SkipWhitespace_SIMD(const char* p) { - static const char whitespaces[4][17] = { - " ", - "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", - "\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r", - "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"}; + // Fast return for single non-whitespace + if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') + ++p; + else + return p; - const __m128i w0 = _mm_loadu_si128((const __m128i *)&whitespaces[0][0]); - const __m128i w1 = _mm_loadu_si128((const __m128i *)&whitespaces[1][0]); - const __m128i w2 = _mm_loadu_si128((const __m128i *)&whitespaces[2][0]); - const __m128i w3 = _mm_loadu_si128((const __m128i *)&whitespaces[3][0]); - - // 16-byte align to the lower boundary - const char* ap = reinterpret_cast(reinterpret_cast(p) & ~15); - - // Test first unaligned characters - if (ap != p) { - unsigned char shift = reinterpret_cast(p) & 15; - const __m128i s = _mm_load_si128(reinterpret_cast(ap)); - __m128i x = _mm_cmpeq_epi8(s, w0); - x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1)); - x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2)); - x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w3)); - unsigned short r = (unsigned short)~_mm_movemask_epi8(x); - r = r >> shift << shift; // Clear results before p - if (r != 0) { -#ifdef _MSC_VER // Find the index of first non-whitespace - unsigned long offset; - _BitScanForward(&offset, r); - return ap + offset; -#else - return ap + __builtin_ffs(r) - 1; -#endif - } - ap += 16; - } + // 16-byte align to the next boundary + const char* nextAligned = reinterpret_cast((reinterpret_cast(p) + 15) & ~15); + while (p != nextAligned) + if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') + ++p; + else + return p; // The rest of string - for (;; ap += 16) { - const __m128i s = _mm_load_si128((const __m128i *)ap); + static const char whitespaces[4][17] = { + " ", + "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r", + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"}; + + const __m128i w0 = _mm_loadu_si128((const __m128i *)&whitespaces[0][0]); + const __m128i w1 = _mm_loadu_si128((const __m128i *)&whitespaces[1][0]); + const __m128i w2 = _mm_loadu_si128((const __m128i *)&whitespaces[2][0]); + const __m128i w3 = _mm_loadu_si128((const __m128i *)&whitespaces[3][0]); + + for (;; p += 16) { + const __m128i s = _mm_load_si128((const __m128i *)p); __m128i x = _mm_cmpeq_epi8(s, w0); x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1)); x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2)); @@ -350,9 +316,9 @@ inline const char *SkipWhitespace_SIMD(const char* p) { #ifdef _MSC_VER // Find the index of first non-whitespace unsigned long offset; _BitScanForward(&offset, r); - return ap + offset; + return p + offset; #else - return ap + __builtin_ffs(r) - 1; + return p + __builtin_ffs(r) - 1; #endif } }