From a32e2ac1d08f833667fc4138da1e20526ec671d3 Mon Sep 17 00:00:00 2001 From: miloyip Date: Tue, 29 Jul 2014 16:05:54 +0800 Subject: [PATCH] Fix SIMD page fault by using aligned load --- include/rapidjson/reader.h | 106 ++++++++++++++++++++++++++++--------- 1 file changed, 82 insertions(+), 24 deletions(-) diff --git a/include/rapidjson/reader.h b/include/rapidjson/reader.h index 084cb1e..5d3493a 100644 --- a/include/rapidjson/reader.h +++ b/include/rapidjson/reader.h @@ -178,21 +178,56 @@ void SkipWhitespace(InputStream& is) { //! Skip whitespace with SSE 4.2 pcmpistrm instruction, testing 16 8-byte characters at once. inline const char *SkipWhitespace_SIMD(const char* p) { static const char whitespace[16] = " \n\r\t"; - __m128i w = _mm_loadu_si128((const __m128i *)&whitespace[0]); + static const char whitespaces[4][17] = { + " ", + "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r", + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"}; - for (;;) { - __m128i s = _mm_loadu_si128((const __m128i *)p); - unsigned r = _mm_cvtsi128_si32(_mm_cmpistrm(w, s, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK | _SIDD_NEGATIVE_POLARITY)); - if (r == 0) // all 16 characters are whitespace - p += 16; - else { // some of characters may be non-whitespace + // 16-byte align to the lower boundary + const char* ap = reinterpret_cast(reinterpret_cast(p) & ~15); + + // Test first unaligned characters + // Cannot make use of _mm_cmpistrm() because it stops when encounters '\0' before p + if (ap != p) { + const __m128i w0 = _mm_loadu_si128((const __m128i *)&whitespaces[0][0]); + const __m128i w1 = _mm_loadu_si128((const __m128i *)&whitespaces[1][0]); + const __m128i w2 = _mm_loadu_si128((const __m128i *)&whitespaces[2][0]); + const __m128i w3 = _mm_loadu_si128((const __m128i *)&whitespaces[3][0]); + + unsigned char shift = reinterpret_cast(p) & 15; + const __m128i s = _mm_load_si128(reinterpret_cast(ap)); + __m128i x = _mm_cmpeq_epi8(s, w0); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1)); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2)); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w3)); + unsigned short r = (unsigned short)~_mm_movemask_epi8(x); + r = r >> shift << shift; // Clear results before p + if (r != 0) { #ifdef _MSC_VER // Find the index of first non-whitespace unsigned long offset; - if (_BitScanForward(&offset, r)) - return p + offset; + _BitScanForward(&offset, r); + return ap + offset; #else - if (r != 0) - return p + __builtin_ffs(r) - 1; + return ap + __builtin_ffs(r) - 1; +#endif + } + ap += 16; + } + + const __m128i w = _mm_loadu_si128((const __m128i *)&whitespace[0]); + + // The rest of string + for (;; ap += 16) { + const __m128i s = _mm_load_si128((const __m128i *)ap); + const unsigned r = _mm_cvtsi128_si32(_mm_cmpistrm(w, s, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK | _SIDD_NEGATIVE_POLARITY)); + if (r != 0) { // some of characters is non-whitespace +#ifdef _MSC_VER // Find the index of first non-whitespace + unsigned long offset; + _BitScanForward(&offset, r); + return ap + offset; +#else + return ap + __builtin_ffs(r) - 1; #endif } } @@ -208,28 +243,51 @@ inline const char *SkipWhitespace_SIMD(const char* p) { "\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r", "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"}; - __m128i w0 = _mm_loadu_si128((const __m128i *)&whitespaces[0][0]); - __m128i w1 = _mm_loadu_si128((const __m128i *)&whitespaces[1][0]); - __m128i w2 = _mm_loadu_si128((const __m128i *)&whitespaces[2][0]); - __m128i w3 = _mm_loadu_si128((const __m128i *)&whitespaces[3][0]); + const __m128i w0 = _mm_loadu_si128((const __m128i *)&whitespaces[0][0]); + const __m128i w1 = _mm_loadu_si128((const __m128i *)&whitespaces[1][0]); + const __m128i w2 = _mm_loadu_si128((const __m128i *)&whitespaces[2][0]); + const __m128i w3 = _mm_loadu_si128((const __m128i *)&whitespaces[3][0]); - for (;;) { - __m128i s = _mm_loadu_si128((const __m128i *)p); + // 16-byte align to the lower boundary + const char* ap = reinterpret_cast(reinterpret_cast(p) & ~15); + + // Test first unaligned characters + if (ap != p) { + unsigned char shift = reinterpret_cast(p) & 15; + const __m128i s = _mm_load_si128(reinterpret_cast(ap)); __m128i x = _mm_cmpeq_epi8(s, w0); x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1)); x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2)); x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w3)); unsigned short r = (unsigned short)~_mm_movemask_epi8(x); - if (r == 0) // all 16 characters are whitespace - p += 16; - else { // some of characters may be non-whitespace + r = r >> shift << shift; // Clear results before p + if (r != 0) { #ifdef _MSC_VER // Find the index of first non-whitespace unsigned long offset; - if (_BitScanForward(&offset, r)) - return p + offset; + _BitScanForward(&offset, r); + return ap + offset; #else - if (r != 0) - return p + __builtin_ffs(r) - 1; + return ap + __builtin_ffs(r) - 1; +#endif + } + ap += 16; + } + + // The rest of string + for (;; ap += 16) { + const __m128i s = _mm_load_si128((const __m128i *)ap); + __m128i x = _mm_cmpeq_epi8(s, w0); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1)); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2)); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w3)); + unsigned short r = (unsigned short)~_mm_movemask_epi8(x); + if (r != 0) { // some of characters may be non-whitespace +#ifdef _MSC_VER // Find the index of first non-whitespace + unsigned long offset; + _BitScanForward(&offset, r); + return ap + offset; +#else + return ap + __builtin_ffs(r) - 1; #endif } }