Fix SIMD page fault by using aligned load
This commit is contained in:
parent
fdd380bbc0
commit
a32e2ac1d0
@ -178,21 +178,56 @@ void SkipWhitespace(InputStream& is) {
|
|||||||
//! Skip whitespace with SSE 4.2 pcmpistrm instruction, testing 16 8-byte characters at once.
|
//! Skip whitespace with SSE 4.2 pcmpistrm instruction, testing 16 8-byte characters at once.
|
||||||
inline const char *SkipWhitespace_SIMD(const char* p) {
|
inline const char *SkipWhitespace_SIMD(const char* p) {
|
||||||
static const char whitespace[16] = " \n\r\t";
|
static const char whitespace[16] = " \n\r\t";
|
||||||
__m128i w = _mm_loadu_si128((const __m128i *)&whitespace[0]);
|
static const char whitespaces[4][17] = {
|
||||||
|
" ",
|
||||||
|
"\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
||||||
|
"\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r",
|
||||||
|
"\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"};
|
||||||
|
|
||||||
for (;;) {
|
// 16-byte align to the lower boundary
|
||||||
__m128i s = _mm_loadu_si128((const __m128i *)p);
|
const char* ap = reinterpret_cast<const char*>(reinterpret_cast<size_t>(p) & ~15);
|
||||||
unsigned r = _mm_cvtsi128_si32(_mm_cmpistrm(w, s, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK | _SIDD_NEGATIVE_POLARITY));
|
|
||||||
if (r == 0) // all 16 characters are whitespace
|
// Test first unaligned characters
|
||||||
p += 16;
|
// Cannot make use of _mm_cmpistrm() because it stops when encounters '\0' before p
|
||||||
else { // some of characters may be non-whitespace
|
if (ap != p) {
|
||||||
|
const __m128i w0 = _mm_loadu_si128((const __m128i *)&whitespaces[0][0]);
|
||||||
|
const __m128i w1 = _mm_loadu_si128((const __m128i *)&whitespaces[1][0]);
|
||||||
|
const __m128i w2 = _mm_loadu_si128((const __m128i *)&whitespaces[2][0]);
|
||||||
|
const __m128i w3 = _mm_loadu_si128((const __m128i *)&whitespaces[3][0]);
|
||||||
|
|
||||||
|
unsigned char shift = reinterpret_cast<size_t>(p) & 15;
|
||||||
|
const __m128i s = _mm_load_si128(reinterpret_cast<const __m128i*>(ap));
|
||||||
|
__m128i x = _mm_cmpeq_epi8(s, w0);
|
||||||
|
x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1));
|
||||||
|
x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2));
|
||||||
|
x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w3));
|
||||||
|
unsigned short r = (unsigned short)~_mm_movemask_epi8(x);
|
||||||
|
r = r >> shift << shift; // Clear results before p
|
||||||
|
if (r != 0) {
|
||||||
#ifdef _MSC_VER // Find the index of first non-whitespace
|
#ifdef _MSC_VER // Find the index of first non-whitespace
|
||||||
unsigned long offset;
|
unsigned long offset;
|
||||||
if (_BitScanForward(&offset, r))
|
_BitScanForward(&offset, r);
|
||||||
return p + offset;
|
return ap + offset;
|
||||||
#else
|
#else
|
||||||
if (r != 0)
|
return ap + __builtin_ffs(r) - 1;
|
||||||
return p + __builtin_ffs(r) - 1;
|
#endif
|
||||||
|
}
|
||||||
|
ap += 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
const __m128i w = _mm_loadu_si128((const __m128i *)&whitespace[0]);
|
||||||
|
|
||||||
|
// The rest of string
|
||||||
|
for (;; ap += 16) {
|
||||||
|
const __m128i s = _mm_load_si128((const __m128i *)ap);
|
||||||
|
const unsigned r = _mm_cvtsi128_si32(_mm_cmpistrm(w, s, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK | _SIDD_NEGATIVE_POLARITY));
|
||||||
|
if (r != 0) { // some of characters is non-whitespace
|
||||||
|
#ifdef _MSC_VER // Find the index of first non-whitespace
|
||||||
|
unsigned long offset;
|
||||||
|
_BitScanForward(&offset, r);
|
||||||
|
return ap + offset;
|
||||||
|
#else
|
||||||
|
return ap + __builtin_ffs(r) - 1;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -208,28 +243,51 @@ inline const char *SkipWhitespace_SIMD(const char* p) {
|
|||||||
"\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r",
|
"\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r",
|
||||||
"\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"};
|
"\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"};
|
||||||
|
|
||||||
__m128i w0 = _mm_loadu_si128((const __m128i *)&whitespaces[0][0]);
|
const __m128i w0 = _mm_loadu_si128((const __m128i *)&whitespaces[0][0]);
|
||||||
__m128i w1 = _mm_loadu_si128((const __m128i *)&whitespaces[1][0]);
|
const __m128i w1 = _mm_loadu_si128((const __m128i *)&whitespaces[1][0]);
|
||||||
__m128i w2 = _mm_loadu_si128((const __m128i *)&whitespaces[2][0]);
|
const __m128i w2 = _mm_loadu_si128((const __m128i *)&whitespaces[2][0]);
|
||||||
__m128i w3 = _mm_loadu_si128((const __m128i *)&whitespaces[3][0]);
|
const __m128i w3 = _mm_loadu_si128((const __m128i *)&whitespaces[3][0]);
|
||||||
|
|
||||||
for (;;) {
|
// 16-byte align to the lower boundary
|
||||||
__m128i s = _mm_loadu_si128((const __m128i *)p);
|
const char* ap = reinterpret_cast<const char*>(reinterpret_cast<size_t>(p) & ~15);
|
||||||
|
|
||||||
|
// Test first unaligned characters
|
||||||
|
if (ap != p) {
|
||||||
|
unsigned char shift = reinterpret_cast<size_t>(p) & 15;
|
||||||
|
const __m128i s = _mm_load_si128(reinterpret_cast<const __m128i*>(ap));
|
||||||
__m128i x = _mm_cmpeq_epi8(s, w0);
|
__m128i x = _mm_cmpeq_epi8(s, w0);
|
||||||
x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1));
|
x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1));
|
||||||
x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2));
|
x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2));
|
||||||
x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w3));
|
x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w3));
|
||||||
unsigned short r = (unsigned short)~_mm_movemask_epi8(x);
|
unsigned short r = (unsigned short)~_mm_movemask_epi8(x);
|
||||||
if (r == 0) // all 16 characters are whitespace
|
r = r >> shift << shift; // Clear results before p
|
||||||
p += 16;
|
if (r != 0) {
|
||||||
else { // some of characters may be non-whitespace
|
|
||||||
#ifdef _MSC_VER // Find the index of first non-whitespace
|
#ifdef _MSC_VER // Find the index of first non-whitespace
|
||||||
unsigned long offset;
|
unsigned long offset;
|
||||||
if (_BitScanForward(&offset, r))
|
_BitScanForward(&offset, r);
|
||||||
return p + offset;
|
return ap + offset;
|
||||||
#else
|
#else
|
||||||
if (r != 0)
|
return ap + __builtin_ffs(r) - 1;
|
||||||
return p + __builtin_ffs(r) - 1;
|
#endif
|
||||||
|
}
|
||||||
|
ap += 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
// The rest of string
|
||||||
|
for (;; ap += 16) {
|
||||||
|
const __m128i s = _mm_load_si128((const __m128i *)ap);
|
||||||
|
__m128i x = _mm_cmpeq_epi8(s, w0);
|
||||||
|
x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1));
|
||||||
|
x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2));
|
||||||
|
x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w3));
|
||||||
|
unsigned short r = (unsigned short)~_mm_movemask_epi8(x);
|
||||||
|
if (r != 0) { // some of characters may be non-whitespace
|
||||||
|
#ifdef _MSC_VER // Find the index of first non-whitespace
|
||||||
|
unsigned long offset;
|
||||||
|
_BitScanForward(&offset, r);
|
||||||
|
return ap + offset;
|
||||||
|
#else
|
||||||
|
return ap + __builtin_ffs(r) - 1;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user