Merge pull request #553 from miloyip/issue158_parsestdstring

Issue158 parsestdstring
This commit is contained in:
Milo Yip 2016-02-20 21:42:32 +08:00
commit dd25c9651a
8 changed files with 243 additions and 2 deletions

View File

@ -20,6 +20,8 @@
#include "reader.h" #include "reader.h"
#include "internal/meta.h" #include "internal/meta.h"
#include "internal/strfunc.h" #include "internal/strfunc.h"
#include "memorystream.h"
#include "encodedstream.h"
#include <new> // placement new #include <new> // placement new
#ifdef _MSC_VER #ifdef _MSC_VER
@ -2224,6 +2226,42 @@ public:
GenericDocument& Parse(const Ch* str) { GenericDocument& Parse(const Ch* str) {
return Parse<kParseDefaultFlags>(str); return Parse<kParseDefaultFlags>(str);
} }
//! Parse JSON text from a length-delimited buffer, with an explicit source encoding.
/*! The text is read through a MemoryStream that is bounded by \c length, so the
    input is consumed by size rather than by looking for a terminator.
    \tparam parseFlags Combination of parse flags; must not contain kParseInsituFlag.
    \tparam SourceEncoding Encoding of the characters stored in \c str.
    \param str Pointer to the first character of the JSON text.
    \param length Number of \c SourceEncoding::Ch characters (not bytes) in \c str.
    \return The document itself.
*/
template <unsigned parseFlags, typename SourceEncoding>
GenericDocument& Parse(const typename SourceEncoding::Ch* str, size_t length) {
    RAPIDJSON_ASSERT(!(parseFlags & kParseInsituFlag));
    // The buffer is handed to MemoryStream as raw bytes. reinterpret_cast is required here:
    // a static_cast from a non-char character pointer to const char* does not compile.
    MemoryStream ms(reinterpret_cast<const char*>(str), length * sizeof(typename SourceEncoding::Ch));
    EncodedInputStream<SourceEncoding, MemoryStream> is(ms);
    ParseStream<parseFlags, SourceEncoding>(is);
    return *this;
}
//! Parse JSON text from a length-delimited buffer stored in the document's own encoding.
/*! Forwards to Parse<parseFlags, SourceEncoding>(str, length) with \c Encoding as the source encoding.
\tparam parseFlags Parse flags, forwarded unchanged.
\param str Pointer to the first character of the JSON text.
\param length Number of characters in \c str, forwarded unchanged.
\return Whatever the forwarded overload returns (the document itself).
*/
template <unsigned parseFlags>
GenericDocument& Parse(const Ch* str, size_t length) {
return Parse<parseFlags, Encoding>(str, length);
}
//! Parse JSON text from a length-delimited buffer using the default parse flags.
/*! Forwards to Parse<kParseDefaultFlags>(str, length).
\param str Pointer to the first character of the JSON text.
\param length Number of characters in \c str, forwarded unchanged.
\return Whatever the forwarded overload returns (the document itself).
*/
GenericDocument& Parse(const Ch* str, size_t length) {
return Parse<kParseDefaultFlags>(str, length);
}
#if RAPIDJSON_HAS_STDSTRING
//! Parse JSON text held in a std::basic_string, with an explicit source encoding.
/*! Only the pointer returned by c_str() is forwarded; the string's size() is not passed on.
NOTE(review): the pointer-only overload presumably stops at the first null character, so any
text after an embedded '\0' in \c str would not be parsed - confirm against that overload.
\tparam parseFlags Parse flags, forwarded unchanged.
\tparam SourceEncoding Encoding of the characters stored in \c str.
\param str String containing the JSON text.
\return Whatever the forwarded overload returns (the document itself).
*/
template <unsigned parseFlags, typename SourceEncoding>
GenericDocument& Parse(const std::basic_string<typename SourceEncoding::Ch>& str) {
// c_str() is constant complexity according to standard. Should be faster than Parse(const char*, size_t)
return Parse<parseFlags, SourceEncoding>(str.c_str());
}
//! Parse JSON text held in a std::basic_string stored in the document's own encoding.
/*! Forwards to Parse<parseFlags, SourceEncoding>(str) with \c Encoding as the source encoding.
\tparam parseFlags Parse flags, forwarded unchanged.
\param str String containing the JSON text.
\return Whatever the forwarded overload returns (the document itself).
*/
template <unsigned parseFlags>
GenericDocument& Parse(const std::basic_string<Ch>& str) {
return Parse<parseFlags, Encoding>(str);
}
//! Parse JSON text held in a std::basic_string using the default parse flags.
/*! Forwards to Parse<kParseDefaultFlags>(str).
\param str String containing the JSON text.
\return Whatever the forwarded overload returns (the document itself).
*/
GenericDocument& Parse(const std::basic_string<Ch>& str) {
return Parse<kParseDefaultFlags>(str);
}
#endif // RAPIDJSON_HAS_STDSTRING
//!@} //!@}
//!@name Handling parse errors //!@name Handling parse errors

View File

@ -16,6 +16,7 @@
#define RAPIDJSON_ENCODEDSTREAM_H_ #define RAPIDJSON_ENCODEDSTREAM_H_
#include "stream.h" #include "stream.h"
#include "memorystream.h"
#ifdef __GNUC__ #ifdef __GNUC__
RAPIDJSON_DIAG_PUSH RAPIDJSON_DIAG_PUSH
@ -62,6 +63,30 @@ private:
Ch current_; Ch current_;
}; };
//! Specialized for UTF8 MemoryStream.
/*!
    Pass-through wrapper: after an optional UTF-8 byte order mark has been skipped
    in the constructor, Peek(), Take() and Tell() forward directly to the wrapped
    MemoryStream. The output half of the stream interface is stubbed out.
*/
template <>
class EncodedInputStream<UTF8<>, MemoryStream> {
public:
    typedef UTF8<>::Ch Ch;

    //! Wrap \c is and skip a leading UTF-8 byte order mark (EF BB BF), if present.
    /*! \param is Byte stream to read from; it is held by reference. */
    EncodedInputStream(MemoryStream& is) : is_(is) {
        // Each BOM byte is consumed only if the previous BOM byte was matched.
        // Checking the three bytes independently would also swallow a stray
        // leading 0xBB or 0xBF that is not part of a byte order mark.
        if (static_cast<unsigned char>(is_.Peek()) == 0xEFu) {
            is_.Take();
            if (static_cast<unsigned char>(is_.Peek()) == 0xBBu) {
                is_.Take();
                if (static_cast<unsigned char>(is_.Peek()) == 0xBFu)
                    is_.Take();
            }
        }
    }

    //! Return the current character of the wrapped stream without consuming it.
    Ch Peek() const { return is_.Peek(); }
    //! Return the current character of the wrapped stream and advance.
    Ch Take() { return is_.Take(); }
    //! Return the read position reported by the wrapped stream.
    size_t Tell() const { return is_.Tell(); }

    // Not implemented
    void Put(Ch) {}
    void Flush() {}
    Ch* PutBegin() { return 0; }
    size_t PutEnd(Ch*) { return 0; }

    MemoryStream& is_;  //!< Wrapped byte stream.
};
//! Output byte stream wrapper with statically bound encoding. //! Output byte stream wrapper with statically bound encoding.
/*! /*!
\tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE. \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.

View File

@ -42,8 +42,8 @@ struct MemoryStream {
MemoryStream(const Ch *src, size_t size) : src_(src), begin_(src), end_(src + size), size_(size) {} MemoryStream(const Ch *src, size_t size) : src_(src), begin_(src), end_(src + size), size_(size) {}
Ch Peek() const { return (src_ == end_) ? '\0' : *src_; } Ch Peek() const { return RAPIDJSON_UNLIKELY(src_ == end_) ? '\0' : *src_; }
Ch Take() { return (src_ == end_) ? '\0' : *src_++; } Ch Take() { return RAPIDJSON_UNLIKELY(src_ == end_) ? '\0' : *src_++; }
size_t Tell() const { return static_cast<size_t>(src_ - begin_); } size_t Tell() const { return static_cast<size_t>(src_ - begin_); }
Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }

View File

@ -19,6 +19,7 @@
#include "allocators.h" #include "allocators.h"
#include "stream.h" #include "stream.h"
#include "encodedstream.h"
#include "internal/meta.h" #include "internal/meta.h"
#include "internal/stack.h" #include "internal/stack.h"
#include "internal/strtod.h" #include "internal/strtod.h"
@ -259,6 +260,12 @@ void SkipWhitespace(InputStream& is) {
s.Take(); s.Take();
} }
//! Skip JSON whitespace (space, LF, CR, tab) inside the half-open range [p, end).
/*! \param p First character to examine.
    \param end One past the last character that may be examined.
    \return Pointer to the first character that is not whitespace, or \c end
            when every character in the range is whitespace.
*/
inline const char* SkipWhitespace(const char* p, const char* end) {
    for (; p != end; ++p) {
        switch (*p) {
        case ' ':
        case '\n':
        case '\r':
        case '\t':
            continue;   // still whitespace: advance to the next character
        default:
            return p;   // first non-whitespace character
        }
    }
    return end;
}
#ifdef RAPIDJSON_SSE42 #ifdef RAPIDJSON_SSE42
//! Skip whitespace with SSE 4.2 pcmpistrm instruction, testing 16 8-bit characters at once. //! Skip whitespace with SSE 4.2 pcmpistrm instruction, testing 16 8-bit characters at once.
inline const char *SkipWhitespace_SIMD(const char* p) { inline const char *SkipWhitespace_SIMD(const char* p) {
@ -295,6 +302,34 @@ inline const char *SkipWhitespace_SIMD(const char* p) {
} }
} }
//! Skip whitespace in the half-open range [p, end) with SSE 4.2, testing 16 characters at once.
/*! \param p First character to examine.
    \param end One past the last character that may be examined.
    \return Pointer to the first non-whitespace character found, or the result of the
            scalar SkipWhitespace(p, end) for the tail that is shorter than 16 characters.
*/
inline const char *SkipWhitespace_SIMD(const char* p, const char* end) {
    // Fast return for single non-whitespace
    if (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t'))
        ++p;
    else
        return p;

    // The middle of string using SIMD
    static const char whitespace[16] = " \n\r\t";
    const __m128i w = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespace[0]));

    // Test the remaining length instead of forming `end - 16`: for ranges shorter than
    // 16 characters that pointer would lie before the start of the buffer, which is
    // undefined behaviour.
    for (; end - p >= 16; p += 16) {
        const __m128i s = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
        const int r = _mm_cvtsi128_si32(_mm_cmpistrm(w, s, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK | _SIDD_NEGATIVE_POLARITY));
        if (r != 0) {   // some of characters is non-whitespace
#ifdef _MSC_VER         // Find the index of first non-whitespace
            unsigned long offset;
            _BitScanForward(&offset, r);
            return p + offset;
#else
            return p + __builtin_ffs(r) - 1;
#endif
        }
    }

    // Fewer than 16 characters remain: finish with the scalar loop.
    return SkipWhitespace(p, end);
}
#elif defined(RAPIDJSON_SSE2) #elif defined(RAPIDJSON_SSE2)
//! Skip whitespace with SSE2 instructions, testing 16 8-bit characters at once. //! Skip whitespace with SSE2 instructions, testing 16 8-bit characters at once.
@ -342,6 +377,44 @@ inline const char *SkipWhitespace_SIMD(const char* p) {
} }
} }
//! Skip whitespace in the half-open range [p, end) with SSE2, testing 16 characters at once.
/*! \param p First character to examine.
    \param end One past the last character that may be examined.
    \return Pointer to the first non-whitespace character found, or the result of the
            scalar SkipWhitespace(p, end) for the tail that is shorter than 16 characters.
*/
inline const char *SkipWhitespace_SIMD(const char* p, const char* end) {
    // Fast return for single non-whitespace
    if (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t'))
        ++p;
    else
        return p;

    // The rest of string
    #define C16(c) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c }
    static const char whitespaces[4][16] = { C16(' '), C16('\n'), C16('\r'), C16('\t') };
    #undef C16

    const __m128i w0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespaces[0][0]));
    const __m128i w1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespaces[1][0]));
    const __m128i w2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespaces[2][0]));
    const __m128i w3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespaces[3][0]));

    // Test the remaining length instead of forming `end - 16`: for ranges shorter than
    // 16 characters that pointer would lie before the start of the buffer, which is
    // undefined behaviour.
    for (; end - p >= 16; p += 16) {
        const __m128i s = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
        // x has 0xFF in every byte lane that matches one of the four whitespace characters.
        __m128i x = _mm_cmpeq_epi8(s, w0);
        x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1));
        x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2));
        x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w3));
        // Inverted mask: a set bit marks a lane that is NOT whitespace.
        unsigned short r = static_cast<unsigned short>(~_mm_movemask_epi8(x));
        if (r != 0) {   // some of characters may be non-whitespace
#ifdef _MSC_VER         // Find the index of first non-whitespace
            unsigned long offset;
            _BitScanForward(&offset, r);
            return p + offset;
#else
            return p + __builtin_ffs(r) - 1;
#endif
        }
    }

    // Fewer than 16 characters remain: finish with the scalar loop.
    return SkipWhitespace(p, end);
}
#endif // RAPIDJSON_SSE2 #endif // RAPIDJSON_SSE2
#ifdef RAPIDJSON_SIMD #ifdef RAPIDJSON_SIMD
@ -354,6 +427,10 @@ template<> inline void SkipWhitespace(InsituStringStream& is) {
template<> inline void SkipWhitespace(StringStream& is) { template<> inline void SkipWhitespace(StringStream& is) {
is.src_ = SkipWhitespace_SIMD(is.src_); is.src_ = SkipWhitespace_SIMD(is.src_);
} }
//! SkipWhitespace specialization for a UTF-8 EncodedInputStream wrapping a MemoryStream.
/*! Passes the wrapped MemoryStream's current read pointer (src_) and end pointer (end_) to the
bounded SkipWhitespace_SIMD(p, end) and stores the returned pointer back into src_.
\param is Stream whose underlying MemoryStream read pointer is advanced in place.
*/
template<> inline void SkipWhitespace(EncodedInputStream<UTF8<>, MemoryStream>& is) {
is.is_.src_ = SkipWhitespace_SIMD(is.is_.src_, is.is_.end_);
}
#endif // RAPIDJSON_SIMD #endif // RAPIDJSON_SIMD
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////

View File

@ -30,6 +30,8 @@
# define RAPIDJSON_SSE2 # define RAPIDJSON_SSE2
#endif #endif
#define RAPIDJSON_HAS_STDSTRING 1
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Google Test // Google Test

View File

@ -187,6 +187,25 @@ TEST_F(RapidJson, SIMD_SUFFIX(DocumentParse_MemoryPoolAllocator)) {
} }
} }
// Parses json_ kTrialCount times through the length-delimited Parse(str, length)
// overload, using a fresh Document for every trial, and checks that the root is an object.
TEST_F(RapidJson, SIMD_SUFFIX(DocumentParseLength_MemoryPoolAllocator)) {
    size_t trial = 0;
    while (trial < kTrialCount) {
        Document doc;
        doc.Parse(json_, length_);
        ASSERT_TRUE(doc.IsObject());
        ++trial;
    }
}
#if RAPIDJSON_HAS_STDSTRING
// Parses the same text kTrialCount times through the std::string Parse overload.
// The string is built once, outside the loop, from json_ / length_.
TEST_F(RapidJson, SIMD_SUFFIX(DocumentParseStdString_MemoryPoolAllocator)) {
    const std::string s(json_, length_);
    size_t trial = 0;
    while (trial < kTrialCount) {
        Document doc;
        doc.Parse(s);
        ASSERT_TRUE(doc.IsObject());
        ++trial;
    }
}
#endif
TEST_F(RapidJson, SIMD_SUFFIX(DocumentParseIterative_MemoryPoolAllocator)) { TEST_F(RapidJson, SIMD_SUFFIX(DocumentParseIterative_MemoryPoolAllocator)) {
for (size_t i = 0; i < kTrialCount; i++) { for (size_t i = 0; i < kTrialCount; i++) {
Document doc; Document doc;

View File

@ -34,6 +34,8 @@ void ParseCheck(DocumentType& doc) {
typedef typename DocumentType::ValueType ValueType; typedef typename DocumentType::ValueType ValueType;
EXPECT_FALSE(doc.HasParseError()); EXPECT_FALSE(doc.HasParseError());
if (doc.HasParseError())
printf("Error: %d at %zu\n", static_cast<int>(doc.GetParseError()), doc.GetErrorOffset());
EXPECT_TRUE(static_cast<ParseResult>(doc)); EXPECT_TRUE(static_cast<ParseResult>(doc));
EXPECT_TRUE(doc.IsObject()); EXPECT_TRUE(doc.IsObject());
@ -93,6 +95,26 @@ void ParseTest() {
doc.ParseInsitu(buffer); doc.ParseInsitu(buffer);
ParseCheck(doc); ParseCheck(doc);
free(buffer); free(buffer);
// Parse(const Ch*, size_t)
size_t length = strlen(json);
buffer = reinterpret_cast<char*>(malloc(length * 2));
memcpy(buffer, json, length);
memset(buffer + length, 'X', length);
#if RAPIDJSON_HAS_STDSTRING
std::string s2(buffer, length); // backup buffer
#endif
doc.SetNull();
doc.Parse(buffer, length);
free(buffer);
ParseCheck(doc);
#if RAPIDJSON_HAS_STDSTRING
// Parse(std::string)
doc.SetNull();
doc.Parse(s2);
ParseCheck(doc);
#endif
} }
TEST(Document, Parse) { TEST(Document, Parse) {
@ -140,6 +162,42 @@ static FILE* OpenEncodedFile(const char* filename) {
return 0; return 0;
} }
// Parses UTF-8 source text into a UTF-16 document through the overloads that take
// an explicit SourceEncoding: first the (pointer, length) form, then the std::string form.
TEST(Document, Parse_Encoding) {
    const char* json = " { \"hello\" : \"world\", \"t\" : true , \"f\" : false, \"n\": null, \"i\":123, \"pi\": 3.1416, \"a\":[1, 2, 3, 4] } ";

    typedef GenericDocument<UTF16<> > DocumentType;
    DocumentType doc;

    // Disabled: Parse<unsigned, SourceEncoding>(const SourceEncoding::Ch*)
    // doc.Parse<kParseDefaultFlags, UTF8<> >(json);
    // EXPECT_FALSE(doc.HasParseError());
    // EXPECT_EQ(0, StrCmp(doc[L"hello"].GetString(), L"world"));

    // Parse<unsigned, SourceEncoding>(const SourceEncoding::Ch*, size_t)
    // The buffer is twice as long as the JSON text; the second half is filled with 'X'
    // and there is no terminator after the first `jsonLength` bytes.
    const size_t jsonLength = strlen(json);
    char* padded = static_cast<char*>(malloc(jsonLength * 2));
    memcpy(padded, json, jsonLength);
    memset(padded + jsonLength, 'X', jsonLength);
#if RAPIDJSON_HAS_STDSTRING
    std::string backup(padded, jsonLength); // copy of the JSON part, taken before the buffer is freed
#endif
    doc.SetNull();
    doc.Parse<kParseDefaultFlags, UTF8<> >(padded, jsonLength);
    free(padded); // released before the document is inspected
    EXPECT_FALSE(doc.HasParseError());
    if (doc.HasParseError()) {
        printf("Error: %d at %zu\n", static_cast<int>(doc.GetParseError()), doc.GetErrorOffset());
    }
    EXPECT_EQ(0, StrCmp(doc[L"hello"].GetString(), L"world"));

#if RAPIDJSON_HAS_STDSTRING
    // Parse<unsigned, SourceEncoding>(std::string)
    doc.SetNull();
    doc.Parse<kParseDefaultFlags, UTF8<> >(backup);
    EXPECT_FALSE(doc.HasParseError());
    EXPECT_EQ(0, StrCmp(doc[L"hello"].GetString(), L"world"));
#endif
}
TEST(Document, ParseStream_EncodedInputStream) { TEST(Document, ParseStream_EncodedInputStream) {
// UTF8 -> UTF16 // UTF8 -> UTF16
FILE* fp = OpenEncodedFile("utf8.json"); FILE* fp = OpenEncodedFile("utf8.json");

View File

@ -73,6 +73,28 @@ TEST(SIMD, SIMD_SUFFIX(SkipWhitespace)) {
TestSkipWhitespace<InsituStringStream>(); TestSkipWhitespace<InsituStringStream>();
} }
// For every step in [1, 32): fill a 1024-byte buffer with whitespace, put an 'X' at
// every multiple of `step`, then walk it through SkipWhitespace() on a UTF-8
// EncodedInputStream over a MemoryStream, expecting an 'X' after every skip.
TEST(SIMD, SIMD_SUFFIX(SkipWhitespace_EncodedMemoryStream)) {
    const size_t kBufferSize = 1024;
    for (size_t step = 1; step < 32; step++) {
        char buffer[kBufferSize];
        for (size_t pos = 0; pos < kBufferSize; pos++)
            buffer[pos] = " \t\r\n"[pos % 4];
        for (size_t pos = 0; pos < kBufferSize; pos += step)
            buffer[pos] = 'X';

        MemoryStream ms(buffer, kBufferSize);
        EncodedInputStream<UTF8<>, MemoryStream> s(ms);
        // `expected` tracks the offset of the next 'X'; the check that uses it is disabled.
        for (size_t expected = 0; ; expected += step) {
            SkipWhitespace(s);
            if (s.Peek() == '\0')
                break;  // MemoryStream yields '\0' once the end of the buffer is reached
            //EXPECT_EQ(expected, s.Tell());
            EXPECT_EQ('X', s.Take());
        }
    }
}
struct ScanCopyUnescapedStringHandler : BaseReaderHandler<UTF8<>, ScanCopyUnescapedStringHandler> { struct ScanCopyUnescapedStringHandler : BaseReaderHandler<UTF8<>, ScanCopyUnescapedStringHandler> {
bool String(const char* str, size_t length, bool) { bool String(const char* str, size_t length, bool) {
memcpy(buffer, str, length + 1); memcpy(buffer, str, length + 1);