Improve performance of sdssplitargs (#1230)

The current implementation of `sdssplitargs` does repeated `sdscatlen`
to build the parsed arguments, which isn't very efficient because it
does a lot of extra reallocations and moves through the sds code a lot.
It also typically results in memory overhead, because `sdscatlen`
over-allocates, which is usually not needed since args are usually not
modified after being created.

The new implementation of sdssplitargs does two passes, the first to
parse the argument to figure out the final length and the second to
actually copy the string. It's generally about 2x faster for larger
strings (~100 bytes), and about 20% faster for small strings (~10
bytes). Making two passes is still faster because, as long as the input
is in the CPU cache, scanning it a second time is cheap.

There are a couple of sanity tests (none existed before), as well as some
fuzzing, which was used to find some bugs and also to do the
benchmarking. The original benchmarking code can be seen in commit
6576aeb86a.

```
[test_sdssplitargs_benchmark - unit/test_sds.c:530] Using random seed: 1729883235
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 56.44%, new:13039us, old:29930us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 56.58%, new:12057us, old:27771us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 59.18%, new:9048us, old:22165us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 54.61%, new:12381us, old:27278us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 51.17%, new:16012us, old:32793us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 49.18%, new:16041us, old:31563us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 58.40%, new:12450us, old:29930us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 56.49%, new:13066us, old:30031us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 58.75%, new:12744us, old:30894us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 52.44%, new:16885us, old:35504us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 62.57%, new:8107us, old:21659us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 62.12%, new:8320us, old:21966us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 45.23%, new:13960us, old:25487us
[test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 57.95%, new:9188us, old:21849us
```

---------

Signed-off-by: Madelyn Olson <madelyneolson@gmail.com>
This commit is contained in:
Madelyn Olson 2024-10-31 11:37:53 -07:00 committed by GitHub
parent 91cbf77442
commit 1c222f77ce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 142 additions and 81 deletions

179
src/sds.c
View File

@ -1032,6 +1032,86 @@ int hex_digit_to_int(char c) {
}
}
/* Helper function for sdssplitargs that parses a single argument starting
 * at arg.
 *
 * The function has two output modes, which may be combined:
 *  - if len is not NULL, *len is incremented once for every byte of the
 *    parsed (unquoted, unescaped) argument. The caller must initialize it.
 *  - if dst is not NULL, the parsed bytes are written to dst, which must be
 *    large enough to hold them. No null terminator is written.
 *
 * If the argument can be parsed, the function returns the number of input
 * characters that were consumed (including any quotes and the single
 * separator that ended an unquoted argument). If the argument can't be
 * parsed (unterminated quotes, or a closing quote followed by a non-space
 * character), it returns 0. */
static int sdsparsearg(const char *arg, unsigned int *len, char *dst) {
    const char *p = arg;
    int inq = 0;  /* set to 1 if we are in "quotes" */
    int insq = 0; /* set to 1 if we are in 'single quotes' */
    int done = 0;

    while (!done) {
        /* -1 means "this input character produces no output byte". Input
         * bytes are converted through unsigned char (0..255) so that a raw
         * 0xFF byte is never confused with the sentinel on platforms where
         * plain char is signed. */
        int new_char = -1;

        if (inq) {
            if (*p == '\\' && *(p + 1) == 'x' && is_hex_digit(*(p + 2)) && is_hex_digit(*(p + 3))) {
                new_char = (hex_digit_to_int(*(p + 2)) * 16) + hex_digit_to_int(*(p + 3));
                /* Skip "\xH"; the final hex digit is skipped by the common
                 * increment at the bottom of the loop. */
                p += 3;
            } else if (*p == '\\' && *(p + 1)) {
                p++;
                switch (*p) {
                case 'n': new_char = '\n'; break;
                case 'r': new_char = '\r'; break;
                case 't': new_char = '\t'; break;
                case 'b': new_char = '\b'; break;
                case 'a': new_char = '\a'; break;
                default: new_char = (unsigned char)*p; break;
                }
            } else if (*p == '"') {
                /* closing quote must be followed by a space or
                 * nothing at all. */
                if (*(p + 1) && !isspace((unsigned char)*(p + 1))) return 0;
                done = 1;
            } else if (!*p) {
                /* unterminated quotes */
                return 0;
            } else {
                new_char = (unsigned char)*p;
            }
        } else if (insq) {
            if (*p == '\\' && *(p + 1) == '\'') {
                /* \' is the only escape recognized inside single quotes. */
                p++;
                new_char = '\'';
            } else if (*p == '\'') {
                /* closing quote must be followed by a space or
                 * nothing at all. */
                if (*(p + 1) && !isspace((unsigned char)*(p + 1))) return 0;
                done = 1;
            } else if (!*p) {
                /* unterminated quotes */
                return 0;
            } else {
                new_char = (unsigned char)*p;
            }
        } else {
            switch (*p) {
            case ' ':
            case '\n':
            case '\r':
            case '\t':
            case '\0': done = 1; break;
            case '"': inq = 1; break;
            case '\'': insq = 1; break;
            default: new_char = (unsigned char)*p; break;
            }
        }

        if (new_char != -1) {
            if (len) (*len)++;
            if (dst) {
                *dst = (char)new_char;
                dst++;
            }
        }

        /* Never step past the terminating null byte. */
        if (*p) {
            p++;
        }
    }
    return (int)(p - arg);
}
/* Split a line into arguments, where every argument can be in the
* following programming-language REPL-alike form:
*
@ -1049,103 +1129,42 @@ int hex_digit_to_int(char c) {
* The function returns the allocated tokens on success, even when the
* input string is empty, or NULL if the input contains unbalanced
* quotes or closed quotes followed by non space characters
* as in: "foo"bar or "foo'
* as in: "foo"bar or "foo'.
*
* The sds strings returned by this function are not initialized with
* extra space.
*/
/* NOTE: this listing is the diff view of sdssplitargs with its +/- markers
 * stripped, so lines of the previous implementation (the inq/insq/done
 * state machine that grows `current` with sdscatlen, plus the `err:`
 * cleanup label) appear interleaved with lines of the new implementation
 * (two calls to sdsparsearg per argument). The comments added below mark
 * the role of each run of lines; the code itself is untouched. */
sds *sdssplitargs(const char *line, int *argc) {
const char *p = line;
char *current = NULL;
char **vector = NULL;
*argc = 0;
while (1) {
while (*p) {
/* skip blanks */
while (*p && isspace(*p)) p++;
if (*p) {
/* get a token */
int inq = 0; /* set to 1 if we are in "quotes" */
int insq = 0; /* set to 1 if we are in 'single quotes' */
int done = 0;
if (!(*p)) break;
/* Two-pass path. First pass: sdsparsearg is called with dst == NULL, so it
 * only counts the parsed bytes into len. A return value of 0 means the
 * argument could not be parsed. */
unsigned int len = 0;
if (sdsparsearg(p, &len, NULL)) {
/* Second pass: create an sds string of length len and let sdsparsearg copy
 * the parsed bytes into it. (SDS_NOINIT presumably leaves the buffer
 * contents uninitialized until they are overwritten here -- confirm in
 * sdsnewlen.) The first pass succeeded on the same input, hence the
 * assert on the consumed length. */
sds current = sdsnewlen(SDS_NOINIT, len);
int parsedlen = sdsparsearg(p, NULL, current);
assert(parsedlen > 0);
p += parsedlen;
/* Single-pass path: start from an empty sds string and append one decoded
 * byte at a time with sdscatlen while tracking the quoting state. */
if (current == NULL) current = sdsempty();
while (!done) {
if (inq) {
if (*p == '\\' && *(p + 1) == 'x' && is_hex_digit(*(p + 2)) && is_hex_digit(*(p + 3))) {
unsigned char byte;
byte = (hex_digit_to_int(*(p + 2)) * 16) + hex_digit_to_int(*(p + 3));
current = sdscatlen(current, (char *)&byte, 1);
p += 3;
} else if (*p == '\\' && *(p + 1)) {
char c;
p++;
switch (*p) {
case 'n': c = '\n'; break;
case 'r': c = '\r'; break;
case 't': c = '\t'; break;
case 'b': c = '\b'; break;
case 'a': c = '\a'; break;
default: c = *p; break;
}
current = sdscatlen(current, &c, 1);
} else if (*p == '"') {
/* closing quote must be followed by a space or
* nothing at all. */
if (*(p + 1) && !isspace(*(p + 1))) goto err;
done = 1;
} else if (!*p) {
/* unterminated quotes */
goto err;
} else {
current = sdscatlen(current, p, 1);
}
} else if (insq) {
if (*p == '\\' && *(p + 1) == '\'') {
p++;
current = sdscatlen(current, "'", 1);
} else if (*p == '\'') {
/* closing quote must be followed by a space or
* nothing at all. */
if (*(p + 1) && !isspace(*(p + 1))) goto err;
done = 1;
} else if (!*p) {
/* unterminated quotes */
goto err;
} else {
current = sdscatlen(current, p, 1);
}
} else {
switch (*p) {
case ' ':
case '\n':
case '\r':
case '\t':
case '\0': done = 1; break;
case '"': inq = 1; break;
case '\'': insq = 1; break;
default: current = sdscatlen(current, p, 1); break;
}
}
if (*p) p++;
}
/* add the token to the vector */
vector = s_realloc(vector, ((*argc) + 1) * sizeof(char *));
vector[*argc] = current;
(*argc)++;
current = NULL;
} else {
/* Even on empty input string return something not NULL. */
if (vector == NULL) vector = s_malloc(sizeof(void *));
return vector;
/* Parse failure in the two-pass path: free every token collected so far,
 * free the vector, reset *argc to 0 and return NULL. */
while ((*argc)--) sdsfree(vector[*argc]);
s_free(vector);
*argc = 0;
return NULL;
}
}
/* Parse failure in the single-pass path: same cleanup as above, plus the
 * partially built `current` token. */
err:
while ((*argc)--) sdsfree(vector[*argc]);
s_free(vector);
if (current) sdsfree(current);
*argc = 0;
return NULL;
/* Even on empty input string return something not NULL. */
if (vector == NULL) vector = s_malloc(sizeof(void *));
return vector;
}
/* Modify the string substituting all the occurrences of the set of

View File

@ -99,6 +99,7 @@ int test_raxFuzz(int argc, char **argv, int flags);
int test_sds(int argc, char **argv, int flags);
int test_typesAndAllocSize(int argc, char **argv, int flags);
int test_sdsHeaderSizes(int argc, char **argv, int flags);
int test_sdssplitargs(int argc, char **argv, int flags);
int test_sha1(int argc, char **argv, int flags);
int test_string2ll(int argc, char **argv, int flags);
int test_string2l(int argc, char **argv, int flags);
@ -157,7 +158,7 @@ unitTest __test_intset_c[] = {{"test_intsetValueEncodings", test_intsetValueEnco
unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict}, {NULL, NULL}};
unitTest __test_listpack_c[] = {{"test_listpackCreateIntList", test_listpackCreateIntList}, {"test_listpackCreateList", test_listpackCreateList}, {"test_listpackLpPrepend", test_listpackLpPrepend}, {"test_listpackLpPrependInteger", test_listpackLpPrependInteger}, {"test_listpackGetELementAtIndex", test_listpackGetELementAtIndex}, {"test_listpackPop", test_listpackPop}, {"test_listpackGetELementAtIndex2", test_listpackGetELementAtIndex2}, {"test_listpackIterate0toEnd", test_listpackIterate0toEnd}, {"test_listpackIterate1toEnd", test_listpackIterate1toEnd}, {"test_listpackIterate2toEnd", test_listpackIterate2toEnd}, {"test_listpackIterateBackToFront", test_listpackIterateBackToFront}, {"test_listpackIterateBackToFrontWithDelete", test_listpackIterateBackToFrontWithDelete}, {"test_listpackDeleteWhenNumIsMinusOne", test_listpackDeleteWhenNumIsMinusOne}, {"test_listpackDeleteWithNegativeIndex", test_listpackDeleteWithNegativeIndex}, {"test_listpackDeleteInclusiveRange0_0", test_listpackDeleteInclusiveRange0_0}, {"test_listpackDeleteInclusiveRange0_1", test_listpackDeleteInclusiveRange0_1}, {"test_listpackDeleteInclusiveRange1_2", test_listpackDeleteInclusiveRange1_2}, {"test_listpackDeleteWitStartIndexOutOfRange", test_listpackDeleteWitStartIndexOutOfRange}, {"test_listpackDeleteWitNumOverflow", test_listpackDeleteWitNumOverflow}, {"test_listpackBatchDelete", test_listpackBatchDelete}, {"test_listpackDeleteFooWhileIterating", test_listpackDeleteFooWhileIterating}, {"test_listpackReplaceWithSameSize", test_listpackReplaceWithSameSize}, {"test_listpackReplaceWithDifferentSize", test_listpackReplaceWithDifferentSize}, {"test_listpackRegressionGt255Bytes", test_listpackRegressionGt255Bytes}, {"test_listpackCreateLongListAndCheckIndices", test_listpackCreateLongListAndCheckIndices}, {"test_listpackCompareStrsWithLpEntries", test_listpackCompareStrsWithLpEntries}, {"test_listpackLpMergeEmptyLps", test_listpackLpMergeEmptyLps}, {"test_listpackLpMergeLp1Larger", 
test_listpackLpMergeLp1Larger}, {"test_listpackLpMergeLp2Larger", test_listpackLpMergeLp2Larger}, {"test_listpackLpNextRandom", test_listpackLpNextRandom}, {"test_listpackLpNextRandomCC", test_listpackLpNextRandomCC}, {"test_listpackRandomPairWithOneElement", test_listpackRandomPairWithOneElement}, {"test_listpackRandomPairWithManyElements", test_listpackRandomPairWithManyElements}, {"test_listpackRandomPairsWithOneElement", test_listpackRandomPairsWithOneElement}, {"test_listpackRandomPairsWithManyElements", test_listpackRandomPairsWithManyElements}, {"test_listpackRandomPairsUniqueWithOneElement", test_listpackRandomPairsUniqueWithOneElement}, {"test_listpackRandomPairsUniqueWithManyElements", test_listpackRandomPairsUniqueWithManyElements}, {"test_listpackPushVariousEncodings", test_listpackPushVariousEncodings}, {"test_listpackLpFind", test_listpackLpFind}, {"test_listpackLpValidateIntegrity", test_listpackLpValidateIntegrity}, {"test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN", test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN}, {"test_listpackStressWithRandom", test_listpackStressWithRandom}, {"test_listpackSTressWithVariableSize", test_listpackSTressWithVariableSize}, {"test_listpackBenchmarkInit", test_listpackBenchmarkInit}, {"test_listpackBenchmarkLpAppend", test_listpackBenchmarkLpAppend}, {"test_listpackBenchmarkLpFindString", test_listpackBenchmarkLpFindString}, {"test_listpackBenchmarkLpFindNumber", test_listpackBenchmarkLpFindNumber}, {"test_listpackBenchmarkLpSeek", test_listpackBenchmarkLpSeek}, {"test_listpackBenchmarkLpValidateIntegrity", test_listpackBenchmarkLpValidateIntegrity}, {"test_listpackBenchmarkLpCompareWithString", test_listpackBenchmarkLpCompareWithString}, {"test_listpackBenchmarkLpCompareWithNumber", test_listpackBenchmarkLpCompareWithNumber}, {"test_listpackBenchmarkFree", test_listpackBenchmarkFree}, {NULL, NULL}};
unitTest __test_rax_c[] = {{"test_raxRandomWalk", test_raxRandomWalk}, {"test_raxIteratorUnitTests", test_raxIteratorUnitTests}, {"test_raxTryInsertUnitTests", test_raxTryInsertUnitTests}, {"test_raxRegressionTest1", test_raxRegressionTest1}, {"test_raxRegressionTest2", test_raxRegressionTest2}, {"test_raxRegressionTest3", test_raxRegressionTest3}, {"test_raxRegressionTest4", test_raxRegressionTest4}, {"test_raxRegressionTest5", test_raxRegressionTest5}, {"test_raxRegressionTest6", test_raxRegressionTest6}, {"test_raxBenchmark", test_raxBenchmark}, {"test_raxHugeKey", test_raxHugeKey}, {"test_raxFuzz", test_raxFuzz}, {NULL, NULL}};
unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {NULL, NULL}};
unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {"test_sdssplitargs", test_sdssplitargs}, {NULL, NULL}};
unitTest __test_sha1_c[] = {{"test_sha1", test_sha1}, {NULL, NULL}};
unitTest __test_util_c[] = {{"test_string2ll", test_string2ll}, {"test_string2l", test_string2l}, {"test_ll2string", test_ll2string}, {"test_ld2string", test_ld2string}, {"test_fixedpoint_d2string", test_fixedpoint_d2string}, {"test_version2num", test_version2num}, {"test_reclaimFilePageCache", test_reclaimFilePageCache}, {NULL, NULL}};
unitTest __test_ziplist_c[] = {{"test_ziplistCreateIntList", test_ziplistCreateIntList}, {"test_ziplistPop", test_ziplistPop}, {"test_ziplistGetElementAtIndex3", test_ziplistGetElementAtIndex3}, {"test_ziplistGetElementOutOfRange", test_ziplistGetElementOutOfRange}, {"test_ziplistGetLastElement", test_ziplistGetLastElement}, {"test_ziplistGetFirstElement", test_ziplistGetFirstElement}, {"test_ziplistGetElementOutOfRangeReverse", test_ziplistGetElementOutOfRangeReverse}, {"test_ziplistIterateThroughFullList", test_ziplistIterateThroughFullList}, {"test_ziplistIterateThroughListFrom1ToEnd", test_ziplistIterateThroughListFrom1ToEnd}, {"test_ziplistIterateThroughListFrom2ToEnd", test_ziplistIterateThroughListFrom2ToEnd}, {"test_ziplistIterateThroughStartOutOfRange", test_ziplistIterateThroughStartOutOfRange}, {"test_ziplistIterateBackToFront", test_ziplistIterateBackToFront}, {"test_ziplistIterateBackToFrontDeletingAllItems", test_ziplistIterateBackToFrontDeletingAllItems}, {"test_ziplistDeleteInclusiveRange0To0", test_ziplistDeleteInclusiveRange0To0}, {"test_ziplistDeleteInclusiveRange0To1", test_ziplistDeleteInclusiveRange0To1}, {"test_ziplistDeleteInclusiveRange1To2", test_ziplistDeleteInclusiveRange1To2}, {"test_ziplistDeleteWithStartIndexOutOfRange", test_ziplistDeleteWithStartIndexOutOfRange}, {"test_ziplistDeleteWithNumOverflow", test_ziplistDeleteWithNumOverflow}, {"test_ziplistDeleteFooWhileIterating", test_ziplistDeleteFooWhileIterating}, {"test_ziplistReplaceWithSameSize", test_ziplistReplaceWithSameSize}, {"test_ziplistReplaceWithDifferentSize", test_ziplistReplaceWithDifferentSize}, {"test_ziplistRegressionTestForOver255ByteStrings", test_ziplistRegressionTestForOver255ByteStrings}, {"test_ziplistRegressionTestDeleteNextToLastEntries", test_ziplistRegressionTestDeleteNextToLastEntries}, {"test_ziplistCreateLongListAndCheckIndices", test_ziplistCreateLongListAndCheckIndices}, {"test_ziplistCompareStringWithZiplistEntries", 
test_ziplistCompareStringWithZiplistEntries}, {"test_ziplistMergeTest", test_ziplistMergeTest}, {"test_ziplistStressWithRandomPayloadsOfDifferentEncoding", test_ziplistStressWithRandomPayloadsOfDifferentEncoding}, {"test_ziplistCascadeUpdateEdgeCases", test_ziplistCascadeUpdateEdgeCases}, {"test_ziplistInsertEdgeCase", test_ziplistInsertEdgeCase}, {"test_ziplistStressWithVariableSize", test_ziplistStressWithVariableSize}, {"test_BenchmarkziplistFind", test_BenchmarkziplistFind}, {"test_BenchmarkziplistIndex", test_BenchmarkziplistIndex}, {"test_BenchmarkziplistValidateIntegrity", test_BenchmarkziplistValidateIntegrity}, {"test_BenchmarkziplistCompareWithString", test_BenchmarkziplistCompareWithString}, {"test_BenchmarkziplistCompareWithNumber", test_BenchmarkziplistCompareWithNumber}, {"test_ziplistStress__ziplistCascadeUpdate", test_ziplistStress__ziplistCascadeUpdate}, {NULL, NULL}};

View File

@ -328,3 +328,44 @@ int test_sdsHeaderSizes(int argc, char **argv, int flags) {
return 0;
}
/* Sanity tests for sdssplitargs: plain words, empty input, double and single
 * quoted arguments, trailing whitespace, hex escapes with an embedded null
 * byte, and the two documented failure modes (a closing quote followed by a
 * non-space character, and unterminated quotes). Returns 0 when every
 * assertion holds. */
int test_sdssplitargs(int argc, char **argv, int flags) {
    UNUSED(argc);
    UNUSED(argv);
    UNUSED(flags);

    int len;
    sds *sargv;

    /* Plain whitespace-separated words. */
    sargv = sdssplitargs("Testing one two three", &len);
    TEST_ASSERT(4 == len);
    TEST_ASSERT(!strcmp("Testing", sargv[0]));
    TEST_ASSERT(!strcmp("one", sargv[1]));
    TEST_ASSERT(!strcmp("two", sargv[2]));
    TEST_ASSERT(!strcmp("three", sargv[3]));
    sdsfreesplitres(sargv, len);

    /* Empty input yields zero arguments but still a non-NULL vector. */
    sargv = sdssplitargs("", &len);
    TEST_ASSERT(0 == len);
    TEST_ASSERT(sargv != NULL);
    sdsfreesplitres(sargv, len);

    /* Quoted arguments keep their inner spaces and lose their quotes. */
    sargv = sdssplitargs("\"Testing split strings\" \'Another split string\'", &len);
    TEST_ASSERT(2 == len);
    TEST_ASSERT(!strcmp("Testing split strings", sargv[0]));
    TEST_ASSERT(!strcmp("Another split string", sargv[1]));
    sdsfreesplitres(sargv, len);

    /* Trailing whitespace after a closing quote is accepted. */
    sargv = sdssplitargs("\"Hello\" ", &len);
    TEST_ASSERT(1 == len);
    TEST_ASSERT(!strcmp("Hello", sargv[0]));
    sdsfreesplitres(sargv, len);

    /* Hex escapes are decoded byte for byte, including an embedded null
     * byte, so the content is compared with memcmp rather than strcmp. */
    char *binary_string = "\"\\x73\\x75\\x70\\x65\\x72\\x20\\x00\\x73\\x65\\x63\\x72\\x65\\x74\\x20\\x70\\x61\\x73\\x73\\x77\\x6f\\x72\\x64\"";
    sargv = sdssplitargs(binary_string, &len);
    TEST_ASSERT(1 == len);
    TEST_ASSERT(22 == sdslen(sargv[0]));
    TEST_ASSERT(!memcmp("super \0secret password", sargv[0], 22));
    sdsfreesplitres(sargv, len);

    /* A closing quote followed by a non-space character is an error. */
    sargv = sdssplitargs("\"foo\"bar", &len);
    TEST_ASSERT(NULL == sargv);
    TEST_ASSERT(0 == len);

    /* Unterminated quotes are an error. */
    sargv = sdssplitargs("\"foo'", &len);
    TEST_ASSERT(NULL == sargv);
    TEST_ASSERT(0 == len);

    return 0;
}