CRC64 perf improvements from Redis patches (#350)

Improve the performance of crc64 for large batches by processing large numbers of bytes in parallel and combining the results.

## Performance
* 53-73% faster on Xeon 2670 v0 @ 2.6 GHz
* 2-2.5x faster on Core i3 8130U @ 2.2 GHz
* 1.6-2.46 bytes/cycle on i3 8130U
* likely >2x faster than crcspeed on newer CPUs with more resources than a 2012-era Xeon 2670
* crc64 combine function runs in <50 nanoseconds typical with vector + cache optimizations (~8 *microseconds* without vector optimizations, ~80 *microseconds* without cache; the combination is extra effective)
* still single-threaded
* `valkey-server test crc64 --help` (requires `make distclean && make SERVER_TEST=yes`)

---------

Signed-off-by: Josiah Carlson <josiah.carlson@gmail.com>
Signed-off-by: Madelyn Olson <madelyneolson@gmail.com>
Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
Parent: 89f72bc3ae
Commit: f4e10eee06
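The heart of the change: one thread computes the CRCs of two or three adjacent segments in independent dependency chains, then staples the partial results together with the new `crc64_combine()`. A minimal sketch of the merge identity, assuming the `crc64()`/`crc64_combine()` entry points added in this diff (`split_crc64` and `REVERSED_POLY` are illustrative names, not part of the commit):

```c
#include <stdint.h>

#include "crc64.h"      /* crc64_init(), crc64() */
#include "crccombine.h" /* crc64_combine() */

#define REVERSED_POLY UINT64_C(0x95ac9329ac4bc9b5) /* Jones CRC-64, reflected */

/* Hypothetical two-way variant: CRC each half separately, then merge.
 * The real crcspeed64little() interleaves the per-segment loops so the
 * CPU can run their dependency chains in parallel; crc64_init() must
 * have been called first. */
uint64_t split_crc64(uint64_t crc, const unsigned char *buf, uint64_t len) {
    uint64_t half = len / 2;
    uint64_t crc_lo = crc64(crc, buf, half);
    uint64_t crc_hi = crc64(0, buf + half, len - half);
    /* advance crc_lo past (len - half) zero bytes, then fold in crc_hi */
    return crc64_combine(crc_lo, crc_hi, len - half, REVERSED_POLY, 64);
}
```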
src/Makefile

@@ -131,6 +131,9 @@ ifdef REDIS_LDFLAGS
 endif
 FINAL_CFLAGS=$(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) $(SERVER_CFLAGS)
+ifeq ($(SERVER_TEST),yes)
+	FINAL_CFLAGS +=-DSERVER_TEST=1
+endif
 FINAL_LDFLAGS=$(LDFLAGS) $(OPT) $(SERVER_LDFLAGS) $(DEBUG)
 FINAL_LIBS=-lm
 DEBUG=-g -ggdb
@@ -382,11 +385,11 @@ endif
 ENGINE_NAME=valkey
 SERVER_NAME=$(ENGINE_NAME)-server$(PROG_SUFFIX)
 ENGINE_SENTINEL_NAME=$(ENGINE_NAME)-sentinel$(PROG_SUFFIX)
-ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o
+ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o
 ENGINE_CLI_NAME=$(ENGINE_NAME)-cli$(PROG_SUFFIX)
-ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o
+ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o
 ENGINE_BENCHMARK_NAME=$(ENGINE_NAME)-benchmark$(PROG_SUFFIX)
-ENGINE_BENCHMARK_OBJ=ae.o anet.o valkey-benchmark.o adlist.o dict.o zmalloc.o serverassert.o release.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o
+ENGINE_BENCHMARK_OBJ=ae.o anet.o valkey-benchmark.o adlist.o dict.o zmalloc.o serverassert.o release.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o
 ENGINE_CHECK_RDB_NAME=$(ENGINE_NAME)-check-rdb$(PROG_SUFFIX)
 ENGINE_CHECK_AOF_NAME=$(ENGINE_NAME)-check-aof$(PROG_SUFFIX)
 ALL_SOURCES=$(sort $(patsubst %.o,%.c,$(ENGINE_SERVER_OBJ) $(ENGINE_CLI_OBJ) $(ENGINE_BENCHMARK_OBJ)))
src/crc64.c (264 lines changed)
@@ -28,6 +28,7 @@
 
 #include "crc64.h"
 #include "crcspeed.h"
+#include "serverassert.h"
 static uint64_t crc64_table[8][256] = {{0}};
 
 #define POLY UINT64_C(0xad93d23594c935a9)
@@ -67,14 +68,33 @@ static uint64_t crc64_table[8][256] = {{0}};
  * \return The reflected data.
  *****************************************************************************/
 static inline uint_fast64_t crc_reflect(uint_fast64_t data, size_t data_len) {
-    uint_fast64_t ret = data & 0x01;
-
-    for (size_t i = 1; i < data_len; i++) {
-        data >>= 1;
-        ret = (ret << 1) | (data & 0x01);
-    }
-
-    return ret;
+    /* only ever called for data_len == 64 in this codebase
+     *
+     * Borrowed from bit twiddling hacks, original in the public domain.
+     * https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
+     * Extended to 64 bits, and added byteswap for final 3 steps.
+     * 16-30x 64-bit operations, no comparisons (16 for native byteswap, 30 for pure C)
+     */
+    assert(data_len <= 64);
+    /* swap odd and even bits */
+    data = ((data >> 1) & 0x5555555555555555ULL) | ((data & 0x5555555555555555ULL) << 1);
+    /* swap consecutive pairs */
+    data = ((data >> 2) & 0x3333333333333333ULL) | ((data & 0x3333333333333333ULL) << 2);
+    /* swap nibbles ... */
+    data = ((data >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((data & 0x0F0F0F0F0F0F0F0FULL) << 4);
+#if defined(__GNUC__) || defined(__clang__)
+    data = __builtin_bswap64(data);
+#else
+    /* swap bytes */
+    data = ((data >> 8) & 0x00FF00FF00FF00FFULL) | ((data & 0x00FF00FF00FF00FFULL) << 8);
+    /* swap 2-byte long pairs */
+    data = ((data >> 16) & 0xFFFF0000FFFFULL) | ((data & 0xFFFF0000FFFFULL) << 16);
+    /* swap 4-byte quads */
+    data = ((data >> 32) & 0xFFFFFFFFULL) | ((data & 0xFFFFFFFFULL) << 32);
+#endif
+    /* adjust for non-64-bit reversals */
+    return data >> (64 - data_len);
 }
 
 /**
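An aside on the rewritten `crc_reflect()` above: the swap ladder reverses bits in progressively larger groups, and the final right shift compensates for reversals narrower than 64 bits. A self-contained check of the pure-C branch against the old bit-at-a-time loop (both functions restated here under illustrative names, not taken verbatim from the patch):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* the old bit-at-a-time reversal, kept as a reference oracle */
static uint64_t reflect_slow(uint64_t data, size_t data_len) {
    uint64_t ret = data & 0x01;
    for (size_t i = 1; i < data_len; i++) {
        data >>= 1;
        ret = (ret << 1) | (data & 0x01);
    }
    return ret;
}

/* the new swap-ladder reversal (pure-C branch, no __builtin_bswap64) */
static uint64_t reflect_fast(uint64_t data, size_t data_len) {
    data = ((data >> 1) & 0x5555555555555555ULL) | ((data & 0x5555555555555555ULL) << 1);
    data = ((data >> 2) & 0x3333333333333333ULL) | ((data & 0x3333333333333333ULL) << 2);
    data = ((data >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((data & 0x0F0F0F0F0F0F0F0FULL) << 4);
    data = ((data >> 8) & 0x00FF00FF00FF00FFULL) | ((data & 0x00FF00FF00FF00FFULL) << 8);
    data = ((data >> 16) & 0x0000FFFF0000FFFFULL) | ((data & 0x0000FFFF0000FFFFULL) << 16);
    data = (data >> 32) | (data << 32);
    /* bits at positions >= data_len fall off the bottom here */
    return data >> (64 - data_len);
}

int main(void) {
    uint64_t x = UINT64_C(0x0123456789abcdef);
    for (size_t n = 1; n <= 64; n++)
        assert(reflect_slow(x, n) == reflect_fast(x, n));
    return 0;
}
```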
@@ -126,29 +146,221 @@ uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l) {
 #ifdef SERVER_TEST
 #include <stdio.h>
 
+static void genBenchmarkRandomData(char *data, int count);
+static int bench_crc64(unsigned char *data, uint64_t size, long long passes, uint64_t check, char *name, int csv);
+static void bench_combine(char *label, uint64_t size, uint64_t expect, int csv);
+long long _ustime(void);
+
+#include <inttypes.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "zmalloc.h"
+#include "crccombine.h"
+
+long long _ustime(void) {
+    struct timeval tv;
+    long long ust;
+
+    gettimeofday(&tv, NULL);
+    ust = ((long long)tv.tv_sec)*1000000;
+    ust += tv.tv_usec;
+    return ust;
+}
+
+static int bench_crc64(unsigned char *data, uint64_t size, long long passes, uint64_t check, char *name, int csv) {
+    uint64_t min = size, hash;
+    long long original_start = _ustime(), original_end;
+    for (long long i=passes; i > 0; i--) {
+        hash = crc64(0, data, size);
+    }
+    original_end = _ustime();
+    min = (original_end - original_start) * 1000 / passes;
+    /* approximate nanoseconds without nstime */
+    if (csv) {
+        printf("%s,%" PRIu64 ",%" PRIu64 ",%d\n",
+               name, size, (1000 * size) / min, hash == check);
+    } else {
+        printf("test size=%" PRIu64 " algorithm=%s %" PRIu64 " M/sec matches=%d\n",
+               size, name, (1000 * size) / min, hash == check);
+    }
+    return hash != check;
+}
+
+const uint64_t BENCH_RPOLY = UINT64_C(0x95ac9329ac4bc9b5);
+
+static void bench_combine(char *label, uint64_t size, uint64_t expect, int csv) {
+    uint64_t min = size, start = expect, thash = expect ^ (expect >> 17);
+    long long original_start = _ustime(), original_end;
+    for (int i=0; i < 1000; i++) {
+        crc64_combine(thash, start, size, BENCH_RPOLY, 64);
+    }
+    original_end = _ustime();
+    /* ran 1000 times, want ns per, counted us per 1000 ... */
+    min = original_end - original_start;
+    if (csv) {
+        printf("%s,%" PRIu64 ",%" PRIu64 "\n", label, size, min);
+    } else {
+        printf("%s size=%" PRIu64 " in %" PRIu64 " nsec\n", label, size, min);
+    }
+}
+
+static void genBenchmarkRandomData(char *data, int count) {
+    static uint32_t state = 1234;
+    int i = 0;
+
+    while (count--) {
+        state = (state*1103515245+12345);
+        data[i++] = '0'+((state>>16)&63);
+    }
+}
+
 #define UNUSED(x) (void)(x)
 int crc64Test(int argc, char *argv[], int flags) {
     UNUSED(argc);
     UNUSED(argv);
     UNUSED(flags);
-    crc64_init();
-    printf("[calcula]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
-           (uint64_t)_crc64(0, "123456789", 9));
-    printf("[64speed]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
-           (uint64_t)crc64(0, (unsigned char*)"123456789", 9));
-    char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed "
-                "do eiusmod tempor incididunt ut labore et dolore magna "
-                "aliqua. Ut enim ad minim veniam, quis nostrud exercitation "
-                "ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
-                "aute irure dolor in reprehenderit in voluptate velit esse "
-                "cillum dolore eu fugiat nulla pariatur. Excepteur sint "
-                "occaecat cupidatat non proident, sunt in culpa qui officia "
-                "deserunt mollit anim id est laborum.";
-    printf("[calcula]: c7794709e69683b3 == %016" PRIx64 "\n",
-           (uint64_t)_crc64(0, li, sizeof(li)));
-    printf("[64speed]: c7794709e69683b3 == %016" PRIx64 "\n",
-           (uint64_t)crc64(0, (unsigned char*)li, sizeof(li)));
+
+    uint64_t crc64_test_size = 0;
+    int i, lastarg, csv = 0, loop = 0, combine = 0;
+again:
+    for (i = 3; i < argc; i++) {
+        lastarg = (i == (argc-1));
+        if (!strcmp(argv[i],"--help")) {
+            goto usage;
+        } else if (!strcmp(argv[i],"--csv")) {
+            csv = 1;
+        } else if (!strcmp(argv[i],"-l")) {
+            loop = 1;
+        } else if (!strcmp(argv[i],"--crc")) {
+            if (lastarg) goto invalid;
+            crc64_test_size = atoll(argv[++i]);
+        } else if (!strcmp(argv[i],"--combine")) {
+            combine = 1;
+        } else {
+invalid:
+            printf("Invalid option \"%s\" or option argument missing\n\n",argv[i]);
+usage:
+            printf(
+                "Usage: crc64 [OPTIONS]\n\n"
+                " --csv              Output in CSV format\n"
+                " -l                 Loop. Run the tests forever\n"
+                " --crc <bytes>      Benchmark crc64 faster options, using a buffer this big, and quit when done.\n"
+                " --combine          Benchmark crc64 combine value ranges and timings.\n"
+            );
+            return 1;
+        }
+    }
+
+    if (crc64_test_size == 0 && combine == 0) {
+        crc64_init();
+        printf("[calcula]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
+               (uint64_t)_crc64(0, "123456789", 9));
+        printf("[64speed]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
+               (uint64_t)crc64(0, (unsigned char*)"123456789", 9));
+        char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed "
+                    "do eiusmod tempor incididunt ut labore et dolore magna "
+                    "aliqua. Ut enim ad minim veniam, quis nostrud exercitation "
+                    "ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
+                    "aute irure dolor in reprehenderit in voluptate velit esse "
+                    "cillum dolore eu fugiat nulla pariatur. Excepteur sint "
+                    "occaecat cupidatat non proident, sunt in culpa qui officia "
+                    "deserunt mollit anim id est laborum.";
+        printf("[calcula]: c7794709e69683b3 == %016" PRIx64 "\n",
+               (uint64_t)_crc64(0, li, sizeof(li)));
+        printf("[64speed]: c7794709e69683b3 == %016" PRIx64 "\n",
+               (uint64_t)crc64(0, (unsigned char*)li, sizeof(li)));
+        return 0;
+    }
+
+    int init_this_loop = 1;
+    long long init_start, init_end;
+
+    do {
+        unsigned char* data = NULL;
+        uint64_t passes = 0;
+        if (crc64_test_size) {
+            data = zmalloc(crc64_test_size);
+            genBenchmarkRandomData((char*)data, crc64_test_size);
+            /* We want to hash about 1 gig of data in total, looped, to get a
+             * good idea of our performance. */
+            passes = (UINT64_C(0x100000000) / crc64_test_size);
+            passes = passes >= 2 ? passes : 2;
+            passes = passes <= 1000 ? passes : 1000;
+        }
+
+        crc64_init();
+        /* warm up the cache */
+        set_crc64_cutoffs(crc64_test_size+1, crc64_test_size+1);
+        uint64_t expect = crc64(0, data, crc64_test_size);
+
+        if (!combine && crc64_test_size) {
+            if (csv && init_this_loop) printf("algorithm,buffer,performance,crc64_matches\n");
+
+            /* get the single-character version for single-byte Redis behavior */
+            set_crc64_cutoffs(0, crc64_test_size+1);
+            if (bench_crc64(data, crc64_test_size, passes, expect, "crc_1byte", csv)) return 1;
+
+            set_crc64_cutoffs(crc64_test_size+1, crc64_test_size+1);
+            /* run with 8-byte "single" path, crcfaster */
+            if (bench_crc64(data, crc64_test_size, passes, expect, "crcspeed", csv)) return 1;
+
+            /* run with dual 8-byte paths */
+            set_crc64_cutoffs(1, crc64_test_size+1);
+            if (bench_crc64(data, crc64_test_size, passes, expect, "crcdual", csv)) return 1;
+
+            /* run with tri 8-byte paths */
+            set_crc64_cutoffs(1, 1);
+            if (bench_crc64(data, crc64_test_size, passes, expect, "crctri", csv)) return 1;
+
+            /* Be free memory region, be free. */
+            zfree(data);
+            data = NULL;
+        }
+
+        uint64_t INIT_SIZE = UINT64_C(0xffffffffffffffff);
+        if (combine) {
+            if (init_this_loop) {
+                init_start = _ustime();
+                crc64_combine(
+                    UINT64_C(0xdeadbeefdeadbeef),
+                    UINT64_C(0xfeebdaedfeebdaed),
+                    INIT_SIZE,
+                    BENCH_RPOLY, 64);
+                init_end = _ustime();
+
+                init_end -= init_start;
+                init_end *= 1000;
+                if (csv) {
+                    printf("operation,size,nanoseconds\n");
+                    printf("init_64,%" PRIu64 ",%" PRIu64 "\n", INIT_SIZE, (uint64_t)init_end);
+                } else {
+                    printf("init_64 size=%" PRIu64 " in %" PRIu64 " nsec\n", INIT_SIZE, (uint64_t)init_end);
+                }
+                /* use the hash itself as the size (unpredictable) */
+                bench_combine("hash_as_size_combine", crc64_test_size, expect, csv);
+
+                /* let's do something big (predictable, so fast) */
+                bench_combine("largest_combine", INIT_SIZE, expect, csv);
+            }
+            bench_combine("combine", crc64_test_size, expect, csv);
+        }
+        init_this_loop = 0;
+        /* step down by ~1.641 for a range of test sizes */
+        crc64_test_size -= (crc64_test_size >> 2) + (crc64_test_size >> 3) + (crc64_test_size >> 6);
+    } while (crc64_test_size > 3);
+    if (loop) goto again;
     return 0;
 }
 # endif
 
 #ifdef SERVER_TEST_MAIN
 int main(int argc, char *argv[]) {
     return crc64Test(argc, argv, 0);
 }
 #endif
src/crccombine.c (new file, 253 lines)
@@ -0,0 +1,253 @@
#include <stdint.h>
#include <stdio.h>
#include <strings.h>
#if defined(__i386__) || defined(__x86_64__)
#include <immintrin.h>
#endif
#include "crccombine.h"

/* Copyright (C) 2013 Mark Adler
 * Copyright (C) 2019-2024 Josiah Carlson
 * Portions originally from: crc64.c Version 1.4 16 Dec 2013 Mark Adler
 * Modifications by Josiah Carlson <josiah.carlson@gmail.com>
 *   - Added implementation variations with sample timings for gf_matrix_times*()
 *   - Most folks would be best using gf2_matrix_times_vec or
 *     gf2_matrix_times_vec2, unless some processor does AVX2 fast.
 *   - This is the implementation of the MERGE_CRC macro defined in
 *     crcspeed.c (which calls crc_combine()), and is a specialization of the
 *     generic crc_combine() (and related from the 2013 edition of Mark Adler's
 *     crc64.c) for the sake of clarity and performance.

  This software is provided 'as-is', without any express or implied
  warranty. In no event will the author be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  Mark Adler
  madler@alumni.caltech.edu
 */

#define STATIC_ASSERT(VVV) do { int test = 1 / (VVV); test++; } while (0)

#if !(defined(__i386__) || defined(__x86_64__))

/* This cuts 40% of the time vs bit-by-bit. */

uint64_t gf2_matrix_times_switch(uint64_t *mat, uint64_t vec) {
    /*
     * Without using any vector math, this handles 4 bits at a time,
     * and saves 40+% of the time compared to the bit-by-bit version. Use if
     * you have no vector compile option available to you. With cache, we see:
     * E5-2670 ~1-2us to extend ~1 meg 64 bit hash
     */
    uint64_t sum;

    sum = 0;
    while (vec) {
        /* reversing the case order is ~10% slower on Xeon E5-2670 */
        switch (vec & 15) {
        case 15:
            sum ^= *mat ^ *(mat+1) ^ *(mat+2) ^ *(mat+3);
            break;
        case 14:
            sum ^= *(mat+1) ^ *(mat+2) ^ *(mat+3);
            break;
        case 13:
            sum ^= *mat ^ *(mat+2) ^ *(mat+3);
            break;
        case 12:
            sum ^= *(mat+2) ^ *(mat+3);
            break;
        case 11:
            sum ^= *mat ^ *(mat+1) ^ *(mat+3);
            break;
        case 10:
            sum ^= *(mat+1) ^ *(mat+3);
            break;
        case 9:
            sum ^= *mat ^ *(mat+3);
            break;
        case 8:
            sum ^= *(mat+3);
            break;
        case 7:
            sum ^= *mat ^ *(mat+1) ^ *(mat+2);
            break;
        case 6:
            sum ^= *(mat+1) ^ *(mat+2);
            break;
        case 5:
            sum ^= *mat ^ *(mat+2);
            break;
        case 4:
            sum ^= *(mat+2);
            break;
        case 3:
            sum ^= *mat ^ *(mat+1);
            break;
        case 2:
            sum ^= *(mat+1);
            break;
        case 1:
            sum ^= *mat;
            break;
        default:
            break;
        }
        vec >>= 4;
        mat += 4;
    }
    return sum;
}

#define CRC_MULTIPLY gf2_matrix_times_switch

#else

/*
  Warning: here there be dragons involving vector math, and macros to save us
  from repeating the same information over and over.
*/

uint64_t gf2_matrix_times_vec2(uint64_t *mat, uint64_t vec) {
    /*
     * Uses xmm registers on x86, works basically everywhere fast, doing
     * cycles of movdqa, mov, shr, pand, and, pxor, at least on gcc 8.
     * Is 9-11x faster than original.
     * E5-2670  ~29us to extend ~1 meg 64 bit hash
     * i3-8130U ~22us to extend ~1 meg 64 bit hash
     */
    v2uq sum = {0, 0},
         *mv2 = (v2uq*)mat;
    /* this table allows us to eliminate conditions during gf2_matrix_times_vec2() */
    static v2uq masks2[4] = {
        {0, 0},
        {-1, 0},
        {0, -1},
        {-1, -1},
    };

    /* Almost as beautiful as gf2_matrix_times_vec, but only half as many
     * bits per step, so we need 2 per chunk4 operation. Faster in my tests. */

#define DO_CHUNK4() \
    sum ^= (*mv2++) & masks2[vec & 3]; \
    vec >>= 2; \
    sum ^= (*mv2++) & masks2[vec & 3]; \
    vec >>= 2

#define DO_CHUNK16() \
    DO_CHUNK4(); \
    DO_CHUNK4(); \
    DO_CHUNK4(); \
    DO_CHUNK4()

    DO_CHUNK16();
    DO_CHUNK16();
    DO_CHUNK16();
    DO_CHUNK16();

    STATIC_ASSERT(sizeof(uint64_t) == 8);
    STATIC_ASSERT(sizeof(long long unsigned int) == 8);
    return sum[0] ^ sum[1];
}

#undef DO_CHUNK16
#undef DO_CHUNK4

#define CRC_MULTIPLY gf2_matrix_times_vec2
#endif

static void gf2_matrix_square(uint64_t *square, uint64_t *mat, uint8_t dim) {
    unsigned n;

    for (n = 0; n < dim; n++)
        square[n] = CRC_MULTIPLY(mat, mat[n]);
}

/* Turns out our Redis / Jones CRC cycles at this point, so we can support
 * more than 64 bits of extension if we want. Trivially. */
static uint64_t combine_cache[64][64];

/* Mark Adler has some amazing updates to crc.c in his crcany repository. I
 * like static caches, and not worrying about finding cycles generally. We are
 * okay to spend the 32k of memory here, leaving the algorithm unchanged from
 * how it was a decade ago, and be happy that it costs <200 microseconds to
 * init, and that subsequent calls to the combine function take under 100
 * nanoseconds. We also note that the crcany/crc.c code applies to any CRC,
 * and we are currently targeting one: Jones CRC64.
 */

void init_combine_cache(uint64_t poly, uint8_t dim) {
    unsigned n, cache_num = 0;
    combine_cache[1][0] = poly;
    int prev = 1;
    uint64_t row = 1;
    for (n = 1; n < dim; n++) {
        combine_cache[1][n] = row;
        row <<= 1;
    }

    gf2_matrix_square(combine_cache[0], combine_cache[1], dim);
    gf2_matrix_square(combine_cache[1], combine_cache[0], dim);

    /* do/while to overwrite the first two layers; they are not used, but are
     * re-generated in the last two layers for the Redis polynomial */
    do {
        gf2_matrix_square(combine_cache[cache_num], combine_cache[cache_num + prev], dim);
        prev = -1;
    } while (++cache_num < 64);
}

/* Return the CRC-64 of two sequential blocks, where crc1 is the CRC-64 of the
 * first block, crc2 is the CRC-64 of the second block, and len2 is the length
 * of the second block.
 *
 * If you want reflections on your CRCs, do them outside before / after.
 * WARNING: if you enable USE_STATIC_COMBINE_CACHE to make this fast, you MUST
 * ALWAYS USE THE SAME POLYNOMIAL, otherwise you will get the wrong results.
 * You MAY bzero() the even/odd static arrays, which will induce a re-cache on
 * next call as a work-around, but ... maybe just parameterize the cached
 * models at that point like Mark Adler does in modern crcany/crc.c .
 */

uint64_t crc64_combine(uint64_t crc1, uint64_t crc2, uintmax_t len2, uint64_t poly, uint8_t dim) {
    /* degenerate case */
    if (len2 == 0) return crc1;

    unsigned cache_num = 0;
    if (combine_cache[0][0] == 0) {
        init_combine_cache(poly, dim);
    }

    /* apply len2 zeros to crc1 (first square will put the operator for one
       zero byte, eight zero bits, in even) */
    do {
        /* apply zeros operator for this bit of len2 */
        if (len2 & 1) crc1 = CRC_MULTIPLY(combine_cache[cache_num], crc1);
        len2 >>= 1;
        cache_num = (cache_num + 1) & 63;
        /* if no more bits set, then done */
    } while (len2 != 0);

    /* return combined crc */
    crc1 ^= crc2;
    return crc1;
}

#undef CRC_MULTIPLY
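As a sanity check on the combine math above: for an init-0, xorout-0 reflected CRC such as the Jones CRC-64 used here, CRC(A||B) should equal the combination of CRC(A) and CRC(B) over len(B). A self-contained sketch validating `crc64_combine()` against a naive bitwise reference (the reference implementation is mine, not part of the commit, and assumes linking against crccombine.c):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include "crccombine.h"

#define JONES_RPOLY UINT64_C(0x95ac9329ac4bc9b5) /* reversed Jones polynomial */

/* Reference: bit-at-a-time reflected CRC-64/Jones update (init 0, no xorout). */
static uint64_t crc64_bitwise(uint64_t crc, const unsigned char *buf, size_t len) {
    while (len--) {
        crc ^= *buf++;
        for (int k = 0; k < 8; k++)
            crc = (crc >> 1) ^ ((crc & 1) ? JONES_RPOLY : 0);
    }
    return crc;
}

int main(void) {
    const unsigned char a[] = "hello, ", b[] = "world";
    uint64_t crc_a = crc64_bitwise(0, a, 7);
    uint64_t crc_b = crc64_bitwise(0, b, 5);
    /* CRC of the concatenation, computed by chaining through block B */
    uint64_t whole = crc64_bitwise(crc_a, b, 5);
    /* combine must reproduce the chained result */
    assert(crc64_combine(crc_a, crc_b, 5, JONES_RPOLY, 64) == whole);
    return 0;
}
```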
src/crccombine.h (new file, 10 lines)

@@ -0,0 +1,10 @@

#include <stdint.h>

/* mask types */
typedef unsigned long long v2uq __attribute__ ((vector_size (16)));

uint64_t gf2_matrix_times_vec2(uint64_t *mat, uint64_t vec);
void init_combine_cache(uint64_t poly, uint8_t dim);
uint64_t crc64_combine(uint64_t crc1, uint64_t crc2, uintmax_t len2, uint64_t poly, uint8_t dim);
src/crcspeed.c (172 lines changed)
@@ -1,11 +1,21 @@
 /*
  * Copyright (C) 2013 Mark Adler
+ * Copyright (C) 2019-2024 Josiah Carlson
  * Originally by: crc64.c Version 1.4 16 Dec 2013 Mark Adler
  * Modifications by Matt Stancliff <matt@genges.com>:
  *   - removed CRC64-specific behavior
  *   - added generation of lookup tables by parameters
  *   - removed inversion of CRC input/result
  *   - removed automatic initialization in favor of explicit initialization
+ * Modifications by Josiah Carlson <josiah.carlson@gmail.com>
+ *   - Added case/vector/AVX/+ versions of crc combine function; see crccombine.c
+ *   - added optional static cache
+ *   - Modified to use 1 thread to:
+ *     - Partition large crc blobs into 2-3 segments
+ *     - Process the 2-3 segments in parallel
+ *     - Merge the resulting crcs
+ *   -> Resulting in 10-90% performance boost for data > 1 meg
+ *   - macro-ized to reduce copy/pasta
 
   This software is provided 'as-is', without any express or implied
   warranty. In no event will the author be held liable for any damages
@@ -28,6 +38,10 @@
  */
 
 #include "crcspeed.h"
+#include "crccombine.h"
+
+#define CRC64_LEN_MASK UINT64_C(0x7ffffffffffffff8)
+#define CRC64_REVERSED_POLY UINT64_C(0x95ac9329ac4bc9b5)
 
 /* Fill in a CRC constants table. */
 void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) {
@@ -39,7 +53,7 @@ void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) {
         table[0][n] = crcfn(0, &v, 1);
     }
 
-    /* generate nested CRC table for future slice-by-8 lookup */
+    /* generate nested CRC table for future slice-by-8/16/24+ lookup */
     for (int n = 0; n < 256; n++) {
         crc = table[0][n];
         for (int k = 1; k < 8; k++) {
@@ -47,6 +61,10 @@ void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) {
             table[k][n] = crc;
         }
     }
+#if USE_STATIC_COMBINE_CACHE
+    /* initialize combine cache for CRC stapling for slice-by 16/24+ */
+    init_combine_cache(CRC64_REVERSED_POLY, 64);
+#endif
 }
 
 void crcspeed16little_init(crcfn16 crcfn, uint16_t table[8][256]) {
@@ -104,45 +122,151 @@ void crcspeed16big_init(crcfn16 fn, uint16_t big_table[8][256]) {
         }
     }
 }
 
+/* Note: doing all of our crc/next modifications *before* the crc table
+ * references is an absolute speedup on all CPUs tested. So... keep these
+ * macros separate.
+ */
+
+#define DO_8_1(crc, next) \
+    crc ^= *(uint64_t *)next; \
+    next += 8
+
+#define DO_8_2(crc) \
+    crc = little_table[7][(uint8_t)crc] ^ \
+          little_table[6][(uint8_t)(crc >> 8)] ^ \
+          little_table[5][(uint8_t)(crc >> 16)] ^ \
+          little_table[4][(uint8_t)(crc >> 24)] ^ \
+          little_table[3][(uint8_t)(crc >> 32)] ^ \
+          little_table[2][(uint8_t)(crc >> 40)] ^ \
+          little_table[1][(uint8_t)(crc >> 48)] ^ \
+          little_table[0][crc >> 56]
+
+#define CRC64_SPLIT(div) \
+    olen = len; \
+    next2 = next1 + ((len / div) & CRC64_LEN_MASK); \
+    len = (next2 - next1)
+
+#define MERGE_CRC(crcn) \
+    crc1 = crc64_combine(crc1, crcn, next2 - next1, CRC64_REVERSED_POLY, 64)
+
+#define MERGE_END(last, DIV) \
+    len = olen - ((next2 - next1) * DIV); \
+    next1 = last
+
+/* Variables so we can change them for benchmarking; these seem to be fairly
+ * reasonable for Intel CPUs made since 2010. Please adjust as necessary if
+ * or when your CPU has more load / execute units. We've written benchmark
+ * code to help you tune your platform, see crc64Test. */
+#if defined(__i386__) || defined(__x86_64__)
+static size_t CRC64_TRI_CUTOFF = (2*1024);
+static size_t CRC64_DUAL_CUTOFF = (128);
+#else
+static size_t CRC64_TRI_CUTOFF = (16*1024);
+static size_t CRC64_DUAL_CUTOFF = (1024);
+#endif
+
+void set_crc64_cutoffs(size_t dual_cutoff, size_t tri_cutoff) {
+    CRC64_DUAL_CUTOFF = dual_cutoff;
+    CRC64_TRI_CUTOFF = tri_cutoff;
+}
+
 /* Calculate a non-inverted CRC multiple bytes at a time on a little-endian
  * architecture. If you need inverted CRC, invert *before* calling and invert
  * *after* calling.
- * 64 bit crc = process 8 bytes at once;
+ * 64 bit crc = process 8/16/24 bytes at once;
  */
-uint64_t crcspeed64little(uint64_t little_table[8][256], uint64_t crc,
+uint64_t crcspeed64little(uint64_t little_table[8][256], uint64_t crc1,
                           void *buf, size_t len) {
-    unsigned char *next = buf;
+    unsigned char *next1 = buf;
+
+    if (CRC64_DUAL_CUTOFF < 1) {
+        goto final;
+    }
 
     /* process individual bytes until we reach an 8-byte aligned pointer */
-    while (len && ((uintptr_t)next & 7) != 0) {
-        crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+    while (len && ((uintptr_t)next1 & 7) != 0) {
+        crc1 = little_table[0][(crc1 ^ *next1++) & 0xff] ^ (crc1 >> 8);
         len--;
     }
 
-    /* fast middle processing, 8 bytes (aligned!) per loop */
-    while (len >= 8) {
-        crc ^= *(uint64_t *)next;
-        crc = little_table[7][crc & 0xff] ^
-              little_table[6][(crc >> 8) & 0xff] ^
-              little_table[5][(crc >> 16) & 0xff] ^
-              little_table[4][(crc >> 24) & 0xff] ^
-              little_table[3][(crc >> 32) & 0xff] ^
-              little_table[2][(crc >> 40) & 0xff] ^
-              little_table[1][(crc >> 48) & 0xff] ^
-              little_table[0][crc >> 56];
-        next += 8;
-        len -= 8;
-    }
+    if (len > CRC64_TRI_CUTOFF) {
+        /* 24 bytes per loop, doing 3 parallel 8 byte chunks at a time */
+        unsigned char *next2, *next3;
+        uint64_t olen, crc2=0, crc3=0;
+        CRC64_SPLIT(3);
+        /* len is now the length of the first segment, the 3rd segment
+         * possibly having extra bytes to clean up at the end */
+        next3 = next2 + len;
+        while (len >= 8) {
+            len -= 8;
+            DO_8_1(crc1, next1);
+            DO_8_1(crc2, next2);
+            DO_8_1(crc3, next3);
+            DO_8_2(crc1);
+            DO_8_2(crc2);
+            DO_8_2(crc3);
+        }
+
+        /* merge the 3 crcs */
+        MERGE_CRC(crc2);
+        MERGE_CRC(crc3);
+        MERGE_END(next3, 3);
+    } else if (len > CRC64_DUAL_CUTOFF) {
+        /* 16 bytes per loop, doing 2 parallel 8 byte chunks at a time */
+        unsigned char *next2;
+        uint64_t olen, crc2=0;
+        CRC64_SPLIT(2);
+        /* len is now the length of the first segment, the 2nd segment
+         * possibly having extra bytes to clean up at the end */
+        while (len >= 8) {
+            len -= 8;
+            DO_8_1(crc1, next1);
+            DO_8_1(crc2, next2);
+            DO_8_2(crc1);
+            DO_8_2(crc2);
+        }
+
+        /* merge the 2 crcs */
+        MERGE_CRC(crc2);
+        MERGE_END(next2, 2);
+    }
+    /* We fall through here to handle our <CRC64_DUAL_CUTOFF inputs, and any
+     * trailing bytes that weren't evenly divisible by 16 or 24 above. */
+
+    /* fast processing, 8 bytes (aligned!) per loop */
+    while (len >= 8) {
+        len -= 8;
+        DO_8_1(crc1, next1);
+        DO_8_2(crc1);
+    }
+final:
     /* process remaining bytes (can't be larger than 8) */
     while (len) {
-        crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+        crc1 = little_table[0][(crc1 ^ *next1++) & 0xff] ^ (crc1 >> 8);
         len--;
     }
 
-    return crc;
+    return crc1;
 }
 
+/* clean up our namespace */
+#undef DO_8_1
+#undef DO_8_2
+#undef CRC64_SPLIT
+#undef MERGE_CRC
+#undef MERGE_END
+#undef CRC64_REVERSED_POLY
+#undef CRC64_LEN_MASK
+
+/* note: similar perf advantages can be had for long strings in crc16 using
+ * all of the same optimizations as above, though this is unnecessary. crc16
+ * is normally used to shard keys, not hash / verify data, so it is used on
+ * shorter data that doesn't warrant such changes. */
+
 uint16_t crcspeed16little(uint16_t little_table[8][256], uint16_t crc,
                           void *buf, size_t len) {
     unsigned char *next = buf;
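To make the segment arithmetic in `crcspeed64little()` above concrete, a small worked example of what `CRC64_SPLIT(3)` and `MERGE_END` compute, restated as plain expressions and ignoring the initial alignment bytes (illustrative numbers, not taken from the patch):

```c
#include <stdint.h>
#include <stdio.h>

#define CRC64_LEN_MASK UINT64_C(0x7ffffffffffffff8) /* round down to 8 bytes */

int main(void) {
    uint64_t len = 1000, olen = len; /* example buffer length */
    /* CRC64_SPLIT(3): each of the 3 segments gets (len / 3) & ~7 bytes,
     * so each segment stays a multiple of 8: 333 & ~7 = 328 */
    uint64_t seg = (len / 3) & CRC64_LEN_MASK;
    /* segments [0,328), [328,656), [656,984) run in the interleaved loop */
    /* MERGE_END: the tail left for the plain 8-byte / 1-byte loops */
    uint64_t tail = olen - seg * 3; /* 1000 - 984 = 16 */
    printf("segment=%llu tail=%llu\n",
           (unsigned long long)seg, (unsigned long long)tail);
    return 0;
}
```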
@@ -190,6 +314,10 @@ uint64_t crcspeed64big(uint64_t big_table[8][256], uint64_t crc, void *buf,
         len--;
     }
 
+    /* note: alignment + 2/3-way processing can probably be handled here
+       nearly the same as above, using our updated DO_8_2 macro. Not included
+       in these changes because, like other authors, I don't have big-endian
+       hardware to test with. */
+
     while (len >= 8) {
         crc ^= *(uint64_t *)next;
         crc = big_table[0][crc & 0xff] ^
src/crcspeed.h

@@ -34,6 +34,8 @@
 typedef uint64_t (*crcfn64)(uint64_t, const void *, const uint64_t);
 typedef uint16_t (*crcfn16)(uint16_t, const void *, const uint64_t);
 
+void set_crc64_cutoffs(size_t dual_cutoff, size_t tri_cutoff);
+
 /* CRC-64 */
 void crcspeed64little_init(crcfn64 fn, uint64_t table[8][256]);
 void crcspeed64big_init(crcfn64 fn, uint64_t table[8][256]);