CRC64 perf improvements from Redis patches (#350)

Improve the performance of crc64 for large batches by processing a large
number of bytes in parallel and combining the results.
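
The core idea is that the CRC of a concatenation can be rebuilt from the CRCs
of its pieces, so a large buffer can be split, hashed as independent segments,
and the partial results stitched back together with the new combine function.
Below is a minimal sketch of that idea using functions from this change
(`crc64()`, `crc64_init()`, and the new `crc64_combine()`); the wrapper
`crc64_two_segments()` and the standalone `main()` are illustrative only, and
the snippet assumes it is compiled and linked against the tree's crc64.c,
crcspeed.c and crccombine.c:

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include "crc64.h"
#include "crccombine.h"

/* Reversed Jones polynomial, the same constant crcspeed.c uses for its
 * internal merges (CRC64_REVERSED_POLY). */
#define JONES_RPOLY UINT64_C(0x95ac9329ac4bc9b5)

static uint64_t crc64_two_segments(const unsigned char *buf, uint64_t len) {
    uint64_t half = len / 2;
    /* The real code interleaves the segments in one loop so their table
     * lookups overlap in the pipeline; serial here for clarity. */
    uint64_t crc1 = crc64(0, buf, half);
    uint64_t crc2 = crc64(0, buf + half, len - half);
    /* Stitch crc2 onto crc1 as if the two segments had been hashed as one
     * contiguous buffer; the third argument is the second segment's length. */
    return crc64_combine(crc1, crc2, len - half, JONES_RPOLY, 64);
}

int main(void) {
    unsigned char buf[4096];
    memset(buf, 0xa5, sizeof(buf));
    crc64_init();
    /* The combined result must match hashing the whole buffer in one call. */
    assert(crc64_two_segments(buf, sizeof(buf)) == crc64(0, buf, sizeof(buf)));
    return 0;
}
```

Inside `crcspeed64little()` the same merge is performed with
`CRC64_REVERSED_POLY` after the two or three interleaved segment loops finish,
which is where the parallelism (and the speedup below) comes from.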

## Performance 
* 53-73% faster on a Xeon E5-2670 v0 @ 2.6 GHz
* 2-2.5x faster on a Core i3-8130U @ 2.2 GHz
* 1.6-2.46 bytes/cycle on the i3-8130U
* likely >2x faster than crcspeed on newer CPUs with more resources than
a 2012-era Xeon E5-2670
* the crc64 combine function typically runs in <50 nanoseconds with the vector +
cache optimizations (~8 *microseconds* without the vector optimizations, ~80
*microseconds* without the cache; the two together are especially effective)
* still single-threaded
* `valkey-server test crc64 --help` lists the benchmark options (requires
`make distclean && make SERVER_TEST=yes`); see the example invocation below
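
For example, from the repository root something like the following exercises
the throughput benchmark (at one starting buffer size) and the combine
timings; the flags come from the test's usage text, and the 1 MiB size is
just an illustration:

```
make distclean && make SERVER_TEST=yes
./src/valkey-server test crc64 --crc 1048576   # throughput benchmark
./src/valkey-server test crc64 --combine       # combine-function timings
```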

---------

Signed-off-by: Josiah Carlson <josiah.carlson@gmail.com>
Signed-off-by: Madelyn Olson <madelyneolson@gmail.com>
Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
Josiah Carlson 2024-04-30 19:32:01 -07:00 committed by GitHub
parent 89f72bc3ae
commit f4e10eee06
6 changed files with 659 additions and 51 deletions


@@ -131,6 +131,9 @@ ifdef REDIS_LDFLAGS
endif
FINAL_CFLAGS=$(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) $(SERVER_CFLAGS)
ifeq ($(SERVER_TEST),yes)
FINAL_CFLAGS +=-DSERVER_TEST=1
endif
FINAL_LDFLAGS=$(LDFLAGS) $(OPT) $(SERVER_LDFLAGS) $(DEBUG)
FINAL_LIBS=-lm
DEBUG=-g -ggdb
@@ -382,11 +385,11 @@ endif
ENGINE_NAME=valkey
SERVER_NAME=$(ENGINE_NAME)-server$(PROG_SUFFIX)
ENGINE_SENTINEL_NAME=$(ENGINE_NAME)-sentinel$(PROG_SUFFIX)
ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o
ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o
ENGINE_CLI_NAME=$(ENGINE_NAME)-cli$(PROG_SUFFIX)
ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o
ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o
ENGINE_BENCHMARK_NAME=$(ENGINE_NAME)-benchmark$(PROG_SUFFIX)
ENGINE_BENCHMARK_OBJ=ae.o anet.o valkey-benchmark.o adlist.o dict.o zmalloc.o serverassert.o release.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o
ENGINE_BENCHMARK_OBJ=ae.o anet.o valkey-benchmark.o adlist.o dict.o zmalloc.o serverassert.o release.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o
ENGINE_CHECK_RDB_NAME=$(ENGINE_NAME)-check-rdb$(PROG_SUFFIX)
ENGINE_CHECK_AOF_NAME=$(ENGINE_NAME)-check-aof$(PROG_SUFFIX)
ALL_SOURCES=$(sort $(patsubst %.o,%.c,$(ENGINE_SERVER_OBJ) $(ENGINE_CLI_OBJ) $(ENGINE_BENCHMARK_OBJ)))


@@ -28,6 +28,7 @@
#include "crc64.h"
#include "crcspeed.h"
#include "serverassert.h"
static uint64_t crc64_table[8][256] = {{0}};
#define POLY UINT64_C(0xad93d23594c935a9)
@@ -67,14 +68,33 @@ static uint64_t crc64_table[8][256] = {{0}};
* \return The reflected data.
*****************************************************************************/
static inline uint_fast64_t crc_reflect(uint_fast64_t data, size_t data_len) {
uint_fast64_t ret = data & 0x01;
/* only ever called for data_len == 64 in this codebase
*
* Borrowed from bit twiddling hacks, original in the public domain.
* https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
* Extended to 64 bits, and added byteswap for final 3 steps.
* 16-30x 64-bit operations, no comparisons (16 for native byteswap, 30 for pure C)
*/
for (size_t i = 1; i < data_len; i++) {
data >>= 1;
ret = (ret << 1) | (data & 0x01);
}
return ret;
assert(data_len <= 64);
/* swap odd and even bits */
data = ((data >> 1) & 0x5555555555555555ULL) | ((data & 0x5555555555555555ULL) << 1);
/* swap consecutive pairs */
data = ((data >> 2) & 0x3333333333333333ULL) | ((data & 0x3333333333333333ULL) << 2);
/* swap nibbles ... */
data = ((data >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((data & 0x0F0F0F0F0F0F0F0FULL) << 4);
#if defined(__GNUC__) || defined(__clang__)
data = __builtin_bswap64(data);
#else
/* swap bytes */
data = ((data >> 8) & 0x00FF00FF00FF00FFULL) | ((data & 0x00FF00FF00FF00FFULL) << 8);
/* swap 2-byte long pairs */
data = ( data >> 16 & 0xFFFF0000FFFFULL) | ((data & 0xFFFF0000FFFFULL) << 16);
/* swap 4-byte quads */
data = ( data >> 32 & 0xFFFFFFFFULL) | ((data & 0xFFFFFFFFULL) << 32);
#endif
/* adjust for non-64-bit reversals */
return data >> (64 - data_len);
}
/**
@@ -126,29 +146,221 @@ uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l) {
#ifdef SERVER_TEST
#include <stdio.h>
static void genBenchmarkRandomData(char *data, int count);
static int bench_crc64(unsigned char *data, uint64_t size, long long passes, uint64_t check, char *name, int csv);
static void bench_combine(char *label, uint64_t size, uint64_t expect, int csv);
long long _ustime(void);
#include <inttypes.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
#include "zmalloc.h"
#include "crccombine.h"
long long _ustime(void) {
struct timeval tv;
long long ust;
gettimeofday(&tv, NULL);
ust = ((long long)tv.tv_sec)*1000000;
ust += tv.tv_usec;
return ust;
}
static int bench_crc64(unsigned char *data, uint64_t size, long long passes, uint64_t check, char *name, int csv) {
uint64_t min = size, hash;
long long original_start = _ustime(), original_end;
for (long long i=passes; i > 0; i--) {
hash = crc64(0, data, size);
}
original_end = _ustime();
min = (original_end - original_start) * 1000 / passes;
/* approximate nanoseconds without nstime */
if (csv) {
printf("%s,%" PRIu64 ",%" PRIu64 ",%d\n",
name, size, (1000 * size) / min, hash == check);
} else {
printf("test size=%" PRIu64 " algorithm=%s %" PRIu64 " M/sec matches=%d\n",
size, name, (1000 * size) / min, hash == check);
}
return hash != check;
}
const uint64_t BENCH_RPOLY = UINT64_C(0x95ac9329ac4bc9b5);
static void bench_combine(char *label, uint64_t size, uint64_t expect, int csv) {
uint64_t min = size, start = expect, thash = expect ^ (expect >> 17);
long long original_start = _ustime(), original_end;
for (int i=0; i < 1000; i++) {
crc64_combine(thash, start, size, BENCH_RPOLY, 64);
}
original_end = _ustime();
/* ran 1000 times, want ns per, counted us per 1000 ... */
min = original_end - original_start;
if (csv) {
printf("%s,%" PRIu64 ",%" PRIu64 "\n", label, size, min);
} else {
printf("%s size=%" PRIu64 " in %" PRIu64 " nsec\n", label, size, min);
}
}
static void genBenchmarkRandomData(char *data, int count) {
static uint32_t state = 1234;
int i = 0;
while (count--) {
state = (state*1103515245+12345);
data[i++] = '0'+((state>>16)&63);
}
}
#define UNUSED(x) (void)(x)
int crc64Test(int argc, char *argv[], int flags) {
UNUSED(argc);
UNUSED(argv);
UNUSED(flags);
crc64_init();
printf("[calcula]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
(uint64_t)_crc64(0, "123456789", 9));
printf("[64speed]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
(uint64_t)crc64(0, (unsigned char*)"123456789", 9));
char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed "
"do eiusmod tempor incididunt ut labore et dolore magna "
"aliqua. Ut enim ad minim veniam, quis nostrud exercitation "
"ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
"aute irure dolor in reprehenderit in voluptate velit esse "
"cillum dolore eu fugiat nulla pariatur. Excepteur sint "
"occaecat cupidatat non proident, sunt in culpa qui officia "
"deserunt mollit anim id est laborum.";
printf("[calcula]: c7794709e69683b3 == %016" PRIx64 "\n",
(uint64_t)_crc64(0, li, sizeof(li)));
printf("[64speed]: c7794709e69683b3 == %016" PRIx64 "\n",
(uint64_t)crc64(0, (unsigned char*)li, sizeof(li)));
uint64_t crc64_test_size = 0;
int i, lastarg, csv = 0, loop = 0, combine = 0;
again:
for (i = 3; i < argc; i++) {
lastarg = (i == (argc-1));
if (!strcmp(argv[i],"--help")) {
goto usage;
} else if (!strcmp(argv[i],"--csv")) {
csv = 1;
} else if (!strcmp(argv[i],"-l")) {
loop = 1;
} else if (!strcmp(argv[i],"--crc")) {
if (lastarg) goto invalid;
crc64_test_size = atoll(argv[++i]);
} else if (!strcmp(argv[i],"--combine")) {
combine = 1;
} else {
invalid:
printf("Invalid option \"%s\" or option argument missing\n\n",argv[i]);
usage:
printf(
"Usage: crc64 [OPTIONS]\n\n"
" --csv Output in CSV format\n"
" -l Loop. Run the tests forever\n"
" --crc <bytes> Benchmark crc64 faster options, using a buffer this big, and quit when done.\n"
" --combine Benchmark crc64 combine value ranges and timings.\n"
);
return 1;
}
}
if (crc64_test_size == 0 && combine == 0) {
crc64_init();
printf("[calcula]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
(uint64_t)_crc64(0, "123456789", 9));
printf("[64speed]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
(uint64_t)crc64(0, (unsigned char*)"123456789", 9));
char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed "
"do eiusmod tempor incididunt ut labore et dolore magna "
"aliqua. Ut enim ad minim veniam, quis nostrud exercitation "
"ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
"aute irure dolor in reprehenderit in voluptate velit esse "
"cillum dolore eu fugiat nulla pariatur. Excepteur sint "
"occaecat cupidatat non proident, sunt in culpa qui officia "
"deserunt mollit anim id est laborum.";
printf("[calcula]: c7794709e69683b3 == %016" PRIx64 "\n",
(uint64_t)_crc64(0, li, sizeof(li)));
printf("[64speed]: c7794709e69683b3 == %016" PRIx64 "\n",
(uint64_t)crc64(0, (unsigned char*)li, sizeof(li)));
return 0;
}
int init_this_loop = 1;
long long init_start, init_end;
do {
unsigned char* data = NULL;
uint64_t passes = 0;
if (crc64_test_size) {
data = zmalloc(crc64_test_size);
genBenchmarkRandomData((char*)data, crc64_test_size);
/* We want to hash about 1 gig of data in total, looped, to get a good
* idea of our performance.
*/
passes = (UINT64_C(0x100000000) / crc64_test_size);
passes = passes >= 2 ? passes : 2;
passes = passes <= 1000 ? passes : 1000;
}
crc64_init();
/* warm up the cache */
set_crc64_cutoffs(crc64_test_size+1, crc64_test_size+1);
uint64_t expect = crc64(0, data, crc64_test_size);
if (!combine && crc64_test_size) {
if (csv && init_this_loop) printf("algorithm,buffer,performance,crc64_matches\n");
/* get the single-character version for single-byte Redis behavior */
set_crc64_cutoffs(0, crc64_test_size+1);
if (bench_crc64(data, crc64_test_size, passes, expect, "crc_1byte", csv)) return 1;
set_crc64_cutoffs(crc64_test_size+1, crc64_test_size+1);
/* run with 8-byte "single" path, crcfaster */
if (bench_crc64(data, crc64_test_size, passes, expect, "crcspeed", csv)) return 1;
/* run with dual 8-byte paths */
set_crc64_cutoffs(1, crc64_test_size+1);
if (bench_crc64(data, crc64_test_size, passes, expect, "crcdual", csv)) return 1;
/* run with tri 8-byte paths */
set_crc64_cutoffs(1, 1);
if (bench_crc64(data, crc64_test_size, passes, expect, "crctri", csv)) return 1;
/* Be free memory region, be free. */
zfree(data);
data = NULL;
}
uint64_t INIT_SIZE = UINT64_C(0xffffffffffffffff);
if (combine) {
if (init_this_loop) {
init_start = _ustime();
crc64_combine(
UINT64_C(0xdeadbeefdeadbeef),
UINT64_C(0xfeebdaedfeebdaed),
INIT_SIZE,
BENCH_RPOLY, 64);
init_end = _ustime();
init_end -= init_start;
init_end *= 1000;
if (csv) {
printf("operation,size,nanoseconds\n");
printf("init_64,%" PRIu64 ",%" PRIu64 "\n", INIT_SIZE, (uint64_t)init_end);
} else {
printf("init_64 size=%" PRIu64 " in %" PRIu64 " nsec\n", INIT_SIZE, (uint64_t)init_end);
}
/* use the hash itself as the size (unpredictable) */
bench_combine("hash_as_size_combine", crc64_test_size, expect, csv);
/* let's do something big (predictable, so fast) */
bench_combine("largest_combine", INIT_SIZE, expect, csv);
}
bench_combine("combine", crc64_test_size, expect, csv);
}
init_this_loop = 0;
/* step down by ~1.641 for a range of test sizes */
crc64_test_size -= (crc64_test_size >> 2) + (crc64_test_size >> 3) + (crc64_test_size >> 6);
} while (crc64_test_size > 3);
if (loop) goto again;
return 0;
}
# endif
#ifdef SERVER_TEST_MAIN
int main(int argc, char *argv[]) {
return crc64Test(argc, argv);
}
#endif

src/crccombine.c (new file, 253 lines)

@@ -0,0 +1,253 @@
#include <stdint.h>
#include <stdio.h>
#include <strings.h>
#if defined(__i386__) || defined(__X86_64__)
#include <immintrin.h>
#endif
#include "crccombine.h"
/* Copyright (C) 2013 Mark Adler
* Copyright (C) 2019-2024 Josiah Carlson
* Portions originally from: crc64.c Version 1.4 16 Dec 2013 Mark Adler
* Modifications by Josiah Carlson <josiah.carlson@gmail.com>
* - Added implementation variations with sample timings for gf_matrix_times*()
* - Most folks would be best using gf2_matrix_times_vec or
* gf2_matrix_times_vec2, unless some processor does AVX2 fast.
* - This is the implementation of the MERGE_CRC macro defined in
* crcspeed.c (which calls crc_combine()), and is a specialization of the
* generic crc_combine() (and related from the 2013 edition of Mark Adler's
* crc64.c)) for the sake of clarity and performance.
This software is provided 'as-is', without any express or implied
warranty. In no event will the author be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Mark Adler
madler@alumni.caltech.edu
*/
#define STATIC_ASSERT(VVV) do {int test = 1 / (VVV);test++;} while (0)
#if !((defined(__i386__) || defined(__X86_64__)))
/* This cuts 40% of the time vs bit-by-bit. */
uint64_t gf2_matrix_times_switch(uint64_t *mat, uint64_t vec) {
/*
* Without using any vector math, this handles 4 bits at a time,
* and saves 40+% of the time compared to the bit-by-bit version. Use if you
* have no vector compile option available to you. With cache, we see:
* E5-2670 ~1-2us to extend ~1 meg 64 bit hash
*/
uint64_t sum;
sum = 0;
while (vec) {
/* reversing the case order is ~10% slower on Xeon E5-2670 */
switch (vec & 15) {
case 15:
sum ^= *mat ^ *(mat+1) ^ *(mat+2) ^ *(mat+3);
break;
case 14:
sum ^= *(mat+1) ^ *(mat+2) ^ *(mat+3);
break;
case 13:
sum ^= *mat ^ *(mat+2) ^ *(mat+3);
break;
case 12:
sum ^= *(mat+2) ^ *(mat+3);
break;
case 11:
sum ^= *mat ^ *(mat+1) ^ *(mat+3);
break;
case 10:
sum ^= *(mat+1) ^ *(mat+3);
break;
case 9:
sum ^= *mat ^ *(mat+3);
break;
case 8:
sum ^= *(mat+3);
break;
case 7:
sum ^= *mat ^ *(mat+1) ^ *(mat+2);
break;
case 6:
sum ^= *(mat+1) ^ *(mat+2);
break;
case 5:
sum ^= *mat ^ *(mat+2);
break;
case 4:
sum ^= *(mat+2);
break;
case 3:
sum ^= *mat ^ *(mat+1);
break;
case 2:
sum ^= *(mat+1);
break;
case 1:
sum ^= *mat;
break;
default:
break;
}
vec >>= 4;
mat += 4;
}
return sum;
}
#define CRC_MULTIPLY gf2_matrix_times_switch
#else
/*
Warning: here there be dragons involving vector math, and macros to save us
from repeating the same information over and over.
*/
uint64_t gf2_matrix_times_vec2(uint64_t *mat, uint64_t vec) {
/*
* Uses xmm registers on x86, works basically everywhere fast, doing
* cycles of movdqa, mov, shr, pand, and, pxor, at least on gcc 8.
* Is 9-11x faster than original.
* E5-2670 ~29us to extend ~1 meg 64 bit hash
* i3-8130U ~22us to extend ~1 meg 64 bit hash
*/
v2uq sum = {0, 0},
*mv2 = (v2uq*)mat;
/* this table allows us to eliminate conditions during gf2_matrix_times_vec2() */
static v2uq masks2[4] = {
{0,0},
{-1,0},
{0,-1},
{-1,-1},
};
/* Almost as beautiful as gf2_matrix_times_vec, but only half as many
* bits per step, so we need 2 per chunk4 operation. Faster in my tests. */
#define DO_CHUNK4() \
sum ^= (*mv2++) & masks2[vec & 3]; \
vec >>= 2; \
sum ^= (*mv2++) & masks2[vec & 3]; \
vec >>= 2
#define DO_CHUNK16() \
DO_CHUNK4(); \
DO_CHUNK4(); \
DO_CHUNK4(); \
DO_CHUNK4()
DO_CHUNK16();
DO_CHUNK16();
DO_CHUNK16();
DO_CHUNK16();
STATIC_ASSERT(sizeof(uint64_t) == 8);
STATIC_ASSERT(sizeof(long long unsigned int) == 8);
return sum[0] ^ sum[1];
}
#undef DO_CHUNK16
#undef DO_CHUNK4
#define CRC_MULTIPLY gf2_matrix_times_vec2
#endif
static void gf2_matrix_square(uint64_t *square, uint64_t *mat, uint8_t dim) {
unsigned n;
for (n = 0; n < dim; n++)
square[n] = CRC_MULTIPLY(mat, mat[n]);
}
/* Turns out our Redis / Jones CRC cycles at this point, so we can support
* more than 64 bits of extension if we want. Trivially. */
static uint64_t combine_cache[64][64];
/* Mark Adler has some amazing updates to crc.c in his crcany repository. I
* like static caches, and not worrying about finding cycles generally. We are
* okay to spend the 32k of memory here, leaving the algorithm unchanged from
* as it was a decade ago, and be happy that it costs <200 microseconds to
* init, and that subsequent calls to the combine function take under 100
* nanoseconds. We also note that the crcany/crc.c code applies to any CRC, and
* we are currently targeting one: Jones CRC64.
*/
void init_combine_cache(uint64_t poly, uint8_t dim) {
unsigned n, cache_num = 0;
combine_cache[1][0] = poly;
int prev = 1;
uint64_t row = 1;
for (n = 1; n < dim; n++)
{
combine_cache[1][n] = row;
row <<= 1;
}
gf2_matrix_square(combine_cache[0], combine_cache[1], dim);
gf2_matrix_square(combine_cache[1], combine_cache[0], dim);
/* do/while to overwrite the first two layers, they are not used, but are
* re-generated in the last two layers for the Redis polynomial */
do {
gf2_matrix_square(combine_cache[cache_num], combine_cache[cache_num + prev], dim);
prev = -1;
} while (++cache_num < 64);
}
/* Return the CRC-64 of two sequential blocks, where crc1 is the CRC-64 of the
* first block, crc2 is the CRC-64 of the second block, and len2 is the length
* of the second block.
*
* If you want reflections on your CRCs; do them outside before / after.
* WARNING: if you enable USE_STATIC_COMBINE_CACHE to make this fast, you MUST
* ALWAYS USE THE SAME POLYNOMIAL, otherwise you will get the wrong results.
* You MAY bzero() the even/odd static arrays, which will induce a re-cache on
* next call as a work-around, but ... maybe just parameterize the cached
* models at that point like Mark Adler does in modern crcany/crc.c .
*/
uint64_t crc64_combine(uint64_t crc1, uint64_t crc2, uintmax_t len2, uint64_t poly, uint8_t dim) {
/* degenerate case */
if (len2 == 0)
return crc1;
unsigned cache_num = 0;
if (combine_cache[0][0] == 0) {
init_combine_cache(poly, dim);
}
/* apply len2 zeros to crc1 (first square will put the operator for one
zero byte, eight zero bits, in even) */
do
{
/* apply zeros operator for this bit of len2 */
if (len2 & 1)
crc1 = CRC_MULTIPLY(combine_cache[cache_num], crc1);
len2 >>= 1;
cache_num = (cache_num + 1) & 63;
/* if no more bits set, then done */
} while (len2 != 0);
/* return combined crc */
crc1 ^= crc2;
return crc1;
}
#undef CRC_MULTIPLY

src/crccombine.h (new file, 10 lines)

@@ -0,0 +1,10 @@
#include <stdint.h>
/* mask types */
typedef unsigned long long v2uq __attribute__ ((vector_size (16)));
uint64_t gf2_matrix_times_vec2(uint64_t *mat, uint64_t vec);
void init_combine_cache(uint64_t poly, uint8_t dim);
uint64_t crc64_combine(uint64_t crc1, uint64_t crc2, uintmax_t len2, uint64_t poly, uint8_t dim);


@@ -1,11 +1,21 @@
/*
* Copyright (C) 2013 Mark Adler
* Copyright (C) 2019-2024 Josiah Carlson
* Originally by: crc64.c Version 1.4 16 Dec 2013 Mark Adler
* Modifications by Matt Stancliff <matt@genges.com>:
* - removed CRC64-specific behavior
* - added generation of lookup tables by parameters
* - removed inversion of CRC input/result
* - removed automatic initialization in favor of explicit initialization
* Modifications by Josiah Carlson <josiah.carlson@gmail.com>
* - Added case/vector/AVX/+ versions of crc combine function; see crccombine.c
* - added optional static cache
* - Modified to use 1 thread to:
* - Partition large crc blobs into 2-3 segments
* - Process the 2-3 segments in parallel
* - Merge the resulting crcs
* -> Resulting in 10-90% performance boost for data > 1 meg
* - macro-ized to reduce copy/pasta
This software is provided 'as-is', without any express or implied
warranty. In no event will the author be held liable for any damages
@@ -28,6 +38,10 @@
*/
#include "crcspeed.h"
#include "crccombine.h"
#define CRC64_LEN_MASK UINT64_C(0x7ffffffffffffff8)
#define CRC64_REVERSED_POLY UINT64_C(0x95ac9329ac4bc9b5)
/* Fill in a CRC constants table. */
void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) {
@@ -39,7 +53,7 @@ void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) {
table[0][n] = crcfn(0, &v, 1);
}
/* generate nested CRC table for future slice-by-8 lookup */
/* generate nested CRC table for future slice-by-8/16/24+ lookup */
for (int n = 0; n < 256; n++) {
crc = table[0][n];
for (int k = 1; k < 8; k++) {
@@ -47,6 +61,10 @@ void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) {
table[k][n] = crc;
}
}
#if USE_STATIC_COMBINE_CACHE
/* initialize combine cache for CRC stapling for slice-by 16/24+ */
init_combine_cache(CRC64_REVERSED_POLY, 64);
#endif
}
void crcspeed16little_init(crcfn16 crcfn, uint16_t table[8][256]) {
@@ -104,45 +122,151 @@ void crcspeed16big_init(crcfn16 fn, uint16_t big_table[8][256]) {
}
}
/* Note: doing all of our crc/next modifications *before* the crc table
* references is an absolute speedup on all CPUs tested. So... keep these
* macros separate.
*/
#define DO_8_1(crc, next) \
crc ^= *(uint64_t *)next; \
next += 8
#define DO_8_2(crc) \
crc = little_table[7][(uint8_t)crc] ^ \
little_table[6][(uint8_t)(crc >> 8)] ^ \
little_table[5][(uint8_t)(crc >> 16)] ^ \
little_table[4][(uint8_t)(crc >> 24)] ^ \
little_table[3][(uint8_t)(crc >> 32)] ^ \
little_table[2][(uint8_t)(crc >> 40)] ^ \
little_table[1][(uint8_t)(crc >> 48)] ^ \
little_table[0][crc >> 56]
#define CRC64_SPLIT(div) \
olen = len; \
next2 = next1 + ((len / div) & CRC64_LEN_MASK); \
len = (next2 - next1)
#define MERGE_CRC(crcn) \
crc1 = crc64_combine(crc1, crcn, next2 - next1, CRC64_REVERSED_POLY, 64)
#define MERGE_END(last, DIV) \
len = olen - ((next2 - next1) * DIV); \
next1 = last
/* Variables so we can change for benchmarking; these seem to be fairly
* reasonable for Intel CPUs made since 2010. Please adjust as necessary if
* or when your CPU has more load / execute units. We've written benchmark code
* to help you tune your platform, see crc64Test. */
#if defined(__i386__) || defined(__X86_64__)
static size_t CRC64_TRI_CUTOFF = (2*1024);
static size_t CRC64_DUAL_CUTOFF = (128);
#else
static size_t CRC64_TRI_CUTOFF = (16*1024);
static size_t CRC64_DUAL_CUTOFF = (1024);
#endif
void set_crc64_cutoffs(size_t dual_cutoff, size_t tri_cutoff) {
CRC64_DUAL_CUTOFF = dual_cutoff;
CRC64_TRI_CUTOFF = tri_cutoff;
}
/* Calculate a non-inverted CRC multiple bytes at a time on a little-endian
* architecture. If you need inverted CRC, invert *before* calling and invert
* *after* calling.
* 64 bit crc = process 8 bytes at once;
* 64 bit crc = process 8/16/24 bytes at once;
*/
uint64_t crcspeed64little(uint64_t little_table[8][256], uint64_t crc,
uint64_t crcspeed64little(uint64_t little_table[8][256], uint64_t crc1,
void *buf, size_t len) {
unsigned char *next = buf;
unsigned char *next1 = buf;
if (CRC64_DUAL_CUTOFF < 1) {
goto final;
}
/* process individual bytes until we reach an 8-byte aligned pointer */
while (len && ((uintptr_t)next & 7) != 0) {
crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
while (len && ((uintptr_t)next1 & 7) != 0) {
crc1 = little_table[0][(crc1 ^ *next1++) & 0xff] ^ (crc1 >> 8);
len--;
}
/* fast middle processing, 8 bytes (aligned!) per loop */
while (len >= 8) {
crc ^= *(uint64_t *)next;
crc = little_table[7][crc & 0xff] ^
little_table[6][(crc >> 8) & 0xff] ^
little_table[5][(crc >> 16) & 0xff] ^
little_table[4][(crc >> 24) & 0xff] ^
little_table[3][(crc >> 32) & 0xff] ^
little_table[2][(crc >> 40) & 0xff] ^
little_table[1][(crc >> 48) & 0xff] ^
little_table[0][crc >> 56];
next += 8;
len -= 8;
}
if (len > CRC64_TRI_CUTOFF) {
/* 24 bytes per loop, doing 3 parallel 8 byte chunks at a time */
unsigned char *next2, *next3;
uint64_t olen, crc2=0, crc3=0;
CRC64_SPLIT(3);
/* len is now the length of the first segment, the 3rd segment possibly
* having extra bytes to clean up at the end
*/
next3 = next2 + len;
while (len >= 8) {
len -= 8;
DO_8_1(crc1, next1);
DO_8_1(crc2, next2);
DO_8_1(crc3, next3);
DO_8_2(crc1);
DO_8_2(crc2);
DO_8_2(crc3);
}
/* merge the 3 crcs */
MERGE_CRC(crc2);
MERGE_CRC(crc3);
MERGE_END(next3, 3);
} else if (len > CRC64_DUAL_CUTOFF) {
/* 16 bytes per loop, doing 2 parallel 8 byte chunks at a time */
unsigned char *next2;
uint64_t olen, crc2=0;
CRC64_SPLIT(2);
/* len is now the length of the first segment, the 2nd segment possibly
* having extra bytes to clean up at the end
*/
while (len >= 8) {
len -= 8;
DO_8_1(crc1, next1);
DO_8_1(crc2, next2);
DO_8_2(crc1);
DO_8_2(crc2);
}
/* merge the 2 crcs */
MERGE_CRC(crc2);
MERGE_END(next2, 2);
}
/* We fall through here to handle our <CRC64_DUAL_CUTOFF inputs, and any trailing
* bytes that weren't evenly divisible by 16 or 24 above. */
/* fast processing, 8 bytes (aligned!) per loop */
while (len >= 8) {
len -= 8;
DO_8_1(crc1, next1);
DO_8_2(crc1);
}
final:
/* process remaining bytes (can't be larger than 8) */
while (len) {
crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
crc1 = little_table[0][(crc1 ^ *next1++) & 0xff] ^ (crc1 >> 8);
len--;
}
return crc;
return crc1;
}
/* clean up our namespace */
#undef DO_8_1
#undef DO_8_2
#undef CRC64_SPLIT
#undef MERGE_CRC
#undef MERGE_END
#undef CRC64_REVERSED_POLY
#undef CRC64_LEN_MASK
/* note: similar perf advantages can be had for long strings in crc16 using all
* of the same optimizations as above; though this is unnecessary. crc16 is
* normally used to shard keys; not hash / verify data, so is used on shorter
* data that doesn't warrant such changes. */
uint16_t crcspeed16little(uint16_t little_table[8][256], uint16_t crc,
void *buf, size_t len) {
unsigned char *next = buf;
@@ -190,6 +314,10 @@ uint64_t crcspeed64big(uint64_t big_table[8][256], uint64_t crc, void *buf,
len--;
}
/* note: alignment + 2/3-way processing can probably be handled here nearly
the same as above, using our updated DO_8_2 macro. Not included in these
changes because, like other authors, I don't have big-endian hardware to test with. */
while (len >= 8) {
crc ^= *(uint64_t *)next;
crc = big_table[0][crc & 0xff] ^


@@ -34,6 +34,8 @@
typedef uint64_t (*crcfn64)(uint64_t, const void *, const uint64_t);
typedef uint16_t (*crcfn16)(uint16_t, const void *, const uint64_t);
void set_crc64_cutoffs(size_t dual_cutoff, size_t tri_cutoff);
/* CRC-64 */
void crcspeed64little_init(crcfn64 fn, uint64_t table[8][256]);
void crcspeed64big_init(crcfn64 fn, uint64_t table[8][256]);