CRC64 perf improvements from Redis patches (#350)

Improve the performance of crc64 for large batches by processing a large
number of bytes in parallel and combining the results.
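
The core idea is that the CRC of a concatenation can be rebuilt from the CRCs
of its pieces, so a large buffer can be split, hashed as independent segments,
and the partial results stitched back together with the new combine function.
Below is a minimal sketch of that idea using functions from this change
(`crc64()`, `crc64_init()`, and the new `crc64_combine()`); the wrapper
`crc64_two_segments()` and the standalone `main()` are illustrative only, and
the snippet assumes it is compiled and linked against the tree's crc64.c,
crcspeed.c and crccombine.c:

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include "crc64.h"
#include "crccombine.h"

/* Reversed Jones polynomial, the same constant crcspeed.c uses for its
 * internal merges (CRC64_REVERSED_POLY). */
#define JONES_RPOLY UINT64_C(0x95ac9329ac4bc9b5)

static uint64_t crc64_two_segments(const unsigned char *buf, uint64_t len) {
    uint64_t half = len / 2;
    /* The real code interleaves the segments in one loop so their table
     * lookups overlap in the pipeline; serial here for clarity. */
    uint64_t crc1 = crc64(0, buf, half);
    uint64_t crc2 = crc64(0, buf + half, len - half);
    /* Stitch crc2 onto crc1 as if the two segments had been hashed as one
     * contiguous buffer; the third argument is the second segment's length. */
    return crc64_combine(crc1, crc2, len - half, JONES_RPOLY, 64);
}

int main(void) {
    unsigned char buf[4096];
    memset(buf, 0xa5, sizeof(buf));
    crc64_init();
    /* The combined result must match hashing the whole buffer in one call. */
    assert(crc64_two_segments(buf, sizeof(buf)) == crc64(0, buf, sizeof(buf)));
    return 0;
}
```

Inside `crcspeed64little()` the same merge is performed with
`CRC64_REVERSED_POLY` after the two or three interleaved segment loops finish,
which is where the parallelism (and the speedup below) comes from.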

## Performance 
* 53-73% faster on a Xeon E5-2670 v0 @ 2.6 GHz
* 2-2.5x faster on a Core i3-8130U @ 2.2 GHz
* 1.6-2.46 bytes/cycle on the i3-8130U
* likely >2x faster than crcspeed on newer CPUs with more resources than
a 2012-era Xeon E5-2670
* the crc64 combine function typically runs in <50 nanoseconds with the vector +
cache optimizations (~8 *microseconds* without the vector optimizations, ~80
*microseconds* without the cache; the two together are especially effective)
* still single-threaded
* `valkey-server test crc64 --help` lists the benchmark options (requires
`make distclean && make SERVER_TEST=yes`); see the example invocation below
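
For example, from the repository root something like the following exercises
the throughput benchmark (at one starting buffer size) and the combine
timings; the flags come from the test's usage text, and the 1 MiB size is
just an illustration:

```
make distclean && make SERVER_TEST=yes
./src/valkey-server test crc64 --crc 1048576   # throughput benchmark
./src/valkey-server test crc64 --combine       # combine-function timings
```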

---------

Signed-off-by: Josiah Carlson <josiah.carlson@gmail.com>
Signed-off-by: Madelyn Olson <madelyneolson@gmail.com>
Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
Josiah Carlson 2024-04-30 19:32:01 -07:00 committed by GitHub
parent 89f72bc3ae
commit f4e10eee06
6 changed files with 659 additions and 51 deletions


@@ -131,6 +131,9 @@ ifdef REDIS_LDFLAGS
endif
FINAL_CFLAGS=$(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) $(SERVER_CFLAGS)
ifeq ($(SERVER_TEST),yes)
FINAL_CFLAGS +=-DSERVER_TEST=1
endif
FINAL_LDFLAGS=$(LDFLAGS) $(OPT) $(SERVER_LDFLAGS) $(DEBUG)
FINAL_LIBS=-lm
DEBUG=-g -ggdb
@@ -382,11 +385,11 @@ endif
ENGINE_NAME=valkey
SERVER_NAME=$(ENGINE_NAME)-server$(PROG_SUFFIX)
ENGINE_SENTINEL_NAME=$(ENGINE_NAME)-sentinel$(PROG_SUFFIX)
ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o
ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o
ENGINE_CLI_NAME=$(ENGINE_NAME)-cli$(PROG_SUFFIX)
ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o
ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o
ENGINE_BENCHMARK_NAME=$(ENGINE_NAME)-benchmark$(PROG_SUFFIX)
ENGINE_BENCHMARK_OBJ=ae.o anet.o valkey-benchmark.o adlist.o dict.o zmalloc.o serverassert.o release.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o
ENGINE_BENCHMARK_OBJ=ae.o anet.o valkey-benchmark.o adlist.o dict.o zmalloc.o serverassert.o release.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o
ENGINE_CHECK_RDB_NAME=$(ENGINE_NAME)-check-rdb$(PROG_SUFFIX)
ENGINE_CHECK_AOF_NAME=$(ENGINE_NAME)-check-aof$(PROG_SUFFIX)
ALL_SOURCES=$(sort $(patsubst %.o,%.c,$(ENGINE_SERVER_OBJ) $(ENGINE_CLI_OBJ) $(ENGINE_BENCHMARK_OBJ)))


@@ -28,6 +28,7 @@
#include "crc64.h"
#include "crcspeed.h"
#include "serverassert.h"
static uint64_t crc64_table[8][256] = {{0}};
#define POLY UINT64_C(0xad93d23594c935a9)
@@ -67,14 +68,33 @@ static uint64_t crc64_table[8][256] = {{0}};
* \return The reflected data.
*****************************************************************************/
static inline uint_fast64_t crc_reflect(uint_fast64_t data, size_t data_len) {
uint_fast64_t ret = data & 0x01;
/* only ever called for data_len == 64 in this codebase
*
* Borrowed from bit twiddling hacks, original in the public domain.
* https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
* Extended to 64 bits, and added byteswap for final 3 steps.
* 16-30x 64-bit operations, no comparisons (16 for native byteswap, 30 for pure C)
*/
for (size_t i = 1; i < data_len; i++) {
data >>= 1;
ret = (ret << 1) | (data & 0x01);
}
return ret;
assert(data_len <= 64);
/* swap odd and even bits */
data = ((data >> 1) & 0x5555555555555555ULL) | ((data & 0x5555555555555555ULL) << 1);
/* swap consecutive pairs */
data = ((data >> 2) & 0x3333333333333333ULL) | ((data & 0x3333333333333333ULL) << 2);
/* swap nibbles ... */
data = ((data >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((data & 0x0F0F0F0F0F0F0F0FULL) << 4);
#if defined(__GNUC__) || defined(__clang__)
data = __builtin_bswap64(data);
#else
/* swap bytes */
data = ((data >> 8) & 0x00FF00FF00FF00FFULL) | ((data & 0x00FF00FF00FF00FFULL) << 8);
/* swap 2-byte long pairs */
data = ( data >> 16 & 0xFFFF0000FFFFULL) | ((data & 0xFFFF0000FFFFULL) << 16);
/* swap 4-byte quads */
data = ( data >> 32 & 0xFFFFFFFFULL) | ((data & 0xFFFFFFFFULL) << 32);
#endif
/* adjust for non-64-bit reversals */
return data >> (64 - data_len);
}
/**
@@ -126,29 +146,221 @@ uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l) {
#ifdef SERVER_TEST
#include <stdio.h>
static void genBenchmarkRandomData(char *data, int count);
static int bench_crc64(unsigned char *data, uint64_t size, long long passes, uint64_t check, char *name, int csv);
static void bench_combine(char *label, uint64_t size, uint64_t expect, int csv);
long long _ustime(void);
#include <inttypes.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
#include "zmalloc.h"
#include "crccombine.h"
long long _ustime(void) {
struct timeval tv;
long long ust;
gettimeofday(&tv, NULL);
ust = ((long long)tv.tv_sec)*1000000;
ust += tv.tv_usec;
return ust;
}
static int bench_crc64(unsigned char *data, uint64_t size, long long passes, uint64_t check, char *name, int csv) {
uint64_t min = size, hash;
long long original_start = _ustime(), original_end;
for (long long i=passes; i > 0; i--) {
hash = crc64(0, data, size);
}
original_end = _ustime();
min = (original_end - original_start) * 1000 / passes;
/* approximate nanoseconds without nstime */
if (csv) {
printf("%s,%" PRIu64 ",%" PRIu64 ",%d\n",
name, size, (1000 * size) / min, hash == check);
} else {
printf("test size=%" PRIu64 " algorithm=%s %" PRIu64 " M/sec matches=%d\n",
size, name, (1000 * size) / min, hash == check);
}
return hash != check;
}
const uint64_t BENCH_RPOLY = UINT64_C(0x95ac9329ac4bc9b5);
static void bench_combine(char *label, uint64_t size, uint64_t expect, int csv) {
uint64_t min = size, start = expect, thash = expect ^ (expect >> 17);
long long original_start = _ustime(), original_end;
for (int i=0; i < 1000; i++) {
crc64_combine(thash, start, size, BENCH_RPOLY, 64);
}
original_end = _ustime();
/* ran 1000 times, want ns per, counted us per 1000 ... */
min = original_end - original_start;
if (csv) {
printf("%s,%" PRIu64 ",%" PRIu64 "\n", label, size, min);
} else {
printf("%s size=%" PRIu64 " in %" PRIu64 " nsec\n", label, size, min);
}
}
static void genBenchmarkRandomData(char *data, int count) {
static uint32_t state = 1234;
int i = 0;
while (count--) {
state = (state*1103515245+12345);
data[i++] = '0'+((state>>16)&63);
}
}
#define UNUSED(x) (void)(x)
int crc64Test(int argc, char *argv[], int flags) {
UNUSED(argc);
UNUSED(argv);
UNUSED(flags);
crc64_init();
printf("[calcula]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
(uint64_t)_crc64(0, "123456789", 9));
printf("[64speed]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
(uint64_t)crc64(0, (unsigned char*)"123456789", 9));
char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed "
"do eiusmod tempor incididunt ut labore et dolore magna "
"aliqua. Ut enim ad minim veniam, quis nostrud exercitation "
"ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
"aute irure dolor in reprehenderit in voluptate velit esse "
"cillum dolore eu fugiat nulla pariatur. Excepteur sint "
"occaecat cupidatat non proident, sunt in culpa qui officia "
"deserunt mollit anim id est laborum.";
printf("[calcula]: c7794709e69683b3 == %016" PRIx64 "\n",
(uint64_t)_crc64(0, li, sizeof(li)));
printf("[64speed]: c7794709e69683b3 == %016" PRIx64 "\n",
(uint64_t)crc64(0, (unsigned char*)li, sizeof(li)));
uint64_t crc64_test_size = 0;
int i, lastarg, csv = 0, loop = 0, combine = 0;
again:
for (i = 3; i < argc; i++) {
lastarg = (i == (argc-1));
if (!strcmp(argv[i],"--help")) {
goto usage;
} else if (!strcmp(argv[i],"--csv")) {
csv = 1;
} else if (!strcmp(argv[i],"-l")) {
loop = 1;
} else if (!strcmp(argv[i],"--crc")) {
if (lastarg) goto invalid;
crc64_test_size = atoll(argv[++i]);
} else if (!strcmp(argv[i],"--combine")) {
combine = 1;
} else {
invalid:
printf("Invalid option \"%s\" or option argument missing\n\n",argv[i]);
usage:
printf(
"Usage: crc64 [OPTIONS]\n\n"
" --csv Output in CSV format\n"
" -l Loop. Run the tests forever\n"
" --crc <bytes> Benchmark crc64 faster options, using a buffer this big, and quit when done.\n"
" --combine Benchmark crc64 combine value ranges and timings.\n"
);
return 1;
}
}
if (crc64_test_size == 0 && combine == 0) {
crc64_init();
printf("[calcula]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
(uint64_t)_crc64(0, "123456789", 9));
printf("[64speed]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
(uint64_t)crc64(0, (unsigned char*)"123456789", 9));
char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed "
"do eiusmod tempor incididunt ut labore et dolore magna "
"aliqua. Ut enim ad minim veniam, quis nostrud exercitation "
"ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
"aute irure dolor in reprehenderit in voluptate velit esse "
"cillum dolore eu fugiat nulla pariatur. Excepteur sint "
"occaecat cupidatat non proident, sunt in culpa qui officia "
"deserunt mollit anim id est laborum.";
printf("[calcula]: c7794709e69683b3 == %016" PRIx64 "\n",
(uint64_t)_crc64(0, li, sizeof(li)));
printf("[64speed]: c7794709e69683b3 == %016" PRIx64 "\n",
(uint64_t)crc64(0, (unsigned char*)li, sizeof(li)));
return 0;
}
int init_this_loop = 1;
long long init_start, init_end;
do {
unsigned char* data = NULL;
uint64_t passes = 0;
if (crc64_test_size) {
data = zmalloc(crc64_test_size);
genBenchmarkRandomData((char*)data, crc64_test_size);
/* We want to hash about 1 gig of data in total, looped, to get a good
* idea of our performance.
*/
passes = (UINT64_C(0x100000000) / crc64_test_size);
passes = passes >= 2 ? passes : 2;
passes = passes <= 1000 ? passes : 1000;
}
crc64_init();
/* warm up the cache */
set_crc64_cutoffs(crc64_test_size+1, crc64_test_size+1);
uint64_t expect = crc64(0, data, crc64_test_size);
if (!combine && crc64_test_size) {
if (csv && init_this_loop) printf("algorithm,buffer,performance,crc64_matches\n");
/* get the single-character version for single-byte Redis behavior */
set_crc64_cutoffs(0, crc64_test_size+1);
if (bench_crc64(data, crc64_test_size, passes, expect, "crc_1byte", csv)) return 1;
set_crc64_cutoffs(crc64_test_size+1, crc64_test_size+1);
/* run with 8-byte "single" path, crcfaster */
if (bench_crc64(data, crc64_test_size, passes, expect, "crcspeed", csv)) return 1;
/* run with dual 8-byte paths */
set_crc64_cutoffs(1, crc64_test_size+1);
if (bench_crc64(data, crc64_test_size, passes, expect, "crcdual", csv)) return 1;
/* run with tri 8-byte paths */
set_crc64_cutoffs(1, 1);
if (bench_crc64(data, crc64_test_size, passes, expect, "crctri", csv)) return 1;
/* Be free memory region, be free. */
zfree(data);
data = NULL;
}
uint64_t INIT_SIZE = UINT64_C(0xffffffffffffffff);
if (combine) {
if (init_this_loop) {
init_start = _ustime();
crc64_combine(
UINT64_C(0xdeadbeefdeadbeef),
UINT64_C(0xfeebdaedfeebdaed),
INIT_SIZE,
BENCH_RPOLY, 64);
init_end = _ustime();
init_end -= init_start;
init_end *= 1000;
if (csv) {
printf("operation,size,nanoseconds\n");
printf("init_64,%" PRIu64 ",%" PRIu64 "\n", INIT_SIZE, (uint64_t)init_end);
} else {
printf("init_64 size=%" PRIu64 " in %" PRIu64 " nsec\n", INIT_SIZE, (uint64_t)init_end);
}
/* use the hash itself as the size (unpredictable) */
bench_combine("hash_as_size_combine", crc64_test_size, expect, csv);
/* let's do something big (predictable, so fast) */
bench_combine("largest_combine", INIT_SIZE, expect, csv);
}
bench_combine("combine", crc64_test_size, expect, csv);
}
init_this_loop = 0;
/* step down by ~1.641 for a range of test sizes */
crc64_test_size -= (crc64_test_size >> 2) + (crc64_test_size >> 3) + (crc64_test_size >> 6);
} while (crc64_test_size > 3);
if (loop) goto again;
return 0;
}
# endif
#ifdef SERVER_TEST_MAIN
int main(int argc, char *argv[]) {
return crc64Test(argc, argv);
}
#endif

src/crccombine.c (new file, 253 lines)

@@ -0,0 +1,253 @@
#include <stdint.h>
#include <stdio.h>
#include <strings.h>
#if defined(__i386__) || defined(__X86_64__)
#include <immintrin.h>
#endif
#include "crccombine.h"
/* Copyright (C) 2013 Mark Adler
* Copyright (C) 2019-2024 Josiah Carlson
* Portions originally from: crc64.c Version 1.4 16 Dec 2013 Mark Adler
* Modifications by Josiah Carlson <josiah.carlson@gmail.com>
* - Added implementation variations with sample timings for gf_matrix_times*()
* - Most folks would be best using gf2_matrix_times_vec or
* gf2_matrix_times_vec2, unless some processor does AVX2 fast.
* - This is the implementation of the MERGE_CRC macro defined in
* crcspeed.c (which calls crc_combine()), and is a specialization of the
* generic crc_combine() (and related from the 2013 edition of Mark Adler's
* crc64.c)) for the sake of clarity and performance.
This software is provided 'as-is', without any express or implied
warranty. In no event will the author be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Mark Adler
madler@alumni.caltech.edu
*/
#define STATIC_ASSERT(VVV) do {int test = 1 / (VVV);test++;} while (0)
#if !((defined(__i386__) || defined(__X86_64__)))
/* This cuts 40% of the time vs bit-by-bit. */
uint64_t gf2_matrix_times_switch(uint64_t *mat, uint64_t vec) {
/*
* Without using any vector math, this handles 4 bits at a time,
* and saves 40+% of the time compared to the bit-by-bit version. Use if you
* have no vector compile option available to you. With cache, we see:
* E5-2670 ~1-2us to extend ~1 meg 64 bit hash
*/
uint64_t sum;
sum = 0;
while (vec) {
/* reversing the case order is ~10% slower on Xeon E5-2670 */
switch (vec & 15) {
case 15:
sum ^= *mat ^ *(mat+1) ^ *(mat+2) ^ *(mat+3);
break;
case 14:
sum ^= *(mat+1) ^ *(mat+2) ^ *(mat+3);
break;
case 13:
sum ^= *mat ^ *(mat+2) ^ *(mat+3);
break;
case 12:
sum ^= *(mat+2) ^ *(mat+3);
break;
case 11:
sum ^= *mat ^ *(mat+1) ^ *(mat+3);
break;
case 10:
sum ^= *(mat+1) ^ *(mat+3);
break;
case 9:
sum ^= *mat ^ *(mat+3);
break;
case 8:
sum ^= *(mat+3);
break;
case 7:
sum ^= *mat ^ *(mat+1) ^ *(mat+2);
break;
case 6:
sum ^= *(mat+1) ^ *(mat+2);
break;
case 5:
sum ^= *mat ^ *(mat+2);
break;
case 4:
sum ^= *(mat+2);
break;
case 3:
sum ^= *mat ^ *(mat+1);
break;
case 2:
sum ^= *(mat+1);
break;
case 1:
sum ^= *mat;
break;
default:
break;
}
vec >>= 4;
mat += 4;
}
return sum;
}
#define CRC_MULTIPLY gf2_matrix_times_switch
#else
/*
Warning: here there be dragons involving vector math, and macros to save us
from repeating the same information over and over.
*/
uint64_t gf2_matrix_times_vec2(uint64_t *mat, uint64_t vec) {
/*
* Uses xmm registers on x86, works basically everywhere fast, doing
* cycles of movdqa, mov, shr, pand, and, pxor, at least on gcc 8.
* Is 9-11x faster than original.
* E5-2670 ~29us to extend ~1 meg 64 bit hash
* i3-8130U ~22us to extend ~1 meg 64 bit hash
*/
v2uq sum = {0, 0},
*mv2 = (v2uq*)mat;
/* this table allows us to eliminate conditions during gf2_matrix_times_vec2() */
static v2uq masks2[4] = {
{0,0},
{-1,0},
{0,-1},
{-1,-1},
};
/* Almost as beautiful as gf2_matrix_times_vec, but only half as many
* bits per step, so we need 2 per chunk4 operation. Faster in my tests. */
#define DO_CHUNK4() \
sum ^= (*mv2++) & masks2[vec & 3]; \
vec >>= 2; \
sum ^= (*mv2++) & masks2[vec & 3]; \
vec >>= 2
#define DO_CHUNK16() \
DO_CHUNK4(); \
DO_CHUNK4(); \
DO_CHUNK4(); \
DO_CHUNK4()
DO_CHUNK16();
DO_CHUNK16();
DO_CHUNK16();
DO_CHUNK16();
STATIC_ASSERT(sizeof(uint64_t) == 8);
STATIC_ASSERT(sizeof(long long unsigned int) == 8);
return sum[0] ^ sum[1];
}
#undef DO_CHUNK16
#undef DO_CHUNK4
#define CRC_MULTIPLY gf2_matrix_times_vec2
#endif
static void gf2_matrix_square(uint64_t *square, uint64_t *mat, uint8_t dim) {
unsigned n;
for (n = 0; n < dim; n++)
square[n] = CRC_MULTIPLY(mat, mat[n]);
}
/* Turns out our Redis / Jones CRC cycles at this point, so we can support
* more than 64 bits of extension if we want. Trivially. */
static uint64_t combine_cache[64][64];
/* Mark Adler has some amazing updates to crc.c in his crcany repository. I
* like static caches, and not worrying about finding cycles generally. We are
* okay to spend the 32k of memory here, leaving the algorithm unchanged from
* as it was a decade ago, and be happy that it costs <200 microseconds to
* init, and that subsequent calls to the combine function take under 100
* nanoseconds. We also note that the crcany/crc.c code applies to any CRC, and
* we are currently targeting one: Jones CRC64.
*/
void init_combine_cache(uint64_t poly, uint8_t dim) {
unsigned n, cache_num = 0;
combine_cache[1][0] = poly;
int prev = 1;
uint64_t row = 1;
for (n = 1; n < dim; n++)
{
combine_cache[1][n] = row;
row <<= 1;
}
gf2_matrix_square(combine_cache[0], combine_cache[1], dim);
gf2_matrix_square(combine_cache[1], combine_cache[0], dim);
/* do/while to overwrite the first two layers, they are not used, but are
* re-generated in the last two layers for the Redis polynomial */
do {
gf2_matrix_square(combine_cache[cache_num], combine_cache[cache_num + prev], dim);
prev = -1;
} while (++cache_num < 64);
}
/* Return the CRC-64 of two sequential blocks, where crc1 is the CRC-64 of the
* first block, crc2 is the CRC-64 of the second block, and len2 is the length
* of the second block.
*
* If you want reflections on your CRCs; do them outside before / after.
* WARNING: if you enable USE_STATIC_COMBINE_CACHE to make this fast, you MUST
* ALWAYS USE THE SAME POLYNOMIAL, otherwise you will get the wrong results.
* You MAY bzero() the even/odd static arrays, which will induce a re-cache on
* next call as a work-around, but ... maybe just parameterize the cached
* models at that point like Mark Adler does in modern crcany/crc.c .
*/
uint64_t crc64_combine(uint64_t crc1, uint64_t crc2, uintmax_t len2, uint64_t poly, uint8_t dim) {
/* degenerate case */
if (len2 == 0)
return crc1;
unsigned cache_num = 0;
if (combine_cache[0][0] == 0) {
init_combine_cache(poly, dim);
}
/* apply len2 zeros to crc1 (first square will put the operator for one
zero byte, eight zero bits, in even) */
do
{
/* apply zeros operator for this bit of len2 */
if (len2 & 1)
crc1 = CRC_MULTIPLY(combine_cache[cache_num], crc1);
len2 >>= 1;
cache_num = (cache_num + 1) & 63;
/* if no more bits set, then done */
} while (len2 != 0);
/* return combined crc */
crc1 ^= crc2;
return crc1;
}
#undef CRC_MULTIPLY

src/crccombine.h (new file, 10 lines)

@@ -0,0 +1,10 @@
#include <stdint.h>
/* mask types */
typedef unsigned long long v2uq __attribute__ ((vector_size (16)));
uint64_t gf2_matrix_times_vec2(uint64_t *mat, uint64_t vec);
void init_combine_cache(uint64_t poly, uint8_t dim);
uint64_t crc64_combine(uint64_t crc1, uint64_t crc2, uintmax_t len2, uint64_t poly, uint8_t dim);


@@ -1,11 +1,21 @@
/*
* Copyright (C) 2013 Mark Adler
* Copyright (C) 2019-2024 Josiah Carlson
* Originally by: crc64.c Version 1.4 16 Dec 2013 Mark Adler
* Modifications by Matt Stancliff <matt@genges.com>:
* - removed CRC64-specific behavior
* - added generation of lookup tables by parameters
* - removed inversion of CRC input/result
* - removed automatic initialization in favor of explicit initialization
* Modifications by Josiah Carlson <josiah.carlson@gmail.com>
* - Added case/vector/AVX/+ versions of crc combine function; see crccombine.c
* - added optional static cache
* - Modified to use 1 thread to:
* - Partition large crc blobs into 2-3 segments
* - Process the 2-3 segments in parallel
* - Merge the resulting crcs
* -> Resulting in 10-90% performance boost for data > 1 meg
* - macro-ized to reduce copy/pasta
This software is provided 'as-is', without any express or implied
warranty. In no event will the author be held liable for any damages
@@ -28,6 +38,10 @@
*/
#include "crcspeed.h"
#include "crccombine.h"
#define CRC64_LEN_MASK UINT64_C(0x7ffffffffffffff8)
#define CRC64_REVERSED_POLY UINT64_C(0x95ac9329ac4bc9b5)
/* Fill in a CRC constants table. */
void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) {
@@ -39,7 +53,7 @@ void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) {
table[0][n] = crcfn(0, &v, 1);
}
/* generate nested CRC table for future slice-by-8 lookup */
/* generate nested CRC table for future slice-by-8/16/24+ lookup */
for (int n = 0; n < 256; n++) {
crc = table[0][n];
for (int k = 1; k < 8; k++) {
@@ -47,6 +61,10 @@ void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) {
table[k][n] = crc;
}
}
#if USE_STATIC_COMBINE_CACHE
/* initialize combine cache for CRC stapling for slice-by 16/24+ */
init_combine_cache(CRC64_REVERSED_POLY, 64);
#endif
}
void crcspeed16little_init(crcfn16 crcfn, uint16_t table[8][256]) {
@@ -104,45 +122,151 @@ void crcspeed16big_init(crcfn16 fn, uint16_t big_table[8][256]) {
}
}
/* Note: doing all of our crc/next modifications *before* the crc table
* references is an absolute speedup on all CPUs tested. So... keep these
* macros separate.
*/
#define DO_8_1(crc, next) \
crc ^= *(uint64_t *)next; \
next += 8
#define DO_8_2(crc) \
crc = little_table[7][(uint8_t)crc] ^ \
little_table[6][(uint8_t)(crc >> 8)] ^ \
little_table[5][(uint8_t)(crc >> 16)] ^ \
little_table[4][(uint8_t)(crc >> 24)] ^ \
little_table[3][(uint8_t)(crc >> 32)] ^ \
little_table[2][(uint8_t)(crc >> 40)] ^ \
little_table[1][(uint8_t)(crc >> 48)] ^ \
little_table[0][crc >> 56]
#define CRC64_SPLIT(div) \
olen = len; \
next2 = next1 + ((len / div) & CRC64_LEN_MASK); \
len = (next2 - next1)
#define MERGE_CRC(crcn) \
crc1 = crc64_combine(crc1, crcn, next2 - next1, CRC64_REVERSED_POLY, 64)
#define MERGE_END(last, DIV) \
len = olen - ((next2 - next1) * DIV); \
next1 = last
/* Variables so we can change for benchmarking; these seem to be fairly
* reasonable for Intel CPUs made since 2010. Please adjust as necessary if
* or when your CPU has more load / execute units. We've written benchmark code
* to help you tune your platform, see crc64Test. */
#if defined(__i386__) || defined(__X86_64__)
static size_t CRC64_TRI_CUTOFF = (2*1024);
static size_t CRC64_DUAL_CUTOFF = (128);
#else
static size_t CRC64_TRI_CUTOFF = (16*1024);
static size_t CRC64_DUAL_CUTOFF = (1024);
#endif
void set_crc64_cutoffs(size_t dual_cutoff, size_t tri_cutoff) {
CRC64_DUAL_CUTOFF = dual_cutoff;
CRC64_TRI_CUTOFF = tri_cutoff;
}
/* Calculate a non-inverted CRC multiple bytes at a time on a little-endian
* architecture. If you need inverted CRC, invert *before* calling and invert
* *after* calling.
* 64 bit crc = process 8 bytes at once;
* 64 bit crc = process 8/16/24 bytes at once;
*/
uint64_t crcspeed64little(uint64_t little_table[8][256], uint64_t crc,
uint64_t crcspeed64little(uint64_t little_table[8][256], uint64_t crc1,
void *buf, size_t len) {
unsigned char *next = buf;
unsigned char *next1 = buf;
if (CRC64_DUAL_CUTOFF < 1) {
goto final;
}
/* process individual bytes until we reach an 8-byte aligned pointer */
while (len && ((uintptr_t)next & 7) != 0) {
crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
while (len && ((uintptr_t)next1 & 7) != 0) {
crc1 = little_table[0][(crc1 ^ *next1++) & 0xff] ^ (crc1 >> 8);
len--;
}
/* fast middle processing, 8 bytes (aligned!) per loop */
while (len >= 8) {
crc ^= *(uint64_t *)next;
crc = little_table[7][crc & 0xff] ^
little_table[6][(crc >> 8) & 0xff] ^
little_table[5][(crc >> 16) & 0xff] ^
little_table[4][(crc >> 24) & 0xff] ^
little_table[3][(crc >> 32) & 0xff] ^
little_table[2][(crc >> 40) & 0xff] ^
little_table[1][(crc >> 48) & 0xff] ^
little_table[0][crc >> 56];
next += 8;
len -= 8;
}
if (len > CRC64_TRI_CUTOFF) {
/* 24 bytes per loop, doing 3 parallel 8 byte chunks at a time */
unsigned char *next2, *next3;
uint64_t olen, crc2=0, crc3=0;
CRC64_SPLIT(3);
/* len is now the length of the first segment, the 3rd segment possibly
* having extra bytes to clean up at the end
*/
next3 = next2 + len;
while (len >= 8) {
len -= 8;
DO_8_1(crc1, next1);
DO_8_1(crc2, next2);
DO_8_1(crc3, next3);
DO_8_2(crc1);
DO_8_2(crc2);
DO_8_2(crc3);
}
/* merge the 3 crcs */
MERGE_CRC(crc2);
MERGE_CRC(crc3);
MERGE_END(next3, 3);
} else if (len > CRC64_DUAL_CUTOFF) {
/* 16 bytes per loop, doing 2 parallel 8 byte chunks at a time */
unsigned char *next2;
uint64_t olen, crc2=0;
CRC64_SPLIT(2);
/* len is now the length of the first segment, the 2nd segment possibly
* having extra bytes to clean up at the end
*/
while (len >= 8) {
len -= 8;
DO_8_1(crc1, next1);
DO_8_1(crc2, next2);
DO_8_2(crc1);
DO_8_2(crc2);
}
/* merge the 2 crcs */
MERGE_CRC(crc2);
MERGE_END(next2, 2);
}
/* We fall through here to handle our <CRC64_DUAL_CUTOFF inputs, and any trailing
* bytes that weren't evenly divisible by 16 or 24 above. */
/* fast processing, 8 bytes (aligned!) per loop */
while (len >= 8) {
len -= 8;
DO_8_1(crc1, next1);
DO_8_2(crc1);
}
final:
/* process remaining bytes (can't be larger than 8) */
while (len) {
crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
crc1 = little_table[0][(crc1 ^ *next1++) & 0xff] ^ (crc1 >> 8);
len--;
}
return crc;
return crc1;
}
/* clean up our namespace */
#undef DO_8_1
#undef DO_8_2
#undef CRC64_SPLIT
#undef MERGE_CRC
#undef MERGE_END
#undef CRC64_REVERSED_POLY
#undef CRC64_LEN_MASK
/* note: similar perf advantages can be had for long strings in crc16 using all
* of the same optimizations as above; though this is unnecessary. crc16 is
* normally used to shard keys; not hash / verify data, so is used on shorter
* data that doesn't warrant such changes. */
uint16_t crcspeed16little(uint16_t little_table[8][256], uint16_t crc,
void *buf, size_t len) {
unsigned char *next = buf;
@@ -190,6 +314,10 @@ uint64_t crcspeed64big(uint64_t big_table[8][256], uint64_t crc, void *buf,
len--;
}
/* note: alignment + 2/3-way processing can probably be handled here nearly
the same as above, using our updated DO_8_2 macro. Not included in these
changes because, like other authors, I don't have big-endian hardware to test with. */
while (len >= 8) {
crc ^= *(uint64_t *)next;
crc = big_table[0][crc & 0xff] ^


@@ -34,6 +34,8 @@
typedef uint64_t (*crcfn64)(uint64_t, const void *, const uint64_t);
typedef uint16_t (*crcfn16)(uint16_t, const void *, const uint64_t);
void set_crc64_cutoffs(size_t dual_cutoff, size_t tri_cutoff);
/* CRC-64 */
void crcspeed64little_init(crcfn64 fn, uint64_t table[8][256]);
void crcspeed64big_init(crcfn64 fn, uint64_t table[8][256]);