Optimize PFCOUNT, PFMERGE command by SIMD acceleration (#1293)

This PR optimizes the performance of HyperLogLog commands (PFCOUNT, PFMERGE) by adding AVX2 fast paths. Two AVX2 functions are added for conversion between raw representation and dense representation. They are 15 ~ 30 times faster than scalar implementaion. Note that sparse representation is not accelerated. AVX2 fast paths are enabled when the CPU supports AVX2 (checked at runtime) and the hyperloglog configuration is default (HLL_REGISTERS == 16384 && HLL_BITS == 6). `PFDEBUG SIMD (ON|OFF)` subcommand is added for unit tests. A new TCL unit test checks that the results produced by non-AVX2 and AVX2 implementations are exactly equal. When merging 3 dense hll structures, the benchmark shows a 12x speedup compared to the scalar version. ``` pfcount key1 key2 key3 pfmerge keyall key1 key2 key3 ``` ``` ====================================================================================================== Type Ops/sec Avg. Latency p50 Latency p99 Latency p99.9 Latency KB/sec ------------------------------------------------------------------------------------------------------ PFCOUNT-scalar 5665.56 35.29839 32.25500 63.99900 67.58300 608.60 PFCOUNT-avx2 72377.83 2.75834 2.67100 5.34300 6.81500 7774.96 ------------------------------------------------------------------------------------------------------ PFMERGE-scalar 9851.29 20.28806 20.09500 36.86300 39.16700 615.71 PFMERGE-avx2 125621.89 1.59126 1.55100 3.11900 4.70300 15702.74 ------------------------------------------------------------------------------------------------------ scalar: valkey:unstable 2df56d87c0ebe802f38e8922bb2ea1e4ca9cfa76 avx2: Nugine:hll-simd 8f9adc34021080d96e60bd0abe06b043f3ed0275 CPU: 13th Gen Intel® Core™ i9-13900H × 20 Memory: 32.0 GiB OS: Ubuntu 22.04.5 LTS ``` Experiment repo: https://github.com/Nugine/redis-hyperloglog Benchmark script: https://github.com/Nugine/redis-hyperloglog/blob/main/scripts/memtier.sh Algorithm: https://github.com/Nugine/redis-hyperloglog/blob/main/cpp/bench.cpp --------- Signed-off-by: Xuyang Wang <xuyangwang@link.cuhk.edu.cn>
2024-12-03 02:40:38 +08:00 · 2024-12-03 02:40:38 +08:00 · 3df609ef06
commit 3df609ef06
parent fbbfe5d3d3
3 changed files with 345 additions and 11 deletions
--- a/src/config.h
+++ b/src/config.h
@ -364,4 +364,17 @@ void setcpuaffinity(const char *cpulist);
 #define valkey_prefetch(addr) ((void)(addr))
 #endif

+/* Check if we can compile AVX2 code */
+#if defined(__x86_64__) && ((defined(__GNUC__) && __GNUC__ >= 5) || (defined(__clang__) && __clang_major__ >= 4))
+#if defined(__has_attribute) && __has_attribute(target)
+#define HAVE_AVX2
+#endif
+#endif
+
+#if defined(HAVE_AVX2)
+#define ATTRIBUTE_TARGET_AVX2 __attribute__((target("avx2")))
+#else
+#define ATTRIBUTE_TARGET_AVX2
+#endif
+
 #endif
--- a/src/hyperloglog.c
+++ b/src/hyperloglog.c
@ -35,6 +35,10 @@
 #include <stdint.h>
 #include <math.h>

+#ifdef HAVE_AVX2
+#include <immintrin.h>
+#endif
+
 /* The HyperLogLog implementation is based on the following ideas:
 *
 * * The use of a 64 bit hash function as proposed in [1], in order to estimate
@ -208,6 +212,13 @@ struct hllhdr {

 static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected";

+#ifdef HAVE_AVX2
+static int simd_enabled = 1;
+#define HLL_USE_AVX2 (simd_enabled && __builtin_cpu_supports("avx2"))
+#else
+#define HLL_USE_AVX2 0
+#endif
+
 /* =========================== Low level bit macros ========================= */

 /* Macros to access the dense representation.
@ -1064,6 +1075,136 @@ int hllAdd(robj *o, unsigned char *ele, size_t elesize) {
    }
 }

+#ifdef HAVE_AVX2
+/* A specialized version of hllMergeDense, optimized for default configurations.
+ *
+ * Requirements:
+ * 1) HLL_REGISTERS == 16384 && HLL_BITS == 6
+ * 2) The CPU supports AVX2 (checked at runtime in hllMergeDense)
+ *
+ * reg_raw: pointer to the raw representation array (16384 bytes, one byte per register)
+ * reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register)
+ */
+ATTRIBUTE_TARGET_AVX2
+void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) {
+    /* Shuffle indices for unpacking bytes of dense registers
+     * From: {XXXX|AAAB|BBCC|CDDD|EEEF|FFGG|GHHH|XXXX}
+     * To:   {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0}
+     */
+    const __m256i shuffle = _mm256_setr_epi8( //
+        4, 5, 6, -1,                          //
+        7, 8, 9, -1,                          //
+        10, 11, 12, -1,                       //
+        13, 14, 15, -1,                       //
+        0, 1, 2, -1,                          //
+        3, 4, 5, -1,                          //
+        6, 7, 8, -1,                          //
+        9, 10, 11, -1                         //
+    );
+
+    /* Merge the first 8 registers (6 bytes) normally
+     * as the AVX2 algorithm needs 4 padding bytes at the start */
+    uint8_t val;
+    for (int i = 0; i < 8; i++) {
+        HLL_DENSE_GET_REGISTER(val, reg_dense, i);
+        if (val > reg_raw[i]) {
+            reg_raw[i] = val;
+        }
+    }
+
+    /* Dense to Raw:
+     *
+     * 4 registers in 3 bytes:
+     * {bbaaaaaa|ccccbbbb|ddddddcc}
+     *
+     * LOAD 32 bytes (32 registers) per iteration:
+     * 4(padding) + 12(16 registers) + 12(16 registers) + 4(padding)
+     * {XXXX|AAAB|BBCC|CDDD|EEEF|FFGG|GHHH|XXXX}
+     *
+     * SHUFFLE to:
+     * {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0}
+     * {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8
+     *
+     * AVX2 is little endian, each of the 8 groups is a little-endian int32.
+     * A group (int32) contains 3 valid bytes (4 registers) and a zero byte.
+     *
+     * extract registers in each group with AND and SHIFT:
+     * {00aaaaaa|00000000|00000000|00000000} x8 (<<0)
+     * {00000000|00bbbbbb|00000000|00000000} x8 (<<2)
+     * {00000000|00000000|00cccccc|00000000} x8 (<<4)
+     * {00000000|00000000|00000000|00dddddd} x8 (<<6)
+     *
+     * merge the extracted registers with OR:
+     * {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8
+     *
+     * Finally, compute MAX(reg_raw, merged) and STORE it back to reg_raw
+     */
+
+    /* Skip 8 registers (6 bytes) */
+    const uint8_t *r = reg_dense + 6 - 4;
+    uint8_t *t = reg_raw + 8;
+
+    for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) {
+        __m256i x0, x;
+        x0 = _mm256_loadu_si256((__m256i *)r);
+        x = _mm256_shuffle_epi8(x0, shuffle);
+
+        __m256i a1, a2, a3, a4;
+        a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f));
+        a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00000fc0));
+        a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x0003f000));
+        a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x00fc0000));
+
+        a2 = _mm256_slli_epi32(a2, 2);
+        a3 = _mm256_slli_epi32(a3, 4);
+        a4 = _mm256_slli_epi32(a4, 6);
+
+        __m256i y1, y2, y;
+        y1 = _mm256_or_si256(a1, a2);
+        y2 = _mm256_or_si256(a3, a4);
+        y = _mm256_or_si256(y1, y2);
+
+        __m256i z = _mm256_loadu_si256((__m256i *)t);
+
+        z = _mm256_max_epu8(z, y);
+
+        _mm256_storeu_si256((__m256i *)t, z);
+
+        r += 24;
+        t += 32;
+    }
+
+    /* Merge the last 24 registers normally
+     * as the AVX2 algorithm needs 4 padding bytes at the end */
+    for (int i = HLL_REGISTERS - 24; i < HLL_REGISTERS; i++) {
+        HLL_DENSE_GET_REGISTER(val, reg_dense, i);
+        if (val > reg_raw[i]) {
+            reg_raw[i] = val;
+        }
+    }
+}
+#endif
+
+/* Merge dense-encoded registers to raw registers array. */
+void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) {
+#ifdef HAVE_AVX2
+    if (HLL_REGISTERS == 16384 && HLL_BITS == 6) {
+        if (HLL_USE_AVX2) {
+            hllMergeDenseAVX2(reg_raw, reg_dense);
+            return;
+        }
+    }
+#endif
+
+    uint8_t val;
+    for (int i = 0; i < HLL_REGISTERS; i++) {
+        HLL_DENSE_GET_REGISTER(val, reg_dense, i);
+        if (val > reg_raw[i]) {
+            reg_raw[i] = val;
+        }
+    }
+}
+
 /* Merge by computing MAX(registers[i],hll[i]) the HyperLogLog 'hll'
 * with an array of uint8_t HLL_REGISTERS registers pointed by 'max'.
 *
@ -1077,12 +1218,7 @@ int hllMerge(uint8_t *max, robj *hll) {
    int i;

    if (hdr->encoding == HLL_DENSE) {
-        uint8_t val;
-
-        for (i = 0; i < HLL_REGISTERS; i++) {
-            HLL_DENSE_GET_REGISTER(val, hdr->registers, i);
-            if (val > max[i]) max[i] = val;
-        }
+        hllMergeDense(max, hdr->registers);
    } else {
        uint8_t *p = hll->ptr, *end = p + sdslen(hll->ptr);
        long runlen, regval;
@ -1114,6 +1250,121 @@ int hllMerge(uint8_t *max, robj *hll) {
    return C_OK;
 }

+#ifdef HAVE_AVX2
+/* A specialized version of hllDenseCompress, optimized for default configurations.
+ *
+ * Requirements:
+ * 1) HLL_REGISTERS == 16384 && HLL_BITS == 6
+ * 2) The CPU supports AVX2 (checked at runtime in hllDenseCompress)
+ *
+ * reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register)
+ * reg_raw: pointer to the raw representation array (16384 bytes, one byte per register)
+ */
+ATTRIBUTE_TARGET_AVX2
+void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) {
+    /* Shuffle indices for packing bytes of dense registers
+     * From: {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0}
+     * To:   {AAAB|BBCC|CDDD|0000|EEEF|FFGG|GHHH|0000}
+     */
+    const __m256i shuffle = _mm256_setr_epi8( //
+        0, 1, 2,                              //
+        4, 5, 6,                              //
+        8, 9, 10,                             //
+        12, 13, 14,                           //
+        -1, -1, -1, -1,                       //
+        0, 1, 2,                              //
+        4, 5, 6,                              //
+        8, 9, 10,                             //
+        12, 13, 14,                           //
+        -1, -1, -1, -1                        //
+    );
+
+    /* Raw to Dense:
+     *
+     * LOAD 32 bytes (32 registers) per iteration:
+     * {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8
+     *
+     * AVX2 is little endian, each of the 8 groups is a little-endian int32.
+     * A group (int32) contains 4 registers.
+     *
+     * move the registers to correct positions with AND and SHIFT:
+     * {00aaaaaa|00000000|00000000|00000000} x8 (>>0)
+     * {bb000000|0000bbbb|00000000|00000000} x8 (>>2)
+     * {00000000|cccc0000|000000cc|00000000} x8 (>>4)
+     * {00000000|00000000|dddddd00|00000000} x8 (>>6)
+     *
+     * merge the registers with OR:
+     * {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8
+     * {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0}
+     *
+     * SHUFFLE to:
+     * {AAAB|BBCC|CDDD|0000|EEEF|FFGG|GHHH|0000}
+     *
+     * STORE the lower half and higher half respectively:
+     * AAABBBCCCDDD0000
+     *             EEEFFFGGGHHH0000
+     * AAABBBCCCDDDEEEFFFGGGHHH0000
+     *
+     * Note that the last 4 bytes are padding bytes.
+     */
+
+    const uint8_t *r = reg_raw;
+    uint8_t *t = reg_dense;
+
+    for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) {
+        __m256i x = _mm256_loadu_si256((__m256i *)r);
+
+        __m256i a1, a2, a3, a4;
+        a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f));
+        a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00003f00));
+        a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x003f0000));
+        a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x3f000000));
+
+        a2 = _mm256_srli_epi32(a2, 2);
+        a3 = _mm256_srli_epi32(a3, 4);
+        a4 = _mm256_srli_epi32(a4, 6);
+
+        __m256i y1, y2, y;
+        y1 = _mm256_or_si256(a1, a2);
+        y2 = _mm256_or_si256(a3, a4);
+        y = _mm256_or_si256(y1, y2);
+        y = _mm256_shuffle_epi8(y, shuffle);
+
+        __m128i lower, higher;
+        lower = _mm256_castsi256_si128(y);
+        higher = _mm256_extracti128_si256(y, 1);
+
+        _mm_storeu_si128((__m128i *)t, lower);
+        _mm_storeu_si128((__m128i *)(t + 12), higher);
+
+        r += 32;
+        t += 24;
+    }
+
+    /* Merge the last 32 registers normally
+     * as the AVX2 algorithm needs 4 padding bytes at the end */
+    for (int i = HLL_REGISTERS - 32; i < HLL_REGISTERS; i++) {
+        HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]);
+    }
+}
+#endif
+
+/* Compress raw registers to dense representation. */
+void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) {
+#ifdef HAVE_AVX2
+    if (HLL_REGISTERS == 16384 && HLL_BITS == 6) {
+        if (HLL_USE_AVX2) {
+            hllDenseCompressAVX2(reg_dense, reg_raw);
+            return;
+        }
+    }
+#endif
+
+    for (int i = 0; i < HLL_REGISTERS; i++) {
+        HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]);
+    }
+}
+
 /* ========================== HyperLogLog commands ========================== */

 /* Create an HLL object. We always create the HLL using sparse encoding.
@ -1363,12 +1614,17 @@ void pfmergeCommand(client *c) {

    /* Write the resulting HLL to the destination HLL registers and
     * invalidate the cached value. */
-    for (j = 0; j < HLL_REGISTERS; j++) {
-        if (max[j] == 0) continue;
+    if (use_dense) {
        hdr = o->ptr;
-        switch (hdr->encoding) {
-        case HLL_DENSE: hllDenseSet(hdr->registers, j, max[j]); break;
-        case HLL_SPARSE: hllSparseSet(o, j, max[j]); break;
+        hllDenseCompress(hdr->registers, max);
+    } else {
+        for (j = 0; j < HLL_REGISTERS; j++) {
+            if (max[j] == 0) continue;
+            hdr = o->ptr;
+            switch (hdr->encoding) {
+            case HLL_DENSE: hllDenseSet(hdr->registers, j, max[j]); break;
+            case HLL_SPARSE: hllSparseSet(o, j, max[j]); break;
+            }
        }
    }
    hdr = o->ptr; /* o->ptr may be different now, as a side effect of
@ -1494,6 +1750,7 @@ cleanup:
 * PFDEBUG DECODE <key>
 * PFDEBUG ENCODING <key>
 * PFDEBUG TODENSE <key>
+ * PFDEBUG SIMD (ON|OFF)
 */
 void pfdebugCommand(client *c) {
    char *cmd = c->argv[1]->ptr;
@ -1501,6 +1758,30 @@ void pfdebugCommand(client *c) {
    robj *o;
    int j;

+    if (!strcasecmp(cmd, "simd")) {
+        if (c->argc != 3) goto arityerr;
+
+        if (!strcasecmp(c->argv[2]->ptr, "on")) {
+#ifdef HAVE_AVX2
+            simd_enabled = 1;
+#endif
+        } else if (!strcasecmp(c->argv[2]->ptr, "off")) {
+#ifdef HAVE_AVX2
+            simd_enabled = 0;
+#endif
+        } else {
+            addReplyError(c, "Argument must be ON or OFF");
+        }
+
+        if (HLL_USE_AVX2) {
+            addReplyStatus(c, "enabled");
+        } else {
+            addReplyStatus(c, "disabled");
+        }
+
+        return;
+    }
+
    o = lookupKeyWrite(c->db, c->argv[2]);
    if (o == NULL) {
        addReplyError(c, "The specified key does not exist");
--- a/tests/unit/hyperloglog.tcl
+++ b/tests/unit/hyperloglog.tcl
@ -222,6 +222,46 @@ start_server {tags {"hll"}} {
        assert_equal 3 [r pfcount destkey]
    }

+    test {PFMERGE results with simd} {
+        r del hllscalar{t} hllsimd{t} hll1{t} hll2{t} hll3{t}
+        for {set x 1} {$x < 2000} {incr x} {
+            r pfadd hll1{t} [expr rand()]
+        }
+        for {set x 1} {$x < 4000} {incr x} {
+            r pfadd hll2{t} [expr rand()]
+        }
+        for {set x 1} {$x < 8000} {incr x} {
+            r pfadd hll3{t} [expr rand()]
+        }
+        assert {[r pfcount hll1{t}] > 0}
+        assert {[r pfcount hll2{t}] > 0}
+        assert {[r pfcount hll3{t}] > 0}
+
+        r pfdebug simd off
+        set scalar [r pfcount hll1{t} hll2{t} hll3{t}]
+        r pfdebug simd on
+        set simd [r pfcount hll1{t} hll2{t} hll3{t}]
+        assert {$scalar > 0}
+        assert {$simd > 0}
+        assert_equal $scalar $simd
+
+        r pfdebug simd off
+        r pfmerge hllscalar{t} hll1{t} hll2{t} hll3{t}
+        r pfdebug simd on
+        r pfmerge hllsimd{t} hll1{t} hll2{t} hll3{t}
+
+        set scalar [r pfcount hllscalar{t}]
+        set simd [r pfcount hllsimd{t}]
+        assert {$scalar > 0}
+        assert {$simd > 0}
+        assert_equal $scalar $simd
+
+        set scalar [r get hllscalar{t}]
+        set simd [r get hllsimd{t}]
+        assert_equal $scalar $simd
+
+    } {} {needs:pfdebug}
+
    test {PFCOUNT multiple-keys merge returns cardinality of union #1} {
        r del hll1{t} hll2{t} hll3{t}
        for {set x 1} {$x < 10000} {incr x} {