Introduce a thread-local storage variable so each thread updates its own used_memory, summed when read, to reduce atomic contention. (#674)
#### Description

This patch introduces a thread-local storage variable per thread, so that each thread updates its own `used_memory` counter; `zmalloc_used_memory` sums the per-thread counters when the total is read. This removes unnecessary `lock add` contention on the shared atomic variable. As a protection against too many threads being created, if the total thread count exceeds 132, threads with index >= 132 fall back to an atomic operation.

#### Problem Statement

The `zmalloc` and `zfree` family of functions update `used_memory` atomically on every call, and they are called very frequently. In the benchmark [memtier_benchmark-1Mkeys-load-stream-5-fields-with-100B-values-pipeline-10.yml](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-1Mkeys-load-stream-5-fields-with-100B-values-pipeline-10.yml), the cycle share of `zmalloc` and `zfree` is high even though they are thin wrappers around the underlying allocator and should not cost many cycles. Most of those cycles come from `lock add` and `lock sub`, which are expensive instructions. Profiling shows the counter updates come mainly from the main thread, so a TLS counter removes most of the contention.

#### Performance Boost

**Note:** This optimization should benefit common benchmarks broadly; the two scenarios below were chosen to validate the boost in my local environment.

| Test Suite | Performance Boost |
|-|-|
| [memtier_benchmark-1Mkeys-load-stream-5-fields-with-100B-values-pipeline-10](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-1Mkeys-load-stream-5-fields-with-100B-values-pipeline-10.yml) | 8% |
| [memtier_benchmark-1Mkeys-load-string-with-100B-values-pipeline-10](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-1Mkeys-load-string-with-100B-values-pipeline-10.yml) | 4% |

##### Test Env

- OS: Ubuntu 22.04.4 LTS
- Platform: Intel Xeon Platinum 8380
- Server and client on the same socket

##### Start Server

The lines after the command are the contents of `/tmp/valkey_1.conf`:

```sh
taskset -c 0-3 ~/valkey/src/valkey-server /tmp/valkey_1.conf
port 9001
bind * -::*
daemonize yes
protected-mode no
save ""
```
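For the client side, an invocation along the following lines drives the pipelined load. This is only a sketch: the authoritative parameters are in the linked test-suite YAML files, and the core pinning and client counts here are assumptions.

```sh
# Illustrative client command only; the exact arguments are defined by the
# linked test-suite YAML. Assumes memtier_benchmark is installed and the
# server above is listening on port 9001.
taskset -c 4-7 memtier_benchmark -s 127.0.0.1 -p 9001 \
    --pipeline=10 --data-size=100 --key-maximum=1000000 \
    --ratio=1:0 --threads=4 --clients=50
```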
---------

Signed-off-by: Lipeng Zhu <lipeng.zhu@intel.com>
Co-authored-by: Wangyang Guo <wangyang.guo@intel.com>

parent 0cc16d0298 · commit 3323e422ad
src/config.h
10
src/config.h
@ -329,4 +329,14 @@ void setcpuaffinity(const char *cpulist);
|
||||
#define HAVE_FADVISE
|
||||
#endif
|
||||
|
||||
#define IO_THREADS_MAX_NUM 128
|
||||
|
||||
#ifndef CACHE_LINE_SIZE
|
||||
#if defined(__aarch64__) && defined(__APPLE__)
|
||||
#define CACHE_LINE_SIZE 128
|
||||
#else
|
||||
#define CACHE_LINE_SIZE 64
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
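The 128-byte case covers Apple silicon, whose L1 data-cache lines are 128 bytes; everything else assumes the common 64 bytes. On Linux the compile-time constant can be sanity-checked at runtime. A minimal sketch, not part of the patch, relying on glibc's `sysconf` extension:

```c
#include <stdio.h>
#include <unistd.h>

int main(void) {
    /* _SC_LEVEL1_DCACHE_LINESIZE is a glibc extension; it can report 0
     * when the kernel does not expose the cache geometry. */
    long line = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
    printf("L1 dcache line size: %ld bytes\n", line);
    return 0;
}
```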
#### src/server.c

```diff
@@ -4222,15 +4222,6 @@ void processEventsWhileBlocked(void) {
  * Threaded I/O
  * ========================================================================== */
 
-#define IO_THREADS_MAX_NUM 128
-#ifndef CACHE_LINE_SIZE
-#if defined(__aarch64__) && defined(__APPLE__)
-#define CACHE_LINE_SIZE 128
-#else
-#define CACHE_LINE_SIZE 64
-#endif
-#endif
-
 typedef struct __attribute__((aligned(CACHE_LINE_SIZE))) threads_pending {
     _Atomic unsigned long value;
 } threads_pending;
```
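These macros move unchanged from server.c into config.h so that zmalloc.c (below) can size its per-thread array from `IO_THREADS_MAX_NUM`; server.c keeps using them via the shared header.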
#### src/zmalloc.c

```diff
@@ -88,10 +88,53 @@ void zlibc_free(void *ptr) {
 #define dallocx(ptr, flags) je_dallocx(ptr, flags)
 #endif
 
-#define update_zmalloc_stat_alloc(__n) atomic_fetch_add_explicit(&used_memory, (__n), memory_order_relaxed)
-#define update_zmalloc_stat_free(__n) atomic_fetch_sub_explicit(&used_memory, (__n), memory_order_relaxed)
+#if __STDC_NO_THREADS__
+#define thread_local __thread
+#else
+#include <threads.h>
+#endif
 
-static _Atomic size_t used_memory = 0;
+#define MAX_THREADS_NUM (IO_THREADS_MAX_NUM + 3 + 1)
+/* Thread-local storage keeping the current thread's index into the used_memory_thread array. */
+static thread_local int thread_index = -1;
+/* Each element of used_memory_thread is written only by the single thread
+ * identified by the thread-local thread_index. While an element is being
+ * written it may be read by another thread, and on non-x86 architectures the
+ * reader could potentially observe inconsistent memory. On ARM and PowerPC
+ * this is solved by keeping the memory naturally aligned; on other
+ * architectures we fall back to atomic operations to stay safe. */
+#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) || defined(__POWERPC__) || defined(__arm__) || \
+    defined(__arm64__)
+static __attribute__((aligned(sizeof(size_t)))) size_t used_memory_thread[MAX_THREADS_NUM];
+#else
+static _Atomic size_t used_memory_thread[MAX_THREADS_NUM];
+#endif
+static atomic_int total_active_threads = 0;
+/* A simple protection, used only if some modules create a lot of threads. */
+static atomic_size_t used_memory_for_additional_threads = 0;
+
+/* Register the thread index on the thread's first allocation. */
+static inline void zmalloc_register_thread_index(void) {
+    thread_index = atomic_fetch_add_explicit(&total_active_threads, 1, memory_order_relaxed);
+}
+
+static inline void update_zmalloc_stat_alloc(size_t size) {
+    if (unlikely(thread_index == -1)) zmalloc_register_thread_index();
+    if (unlikely(thread_index >= MAX_THREADS_NUM)) {
+        atomic_fetch_add_explicit(&used_memory_for_additional_threads, size, memory_order_relaxed);
+    } else {
+        used_memory_thread[thread_index] += size;
+    }
+}
+
+static inline void update_zmalloc_stat_free(size_t size) {
+    if (unlikely(thread_index == -1)) zmalloc_register_thread_index();
+    if (unlikely(thread_index >= MAX_THREADS_NUM)) {
+        atomic_fetch_sub_explicit(&used_memory_for_additional_threads, size, memory_order_relaxed);
+    } else {
+        used_memory_thread[thread_index] -= size;
+    }
+}
 
 static void zmalloc_default_oom(size_t size) {
     fprintf(stderr, "zmalloc: Out of memory trying to allocate %zu bytes\n", size);
```
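Relaxed ordering on `total_active_threads` is sufficient because each thread claims a slot exactly once, after which that slot has a single writer; no ordering between slots is needed, only eventual visibility of their values to the reader. Concretely, if a module spawns enough threads to push the total past 132 (the +3 +1 headroom presumably covering the background threads and the main thread), the overflow threads all account into the shared `used_memory_for_additional_threads` atomic instead.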
```diff
@@ -415,7 +458,15 @@ char *zstrdup(const char *s) {
 }
 
 size_t zmalloc_used_memory(void) {
-    size_t um = atomic_load_explicit(&used_memory, memory_order_relaxed);
+    size_t um = 0;
+    int threads_num = total_active_threads;
+    if (unlikely(total_active_threads > MAX_THREADS_NUM)) {
+        um += atomic_load_explicit(&used_memory_for_additional_threads, memory_order_relaxed);
+        threads_num = MAX_THREADS_NUM;
+    }
+    for (int i = 0; i < threads_num; i++) {
+        um += used_memory_thread[i];
+    }
     return um;
 }
```
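For readers outside the codebase, here is a self-contained sketch of the same technique; every name in it is local to the demo (not part of the patch), and it deliberately omits the overflow fallback:

```c
/* Standalone illustration of the per-thread counter pattern from this patch.
 * All names are local to the demo. Build with: cc -O2 -pthread demo.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define DEMO_MAX_THREADS 8

static __thread int demo_index = -1;          /* this thread's slot, assigned lazily */
static size_t demo_counter[DEMO_MAX_THREADS]; /* one slot per registered thread */
static atomic_int demo_threads = 0;           /* next free slot */

static void demo_add(size_t n) {
    /* The first call from a thread claims a slot; afterwards the slot has a
     * single writer, so a plain (non-atomic) add suffices. */
    if (demo_index == -1) demo_index = atomic_fetch_add(&demo_threads, 1);
    demo_counter[demo_index] += n;
}

static void *worker(void *arg) {
    (void)arg;
    for (int i = 0; i < 1000000; i++) demo_add(1);
    return NULL;
}

int main(void) {
    pthread_t tids[4];
    for (int i = 0; i < 4; i++) pthread_create(&tids[i], NULL, worker, NULL);
    for (int i = 0; i < 4; i++) pthread_join(&tids[i], NULL);

    /* The reader sums the slots, mirroring zmalloc_used_memory(); exact here
     * because all writers have been joined. */
    size_t total = 0;
    for (int i = 0; i < atomic_load(&demo_threads); i++) total += demo_counter[i];
    printf("total = %zu\n", total); /* prints 4000000 */
    return 0;
}
```

While writers are still running, the reader's sum is only approximately consistent, since slots are read one by one without locking; that is acceptable for a memory statistic, and the sum is exact whenever writers are quiescent, as after the joins above.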