Introduce a thread-local storage variable so each thread updates its own used_memory, summed when read, to reduce atomic contention. (#674)
#### Description

This patch introduces a thread-local storage variable per thread, so that each thread updates its own `used_memory` counter; `zmalloc_used_memory` sums the per-thread counters when the total is read. This removes unnecessary `lock add` contention on the shared atomic variable. As a protection against too many threads being created, if the total thread count exceeds 132, threads with index >= 132 fall back to an atomic operation.

#### Problem Statement

The `zmalloc` and `zfree` family of functions update `used_memory` atomically on every call, and they are called very frequently. In the benchmark [memtier_benchmark-1Mkeys-load-stream-5-fields-with-100B-values-pipeline-10.yml](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-1Mkeys-load-stream-5-fields-with-100B-values-pipeline-10.yml), the cycle share of `zmalloc` and `zfree` is high even though they are thin wrappers around the underlying allocator and should not cost many cycles. Most of those cycles come from `lock add` and `lock sub`, which are expensive instructions. Profiling shows the counter updates come mainly from the main thread, so a TLS counter removes most of the contention.

#### Performance Boost

**Note:** This optimization should benefit common benchmarks broadly; the two scenarios below were chosen to validate the boost in my local environment.

| Test Suite | Performance Boost |
|-|-|
| [memtier_benchmark-1Mkeys-load-stream-5-fields-with-100B-values-pipeline-10](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-1Mkeys-load-stream-5-fields-with-100B-values-pipeline-10.yml) | 8% |
| [memtier_benchmark-1Mkeys-load-string-with-100B-values-pipeline-10](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-1Mkeys-load-string-with-100B-values-pipeline-10.yml) | 4% |

##### Test Env

- OS: Ubuntu 22.04.4 LTS
- Platform: Intel Xeon Platinum 8380
- Server and client on the same socket

##### Start Server

The lines after the command are the contents of `/tmp/valkey_1.conf`:

```sh
taskset -c 0-3 ~/valkey/src/valkey-server /tmp/valkey_1.conf
port 9001
bind * -::*
daemonize yes
protected-mode no
save ""
```
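For the client side, an invocation along the following lines drives the pipelined load. This is only a sketch: the authoritative parameters are in the linked test-suite YAML files, and the core pinning and client counts here are assumptions.

```sh
# Illustrative client command only; the exact arguments are defined by the
# linked test-suite YAML. Assumes memtier_benchmark is installed and the
# server above is listening on port 9001.
taskset -c 4-7 memtier_benchmark -s 127.0.0.1 -p 9001 \
    --pipeline=10 --data-size=100 --key-maximum=1000000 \
    --ratio=1:0 --threads=4 --clients=50
```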
---------

Signed-off-by: Lipeng Zhu <lipeng.zhu@intel.com>
Co-authored-by: Wangyang Guo <wangyang.guo@intel.com>

parent 0cc16d0298 · commit 3323e422ad
src/config.h
10
src/config.h
@ -329,4 +329,14 @@ void setcpuaffinity(const char *cpulist);
|
||||
#define HAVE_FADVISE
|
||||
#endif
|
||||
|
||||
#define IO_THREADS_MAX_NUM 128
|
||||
|
||||
#ifndef CACHE_LINE_SIZE
|
||||
#if defined(__aarch64__) && defined(__APPLE__)
|
||||
#define CACHE_LINE_SIZE 128
|
||||
#else
|
||||
#define CACHE_LINE_SIZE 64
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
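The 128-byte case covers Apple silicon, whose L1 data-cache lines are 128 bytes; everything else assumes the common 64 bytes. On Linux the compile-time constant can be sanity-checked at runtime. A minimal sketch, not part of the patch, relying on glibc's `sysconf` extension:

```c
#include <stdio.h>
#include <unistd.h>

int main(void) {
    /* _SC_LEVEL1_DCACHE_LINESIZE is a glibc extension; it can report 0
     * when the kernel does not expose the cache geometry. */
    long line = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
    printf("L1 dcache line size: %ld bytes\n", line);
    return 0;
}
```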
#### src/server.c

```diff
@@ -4222,15 +4222,6 @@ void processEventsWhileBlocked(void) {
  * Threaded I/O
  * ========================================================================== */
 
-#define IO_THREADS_MAX_NUM 128
-#ifndef CACHE_LINE_SIZE
-#if defined(__aarch64__) && defined(__APPLE__)
-#define CACHE_LINE_SIZE 128
-#else
-#define CACHE_LINE_SIZE 64
-#endif
-#endif
-
 typedef struct __attribute__((aligned(CACHE_LINE_SIZE))) threads_pending {
     _Atomic unsigned long value;
 } threads_pending;
```
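These macros move unchanged from server.c into config.h so that zmalloc.c (below) can size its per-thread array from `IO_THREADS_MAX_NUM`; server.c keeps using them via the shared header.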
#### src/zmalloc.c

```diff
@@ -88,10 +88,53 @@ void zlibc_free(void *ptr) {
 #define dallocx(ptr, flags) je_dallocx(ptr, flags)
 #endif
 
-#define update_zmalloc_stat_alloc(__n) atomic_fetch_add_explicit(&used_memory, (__n), memory_order_relaxed)
-#define update_zmalloc_stat_free(__n) atomic_fetch_sub_explicit(&used_memory, (__n), memory_order_relaxed)
+#if __STDC_NO_THREADS__
+#define thread_local __thread
+#else
+#include <threads.h>
+#endif
 
-static _Atomic size_t used_memory = 0;
+#define MAX_THREADS_NUM (IO_THREADS_MAX_NUM + 3 + 1)
+/* Thread-local storage keeping the current thread's index into the used_memory_thread array. */
+static thread_local int thread_index = -1;
+/* Each element of used_memory_thread is written only by the single thread
+ * identified by the thread-local thread_index. While an element is being
+ * written it may be read by another thread, and on non-x86 architectures the
+ * reader could potentially observe inconsistent memory. On ARM and PowerPC
+ * this is solved by keeping the memory naturally aligned; on other
+ * architectures we fall back to atomic operations to stay safe. */
+#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) || defined(__POWERPC__) || defined(__arm__) || \
+    defined(__arm64__)
+static __attribute__((aligned(sizeof(size_t)))) size_t used_memory_thread[MAX_THREADS_NUM];
+#else
+static _Atomic size_t used_memory_thread[MAX_THREADS_NUM];
+#endif
+static atomic_int total_active_threads = 0;
+/* A simple protection, used only if some modules create a lot of threads. */
+static atomic_size_t used_memory_for_additional_threads = 0;
+
+/* Register the thread index on the thread's first allocation. */
+static inline void zmalloc_register_thread_index(void) {
+    thread_index = atomic_fetch_add_explicit(&total_active_threads, 1, memory_order_relaxed);
+}
+
+static inline void update_zmalloc_stat_alloc(size_t size) {
+    if (unlikely(thread_index == -1)) zmalloc_register_thread_index();
+    if (unlikely(thread_index >= MAX_THREADS_NUM)) {
+        atomic_fetch_add_explicit(&used_memory_for_additional_threads, size, memory_order_relaxed);
+    } else {
+        used_memory_thread[thread_index] += size;
+    }
+}
+
+static inline void update_zmalloc_stat_free(size_t size) {
+    if (unlikely(thread_index == -1)) zmalloc_register_thread_index();
+    if (unlikely(thread_index >= MAX_THREADS_NUM)) {
+        atomic_fetch_sub_explicit(&used_memory_for_additional_threads, size, memory_order_relaxed);
+    } else {
+        used_memory_thread[thread_index] -= size;
+    }
+}
 
 static void zmalloc_default_oom(size_t size) {
     fprintf(stderr, "zmalloc: Out of memory trying to allocate %zu bytes\n", size);
```
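Relaxed ordering on `total_active_threads` is sufficient because each thread claims a slot exactly once, after which that slot has a single writer; no ordering between slots is needed, only eventual visibility of their values to the reader. Concretely, if a module spawns enough threads to push the total past 132 (the +3 +1 headroom presumably covering the background threads and the main thread), the overflow threads all account into the shared `used_memory_for_additional_threads` atomic instead.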
```diff
@@ -415,7 +458,15 @@ char *zstrdup(const char *s) {
 }
 
 size_t zmalloc_used_memory(void) {
-    size_t um = atomic_load_explicit(&used_memory, memory_order_relaxed);
+    size_t um = 0;
+    int threads_num = total_active_threads;
+    if (unlikely(total_active_threads > MAX_THREADS_NUM)) {
+        um += atomic_load_explicit(&used_memory_for_additional_threads, memory_order_relaxed);
+        threads_num = MAX_THREADS_NUM;
+    }
+    for (int i = 0; i < threads_num; i++) {
+        um += used_memory_thread[i];
+    }
     return um;
 }
```
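For readers outside the codebase, here is a self-contained sketch of the same technique; every name in it is local to the demo (not part of the patch), and it deliberately omits the overflow fallback:

```c
/* Standalone illustration of the per-thread counter pattern from this patch.
 * All names are local to the demo. Build with: cc -O2 -pthread demo.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define DEMO_MAX_THREADS 8

static __thread int demo_index = -1;          /* this thread's slot, assigned lazily */
static size_t demo_counter[DEMO_MAX_THREADS]; /* one slot per registered thread */
static atomic_int demo_threads = 0;           /* next free slot */

static void demo_add(size_t n) {
    /* The first call from a thread claims a slot; afterwards the slot has a
     * single writer, so a plain (non-atomic) add suffices. */
    if (demo_index == -1) demo_index = atomic_fetch_add(&demo_threads, 1);
    demo_counter[demo_index] += n;
}

static void *worker(void *arg) {
    (void)arg;
    for (int i = 0; i < 1000000; i++) demo_add(1);
    return NULL;
}

int main(void) {
    pthread_t tids[4];
    for (int i = 0; i < 4; i++) pthread_create(&tids[i], NULL, worker, NULL);
    for (int i = 0; i < 4; i++) pthread_join(&tids[i], NULL);

    /* The reader sums the slots, mirroring zmalloc_used_memory(); exact here
     * because all writers have been joined. */
    size_t total = 0;
    for (int i = 0; i < atomic_load(&demo_threads); i++) total += demo_counter[i];
    printf("total = %zu\n", total); /* prints 4000000 */
    return 0;
}
```

While writers are still running, the reader's sum is only approximately consistent, since slots are read one by one without locking; that is acceptable for a memory statistic, and the sum is exact whenever writers are quiescent, as after the joins above.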