Fix false sharing issue between main thread and io-threads when accessing used_memory_thread (#1179)
When profiling some workloads with `io-threads` enabled, we found the false sharing issue to be significant. This patch splits the elements accessed by the main thread and the io-threads onto different cache lines by padding elements at the head of the `used_memory_thread_padded` array. This design helps mitigate the false sharing between the main thread and the io-threads, because the main thread has been the bottleneck with io-threads enabled. The reason we didn't put each element on an individual cache line is that we don't want to introduce additional cache line fetch operations (3 vs. 16 cache lines) when calling functions like `zmalloc_used_memory()`. --------- Signed-off-by: Lipeng Zhu <lipeng.zhu@intel.com> Signed-off-by: Lipeng Zhu <zhu.lipeng@outlook.com> Signed-off-by: Viktor Söderqvist <viktor.soderqvist@est.tech> Co-authored-by: Wangyang Guo <wangyang.guo@intel.com> Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
This commit is contained in:
parent
701ab72429
commit
a62d1f177b
@ -90,6 +90,7 @@ void zlibc_free(void *ptr) {
|
||||
|
||||
#define thread_local _Thread_local
|
||||
|
||||
#define PADDING_ELEMENT_NUM (CACHE_LINE_SIZE / sizeof(size_t) - 1)
|
||||
#define MAX_THREADS_NUM (IO_THREADS_MAX_NUM + 3 + 1)
|
||||
/* A thread-local storage which keep the current thread's index in the used_memory_thread array. */
|
||||
static thread_local int thread_index = -1;
|
||||
@ -101,10 +102,11 @@ static thread_local int thread_index = -1;
|
||||
* For the other architecture, lets fall back to the atomic operation to keep safe. */
|
||||
#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) || defined(__POWERPC__) || defined(__arm__) || \
|
||||
defined(__arm64__)
|
||||
static __attribute__((aligned(sizeof(size_t)))) size_t used_memory_thread[MAX_THREADS_NUM];
|
||||
static __attribute__((aligned(CACHE_LINE_SIZE))) size_t used_memory_thread_padded[MAX_THREADS_NUM + PADDING_ELEMENT_NUM];
|
||||
#else
|
||||
static _Atomic size_t used_memory_thread[MAX_THREADS_NUM];
|
||||
static __attribute__((aligned(CACHE_LINE_SIZE))) _Atomic size_t used_memory_thread_padded[MAX_THREADS_NUM + PADDING_ELEMENT_NUM];
|
||||
#endif
|
||||
static size_t *used_memory_thread = &used_memory_thread_padded[PADDING_ELEMENT_NUM];
|
||||
static atomic_int total_active_threads = 0;
|
||||
/* This is a simple protection. It's used only if some modules create a lot of threads. */
|
||||
static atomic_size_t used_memory_for_additional_threads = 0;
|
||||
|
Loading…
x
Reference in New Issue
Block a user