From 0e0756659128102952c3213a9b4e7161dcc9ca3d Mon Sep 17 00:00:00 2001 From: Oran Agra Date: Thu, 30 May 2019 11:51:58 +0300 Subject: [PATCH 1/4] Jemalloc: Avoid blocking on background thread lock for stats. Background threads may run for a long time, especially when the # of dirty pages is high. Avoid blocking stats calls because of this (which may cause latency spikes). see https://github.com/jemalloc/jemalloc/issues/1502 cherry picked from commit 1a71533511027dbe3f9d989659efeec446915d6b --- deps/jemalloc/src/background_thread.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/deps/jemalloc/src/background_thread.c b/deps/jemalloc/src/background_thread.c index 3517a3bb8..457669c9e 100644 --- a/deps/jemalloc/src/background_thread.c +++ b/deps/jemalloc/src/background_thread.c @@ -787,7 +787,13 @@ background_thread_stats_read(tsdn_t *tsdn, background_thread_stats_t *stats) { nstime_init(&stats->run_interval, 0); for (unsigned i = 0; i < max_background_threads; i++) { background_thread_info_t *info = &background_thread_info[i]; - malloc_mutex_lock(tsdn, &info->mtx); + if (malloc_mutex_trylock(tsdn, &info->mtx)) { + /* + * Each background thread run may take a long time; + * avoid waiting on the stats if the thread is active. + */ + continue; + } if (info->state != background_thread_stopped) { num_runs += info->tot_n_runs; nstime_add(&stats->run_interval, &info->tot_sleep_time); From f7833d560d80604445ad34ad2ef9d829d0aef7d6 Mon Sep 17 00:00:00 2001 From: Oran Agra Date: Thu, 30 May 2019 12:51:32 +0300 Subject: [PATCH 2/4] make redis purge jemalloc after flush, and enable background purging thread jemalloc 5 doesn't immediately release memory back to the OS, instead there's a decaying mechanism, which doesn't work when there's no traffic (no allocations). this is most evident if there's no traffic after flushdb, the RSS will remain high. 1) enable jemalloc background purging 2) explicitly purge in flushdb --- src/config.c | 9 ++++++++ src/db.c | 14 ++++++++++++ src/debug.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/server.c | 2 ++ src/server.h | 1 + src/zmalloc.c | 34 ++++++++++++++++++++++++++++ src/zmalloc.h | 2 ++ 7 files changed, 124 insertions(+) diff --git a/src/config.c b/src/config.c index 7f0e9af89..16850a1fc 100644 --- a/src/config.c +++ b/src/config.c @@ -474,6 +474,10 @@ void loadServerConfigFromString(char *config) { err = "active defrag can't be enabled without proper jemalloc support"; goto loaderr; #endif } + } else if (!strcasecmp(argv[0],"jemalloc-bg-thread") && argc == 2) { + if ((server.jemalloc_bg_thread = yesnotoi(argv[1])) == -1) { + err = "argument must be 'yes' or 'no'"; goto loaderr; + } } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) { if ((server.daemonize = yesnotoi(argv[1])) == -1) { err = "argument must be 'yes' or 'no'"; goto loaderr; @@ -1152,6 +1156,9 @@ void configSetCommand(client *c) { return; } #endif + } config_set_bool_field( + "jemalloc-bg-thread",server.jemalloc_bg_thread) { + set_jemalloc_bg_thread(server.jemalloc_bg_thread); } config_set_bool_field( "protected-mode",server.protected_mode) { } config_set_bool_field( @@ -1487,6 +1494,7 @@ void configGetCommand(client *c) { config_get_bool_field("rdbchecksum", server.rdb_checksum); config_get_bool_field("activerehashing", server.activerehashing); config_get_bool_field("activedefrag", server.active_defrag_enabled); + config_get_bool_field("jemalloc-bg-thread", server.jemalloc_bg_thread); config_get_bool_field("protected-mode", server.protected_mode); config_get_bool_field("gopher-enabled", server.gopher_enabled); config_get_bool_field("io-threads-do-reads", server.io_threads_do_reads); @@ -2318,6 +2326,7 @@ int rewriteConfig(char *path) { rewriteConfigNumericalOption(state,"hll-sparse-max-bytes",server.hll_sparse_max_bytes,CONFIG_DEFAULT_HLL_SPARSE_MAX_BYTES); rewriteConfigYesNoOption(state,"activerehashing",server.activerehashing,CONFIG_DEFAULT_ACTIVE_REHASHING); rewriteConfigYesNoOption(state,"activedefrag",server.active_defrag_enabled,CONFIG_DEFAULT_ACTIVE_DEFRAG); + rewriteConfigYesNoOption(state,"jemalloc-bg-thread",server.jemalloc_bg_thread,1); rewriteConfigYesNoOption(state,"protected-mode",server.protected_mode,CONFIG_DEFAULT_PROTECTED_MODE); rewriteConfigYesNoOption(state,"gopher-enabled",server.gopher_enabled,CONFIG_DEFAULT_GOPHER_ENABLED); rewriteConfigYesNoOption(state,"io-threads-do-reads",server.io_threads_do_reads,CONFIG_DEFAULT_IO_THREADS_DO_READS); diff --git a/src/db.c b/src/db.c index b537a29a4..50e23d6b2 100644 --- a/src/db.c +++ b/src/db.c @@ -441,6 +441,13 @@ void flushdbCommand(client *c) { signalFlushedDb(c->db->id); server.dirty += emptyDb(c->db->id,flags,NULL); addReply(c,shared.ok); +#if defined(USE_JEMALLOC) + /* jemalloc 5 doesn't release pages back to the OS when there's no traffic. + * for large databases, flushdb blocks for long anyway, so a bit more won't + * harm and this way the flush and purge will be synchroneus. */ + if (!(flags & EMPTYDB_ASYNC)) + jemalloc_purge(); +#endif } /* FLUSHALL [ASYNC] @@ -464,6 +471,13 @@ void flushallCommand(client *c) { server.dirty = saved_dirty; } server.dirty++; +#if defined(USE_JEMALLOC) + /* jemalloc 5 doesn't release pages back to the OS when there's no traffic. + * for large databases, flushdb blocks for long anyway, so a bit more won't + * harm and this way the flush and purge will be synchroneus. */ + if (!(flags & EMPTYDB_ASYNC)) + jemalloc_purge(); +#endif } /* This command implements DEL and LAZYDEL. */ diff --git a/src/debug.c b/src/debug.c index 0c6b5630c..c82c99b1f 100644 --- a/src/debug.c +++ b/src/debug.c @@ -297,6 +297,56 @@ void computeDatasetDigest(unsigned char *final) { } } +#ifdef USE_JEMALLOC +void mallctl_int(client *c, robj **argv, int argc) { + int ret; + /* start with the biggest size (int64), and if that fails, try smaller sizes (int32, bool) */ + int64_t old = 0, val; + if (argc > 1) { + long long ll; + if (getLongLongFromObjectOrReply(c, argv[1], &ll, NULL) != C_OK) + return; + val = ll; + } + size_t sz = sizeof(old); + while (sz > 0) { + if ((ret=je_mallctl(argv[0]->ptr, &old, &sz, argc > 1? &val: NULL, argc > 1?sz: 0))) { + if (ret==EINVAL) { + /* size might be wrong, try a smaller one */ + sz /= 2; +#if BYTE_ORDER == BIG_ENDIAN + val <<= 8*sz; +#endif + continue; + } + addReplyErrorFormat(c,"%s", strerror(ret)); + return; + } else { +#if BYTE_ORDER == BIG_ENDIAN + old >>= 64 - 8*sz; +#endif + addReplyLongLong(c, old); + return; + } + } + addReplyErrorFormat(c,"%s", strerror(EINVAL)); +} + +void mallctl_string(client *c, robj **argv, int argc) { + int ret; + char *old; + size_t sz = sizeof(old); + /* for strings, it seems we need to first get the old value, before overriding it. */ + if ((ret=je_mallctl(argv[0]->ptr, &old, &sz, NULL, 0))) { + addReplyErrorFormat(c,"%s", strerror(ret)); + return; + } + addReplyBulkCString(c, old); + if(argc > 1) + je_mallctl(argv[0]->ptr, NULL, 0, &argv[1]->ptr, sizeof(char*)); +} +#endif + void debugCommand(client *c) { if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"help")) { const char *help[] = { @@ -323,6 +373,10 @@ void debugCommand(client *c) { "STRUCTSIZE -- Return the size of different Redis core C structures.", "ZIPLIST -- Show low level info about the ziplist encoding.", "STRINGMATCH-TEST -- Run a fuzz tester against the stringmatchlen() function.", +#ifdef USE_JEMALLOC +"MALLCTL [] -- Get or set a malloc tunning integer.", +"MALLCTL-STR [] -- Get or set a malloc tunning string.", +#endif NULL }; addReplyHelp(c, help); @@ -676,6 +730,14 @@ NULL { stringmatchlen_fuzz_test(); addReplyStatus(c,"Apparently Redis did not crash: test passed"); +#ifdef USE_JEMALLOC + } else if(!strcasecmp(c->argv[1]->ptr,"mallctl") && c->argc >= 3) { + mallctl_int(c, c->argv+2, c->argc-2); + return; + } else if(!strcasecmp(c->argv[1]->ptr,"mallctl-str") && c->argc >= 3) { + mallctl_string(c, c->argv+2, c->argc-2); + return; +#endif } else { addReplySubcommandSyntaxError(c); return; diff --git a/src/server.c b/src/server.c index 4b87b6ac2..fa2c7b1ee 100644 --- a/src/server.c +++ b/src/server.c @@ -2230,6 +2230,7 @@ void initServerConfig(void) { server.maxidletime = CONFIG_DEFAULT_CLIENT_TIMEOUT; server.tcpkeepalive = CONFIG_DEFAULT_TCP_KEEPALIVE; server.active_expire_enabled = 1; + server.jemalloc_bg_thread = 1; server.active_defrag_enabled = CONFIG_DEFAULT_ACTIVE_DEFRAG; server.active_defrag_ignore_bytes = CONFIG_DEFAULT_DEFRAG_IGNORE_BYTES; server.active_defrag_threshold_lower = CONFIG_DEFAULT_DEFRAG_THRESHOLD_LOWER; @@ -2866,6 +2867,7 @@ void initServer(void) { latencyMonitorInit(); bioInit(); initThreadedIO(); + set_jemalloc_bg_thread(server.jemalloc_bg_thread); server.initial_memory_usage = zmalloc_used_memory(); } diff --git a/src/server.h b/src/server.h index 0813f8bd1..4ae079ff2 100644 --- a/src/server.h +++ b/src/server.h @@ -1129,6 +1129,7 @@ struct redisServer { int tcpkeepalive; /* Set SO_KEEPALIVE if non-zero. */ int active_expire_enabled; /* Can be disabled for testing purposes. */ int active_defrag_enabled; + int jemalloc_bg_thread; /* Enable jemalloc background thread */ size_t active_defrag_ignore_bytes; /* minimum amount of fragmentation waste to start active defrag */ int active_defrag_threshold_lower; /* minimum percentage of fragmentation to start active defrag */ int active_defrag_threshold_upper; /* maximum percentage of fragmentation at which we use maximum effort */ diff --git a/src/zmalloc.c b/src/zmalloc.c index 5e6010278..58896a727 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -306,6 +306,7 @@ size_t zmalloc_get_rss(void) { #endif #if defined(USE_JEMALLOC) + int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident) { @@ -327,13 +328,46 @@ int zmalloc_get_allocator_info(size_t *allocated, je_mallctl("stats.allocated", allocated, &sz, NULL, 0); return 1; } + +void set_jemalloc_bg_thread(int enable) { + /* let jemalloc do purging asynchronously, required when there's no traffic + * after flushdb */ + if (enable) { + char val = 1; + je_mallctl("background_thread", NULL, 0, &val, 1); + } +} + +int jemalloc_purge() { + /* return all unused (reserved) pages to the OS */ + char tmp[32]; + unsigned narenas = 0; + size_t sz = sizeof(unsigned); + if (!je_mallctl("arenas.narenas", &narenas, &sz, NULL, 0)) { + sprintf(tmp, "arena.%d.purge", narenas); + if (!je_mallctl(tmp, NULL, 0, NULL, 0)) + return 0; + } + return -1; +} + #else + int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident) { *allocated = *resident = *active = 0; return 1; } + +void set_jemalloc_bg_thread(int enable) { + ((void)(enable)); +} + +int jemalloc_purge() { + return 0; +} + #endif /* Get the sum of the specified field (converted form kb to bytes) in diff --git a/src/zmalloc.h b/src/zmalloc.h index 6fb19b046..b136a910d 100644 --- a/src/zmalloc.h +++ b/src/zmalloc.h @@ -86,6 +86,8 @@ size_t zmalloc_used_memory(void); void zmalloc_set_oom_handler(void (*oom_handler)(size_t)); size_t zmalloc_get_rss(void); int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident); +void set_jemalloc_bg_thread(int enable); +int jemalloc_purge(); size_t zmalloc_get_private_dirty(long pid); size_t zmalloc_get_smap_bytes_by_field(char *field, long pid); size_t zmalloc_get_memory_size(void); From 086b3911073c9cc41bd0f998764fa592384c9a7e Mon Sep 17 00:00:00 2001 From: Oran Agra Date: Mon, 9 Sep 2019 14:35:06 +0300 Subject: [PATCH 3/4] RED-31295 - redis: avoid race between dlopen and thread creation It seeems that since I added the creation of the jemalloc thread redis sometimes fails to start with the following error: Inconsistency detected by ld.so: dl-tls.c: 493: _dl_allocate_tls_init: Assertion `listp->slotinfo[cnt].gen <= GL(dl_tls_generation)' failed! This seems to be due to a race bug in ld.so, in which TLS creation on the thread, collide with dlopen. Move the creation of BIO and jemalloc threads to after modules are loaded. plus small bugfix when trying to disable the jemalloc thread at runtime --- src/server.c | 10 ++++++++++ src/zmalloc.c | 6 ++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/server.c b/src/server.c index fa2c7b1ee..8e99431f9 100644 --- a/src/server.c +++ b/src/server.c @@ -2865,6 +2865,14 @@ void initServer(void) { scriptingInit(1); slowlogInit(); latencyMonitorInit(); +} + +/* Some steps in server initialization need to be done last (after modules + * are loaded). + * Specifically, creation of threads due to a race bug in ld.so, in which + * Thread Local Storage initialization collides with dlopen call. + * see: https://sourceware.org/bugzilla/show_bug.cgi?id=19329 */ +void InitServerLast() { bioInit(); initThreadedIO(); set_jemalloc_bg_thread(server.jemalloc_bg_thread); @@ -4876,6 +4884,7 @@ int main(int argc, char **argv) { #endif moduleLoadFromQueue(); ACLLoadUsersAtStartup(); + InitServerLast(); loadDataFromDisk(); if (server.cluster_enabled) { if (verifyClusterConfigWithData() == C_ERR) { @@ -4890,6 +4899,7 @@ int main(int argc, char **argv) { if (server.sofd > 0) serverLog(LL_NOTICE,"The server is now ready to accept connections at %s", server.unixsocket); } else { + InitServerLast(); sentinelIsRunning(); } diff --git a/src/zmalloc.c b/src/zmalloc.c index 58896a727..437f43b64 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -332,10 +332,8 @@ int zmalloc_get_allocator_info(size_t *allocated, void set_jemalloc_bg_thread(int enable) { /* let jemalloc do purging asynchronously, required when there's no traffic * after flushdb */ - if (enable) { - char val = 1; - je_mallctl("background_thread", NULL, 0, &val, 1); - } + char val = !!enable; + je_mallctl("background_thread", NULL, 0, &val, 1); } int jemalloc_purge() { From f484baa3ea8254ba491422f8f358d36b9cfac236 Mon Sep 17 00:00:00 2001 From: Oran Agra Date: Fri, 4 Oct 2019 14:22:13 +0300 Subject: [PATCH 4/4] trim the double implementation of jemalloc purge --- src/object.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/src/object.c b/src/object.c index 697429b84..70022f897 100644 --- a/src/object.c +++ b/src/object.c @@ -1450,22 +1450,10 @@ NULL addReplyVerbatim(c,report,sdslen(report),"txt"); sdsfree(report); } else if (!strcasecmp(c->argv[1]->ptr,"purge") && c->argc == 2) { -#if defined(USE_JEMALLOC) - char tmp[32]; - unsigned narenas = 0; - size_t sz = sizeof(unsigned); - if (!je_mallctl("arenas.narenas", &narenas, &sz, NULL, 0)) { - sprintf(tmp, "arena.%d.purge", narenas); - if (!je_mallctl(tmp, NULL, 0, NULL, 0)) { - addReply(c, shared.ok); - return; - } - } - addReplyError(c, "Error purging dirty pages"); -#else - addReply(c, shared.ok); - /* Nothing to do for other allocators. */ -#endif + if (jemalloc_purge() == 0) + addReply(c, shared.ok); + else + addReplyError(c, "Error purging dirty pages"); } else { addReplyErrorFormat(c, "Unknown subcommand or wrong number of arguments for '%s'. Try MEMORY HELP", (char*)c->argv[1]->ptr); }