From 543d92d0ff2acdfe24d16ff53254f06d96ddb4eb Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 31 Jan 2021 02:30:28 +0000 Subject: [PATCH 01/40] Fix crash on shutdown command Former-commit-id: d72b5fa16df0c41dd62b8b6d25c1c2c9cce8b413 --- src/networking.cpp | 2 ++ src/server.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/networking.cpp b/src/networking.cpp index d33be6923..f1ec92bbe 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -3295,6 +3295,8 @@ void flushSlavesOutputBuffers(void) { listIter li; listNode *ln; + flushReplBacklogToClients(); + listRewind(g_pserver->slaves,&li); while((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); diff --git a/src/server.cpp b/src/server.cpp index 554c0ac43..8a7184398 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -4203,6 +4203,8 @@ int prepareForShutdown(int flags) { /* Best effort flush of replica output buffers, so that we hopefully * send them pending writes. */ flushSlavesOutputBuffers(); + g_pserver->repl_batch_idxStart = -1; + g_pserver->repl_batch_offStart = -1; /* Close the listening sockets. Apparently this allows faster restarts. */ closeListeningSockets(1); From 516eee7a1a586a0e03abbc7b70eb13b89bf1c97a Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 31 Jan 2021 02:36:30 +0000 Subject: [PATCH 02/40] Fix CI failure due to missing package Former-commit-id: cf46a014c112bbba0d147dbc8489e3054df9a1ef --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c869b7bb3..cb41880c9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ jobs: run: ./utils/gen-test-certs.sh - name: test-tls run: | - sudo apt-get -y install tcl8.5 tcl-tls + sudo apt-get -y install tcl tcl-tls ./runtest --clients 2 --verbose --tls - name: cluster-test run: | From 7766b370626fdb712f34d880a964632ac9c2bb6a Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 31 Jan 2021 03:40:57 +0000 Subject: [PATCH 03/40] Fix test failure due to bad merge Former-commit-id: f6fb0e462001c49af185682caed8881ccd6d36f3 --- src/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server.cpp b/src/server.cpp index 8a7184398..624612f89 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2757,7 +2757,7 @@ int setOOMScoreAdj(int process_class) { int val; char buf[64]; - val = g_pserver->oom_score_adj_base + g_pserver->oom_score_adj_values[process_class]; + val = g_pserver->oom_score_adj_values[process_class]; if (g_pserver->oom_score_adj == OOM_SCORE_RELATIVE) val += g_pserver->oom_score_adj_base; if (val > 1000) val = 1000; From d4408580b5d8bc7b15ecebc07a9b7865ed35430c Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 31 Jan 2021 04:33:27 +0000 Subject: [PATCH 04/40] Improve multithreaded test reliability Former-commit-id: effa53339cf272ced8f0661bc275628f87be5c73 --- tests/unit/maxmemory.tcl | 4 ++-- tests/unit/obuf-limits.tcl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl index 741281cfc..414733d1e 100644 --- a/tests/unit/maxmemory.tcl +++ b/tests/unit/maxmemory.tcl @@ -246,7 +246,7 @@ test_slave_buffers {slave buffer are counted correctly} 1000000 10 0 1 # test again with fewer (and bigger) commands without pipeline, but with eviction test_slave_buffers "replica buffer don't induce eviction" 100000 100 1 0 -start_server {tags {"maxmemory"}} { +start_server {tags {"maxmemory"} overrides {server-threads 1}} { test 
{client tracking don't cause eviction feedback loop} { r config set maxmemory 0 r config set maxmemory-policy allkeys-lru @@ -308,4 +308,4 @@ start_server {tags {"maxmemory"}} { if {$::verbose} { puts "evicted: $evicted" } } } -}; #run_solo \ No newline at end of file +}; #run_solo diff --git a/tests/unit/obuf-limits.tcl b/tests/unit/obuf-limits.tcl index 456d3ac82..1fbd29ff8 100644 --- a/tests/unit/obuf-limits.tcl +++ b/tests/unit/obuf-limits.tcl @@ -1,4 +1,4 @@ -start_server {tags {"obuf-limits"}} { +start_server {tags {"obuf-limits"} overrides { server-threads 1 }} { test {Client output buffer hard limit is enforced} { r config set client-output-buffer-limit {pubsub 100000 0 0} set rd1 [redis_deferring_client] From 19dc63bd3afd47edd3e1000a61aaff1be9dc888a Mon Sep 17 00:00:00 2001 From: christianEQ Date: Thu, 14 Jan 2021 21:40:17 +0000 Subject: [PATCH 05/40] added severity levels for afterErrorReply Former-commit-id: fe0f07199353abf6668cd66cd2e21751db5c21d9 --- src/networking.cpp | 33 +++++++++++++++++++++++++++------ src/server.cpp | 9 +++++---- src/server.h | 8 +++++++- 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index f1ec92bbe..dc4e56e92 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -494,7 +494,7 @@ void addReplyErrorLength(client *c, const char *s, size_t len) { } /* Do some actions after an error reply was sent (Log if needed, updates stats, etc.) */ -void afterErrorReply(client *c, const char *s, size_t len) { +void afterErrorReply(client *c, const char *s, size_t len, int severity = ERR_CRITICAL) { /* Sometimes it could be normal that a replica replies to a master with * an error and this function gets called. Actually the error will never * be sent because addReply*() against master clients has no effect... @@ -522,9 +522,30 @@ void afterErrorReply(client *c, const char *s, size_t len) { if (len > 4096) len = 4096; const char *cmdname = c->lastcmd ? c->lastcmd->name : ""; - serverLog(LL_WARNING,"== CRITICAL == This %s is sending an error " - "to its %s: '%.*s' after processing the command " - "'%s'", from, to, (int)len, s, cmdname); + switch (severity) { + case ERR_NOTICE: + serverLog(LL_NOTICE,"== NOTICE == This %s is rejecting a command " + "from its %s: '%.*s' after processing the command " + "'%s'", from, to, (int)len, s, cmdname); + break; + case ERR_WARNING: + serverLog(LL_WARNING,"== WARNING == This %s is rejecting a command " + "from its %s: '%.*s' after processing the command " + "'%s'", from, to, (int)len, s, cmdname); + break; + case ERR_ERROR: + serverLog(LL_WARNING,"== ERROR == This %s is sending an error " + "to its %s: '%.*s' after processing the command " + "'%s'", from, to, (int)len, s, cmdname); + break; + case ERR_CRITICAL: + default: + serverLog(LL_WARNING,"== CRITICAL == This %s is sending an error " + "to its %s: '%.*s' after processing the command " + "'%s'", from, to, (int)len, s, cmdname); + break; + } + if (ctype == CLIENT_TYPE_MASTER && g_pserver->repl_backlog && g_pserver->repl_backlog_histlen > 0) { @@ -536,9 +557,9 @@ void afterErrorReply(client *c, const char *s, size_t len) { /* The 'err' object is expected to start with -ERRORCODE and end with \r\n. * Unlike addReplyErrorSds and others alike which rely on addReplyErrorLength. 
*/ -void addReplyErrorObject(client *c, robj *err) { +void addReplyErrorObject(client *c, robj *err, int severity) { addReply(c, err); - afterErrorReply(c, szFromObj(err), sdslen(szFromObj(err))-2); /* Ignore trailing \r\n */ + afterErrorReply(c, szFromObj(err), sdslen(szFromObj(err))-2, severity); /* Ignore trailing \r\n */ } void addReplyError(client *c, const char *err) { diff --git a/src/server.cpp b/src/server.cpp index 624612f89..0b0fdc877 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -3773,13 +3773,14 @@ void call(client *c, int flags) { * If there's a transaction is flags it as dirty, and if the command is EXEC, * it aborts the transaction. * Note: 'reply' is expected to end with \r\n */ -void rejectCommand(client *c, robj *reply) { +void rejectCommand(client *c, robj *reply, int severity = ERR_CRITICAL) { flagTransaction(c); if (c->cmd && c->cmd->proc == execCommand) { execCommandAbort(c, szFromObj(reply)); - } else { + } + else { /* using addReplyError* rather than addReply so that the error can be logged. */ - addReplyErrorObject(c, reply); + addReplyErrorObject(c, reply, severity); } } @@ -4024,7 +4025,7 @@ int processCommand(client *c, int callFlags) { /* Active Replicas can execute read only commands, and optionally write commands */ if (!(g_pserver->loading == LOADING_REPLICATION && g_pserver->fActiveReplica && ((c->cmd->flags & CMD_READONLY) || g_pserver->fWriteDuringActiveLoad))) { - rejectCommand(c, shared.loadingerr); + rejectCommand(c, shared.loadingerr, ERR_WARNING); return C_OK; } } diff --git a/src/server.h b/src/server.h index c6b9a77b5..0da8b84f3 100644 --- a/src/server.h +++ b/src/server.h @@ -543,6 +543,12 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; #define LL_WARNING 3 #define LL_RAW (1<<10) /* Modifier to log without timestamp */ +/* Error severity levels */ +#define ERR_CRITICAL 0 +#define ERR_ERROR 1 +#define ERR_WARNING 2 +#define ERR_NOTICE 3 + /* Supervision options */ #define SUPERVISED_NONE 0 #define SUPERVISED_AUTODETECT 1 @@ -2056,7 +2062,7 @@ void addReplyBulkLongLong(client *c, long long ll); void addReply(client *c, robj_roptr obj); void addReplySds(client *c, sds s); void addReplyBulkSds(client *c, sds s); -void addReplyErrorObject(client *c, robj *err); +void addReplyErrorObject(client *c, robj *err, int severity); void addReplyErrorSds(client *c, sds err); void addReplyError(client *c, const char *err); void addReplyStatus(client *c, const char *status); From a0ae32a9f07e529c9b44e0fab868d7aecd0c61f6 Mon Sep 17 00:00:00 2001 From: christianEQ Date: Thu, 14 Jan 2021 21:59:34 +0000 Subject: [PATCH 06/40] added exception to allow pings during loading Former-commit-id: 8a557e9d711c3aadbc566cf96ca2a18f10e8d5a4 --- src/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/server.cpp b/src/server.cpp index 0b0fdc877..add31b969 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -3853,8 +3853,8 @@ int processCommand(client *c, int callFlags) { (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_DENYOOM)); int is_denystale_command = !(c->cmd->flags & CMD_STALE) || (c->cmd->proc == execCommand && (c->mstate.cmd_inv_flags & CMD_STALE)); - int is_denyloading_command = !(c->cmd->flags & CMD_LOADING) || - (c->cmd->proc == execCommand && (c->mstate.cmd_inv_flags & CMD_LOADING)); + int is_denyloading_command = strcmp(c->cmd->name, "ping") && (!(c->cmd->flags & CMD_LOADING) || + (c->cmd->proc == execCommand && (c->mstate.cmd_inv_flags & CMD_LOADING))); /* Check if the user is authenticated. 
This check is skipped in case * the default user is flagged as "nopass" and is active. */ From 10a745df65254b0f1f7e58690087057ca9ce00de Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 31 Jan 2021 06:33:13 +0000 Subject: [PATCH 07/40] Reduce intermittent failures by giving tests more time Former-commit-id: 69bc7108bd552b18e5cf4ab6ea354ad5cd1e93e9 --- tests/unit/scripting.tcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/scripting.tcl b/tests/unit/scripting.tcl index e3cd2f87f..fe4c5f2e3 100644 --- a/tests/unit/scripting.tcl +++ b/tests/unit/scripting.tcl @@ -448,7 +448,7 @@ start_server {tags {"scripting"}} { set start [clock clicks -milliseconds] $rd eval {redis.call('set',KEYS[1],'y'); for i=1,1500000 do redis.call('ping') end return 'ok'} 1 x $rd flush - after 100 + after 200 catch {r ping} err assert_match {BUSY*} $err $rd read @@ -457,7 +457,7 @@ start_server {tags {"scripting"}} { set start [clock clicks -milliseconds] $rd debug loadaof $rd flush - after 100 + after 200 catch {r ping} err assert_match {LOADING*} $err $rd read From 68ed1292eeb2e94e527befc6fd01023db27cfd4f Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 31 Jan 2021 07:15:55 +0000 Subject: [PATCH 08/40] Fix test failure caused by allowing pings during load Former-commit-id: 569a3d9ff86f4cd74e269391ce1d582e009147ce --- src/server.cpp | 6 +++--- tests/unit/scripting.tcl | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/server.cpp b/src/server.cpp index add31b969..e5e35857d 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -689,7 +689,7 @@ struct redisCommand redisCommandTable[] = { * failure detection, and a loading server is considered to be * not available. */ {"ping",pingCommand,-1, - "ok-stale fast @connection @replication", + "ok-stale ok-loading fast @connection @replication", 0,NULL,0,0,0,0,0,0}, {"echo",echoCommand,2, @@ -3853,8 +3853,8 @@ int processCommand(client *c, int callFlags) { (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_DENYOOM)); int is_denystale_command = !(c->cmd->flags & CMD_STALE) || (c->cmd->proc == execCommand && (c->mstate.cmd_inv_flags & CMD_STALE)); - int is_denyloading_command = strcmp(c->cmd->name, "ping") && (!(c->cmd->flags & CMD_LOADING) || - (c->cmd->proc == execCommand && (c->mstate.cmd_inv_flags & CMD_LOADING))); + int is_denyloading_command = !(c->cmd->flags & CMD_LOADING) || + (c->cmd->proc == execCommand && (c->mstate.cmd_inv_flags & CMD_LOADING)); /* Check if the user is authenticated. This check is skipped in case * the default user is flagged as "nopass" and is active. 
*/
diff --git a/tests/unit/scripting.tcl b/tests/unit/scripting.tcl
index fe4c5f2e3..b41956f9a 100644
--- a/tests/unit/scripting.tcl
+++ b/tests/unit/scripting.tcl
@@ -449,7 +449,7 @@ start_server {tags {"scripting"}} {
         $rd eval {redis.call('set',KEYS[1],'y'); for i=1,1500000 do redis.call('ping') end return 'ok'} 1 x
         $rd flush
         after 200
-        catch {r ping} err
+        catch {r echo "foo"} err
         assert_match {BUSY*} $err
         $rd read
         set elapsed [expr [clock clicks -milliseconds]-$start]
@@ -458,7 +458,7 @@
         $rd debug loadaof
         $rd flush
         after 200
-        catch {r ping} err
+        catch {r echo "foo"} err
         assert_match {LOADING*} $err
         $rd read
         set elapsed [expr [clock clicks -milliseconds]-$start]

From 84576e9b397b626f0bcaa7e9c233ce74b100f748 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Sun, 31 Jan 2021 21:22:23 +0000
Subject: [PATCH 09/40] ARM build fix

Former-commit-id: 5832c25ad1ae3fbe12ee245a96799b4b1a75e4b1
---
 src/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/server.cpp b/src/server.cpp
index e5e35857d..aa142b527 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -5180,7 +5180,7 @@ int linuxMadvFreeForkBugCheck(void) {
     const long map_size = 3 * 4096;
 
     /* Create a memory map that's in our full control (not one used by the allocator). */
-    p = mmap(NULL, map_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+    p = (char*)mmap(NULL, map_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
     serverAssert(p != MAP_FAILED);
 
     q = p + 4096;
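
The "ARM build fix" above is not ARM-specific as such: presumably this madvise fork-bug
check is only compiled into the ARM build, and the real error is a C-vs-C++ rule.
mmap() returns void *, and C++ (unlike C) does not implicitly convert void * to other
object-pointer types. A minimal standalone sketch of the rule (illustrative only, not
KeyDB code):

    #include <sys/mman.h>

    char *map_page(size_t cb) {
        void *pv = mmap(NULL, cb, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
        // char *p = pv;       // accepted by a C compiler, a compile error in C++
        char *p = (char*)pv;   // the explicit cast the patch adds
        return p;
    }
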
From dd0b8af2c569d76fbf03cb1b5b4266a82a66598e Mon Sep 17 00:00:00 2001
From: VivekSainiEQ
Date: Fri, 8 Jan 2021 17:33:14 +0000
Subject: [PATCH 10/40] removed synchronous calls to aePostFunction and changed
 scope of g_fModuleThread in order to prevent module related deadlocks, issue
 #214

Former-commit-id: 3b8d1f7076e4ab2082cd0768abc7b0b6ed4f951a
---
 src/ae.cpp     | 19 ++-----------------
 src/ae.h       |  2 +-
 src/expire.cpp |  2 +-
 src/module.cpp | 19 ++++++++-----------
 src/server.cpp |  2 +-
 5 files changed, 13 insertions(+), 31 deletions(-)

diff --git a/src/ae.cpp b/src/ae.cpp
index 125179c89..4f26df7b3 100644
--- a/src/ae.cpp
+++ b/src/ae.cpp
@@ -286,7 +286,7 @@ int aePostFunction(aeEventLoop *eventLoop, aePostFunctionProc *proc, void *arg)
     return AE_OK;
 }
 
-int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fSynchronous, bool fLock, bool fForceQueue)
+int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fLock, bool fForceQueue)
 {
     if (eventLoop == g_eventLoopThisThread && !fForceQueue)
     {
@@ -299,11 +299,6 @@ int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fSynch
     cmd.pfn = new (MALLOC_LOCAL) std::function<void()>(fn);
     cmd.pctl = nullptr;
     cmd.fLock = fLock;
-    if (fSynchronous)
-    {
-        cmd.pctl = new (MALLOC_LOCAL) aeCommandControl;
-        cmd.pctl->mutexcv.lock();
-    }
     auto size = write(eventLoop->fdCmdWrite, &cmd, sizeof(cmd));
     if (!(!size || size == sizeof(cmd)))
     {
@@ -314,17 +309,7 @@ int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fSynch
     if (size == 0)
         return AE_ERR;
 
-    int ret = AE_OK;
-    if (fSynchronous)
-    {
-        {
-            std::unique_lock<std::mutex> ulock(cmd.pctl->mutexcv, std::adopt_lock);
-            cmd.pctl->cv.wait(ulock);
-            ret = cmd.pctl->rval;
-        }
-        delete cmd.pctl;
-    }
-    return ret;
+    return AE_OK;
 }
 
 aeEventLoop *aeCreateEventLoop(int setsize) {
diff --git a/src/ae.h b/src/ae.h
index e77abb01f..20ef22a5e 100644
--- a/src/ae.h
+++ b/src/ae.h
@@ -135,7 +135,7 @@ aeEventLoop *aeCreateEventLoop(int setsize);
 int aePostFunction(aeEventLoop *eventLoop, aePostFunctionProc *proc, void *arg);
 #ifdef __cplusplus
 } // EXTERN C
-int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fSynchronous = false, bool fLock = true, bool fForceQueue = false);
+int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fLock = true, bool fForceQueue = false);
 extern "C" {
 #endif
 void aeDeleteEventLoop(aeEventLoop *eventLoop);
diff --git a/src/expire.cpp b/src/expire.cpp
index 46f52c19c..1d14f7152 100644
--- a/src/expire.cpp
+++ b/src/expire.cpp
@@ -136,7 +136,7 @@ void activeExpireCycleExpire(redisDb *db, expireEntry &e, long long now) {
             executeCronJobExpireHook(keyCopy, val);
             sdsfree(keyCopy);
             decrRefCount(val);
-        }, false, true /*fLock*/, true /*fForceQueue*/);
+        }, true /*fLock*/, true /*fForceQueue*/);
     }
     return;
diff --git a/src/module.cpp b/src/module.cpp
index c99ab4685..594a172e4 100644
--- a/src/module.cpp
+++ b/src/module.cpp
@@ -5045,7 +5045,7 @@ void RM_FreeThreadSafeContext(RedisModuleCtx *ctx) {
     zfree(ctx);
 }
 
-static bool g_fModuleThread = false;
+__thread bool g_fModuleThread = false;
 /* Acquire the server lock before executing a thread safe API call.
  * This is not needed for `RedisModule_Reply*` calls when there is
  * a blocked client connected to the thread safe context. */
@@ -5673,20 +5673,17 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod
     if (memcmp(ri.key,&key,sizeof(key)) == 0) {
         /* This is the first key, we need to re-install the timer according
          * to the just added event. */
-        aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [&]{
+        aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [period]{
             aeDeleteTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimer);
-        }, true /* synchronous */, false /* fLock */);
-        aeTimer = -1;
+            aeTimer = aeCreateTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,period,moduleTimerHandler,NULL,NULL);
+        });
     }
     raxStop(&ri);
-    }
-
-    /* If we have no main timer (the old one was invalidated, or this is the
-     * first module timer we have), install one. */
-    if (aeTimer == -1) {
-        aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [&]{
+    } else {
+        /* If we have no main timer because this is the first module timer we have, install one.
*/
+        aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [period]{
             aeTimer = aeCreateTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,period,moduleTimerHandler,NULL,NULL);
-        }, true /* synchronous */, false /* fLock */);
+        });
     }
 
     return key;
 }
diff --git a/src/server.cpp b/src/server.cpp
index aa142b527..848b28bcf 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -4087,7 +4087,7 @@ bool client::postFunction(std::function<void(client *)> fn, bool fLock) {
         std::lock_guard<decltype(this->lock)> lock(this->lock);
         fn(this);
         --casyncOpsPending;
-    }, false, fLock) == AE_OK;
+    }, fLock) == AE_OK;
 }
 
 /*================================== Shutdown =============================== */

From 2addbe7e4a32cb3756f783693190edbdac16e127 Mon Sep 17 00:00:00 2001
From: VivekSainiEQ
Date: Fri, 8 Jan 2021 17:45:45 +0000
Subject: [PATCH 11/40] Added fix for scenario where module thread waiting for
 s_mutexModule in acquireGIL can deadlock with module thread waiting for
 s_mutex in releaseGIL

Former-commit-id: 3205373bb378f895824cc1936a6bae663b1abdcc
---
 src/module.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/module.cpp b/src/module.cpp
index 594a172e4..f128d10e3 100644
--- a/src/module.cpp
+++ b/src/module.cpp
@@ -5104,7 +5104,14 @@ void moduleAcquireGIL(int fServerThread) {
     }
     else
     {
-        s_mutexModule.lock();
+        // It is possible that another module thread holds the GIL (and s_mutexModule as a result).
+        // When said thread goes to release the GIL, it will wait for s_mutex, which this thread owns.
+        // This thread is however waiting for the GIL (and s_mutexModule) that the other thread owns.
+        // As a result, a deadlock has occurred.
+        // We release the lock on s_mutex and wait until we are able to safely acquire the GIL
+        // in order to prevent this deadlock from occurring.
+        while (!s_mutexModule.try_lock())
+            s_cv.wait(lock);
        ++s_cAcquisitionsModule;
        fModuleGILWlocked++;
     }
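
The deadlock PATCH 11 fixes is a classic lock-order inversion: the releasing thread
holds s_mutexModule and waits for s_mutex, while the acquiring thread holds s_mutex
and waits for s_mutexModule. The fix never blocks on the second mutex while holding
the first; it spins on try_lock and parks on the condition variable, which drops
s_mutex so the other thread can make progress. A minimal sketch of the pattern with
simplified declarations (the real types of s_mutex, s_mutexModule and s_cv are not
part of this diff):

    #include <mutex>
    #include <condition_variable>

    std::mutex s_mutex;                  // held by every thread entering acquire
    std::recursive_mutex s_mutexModule;  // the module GIL (assumed recursive here)
    std::condition_variable s_cv;        // signalled by the GIL release path

    // Caller already owns 'lock' on s_mutex, as in moduleAcquireGIL.
    void acquireModuleGIL(std::unique_lock<std::mutex> &lock) {
        while (!s_mutexModule.try_lock())  // never block on the GIL while holding s_mutex
            s_cv.wait(lock);               // releases s_mutex until the holder notifies
    }
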
From 662037e3a33ff282b22cf398a73dd4f5f3f7d192 Mon Sep 17 00:00:00 2001
From: VivekSainiEQ
Date: Mon, 11 Jan 2021 19:09:51 +0000
Subject: [PATCH 12/40] Removed more uses of fSynchronous and the use of
 condition variable and mutex on the control struct.

Former-commit-id: 6ab08cc3e1429178b26b55ed7aa8ba85240eb766
---
 src/ae.cpp     | 51 +++-----------------------------------------------
 src/module.cpp |  2 +-
 2 files changed, 4 insertions(+), 49 deletions(-)

diff --git a/src/ae.cpp b/src/ae.cpp
index 4f26df7b3..89b36beaa 100644
--- a/src/ae.cpp
+++ b/src/ae.cpp
@@ -109,13 +109,6 @@ enum class AE_ASYNC_OP
     CreateFileEvent,
 };
 
-struct aeCommandControl
-{
-    std::condition_variable cv;
-    std::atomic<int> rval;
-    std::mutex mutexcv;
-};
-
 struct aeCommand
 {
     AE_ASYNC_OP op;
@@ -128,7 +121,6 @@ struct aeCommand
         std::function<void()> *pfn;
     };
     void *clientData;
-    aeCommandControl *pctl;
 };
 
 void aeProcessCmd(aeEventLoop *eventLoop, int fd, void *, int )
@@ -149,19 +141,7 @@ void aeProcessCmd(aeEventLoop *eventLoop, int fd, void *, int )
         break;
 
     case AE_ASYNC_OP::CreateFileEvent:
-    {
-        if (cmd.pctl != nullptr)
-        {
-            cmd.pctl->mutexcv.lock();
-            std::atomic_store(&cmd.pctl->rval, aeCreateFileEvent(eventLoop, cmd.fd, cmd.mask, cmd.fproc, cmd.clientData));
-            cmd.pctl->cv.notify_all();
-            cmd.pctl->mutexcv.unlock();
-        }
-        else
-        {
-            aeCreateFileEvent(eventLoop, cmd.fd, cmd.mask, cmd.fproc, cmd.clientData);
-        }
-    }
+        aeCreateFileEvent(eventLoop, cmd.fd, cmd.mask, cmd.fproc, cmd.clientData);
         break;
 
     case AE_ASYNC_OP::PostFunction:
@@ -175,19 +155,11 @@ void aeProcessCmd(aeEventLoop *eventLoop, int fd, void *, int )
 
     case AE_ASYNC_OP::PostCppFunction:
     {
-        if (cmd.pctl != nullptr)
-            cmd.pctl->mutexcv.lock();
-
         std::unique_lock<decltype(g_lock)> ulock(g_lock, std::defer_lock);
         if (cmd.fLock)
             ulock.lock();
         (*cmd.pfn)();
-
-        if (cmd.pctl != nullptr)
-        {
-            cmd.pctl->cv.notify_all();
-            cmd.pctl->mutexcv.unlock();
-        }
+
         delete cmd.pfn;
     }
         break;
@@ -226,7 +198,7 @@ ssize_t safe_write(int fd, const void *pv, size_t cb)
 }
 
 int aeCreateRemoteFileEvent(aeEventLoop *eventLoop, int fd, int mask,
-        aeFileProc *proc, void *clientData, int fSynchronous)
+        aeFileProc *proc, void *clientData)
 {
     if (eventLoop == g_eventLoopThisThread)
         return aeCreateFileEvent(eventLoop, fd, mask, proc, clientData);
@@ -239,13 +211,7 @@ int aeCreateRemoteFileEvent(aeEventLoop *eventLoop, int fd, int mask,
     cmd.mask = mask;
     cmd.fproc = proc;
     cmd.clientData = clientData;
-    cmd.pctl = nullptr;
     cmd.fLock = true;
-    if (fSynchronous)
-    {
-        cmd.pctl = new (MALLOC_LOCAL) aeCommandControl();
-        cmd.pctl->mutexcv.lock();
-    }
 
     auto size = safe_write(eventLoop->fdCmdWrite, &cmd, sizeof(cmd));
     if (size != sizeof(cmd))
@@ -254,16 +220,6 @@ int aeCreateRemoteFileEvent(aeEventLoop *eventLoop, int fd, int mask,
         serverAssert(errno == EAGAIN);
         ret = AE_ERR;
     }
-
-    if (fSynchronous)
-    {
-        {
-            std::unique_lock<std::mutex> ulock(cmd.pctl->mutexcv, std::adopt_lock);
-            cmd.pctl->cv.wait(ulock);
-            ret = cmd.pctl->rval;
-        }
-        delete cmd.pctl;
-    }
     return ret;
 }
 
@@ -297,7 +253,6 @@ int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fLock,
     aeCommand cmd = {};
     cmd.op = AE_ASYNC_OP::PostCppFunction;
     cmd.pfn = new (MALLOC_LOCAL) std::function<void()>(fn);
-    cmd.pctl = nullptr;
     cmd.fLock = fLock;
 
     auto size = write(eventLoop->fdCmdWrite, &cmd, sizeof(cmd));
diff --git a/src/module.cpp b/src/module.cpp
index f128d10e3..0430124de 100644
--- a/src/module.cpp
+++ b/src/module.cpp
@@ -5045,7 +5045,7 @@ void RM_FreeThreadSafeContext(RedisModuleCtx *ctx) {
     zfree(ctx);
 }
 
-__thread bool g_fModuleThread = false;
+thread_local bool g_fModuleThread = false;
 /* Acquire the server lock before executing a thread safe API call.
  * This is not needed for `RedisModule_Reply*` calls when there is
 * a blocked client connected to the thread safe context. */
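
With the control block gone, aePostFunction is strictly fire-and-forget: the
std::function is heap-copied, written over the event loop's command pipe, and executed
later on that loop's thread (under the global lock unless fLock is false). Callers can
no longer wait on a result, so anything the function needs must be owned by the
closure. A hedged usage sketch against the signature above (doWork is a hypothetical
callee, not a KeyDB function):

    void postExample(aeEventLoop *elTarget, int payload) {
        // Capture by value: this stack frame is gone by the time the lambda runs.
        aePostFunction(elTarget, [payload]{
            doWork(payload);   // runs later, on elTarget's thread
        });
        // Only queueing is confirmed (AE_OK/AE_ERR); there is no result to read back.
    }
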
From 49132539dd837e7292c3514bd8c1bcdf00312578 Mon Sep 17 00:00:00 2001
From: VivekSainiEQ
Date: Mon, 11 Jan 2021 19:17:28 +0000
Subject: [PATCH 13/40] Updated header file to remove fSynchronous flag

Former-commit-id: e2552ff8a92ea5adf7cea070b48afc573003254d
---
 src/ae.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ae.h b/src/ae.h
index 20ef22a5e..ab7127f8b 100644
--- a/src/ae.h
+++ b/src/ae.h
@@ -144,7 +144,7 @@ int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
         aeFileProc *proc, void *clientData);
 
 int aeCreateRemoteFileEvent(aeEventLoop *eventLoop, int fd, int mask,
-        aeFileProc *proc, void *clientData, int fSynchronous);
+        aeFileProc *proc, void *clientData);
 
 void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask);
 void aeDeleteFileEventAsync(aeEventLoop *eventLoop, int fd, int mask);

From 84dc6401c741eb25e549b4988971de7dff240c0e Mon Sep 17 00:00:00 2001
From: VivekSainiEQ
Date: Tue, 12 Jan 2021 23:22:36 +0000
Subject: [PATCH 14/40] Now post entire timer installation process as one
 function to make it atomic with respect to global locks

Former-commit-id: 53936661c88bd7eac88308afc75c510134a8e044
---
 src/module.cpp | 44 +++++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/src/module.cpp b/src/module.cpp
index 0430124de..a86241c43 100644
--- a/src/module.cpp
+++ b/src/module.cpp
@@ -5668,30 +5668,32 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod
         }
     }
 
-    /* We need to install the main event loop timer if it's not already
-     * installed, or we may need to refresh its period if we just installed
-     * a timer that will expire sooner than any other else (i.e. the timer
-     * we just installed is the first timer in the Timers rax). */
-    if (aeTimer != -1) {
-        raxIterator ri;
-        raxStart(&ri,Timers);
-        raxSeek(&ri,"^",NULL,0);
-        raxNext(&ri);
-        if (memcmp(ri.key,&key,sizeof(key)) == 0) {
-            /* This is the first key, we need to re-install the timer according
-             * to the just added event. */
-            aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [period]{
+    aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [period, key]{
+        /* We need to install the main event loop timer if it's not already
+         * installed, or we may need to refresh its period if we just installed
+         * a timer that will expire sooner than any other else (i.e. the timer
+         * we just installed is the first timer in the Timers rax). */
+        if (aeTimer != -1) {
+            raxIterator ri;
+            raxStart(&ri,Timers);
+            raxSeek(&ri,"^",NULL,0);
+            raxNext(&ri);
+            if (memcmp(ri.key,&key,sizeof(key)) == 0) {
+                /* This is the first key, we need to re-install the timer according
+                 * to the just added event. */
                 aeDeleteTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimer);
-                aeTimer = aeCreateTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,period,moduleTimerHandler,NULL,NULL);
-            });
+                aeTimer = -1;
+            }
+            raxStop(&ri);
         }
-        raxStop(&ri);
-    } else {
-        /* If we have no main timer because this is the first module timer we have, install one. */
-        aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [period]{
+
+        /* If we have no main timer (the old one was invalidated, or this is the
+         * first module timer we have), install one.
*/
+        if (aeTimer == -1) {
+            aeTimer = aeCreateTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,period,moduleTimerHandler,NULL,NULL);
+        }
+    });
+
     return key;
 }

From cf122aa54d67e83436e341e353db854a050cd0c8 Mon Sep 17 00:00:00 2001
From: VivekSainiEQ
Date: Thu, 14 Jan 2021 01:14:09 +0000
Subject: [PATCH 15/40] Changed RM_CreateTimer to only call aePostFunction if
 an existing aePostFunction isn't in flight

Former-commit-id: 9954f5b4a48286d07fb876fd9579801365b6c237
---
 src/module.cpp | 56 ++++++++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/src/module.cpp b/src/module.cpp
index a86241c43..bc1e9f9e6 100644
--- a/src/module.cpp
+++ b/src/module.cpp
@@ -5579,6 +5579,8 @@ void RM_SetClusterFlags(RedisModuleCtx *ctx, uint64_t flags) {
 
 static rax *Timers;     /* The radix tree of all the timers sorted by expire. */
 long long aeTimer = -1; /* Main event loop (ae.c) timer identifier. */
+bool aeTimerSet = false; /* Checks whether the main event loop timer is set
+                            or if an aePostFunction is queued up that will set it */
 
 typedef void (*RedisModuleTimerProc)(RedisModuleCtx *ctx, void *data);
 
@@ -5668,32 +5670,38 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod
             expiretime++;
         }
     }
-
-    aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [period, key]{
-        /* We need to install the main event loop timer if it's not already
-         * installed, or we may need to refresh its period if we just installed
-         * a timer that will expire sooner than any other else (i.e. the timer
-         * we just installed is the first timer in the Timers rax). */
-        if (aeTimer != -1) {
-            raxIterator ri;
-            raxStart(&ri,Timers);
-            raxSeek(&ri,"^",NULL,0);
-            raxNext(&ri);
-            if (memcmp(ri.key,&key,sizeof(key)) == 0) {
-                /* This is the first key, we need to re-install the timer according
-                 * to the just added event. */
-                aeDeleteTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimer);
-                aeTimer = -1;
-            }
-            raxStop(&ri);
-        }
+    /* We need to install the main event loop timer if it's not already
+     * installed, or we may need to refresh its period if we just installed
+     * a timer that will expire sooner than any other else (i.e. the timer
+     * we just installed is the first timer in the Timers rax). */
+    bool isFirstExpiry = false;
+    if (raxSize(Timers) > 0){
+        raxIterator ri;
+        raxStart(&ri, Timers);
+        raxSeek(&ri,"^",NULL,0);
+        raxNext(&ri);
+        if (memcmp(ri.key,&key,sizeof(key)) == 0)
+            /* This is the first key, we need to re-install the timer according
+             * to the just added event. */
+            isFirstExpiry = true;
+        raxStop(&ri);
+    }
 
+    /* Now that either we know that we either need to refresh the period of the
+     * recently installed timer, or that there is no timer to begin with, we must post
+     * a function call to install the main event timer */
+    if (isFirstExpiry || !aeTimerSet){
+        aeTimerSet = true;
+        aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [period, isFirstExpiry]{
+            /* If we deemed that this timer required a reinstall, delete it before proceeding
+             * to the install */
+            if (isFirstExpiry)
+                aeDeleteTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimer);
-        /* If we have no main timer (the old one was invalidated, or this is the
-         * first module timer we have), install one. */
+            /* If we have no main timer (the old one was invalidated, or this is the
+             * first module timer we have), install one. */
+            aeTimer = aeCreateTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,period,moduleTimerHandler,NULL,NULL);
+        });
+    }
 
     return key;
 }
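
Note the remaining gap in PATCH 15: because the post is now asynchronous, the
check-then-post is itself racy, and later callers cannot refresh the period a
queued install has already captured. The next patch converges on a coalescing
shape: keep at most one posted function in flight and let latecomers only update
its shared inputs. A hedged sketch of that shape (s_pending, s_period and
installMainTimer are simplified stand-ins, assuming all callers hold the same
global lock, as RM_CreateTimer does):

    static bool s_pending = false;   // is an install already queued?
    static mstime_t s_period;        // input consumed by the queued install

    void requestTimerInstall(mstime_t period) {
        s_period = period;           // latecomers just refresh the input
        if (s_pending) return;       // one post in flight is enough
        s_pending = true;
        aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, []{
            installMainTimer(s_period);   // hypothetical helper
            s_pending = false;
        });
    }
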
From b0f94bd4b550b67e594ca8f2ced836f130f5bc05 Mon Sep 17 00:00:00 2001
From: VivekSainiEQ
Date: Thu, 14 Jan 2021 21:59:56 +0000
Subject: [PATCH 16/40] Modified RM_CreateTimer to prevent more than one
 function being posted at a time

Former-commit-id: f66e5a9c3d6925c3c6d383fbc43fbfbd56721dad
---
 src/module.cpp | 44 ++++++++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/src/module.cpp b/src/module.cpp
index bc1e9f9e6..dfabebeae 100644
--- a/src/module.cpp
+++ b/src/module.cpp
@@ -5579,8 +5579,10 @@ void RM_SetClusterFlags(RedisModuleCtx *ctx, uint64_t flags) {
 
 static rax *Timers;     /* The radix tree of all the timers sorted by expire. */
 long long aeTimer = -1; /* Main event loop (ae.c) timer identifier. */
-bool aeTimerSet = false; /* Checks whether the main event loop timer is set
-                            or if an aePostFunction is queued up that will set it */
+static bool aeTimerSet = false; /* Checks whether the main event loop timer is set */
+static mstime_t aeTimerPeriod;  /* The period of the aeTimer */
+static bool aeTimerPending = false; /* Keeps track of if there is a aePostFunction in flight
+                                     * that would modify the current main event loop timer */
 
 typedef void (*RedisModuleTimerProc)(RedisModuleCtx *ctx, void *data);
 
@@ -5671,13 +5673,12 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod
             expiretime++;
         }
     }
-
     /* We need to install the main event loop timer if it's not already
      * installed, or we may need to refresh its period if we just installed
      * a timer that will expire sooner than any other else (i.e. the timer
      * we just installed is the first timer in the Timers rax). */
     bool isFirstExpiry = false;
-    if (raxSize(Timers) > 0){
+    if (aeTimerSet){
         raxIterator ri;
         raxStart(&ri, Timers);
         raxSeek(&ri,"^",NULL,0);
         raxNext(&ri);
         if (memcmp(ri.key,&key,sizeof(key)) == 0)
             /* This is the first key, we need to re-install the timer according
              * to the just added event. */
             isFirstExpiry = true;
         raxStop(&ri);
     }
 
     /* Now that either we know that we either need to refresh the period of the
-     * recently installed timer, or that there is no timer to begin with, we must post
-     * a function call to install the main event timer */
+     * recently installed timer, or that there is no timer to begin with, we must post
+     * a function call to install the main event timer. */
     if (isFirstExpiry || !aeTimerSet){
+        /* We set the period for the posted function in a global variable
+         * That is so if a function has been posted but not executed, and another
+         * function with a different period were to be posted, we can just update
+         * the period instead of posting a new function. */
+        aeTimerPeriod = period;
         aeTimerSet = true;
-        aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [period, isFirstExpiry]{
-            /* If we deemed that this timer required a reinstall, delete it before proceeding
-             * to the install */
-            if (isFirstExpiry)
+        if (!aeTimerPending) {
+            /* We should only have one aePostFunction in flight at a time, aeTimerPending
+             * keeps track of that.
*/ + aeTimerPending = true; + aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [isFirstExpiry]{ + /* If we deemed that this timer required a reinstall, delete it before proceeding + * to the install */ + if (isFirstExpiry) + aeDeleteTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimer); + /* If we have no main timer (the old one was invalidated, or this is the + * first module timer we have), install one. */ + aeTimer = aeCreateTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimerPeriod,moduleTimerHandler,NULL,NULL); + aeTimerPending = false; + }); + } + } return key; } From cb5119b756523dbad83ae98e3444e796d124180a Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 31 Jan 2021 06:31:51 +0000 Subject: [PATCH 17/40] Simplify the handling of CreateTimer Former-commit-id: 7b4e25008a352bd425582a3e60b26894826af626 --- src/module.cpp | 76 ++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 42 deletions(-) diff --git a/src/module.cpp b/src/module.cpp index dfabebeae..ed57e7ef4 100644 --- a/src/module.cpp +++ b/src/module.cpp @@ -5579,10 +5579,6 @@ void RM_SetClusterFlags(RedisModuleCtx *ctx, uint64_t flags) { static rax *Timers; /* The radix tree of all the timers sorted by expire. */ long long aeTimer = -1; /* Main event loop (ae.c) timer identifier. */ -static bool aeTimerSet = false; /* Checks whether the main event loop timer is set */ -static mstime_t aeTimerPeriod; /* The period of the aeTimer */ -static bool aeTimerPending = false; /* Keeps track of if there is a aePostFunction in flight - * that would modify the current main event loop timer */ typedef void (*RedisModuleTimerProc)(RedisModuleCtx *ctx, void *data); @@ -5654,6 +5650,9 @@ int moduleTimerHandler(struct aeEventLoop *eventLoop, long long id, void *client * (If the time it takes to execute 'callback' is negligible the two * statements above mean the same) */ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisModuleTimerProc callback, void *data) { + static uint64_t pending_key; + static mstime_t pending_period = -1; + RedisModuleTimer *timer = (RedisModuleTimer*)zmalloc(sizeof(*timer), MALLOC_LOCAL); timer->module = ctx->module; timer->callback = callback; @@ -5671,49 +5670,42 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod expiretime++; } } + + bool fNeedPost = (pending_period < 0); // If pending_period is already set, then a PostFunction is in flight and we don't need to set a new one + if (pending_period < 0 || period < pending_period) { + pending_period = period; + pending_key = key; + } + /* We need to install the main event loop timer if it's not already * installed, or we may need to refresh its period if we just installed * a timer that will expire sooner than any other else (i.e. the timer * we just installed is the first timer in the Timers rax). */ - bool isFirstExpiry = false; - if (aeTimerSet){ - raxIterator ri; - raxStart(&ri, Timers); - raxSeek(&ri,"^",NULL,0); - raxNext(&ri); - if (memcmp(ri.key,&key,sizeof(key)) == 0) - /* This is the first key, we need to re-install the timer according - * to the just added event. */ - isFirstExpiry = true; - raxStop(&ri); - } - - /* Now that either we know that we either need to refresh the period of the - * recently installed timer, or that there is no timer to begin with, we must post - * a function call to install the main event timer. 
*/ - if (isFirstExpiry || !aeTimerSet){ - /* We set the period for the posted function in a global variable - * That is so if a function has been posted but not executed, and another - * function with a different period were to be posted, we can just update - * the period instead of posting a new function. */ - aeTimerPeriod = period; - aeTimerSet = true; - if (!aeTimerPending) { - /* We should only have one aePostFunction in flight at a time, aeTimerPending - * keeps track of that. */ - aeTimerPending = true; - aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [isFirstExpiry]{ - /* If we deemed that this timer required a reinstall, delete it before proceeding - * to the install */ - if (isFirstExpiry) + if (fNeedPost) { + aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, []{ + if (aeTimer != -1) { + raxIterator ri; + raxStart(&ri,Timers); + raxSeek(&ri,"^",NULL,0); + raxNext(&ri); + if (memcmp(ri.key,&pending_key,sizeof(key)) == 0) { + /* This is the first key, we need to re-install the timer according + * to the just added event. */ aeDeleteTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimer); - /* If we have no main timer (the old one was invalidated, or this is the - * first module timer we have), install one. */ - aeTimer = aeCreateTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimerPeriod,moduleTimerHandler,NULL,NULL); - aeTimerPending = false; - }); - } - } + aeTimer = -1; + } + raxStop(&ri); + } + + /* If we have no main timer (the old one was invalidated, or this is the + * first module timer we have), install one. */ + if (aeTimer == -1) { + aeTimer = aeCreateTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,pending_period,moduleTimerHandler,NULL,NULL); + } + + pending_period = -1; + }); + } return key; } From 651abfdca0e3d80965eb5578a7cd23d61d79e95e Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 1 Feb 2021 22:21:49 +0000 Subject: [PATCH 18/40] Module test reliability Former-commit-id: cc827e157a8cdd0224eaa76d6a70e0cccd95f544 --- tests/unit/moduleapi/hooks.tcl | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/moduleapi/hooks.tcl b/tests/unit/moduleapi/hooks.tcl index c4af59bd2..a9387e757 100644 --- a/tests/unit/moduleapi/hooks.tcl +++ b/tests/unit/moduleapi/hooks.tcl @@ -132,6 +132,7 @@ tags "modules" { } $replica replicaof no one + after 300 test {Test role-master hook} { assert_equal [r hooks.event_count role-replica] 1 From aa47e643b0df8db58e809388d9de26f6e79aacb6 Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 7 Feb 2021 23:38:09 +0000 Subject: [PATCH 19/40] Fix memory leak in mvccRestore Former-commit-id: 165333b0fc648c79e66f04d9c8c4a1d0059fe57a --- src/Makefile | 4 ++-- src/cluster.cpp | 13 ++++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/Makefile b/src/Makefile index 3af99b3b4..9e4070d4a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -48,8 +48,8 @@ endif USEASM?=true ifneq ($(strip $(SANITIZE)),) - CFLAGS+= -fsanitize=$(SANITIZE) -DSANITIZE - CXXFLAGS+= -fsanitize=$(SANITIZE) -DSANITIZE + CFLAGS+= -fsanitize=$(SANITIZE) -DSANITIZE -fno-omit-frame-pointer + CXXFLAGS+= -fsanitize=$(SANITIZE) -DSANITIZE -fno-omit-frame-pointer LDFLAGS+= -fsanitize=$(SANITIZE) MALLOC=libc USEASM=false diff --git a/src/cluster.cpp b/src/cluster.cpp index e60807180..8556f1d28 100644 --- a/src/cluster.cpp +++ b/src/cluster.cpp @@ -5066,12 +5066,15 @@ void mvccrestoreCommand(client *c) { setMvccTstamp(obj, mvcc); /* Create the key and set the TTL if any */ - dbMerge(c->db,key,obj,true); 
- if (expire >= 0) { - setExpire(c,c->db,key,nullptr,expire); + if (dbMerge(c->db,key,obj,true)) { + if (expire >= 0) { + setExpire(c,c->db,key,nullptr,expire); + } + signalModifiedKey(c,c->db,key); + notifyKeyspaceEvent(NOTIFY_GENERIC,"restore",key,c->db->id); + } else { + decrRefCount(obj); } - signalModifiedKey(c,c->db,key); - notifyKeyspaceEvent(NOTIFY_GENERIC,"restore",key,c->db->id); addReply(c,shared.ok); g_pserver->dirty++; } From 1c0b603def4b18b286ca33233b0816416cd460ac Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 20 Jan 2021 20:23:56 +0000 Subject: [PATCH 20/40] Initial implementation Former-commit-id: 958f2c00c8efc15dc91fdeec2ff2e2ae2016c124 --- src/dict.cpp | 107 +++++++++++++++++++++++++++++++++++++++++++++++-- src/dict.h | 31 ++++++++++++++ src/server.cpp | 64 ++++++++++++++++++++--------- 3 files changed, 180 insertions(+), 22 deletions(-) diff --git a/src/dict.cpp b/src/dict.cpp index 2c16af119..e0b193312 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -127,6 +127,7 @@ int _dictInit(dict *d, dictType *type, d->privdata = privDataPtr; d->rehashidx = -1; d->iterators = 0; + d->asyncdata = nullptr; return DICT_OK; } @@ -188,13 +189,13 @@ int dictExpand(dict *d, unsigned long size) int dictRehash(dict *d, int n) { int empty_visits = n*10; /* Max number of empty buckets to visit. */ if (!dictIsRehashing(d)) return 0; + if (d->asyncdata) return 0; while(n-- && d->ht[0].used != 0) { dictEntry *de, *nextde; /* Note that rehashidx can't overflow as we are sure there are more * elements because ht[0].used != 0 */ - assert(d->ht[0].size > (unsigned long)d->rehashidx); while(d->ht[0].table[d->rehashidx] == NULL) { d->rehashidx++; if (--empty_visits == 0) return 1; @@ -230,6 +231,95 @@ int dictRehash(dict *d, int n) { return 1; } +dictAsyncRehashCtl *dictRehashAsyncStart(dict *d) { + if (!dictIsRehashing(d)) return 0; + if (d->asyncdata != nullptr) { + /* An async rehash is already in progress, in the future we could give out an additional block */ + return nullptr; + } + d->asyncdata = new dictAsyncRehashCtl(d); + + int empty_visits = dictAsyncRehashCtl::c_targetQueueSize * 10; + + while (d->asyncdata->queue.size() < dictAsyncRehashCtl::c_targetQueueSize && (d->ht[0].used - d->asyncdata->queue.size()) != 0) { + dictEntry *de; + + /* Note that rehashidx can't overflow as we are sure there are more + * elements because ht[0].used != 0 */ + while(d->ht[0].table[d->rehashidx] == NULL) { + d->rehashidx++; + if (--empty_visits == 0) goto LDone; + } + + de = d->ht[0].table[d->rehashidx]; + // We have to queue every node in the bucket, even if we go over our target size + while (de) { + d->asyncdata->queue.emplace_back(de); + de = de->next; + } + d->rehashidx++; + } + +LDone: + if (d->asyncdata->queue.empty()) { + // We didn't find any work to do (can happen if there is a large gap in the hash table) + delete d->asyncdata; + d->asyncdata = nullptr; + } + return d->asyncdata; +} + +void dictRehashAsync(dictAsyncRehashCtl *ctl) { + for (auto &wi : ctl->queue) { + wi.hash = dictHashKey(ctl->dict, dictGetKey(wi.de)); + } +} + +void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) { + dict *d = ctl->dict; + + // Was the dictionary free'd while we were in flight? + if (ctl->release) { + dictRelease(d); + delete ctl; + return; + } + + for (auto &wi : ctl->queue) { + // We need to remove it from the source hash table, and store it in the dest. + // Note that it may have been deleted in the meantime and therefore not exist. 
// In this case it will be in the garbage collection list
+
+        dictEntry **pdePrev = &d->ht[0].table[wi.hash & d->ht[0].sizemask];
+        while (*pdePrev != nullptr && *pdePrev != wi.de) {
+            pdePrev = &((*pdePrev)->next);
+        }
+        if (*pdePrev != nullptr) {
+            assert(*pdePrev == wi.de);
+            *pdePrev = wi.de->next;
+            // Now link it to the dest hash table
+            wi.de->next = d->ht[1].table[wi.hash & d->ht[1].sizemask];
+            d->ht[1].table[wi.hash & d->ht[1].sizemask] = wi.de;
+            d->ht[0].used--;
+            d->ht[1].used++;
+        } else {
+            // In this scenario the de should be in the free list, find and delete it
+            bool found = false;
+            for (dictEntry **pdeGCPrev = &ctl->deGCList; *pdeGCPrev != nullptr && !found; pdeGCPrev = &((*pdeGCPrev)->next)) {
+                if (*pdeGCPrev == wi.de) {
+                    *pdeGCPrev = wi.de->next;
+                    dictFreeUnlinkedEntry(d, wi.de);
+                    found = true;
+                }
+            }
+            // Note: We may not find it if the entry was just unlinked but reused elsewhere
+        }
+    }
+
+    delete ctl;
+    d->asyncdata = nullptr;
+}
+
 long long timeInMilliseconds(void) {
     struct timeval tv;
 
@@ -385,9 +475,14 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) {
                 else
                     d->ht[table].table[idx] = he->next;
                 if (!nofree) {
-                    dictFreeKey(d, he);
-                    dictFreeVal(d, he);
-                    zfree(he);
+                    if (d->asyncdata != nullptr) {
+                        he->next = d->asyncdata->deGCList;
+                        d->asyncdata->deGCList = he->next;
+                    } else {
+                        dictFreeKey(d, he);
+                        dictFreeVal(d, he);
+                        zfree(he);
+                    }
                 }
                 d->ht[table].used--;
                 return he;
@@ -470,6 +565,10 @@ int _dictClear(dict *d, dictht *ht, void(callback)(void *)) {
 
 /* Clear & Release the hash table */
 void dictRelease(dict *d)
 {
+    if (d->asyncdata) {
+        d->asyncdata->release = true;
+        return;
+    }
     _dictClear(d,&d->ht[0],NULL);
     _dictClear(d,&d->ht[1],NULL);
     zfree(d);
diff --git a/src/dict.h b/src/dict.h
index 03c39c7ff..42d097f9b 100644
--- a/src/dict.h
+++ b/src/dict.h
@@ -36,6 +36,7 @@
 #include <stdint.h>
 
 #ifdef __cplusplus
+#include <vector>
 extern "C" {
 #endif
 
@@ -81,12 +82,37 @@ typedef struct dictht {
     unsigned long used;
 } dictht;
 
+#ifdef __cplusplus
+struct dictAsyncRehashCtl {
+    struct workItem {
+        dictEntry *de;
+        uint64_t hash;
+        workItem(dictEntry *de) {
+            this->de = de;
+        }
+    };
+
+    static const int c_targetQueueSize = 100;
+    dictEntry *deGCList = nullptr;
+    struct dict *dict = nullptr;
+    std::vector<workItem> queue;
+    bool release = false;
+
+    dictAsyncRehashCtl(struct dict *d) : dict(d) {
+        queue.reserve(c_targetQueueSize);
+    }
+};
+#else
+struct dictAsyncRehashCtl;
+#endif
+
 typedef struct dict {
     dictType *type;
     void *privdata;
     dictht ht[2];
     long rehashidx; /* rehashing not in progress if rehashidx == -1 */
     unsigned long iterators; /* number of iterators currently running */
+    dictAsyncRehashCtl *asyncdata;
 } dict;
 
 /* If safe is set to 1 this is a safe iterator, that means, you can call
@@ -190,6 +216,11 @@ unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, dictScanB
 uint64_t dictGetHash(dict *d, const void *key);
 dictEntry **dictFindEntryRefByPtrAndHash(dict *d, const void *oldptr, uint64_t hash);
 
+/* Async Rehash Functions */
+dictAsyncRehashCtl *dictRehashAsyncStart(dict *d);
+void dictRehashAsync(dictAsyncRehashCtl *ctl);
+void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl);
+
 /* Hash table types */
 extern dictType dictTypeHeapStringCopyKey;
 extern dictType dictTypeHeapStrings;
diff --git a/src/server.cpp b/src/server.cpp
index 848b28bcf..fdb8885b1 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -1775,19 +1775,23 @@ bool expireOwnKeys()
 /* This function handles 'background' operations we are required to do
  * incrementally in
Redis databases, such as active key expiring, resizing, * rehashing. */ -void databasesCron(void) { - /* Expire keys by random sampling. Not required for slaves - * as master will synthesize DELs for us. */ - if (g_pserver->active_expire_enabled) { - if (expireOwnKeys()) { - activeExpireCycle(ACTIVE_EXPIRE_CYCLE_SLOW); - } else { - expireSlaveKeys(); - } - } +dictAsyncRehashCtl* databasesCron(bool fMainThread) { + dictAsyncRehashCtl *ctl = nullptr; - /* Defrag keys gradually. */ - activeDefragCycle(); + if (fMainThread) { + /* Expire keys by random sampling. Not required for slaves + * as master will synthesize DELs for us. */ + if (g_pserver->active_expire_enabled) { + if (expireOwnKeys()) { + activeExpireCycle(ACTIVE_EXPIRE_CYCLE_SLOW); + } else { + expireSlaveKeys(); + } + } + + /* Defrag keys gradually. */ + activeDefragCycle(); + } /* Perform hash tables rehashing if needed, but only if there are no * other processes saving the DB on disk. Otherwise rehashing is bad @@ -1804,16 +1808,21 @@ void databasesCron(void) { /* Don't test more DBs than we have. */ if (dbs_per_call > cserver.dbnum) dbs_per_call = cserver.dbnum; - /* Resize */ - for (j = 0; j < dbs_per_call; j++) { - tryResizeHashTables(resize_db % cserver.dbnum); - resize_db++; + if (fMainThread) { + /* Resize */ + for (j = 0; j < dbs_per_call; j++) { + tryResizeHashTables(resize_db % cserver.dbnum); + resize_db++; + } } /* Rehash */ if (g_pserver->activerehashing) { for (j = 0; j < dbs_per_call; j++) { - int work_done = incrementallyRehash(rehash_db); + ctl = dictRehashAsyncStart(g_pserver->db[rehash_db].pdict); + if (ctl) + break; + int work_done = fMainThread && incrementallyRehash(rehash_db); if (work_done) { /* If the function did some work, stop here, we'll do * more at the next cron loop. */ @@ -1826,6 +1835,8 @@ void databasesCron(void) { } } } + + return ctl; } /* We take a cached value of the unix time in the global state because with @@ -2065,7 +2076,14 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { clientsCron(IDX_EVENT_LOOP_MAIN); /* Handle background operations on Redis databases. */ - databasesCron(); + auto asyncRehash = databasesCron(true /* fMainThread */); + + if (asyncRehash) { + aeReleaseLock(); + dictRehashAsync(asyncRehash); + aeAcquireLock(); + dictCompleteRehashAsync(asyncRehash); + } /* Start a scheduled AOF rewrite if this was requested by the user while * a BGSAVE was in progress. */ @@ -2215,6 +2233,16 @@ int serverCronLite(struct aeEventLoop *eventLoop, long long id, void *clientData processUnblockedClients(iel); } + /* Handle background operations on Redis databases. 
*/ + auto asyncRehash = databasesCron(false /* fMainThread */); + + if (asyncRehash) { + aeReleaseLock(); + dictRehashAsync(asyncRehash); + aeAcquireLock(); + dictCompleteRehashAsync(asyncRehash); + } + /* Unpause clients if enough time has elapsed */ unpauseClientsIfNecessary(); From 6379ae6c7f861e884bb21bcf01ae4d1b597d72ab Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 21 Jan 2021 17:18:54 +0000 Subject: [PATCH 21/40] Fix races in free Former-commit-id: 3881d5c1f22f022855c0abcf7b0c2070e204c0a3 --- src/dict.cpp | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/src/dict.cpp b/src/dict.cpp index e0b193312..966abc255 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -277,6 +277,7 @@ void dictRehashAsync(dictAsyncRehashCtl *ctl) { void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) { dict *d = ctl->dict; + d->asyncdata = nullptr; // dictFreeUnlinkedEntry won't actually free if we don't NULL this // Was the dictionary free'd while we were in flight? if (ctl->release) { @@ -315,9 +316,9 @@ void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) { // Note: We may not find it if the entry was just unlinked but reused elsewhere } } + assert(ctl->deGCList == nullptr); delete ctl; - d->asyncdata = nullptr; } long long timeInMilliseconds(void) { @@ -475,7 +476,7 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { else d->ht[table].table[idx] = he->next; if (!nofree) { - if (d->asyncdata != nullptr) { + if (table == 0 && d->asyncdata != nullptr) { he->next = d->asyncdata->deGCList; d->asyncdata->deGCList = he->next; } else { @@ -530,9 +531,14 @@ dictEntry *dictUnlink(dict *ht, const void *key) { * to dictUnlink(). It's safe to call this function with 'he' = NULL. */ void dictFreeUnlinkedEntry(dict *d, dictEntry *he) { if (he == NULL) return; - dictFreeKey(d, he); - dictFreeVal(d, he); - zfree(he); + if (d->asyncdata) { + he->next = d->asyncdata->deGCList; + d->asyncdata->deGCList = he; + } else { + dictFreeKey(d, he); + dictFreeVal(d, he); + zfree(he); + } } /* Destroy an entire dictionary */ @@ -548,9 +554,14 @@ int _dictClear(dict *d, dictht *ht, void(callback)(void *)) { if ((he = ht->table[i]) == NULL) continue; while(he) { nextHe = he->next; - dictFreeKey(d, he); - dictFreeVal(d, he); - zfree(he); + if (i == 0 && d->asyncdata) { + he->next = d->asyncdata->deGCList; + d->asyncdata->deGCList = he; + } else { + dictFreeKey(d, he); + dictFreeVal(d, he); + zfree(he); + } ht->used--; he = nextHe; } From 071ecb801a728a37c105e9fd3ba62681c704f525 Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 21 Jan 2021 20:27:50 +0000 Subject: [PATCH 22/40] Allow multiple threads to rehash simultaneously Former-commit-id: 5a2cc90786dfd1bfd341dbf5713bcde01f0cfff3 --- src/dict.cpp | 53 +++++++++++++++++++++++++++++++--------------------- src/dict.h | 5 +++-- 2 files changed, 35 insertions(+), 23 deletions(-) diff --git a/src/dict.cpp b/src/dict.cpp index 966abc255..37e6998c4 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -233,11 +233,7 @@ int dictRehash(dict *d, int n) { dictAsyncRehashCtl *dictRehashAsyncStart(dict *d) { if (!dictIsRehashing(d)) return 0; - if (d->asyncdata != nullptr) { - /* An async rehash is already in progress, in the future we could give out an additional block */ - return nullptr; - } - d->asyncdata = new dictAsyncRehashCtl(d); + d->asyncdata = new dictAsyncRehashCtl(d, d->asyncdata); int empty_visits = dictAsyncRehashCtl::c_targetQueueSize * 10; @@ -263,8 +259,10 @@ dictAsyncRehashCtl *dictRehashAsyncStart(dict *d) { 
LDone:
     if (d->asyncdata->queue.empty()) {
         // We didn't find any work to do (can happen if there is a large gap in the hash table)
-        delete d->asyncdata;
-        d->asyncdata = nullptr;
+        auto asyncT = d->asyncdata;
+        d->asyncdata = d->asyncdata->next;
+        delete asyncT;
+        return nullptr;
     }
     return d->asyncdata;
 }
@@ -277,7 +275,15 @@ void dictRehashAsync(dictAsyncRehashCtl *ctl) {
 
 void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) {
     dict *d = ctl->dict;
-    d->asyncdata = nullptr; // dictFreeUnlinkedEntry won't actually free if we don't NULL this
+    // Unlink ourselves
+    dictAsyncRehashCtl **pprev = &d->asyncdata;
+    while (*pprev != nullptr) {
+        if (*pprev == ctl) {
+            *pprev = ctl->next;
+            break;
+        }
+        pprev = &((*pprev)->next);
+    }
 
     // Was the dictionary free'd while we were in flight?
     if (ctl->release) {
@@ -295,7 +301,7 @@ void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) {
         while (*pdePrev != nullptr && *pdePrev != wi.de) {
             pdePrev = &((*pdePrev)->next);
         }
-        if (*pdePrev != nullptr) {
+        if (*pdePrev != nullptr) { // The element may be NULL if it's in the GC list
             assert(*pdePrev == wi.de);
             *pdePrev = wi.de->next;
             // Now link it to the dest hash table
@@ -303,20 +309,25 @@ void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) {
             d->ht[1].table[wi.hash & d->ht[1].sizemask] = wi.de;
             d->ht[0].used--;
             d->ht[1].used++;
-        } else {
-            // In this scenario the de should be in the free list, find and delete it
-            bool found = false;
-            for (dictEntry **pdeGCPrev = &ctl->deGCList; *pdeGCPrev != nullptr && !found; pdeGCPrev = &((*pdeGCPrev)->next)) {
-                if (*pdeGCPrev == wi.de) {
-                    *pdeGCPrev = wi.de->next;
-                    dictFreeUnlinkedEntry(d, wi.de);
-                    found = true;
-                }
-            }
-            // Note: We may not find it if the entry was just unlinked but reused elsewhere
         }
     }
-    assert(ctl->deGCList == nullptr);
+
+    // Now free anything in the GC list
+    while (ctl->deGCList != nullptr) {
+        auto next = ctl->deGCList->next;
+        dictFreeKey(d, ctl->deGCList);
+        dictFreeVal(d, ctl->deGCList);
+        zfree(ctl->deGCList);
+        ctl->deGCList = next;
+    }
+
+    /* Check if we already rehashed the whole table... */
+    if (d->ht[0].used == 0 && d->asyncdata == nullptr) {
+        zfree(d->ht[0].table);
+        d->ht[0] = d->ht[1];
+        _dictReset(&d->ht[1]);
+        d->rehashidx = -1;
+    }
 
     delete ctl;
 }
diff --git a/src/dict.h b/src/dict.h
index 42d097f9b..b5cac21f9 100644
--- a/src/dict.h
+++ b/src/dict.h
@@ -92,13 +92,14 @@
         }
     };
 
-    static const int c_targetQueueSize = 100;
+    static const int c_targetQueueSize = 512;
     dictEntry *deGCList = nullptr;
     struct dict *dict = nullptr;
     std::vector<workItem> queue;
     bool release = false;
+    dictAsyncRehashCtl *next = nullptr;
 
-    dictAsyncRehashCtl(struct dict *d) : dict(d) {
+    dictAsyncRehashCtl(struct dict *d, dictAsyncRehashCtl *next) : dict(d), next(next) {
         queue.reserve(c_targetQueueSize);
     }
 };
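
Taken together, the three dict functions split one rehash step across threads:
dictRehashAsyncStart snapshots a batch of buckets under the global lock,
dictRehashAsync computes hashes with no lock held, and dictCompleteRehashAsync
relinks the entries (and frees anything unlinked mid-flight) back under the lock.
A hedged usage sketch mirroring how the earlier server.cpp hunks drive it from the
cron (rehashTick is a hypothetical wrapper, not a KeyDB function):

    void rehashTick(dict *d) {
        dictAsyncRehashCtl *ctl = dictRehashAsyncStart(d);  // lock held: queue ~512 entries
        if (ctl == nullptr) return;                         // nothing to do this tick
        aeReleaseLock();
        dictRehashAsync(ctl);          // lock-free: hash computation only
        aeAcquireLock();
        dictCompleteRehashAsync(ctl);  // lock held: splice into ht[1], free the GC list
    }
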
From 495dff1e8cb1388f745d120b92b4bca633a71b59 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Mon, 25 Jan 2021 16:29:47 +0000
Subject: [PATCH 23/40] Implement rehash during spinlock

Former-commit-id: f68a26381a35b27948046d46c2c7bcfbdc21143d
---
 src/ae.cpp           |  8 ++++-
 src/ae.h             |  1 +
 src/dict.cpp         | 67 +++++++++++++++++++++++++++-----------------
 src/dict.h           |  4 ++-
 src/fastlock.cpp     | 11 ++++--
 src/fastlock.h       | 12 +++++--
 src/fastlock_x64.asm | 52 +++++++++++++++++++++++++++++---
 src/server.cpp       | 81 +++++++++++++++++++++++++-------------------
 src/server.h         |  1 +
 9 files changed, 167 insertions(+), 70 deletions(-)

diff --git a/src/ae.cpp b/src/ae.cpp
index 89b36beaa..16a2ddb64 100644
--- a/src/ae.cpp
+++ b/src/ae.cpp
@@ -842,9 +842,15 @@ void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *aftersleep,
     eventLoop->aftersleepFlags = flags;
 }
 
+thread_local spin_worker tl_worker = nullptr;
+void setAeLockSetThreadSpinWorker(spin_worker worker)
+{
+    tl_worker = worker;
+}
+
 void aeAcquireLock()
 {
-    g_lock.lock();
+    g_lock.lock(tl_worker);
 }
 
 int aeTryAcquireLock(int fWeak)
diff --git a/src/ae.h b/src/ae.h
index ab7127f8b..3a240877d 100644
--- a/src/ae.h
+++ b/src/ae.h
@@ -164,6 +164,7 @@ aeEventLoop *aeGetCurrentEventLoop();
 int aeResizeSetSize(aeEventLoop *eventLoop, int setsize);
 void aeSetDontWait(aeEventLoop *eventLoop, int noWait);
 
+void setAeLockSetThreadSpinWorker(spin_worker worker);
 void aeAcquireLock();
 int aeTryAcquireLock(int fWeak);
 void aeReleaseLock();
diff --git a/src/dict.cpp b/src/dict.cpp
index 37e6998c4..e882fff2a 100644
--- a/src/dict.cpp
+++ b/src/dict.cpp
@@ -231,13 +231,14 @@ int dictRehash(dict *d, int n) {
     return 1;
 }
 
-dictAsyncRehashCtl *dictRehashAsyncStart(dict *d) {
+dictAsyncRehashCtl *dictRehashAsyncStart(dict *d, int buckets) {
     if (!dictIsRehashing(d)) return 0;
+
     d->asyncdata = new dictAsyncRehashCtl(d, d->asyncdata);
 
-    int empty_visits = dictAsyncRehashCtl::c_targetQueueSize * 10;
+    int empty_visits = buckets * 10;
 
-    while (d->asyncdata->queue.size() < dictAsyncRehashCtl::c_targetQueueSize && (d->ht[0].used - d->asyncdata->queue.size()) != 0) {
+    while (d->asyncdata->queue.size() < (size_t)buckets && d->rehashidx < d->ht[0].size) {
         dictEntry *de;
 
         /* Note that rehashidx can't overflow as we are sure there are more
          * elements because ht[0].used != 0 */
         while(d->ht[0].table[d->rehashidx] == NULL) {
             d->rehashidx++;
             if (--empty_visits == 0) goto LDone;
+            if (d->rehashidx >= d->ht[0].size) goto LDone;
         }
 
         de = d->ht[0].table[d->rehashidx];
         // We have to queue every node in the bucket, even if we go over our target size
-        while (de) {
+        while (de != nullptr) {
             d->asyncdata->queue.emplace_back(de);
             de = de->next;
         }
         d->rehashidx++;
     }
 
 void dictRehashAsync(dictAsyncRehashCtl *ctl) {
for (size_t idx = ctl->hashIdx; idx < ctl->queue.size(); ++idx) { + auto &wi = ctl->queue[idx]; wi.hash = dictHashKey(ctl->dict, dictGetKey(wi.de)); } + ctl->hashIdx = ctl->queue.size(); +} + +bool dictRehashSomeAsync(dictAsyncRehashCtl *ctl, size_t hashes) { + size_t max = std::min(ctl->hashIdx + hashes, ctl->queue.size()); + for (; ctl->hashIdx < max; ++ctl->hashIdx) { + auto &wi = ctl->queue[ctl->hashIdx]; + wi.hash = dictHashKey(ctl->dict, dictGetKey(wi.de)); + } + + return ctl->hashIdx < ctl->queue.size(); } void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) { @@ -287,28 +301,33 @@ void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) { // Was the dictionary free'd while we were in flight? if (ctl->release) { - dictRelease(d); + if (d->asyncdata != nullptr) + d->asyncdata->release = true; + else + dictRelease(d); delete ctl; return; } - for (auto &wi : ctl->queue) { - // We need to remove it from the source hash table, and store it in the dest. - // Note that it may have been deleted in the meantime and therefore not exist. - // In this case it will be in the garbage collection list - - dictEntry **pdePrev = &d->ht[0].table[wi.hash & d->ht[0].sizemask]; - while (*pdePrev != nullptr && *pdePrev != wi.de) { - pdePrev = &((*pdePrev)->next); - } - if (*pdePrev != nullptr) { // The element may be NULL if its in the GC list - assert(*pdePrev == wi.de); - *pdePrev = wi.de->next; - // Now link it to the dest hash table - wi.de->next = d->ht[1].table[wi.hash & d->ht[1].sizemask]; - d->ht[1].table[wi.hash & d->ht[1].sizemask] = wi.de; - d->ht[0].used--; - d->ht[1].used++; + if (d->ht[0].table != nullptr) { // can be null if we're cleared during the rehash + for (auto &wi : ctl->queue) { + // We need to remove it from the source hash table, and store it in the dest. + // Note that it may have been deleted in the meantime and therefore not exist. 
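A note on the pattern emerging here: patch 23's async rehash is a two-phase split. Hashing, which only reads keys, runs off-thread (or inside a spin loop) without the global lock via dictRehashSomeAsync; relinking buckets into the destination table happens later, under the lock, in dictCompleteRehashAsync. A minimal self-contained sketch of the first phase, with placeholder types standing in for dictEntry and the hash function:

    #include <algorithm>
    #include <vector>

    struct WorkItem { void *de; unsigned long long hash; };

    struct AsyncBatch {
        std::vector<WorkItem> queue;   // snapshot of entries to migrate
        size_t hashIdx = 0;            // first item not yet hashed

        // Phase 1 (no lock needed): compute up to n hashes; returns true
        // while work remains, mirroring dictRehashSomeAsync. Phase 2
        // relinks the buckets under the global lock.
        template <typename HashFn>
        bool hashSome(size_t n, HashFn hashKey) {
            size_t max = std::min(hashIdx + n, queue.size());
            for (; hashIdx < max; ++hashIdx)
                queue[hashIdx].hash = hashKey(queue[hashIdx].de);
            return hashIdx < queue.size();
        }
    };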
+ // In this case it will be in the garbage collection list + + dictEntry **pdePrev = &d->ht[0].table[wi.hash & d->ht[0].sizemask]; + while (*pdePrev != nullptr && *pdePrev != wi.de) { + pdePrev = &((*pdePrev)->next); + } + if (*pdePrev != nullptr) { // The element may be NULL if its in the GC list + assert(*pdePrev == wi.de); + *pdePrev = wi.de->next; + // Now link it to the dest hash table + wi.de->next = d->ht[1].table[wi.hash & d->ht[1].sizemask]; + d->ht[1].table[wi.hash & d->ht[1].sizemask] = wi.de; + d->ht[0].used--; + d->ht[1].used++; + } } } @@ -565,7 +584,7 @@ int _dictClear(dict *d, dictht *ht, void(callback)(void *)) { if ((he = ht->table[i]) == NULL) continue; while(he) { nextHe = he->next; - if (i == 0 && d->asyncdata) { + if (d->asyncdata) { he->next = d->asyncdata->deGCList; d->asyncdata->deGCList = he; } else { diff --git a/src/dict.h b/src/dict.h index b5cac21f9..2f6c9677a 100644 --- a/src/dict.h +++ b/src/dict.h @@ -96,6 +96,7 @@ struct dictAsyncRehashCtl { dictEntry *deGCList = nullptr; struct dict *dict = nullptr; std::vector queue; + size_t hashIdx = 0; bool release = false; dictAsyncRehashCtl *next = nullptr; @@ -218,8 +219,9 @@ uint64_t dictGetHash(dict *d, const void *key); dictEntry **dictFindEntryRefByPtrAndHash(dict *d, const void *oldptr, uint64_t hash); /* Async Rehash Functions */ -dictAsyncRehashCtl *dictRehashAsyncStart(dict *d); +dictAsyncRehashCtl *dictRehashAsyncStart(dict *d, int buckets = dictAsyncRehashCtl::c_targetQueueSize); void dictRehashAsync(dictAsyncRehashCtl *ctl); +bool dictRehashSomeAsync(dictAsyncRehashCtl *ctl, size_t hashes); void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl); /* Hash table types */ diff --git a/src/fastlock.cpp b/src/fastlock.cpp index 4cada72cc..61cd1752e 100644 --- a/src/fastlock.cpp +++ b/src/fastlock.cpp @@ -338,7 +338,7 @@ extern "C" void fastlock_init(struct fastlock *lock, const char *name) } #ifndef ASM_SPINLOCK -extern "C" void fastlock_lock(struct fastlock *lock) +extern "C" void fastlock_lock(struct fastlock *lock, spin_worker worker) { int pidOwner; __atomic_load(&lock->m_pidOwner, &pidOwner, __ATOMIC_ACQUIRE); @@ -362,11 +362,16 @@ extern "C" void fastlock_lock(struct fastlock *lock) if ((ticketT.u & 0xffff) == myticket) break; + if (worker != nullptr) { + worker(); + } else { #if defined(__i386__) || defined(__amd64__) - __asm__ __volatile__ ("pause"); + __asm__ __volatile__ ("pause"); #elif defined(__aarch64__) - __asm__ __volatile__ ("yield"); + __asm__ __volatile__ ("yield"); #endif + } + if ((++cloops % loopLimit) == 0) { fastlock_sleep(lock, tid, ticketT.u, myticket); diff --git a/src/fastlock.h b/src/fastlock.h index 0809c8bcd..ede6d4e16 100644 --- a/src/fastlock.h +++ b/src/fastlock.h @@ -6,10 +6,16 @@ extern "C" { #endif +typedef int (*spin_worker)(); + /* Begin C API */ struct fastlock; void fastlock_init(struct fastlock *lock, const char *name); -void fastlock_lock(struct fastlock *lock); +#ifdef __cplusplus +void fastlock_lock(struct fastlock *lock, spin_worker worker = nullptr); +#else +void fastlock_lock(struct fastlock *lock, spin_worker worker); +#endif int fastlock_trylock(struct fastlock *lock, int fWeak); void fastlock_unlock(struct fastlock *lock); void fastlock_free(struct fastlock *lock); @@ -56,9 +62,9 @@ struct fastlock fastlock_init(this, name); } - inline void lock() + inline void lock(spin_worker worker = nullptr) { - fastlock_lock(this); + fastlock_lock(this, worker); } inline bool try_lock(bool fWeak = false) diff --git a/src/fastlock_x64.asm b/src/fastlock_x64.asm index 
26c302434..5fa8e0ade 100644 --- a/src/fastlock_x64.asm +++ b/src/fastlock_x64.asm @@ -22,14 +22,22 @@ fastlock_lock: # [rdi+64] ... # uint16_t active # uint16_t avail + # + # RSI points to a spin function to call, or NULL # First get our TID and put it in ecx - push rdi # we need our struct pointer (also balance the stack for the call) - .cfi_adjust_cfa_offset 8 + sub rsp, 24 # We only use 16 bytes, but we also need the stack aligned + .cfi_adjust_cfa_offset 24 + mov [rsp], rdi # we need our struct pointer (also balance the stack for the call) + mov [rsp+8], rsi # backup the spin function + call gettid # get our thread ID (TLS is nasty in ASM so don't bother inlining) mov esi, eax # back it up in esi - pop rdi # get our pointer back - .cfi_adjust_cfa_offset -8 + + mov rdi, [rsp] # Restore spin struct + mov r8, [rsp+8] # restore the function (in a different register) + add rsp, 24 + .cfi_adjust_cfa_offset -24 cmp [rdi], esi # Is the TID we got back the owner of the lock? je .LLocked # Don't spin in that case @@ -47,6 +55,8 @@ fastlock_lock: # ax now contains the ticket # OK Start the wait loop xor ecx, ecx + test r8, r8 + jnz .LLoopFunction .ALIGN 16 .LLoop: mov edx, [rdi+64] @@ -83,6 +93,40 @@ fastlock_lock: mov [rdi], esi # lock->m_pidOwner = gettid() inc dword ptr [rdi+4] # lock->m_depth++ ret + +.LLoopFunction: + sub rsp, 40 + .cfi_adjust_cfa_offset 40 + xor ecx, ecx + mov [rsp], rcx + mov [rsp+8], r8 + mov [rsp+16], rdi + mov [rsp+24], rsi + mov [rsp+32], eax +.LLoopFunctionCore: + mov edx, [rdi+64] + cmp dx, ax + je .LExitLoopFunction + mov r8, [rsp+8] + call r8 + test eax, eax + jz .LExitLoopFunctionForNormal + mov eax, [rsp+32] # restore clobbered eax + mov rdi, [rsp+16] + jmp .LLoopFunctionCore +.LExitLoopFunction: + mov rsi, [rsp+24] + add rsp, 40 + .cfi_adjust_cfa_offset -40 + jmp .LLocked +.LExitLoopFunctionForNormal: + xor ecx, ecx + mov rdi, [rsp+16] + mov rsi, [rsp+24] + mov eax, [rsp+32] + add rsp, 40 + .cfi_adjust_cfa_offset 40 + jmp .LLoop .cfi_endproc .ALIGN 16 diff --git a/src/server.cpp b/src/server.cpp index fdb8885b1..f1dd30d0d 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1508,13 +1508,12 @@ void tryResizeHashTables(int dbid) { * table will use two tables for a long time. So we try to use 1 millisecond * of CPU time at every call of this function to perform some rehashing. * - * The function returns 1 if some rehashing was performed, otherwise 0 + * The function returns the number of rehashes if some rehashing was performed, otherwise 0 * is returned. */ int incrementallyRehash(int dbid) { /* Keys dictionary */ if (dictIsRehashing(g_pserver->db[dbid].dict)) { - dictRehashMilliseconds(g_pserver->db[dbid].dict,1); - return 1; /* already used our millisecond for this loop... */ + return dictRehashMilliseconds(g_pserver->db[dbid].dict,1); } return 0; } @@ -1772,12 +1771,22 @@ bool expireOwnKeys() return false; } +int hash_spin_worker() { + auto ctl = serverTL->rehashCtl; + if (ctl->hashIdx < ctl->queue.size()) { + ctl->queue[ctl->hashIdx].hash = dictHashKey(ctl->dict, dictGetKey(ctl->queue[ctl->hashIdx].de)); + ctl->hashIdx++; + return true; + } else { + return false; + } +} + /* This function handles 'background' operations we are required to do * incrementally in Redis databases, such as active key expiring, resizing, * rehashing. */ -dictAsyncRehashCtl* databasesCron(bool fMainThread) { - dictAsyncRehashCtl *ctl = nullptr; - +void databasesCron(bool fMainThread) { + static int rehashes_per_ms = 0; if (fMainThread) { /* Expire keys by random sampling. 
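The .LLoopFunction path is the interesting part of this patch: while a thread waits for its ticket to come up, it calls the supplied spin worker (which hashes rehash-queue entries) instead of burning cycles on pause, and it degrades to the normal wait loop once the worker reports no work left. A C++ sketch of that shape against a toy ticket lock; this is illustrative only, since the real fastlock also sleeps on a futex via fastlock_sleep after a loop limit:

    #include <atomic>

    typedef int (*spin_worker)();   // returns nonzero while it has work

    struct TicketLock {             // toy stand-in for struct fastlock
        std::atomic<unsigned> avail{0}, active{0};

        void lock(spin_worker worker = nullptr) {
            unsigned my = avail.fetch_add(1, std::memory_order_relaxed);
            while (active.load(std::memory_order_acquire) != my) {
                // Do useful work while we wait; once the worker runs dry,
                // fall back to a plain spin (real code pauses/sleeps).
                if (worker == nullptr || !worker())
                    while (active.load(std::memory_order_acquire) != my) {}
            }
        }
        void unlock() { active.fetch_add(1, std::memory_order_release); }
    };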
Not required for slaves * as master will synthesize DELs for us. */ @@ -1819,24 +1828,42 @@ dictAsyncRehashCtl* databasesCron(bool fMainThread) { /* Rehash */ if (g_pserver->activerehashing) { for (j = 0; j < dbs_per_call; j++) { - ctl = dictRehashAsyncStart(g_pserver->db[rehash_db].pdict); - if (ctl) + if (serverTL->rehashCtl != nullptr) { + if (dictRehashSomeAsync(serverTL->rehashCtl, 5)) { + break; + } else { + dictCompleteRehashAsync(serverTL->rehashCtl); + serverTL->rehashCtl = nullptr; + } + } + if (rehashes_per_ms > 0) + serverTL->rehashCtl = dictRehashAsyncStart(g_pserver->db[rehash_db].dict, rehashes_per_ms); + if (serverTL->rehashCtl) break; - int work_done = fMainThread && incrementallyRehash(rehash_db); - if (work_done) { - /* If the function did some work, stop here, we'll do - * more at the next cron loop. */ - break; - } else { - /* If this db didn't need rehash, we'll try the next one. */ - rehash_db++; - rehash_db %= cserver.dbnum; + + if (fMainThread) { + // We only synchronously rehash on the main thread, otherwise we'll cause too much latency + rehashes_per_ms = incrementallyRehash(rehash_db); + if (rehashes_per_ms > 0) { + /* If the function did some work, stop here, we'll do + * more at the next cron loop. */ + serverLog(LL_WARNING, "Rehashes per ms: %d", rehashes_per_ms); + break; + } else { + /* If this db didn't need rehash, we'll try the next one. */ + rehash_db++; + rehash_db %= cserver.dbnum; + } } } } } - return ctl; + if (serverTL->rehashCtl) { + setAeLockSetThreadSpinWorker(hash_spin_worker); + } else { + setAeLockSetThreadSpinWorker(nullptr); + } } /* We take a cached value of the unix time in the global state because with @@ -2076,14 +2103,7 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { clientsCron(IDX_EVENT_LOOP_MAIN); /* Handle background operations on Redis databases. */ - auto asyncRehash = databasesCron(true /* fMainThread */); - - if (asyncRehash) { - aeReleaseLock(); - dictRehashAsync(asyncRehash); - aeAcquireLock(); - dictCompleteRehashAsync(asyncRehash); - } + databasesCron(true /* fMainThread */); /* Start a scheduled AOF rewrite if this was requested by the user while * a BGSAVE was in progress. */ @@ -2234,14 +2254,7 @@ int serverCronLite(struct aeEventLoop *eventLoop, long long id, void *clientData } /* Handle background operations on Redis databases. 
*/ - auto asyncRehash = databasesCron(false /* fMainThread */); - - if (asyncRehash) { - aeReleaseLock(); - dictRehashAsync(asyncRehash); - aeAcquireLock(); - dictCompleteRehashAsync(asyncRehash); - } + databasesCron(false /* fMainThread */); /* Unpause clients if enough time has elapsed */ unpauseClientsIfNecessary(); diff --git a/src/server.h b/src/server.h index 0da8b84f3..ea1356889 100644 --- a/src/server.h +++ b/src/server.h @@ -1393,6 +1393,7 @@ struct redisServerThreadVars { long unsigned commandsExecuted = 0; bool fRetrySetAofEvent = false; std::vector vecclientsProcess; + dictAsyncRehashCtl *rehashCtl = nullptr; }; struct redisMaster { From ba006abe029c6766fd9dbcccbe03808134f294ce Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 27 Jan 2021 00:47:22 +0000 Subject: [PATCH 24/40] Ensure rehash completes even when we're in a long running task Former-commit-id: f107746e90f7a8ff3c7094145ee1ad438911e8c2 --- src/dict.cpp | 112 ++++++++++++++++++++++++++------------------- src/dict.h | 4 +- src/fastlock.cpp | 1 + src/networking.cpp | 10 ++++ src/server.cpp | 44 ++++++++++-------- 5 files changed, 103 insertions(+), 68 deletions(-) diff --git a/src/dict.cpp b/src/dict.cpp index e882fff2a..c5d89eea9 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -275,6 +275,7 @@ void dictRehashAsync(dictAsyncRehashCtl *ctl) { wi.hash = dictHashKey(ctl->dict, dictGetKey(wi.de)); } ctl->hashIdx = ctl->queue.size(); + ctl->done = true; } bool dictRehashSomeAsync(dictAsyncRehashCtl *ctl, size_t hashes) { @@ -284,71 +285,88 @@ bool dictRehashSomeAsync(dictAsyncRehashCtl *ctl, size_t hashes) { wi.hash = dictHashKey(ctl->dict, dictGetKey(wi.de)); } + if (ctl->hashIdx == ctl->queue.size()) ctl->done = true; return ctl->hashIdx < ctl->queue.size(); } -void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) { +void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl, bool fFree) { dict *d = ctl->dict; + assert(ctl->done); + // Unlink ourselves + bool fUnlinked = false; dictAsyncRehashCtl **pprev = &d->asyncdata; + while (*pprev != nullptr) { if (*pprev == ctl) { *pprev = ctl->next; + fUnlinked = true; break; } pprev = &((*pprev)->next); } - // Was the dictionary free'd while we were in flight? - if (ctl->release) { - if (d->asyncdata != nullptr) - d->asyncdata->release = true; - else - dictRelease(d); - delete ctl; - return; - } - - if (d->ht[0].table != nullptr) { // can be null if we're cleared during the rehash - for (auto &wi : ctl->queue) { - // We need to remove it from the source hash table, and store it in the dest. - // Note that it may have been deleted in the meantime and therefore not exist. 
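Patch 24 is best read with the data structure in mind: several dictAsyncRehashCtl batches can now be in flight for one dict, chained through the next pointer. Completion therefore has to unlink its own node from that chain, and when an older batch is still pending, entries on the deferred-free list may still be visible to it, so the list is spliced onto the next batch instead of being freed. A compact sketch of that unlink-and-hand-off step, with a template parameter standing in for the real ctl type (assumed to expose next and a deGCList whose nodes link through ->next):

    template <typename Ctl>
    bool unlinkAndHandOff(Ctl *&head, Ctl *ctl) {
        Ctl **pp = &head;
        while (*pp != nullptr && *pp != ctl) pp = &(*pp)->next;
        if (*pp == nullptr) return false;   // already unlinked, e.g. by
                                            // processEventsWhileBlocked
        *pp = ctl->next;                    // unlink ourselves
        if (ctl->next != nullptr && ctl->deGCList != nullptr) {
            // An older batch may still read these entries: append our
            // garbage list to its list rather than freeing now.
            auto **tail = &ctl->next->deGCList;
            while (*tail != nullptr) tail = &(*tail)->next;
            *tail = ctl->deGCList;
            ctl->deGCList = nullptr;
        }
        return true;
    }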
- // In this case it will be in the garbage collection list - - dictEntry **pdePrev = &d->ht[0].table[wi.hash & d->ht[0].sizemask]; - while (*pdePrev != nullptr && *pdePrev != wi.de) { - pdePrev = &((*pdePrev)->next); - } - if (*pdePrev != nullptr) { // The element may be NULL if its in the GC list - assert(*pdePrev == wi.de); - *pdePrev = wi.de->next; - // Now link it to the dest hash table - wi.de->next = d->ht[1].table[wi.hash & d->ht[1].sizemask]; - d->ht[1].table[wi.hash & d->ht[1].sizemask] = wi.de; - d->ht[0].used--; - d->ht[1].used++; - } + if (fUnlinked) { + if (ctl->next != nullptr && ctl->deGCList != nullptr) { + // An older work item may depend on our free list, so pass our free list to them + dictEntry **deGC = &ctl->next->deGCList; + while (*deGC != nullptr) deGC = &((*deGC)->next); + *deGC = ctl->deGCList; + ctl->deGCList = nullptr; } } - // Now free anyting in the GC list - while (ctl->deGCList != nullptr) { - auto next = ctl->deGCList->next; - dictFreeKey(d, ctl->deGCList); - dictFreeVal(d, ctl->deGCList); - zfree(ctl->deGCList); - ctl->deGCList = next; + if (fUnlinked && !ctl->release) { + if (d->ht[0].table != nullptr) { // can be null if we're cleared during the rehash + for (auto &wi : ctl->queue) { + // We need to remove it from the source hash table, and store it in the dest. + // Note that it may have been deleted in the meantime and therefore not exist. + // In this case it will be in the garbage collection list + + dictEntry **pdePrev = &d->ht[0].table[wi.hash & d->ht[0].sizemask]; + while (*pdePrev != nullptr && *pdePrev != wi.de) { + pdePrev = &((*pdePrev)->next); + } + if (*pdePrev != nullptr) { // The element may be NULL if its in the GC list + assert(*pdePrev == wi.de); + *pdePrev = wi.de->next; + // Now link it to the dest hash table + wi.de->next = d->ht[1].table[wi.hash & d->ht[1].sizemask]; + d->ht[1].table[wi.hash & d->ht[1].sizemask] = wi.de; + d->ht[0].used--; + d->ht[1].used++; + } + } + } + + /* Check if we already rehashed the whole table... */ + if (d->ht[0].used == 0 && d->asyncdata == nullptr) { + zfree(d->ht[0].table); + d->ht[0] = d->ht[1]; + _dictReset(&d->ht[1]); + d->rehashidx = -1; + } } - /* Check if we already rehashed the whole table... */ - if (d->ht[0].used == 0 && d->asyncdata == nullptr) { - zfree(d->ht[0].table); - d->ht[0] = d->ht[1]; - _dictReset(&d->ht[1]); - d->rehashidx = -1; - } + if (fFree) { + while (ctl->deGCList != nullptr) { + auto next = ctl->deGCList->next; + dictFreeKey(d, ctl->deGCList); + dictFreeVal(d, ctl->deGCList); + zfree(ctl->deGCList); + ctl->deGCList = next; + } - delete ctl; + // Was the dictionary free'd while we were in flight? 
+ if (ctl->release) { + if (d->asyncdata != nullptr) + d->asyncdata->release = true; + else + dictRelease(d); + } + + delete ctl; + } } long long timeInMilliseconds(void) { @@ -506,7 +524,7 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { else d->ht[table].table[idx] = he->next; if (!nofree) { - if (table == 0 && d->asyncdata != nullptr) { + if (table == 0 && d->asyncdata != nullptr && idx < d->rehashidx) { he->next = d->asyncdata->deGCList; d->asyncdata->deGCList = he->next; } else { @@ -584,7 +602,7 @@ int _dictClear(dict *d, dictht *ht, void(callback)(void *)) { if ((he = ht->table[i]) == NULL) continue; while(he) { nextHe = he->next; - if (d->asyncdata) { + if (d->asyncdata && i < d->rehashidx) { he->next = d->asyncdata->deGCList; d->asyncdata->deGCList = he; } else { diff --git a/src/dict.h b/src/dict.h index 2f6c9677a..9abba00eb 100644 --- a/src/dict.h +++ b/src/dict.h @@ -37,6 +37,7 @@ #ifdef __cplusplus #include +#include extern "C" { #endif @@ -99,6 +100,7 @@ struct dictAsyncRehashCtl { size_t hashIdx = 0; bool release = false; dictAsyncRehashCtl *next = nullptr; + std::atomic done { false }; dictAsyncRehashCtl(struct dict *d, dictAsyncRehashCtl *next) : dict(d), next(next) { queue.reserve(c_targetQueueSize); @@ -222,7 +224,7 @@ dictEntry **dictFindEntryRefByPtrAndHash(dict *d, const void *oldptr, uint64_t h dictAsyncRehashCtl *dictRehashAsyncStart(dict *d, int buckets = dictAsyncRehashCtl::c_targetQueueSize); void dictRehashAsync(dictAsyncRehashCtl *ctl); bool dictRehashSomeAsync(dictAsyncRehashCtl *ctl, size_t hashes); -void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl); +void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl, bool fFree); /* Hash table types */ extern dictType dictTypeHeapStringCopyKey; diff --git a/src/fastlock.cpp b/src/fastlock.cpp index 61cd1752e..8c1c5f6e4 100644 --- a/src/fastlock.cpp +++ b/src/fastlock.cpp @@ -356,6 +356,7 @@ extern "C" void fastlock_lock(struct fastlock *lock, spin_worker worker) __atomic_load(&g_fHighCpuPressure, &fHighPressure, __ATOMIC_RELAXED); unsigned loopLimit = fHighPressure ? 
0x10000 : 0x100000; + // WARNING:::: THIS DOESN"T MATCH ASM for (;;) { __atomic_load(&lock->m_ticket.u, &ticketT.u, __ATOMIC_ACQUIRE); diff --git a/src/networking.cpp b/src/networking.cpp index dc4e56e92..36470c0db 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -3486,6 +3486,16 @@ void processEventsWhileBlocked(int iel) { locker.arm(nullptr); locker.release(); + // Try to complete any async rehashes (this would normally happen in dbCron, but that won't run here) + for (int idb = 0; idb < cserver.dbnum; ++idb) { + redisDb *db = &g_pserver->db[idb]; + while (db->pdict->asyncdata != nullptr) { + if (!db->pdict->asyncdata->done) + break; + dictCompleteRehashAsync(db->pdict->asyncdata, false /*fFree*/); + } + } + // Restore it so the calling code is not confused if (fReplBacklog) { g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; diff --git a/src/server.cpp b/src/server.cpp index f1dd30d0d..be140c48c 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1773,19 +1773,14 @@ bool expireOwnKeys() int hash_spin_worker() { auto ctl = serverTL->rehashCtl; - if (ctl->hashIdx < ctl->queue.size()) { - ctl->queue[ctl->hashIdx].hash = dictHashKey(ctl->dict, dictGetKey(ctl->queue[ctl->hashIdx].de)); - ctl->hashIdx++; - return true; - } else { - return false; - } + return dictRehashSomeAsync(ctl, 1); } /* This function handles 'background' operations we are required to do * incrementally in Redis databases, such as active key expiring, resizing, * rehashing. */ void databasesCron(bool fMainThread) { + serverAssert(GlobalLocksAcquired()); static int rehashes_per_ms = 0; if (fMainThread) { /* Expire keys by random sampling. Not required for slaves @@ -1832,29 +1827,38 @@ void databasesCron(bool fMainThread) { if (dictRehashSomeAsync(serverTL->rehashCtl, 5)) { break; } else { - dictCompleteRehashAsync(serverTL->rehashCtl); + dictCompleteRehashAsync(serverTL->rehashCtl, true /*fFree*/); serverTL->rehashCtl = nullptr; } } + + serverAssert(serverTL->rehashCtl == nullptr); if (rehashes_per_ms > 0) serverTL->rehashCtl = dictRehashAsyncStart(g_pserver->db[rehash_db].dict, rehashes_per_ms); if (serverTL->rehashCtl) break; - if (fMainThread) { - // We only synchronously rehash on the main thread, otherwise we'll cause too much latency - rehashes_per_ms = incrementallyRehash(rehash_db); - if (rehashes_per_ms > 0) { - /* If the function did some work, stop here, we'll do - * more at the next cron loop. */ - serverLog(LL_WARNING, "Rehashes per ms: %d", rehashes_per_ms); - break; - } else { - /* If this db didn't need rehash, we'll try the next one. */ - rehash_db++; - rehash_db %= cserver.dbnum; + // Before starting anything new, can we end the rehash of a blocked thread? + if (g_pserver->db[rehash_db].pdict->asyncdata != nullptr) { + auto asyncdata = g_pserver->db[rehash_db].pdict->asyncdata; + if (asyncdata->done) { + dictCompleteRehashAsync(asyncdata, false /*fFree*/); // Don't free because we don't own the pointer + serverAssert(g_pserver->db[rehash_db].pdict->asyncdata != asyncdata); + break; // completion can be expensive, don't do anything else } } + + rehashes_per_ms = incrementallyRehash(rehash_db); + if (rehashes_per_ms > 0) { + /* If the function did some work, stop here, we'll do + * more at the next cron loop. */ + serverLog(LL_WARNING, "Rehashes per ms: %d", rehashes_per_ms); + break; + } else if (g_pserver->db[rehash_db].pdict->asyncdata == nullptr) { + /* If this db didn't need rehash and we have none in flight, we'll try the next one. 
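The control flow taking shape in databasesCron is calibrate-then-delegate: incrementallyRehash() now reports how many buckets one millisecond of synchronous rehashing moved, and that figure sizes the asynchronous batches handed to dictRehashAsyncStart(). A sketch of the calibration probe itself (the helper name and callback are illustrative):

    #include <chrono>

    // Run the synchronous single-bucket step for ~1ms and report the
    // throughput; 0 means nothing needed rehashing. The caller feeds the
    // result to dictRehashAsyncStart() as the batch size.
    template <typename StepFn>   // StepFn moves one bucket, returns false when done
    int calibrateRehashesPerMs(StepFn step) {
        using namespace std::chrono;
        auto deadline = steady_clock::now() + milliseconds(1);
        int buckets = 0;
        while (steady_clock::now() < deadline && step())
            ++buckets;
        return buckets;
    }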
*/ + rehash_db++; + rehash_db %= cserver.dbnum; + } } } } From 27552cb307911c87bf658f1cb2b95fa2a4a597a0 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 27 Jan 2021 05:05:05 +0000 Subject: [PATCH 25/40] Ensure the C lock implementation behaves the same as the ASM Former-commit-id: b5ddc11c46d2eabd28fae4c69927c356dd18bf6e --- src/fastlock.cpp | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/fastlock.cpp b/src/fastlock.cpp index 8c1c5f6e4..6e6b6e4d2 100644 --- a/src/fastlock.cpp +++ b/src/fastlock.cpp @@ -356,26 +356,32 @@ extern "C" void fastlock_lock(struct fastlock *lock, spin_worker worker) __atomic_load(&g_fHighCpuPressure, &fHighPressure, __ATOMIC_RELAXED); unsigned loopLimit = fHighPressure ? 0x10000 : 0x100000; - // WARNING:::: THIS DOESN"T MATCH ASM - for (;;) - { - __atomic_load(&lock->m_ticket.u, &ticketT.u, __ATOMIC_ACQUIRE); - if ((ticketT.u & 0xffff) == myticket) - break; + if (worker != nullptr) { + for (;;) { + __atomic_load(&lock->m_ticket.u, &ticketT.u, __ATOMIC_ACQUIRE); + if ((ticketT.u & 0xffff) == myticket) + break; + if (!worker()) + goto LNormalLoop; + } + } else { +LNormalLoop: + for (;;) + { + __atomic_load(&lock->m_ticket.u, &ticketT.u, __ATOMIC_ACQUIRE); + if ((ticketT.u & 0xffff) == myticket) + break; - if (worker != nullptr) { - worker(); - } else { #if defined(__i386__) || defined(__amd64__) __asm__ __volatile__ ("pause"); #elif defined(__aarch64__) __asm__ __volatile__ ("yield"); #endif - } - if ((++cloops % loopLimit) == 0) - { - fastlock_sleep(lock, tid, ticketT.u, myticket); + if ((++cloops % loopLimit) == 0) + { + fastlock_sleep(lock, tid, ticketT.u, myticket); + } } } From 1b3dc3d422ccb7e7cfd87451a4b41b0f20159df7 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 27 Jan 2021 06:37:10 +0000 Subject: [PATCH 26/40] The server should recalibrate periodically. Also reduce log noise Former-commit-id: 8d8e1810d49da6aa921d2327cb4ea250c2b5b234 --- src/networking.cpp | 6 +++--- src/server.cpp | 18 ++++++++++++------ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 36470c0db..8e5683f65 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -3489,10 +3489,10 @@ void processEventsWhileBlocked(int iel) { // Try to complete any async rehashes (this would normally happen in dbCron, but that won't run here) for (int idb = 0; idb < cserver.dbnum; ++idb) { redisDb *db = &g_pserver->db[idb]; - while (db->pdict->asyncdata != nullptr) { - if (!db->pdict->asyncdata->done) + while (db->dict->asyncdata != nullptr) { + if (!db->dict->asyncdata->done) break; - dictCompleteRehashAsync(db->pdict->asyncdata, false /*fFree*/); + dictCompleteRehashAsync(db->dict->asyncdata, false /*fFree*/); } } diff --git a/src/server.cpp b/src/server.cpp index be140c48c..d95e207dc 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1782,6 +1782,7 @@ int hash_spin_worker() { void databasesCron(bool fMainThread) { serverAssert(GlobalLocksAcquired()); static int rehashes_per_ms = 0; + static int async_rehashes = 0; if (fMainThread) { /* Expire keys by random sampling. Not required for slaves * as master will synthesize DELs for us. */ @@ -1833,28 +1834,33 @@ void databasesCron(bool fMainThread) { } serverAssert(serverTL->rehashCtl == nullptr); - if (rehashes_per_ms > 0) + /* Are we async rehashing? And if so is it time to re-calibrate? 
*/ + /* The recalibration limit is a prime number to ensure balancing across threads */ + if (rehashes_per_ms > 0 && async_rehashes < 131) { serverTL->rehashCtl = dictRehashAsyncStart(g_pserver->db[rehash_db].dict, rehashes_per_ms); + ++async_rehashes; + } if (serverTL->rehashCtl) break; // Before starting anything new, can we end the rehash of a blocked thread? - if (g_pserver->db[rehash_db].pdict->asyncdata != nullptr) { - auto asyncdata = g_pserver->db[rehash_db].pdict->asyncdata; + if (g_pserver->db[rehash_db].dict->asyncdata != nullptr) { + auto asyncdata = g_pserver->db[rehash_db].dict->asyncdata; if (asyncdata->done) { dictCompleteRehashAsync(asyncdata, false /*fFree*/); // Don't free because we don't own the pointer - serverAssert(g_pserver->db[rehash_db].pdict->asyncdata != asyncdata); + serverAssert(g_pserver->db[rehash_db].dict->asyncdata != asyncdata); break; // completion can be expensive, don't do anything else } } rehashes_per_ms = incrementallyRehash(rehash_db); + async_rehashes = 0; if (rehashes_per_ms > 0) { /* If the function did some work, stop here, we'll do * more at the next cron loop. */ - serverLog(LL_WARNING, "Rehashes per ms: %d", rehashes_per_ms); + serverLog(LL_VERBOSE, "Calibrated rehashes per ms: %d", rehashes_per_ms); break; - } else if (g_pserver->db[rehash_db].pdict->asyncdata == nullptr) { + } else if (g_pserver->db[rehash_db].dict->asyncdata == nullptr) { /* If this db didn't need rehash and we have none in flight, we'll try the next one. */ rehash_db++; rehash_db %= cserver.dbnum; From 662fc28fdc8e1617589ae529176cf2adcd28a3e6 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 27 Jan 2021 06:50:01 +0000 Subject: [PATCH 27/40] Disable multithreaded rehash when active defrag is enabled. The two are not compatible Former-commit-id: 56addcd17262ffbaefea26c0097cde53c616ece2 --- src/server.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/server.cpp b/src/server.cpp index d95e207dc..4e4418b02 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1836,7 +1836,7 @@ void databasesCron(bool fMainThread) { serverAssert(serverTL->rehashCtl == nullptr); /* Are we async rehashing? And if so is it time to re-calibrate? */ /* The recalibration limit is a prime number to ensure balancing across threads */ - if (rehashes_per_ms > 0 && async_rehashes < 131) { + if (rehashes_per_ms > 0 && async_rehashes < 131 && cserver.active_defrag_enabled) { serverTL->rehashCtl = dictRehashAsyncStart(g_pserver->db[rehash_db].dict, rehashes_per_ms); ++async_rehashes; } @@ -1858,7 +1858,9 @@ void databasesCron(bool fMainThread) { if (rehashes_per_ms > 0) { /* If the function did some work, stop here, we'll do * more at the next cron loop. */ - serverLog(LL_VERBOSE, "Calibrated rehashes per ms: %d", rehashes_per_ms); + if (!cserver.active_defrag_enabled) { + serverLog(LL_VERBOSE, "Calibrated rehashes per ms: %d", rehashes_per_ms); + } break; } else if (g_pserver->db[rehash_db].dict->asyncdata == nullptr) { /* If this db didn't need rehash and we have none in flight, we'll try the next one. 
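Putting these patches together, the per-database decision order in the cron becomes: start a new async batch while the calibration is still fresh (recalibrating every 131 batches, a prime so multiple server threads do not recalibrate in lockstep); else finish a batch another thread has already hashed; else recalibrate synchronously on this thread. A condensed, compilable sketch of that ordering, with stand-in types and callbacks for dictRehashAsyncStart, dictCompleteRehashAsync and incrementallyRehash:

    struct Batch { bool done = false; Batch *next = nullptr; };
    struct Dict  { Batch *asyncdata = nullptr; };

    // Returns the batch this thread should keep hashing, or nullptr.
    template <typename Start, typename Complete, typename Calibrate>
    Batch *scheduleRehash(Dict &d, int &perMs, int &batches,
                          Start startBatch, Complete completeBatch,
                          Calibrate calibrate) {
        if (perMs > 0 && batches < 131) {        // calibration still fresh
            ++batches;
            if (Batch *b = startBatch(d, perMs)) return b;
        }
        if (d.asyncdata && d.asyncdata->done) {  // finish another thread's work
            completeBatch(d.asyncdata);          // expensive: do nothing else
            return nullptr;
        }
        perMs = calibrate(d);                    // 1ms synchronous probe
        batches = 0;
        return nullptr;
    }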
*/ From 4efab08466b45729ace98d8a9c5f8ecb7753a219 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 8 Feb 2021 00:10:38 +0000 Subject: [PATCH 28/40] Fix bug in fastlock stack metadata Former-commit-id: 241d1bd7e1ed64885adbb07653a5c7e8ef882607 --- src/fastlock_x64.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastlock_x64.asm b/src/fastlock_x64.asm index 5fa8e0ade..9e17f8995 100644 --- a/src/fastlock_x64.asm +++ b/src/fastlock_x64.asm @@ -125,7 +125,7 @@ fastlock_lock: mov rsi, [rsp+24] mov eax, [rsp+32] add rsp, 40 - .cfi_adjust_cfa_offset 40 + .cfi_adjust_cfa_offset -40 jmp .LLoop .cfi_endproc From 3de15c16ac89f369d38a09aa6dc3746e009b1364 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 8 Feb 2021 19:02:01 +0000 Subject: [PATCH 29/40] Fix deadlock with disconnecting client and storage provider Former-commit-id: 0831745323e425f463322e8c0dc27fc25854868e --- src/server.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/server.cpp b/src/server.cpp index 137976437..5e566f957 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2592,6 +2592,18 @@ void beforeSleep(struct aeEventLoop *eventLoop) { int aof_state = g_pserver->aof_state; + mstime_t commit_latency; + latencyStartMonitor(commit_latency); + if (g_pserver->m_pstorageFactory != nullptr) + { + locker.disarm(); + for (redisDb *db : vecdb) + db->commitChanges(); + locker.arm(); + } + latencyEndMonitor(commit_latency); + latencyAddSampleIfNeeded("storage-commit", commit_latency); + /* We try to handle writes at the end so we don't have to reacquire the lock, but if there is a pending async close we need to ensure the writes happen first so perform it here */ @@ -2602,16 +2614,6 @@ void beforeSleep(struct aeEventLoop *eventLoop) { locker.arm(); fSentReplies = true; } - - mstime_t commit_latency; - latencyStartMonitor(commit_latency); - if (g_pserver->m_pstorageFactory != nullptr) - { - for (redisDb *db : vecdb) - db->commitChanges(); - } - latencyEndMonitor(commit_latency); - latencyAddSampleIfNeeded("storage-commit", commit_latency); if (!serverTL->gcEpoch.isReset()) g_pserver->garbageCollector.endEpoch(serverTL->gcEpoch, true /*fNoFree*/); From e5e13502196741a8ffe83fb0d0a8668e10e0638b Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 9 Feb 2021 02:41:44 +0000 Subject: [PATCH 30/40] Fix polarity issue in async rehash Former-commit-id: 9eefba49d4fcde7f12929929d7aeb36f5186a63d --- src/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server.cpp b/src/server.cpp index 5e566f957..e9e71b46e 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1950,7 +1950,7 @@ void databasesCron(bool fMainThread) { ::dict *dict = g_pserver->db[rehash_db]->dictUnsafeKeyOnly(); /* Are we async rehashing? And if so is it time to re-calibrate? 
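Patch 29 above is an ordering fix worth dwelling on: storage-provider commits are hoisted ahead of the reply flush in beforeSleep(), and they run with the global lock disarmed, so a disconnecting client that holds the lock can no longer deadlock against a blocking commit. The shape, reduced to a sketch with std::mutex standing in for the AeLocker:

    #include <mutex>
    #include <vector>

    struct Db { void commitChanges() { /* blocking storage I/O */ } };

    // Commit every database with the big lock released, then retake it
    // before touching shared state again: the post-patch-29 ordering.
    void commitOutsideLock(std::unique_lock<std::mutex> &locker,
                           std::vector<Db*> &vecdb) {
        locker.unlock();               // "disarm": I/O may block
        for (Db *db : vecdb)
            db->commitChanges();
        locker.lock();                 // "arm" again
    }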
*/ /* The recalibration limit is a prime number to ensure balancing across threads */ - if (rehashes_per_ms > 0 && async_rehashes < 131 && cserver.active_defrag_enabled) { + if (rehashes_per_ms > 0 && async_rehashes < 131 && !cserver.active_defrag_enabled) { serverTL->rehashCtl = dictRehashAsyncStart(dict, rehashes_per_ms); ++async_rehashes; } From ef471e6e53c3e2abe1514706282d963dc74e8991 Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 11 Feb 2021 00:50:21 +0000 Subject: [PATCH 31/40] Enable prefetch with storage providers Former-commit-id: 21f9c52c446ad1b460a340c3d873839b2355728f --- src/aof.cpp | 7 +- src/blocked.cpp | 2 +- src/db.cpp | 17 ++-- src/networking.cpp | 190 ++++++++++++++++++++++------------- src/replication.cpp | 11 +- src/server.cpp | 4 +- src/server.h | 65 ++++++++++-- tests/unit/introspection.tcl | 2 +- 8 files changed, 205 insertions(+), 93 deletions(-) diff --git a/src/aof.cpp b/src/aof.cpp index ab3aca2c6..e8f77e879 100644 --- a/src/aof.cpp +++ b/src/aof.cpp @@ -741,7 +741,7 @@ void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int a /* In Redis commands are always executed in the context of a client, so in * order to load the append only file we need to create a fake client. */ struct client *createAOFClient(void) { - struct client *c =(client*) zmalloc(sizeof(*c), MALLOC_LOCAL); + struct client *c = new client(); selectDb(c,0); c->id = CLIENT_ID_AOF; /* So modules can identify it's the AOF client. */ @@ -752,7 +752,6 @@ struct client *createAOFClient(void) { c->querybuf_peak = 0; c->argc = 0; c->argv = NULL; - c->argv_len_sum = 0; c->bufpos = 0; c->flags = 0; c->fPendingAsyncWrite = FALSE; @@ -783,7 +782,7 @@ void freeFakeClientArgv(struct client *c) { for (j = 0; j < c->argc; j++) decrRefCount(c->argv[j]); zfree(c->argv); - c->argv_len_sum = 0; + c->argv_len_sumActive = 0; } void freeFakeClient(struct client *c) { @@ -793,7 +792,7 @@ void freeFakeClient(struct client *c) { freeClientMultiState(c); fastlock_unlock(&c->lock); fastlock_free(&c->lock); - zfree(c); + delete c; } /* Replay the append log file. On success C_OK is returned. On non fatal diff --git a/src/blocked.cpp b/src/blocked.cpp index d4b7a4804..085090b50 100644 --- a/src/blocked.cpp +++ b/src/blocked.cpp @@ -121,7 +121,7 @@ void processUnblockedClients(int iel) { * the code is conceptually more correct this way. 
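A small but load-bearing change in patch 31: createAOFClient (and createClient below it) switch from zmalloc to new/delete. The client struct now carries non-trivial C++ members, such as the vecqueuedcmd vector and a fastlock with an in-class initializer, and raw malloc-style allocation would leave those unconstructed. A two-line illustration of why:

    #include <vector>

    struct Client { std::vector<int> queued; };  // non-trivial member

    int main() {
        // (Client*)malloc(sizeof(Client)) would leave 'queued' as garbage,
        // which is undefined behavior on first use. 'new' runs constructors.
        Client *c = new Client();
        c->queued.push_back(1);
        delete c;                                // and 'delete' the destructors
    }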
*/ if (!(c->flags & CLIENT_BLOCKED)) { if (c->querybuf && sdslen(c->querybuf) > 0) { - processInputBuffer(c, CMD_CALL_FULL); + processInputBuffer(c, true /*fParse*/, CMD_CALL_FULL); } } fastlock_unlock(&c->lock); diff --git a/src/db.cpp b/src/db.cpp index 57e29c7d1..14c292b6f 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -2959,24 +2959,27 @@ int dbnumFromDb(redisDb *db) serverPanic("invalid database pointer"); } -void redisDbPersistentData::prefetchKeysAsync(AeLocker &lock, client *c) +void redisDbPersistentData::prefetchKeysAsync(parsed_command &command) { if (m_spstorage == nullptr) return; + AeLocker lock; + std::vector veckeys; - lock.arm(c); - getKeysResult* result = nullptr; - int numkeys = getKeysFromCommand(c->cmd, c->argv, c->argc, result); + lock.arm(nullptr); + getKeysResult result = GETKEYS_RESULT_INIT; + auto cmd = lookupCommand(szFromObj(command.argv[0])); + int numkeys = getKeysFromCommand(cmd, command.argv, command.argc, &result); for (int ikey = 0; ikey < numkeys; ++ikey) { - robj *objKey = c->argv[result->keys[ikey]]; + robj *objKey = command.argv[result.keys[ikey]]; if (this->find_cached_threadsafe(szFromObj(objKey)) == nullptr) veckeys.push_back(objKey); } lock.disarm(); - getKeysFreeResult(result); + getKeysFreeResult(&result); std::vector>> vecInserts; for (robj *objKey : veckeys) @@ -2997,7 +3000,7 @@ void redisDbPersistentData::prefetchKeysAsync(AeLocker &lock, client *c) vecInserts.emplace_back(sharedKey, o, std::move(spexpire)); } - lock.arm(c); + lock.arm(nullptr); for (auto &tuple : vecInserts) { sds sharedKey = std::get<0>(tuple); diff --git a/src/networking.cpp b/src/networking.cpp index 80214963d..be50373c9 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -102,7 +102,7 @@ void linkClient(client *c) { } client *createClient(connection *conn, int iel) { - client *c = (client*)zmalloc(sizeof(client), MALLOC_LOCAL); + client *c = new client; serverAssert(conn == nullptr || (iel == (serverTL - g_pserver->rgthreadvar))); c->iel = iel; @@ -124,7 +124,6 @@ client *createClient(connection *conn, int iel) { uint64_t client_id; client_id = g_pserver->next_client_id.fetch_add(1); c->iel = iel; - fastlock_init(&c->lock, "client"); c->id = client_id; c->resp = 2; c->conn = conn; @@ -137,7 +136,6 @@ client *createClient(connection *conn, int iel) { c->reqtype = 0; c->argc = 0; c->argv = NULL; - c->argv_len_sum = 0; c->cmd = c->lastcmd = NULL; c->puser = DefaultUser; c->multibulklen = 0; @@ -157,6 +155,7 @@ client *createClient(connection *conn, int iel) { c->reploff = 0; c->reploff_skipped = 0; c->read_reploff = 0; + c->reploff_cmd = 0; c->repl_ack_off = 0; c->repl_ack_time = 0; c->slave_listening_port = 0; @@ -203,6 +202,13 @@ client *createClient(connection *conn, int iel) { return c; } +size_t client::argv_len_sum() const { + size_t sum = 0; + for (auto &cmd : vecqueuedcmd) + sum += cmd.argv_len_sum; + return sum + argv_len_sumActive; +} + /* This function puts the client in the queue of clients that should write * their output buffers to the socket. 
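prefetchKeysAsync follows a three-phase shape that recurs in this series: identify cache misses under the lock, do the slow storage reads with the lock dropped, then re-acquire it to publish the results. A self-contained sketch, with Storage and Cache as stand-ins for the real providers:

    #include <mutex>
    #include <string>
    #include <utility>
    #include <vector>

    struct Storage { std::string get(const std::string &) { return {}; } };
    struct Cache {
        bool contains(const std::string &) const { return false; }
        void insert(std::string, std::string) {}
    };

    void prefetch(std::mutex &biglock, Storage &disk, Cache &cache,
                  const std::vector<std::string> &cmdKeys) {
        std::vector<std::string> misses;
        {
            std::lock_guard<std::mutex> g(biglock);   // phase 1: find misses
            for (const auto &k : cmdKeys)
                if (!cache.contains(k)) misses.push_back(k);
        }
        std::vector<std::pair<std::string, std::string>> loaded;
        for (const auto &k : misses)                  // phase 2: I/O, no lock
            loaded.emplace_back(k, disk.get(k));
        std::lock_guard<std::mutex> g(biglock);       // phase 3: publish
        for (auto &kv : loaded)
            cache.insert(std::move(kv.first), std::move(kv.second));
    }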
Note that it does not *yet* install * the write handler, to start clients are put in a queue of clients that need @@ -1343,7 +1349,7 @@ static void freeClientArgv(client *c) { decrRefCount(c->argv[j]); c->argc = 0; c->cmd = NULL; - c->argv_len_sum = 0; + c->argv_len_sumActive = 0; } void disconnectSlavesExcept(unsigned char *uuid) @@ -1574,12 +1580,12 @@ bool freeClient(client *c) { zfree(c->replyAsync); if (c->name) decrRefCount(c->name); zfree(c->argv); - c->argv_len_sum = 0; + c->argv_len_sumActive = 0; freeClientMultiState(c); sdsfree(c->peerid); ulock.unlock(); fastlock_free(&c->lock); - zfree(c); + delete c; return true; } @@ -1918,9 +1924,6 @@ void resetClient(client *c) { redisCommandProc *prevcmd = c->cmd ? c->cmd->proc : NULL; freeClientArgv(c); - c->reqtype = 0; - c->multibulklen = 0; - c->bulklen = -1; /* We clear the ASKING flag as well if we are not inside a MULTI, and * if what we just executed is not the ASKING command itself. */ @@ -2042,16 +2045,13 @@ int processInlineBuffer(client *c) { /* Setup argv array on client structure */ if (argc) { - if (c->argv) zfree(c->argv); - c->argv = (robj**)zmalloc(sizeof(robj*)*argc, MALLOC_LOCAL); - c->argv_len_sum = 0; - } - - /* Create redis objects for all arguments. */ - for (c->argc = 0, j = 0; j < argc; j++) { - c->argv[c->argc] = createObject(OBJ_STRING,argv[j]); - c->argc++; - c->argv_len_sum += sdslen(argv[j]); + /* Create redis objects for all arguments. */ + c->vecqueuedcmd.emplace_back(argc); + auto &cmd = c->vecqueuedcmd.back(); + for (cmd.argc = 0, j = 0; j < argc; j++) { + cmd.argv[cmd.argc++] = createObject(OBJ_STRING,argv[j]); + cmd.argv_len_sum += sdslen(argv[j]); + } } sds_free(argv); return C_OK; @@ -2141,9 +2141,7 @@ int processMultibulkBuffer(client *c) { c->multibulklen = ll; /* Setup argv array on client structure */ - if (c->argv) zfree(c->argv); - c->argv = (robj**)zmalloc(sizeof(robj*)*c->multibulklen, MALLOC_LOCAL); - c->argv_len_sum = 0; + c->vecqueuedcmd.emplace_back(c->multibulklen); } serverAssertWithInfo(c,NULL,c->multibulklen > 0); @@ -2211,21 +2209,22 @@ int processMultibulkBuffer(client *c) { /* Optimization: if the buffer contains JUST our bulk element * instead of creating a new object by *copying* the sds we * just use the current sds string. */ + auto &cmd = c->vecqueuedcmd.back(); if (c->qb_pos == 0 && c->bulklen >= PROTO_MBULK_BIG_ARG && sdslen(c->querybuf) == (size_t)(c->bulklen+2)) { - c->argv[c->argc++] = createObject(OBJ_STRING,c->querybuf); - c->argv_len_sum += c->bulklen; + cmd.argv[cmd.argc++] = createObject(OBJ_STRING,c->querybuf); + cmd.argv_len_sum += c->bulklen; sdsIncrLen(c->querybuf,-2); /* remove CRLF */ /* Assume that if we saw a fat argument we'll see another one * likely... */ c->querybuf = sdsnewlen(SDS_NOINIT,c->bulklen+2); sdsclear(c->querybuf); } else { - c->argv[c->argc++] = + cmd.argv[cmd.argc++] = createStringObject(c->querybuf+c->qb_pos,c->bulklen); - c->argv_len_sum += c->bulklen; + cmd.argv_len_sum += c->bulklen; c->qb_pos += c->bulklen+2; } c->bulklen = -1; @@ -2249,7 +2248,8 @@ void commandProcessed(client *c, int flags) { long long prev_offset = c->reploff; if (c->flags & CLIENT_MASTER && !(c->flags & CLIENT_MULTI)) { /* Update the applied replication offset of our master. 
*/ - c->reploff = c->read_reploff - sdslen(c->querybuf) + c->qb_pos; + serverAssert(c->reploff <= c->reploff_cmd); + c->reploff = c->reploff_cmd; } /* Don't reset the client structure for clients blocked in a @@ -2306,34 +2306,34 @@ int processCommandAndResetClient(client *c, int flags) { return deadclient ? C_ERR : C_OK; } -/* This function is called every time, in the client structure 'c', there is - * more query buffer to process, because we read more data from the socket - * or because a client was blocked and later reactivated, so there could be - * pending query buffer, already representing a full command, to process. */ -void processInputBuffer(client *c, int callFlags) { - AssertCorrectThread(c); +static bool FClientReady(client *c) { + /* Return if clients are paused. */ + if (!(c->flags & CLIENT_SLAVE) && clientsArePaused()) return false; + + /* Immediately abort if the client is in the middle of something. */ + if (c->flags & CLIENT_BLOCKED) return false; + + /* Don't process input from the master while there is a busy script + * condition on the replica. We want just to accumulate the replication + * stream (instead of replying -BUSY like we do with other clients) and + * later resume the processing. */ + if (g_pserver->lua_timedout && c->flags & CLIENT_MASTER) return false; + + /* CLIENT_CLOSE_AFTER_REPLY closes the connection once the reply is + * written to the client. Make sure to not let the reply grow after + * this flag has been set (i.e. don't process more commands). + * + * The same applies for clients we want to terminate ASAP. */ + if (c->flags & (CLIENT_CLOSE_AFTER_REPLY|CLIENT_CLOSE_ASAP)) return false; + + return true; +} + +void parseClientCommandBuffer(client *c) { + if (!FClientReady(c)) + return; - /* Keep processing while there is something in the input buffer */ - while(c->qb_pos < sdslen(c->querybuf)) { - /* Return if clients are paused. */ - if (!(c->flags & CLIENT_SLAVE) && clientsArePaused()) break; - - /* Immediately abort if the client is in the middle of something. */ - if (c->flags & CLIENT_BLOCKED) break; - - /* Don't process input from the master while there is a busy script - * condition on the replica. We want just to accumulate the replication - * stream (instead of replying -BUSY like we do with other clients) and - * later resume the processing. */ - if (g_pserver->lua_timedout && c->flags & CLIENT_MASTER) break; - - /* CLIENT_CLOSE_AFTER_REPLY closes the connection once the reply is - * written to the client. Make sure to not let the reply grow after - * this flag has been set (i.e. don't process more commands). - * - * The same applies for clients we want to terminate ASAP. */ - if (c->flags & (CLIENT_CLOSE_AFTER_REPLY|CLIENT_CLOSE_ASAP)) break; - + while(c->qb_pos < sdslen(c->querybuf)) { /* Determine request type when unknown. 
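The reploff change here is subtle. Previously the applied master offset could be derived from what was left in the query buffer (read_reploff minus unparsed bytes) because parsing and execution were in lockstep. Once commands are parsed ahead into vecqueuedcmd, that formula would credit the master for commands that are parsed but not yet executed, so each queued command snapshots its own offset at parse time and commandProcessed publishes it only after execution. A worked example in comment form:

    // Suppose read_reploff = 100 and two complete commands were parsed:
    //   cmd A ends at stream offset 60  -> A.reploff = 60
    //   cmd B ends at stream offset 100 -> B.reploff = 100
    // After executing only A, the old buffer-based formula reports 100
    // (the buffer is fully consumed), wrongly acknowledging B. Storing
    // the offset per command reports 60, then 100 once B actually runs.
    struct QueuedCmd { long long reploff; };
    long long appliedOffsetAfter(const QueuedCmd &executed) {
        return executed.reploff;   // what c->reploff_cmd feeds into c->reploff
    }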
*/ if (!c->reqtype) { if (c->querybuf[c->qb_pos] == '*') { @@ -2343,6 +2343,7 @@ void processInputBuffer(client *c, int callFlags) { } } + size_t cqueries = c->vecqueuedcmd.size(); if (c->reqtype == PROTO_REQ_INLINE) { if (processInlineBuffer(c) != C_OK) break; } else if (c->reqtype == PROTO_REQ_MULTIBULK) { @@ -2350,6 +2351,61 @@ void processInputBuffer(client *c, int callFlags) { } else { serverPanic("Unknown request type"); } + if (!c->vecqueuedcmd.empty() && (c->vecqueuedcmd.back().argc <= 0 || c->vecqueuedcmd.back().argv == nullptr)) { + c->vecqueuedcmd.pop_back(); + } else if (!c->vecqueuedcmd.empty()) { + if (c->flags & CLIENT_MASTER) c->vecqueuedcmd.back().reploff = c->read_reploff - sdslen(c->querybuf) + c->qb_pos; + serverAssert(c->vecqueuedcmd.back().reploff >= 0); + } + + /* Prefetch if we have a storage provider and we're not in the global lock */ + if (cqueries < c->vecqueuedcmd.size() && g_pserver->m_pstorageFactory != nullptr && !GlobalLocksAcquired()) { + auto &query = c->vecqueuedcmd.back(); + if (query.argc > 0 && query.argc == query.argcMax) { + c->db->prefetchKeysAsync(query); + } + } + c->reqtype = 0; + c->multibulklen = 0; + c->bulklen = -1; + c->reqtype = 0; + } + + /* Trim to pos */ + if (c->qb_pos) { + sdsrange(c->querybuf,c->qb_pos,-1); + c->qb_pos = 0; + } +} + +/* This function is called every time, in the client structure 'c', there is + * more query buffer to process, because we read more data from the socket + * or because a client was blocked and later reactivated, so there could be + * pending query buffer, already representing a full command, to process. */ +void processInputBuffer(client *c, bool fParse, int callFlags) { + AssertCorrectThread(c); + + if (fParse) + parseClientCommandBuffer(c); + + /* Keep processing while there is something in the input buffer */ + while (!c->vecqueuedcmd.empty()) { + /* Return if we're still parsing this command */ + auto &cmd = c->vecqueuedcmd.front(); + if (cmd.argc != cmd.argcMax) break; + + if (!FClientReady(c)) break; + + zfree(c->argv); + c->argc = cmd.argc; + c->argv = cmd.argv; + cmd.argv = nullptr; + c->argv_len_sumActive = cmd.argv_len_sum; + cmd.argv_len_sum = 0; + c->reploff_cmd = cmd.reploff; + serverAssert(c->argv != nullptr); + + c->vecqueuedcmd.erase(c->vecqueuedcmd.begin()); /* Multibulk processing could see a <= 0 length. */ if (c->argc == 0) { @@ -2364,12 +2420,6 @@ void processInputBuffer(client *c, int callFlags) { } } } - - /* Trim to pos */ - if (c->qb_pos) { - sdsrange(c->querybuf,c->qb_pos,-1); - c->qb_pos = 0; - } } void readQueryFromClient(connection *conn) { @@ -2450,6 +2500,8 @@ void readQueryFromClient(connection *conn) { return; } + parseClientCommandBuffer(c); + serverTL->vecclientsProcess.push_back(c); } @@ -2461,7 +2513,7 @@ void processClients() /* There is more data in the client input buffer, continue parsing it * in case to check if there is a full command to execute. */ std::unique_lock ul(c->lock); - processInputBuffer(c, CMD_CALL_FULL); + processInputBuffer(c, false /*fParse*/, CMD_CALL_FULL); } if (listLength(serverTL->clients_pending_asyncwrite)) @@ -2568,7 +2620,7 @@ sds catClientInfoString(sds s, client *client) { /* For efficiency (less work keeping track of the argv memory), it doesn't include the used memory * i.e. unused sds space and internal fragmentation, just the string length. but this is enough to * spot problematic clients. 
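The refactor around this hunk splits what used to be one loop into parseClientCommandBuffer, which may run before the global lock is taken (enabling the storage prefetch above), and processInputBuffer, which only consumes fully parsed commands. A minimal sketch of the split; ParsedCmd stands in for parsed_command:

    #include <deque>
    #include <string>
    #include <vector>

    struct ParsedCmd { std::vector<std::string> argv; bool complete = false; };

    struct Conn {
        std::string buf;               // raw query buffer
        std::deque<ParsedCmd> queued;  // stands in for vecqueuedcmd
    };

    void parseBuffer(Conn &c) {
        // Lock-free phase: tokenize c.buf, append ParsedCmd entries,
        // marking 'complete' only once all arguments have arrived.
    }

    template <typename Exec>
    void processQueued(Conn &c, Exec execute) {  // runs with the lock held
        while (!c.queued.empty() && c.queued.front().complete) {
            execute(c.queued.front());
            c.queued.pop_front();
        }
    }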
*/ - total_mem += client->argv_len_sum; + total_mem += client->argv_len_sum(); if (client->argv) total_mem += zmalloc_size(client->argv); @@ -2587,7 +2639,7 @@ sds catClientInfoString(sds s, client *client) { (client->flags & CLIENT_MULTI) ? client->mstate.count : -1, (unsigned long long) sdslen(client->querybuf), (unsigned long long) sdsavail(client->querybuf), - (unsigned long long) client->argv_len_sum, + (unsigned long long) client->argv_len_sum(), (unsigned long long) client->bufpos, (unsigned long long) listLength(client->reply), (unsigned long long) obufmem, /* should not include client->buf since we want to see 0 for static clients. */ @@ -3144,10 +3196,10 @@ void rewriteClientCommandVector(client *c, int argc, ...) { /* Replace argv and argc with our new versions. */ c->argv = argv; c->argc = argc; - c->argv_len_sum = 0; + c->argv_len_sumActive = 0; for (j = 0; j < c->argc; j++) if (c->argv[j]) - c->argv_len_sum += getStringObjectLen(c->argv[j]); + c->argv_len_sumActive += getStringObjectLen(c->argv[j]); c->cmd = lookupCommandOrOriginal((sds)ptrFromObj(c->argv[0])); serverAssertWithInfo(c,NULL,c->cmd != NULL); va_end(ap); @@ -3160,10 +3212,10 @@ void replaceClientCommandVector(client *c, int argc, robj **argv) { zfree(c->argv); c->argv = argv; c->argc = argc; - c->argv_len_sum = 0; + c->argv_len_sumActive = 0; for (j = 0; j < c->argc; j++) if (c->argv[j]) - c->argv_len_sum += getStringObjectLen(c->argv[j]); + c->argv_len_sumActive += getStringObjectLen(c->argv[j]); c->cmd = lookupCommandOrOriginal((sds)ptrFromObj(c->argv[0])); serverAssertWithInfo(c,NULL,c->cmd != NULL); } @@ -3188,8 +3240,8 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { c->argv[i] = NULL; } oldval = c->argv[i]; - if (oldval) c->argv_len_sum -= getStringObjectLen(oldval); - if (newval) c->argv_len_sum += getStringObjectLen(newval); + if (oldval) c->argv_len_sumActive -= getStringObjectLen(oldval); + if (newval) c->argv_len_sumActive += getStringObjectLen(newval); c->argv[i] = newval; incrRefCount(newval); if (oldval) decrRefCount(oldval); diff --git a/src/replication.cpp b/src/replication.cpp index 1f87c77a4..76c949e4b 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -3432,6 +3432,13 @@ void replicationCacheMaster(redisMaster *mi, client *c) { * offsets, including pending transactions, already populated arguments, * pending outputs to the master. 
*/ sdsclear(mi->master->querybuf); + if (!mi->master->vecqueuedcmd.empty()) { + // Clear out everything except for partially parsed commands (which we'll cache) + auto cmd = std::move(mi->master->vecqueuedcmd.front()); + mi->master->vecqueuedcmd.clear(); + if (cmd.argc != cmd.argcMax) + mi->master->vecqueuedcmd.emplace_back(std::move(cmd)); + } sdsclear(mi->master->pending_querybuf); mi->master->read_reploff = mi->master->reploff; if (c->flags & CLIENT_MULTI) discardTransaction(c); @@ -4307,10 +4314,12 @@ void replicaReplayCommand(client *c) cFake->authenticated = c->authenticated; cFake->puser = c->puser; cFake->querybuf = sdscatsds(cFake->querybuf,(sds)ptrFromObj(c->argv[2])); + cFake->read_reploff = sdslen(cFake->querybuf); + cFake->reploff = 0; selectDb(cFake, c->db->id); auto ccmdPrev = serverTL->commandsExecuted; cFake->flags |= CLIENT_MASTER | CLIENT_PREVENT_REPL_PROP; - processInputBuffer(cFake, (CMD_CALL_FULL & (~CMD_CALL_PROPAGATE))); + processInputBuffer(cFake, true /*fParse*/, (CMD_CALL_FULL & (~CMD_CALL_PROPAGATE))); cFake->flags &= ~(CLIENT_MASTER | CLIENT_PREVENT_REPL_PROP); bool fExec = ccmdPrev != serverTL->commandsExecuted; cFake->lock.unlock(); diff --git a/src/server.cpp b/src/server.cpp index e9e71b46e..eda89bc0e 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1749,7 +1749,7 @@ size_t ClientsPeakMemInput[CLIENTS_PEAK_MEM_USAGE_SLOTS]; size_t ClientsPeakMemOutput[CLIENTS_PEAK_MEM_USAGE_SLOTS]; int clientsCronTrackExpansiveClients(client *c) { - size_t in_usage = sdsZmallocSize(c->querybuf) + c->argv_len_sum; + size_t in_usage = sdsZmallocSize(c->querybuf) + c->argv_len_sum(); size_t out_usage = getClientOutputBufferMemoryUsage(c); int i = g_pserver->unixtime % CLIENTS_PEAK_MEM_USAGE_SLOTS; int zeroidx = (i+1) % CLIENTS_PEAK_MEM_USAGE_SLOTS; @@ -1786,7 +1786,7 @@ int clientsCronTrackClientsMemUsage(client *c) { mem += getClientOutputBufferMemoryUsage(c); mem += sdsZmallocSize(c->querybuf); mem += zmalloc_size(c); - mem += c->argv_len_sum; + mem += c->argv_len_sum(); if (c->argv) mem += zmalloc_size(c->argv); /* Now that we have the memory used by the client, remove the old * value from the old category, and add it back. */ diff --git a/src/server.h b/src/server.h index 3f06c9f8b..7d2611c37 100644 --- a/src/server.h +++ b/src/server.h @@ -1136,7 +1136,7 @@ public: bool removeCachedValue(const char *key); void removeAllCachedValues(); - void prefetchKeysAsync(class AeLocker &locker, client *c); + void prefetchKeysAsync(struct parsed_command &command); bool FSnapshot() const { return m_spdbSnapshotHOLDER != nullptr; } @@ -1443,7 +1443,53 @@ typedef struct { need more reserved IDs use UINT64_MAX-1, -2, ... and so forth. */ -typedef struct client { +struct parsed_command { + robj** argv = nullptr; + int argc = 0; + int argcMax; + long long reploff = 0; + size_t argv_len_sum = 0; /* Sum of lengths of objects in argv list. 
*/ + + parsed_command(int maxargs) { + argv = (robj**)zmalloc(sizeof(robj*)*maxargs); + argcMax = maxargs; + } + + parsed_command &operator=(parsed_command &&o) { + argv = o.argv; + argc = o.argc; + argcMax = o.argcMax; + reploff = o.reploff; + o.argv = nullptr; + o.argc = 0; + o.argcMax = 0; + o.reploff = 0; + return *this; + } + + parsed_command(parsed_command &o) = delete; + parsed_command(parsed_command &&o) { + argv = o.argv; + argc = o.argc; + argcMax = o.argcMax; + reploff = o.reploff; + o.argv = nullptr; + o.argc = 0; + o.argcMax = 0; + o.reploff = 0; + } + + ~parsed_command() { + if (argv != nullptr) { + for (int i = 0; i < argc; ++i) { + decrRefCount(argv[i]); + } + zfree(argv); + } + } +}; + +struct client { uint64_t id; /* Client incremental unique ID. */ connection *conn; int resp; /* RESP protocol version. Can be 2 or 3. */ @@ -1456,9 +1502,6 @@ typedef struct client { replication stream that we are receiving from the master. */ size_t querybuf_peak; /* Recent (100ms or more) peak of querybuf size. */ - int argc; /* Num of arguments of current command. */ - robj **argv; /* Arguments of current command. */ - size_t argv_len_sum; /* Sum of lengths of objects in argv list. */ struct redisCommand *cmd, *lastcmd; /* Last command executed. */ user *puser; /* User associated with this connection. If the user is set to NULL the connection can do @@ -1488,6 +1531,7 @@ typedef struct client { long long read_reploff; /* Read replication offset if this is a master. */ long long reploff; /* Applied replication offset if this is a master. */ long long reploff_skipped; /* Repl backlog we did not send to this client */ + long long reploff_cmd; /* The replication offset of the executing command, reploff gets set to this after the execution completes */ long long repl_ack_off; /* Replication ack offset, if this is a replica. */ long long repl_ack_time;/* Replication ack time, if this is a replica. 
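One detail of parsed_command worth calling out: the move operations null out the source's argv and argc before returning. The destructor decrRefCounts every argument it still owns, so a moved-from command must be left empty or the refcounts would be dropped twice, once by the queue entry and once by the executing client. An illustrative usage snippet, hypothetical and assuming the server.h context above:

    // Hypothetical illustration, not code from the patch:
    parsed_command a(2);
    a.argv[a.argc++] = createObject(OBJ_STRING, sdsnew("GET"));
    a.argv[a.argc++] = createObject(OBJ_STRING, sdsnew("key"));

    parsed_command b(std::move(a));   // b now owns both robj references
    // a.argv == nullptr, a.argc == 0: ~parsed_command on 'a' frees nothing,
    // so each robj is decrRefCount'd exactly once, by ~parsed_command on 'b'.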
*/ long long psync_initial_offset; /* FULLRESYNC reply offset other slaves @@ -1545,12 +1589,17 @@ typedef struct client { uint64_t mvccCheckpoint = 0; // the MVCC checkpoint of our last write int iel; /* the event loop index we're registered with */ - struct fastlock lock; + struct fastlock lock {"client"}; int master_error; + std::vector vecqueuedcmd; + int argc; + robj **argv; + size_t argv_len_sumActive = 0; // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); -} client; + size_t argv_len_sum() const; +}; struct saveparam { time_t seconds; @@ -2516,7 +2565,7 @@ void setDeferredMapLen(client *c, void *node, long length); void setDeferredSetLen(client *c, void *node, long length); void setDeferredAttributeLen(client *c, void *node, long length); void setDeferredPushLen(client *c, void *node, long length); -void processInputBuffer(client *c, int callFlags); +void processInputBuffer(client *c, bool fParse, int callFlags); void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask); void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask); void acceptTLSHandler(aeEventLoop *el, int fd, void *privdata, int mask); diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index 278495931..54d9dbbcd 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -1,7 +1,7 @@ start_server {tags {"introspection"}} { test {CLIENT LIST} { r client list - } {*addr=*:* fd=* age=* idle=* flags=N db=9 sub=0 psub=0 multi=-1 qbuf=26 qbuf-free=* argv-mem=* obl=0 oll=0 omem=0 tot-mem=* events=r cmd=client*} + } {*addr=*:* fd=* age=* idle=* flags=N db=9 sub=0 psub=0 multi=-1 qbuf=0 qbuf-free=* argv-mem=* obl=0 oll=0 omem=0 tot-mem=* events=r cmd=client*} test {MONITOR can log executed commands} { set rd [redis_deferring_client] From c43bb8e8b5d447a936d7e14bc835f579beb0aee6 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 12 Feb 2021 19:38:57 +0000 Subject: [PATCH 32/40] Fix deadlock in storage prefetch Former-commit-id: 9e6162de08248f7726b97b73aee2f23808ff4e7b --- src/db.cpp | 6 +++--- src/networking.cpp | 2 +- src/server.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index 14c292b6f..ee79536d6 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -2959,7 +2959,7 @@ int dbnumFromDb(redisDb *db) serverPanic("invalid database pointer"); } -void redisDbPersistentData::prefetchKeysAsync(parsed_command &command) +void redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command) { if (m_spstorage == nullptr) return; @@ -2967,7 +2967,7 @@ void redisDbPersistentData::prefetchKeysAsync(parsed_command &command) AeLocker lock; std::vector veckeys; - lock.arm(nullptr); + lock.arm(c); getKeysResult result = GETKEYS_RESULT_INIT; auto cmd = lookupCommand(szFromObj(command.argv[0])); int numkeys = getKeysFromCommand(cmd, command.argv, command.argc, &result); @@ -3000,7 +3000,7 @@ void redisDbPersistentData::prefetchKeysAsync(parsed_command &command) vecInserts.emplace_back(sharedKey, o, std::move(spexpire)); } - lock.arm(nullptr); + lock.arm(c); for (auto &tuple : vecInserts) { sds sharedKey = std::get<0>(tuple); diff --git a/src/networking.cpp b/src/networking.cpp index be50373c9..15aa6f43a 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -2362,7 +2362,7 @@ void parseClientCommandBuffer(client *c) { if (cqueries < c->vecqueuedcmd.size() && g_pserver->m_pstorageFactory != nullptr && !GlobalLocksAcquired()) { auto &query = 
From c43bb8e8b5d447a936d7e14bc835f579beb0aee6 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Fri, 12 Feb 2021 19:38:57 +0000
Subject: [PATCH 32/40] Fix deadlock in storage prefetch

Former-commit-id: 9e6162de08248f7726b97b73aee2f23808ff4e7b
---
 src/db.cpp         | 6 +++---
 src/networking.cpp | 2 +-
 src/server.h       | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/db.cpp b/src/db.cpp
index 14c292b6f..ee79536d6 100644
--- a/src/db.cpp
+++ b/src/db.cpp
@@ -2959,7 +2959,7 @@ int dbnumFromDb(redisDb *db)
     serverPanic("invalid database pointer");
 }
 
-void redisDbPersistentData::prefetchKeysAsync(parsed_command &command)
+void redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command)
 {
     if (m_spstorage == nullptr)
         return;
@@ -2967,7 +2967,7 @@ void redisDbPersistentData::prefetchKeysAsync(parsed_command &command)
 
     AeLocker lock;
     std::vector veckeys;
-    lock.arm(nullptr);
+    lock.arm(c);
     getKeysResult result = GETKEYS_RESULT_INIT;
     auto cmd = lookupCommand(szFromObj(command.argv[0]));
     int numkeys = getKeysFromCommand(cmd, command.argv, command.argc, &result);
@@ -3000,7 +3000,7 @@ void redisDbPersistentData::prefetchKeysAsync(parsed_command &command)
         vecInserts.emplace_back(sharedKey, o, std::move(spexpire));
     }
 
-    lock.arm(nullptr);
+    lock.arm(c);
     for (auto &tuple : vecInserts) {
         sds sharedKey = std::get<0>(tuple);
diff --git a/src/networking.cpp b/src/networking.cpp
index be50373c9..15aa6f43a 100644
--- a/src/networking.cpp
+++ b/src/networking.cpp
@@ -2362,7 +2362,7 @@ void parseClientCommandBuffer(client *c) {
     if (cqueries < c->vecqueuedcmd.size() && g_pserver->m_pstorageFactory != nullptr && !GlobalLocksAcquired()) {
         auto &query = c->vecqueuedcmd.back();
         if (query.argc > 0 && query.argc == query.argcMax) {
-            c->db->prefetchKeysAsync(query);
+            c->db->prefetchKeysAsync(c, query);
         }
     }
     c->reqtype = 0;
diff --git a/src/server.h b/src/server.h
index 7d2611c37..941770450 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1136,7 +1136,7 @@ public:
     bool removeCachedValue(const char *key);
     void removeAllCachedValues();
 
-    void prefetchKeysAsync(struct parsed_command &command);
+    void prefetchKeysAsync(client *c, struct parsed_command &command);
 
     bool FSnapshot() const { return m_spdbSnapshotHOLDER != nullptr; }
 
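The whole deadlock fix is in the lock.arm(nullptr) to lock.arm(c) change: the prefetch path runs while the calling thread already holds the client's fastlock, so taking the global AE lock without telling the locker about the held client lock inverts the acquisition order other threads use. A minimal model of the protocol such a locker can follow; the mutexes and names here are illustrative, not the KeyDB AeLocker implementation:

    #include <mutex>

    std::mutex global_lock;        // stands in for the AE/global lock

    struct Client {
        std::mutex lock;           // stands in for the per-client fastlock
    };

    // arm(nullptr) while holding c->lock would be inner-then-outer acquisition,
    // which deadlocks against any thread acquiring outer-then-inner. Passing
    // the client lets the locker restore a single global order: drop the inner
    // lock, take the outer lock, then retake the inner lock.
    void arm(Client *c) {
        if (c != nullptr) {
            c->lock.unlock();      // caller held this on entry
            global_lock.lock();
            c->lock.lock();        // order is now global -> client everywhere
        } else {
            global_lock.lock();
        }
    }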
From 3ac42ec80c3c3bb4b8b61e17c3fda338abb0ed40 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Tue, 16 Feb 2021 21:07:43 +0000
Subject: [PATCH 33/40] Disable async rehash with single threads; it causes slowdowns because the rehash never completes

Former-commit-id: 5d08dbdf76c0fd1e0cfcf86b97ef3e656f0e4f5d
---
 src/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/server.cpp b/src/server.cpp
index eda89bc0e..a006d4da0 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -1950,7 +1950,7 @@ void databasesCron(bool fMainThread) {
             ::dict *dict = g_pserver->db[rehash_db]->dictUnsafeKeyOnly();
             /* Are we async rehashing? And if so is it time to re-calibrate? */
             /* The recalibration limit is a prime number to ensure balancing across threads */
-            if (rehashes_per_ms > 0 && async_rehashes < 131 && !cserver.active_defrag_enabled) {
+            if (rehashes_per_ms > 0 && async_rehashes < 131 && !cserver.active_defrag_enabled && cserver.cthreads > 1) {
                 serverTL->rehashCtl = dictRehashAsyncStart(dict, rehashes_per_ms);
                 ++async_rehashes;
             }

From 87fc0184154eaa8a0bd8426f3fbddd33c71dea04 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Fri, 19 Feb 2021 05:17:41 +0000
Subject: [PATCH 34/40] Implement maxstorage config

Former-commit-id: 79e07859dfec14edf2a39646013e1a82db033d4f
---
 src/config.cpp | 1 +
 src/evict.cpp  | 5 +++++
 src/server.h   | 1 +
 3 files changed, 7 insertions(+)

diff --git a/src/config.cpp b/src/config.cpp
index a5efb2f10..ca9181788 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -2619,6 +2619,7 @@ standardConfig configs[] = {
 
     /* Unsigned Long Long configs */
     createULongLongConfig("maxmemory", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, g_pserver->maxmemory, 0, MEMORY_CONFIG, NULL, updateMaxmemory),
+    createULongLongConfig("maxstorage", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, g_pserver->maxstorage, 0, MEMORY_CONFIG, NULL, NULL),
 
     /* Size_t configs */
     createSizeTConfig("hash-max-ziplist-entries", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, g_pserver->hash_max_ziplist_entries, 512, INTEGER_CONFIG, NULL, NULL),
diff --git a/src/evict.cpp b/src/evict.cpp
index d4b0c1f12..31cadeae5 100644
--- a/src/evict.cpp
+++ b/src/evict.cpp
@@ -500,6 +500,9 @@ int freeMemoryIfNeeded(bool fQuickCycle, bool fPreSnapshot) {
     int ckeysFailed = 0;
     int keys_freed = 0;
 
+    if (g_pserver->maxstorage && g_pserver->m_pstorageFactory != nullptr && g_pserver->m_pstorageFactory->totalDiskspaceUsed() >= g_pserver->maxstorage)
+        goto cant_free_storage;
+
     /* When clients are paused the dataset should be static not just from the
      * POV of clients not being able to write, but also from the POV of
      * expires and evictions of keys not being performed. */
@@ -726,8 +729,10 @@ cant_free:
         latencyEndMonitor(lazyfree_latency);
         latencyAddSampleIfNeeded("eviction-lazyfree",lazyfree_latency);
     }
+
     latencyEndMonitor(latency);
     latencyAddSampleIfNeeded("eviction-cycle",latency);
+cant_free_storage:
     return result;
 }
 
diff --git a/src/server.h b/src/server.h
index 941770450..6fa6807c7 100644
--- a/src/server.h
+++ b/src/server.h
@@ -2230,6 +2230,7 @@ struct redisServer {
     /* Limits */
     unsigned int maxclients;        /* Max number of simultaneous clients */
     unsigned long long maxmemory;   /* Max number of memory bytes to use */
+    unsigned long long maxstorage;  /* Max number of bytes to use in a storage provider */
     int maxmemory_policy;           /* Policy for key eviction */
     int maxmemory_samples;          /* Precision of random sampling */
     int lfu_log_factor;             /* LFU logarithmic counter factor. */
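maxstorage mirrors maxmemory (0 means unlimited) but caps the disk footprint of a storage provider; the interesting part of the patch is the early-out in freeMemoryIfNeeded, which skips past both the eviction loop and its latency sampling once the provider reports the cap is hit. A compressed sketch of that control flow, with stand-in types for the server state:

    #include <cstdint>

    struct StorageFactory {
        uint64_t diskUsed = 0;
        uint64_t totalDiskspaceUsed() const { return diskUsed; }
    };

    struct Server {
        uint64_t maxstorage = 0;                // 0 = unlimited, like maxmemory
        StorageFactory *storageFactory = nullptr;
    };

    int freeMemoryIfNeeded(Server *srv) {
        int result = -1;                        // C_ERR until eviction succeeds

        // The new gate: a full storage provider means nothing below can help,
        // so jump past the eviction loop and the eviction latency bookkeeping,
        // which only applies to memory eviction (hence the dedicated label).
        if (srv->maxstorage && srv->storageFactory != nullptr &&
            srv->storageFactory->totalDiskspaceUsed() >= srv->maxstorage)
            goto cant_free_storage;

        // ... sample keys, evict, record "eviction-cycle" latency ...
        result = 0;                             // C_OK

    cant_free_storage:
        return result;
    }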
From ca13fda90fad7c68d2153b2ddc9d14e46f72f48c Mon Sep 17 00:00:00 2001
From: John Sully
Date: Mon, 22 Feb 2021 09:14:24 +0000
Subject: [PATCH 35/40] Fix issue where finding random keys is slow due to not shrinking the hash table.

Former-commit-id: fd05010cdcf9d6a6187ca2e18bc55adbaa680a02
---
 src/dict.cpp        | 19 +++++++++++++------
 src/dict.h          |  2 +-
 src/replication.cpp |  1 +
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/src/dict.cpp b/src/dict.cpp
index 819921dbb..0bf7e6c36 100644
--- a/src/dict.cpp
+++ b/src/dict.cpp
@@ -142,15 +142,15 @@ int dictResize(dict *d)
     minimal = d->ht[0].used;
     if (minimal < DICT_HT_INITIAL_SIZE)
         minimal = DICT_HT_INITIAL_SIZE;
-    return dictExpand(d, minimal);
+    return dictExpand(d, minimal, false /*fShrink*/);
 }
 
 /* Expand or create the hash table */
-int dictExpand(dict *d, unsigned long size)
+int dictExpand(dict *d, unsigned long size, bool fShrink)
 {
     /* the size is invalid if it is smaller than the number of
      * elements already inside the hash table */
-    if (dictIsRehashing(d) || d->ht[0].used > size)
+    if (dictIsRehashing(d) || (d->ht[0].used > size && !fShrink) || size == 0)
         return DICT_ERR;
 
     dictht n; /* the new hash table */
@@ -264,7 +264,7 @@ int dictMerge(dict *dst, dict *src)
         return DICT_OK;
     }
 
-    dictExpand(dst, dictSize(dst)+dictSize(src)); // start dst rehashing if necessary
+    dictExpand(dst, dictSize(dst)+dictSize(src), false /* fShrink */); // start dst rehashing if necessary
     auto &htDst = dictIsRehashing(dst) ? dst->ht[1] : dst->ht[0];
     for (int iht = 0; iht < 2; ++iht)
     {
@@ -683,6 +683,8 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) {
         }
         if (!dictIsRehashing(d)) break;
     }
+
+    _dictExpandIfNeeded(d);
     return NULL; /* not found */
 }
 
@@ -1269,7 +1271,7 @@ static int _dictExpandIfNeeded(dict *d)
     if (dictIsRehashing(d)) return DICT_OK;
 
     /* If the hash table is empty expand it to the initial size. */
-    if (d->ht[0].size == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE);
+    if (d->ht[0].size == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE, false /*fShrink*/);
 
     /* If we reached the 1:1 ratio, and we are allowed to resize the hash
      * table (global setting) or we should avoid it but the ratio between
      * elements/buckets is over the "safe" threshold, we resize doubling
      * the number of buckets. */
@@ -1279,7 +1281,12 @@ static int _dictExpandIfNeeded(dict *d)
         (dict_can_resize ||
          d->ht[0].used/d->ht[0].size > dict_force_resize_ratio))
     {
-        return dictExpand(d, d->ht[0].used*2);
+        return dictExpand(d, d->ht[0].used*2, false /*fShrink*/);
+    }
+    else if (d->ht[0].used > 0 && d->ht[0].used * 16 < d->ht[0].size && dict_can_resize)
+    {
+        // If the dictionary has shrunk a lot we'll need to shrink the hash table instead
+        return dictExpand(d, d->ht[0].used*2, true /*fShrink*/);
     }
     return DICT_OK;
 }
 
diff --git a/src/dict.h b/src/dict.h
index 0be8e1c53..54519996b 100644
--- a/src/dict.h
+++ b/src/dict.h
@@ -187,7 +187,7 @@ typedef void (dictScanBucketFunction)(void *privdata, dictEntry **bucketref);
 
 /* API */
 dict *dictCreate(dictType *type, void *privDataPtr);
-int dictExpand(dict *d, unsigned long size);
+int dictExpand(dict *d, unsigned long size, bool fShrink = false);
 int dictAdd(dict *d, void *key, void *val);
 dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing);
 dictEntry *dictAddOrFind(dict *d, void *key);
diff --git a/src/replication.cpp b/src/replication.cpp
index 76c949e4b..a6c13c044 100644
--- a/src/replication.cpp
+++ b/src/replication.cpp
@@ -1902,6 +1902,7 @@ void readSyncBulkPayload(connection *conn) {
     int use_diskless_load = useDisklessLoad();
    const dbBackup *diskless_load_backup = NULL;
     rdbSaveInfo rsi = RDB_SAVE_INFO_INIT;
+    rsi.fForceSetKey = !!g_pserver->fActiveReplica;
     int empty_db_flags = g_pserver->repl_slave_lazy_flush ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS;
     off_t left;
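Commands like RANDOMKEY sample random buckets, so a table that once held millions of keys but now holds a few thousand forces the sampler through long runs of empty slots; the fix adds a shrink arm to the resize policy and re-checks it on the delete path. Distilled to its decision logic, as a model of the policy rather than the dict.cpp source:

    #include <cstddef>

    enum class Resize { None, Grow, Shrink };

    // Grow at a load factor of 1 (or force-grow past the force-resize ratio);
    // shrink once the table is less than 1/16th full. Both arms size the new
    // table from used*2, landing the dictionary at roughly 50% load.
    Resize nextResize(size_t used, size_t size, bool can_resize, size_t force_ratio = 5) {
        if (size == 0)
            return Resize::Grow;                 // empty: create at initial size
        if (used >= size && (can_resize || used / size > force_ratio))
            return Resize::Grow;
        if (used > 0 && used * 16 < size && can_resize)
            return Resize::Shrink;
        return Resize::None;
    }

Calling this from dictGenericDelete is what makes shrinking actually happen: deletes are the only operation that moves a table toward the 1/16 threshold, so the insert-driven check alone would never fire.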
From 53b7b83af6c0a2de0000a8b526b9a1aa240ceb9e Mon Sep 17 00:00:00 2001
From: christian
Date: Fri, 26 Feb 2021 00:50:22 +0000
Subject: [PATCH 36/40] Offload updating cached time to dedicated thread

Former-commit-id: 9bfc8a43952481b5b54a7b051d44b8bece4a18dd
---
 src/blocked.cpp |  1 -
 src/rdb.cpp     |  4 ----
 src/server.cpp  | 27 +++++++++++++++------------
 src/server.h    |  3 ++-
 4 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/src/blocked.cpp b/src/blocked.cpp
index 085090b50..a37dcca58 100644
--- a/src/blocked.cpp
+++ b/src/blocked.cpp
@@ -525,7 +525,6 @@ void handleClientsBlockedOnKeys(void) {
              * lookup, invalidating the first one.
              * See https://github.com/antirez/redis/pull/6554. */
             serverTL->fixed_time_expire++;
-            updateCachedTime(0);
 
             /* Serve clients blocked on the key. */
             robj *o = lookupKeyWrite(rl->db,rl->key);
diff --git a/src/rdb.cpp b/src/rdb.cpp
index 3dad6e989..aefc730c0 100644
--- a/src/rdb.cpp
+++ b/src/rdb.cpp
@@ -2341,10 +2341,6 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) {
         (g_pserver->loading_process_events_interval_keys &&
          (r->keys_since_last_callback >= g_pserver->loading_process_events_interval_keys)))
     {
-        /* The DB can take some non trivial amount of time to load. Update
-         * our cached time since it is used to create and update the last
-         * interaction time with clients and for other important things. */
-        updateCachedTime(0);
         listIter li;
         listNode *ln;
         listRewind(g_pserver->masters, &li);
diff --git a/src/server.cpp b/src/server.cpp
index a006d4da0..f0510c9b6 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -2002,7 +2002,7 @@ void databasesCron(bool fMainThread) {
  * info or not using the 'update_daylight_info' argument. Normally we update
  * such info only when calling this function from serverCron() but not when
  * calling it from call(). */
-void updateCachedTime(int update_daylight_info) {
+void updateCachedTime() {
     long long t = ustime();
     __atomic_store(&g_pserver->ustime, &t, __ATOMIC_RELAXED);
     t /= 1000;
@@ -2015,12 +2015,10 @@ void updateCachedTime(int update_daylight_info) {
      * context is safe since we will never fork() while here, in the main
      * thread. The logging function will call a thread safe version of
      * localtime that has no locks. */
-    if (update_daylight_info) {
-        struct tm tm;
-        time_t ut = g_pserver->unixtime;
-        localtime_r(&ut,&tm);
-        __atomic_store(&g_pserver->daylight_active, &tm.tm_isdst, __ATOMIC_RELAXED);
-    }
+    struct tm tm;
+    time_t ut = g_pserver->unixtime;
+    localtime_r(&ut,&tm);
+    __atomic_store(&g_pserver->daylight_active, &tm.tm_isdst, __ATOMIC_RELAXED);
 }
 
 void checkChildrenDone(void) {
@@ -2172,9 +2170,6 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
      * handler if we don't return here fast enough. */
    if (g_pserver->watchdog_period) watchdogScheduleSignal(g_pserver->watchdog_period);
 
-    /* Update the time cache. */
-    updateCachedTime(1);
-
     /* Unpause clients if enough time has elapsed */
     unpauseClientsIfNecessary();
@@ -2812,7 +2807,7 @@ void initMasterInfo(redisMaster *master)
 void initServerConfig(void) {
     int j;
 
-    updateCachedTime(true);
+    updateCachedTime();
     getRandomHexChars(g_pserver->runid,CONFIG_RUN_ID_SIZE);
     g_pserver->runid[CONFIG_RUN_ID_SIZE] = '\0';
     changeReplicationId();
@@ -3915,7 +3910,6 @@ void call(client *c, int flags) {
 
     /* Call the command. */
     dirty = g_pserver->dirty;
-    updateCachedTime(0);
     incrementMvccTstamp();
     start = g_pserver->ustime;
     try {
@@ -6072,6 +6066,13 @@ void OnTerminate()
     serverPanic("std::teminate() called");
 }
 
+void *timeThreadMain(void*) {
+    while (true) {
+        updateCachedTime();
+        usleep(1);
+    }
+}
+
 void *workerThreadMain(void *parg)
 {
     int iel = (int)((int64_t)parg);
@@ -6419,6 +6420,8 @@ int main(int argc, char **argv) {
     setOOMScoreAdj(-1);
 
     serverAssert(cserver.cthreads > 0 && cserver.cthreads <= MAX_EVENT_LOOPS);
+    pthread_create(&cserver.time_thread_id, nullptr, timeThreadMain, nullptr);
+
     pthread_attr_t tattr;
     pthread_attr_init(&tattr);
     pthread_attr_setstacksize(&tattr, 1 << 23); // 8 MB
diff --git a/src/server.h b/src/server.h
index 6fa6807c7..1ecf838ea 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1920,6 +1920,7 @@ struct redisServerConst {
     pid_t pid;                  /* Main process pid. */
     time_t stat_starttime;      /* Server start time */
     pthread_t main_thread_id;   /* Main thread id */
+    pthread_t time_thread_id;
     char *configfile;           /* Absolute config file path, or NULL */
     char *executable;           /* Absolute executable file path. */
     char **exec_argv;           /* Executable argv vector (copy). */
@@ -2965,7 +2966,7 @@ void populateCommandTable(void);
 void resetCommandTableStats(void);
 void adjustOpenFilesLimit(void);
 void closeListeningSockets(int unlink_unix_socket);
-void updateCachedTime(int update_daylight_info);
+void updateCachedTime();
 void resetServerStats(void);
 void activeDefragCycle(void);
 unsigned int getLRUClock(void);
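With a dedicated thread refreshing the clock, every consumer of the time becomes a plain atomic read, and serverCron()/call() no longer pay for gettimeofday() and localtime_r(). A self-contained sketch of the pattern; the names are illustrative:

    #include <atomic>
    #include <chrono>
    #include <thread>

    std::atomic<long long> cached_ustime{0};   // microseconds, like g_pserver->ustime

    void timeThreadMain() {
        for (;;) {
            auto now = std::chrono::duration_cast<std::chrono::microseconds>(
                std::chrono::system_clock::now().time_since_epoch()).count();
            cached_ustime.store(now, std::memory_order_relaxed);
            std::this_thread::sleep_for(std::chrono::microseconds(1)); // ~usleep(1)
        }
    }

    // Hot paths read the cache instead of making a syscall.
    long long ustimeCached() {
        return cached_ustime.load(std::memory_order_relaxed);
    }

    int main() {
        std::thread(timeThreadMain).detach();  // the dedicated clock thread
        long long t = ustimeCached();          // cheap, and at most ~1us stale
        (void)t;
    }

The trade is explicit: one thread spins at usleep(1) granularity, and readers accept a timestamp up to a microsecond or so stale, which is ample for interaction timestamps and command durations.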
From 4f632962c17740f9aebe9e853c1c7b369640eb3a Mon Sep 17 00:00:00 2001
From: John Sully
Date: Fri, 26 Feb 2021 01:03:10 +0000
Subject: [PATCH 37/40] eliminate syscall in call()

Former-commit-id: 3ee111a2e50bc29818ba85ae4fb786171d695c37
---
 src/server.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/server.cpp b/src/server.cpp
index f0510c9b6..4dcca26bb 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -3911,6 +3911,7 @@ void call(client *c, int flags) {
     /* Call the command. */
     dirty = g_pserver->dirty;
     incrementMvccTstamp();
+    __atomic_load(&g_pserver->ustime, &start, __ATOMIC_SEQ_CST);
     start = g_pserver->ustime;
     try {
         c->cmd->proc(c);
@@ -3922,7 +3923,9 @@ void call(client *c, int flags) {
         addReplyError(c, sz);
     }
     serverTL->commandsExecuted++;
-    duration = ustime()-start;
+    ustime_t end;
+    __atomic_load(&g_pserver->ustime, &end, __ATOMIC_SEQ_CST);
+    duration = end-start;
     dirty = g_pserver->dirty-dirty;
     if (dirty < 0) dirty = 0;
 

From 6ee9d69763f337e402b5c5dd3d3e985e4a53f002 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Fri, 26 Feb 2021 01:28:05 +0000
Subject: [PATCH 38/40] Eliminate the need for an mfence by tricking the CPU into ordering the futex read

Former-commit-id: 340e6f5bc94cd1c3b0c6fb6da833e8504acaf23a
---
 src/fastlock_x64.asm | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/fastlock_x64.asm b/src/fastlock_x64.asm
index 9e17f8995..3fecfa439 100644
--- a/src/fastlock_x64.asm
+++ b/src/fastlock_x64.asm
@@ -190,10 +190,11 @@ fastlock_unlock:
     mov dword ptr [rdi], -1       # pidOwner = -1 (we don't own it anymore)
     mov esi, [rdi+64]             # get current active (this one)
     inc esi                       # bump it to the next thread
+    sfence                        # ensure whatever was written in the lock is visible
     mov word ptr [rdi+64], si     # give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
-    mfence                        # sync other threads
     # At this point the lock is removed, however we must wake up any pending futexs
-    mov edx, [rdi+64+4]           # load the futex mask
+    mov rdx, [rdi+64]             # load the futex mask, note we intentionally also read the ticket we just wrote to ensure this is ordered with the above mov
+    shr rdx, 32                   # isolate the mask
     bt edx, esi                   # is the next thread waiting on a futex?
     jc unlock_futex               # unlock the futex if necessary
     ret                           # if not we're done.
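The asm change above leans on an x86 guarantee: a load is never reordered ahead of an earlier store to an overlapping address. Because the active ticket and the futex wait mask share one 8-byte word, the 64-bit mask load necessarily completes after the 16-bit ticket store, so the full mfence can go. A C++ model of the idea; the layout and names are illustrative, and the authoritative version is the assembly in the patch (ticket wrap-around is ignored here):

    #include <atomic>
    #include <cstdint>

    // Bits 0-15: active ticket. Bits 32-63: mask of tickets with sleeping waiters.
    std::atomic<uint64_t> ticket_and_mask{0};

    // Release the lock by bumping the ticket (the asm instead does a plain
    // 16-bit store, relying on the spinlock to serialize ticket writers), then
    // read the whole word again. Because that load overlaps the location just
    // written, it cannot be satisfied from a value predating our release, so
    // it sees any waiter bit set by a thread that observed the old ticket.
    bool unlockNeedsFutexWake() {
        uint64_t prev = ticket_and_mask.fetch_add(1, std::memory_order_release);
        uint16_t next = uint16_t(prev) + 1;
        uint32_t mask = uint32_t(ticket_and_mask.load(std::memory_order_relaxed) >> 32);
        return (mask >> (next & 31)) & 1u;     // mirrors the asm's `bt edx, esi`
    }

The C++ memory model gives this same-word coherence for free; the point of the asm trick is cost, since the overlapping load is at worst a store-forwarding stall where mfence was a full pipeline drain.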
From bbb310f419cd4afd79d7c7906762b3c9653d0377 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Fri, 26 Feb 2021 05:40:56 +0000
Subject: [PATCH 39/40] Eliminate needless lock

Former-commit-id: 60f972d463f202edb33ff9a25bc2bd3e2558105c
---
 src/db.cpp     | 4 ++--
 src/server.cpp | 2 --
 src/server.h   | 4 ++--
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/db.cpp b/src/db.cpp
index ee79536d6..20e8ac014 100644
--- a/src/db.cpp
+++ b/src/db.cpp
@@ -2834,9 +2834,9 @@ bool redisDbPersistentData::removeCachedValue(const char *key)
 
 void redisDbPersistentData::trackChanges(bool fBulk)
 {
-    m_fTrackingChanges++;
+    m_fTrackingChanges.fetch_add(1, std::memory_order_relaxed);
     if (fBulk)
-        m_fAllChanged++;
+        m_fAllChanged.fetch_add(1, std::memory_order_acq_rel);
 }
 
 void redisDbPersistentData::removeAllCachedValues()
diff --git a/src/server.cpp b/src/server.cpp
index 4dcca26bb..1bc9ab175 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -2650,10 +2650,8 @@ void afterSleep(struct aeEventLoop *eventLoop) {
             serverAssert(serverTL->gcEpoch.isReset());
             serverTL->gcEpoch = g_pserver->garbageCollector.startEpoch();
 
-            aeAcquireLock();
             for (int idb = 0; idb < cserver.dbnum; ++idb)
                 g_pserver->db[idb]->trackChanges(false);
-            aeReleaseLock();
         }
 }
 
diff --git a/src/server.h b/src/server.h
index 1ecf838ea..8e29ac196 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1172,8 +1172,8 @@ private:
     // Keyspace
     dict *m_pdict = nullptr;            /* The keyspace for this DB */
     dict *m_pdictTombstone = nullptr;   /* Track deletes when we have a snapshot */
-    int m_fTrackingChanges = 0;     // Note: Stack based
-    int m_fAllChanged = 0;
+    std::atomic<int> m_fTrackingChanges {0};    // Note: Stack based
+    std::atomic<int> m_fAllChanged {0};
     std::set m_setchanged;
     size_t m_cnewKeysPending = 0;
     std::shared_ptr m_spstorage = nullptr;

From 877ad1e4b25fdca623c53ce80d129ecea4a41967 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Fri, 26 Feb 2021 06:06:58 +0000
Subject: [PATCH 40/40] Don't complain about unclean shutdowns with an empty database

Former-commit-id: 99f5c02e87062552eaa2f26e960eb7c9dd977c84
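With the counters made atomic, afterSleep() can tick trackChanges() on every database without first serializing on the global AE lock, which is the entire content of patch 39. The shape of the change in isolation, with the field types assumed per the diff; relaxed ordering suffices for the plain counter, while the acq_rel on the bulk flag mirrors the patch, which wants that flag to order against the snapshot machinery that consumes it:

    #include <atomic>

    struct PersistentData {
        std::atomic<int> trackingChanges{0};   // was: int, guarded by the AE lock
        std::atomic<int> allChanged{0};

        void trackChanges(bool fBulk) {
            trackingChanges.fetch_add(1, std::memory_order_relaxed);
            if (fBulk)
                allChanged.fetch_add(1, std::memory_order_acq_rel);
        }
    };

    // afterSleep() can now do this from any worker thread, lock-free:
    //     for (int idb = 0; idb < dbnum; ++idb)
    //         db[idb]->trackChanges(false);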