From 543d92d0ff2acdfe24d16ff53254f06d96ddb4eb Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 31 Jan 2021 02:30:28 +0000 Subject: [PATCH 01/40] Fix crash on shutdown command Former-commit-id: d72b5fa16df0c41dd62b8b6d25c1c2c9cce8b413 --- src/networking.cpp | 2 ++ src/server.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/networking.cpp b/src/networking.cpp index d33be6923..f1ec92bbe 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -3295,6 +3295,8 @@ void flushSlavesOutputBuffers(void) { listIter li; listNode *ln; + flushReplBacklogToClients(); + listRewind(g_pserver->slaves,&li); while((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); diff --git a/src/server.cpp b/src/server.cpp index 554c0ac43..8a7184398 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -4203,6 +4203,8 @@ int prepareForShutdown(int flags) { /* Best effort flush of replica output buffers, so that we hopefully * send them pending writes. */ flushSlavesOutputBuffers(); + g_pserver->repl_batch_idxStart = -1; + g_pserver->repl_batch_offStart = -1; /* Close the listening sockets. Apparently this allows faster restarts. */ closeListeningSockets(1); From 516eee7a1a586a0e03abbc7b70eb13b89bf1c97a Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 31 Jan 2021 02:36:30 +0000 Subject: [PATCH 02/40] Fix CI failure due to missing package Former-commit-id: cf46a014c112bbba0d147dbc8489e3054df9a1ef --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c869b7bb3..cb41880c9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ jobs: run: ./utils/gen-test-certs.sh - name: test-tls run: | - sudo apt-get -y install tcl8.5 tcl-tls + sudo apt-get -y install tcl tcl-tls ./runtest --clients 2 --verbose --tls - name: cluster-test run: | From 7766b370626fdb712f34d880a964632ac9c2bb6a Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 31 Jan 2021 03:40:57 +0000 Subject: [PATCH 03/40] Fix test failure due to bad merge Former-commit-id: f6fb0e462001c49af185682caed8881ccd6d36f3 --- src/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server.cpp b/src/server.cpp index 8a7184398..624612f89 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2757,7 +2757,7 @@ int setOOMScoreAdj(int process_class) { int val; char buf[64]; - val = g_pserver->oom_score_adj_base + g_pserver->oom_score_adj_values[process_class]; + val = g_pserver->oom_score_adj_values[process_class]; if (g_pserver->oom_score_adj == OOM_SCORE_RELATIVE) val += g_pserver->oom_score_adj_base; if (val > 1000) val = 1000; From d4408580b5d8bc7b15ecebc07a9b7865ed35430c Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 31 Jan 2021 04:33:27 +0000 Subject: [PATCH 04/40] Improve multithreaded test reliability Former-commit-id: effa53339cf272ced8f0661bc275628f87be5c73 --- tests/unit/maxmemory.tcl | 4 ++-- tests/unit/obuf-limits.tcl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl index 741281cfc..414733d1e 100644 --- a/tests/unit/maxmemory.tcl +++ b/tests/unit/maxmemory.tcl @@ -246,7 +246,7 @@ test_slave_buffers {slave buffer are counted correctly} 1000000 10 0 1 # test again with fewer (and bigger) commands without pipeline, but with eviction test_slave_buffers "replica buffer don't induce eviction" 100000 100 1 0 -start_server {tags {"maxmemory"}} { +start_server {tags {"maxmemory"} overrides {server-threads 1}} { test 
{client tracking don't cause eviction feedback loop} { r config set maxmemory 0 r config set maxmemory-policy allkeys-lru @@ -308,4 +308,4 @@ start_server {tags {"maxmemory"}} { if {$::verbose} { puts "evicted: $evicted" } } } -}; #run_solo \ No newline at end of file +}; #run_solo diff --git a/tests/unit/obuf-limits.tcl b/tests/unit/obuf-limits.tcl index 456d3ac82..1fbd29ff8 100644 --- a/tests/unit/obuf-limits.tcl +++ b/tests/unit/obuf-limits.tcl @@ -1,4 +1,4 @@ -start_server {tags {"obuf-limits"}} { +start_server {tags {"obuf-limits"} overrides { server-threads 1 }} { test {Client output buffer hard limit is enforced} { r config set client-output-buffer-limit {pubsub 100000 0 0} set rd1 [redis_deferring_client] From 19dc63bd3afd47edd3e1000a61aaff1be9dc888a Mon Sep 17 00:00:00 2001 From: christianEQ Date: Thu, 14 Jan 2021 21:40:17 +0000 Subject: [PATCH 05/40] added severity levels for afterErrorReply Former-commit-id: fe0f07199353abf6668cd66cd2e21751db5c21d9 --- src/networking.cpp | 33 +++++++++++++++++++++++++++------ src/server.cpp | 9 +++++---- src/server.h | 8 +++++++- 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index f1ec92bbe..dc4e56e92 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -494,7 +494,7 @@ void addReplyErrorLength(client *c, const char *s, size_t len) { } /* Do some actions after an error reply was sent (Log if needed, updates stats, etc.) */ -void afterErrorReply(client *c, const char *s, size_t len) { +void afterErrorReply(client *c, const char *s, size_t len, int severity = ERR_CRITICAL) { /* Sometimes it could be normal that a replica replies to a master with * an error and this function gets called. Actually the error will never * be sent because addReply*() against master clients has no effect... @@ -522,9 +522,30 @@ void afterErrorReply(client *c, const char *s, size_t len) { if (len > 4096) len = 4096; const char *cmdname = c->lastcmd ? c->lastcmd->name : ""; - serverLog(LL_WARNING,"== CRITICAL == This %s is sending an error " - "to its %s: '%.*s' after processing the command " - "'%s'", from, to, (int)len, s, cmdname); + switch (severity) { + case ERR_NOTICE: + serverLog(LL_NOTICE,"== NOTICE == This %s is rejecting a command " + "from its %s: '%.*s' after processing the command " + "'%s'", from, to, (int)len, s, cmdname); + break; + case ERR_WARNING: + serverLog(LL_WARNING,"== WARNING == This %s is rejecting a command " + "from its %s: '%.*s' after processing the command " + "'%s'", from, to, (int)len, s, cmdname); + break; + case ERR_ERROR: + serverLog(LL_WARNING,"== ERROR == This %s is sending an error " + "to its %s: '%.*s' after processing the command " + "'%s'", from, to, (int)len, s, cmdname); + break; + case ERR_CRITICAL: + default: + serverLog(LL_WARNING,"== CRITICAL == This %s is sending an error " + "to its %s: '%.*s' after processing the command " + "'%s'", from, to, (int)len, s, cmdname); + break; + } + if (ctype == CLIENT_TYPE_MASTER && g_pserver->repl_backlog && g_pserver->repl_backlog_histlen > 0) { @@ -536,9 +557,9 @@ void afterErrorReply(client *c, const char *s, size_t len) { /* The 'err' object is expected to start with -ERRORCODE and end with \r\n. * Unlike addReplyErrorSds and others alike which rely on addReplyErrorLength. 
*/ -void addReplyErrorObject(client *c, robj *err) { +void addReplyErrorObject(client *c, robj *err, int severity) { addReply(c, err); - afterErrorReply(c, szFromObj(err), sdslen(szFromObj(err))-2); /* Ignore trailing \r\n */ + afterErrorReply(c, szFromObj(err), sdslen(szFromObj(err))-2, severity); /* Ignore trailing \r\n */ } void addReplyError(client *c, const char *err) { diff --git a/src/server.cpp b/src/server.cpp index 624612f89..0b0fdc877 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -3773,13 +3773,14 @@ void call(client *c, int flags) { * If there's a transaction is flags it as dirty, and if the command is EXEC, * it aborts the transaction. * Note: 'reply' is expected to end with \r\n */ -void rejectCommand(client *c, robj *reply) { +void rejectCommand(client *c, robj *reply, int severity = ERR_CRITICAL) { flagTransaction(c); if (c->cmd && c->cmd->proc == execCommand) { execCommandAbort(c, szFromObj(reply)); - } else { + } + else { /* using addReplyError* rather than addReply so that the error can be logged. */ - addReplyErrorObject(c, reply); + addReplyErrorObject(c, reply, severity); } } @@ -4024,7 +4025,7 @@ int processCommand(client *c, int callFlags) { /* Active Replicas can execute read only commands, and optionally write commands */ if (!(g_pserver->loading == LOADING_REPLICATION && g_pserver->fActiveReplica && ((c->cmd->flags & CMD_READONLY) || g_pserver->fWriteDuringActiveLoad))) { - rejectCommand(c, shared.loadingerr); + rejectCommand(c, shared.loadingerr, ERR_WARNING); return C_OK; } } diff --git a/src/server.h b/src/server.h index c6b9a77b5..0da8b84f3 100644 --- a/src/server.h +++ b/src/server.h @@ -543,6 +543,12 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; #define LL_WARNING 3 #define LL_RAW (1<<10) /* Modifier to log without timestamp */ +/* Error severity levels */ +#define ERR_CRITICAL 0 +#define ERR_ERROR 1 +#define ERR_WARNING 2 +#define ERR_NOTICE 3 + /* Supervision options */ #define SUPERVISED_NONE 0 #define SUPERVISED_AUTODETECT 1 @@ -2056,7 +2062,7 @@ void addReplyBulkLongLong(client *c, long long ll); void addReply(client *c, robj_roptr obj); void addReplySds(client *c, sds s); void addReplyBulkSds(client *c, sds s); -void addReplyErrorObject(client *c, robj *err); +void addReplyErrorObject(client *c, robj *err, int severity); void addReplyErrorSds(client *c, sds err); void addReplyError(client *c, const char *err); void addReplyStatus(client *c, const char *status); From a0ae32a9f07e529c9b44e0fab868d7aecd0c61f6 Mon Sep 17 00:00:00 2001 From: christianEQ Date: Thu, 14 Jan 2021 21:59:34 +0000 Subject: [PATCH 06/40] added exception to allow pings during loading Former-commit-id: 8a557e9d711c3aadbc566cf96ca2a18f10e8d5a4 --- src/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/server.cpp b/src/server.cpp index 0b0fdc877..add31b969 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -3853,8 +3853,8 @@ int processCommand(client *c, int callFlags) { (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_DENYOOM)); int is_denystale_command = !(c->cmd->flags & CMD_STALE) || (c->cmd->proc == execCommand && (c->mstate.cmd_inv_flags & CMD_STALE)); - int is_denyloading_command = !(c->cmd->flags & CMD_LOADING) || - (c->cmd->proc == execCommand && (c->mstate.cmd_inv_flags & CMD_LOADING)); + int is_denyloading_command = strcmp(c->cmd->name, "ping") && (!(c->cmd->flags & CMD_LOADING) || + (c->cmd->proc == execCommand && (c->mstate.cmd_inv_flags & CMD_LOADING))); /* Check if the user is authenticated. 
This check is skipped in case * the default user is flagged as "nopass" and is active. */ From 10a745df65254b0f1f7e58690087057ca9ce00de Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 31 Jan 2021 06:33:13 +0000 Subject: [PATCH 07/40] Reduce intermittent failures by giving tests more time Former-commit-id: 69bc7108bd552b18e5cf4ab6ea354ad5cd1e93e9 --- tests/unit/scripting.tcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/scripting.tcl b/tests/unit/scripting.tcl index e3cd2f87f..fe4c5f2e3 100644 --- a/tests/unit/scripting.tcl +++ b/tests/unit/scripting.tcl @@ -448,7 +448,7 @@ start_server {tags {"scripting"}} { set start [clock clicks -milliseconds] $rd eval {redis.call('set',KEYS[1],'y'); for i=1,1500000 do redis.call('ping') end return 'ok'} 1 x $rd flush - after 100 + after 200 catch {r ping} err assert_match {BUSY*} $err $rd read @@ -457,7 +457,7 @@ start_server {tags {"scripting"}} { set start [clock clicks -milliseconds] $rd debug loadaof $rd flush - after 100 + after 200 catch {r ping} err assert_match {LOADING*} $err $rd read From 68ed1292eeb2e94e527befc6fd01023db27cfd4f Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 31 Jan 2021 07:15:55 +0000 Subject: [PATCH 08/40] Fix test failure caused by allowing pings during load Former-commit-id: 569a3d9ff86f4cd74e269391ce1d582e009147ce --- src/server.cpp | 6 +++--- tests/unit/scripting.tcl | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/server.cpp b/src/server.cpp index add31b969..e5e35857d 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -689,7 +689,7 @@ struct redisCommand redisCommandTable[] = { * failure detection, and a loading server is considered to be * not available. */ {"ping",pingCommand,-1, - "ok-stale fast @connection @replication", + "ok-stale ok-loading fast @connection @replication", 0,NULL,0,0,0,0,0,0}, {"echo",echoCommand,2, @@ -3853,8 +3853,8 @@ int processCommand(client *c, int callFlags) { (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_DENYOOM)); int is_denystale_command = !(c->cmd->flags & CMD_STALE) || (c->cmd->proc == execCommand && (c->mstate.cmd_inv_flags & CMD_STALE)); - int is_denyloading_command = strcmp(c->cmd->name, "ping") && (!(c->cmd->flags & CMD_LOADING) || - (c->cmd->proc == execCommand && (c->mstate.cmd_inv_flags & CMD_LOADING))); + int is_denyloading_command = !(c->cmd->flags & CMD_LOADING) || + (c->cmd->proc == execCommand && (c->mstate.cmd_inv_flags & CMD_LOADING)); /* Check if the user is authenticated. This check is skipped in case * the default user is flagged as "nopass" and is active. 
*/
diff --git a/tests/unit/scripting.tcl b/tests/unit/scripting.tcl
index fe4c5f2e3..b41956f9a 100644
--- a/tests/unit/scripting.tcl
+++ b/tests/unit/scripting.tcl
@@ -449,7 +449,7 @@ start_server {tags {"scripting"}} {
         $rd eval {redis.call('set',KEYS[1],'y'); for i=1,1500000 do redis.call('ping') end return 'ok'} 1 x
         $rd flush
         after 200
-        catch {r ping} err
+        catch {r echo "foo"} err
         assert_match {BUSY*} $err
         $rd read
         set elapsed [expr [clock clicks -milliseconds]-$start]
@@ -458,7 +458,7 @@
         $rd debug loadaof
         $rd flush
         after 200
-        catch {r ping} err
+        catch {r echo "foo"} err
         assert_match {LOADING*} $err
         $rd read
         set elapsed [expr [clock clicks -milliseconds]-$start]

From 84576e9b397b626f0bcaa7e9c233ce74b100f748 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Sun, 31 Jan 2021 21:22:23 +0000
Subject: [PATCH 09/40] ARM build fix

Former-commit-id: 5832c25ad1ae3fbe12ee245a96799b4b1a75e4b1
---
 src/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/server.cpp b/src/server.cpp
index e5e35857d..aa142b527 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -5180,7 +5180,7 @@ int linuxMadvFreeForkBugCheck(void) {
     const long map_size = 3 * 4096;
 
     /* Create a memory map that's in our full control (not one used by the allocator). */
-    p = mmap(NULL, map_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+    p = (char*)mmap(NULL, map_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
     serverAssert(p != MAP_FAILED);
 
     q = p + 4096;
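
The "ARM build fix" above is not ARM-specific as such: presumably this madvise fork-bug
check is only compiled into the ARM build, and the real error is a C-vs-C++ rule.
mmap() returns void *, and C++ (unlike C) does not implicitly convert void * to other
object-pointer types. A minimal standalone sketch of the rule (illustrative only, not
KeyDB code):

    #include <sys/mman.h>

    char *map_page(size_t cb) {
        void *pv = mmap(NULL, cb, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
        // char *p = pv;       // accepted by a C compiler, a compile error in C++
        char *p = (char*)pv;   // the explicit cast the patch adds
        return p;
    }
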
From dd0b8af2c569d76fbf03cb1b5b4266a82a66598e Mon Sep 17 00:00:00 2001
From: VivekSainiEQ
Date: Fri, 8 Jan 2021 17:33:14 +0000
Subject: [PATCH 10/40] removed synchronous calls to aePostFunction and changed
 scope of g_fModuleThread in order to prevent module related deadlocks, issue
 #214

Former-commit-id: 3b8d1f7076e4ab2082cd0768abc7b0b6ed4f951a
---
 src/ae.cpp     | 19 ++-----------------
 src/ae.h       |  2 +-
 src/expire.cpp |  2 +-
 src/module.cpp | 19 ++++++++-----------
 src/server.cpp |  2 +-
 5 files changed, 13 insertions(+), 31 deletions(-)

diff --git a/src/ae.cpp b/src/ae.cpp
index 125179c89..4f26df7b3 100644
--- a/src/ae.cpp
+++ b/src/ae.cpp
@@ -286,7 +286,7 @@ int aePostFunction(aeEventLoop *eventLoop, aePostFunctionProc *proc, void *arg)
     return AE_OK;
 }
 
-int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fSynchronous, bool fLock, bool fForceQueue)
+int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fLock, bool fForceQueue)
 {
     if (eventLoop == g_eventLoopThisThread && !fForceQueue)
     {
@@ -299,11 +299,6 @@ int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fSynch
     cmd.pfn = new (MALLOC_LOCAL) std::function<void()>(fn);
     cmd.pctl = nullptr;
     cmd.fLock = fLock;
-    if (fSynchronous)
-    {
-        cmd.pctl = new (MALLOC_LOCAL) aeCommandControl;
-        cmd.pctl->mutexcv.lock();
-    }
     auto size = write(eventLoop->fdCmdWrite, &cmd, sizeof(cmd));
     if (!(!size || size == sizeof(cmd)))
     {
@@ -314,17 +309,7 @@ int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fSynch
     if (size == 0)
         return AE_ERR;
 
-    int ret = AE_OK;
-    if (fSynchronous)
-    {
-        {
-            std::unique_lock<std::mutex> ulock(cmd.pctl->mutexcv, std::adopt_lock);
-            cmd.pctl->cv.wait(ulock);
-            ret = cmd.pctl->rval;
-        }
-        delete cmd.pctl;
-    }
-    return ret;
+    return AE_OK;
 }
 
 aeEventLoop *aeCreateEventLoop(int setsize) {
diff --git a/src/ae.h b/src/ae.h
index e77abb01f..20ef22a5e 100644
--- a/src/ae.h
+++ b/src/ae.h
@@ -135,7 +135,7 @@ aeEventLoop *aeCreateEventLoop(int setsize);
 int aePostFunction(aeEventLoop *eventLoop, aePostFunctionProc *proc, void *arg);
 #ifdef __cplusplus
 } // EXTERN C
-int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fSynchronous = false, bool fLock = true, bool fForceQueue = false);
+int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fLock = true, bool fForceQueue = false);
 extern "C" {
 #endif
 void aeDeleteEventLoop(aeEventLoop *eventLoop);
diff --git a/src/expire.cpp b/src/expire.cpp
index 46f52c19c..1d14f7152 100644
--- a/src/expire.cpp
+++ b/src/expire.cpp
@@ -136,7 +136,7 @@ void activeExpireCycleExpire(redisDb *db, expireEntry &e, long long now) {
             executeCronJobExpireHook(keyCopy, val);
             sdsfree(keyCopy);
             decrRefCount(val);
-        }, false, true /*fLock*/, true /*fForceQueue*/);
+        }, true /*fLock*/, true /*fForceQueue*/);
     }
     return;
diff --git a/src/module.cpp b/src/module.cpp
index c99ab4685..594a172e4 100644
--- a/src/module.cpp
+++ b/src/module.cpp
@@ -5045,7 +5045,7 @@ void RM_FreeThreadSafeContext(RedisModuleCtx *ctx) {
     zfree(ctx);
 }
 
-static bool g_fModuleThread = false;
+__thread bool g_fModuleThread = false;
 /* Acquire the server lock before executing a thread safe API call.
  * This is not needed for `RedisModule_Reply*` calls when there is
  * a blocked client connected to the thread safe context. */
@@ -5673,20 +5673,17 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod
     if (memcmp(ri.key,&key,sizeof(key)) == 0) {
         /* This is the first key, we need to re-install the timer according
          * to the just added event. */
-        aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [&]{
+        aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [period]{
             aeDeleteTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimer);
-        }, true /* synchronous */, false /* fLock */);
-        aeTimer = -1;
+            aeTimer = aeCreateTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,period,moduleTimerHandler,NULL,NULL);
+        });
     }
     raxStop(&ri);
-    }
-
-    /* If we have no main timer (the old one was invalidated, or this is the
-     * first module timer we have), install one. */
-    if (aeTimer == -1) {
-        aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [&]{
+    } else {
+        /* If we have no main timer because this is the first module timer we have, install one.
*/
+        aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [period]{
             aeTimer = aeCreateTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,period,moduleTimerHandler,NULL,NULL);
-        }, true /* synchronous */, false /* fLock */);
+        });
     }
 
     return key;
 }
diff --git a/src/server.cpp b/src/server.cpp
index aa142b527..848b28bcf 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -4087,7 +4087,7 @@ bool client::postFunction(std::function<void(client *)> fn, bool fLock) {
         std::lock_guard<decltype(this->lock)> lock(this->lock);
         fn(this);
         --casyncOpsPending;
-    }, false, fLock) == AE_OK;
+    }, fLock) == AE_OK;
 }
 
 /*================================== Shutdown =============================== */

From 2addbe7e4a32cb3756f783693190edbdac16e127 Mon Sep 17 00:00:00 2001
From: VivekSainiEQ
Date: Fri, 8 Jan 2021 17:45:45 +0000
Subject: [PATCH 11/40] Added fix for scenario where module thread waiting for
 s_mutexModule in acquireGIL can deadlock with module thread waiting for
 s_mutex in releaseGIL

Former-commit-id: 3205373bb378f895824cc1936a6bae663b1abdcc
---
 src/module.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/module.cpp b/src/module.cpp
index 594a172e4..f128d10e3 100644
--- a/src/module.cpp
+++ b/src/module.cpp
@@ -5104,7 +5104,14 @@ void moduleAcquireGIL(int fServerThread) {
     }
     else
     {
-        s_mutexModule.lock();
+        // It is possible that another module thread holds the GIL (and s_mutexModule as a result).
+        // When said thread goes to release the GIL, it will wait for s_mutex, which this thread owns.
+        // This thread is however waiting for the GIL (and s_mutexModule) that the other thread owns.
+        // As a result, a deadlock has occurred.
+        // We release the lock on s_mutex and wait until we are able to safely acquire the GIL
+        // in order to prevent this deadlock from occurring.
+        while (!s_mutexModule.try_lock())
+            s_cv.wait(lock);
        ++s_cAcquisitionsModule;
        fModuleGILWlocked++;
     }
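
The deadlock PATCH 11 fixes is a classic lock-order inversion: the releasing thread
holds s_mutexModule and waits for s_mutex, while the acquiring thread holds s_mutex
and waits for s_mutexModule. The fix never blocks on the second mutex while holding
the first; it spins on try_lock and parks on the condition variable, which drops
s_mutex so the other thread can make progress. A minimal sketch of the pattern with
simplified declarations (the real types of s_mutex, s_mutexModule and s_cv are not
part of this diff):

    #include <mutex>
    #include <condition_variable>

    std::mutex s_mutex;                  // held by every thread entering acquire
    std::recursive_mutex s_mutexModule;  // the module GIL (assumed recursive here)
    std::condition_variable s_cv;        // signalled by the GIL release path

    // Caller already owns 'lock' on s_mutex, as in moduleAcquireGIL.
    void acquireModuleGIL(std::unique_lock<std::mutex> &lock) {
        while (!s_mutexModule.try_lock())  // never block on the GIL while holding s_mutex
            s_cv.wait(lock);               // releases s_mutex until the holder notifies
    }
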
From 662037e3a33ff282b22cf398a73dd4f5f3f7d192 Mon Sep 17 00:00:00 2001
From: VivekSainiEQ
Date: Mon, 11 Jan 2021 19:09:51 +0000
Subject: [PATCH 12/40] Removed more uses of fSynchronous and the use of
 condition variable and mutex on the control struct.

Former-commit-id: 6ab08cc3e1429178b26b55ed7aa8ba85240eb766
---
 src/ae.cpp     | 51 +++-----------------------------------------------
 src/module.cpp |  2 +-
 2 files changed, 4 insertions(+), 49 deletions(-)

diff --git a/src/ae.cpp b/src/ae.cpp
index 4f26df7b3..89b36beaa 100644
--- a/src/ae.cpp
+++ b/src/ae.cpp
@@ -109,13 +109,6 @@ enum class AE_ASYNC_OP
     CreateFileEvent,
 };
 
-struct aeCommandControl
-{
-    std::condition_variable cv;
-    std::atomic<int> rval;
-    std::mutex mutexcv;
-};
-
 struct aeCommand
 {
     AE_ASYNC_OP op;
@@ -128,7 +121,6 @@ struct aeCommand
         std::function<void()> *pfn;
     };
     void *clientData;
-    aeCommandControl *pctl;
 };
 
 void aeProcessCmd(aeEventLoop *eventLoop, int fd, void *, int )
@@ -149,19 +141,7 @@ void aeProcessCmd(aeEventLoop *eventLoop, int fd, void *, int )
         break;
 
     case AE_ASYNC_OP::CreateFileEvent:
-    {
-        if (cmd.pctl != nullptr)
-        {
-            cmd.pctl->mutexcv.lock();
-            std::atomic_store(&cmd.pctl->rval, aeCreateFileEvent(eventLoop, cmd.fd, cmd.mask, cmd.fproc, cmd.clientData));
-            cmd.pctl->cv.notify_all();
-            cmd.pctl->mutexcv.unlock();
-        }
-        else
-        {
-            aeCreateFileEvent(eventLoop, cmd.fd, cmd.mask, cmd.fproc, cmd.clientData);
-        }
-    }
+        aeCreateFileEvent(eventLoop, cmd.fd, cmd.mask, cmd.fproc, cmd.clientData);
         break;
 
     case AE_ASYNC_OP::PostFunction:
@@ -175,19 +155,11 @@ void aeProcessCmd(aeEventLoop *eventLoop, int fd, void *, int )
 
     case AE_ASYNC_OP::PostCppFunction:
     {
-        if (cmd.pctl != nullptr)
-            cmd.pctl->mutexcv.lock();
-
         std::unique_lock<decltype(g_lock)> ulock(g_lock, std::defer_lock);
         if (cmd.fLock)
             ulock.lock();
         (*cmd.pfn)();
-
-        if (cmd.pctl != nullptr)
-        {
-            cmd.pctl->cv.notify_all();
-            cmd.pctl->mutexcv.unlock();
-        }
+
         delete cmd.pfn;
     }
         break;
@@ -226,7 +198,7 @@ ssize_t safe_write(int fd, const void *pv, size_t cb)
 }
 
 int aeCreateRemoteFileEvent(aeEventLoop *eventLoop, int fd, int mask,
-        aeFileProc *proc, void *clientData, int fSynchronous)
+        aeFileProc *proc, void *clientData)
 {
     if (eventLoop == g_eventLoopThisThread)
         return aeCreateFileEvent(eventLoop, fd, mask, proc, clientData);
@@ -239,13 +211,7 @@ int aeCreateRemoteFileEvent(aeEventLoop *eventLoop, int fd, int mask,
     cmd.mask = mask;
     cmd.fproc = proc;
     cmd.clientData = clientData;
-    cmd.pctl = nullptr;
     cmd.fLock = true;
-    if (fSynchronous)
-    {
-        cmd.pctl = new (MALLOC_LOCAL) aeCommandControl();
-        cmd.pctl->mutexcv.lock();
-    }
 
     auto size = safe_write(eventLoop->fdCmdWrite, &cmd, sizeof(cmd));
     if (size != sizeof(cmd))
@@ -254,16 +220,6 @@ int aeCreateRemoteFileEvent(aeEventLoop *eventLoop, int fd, int mask,
         serverAssert(errno == EAGAIN);
         ret = AE_ERR;
     }
-
-    if (fSynchronous)
-    {
-        {
-            std::unique_lock<std::mutex> ulock(cmd.pctl->mutexcv, std::adopt_lock);
-            cmd.pctl->cv.wait(ulock);
-            ret = cmd.pctl->rval;
-        }
-        delete cmd.pctl;
-    }
     return ret;
 }
 
@@ -297,7 +253,6 @@ int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fLock,
     aeCommand cmd = {};
     cmd.op = AE_ASYNC_OP::PostCppFunction;
     cmd.pfn = new (MALLOC_LOCAL) std::function<void()>(fn);
-    cmd.pctl = nullptr;
     cmd.fLock = fLock;
 
     auto size = write(eventLoop->fdCmdWrite, &cmd, sizeof(cmd));
diff --git a/src/module.cpp b/src/module.cpp
index f128d10e3..0430124de 100644
--- a/src/module.cpp
+++ b/src/module.cpp
@@ -5045,7 +5045,7 @@ void RM_FreeThreadSafeContext(RedisModuleCtx *ctx) {
     zfree(ctx);
 }
 
-__thread bool g_fModuleThread = false;
+thread_local bool g_fModuleThread = false;
 /* Acquire the server lock before executing a thread safe API call.
  * This is not needed for `RedisModule_Reply*` calls when there is
 * a blocked client connected to the thread safe context. */
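
With the control block gone, aePostFunction is strictly fire-and-forget: the
std::function is heap-copied, written over the event loop's command pipe, and executed
later on that loop's thread (under the global lock unless fLock is false). Callers can
no longer wait on a result, so anything the function needs must be owned by the
closure. A hedged usage sketch against the signature above (doWork is a hypothetical
callee, not a KeyDB function):

    void postExample(aeEventLoop *elTarget, int payload) {
        // Capture by value: this stack frame is gone by the time the lambda runs.
        aePostFunction(elTarget, [payload]{
            doWork(payload);   // runs later, on elTarget's thread
        });
        // Only queueing is confirmed (AE_OK/AE_ERR); there is no result to read back.
    }
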
From 49132539dd837e7292c3514bd8c1bcdf00312578 Mon Sep 17 00:00:00 2001
From: VivekSainiEQ
Date: Mon, 11 Jan 2021 19:17:28 +0000
Subject: [PATCH 13/40] Updated header file to remove fSynchronous flag

Former-commit-id: e2552ff8a92ea5adf7cea070b48afc573003254d
---
 src/ae.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ae.h b/src/ae.h
index 20ef22a5e..ab7127f8b 100644
--- a/src/ae.h
+++ b/src/ae.h
@@ -144,7 +144,7 @@ int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
         aeFileProc *proc, void *clientData);
 
 int aeCreateRemoteFileEvent(aeEventLoop *eventLoop, int fd, int mask,
-        aeFileProc *proc, void *clientData, int fSynchronous);
+        aeFileProc *proc, void *clientData);
 
 void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask);
 void aeDeleteFileEventAsync(aeEventLoop *eventLoop, int fd, int mask);

From 84dc6401c741eb25e549b4988971de7dff240c0e Mon Sep 17 00:00:00 2001
From: VivekSainiEQ
Date: Tue, 12 Jan 2021 23:22:36 +0000
Subject: [PATCH 14/40] Now post entire timer installation process as one
 function to make it atomic with respect to global locks

Former-commit-id: 53936661c88bd7eac88308afc75c510134a8e044
---
 src/module.cpp | 44 +++++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/src/module.cpp b/src/module.cpp
index 0430124de..a86241c43 100644
--- a/src/module.cpp
+++ b/src/module.cpp
@@ -5668,30 +5668,32 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod
         }
     }
 
-    /* We need to install the main event loop timer if it's not already
-     * installed, or we may need to refresh its period if we just installed
-     * a timer that will expire sooner than any other else (i.e. the timer
-     * we just installed is the first timer in the Timers rax). */
-    if (aeTimer != -1) {
-        raxIterator ri;
-        raxStart(&ri,Timers);
-        raxSeek(&ri,"^",NULL,0);
-        raxNext(&ri);
-        if (memcmp(ri.key,&key,sizeof(key)) == 0) {
-            /* This is the first key, we need to re-install the timer according
-             * to the just added event. */
-            aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [period]{
+    aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [period, key]{
+        /* We need to install the main event loop timer if it's not already
+         * installed, or we may need to refresh its period if we just installed
+         * a timer that will expire sooner than any other else (i.e. the timer
+         * we just installed is the first timer in the Timers rax). */
+        if (aeTimer != -1) {
+            raxIterator ri;
+            raxStart(&ri,Timers);
+            raxSeek(&ri,"^",NULL,0);
+            raxNext(&ri);
+            if (memcmp(ri.key,&key,sizeof(key)) == 0) {
+                /* This is the first key, we need to re-install the timer according
+                 * to the just added event. */
                 aeDeleteTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimer);
-                aeTimer = aeCreateTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,period,moduleTimerHandler,NULL,NULL);
-            });
+                aeTimer = -1;
+            }
+            raxStop(&ri);
         }
-        raxStop(&ri);
-    } else {
-        /* If we have no main timer because this is the first module timer we have, install one. */
-        aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [period]{
+
+        /* If we have no main timer (the old one was invalidated, or this is the
+         * first module timer we have), install one.
*/
+        if (aeTimer == -1) {
+            aeTimer = aeCreateTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,period,moduleTimerHandler,NULL,NULL);
+        }
+    });
+
     return key;
 }

From cf122aa54d67e83436e341e353db854a050cd0c8 Mon Sep 17 00:00:00 2001
From: VivekSainiEQ
Date: Thu, 14 Jan 2021 01:14:09 +0000
Subject: [PATCH 15/40] Changed RM_CreateTimer to only call aePostFunction if
 an existing aePostFunction isn't in flight

Former-commit-id: 9954f5b4a48286d07fb876fd9579801365b6c237
---
 src/module.cpp | 56 ++++++++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/src/module.cpp b/src/module.cpp
index a86241c43..bc1e9f9e6 100644
--- a/src/module.cpp
+++ b/src/module.cpp
@@ -5579,6 +5579,8 @@ void RM_SetClusterFlags(RedisModuleCtx *ctx, uint64_t flags) {
 
 static rax *Timers;     /* The radix tree of all the timers sorted by expire. */
 long long aeTimer = -1; /* Main event loop (ae.c) timer identifier. */
+bool aeTimerSet = false; /* Checks whether the main event loop timer is set
+                            or if an aePostFunction is queued up that will set it */
 
 typedef void (*RedisModuleTimerProc)(RedisModuleCtx *ctx, void *data);
 
@@ -5668,32 +5670,38 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod
             expiretime++;
         }
     }
-
-    aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [period, key]{
-        /* We need to install the main event loop timer if it's not already
-         * installed, or we may need to refresh its period if we just installed
-         * a timer that will expire sooner than any other else (i.e. the timer
-         * we just installed is the first timer in the Timers rax). */
-        if (aeTimer != -1) {
-            raxIterator ri;
-            raxStart(&ri,Timers);
-            raxSeek(&ri,"^",NULL,0);
-            raxNext(&ri);
-            if (memcmp(ri.key,&key,sizeof(key)) == 0) {
-                /* This is the first key, we need to re-install the timer according
-                 * to the just added event. */
-                aeDeleteTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimer);
-                aeTimer = -1;
-            }
-            raxStop(&ri);
-        }
+    /* We need to install the main event loop timer if it's not already
+     * installed, or we may need to refresh its period if we just installed
+     * a timer that will expire sooner than any other else (i.e. the timer
+     * we just installed is the first timer in the Timers rax). */
+    bool isFirstExpiry = false;
+    if (raxSize(Timers) > 0){
+        raxIterator ri;
+        raxStart(&ri, Timers);
+        raxSeek(&ri,"^",NULL,0);
+        raxNext(&ri);
+        if (memcmp(ri.key,&key,sizeof(key)) == 0)
+            /* This is the first key, we need to re-install the timer according
+             * to the just added event. */
+            isFirstExpiry = true;
+        raxStop(&ri);
+    }
 
+    /* Now that either we know that we either need to refresh the period of the
+     * recently installed timer, or that there is no timer to begin with, we must post
+     * a function call to install the main event timer */
+    if (isFirstExpiry || !aeTimerSet){
+        aeTimerSet = true;
+        aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [period, isFirstExpiry]{
+            /* If we deemed that this timer required a reinstall, delete it before proceeding
+             * to the install */
+            if (isFirstExpiry)
+                aeDeleteTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimer);
-        /* If we have no main timer (the old one was invalidated, or this is the
-         * first module timer we have), install one. */
+            /* If we have no main timer (the old one was invalidated, or this is the
+             * first module timer we have), install one. */
+            aeTimer = aeCreateTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,period,moduleTimerHandler,NULL,NULL);
+        });
+    }
 
     return key;
 }
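
Note the remaining gap in PATCH 15: because the post is now asynchronous, the
check-then-post is itself racy, and later callers cannot refresh the period a
queued install has already captured. The next patch converges on a coalescing
shape: keep at most one posted function in flight and let latecomers only update
its shared inputs. A hedged sketch of that shape (s_pending, s_period and
installMainTimer are simplified stand-ins, assuming all callers hold the same
global lock, as RM_CreateTimer does):

    static bool s_pending = false;   // is an install already queued?
    static mstime_t s_period;        // input consumed by the queued install

    void requestTimerInstall(mstime_t period) {
        s_period = period;           // latecomers just refresh the input
        if (s_pending) return;       // one post in flight is enough
        s_pending = true;
        aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, []{
            installMainTimer(s_period);   // hypothetical helper
            s_pending = false;
        });
    }
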
From b0f94bd4b550b67e594ca8f2ced836f130f5bc05 Mon Sep 17 00:00:00 2001
From: VivekSainiEQ
Date: Thu, 14 Jan 2021 21:59:56 +0000
Subject: [PATCH 16/40] Modified RM_CreateTimer to prevent more than one
 function being posted at a time

Former-commit-id: f66e5a9c3d6925c3c6d383fbc43fbfbd56721dad
---
 src/module.cpp | 44 ++++++++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/src/module.cpp b/src/module.cpp
index bc1e9f9e6..dfabebeae 100644
--- a/src/module.cpp
+++ b/src/module.cpp
@@ -5579,8 +5579,10 @@ void RM_SetClusterFlags(RedisModuleCtx *ctx, uint64_t flags) {
 
 static rax *Timers;     /* The radix tree of all the timers sorted by expire. */
 long long aeTimer = -1; /* Main event loop (ae.c) timer identifier. */
-bool aeTimerSet = false; /* Checks whether the main event loop timer is set
-                            or if an aePostFunction is queued up that will set it */
+static bool aeTimerSet = false; /* Checks whether the main event loop timer is set */
+static mstime_t aeTimerPeriod;  /* The period of the aeTimer */
+static bool aeTimerPending = false; /* Keeps track of if there is a aePostFunction in flight
+                                     * that would modify the current main event loop timer */
 
 typedef void (*RedisModuleTimerProc)(RedisModuleCtx *ctx, void *data);
 
@@ -5671,13 +5673,12 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod
             expiretime++;
         }
     }
-
     /* We need to install the main event loop timer if it's not already
      * installed, or we may need to refresh its period if we just installed
      * a timer that will expire sooner than any other else (i.e. the timer
      * we just installed is the first timer in the Timers rax). */
     bool isFirstExpiry = false;
-    if (raxSize(Timers) > 0){
+    if (aeTimerSet){
         raxIterator ri;
         raxStart(&ri, Timers);
         raxSeek(&ri,"^",NULL,0);
         raxNext(&ri);
         if (memcmp(ri.key,&key,sizeof(key)) == 0)
             /* This is the first key, we need to re-install the timer according
              * to the just added event. */
             isFirstExpiry = true;
         raxStop(&ri);
     }
 
     /* Now that either we know that we either need to refresh the period of the
-     * recently installed timer, or that there is no timer to begin with, we must post
-     * a function call to install the main event timer */
+     * recently installed timer, or that there is no timer to begin with, we must post
+     * a function call to install the main event timer. */
     if (isFirstExpiry || !aeTimerSet){
+        /* We set the period for the posted function in a global variable
+         * That is so if a function has been posted but not executed, and another
+         * function with a different period were to be posted, we can just update
+         * the period instead of posting a new function. */
+        aeTimerPeriod = period;
         aeTimerSet = true;
-        aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [period, isFirstExpiry]{
-            /* If we deemed that this timer required a reinstall, delete it before proceeding
-             * to the install */
-            if (isFirstExpiry)
+        if (!aeTimerPending) {
+            /* We should only have one aePostFunction in flight at a time, aeTimerPending
+             * keeps track of that.
*/ + aeTimerPending = true; + aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [isFirstExpiry]{ + /* If we deemed that this timer required a reinstall, delete it before proceeding + * to the install */ + if (isFirstExpiry) + aeDeleteTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimer); + /* If we have no main timer (the old one was invalidated, or this is the + * first module timer we have), install one. */ + aeTimer = aeCreateTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimerPeriod,moduleTimerHandler,NULL,NULL); + aeTimerPending = false; + }); + } + } return key; } From cb5119b756523dbad83ae98e3444e796d124180a Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 31 Jan 2021 06:31:51 +0000 Subject: [PATCH 17/40] Simplify the handling of CreateTimer Former-commit-id: 7b4e25008a352bd425582a3e60b26894826af626 --- src/module.cpp | 76 ++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 42 deletions(-) diff --git a/src/module.cpp b/src/module.cpp index dfabebeae..ed57e7ef4 100644 --- a/src/module.cpp +++ b/src/module.cpp @@ -5579,10 +5579,6 @@ void RM_SetClusterFlags(RedisModuleCtx *ctx, uint64_t flags) { static rax *Timers; /* The radix tree of all the timers sorted by expire. */ long long aeTimer = -1; /* Main event loop (ae.c) timer identifier. */ -static bool aeTimerSet = false; /* Checks whether the main event loop timer is set */ -static mstime_t aeTimerPeriod; /* The period of the aeTimer */ -static bool aeTimerPending = false; /* Keeps track of if there is a aePostFunction in flight - * that would modify the current main event loop timer */ typedef void (*RedisModuleTimerProc)(RedisModuleCtx *ctx, void *data); @@ -5654,6 +5650,9 @@ int moduleTimerHandler(struct aeEventLoop *eventLoop, long long id, void *client * (If the time it takes to execute 'callback' is negligible the two * statements above mean the same) */ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisModuleTimerProc callback, void *data) { + static uint64_t pending_key; + static mstime_t pending_period = -1; + RedisModuleTimer *timer = (RedisModuleTimer*)zmalloc(sizeof(*timer), MALLOC_LOCAL); timer->module = ctx->module; timer->callback = callback; @@ -5671,49 +5670,42 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod expiretime++; } } + + bool fNeedPost = (pending_period < 0); // If pending_period is already set, then a PostFunction is in flight and we don't need to set a new one + if (pending_period < 0 || period < pending_period) { + pending_period = period; + pending_key = key; + } + /* We need to install the main event loop timer if it's not already * installed, or we may need to refresh its period if we just installed * a timer that will expire sooner than any other else (i.e. the timer * we just installed is the first timer in the Timers rax). */ - bool isFirstExpiry = false; - if (aeTimerSet){ - raxIterator ri; - raxStart(&ri, Timers); - raxSeek(&ri,"^",NULL,0); - raxNext(&ri); - if (memcmp(ri.key,&key,sizeof(key)) == 0) - /* This is the first key, we need to re-install the timer according - * to the just added event. */ - isFirstExpiry = true; - raxStop(&ri); - } - - /* Now that either we know that we either need to refresh the period of the - * recently installed timer, or that there is no timer to begin with, we must post - * a function call to install the main event timer. 
*/ - if (isFirstExpiry || !aeTimerSet){ - /* We set the period for the posted function in a global variable - * That is so if a function has been posted but not executed, and another - * function with a different period were to be posted, we can just update - * the period instead of posting a new function. */ - aeTimerPeriod = period; - aeTimerSet = true; - if (!aeTimerPending) { - /* We should only have one aePostFunction in flight at a time, aeTimerPending - * keeps track of that. */ - aeTimerPending = true; - aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [isFirstExpiry]{ - /* If we deemed that this timer required a reinstall, delete it before proceeding - * to the install */ - if (isFirstExpiry) + if (fNeedPost) { + aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, []{ + if (aeTimer != -1) { + raxIterator ri; + raxStart(&ri,Timers); + raxSeek(&ri,"^",NULL,0); + raxNext(&ri); + if (memcmp(ri.key,&pending_key,sizeof(key)) == 0) { + /* This is the first key, we need to re-install the timer according + * to the just added event. */ aeDeleteTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimer); - /* If we have no main timer (the old one was invalidated, or this is the - * first module timer we have), install one. */ - aeTimer = aeCreateTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimerPeriod,moduleTimerHandler,NULL,NULL); - aeTimerPending = false; - }); - } - } + aeTimer = -1; + } + raxStop(&ri); + } + + /* If we have no main timer (the old one was invalidated, or this is the + * first module timer we have), install one. */ + if (aeTimer == -1) { + aeTimer = aeCreateTimeEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,pending_period,moduleTimerHandler,NULL,NULL); + } + + pending_period = -1; + }); + } return key; } From 651abfdca0e3d80965eb5578a7cd23d61d79e95e Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 1 Feb 2021 22:21:49 +0000 Subject: [PATCH 18/40] Module test reliability Former-commit-id: cc827e157a8cdd0224eaa76d6a70e0cccd95f544 --- tests/unit/moduleapi/hooks.tcl | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/moduleapi/hooks.tcl b/tests/unit/moduleapi/hooks.tcl index c4af59bd2..a9387e757 100644 --- a/tests/unit/moduleapi/hooks.tcl +++ b/tests/unit/moduleapi/hooks.tcl @@ -132,6 +132,7 @@ tags "modules" { } $replica replicaof no one + after 300 test {Test role-master hook} { assert_equal [r hooks.event_count role-replica] 1 From aa47e643b0df8db58e809388d9de26f6e79aacb6 Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 7 Feb 2021 23:38:09 +0000 Subject: [PATCH 19/40] Fix memory leak in mvccRestore Former-commit-id: 165333b0fc648c79e66f04d9c8c4a1d0059fe57a --- src/Makefile | 4 ++-- src/cluster.cpp | 13 ++++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/Makefile b/src/Makefile index 3af99b3b4..9e4070d4a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -48,8 +48,8 @@ endif USEASM?=true ifneq ($(strip $(SANITIZE)),) - CFLAGS+= -fsanitize=$(SANITIZE) -DSANITIZE - CXXFLAGS+= -fsanitize=$(SANITIZE) -DSANITIZE + CFLAGS+= -fsanitize=$(SANITIZE) -DSANITIZE -fno-omit-frame-pointer + CXXFLAGS+= -fsanitize=$(SANITIZE) -DSANITIZE -fno-omit-frame-pointer LDFLAGS+= -fsanitize=$(SANITIZE) MALLOC=libc USEASM=false diff --git a/src/cluster.cpp b/src/cluster.cpp index e60807180..8556f1d28 100644 --- a/src/cluster.cpp +++ b/src/cluster.cpp @@ -5066,12 +5066,15 @@ void mvccrestoreCommand(client *c) { setMvccTstamp(obj, mvcc); /* Create the key and set the TTL if any */ - dbMerge(c->db,key,obj,true); 
- if (expire >= 0) { - setExpire(c,c->db,key,nullptr,expire); + if (dbMerge(c->db,key,obj,true)) { + if (expire >= 0) { + setExpire(c,c->db,key,nullptr,expire); + } + signalModifiedKey(c,c->db,key); + notifyKeyspaceEvent(NOTIFY_GENERIC,"restore",key,c->db->id); + } else { + decrRefCount(obj); } - signalModifiedKey(c,c->db,key); - notifyKeyspaceEvent(NOTIFY_GENERIC,"restore",key,c->db->id); addReply(c,shared.ok); g_pserver->dirty++; } From 1c0b603def4b18b286ca33233b0816416cd460ac Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 20 Jan 2021 20:23:56 +0000 Subject: [PATCH 20/40] Initial implementation Former-commit-id: 958f2c00c8efc15dc91fdeec2ff2e2ae2016c124 --- src/dict.cpp | 107 +++++++++++++++++++++++++++++++++++++++++++++++-- src/dict.h | 31 ++++++++++++++ src/server.cpp | 64 ++++++++++++++++++++--------- 3 files changed, 180 insertions(+), 22 deletions(-) diff --git a/src/dict.cpp b/src/dict.cpp index 2c16af119..e0b193312 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -127,6 +127,7 @@ int _dictInit(dict *d, dictType *type, d->privdata = privDataPtr; d->rehashidx = -1; d->iterators = 0; + d->asyncdata = nullptr; return DICT_OK; } @@ -188,13 +189,13 @@ int dictExpand(dict *d, unsigned long size) int dictRehash(dict *d, int n) { int empty_visits = n*10; /* Max number of empty buckets to visit. */ if (!dictIsRehashing(d)) return 0; + if (d->asyncdata) return 0; while(n-- && d->ht[0].used != 0) { dictEntry *de, *nextde; /* Note that rehashidx can't overflow as we are sure there are more * elements because ht[0].used != 0 */ - assert(d->ht[0].size > (unsigned long)d->rehashidx); while(d->ht[0].table[d->rehashidx] == NULL) { d->rehashidx++; if (--empty_visits == 0) return 1; @@ -230,6 +231,95 @@ int dictRehash(dict *d, int n) { return 1; } +dictAsyncRehashCtl *dictRehashAsyncStart(dict *d) { + if (!dictIsRehashing(d)) return 0; + if (d->asyncdata != nullptr) { + /* An async rehash is already in progress, in the future we could give out an additional block */ + return nullptr; + } + d->asyncdata = new dictAsyncRehashCtl(d); + + int empty_visits = dictAsyncRehashCtl::c_targetQueueSize * 10; + + while (d->asyncdata->queue.size() < dictAsyncRehashCtl::c_targetQueueSize && (d->ht[0].used - d->asyncdata->queue.size()) != 0) { + dictEntry *de; + + /* Note that rehashidx can't overflow as we are sure there are more + * elements because ht[0].used != 0 */ + while(d->ht[0].table[d->rehashidx] == NULL) { + d->rehashidx++; + if (--empty_visits == 0) goto LDone; + } + + de = d->ht[0].table[d->rehashidx]; + // We have to queue every node in the bucket, even if we go over our target size + while (de) { + d->asyncdata->queue.emplace_back(de); + de = de->next; + } + d->rehashidx++; + } + +LDone: + if (d->asyncdata->queue.empty()) { + // We didn't find any work to do (can happen if there is a large gap in the hash table) + delete d->asyncdata; + d->asyncdata = nullptr; + } + return d->asyncdata; +} + +void dictRehashAsync(dictAsyncRehashCtl *ctl) { + for (auto &wi : ctl->queue) { + wi.hash = dictHashKey(ctl->dict, dictGetKey(wi.de)); + } +} + +void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) { + dict *d = ctl->dict; + + // Was the dictionary free'd while we were in flight? + if (ctl->release) { + dictRelease(d); + delete ctl; + return; + } + + for (auto &wi : ctl->queue) { + // We need to remove it from the source hash table, and store it in the dest. + // Note that it may have been deleted in the meantime and therefore not exist. 
// In this case it will be in the garbage collection list
+
+        dictEntry **pdePrev = &d->ht[0].table[wi.hash & d->ht[0].sizemask];
+        while (*pdePrev != nullptr && *pdePrev != wi.de) {
+            pdePrev = &((*pdePrev)->next);
+        }
+        if (*pdePrev != nullptr) {
+            assert(*pdePrev == wi.de);
+            *pdePrev = wi.de->next;
+            // Now link it to the dest hash table
+            wi.de->next = d->ht[1].table[wi.hash & d->ht[1].sizemask];
+            d->ht[1].table[wi.hash & d->ht[1].sizemask] = wi.de;
+            d->ht[0].used--;
+            d->ht[1].used++;
+        } else {
+            // In this scenario the de should be in the free list, find and delete it
+            bool found = false;
+            for (dictEntry **pdeGCPrev = &ctl->deGCList; *pdeGCPrev != nullptr && !found; pdeGCPrev = &((*pdeGCPrev)->next)) {
+                if (*pdeGCPrev == wi.de) {
+                    *pdeGCPrev = wi.de->next;
+                    dictFreeUnlinkedEntry(d, wi.de);
+                    found = true;
+                }
+            }
+            // Note: We may not find it if the entry was just unlinked but reused elsewhere
+        }
+    }
+
+    delete ctl;
+    d->asyncdata = nullptr;
+}
+
 long long timeInMilliseconds(void) {
     struct timeval tv;
 
@@ -385,9 +475,14 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) {
                 else
                     d->ht[table].table[idx] = he->next;
                 if (!nofree) {
-                    dictFreeKey(d, he);
-                    dictFreeVal(d, he);
-                    zfree(he);
+                    if (d->asyncdata != nullptr) {
+                        he->next = d->asyncdata->deGCList;
+                        d->asyncdata->deGCList = he->next;
+                    } else {
+                        dictFreeKey(d, he);
+                        dictFreeVal(d, he);
+                        zfree(he);
+                    }
                 }
                 d->ht[table].used--;
                 return he;
@@ -470,6 +565,10 @@ int _dictClear(dict *d, dictht *ht, void(callback)(void *)) {
 
 /* Clear & Release the hash table */
 void dictRelease(dict *d)
 {
+    if (d->asyncdata) {
+        d->asyncdata->release = true;
+        return;
+    }
     _dictClear(d,&d->ht[0],NULL);
     _dictClear(d,&d->ht[1],NULL);
     zfree(d);
diff --git a/src/dict.h b/src/dict.h
index 03c39c7ff..42d097f9b 100644
--- a/src/dict.h
+++ b/src/dict.h
@@ -36,6 +36,7 @@
 #include <stdint.h>
 
 #ifdef __cplusplus
+#include <vector>
 extern "C" {
 #endif
 
@@ -81,12 +82,37 @@ typedef struct dictht {
     unsigned long used;
 } dictht;
 
+#ifdef __cplusplus
+struct dictAsyncRehashCtl {
+    struct workItem {
+        dictEntry *de;
+        uint64_t hash;
+        workItem(dictEntry *de) {
+            this->de = de;
+        }
+    };
+
+    static const int c_targetQueueSize = 100;
+    dictEntry *deGCList = nullptr;
+    struct dict *dict = nullptr;
+    std::vector<workItem> queue;
+    bool release = false;
+
+    dictAsyncRehashCtl(struct dict *d) : dict(d) {
+        queue.reserve(c_targetQueueSize);
+    }
+};
+#else
+struct dictAsyncRehashCtl;
+#endif
+
 typedef struct dict {
     dictType *type;
     void *privdata;
     dictht ht[2];
     long rehashidx; /* rehashing not in progress if rehashidx == -1 */
     unsigned long iterators; /* number of iterators currently running */
+    dictAsyncRehashCtl *asyncdata;
 } dict;
 
 /* If safe is set to 1 this is a safe iterator, that means, you can call
@@ -190,6 +216,11 @@ unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, dictScanB
 uint64_t dictGetHash(dict *d, const void *key);
 dictEntry **dictFindEntryRefByPtrAndHash(dict *d, const void *oldptr, uint64_t hash);
 
+/* Async Rehash Functions */
+dictAsyncRehashCtl *dictRehashAsyncStart(dict *d);
+void dictRehashAsync(dictAsyncRehashCtl *ctl);
+void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl);
+
 /* Hash table types */
 extern dictType dictTypeHeapStringCopyKey;
 extern dictType dictTypeHeapStrings;
diff --git a/src/server.cpp b/src/server.cpp
index 848b28bcf..fdb8885b1 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -1775,19 +1775,23 @@ bool expireOwnKeys()
 /* This function handles 'background' operations we are required to do
  * incrementally in
Redis databases, such as active key expiring, resizing, * rehashing. */ -void databasesCron(void) { - /* Expire keys by random sampling. Not required for slaves - * as master will synthesize DELs for us. */ - if (g_pserver->active_expire_enabled) { - if (expireOwnKeys()) { - activeExpireCycle(ACTIVE_EXPIRE_CYCLE_SLOW); - } else { - expireSlaveKeys(); - } - } +dictAsyncRehashCtl* databasesCron(bool fMainThread) { + dictAsyncRehashCtl *ctl = nullptr; - /* Defrag keys gradually. */ - activeDefragCycle(); + if (fMainThread) { + /* Expire keys by random sampling. Not required for slaves + * as master will synthesize DELs for us. */ + if (g_pserver->active_expire_enabled) { + if (expireOwnKeys()) { + activeExpireCycle(ACTIVE_EXPIRE_CYCLE_SLOW); + } else { + expireSlaveKeys(); + } + } + + /* Defrag keys gradually. */ + activeDefragCycle(); + } /* Perform hash tables rehashing if needed, but only if there are no * other processes saving the DB on disk. Otherwise rehashing is bad @@ -1804,16 +1808,21 @@ void databasesCron(void) { /* Don't test more DBs than we have. */ if (dbs_per_call > cserver.dbnum) dbs_per_call = cserver.dbnum; - /* Resize */ - for (j = 0; j < dbs_per_call; j++) { - tryResizeHashTables(resize_db % cserver.dbnum); - resize_db++; + if (fMainThread) { + /* Resize */ + for (j = 0; j < dbs_per_call; j++) { + tryResizeHashTables(resize_db % cserver.dbnum); + resize_db++; + } } /* Rehash */ if (g_pserver->activerehashing) { for (j = 0; j < dbs_per_call; j++) { - int work_done = incrementallyRehash(rehash_db); + ctl = dictRehashAsyncStart(g_pserver->db[rehash_db].pdict); + if (ctl) + break; + int work_done = fMainThread && incrementallyRehash(rehash_db); if (work_done) { /* If the function did some work, stop here, we'll do * more at the next cron loop. */ @@ -1826,6 +1835,8 @@ void databasesCron(void) { } } } + + return ctl; } /* We take a cached value of the unix time in the global state because with @@ -2065,7 +2076,14 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { clientsCron(IDX_EVENT_LOOP_MAIN); /* Handle background operations on Redis databases. */ - databasesCron(); + auto asyncRehash = databasesCron(true /* fMainThread */); + + if (asyncRehash) { + aeReleaseLock(); + dictRehashAsync(asyncRehash); + aeAcquireLock(); + dictCompleteRehashAsync(asyncRehash); + } /* Start a scheduled AOF rewrite if this was requested by the user while * a BGSAVE was in progress. */ @@ -2215,6 +2233,16 @@ int serverCronLite(struct aeEventLoop *eventLoop, long long id, void *clientData processUnblockedClients(iel); } + /* Handle background operations on Redis databases. 
*/ + auto asyncRehash = databasesCron(false /* fMainThread */); + + if (asyncRehash) { + aeReleaseLock(); + dictRehashAsync(asyncRehash); + aeAcquireLock(); + dictCompleteRehashAsync(asyncRehash); + } + /* Unpause clients if enough time has elapsed */ unpauseClientsIfNecessary(); From 6379ae6c7f861e884bb21bcf01ae4d1b597d72ab Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 21 Jan 2021 17:18:54 +0000 Subject: [PATCH 21/40] Fix races in free Former-commit-id: 3881d5c1f22f022855c0abcf7b0c2070e204c0a3 --- src/dict.cpp | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/src/dict.cpp b/src/dict.cpp index e0b193312..966abc255 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -277,6 +277,7 @@ void dictRehashAsync(dictAsyncRehashCtl *ctl) { void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) { dict *d = ctl->dict; + d->asyncdata = nullptr; // dictFreeUnlinkedEntry won't actually free if we don't NULL this // Was the dictionary free'd while we were in flight? if (ctl->release) { @@ -315,9 +316,9 @@ void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) { // Note: We may not find it if the entry was just unlinked but reused elsewhere } } + assert(ctl->deGCList == nullptr); delete ctl; - d->asyncdata = nullptr; } long long timeInMilliseconds(void) { @@ -475,7 +476,7 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { else d->ht[table].table[idx] = he->next; if (!nofree) { - if (d->asyncdata != nullptr) { + if (table == 0 && d->asyncdata != nullptr) { he->next = d->asyncdata->deGCList; d->asyncdata->deGCList = he->next; } else { @@ -530,9 +531,14 @@ dictEntry *dictUnlink(dict *ht, const void *key) { * to dictUnlink(). It's safe to call this function with 'he' = NULL. */ void dictFreeUnlinkedEntry(dict *d, dictEntry *he) { if (he == NULL) return; - dictFreeKey(d, he); - dictFreeVal(d, he); - zfree(he); + if (d->asyncdata) { + he->next = d->asyncdata->deGCList; + d->asyncdata->deGCList = he; + } else { + dictFreeKey(d, he); + dictFreeVal(d, he); + zfree(he); + } } /* Destroy an entire dictionary */ @@ -548,9 +554,14 @@ int _dictClear(dict *d, dictht *ht, void(callback)(void *)) { if ((he = ht->table[i]) == NULL) continue; while(he) { nextHe = he->next; - dictFreeKey(d, he); - dictFreeVal(d, he); - zfree(he); + if (i == 0 && d->asyncdata) { + he->next = d->asyncdata->deGCList; + d->asyncdata->deGCList = he; + } else { + dictFreeKey(d, he); + dictFreeVal(d, he); + zfree(he); + } ht->used--; he = nextHe; } From 071ecb801a728a37c105e9fd3ba62681c704f525 Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 21 Jan 2021 20:27:50 +0000 Subject: [PATCH 22/40] Allow multiple threads to rehash simultaneously Former-commit-id: 5a2cc90786dfd1bfd341dbf5713bcde01f0cfff3 --- src/dict.cpp | 53 +++++++++++++++++++++++++++++++--------------------- src/dict.h | 5 +++-- 2 files changed, 35 insertions(+), 23 deletions(-) diff --git a/src/dict.cpp b/src/dict.cpp index 966abc255..37e6998c4 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -233,11 +233,7 @@ int dictRehash(dict *d, int n) { dictAsyncRehashCtl *dictRehashAsyncStart(dict *d) { if (!dictIsRehashing(d)) return 0; - if (d->asyncdata != nullptr) { - /* An async rehash is already in progress, in the future we could give out an additional block */ - return nullptr; - } - d->asyncdata = new dictAsyncRehashCtl(d); + d->asyncdata = new dictAsyncRehashCtl(d, d->asyncdata); int empty_visits = dictAsyncRehashCtl::c_targetQueueSize * 10; @@ -263,8 +259,10 @@ dictAsyncRehashCtl *dictRehashAsyncStart(dict *d) { 
LDone:
     if (d->asyncdata->queue.empty()) {
         // We didn't find any work to do (can happen if there is a large gap in the hash table)
-        delete d->asyncdata;
-        d->asyncdata = nullptr;
+        auto asyncT = d->asyncdata;
+        d->asyncdata = d->asyncdata->next;
+        delete asyncT;
+        return nullptr;
     }
     return d->asyncdata;
 }
@@ -277,7 +275,15 @@ void dictRehashAsync(dictAsyncRehashCtl *ctl) {
 
 void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) {
     dict *d = ctl->dict;
-    d->asyncdata = nullptr; // dictFreeUnlinkedEntry won't actually free if we don't NULL this
+    // Unlink ourselves
+    dictAsyncRehashCtl **pprev = &d->asyncdata;
+    while (*pprev != nullptr) {
+        if (*pprev == ctl) {
+            *pprev = ctl->next;
+            break;
+        }
+        pprev = &((*pprev)->next);
+    }
 
     // Was the dictionary free'd while we were in flight?
     if (ctl->release) {
@@ -295,7 +301,7 @@ void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) {
         while (*pdePrev != nullptr && *pdePrev != wi.de) {
             pdePrev = &((*pdePrev)->next);
         }
-        if (*pdePrev != nullptr) {
+        if (*pdePrev != nullptr) { // The element may be NULL if it's in the GC list
             assert(*pdePrev == wi.de);
             *pdePrev = wi.de->next;
             // Now link it to the dest hash table
@@ -303,20 +309,25 @@ void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) {
             d->ht[1].table[wi.hash & d->ht[1].sizemask] = wi.de;
             d->ht[0].used--;
             d->ht[1].used++;
-        } else {
-            // In this scenario the de should be in the free list, find and delete it
-            bool found = false;
-            for (dictEntry **pdeGCPrev = &ctl->deGCList; *pdeGCPrev != nullptr && !found; pdeGCPrev = &((*pdeGCPrev)->next)) {
-                if (*pdeGCPrev == wi.de) {
-                    *pdeGCPrev = wi.de->next;
-                    dictFreeUnlinkedEntry(d, wi.de);
-                    found = true;
-                }
-            }
-            // Note: We may not find it if the entry was just unlinked but reused elsewhere
         }
     }
-    assert(ctl->deGCList == nullptr);
+
+    // Now free anything in the GC list
+    while (ctl->deGCList != nullptr) {
+        auto next = ctl->deGCList->next;
+        dictFreeKey(d, ctl->deGCList);
+        dictFreeVal(d, ctl->deGCList);
+        zfree(ctl->deGCList);
+        ctl->deGCList = next;
+    }
+
+    /* Check if we already rehashed the whole table... */
+    if (d->ht[0].used == 0 && d->asyncdata == nullptr) {
+        zfree(d->ht[0].table);
+        d->ht[0] = d->ht[1];
+        _dictReset(&d->ht[1]);
+        d->rehashidx = -1;
+    }
 
     delete ctl;
 }
diff --git a/src/dict.h b/src/dict.h
index 42d097f9b..b5cac21f9 100644
--- a/src/dict.h
+++ b/src/dict.h
@@ -92,13 +92,14 @@
         }
     };
 
-    static const int c_targetQueueSize = 100;
+    static const int c_targetQueueSize = 512;
     dictEntry *deGCList = nullptr;
     struct dict *dict = nullptr;
     std::vector<workItem> queue;
     bool release = false;
+    dictAsyncRehashCtl *next = nullptr;
 
-    dictAsyncRehashCtl(struct dict *d) : dict(d) {
+    dictAsyncRehashCtl(struct dict *d, dictAsyncRehashCtl *next) : dict(d), next(next) {
         queue.reserve(c_targetQueueSize);
     }
 };
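
Taken together, the three dict functions split one rehash step across threads:
dictRehashAsyncStart snapshots a batch of buckets under the global lock,
dictRehashAsync computes hashes with no lock held, and dictCompleteRehashAsync
relinks the entries (and frees anything unlinked mid-flight) back under the lock.
A hedged usage sketch mirroring how the earlier server.cpp hunks drive it from the
cron (rehashTick is a hypothetical wrapper, not a KeyDB function):

    void rehashTick(dict *d) {
        dictAsyncRehashCtl *ctl = dictRehashAsyncStart(d);  // lock held: queue ~512 entries
        if (ctl == nullptr) return;                         // nothing to do this tick
        aeReleaseLock();
        dictRehashAsync(ctl);          // lock-free: hash computation only
        aeAcquireLock();
        dictCompleteRehashAsync(ctl);  // lock held: splice into ht[1], free the GC list
    }
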
From 495dff1e8cb1388f745d120b92b4bca633a71b59 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Mon, 25 Jan 2021 16:29:47 +0000
Subject: [PATCH 23/40] Implement rehash during spinlock

Former-commit-id: f68a26381a35b27948046d46c2c7bcfbdc21143d
---
 src/ae.cpp           |  8 ++++-
 src/ae.h             |  1 +
 src/dict.cpp         | 67 +++++++++++++++++++++++++++-----------------
 src/dict.h           |  4 ++-
 src/fastlock.cpp     | 11 ++++--
 src/fastlock.h       | 12 +++++--
 src/fastlock_x64.asm | 52 +++++++++++++++++++++++++++++---
 src/server.cpp       | 81 +++++++++++++++++++++++++-------------------
 src/server.h         |  1 +
 9 files changed, 167 insertions(+), 70 deletions(-)

diff --git a/src/ae.cpp b/src/ae.cpp
index 89b36beaa..16a2ddb64 100644
--- a/src/ae.cpp
+++ b/src/ae.cpp
@@ -842,9 +842,15 @@ void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *aftersleep,
     eventLoop->aftersleepFlags = flags;
 }
 
+thread_local spin_worker tl_worker = nullptr;
+void setAeLockSetThreadSpinWorker(spin_worker worker)
+{
+    tl_worker = worker;
+}
+
 void aeAcquireLock()
 {
-    g_lock.lock();
+    g_lock.lock(tl_worker);
 }
 
 int aeTryAcquireLock(int fWeak)
diff --git a/src/ae.h b/src/ae.h
index ab7127f8b..3a240877d 100644
--- a/src/ae.h
+++ b/src/ae.h
@@ -164,6 +164,7 @@ aeEventLoop *aeGetCurrentEventLoop();
 int aeResizeSetSize(aeEventLoop *eventLoop, int setsize);
 void aeSetDontWait(aeEventLoop *eventLoop, int noWait);
 
+void setAeLockSetThreadSpinWorker(spin_worker worker);
 void aeAcquireLock();
 int aeTryAcquireLock(int fWeak);
 void aeReleaseLock();
diff --git a/src/dict.cpp b/src/dict.cpp
index 37e6998c4..e882fff2a 100644
--- a/src/dict.cpp
+++ b/src/dict.cpp
@@ -231,13 +231,14 @@ int dictRehash(dict *d, int n) {
     return 1;
 }
 
-dictAsyncRehashCtl *dictRehashAsyncStart(dict *d) {
+dictAsyncRehashCtl *dictRehashAsyncStart(dict *d, int buckets) {
     if (!dictIsRehashing(d)) return 0;
+
     d->asyncdata = new dictAsyncRehashCtl(d, d->asyncdata);
 
-    int empty_visits = dictAsyncRehashCtl::c_targetQueueSize * 10;
+    int empty_visits = buckets * 10;
 
-    while (d->asyncdata->queue.size() < dictAsyncRehashCtl::c_targetQueueSize && (d->ht[0].used - d->asyncdata->queue.size()) != 0) {
+    while (d->asyncdata->queue.size() < (size_t)buckets && d->rehashidx < d->ht[0].size) {
         dictEntry *de;
 
         /* Note that rehashidx can't overflow as we are sure there are more
          * elements because ht[0].used != 0 */
         while(d->ht[0].table[d->rehashidx] == NULL) {
             d->rehashidx++;
             if (--empty_visits == 0) goto LDone;
+            if (d->rehashidx >= d->ht[0].size) goto LDone;
         }
 
         de = d->ht[0].table[d->rehashidx];
         // We have to queue every node in the bucket, even if we go over our target size
-        while (de) {
+        while (de != nullptr) {
             d->asyncdata->queue.emplace_back(de);
             de = de->next;
         }
         d->rehashidx++;
     }
 
 void dictRehashAsync(dictAsyncRehashCtl *ctl) {
for (size_t idx = ctl->hashIdx; idx < ctl->queue.size(); ++idx) { + auto &wi = ctl->queue[idx]; wi.hash = dictHashKey(ctl->dict, dictGetKey(wi.de)); } + ctl->hashIdx = ctl->queue.size(); +} + +bool dictRehashSomeAsync(dictAsyncRehashCtl *ctl, size_t hashes) { + size_t max = std::min(ctl->hashIdx + hashes, ctl->queue.size()); + for (; ctl->hashIdx < max; ++ctl->hashIdx) { + auto &wi = ctl->queue[ctl->hashIdx]; + wi.hash = dictHashKey(ctl->dict, dictGetKey(wi.de)); + } + + return ctl->hashIdx < ctl->queue.size(); } void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) { @@ -287,28 +301,33 @@ void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) { // Was the dictionary free'd while we were in flight? if (ctl->release) { - dictRelease(d); + if (d->asyncdata != nullptr) + d->asyncdata->release = true; + else + dictRelease(d); delete ctl; return; } - for (auto &wi : ctl->queue) { - // We need to remove it from the source hash table, and store it in the dest. - // Note that it may have been deleted in the meantime and therefore not exist. - // In this case it will be in the garbage collection list - - dictEntry **pdePrev = &d->ht[0].table[wi.hash & d->ht[0].sizemask]; - while (*pdePrev != nullptr && *pdePrev != wi.de) { - pdePrev = &((*pdePrev)->next); - } - if (*pdePrev != nullptr) { // The element may be NULL if its in the GC list - assert(*pdePrev == wi.de); - *pdePrev = wi.de->next; - // Now link it to the dest hash table - wi.de->next = d->ht[1].table[wi.hash & d->ht[1].sizemask]; - d->ht[1].table[wi.hash & d->ht[1].sizemask] = wi.de; - d->ht[0].used--; - d->ht[1].used++; + if (d->ht[0].table != nullptr) { // can be null if we're cleared during the rehash + for (auto &wi : ctl->queue) { + // We need to remove it from the source hash table, and store it in the dest. + // Note that it may have been deleted in the meantime and therefore not exist. 
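A note on the pattern emerging here: patch 23's async rehash is a two-phase split. Hashing, which only reads keys, runs off-thread (or inside a spin loop) without the global lock via dictRehashSomeAsync; relinking buckets into the destination table happens later, under the lock, in dictCompleteRehashAsync. A minimal self-contained sketch of the first phase, with placeholder types standing in for dictEntry and the hash function:

    #include <algorithm>
    #include <vector>

    struct WorkItem { void *de; unsigned long long hash; };

    struct AsyncBatch {
        std::vector<WorkItem> queue;   // snapshot of entries to migrate
        size_t hashIdx = 0;            // first item not yet hashed

        // Phase 1 (no lock needed): compute up to n hashes; returns true
        // while work remains, mirroring dictRehashSomeAsync. Phase 2
        // relinks the buckets under the global lock.
        template <typename HashFn>
        bool hashSome(size_t n, HashFn hashKey) {
            size_t max = std::min(hashIdx + n, queue.size());
            for (; hashIdx < max; ++hashIdx)
                queue[hashIdx].hash = hashKey(queue[hashIdx].de);
            return hashIdx < queue.size();
        }
    };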
+ // In this case it will be in the garbage collection list + + dictEntry **pdePrev = &d->ht[0].table[wi.hash & d->ht[0].sizemask]; + while (*pdePrev != nullptr && *pdePrev != wi.de) { + pdePrev = &((*pdePrev)->next); + } + if (*pdePrev != nullptr) { // The element may be NULL if its in the GC list + assert(*pdePrev == wi.de); + *pdePrev = wi.de->next; + // Now link it to the dest hash table + wi.de->next = d->ht[1].table[wi.hash & d->ht[1].sizemask]; + d->ht[1].table[wi.hash & d->ht[1].sizemask] = wi.de; + d->ht[0].used--; + d->ht[1].used++; + } } } @@ -565,7 +584,7 @@ int _dictClear(dict *d, dictht *ht, void(callback)(void *)) { if ((he = ht->table[i]) == NULL) continue; while(he) { nextHe = he->next; - if (i == 0 && d->asyncdata) { + if (d->asyncdata) { he->next = d->asyncdata->deGCList; d->asyncdata->deGCList = he; } else { diff --git a/src/dict.h b/src/dict.h index b5cac21f9..2f6c9677a 100644 --- a/src/dict.h +++ b/src/dict.h @@ -96,6 +96,7 @@ struct dictAsyncRehashCtl { dictEntry *deGCList = nullptr; struct dict *dict = nullptr; std::vector queue; + size_t hashIdx = 0; bool release = false; dictAsyncRehashCtl *next = nullptr; @@ -218,8 +219,9 @@ uint64_t dictGetHash(dict *d, const void *key); dictEntry **dictFindEntryRefByPtrAndHash(dict *d, const void *oldptr, uint64_t hash); /* Async Rehash Functions */ -dictAsyncRehashCtl *dictRehashAsyncStart(dict *d); +dictAsyncRehashCtl *dictRehashAsyncStart(dict *d, int buckets = dictAsyncRehashCtl::c_targetQueueSize); void dictRehashAsync(dictAsyncRehashCtl *ctl); +bool dictRehashSomeAsync(dictAsyncRehashCtl *ctl, size_t hashes); void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl); /* Hash table types */ diff --git a/src/fastlock.cpp b/src/fastlock.cpp index 4cada72cc..61cd1752e 100644 --- a/src/fastlock.cpp +++ b/src/fastlock.cpp @@ -338,7 +338,7 @@ extern "C" void fastlock_init(struct fastlock *lock, const char *name) } #ifndef ASM_SPINLOCK -extern "C" void fastlock_lock(struct fastlock *lock) +extern "C" void fastlock_lock(struct fastlock *lock, spin_worker worker) { int pidOwner; __atomic_load(&lock->m_pidOwner, &pidOwner, __ATOMIC_ACQUIRE); @@ -362,11 +362,16 @@ extern "C" void fastlock_lock(struct fastlock *lock) if ((ticketT.u & 0xffff) == myticket) break; + if (worker != nullptr) { + worker(); + } else { #if defined(__i386__) || defined(__amd64__) - __asm__ __volatile__ ("pause"); + __asm__ __volatile__ ("pause"); #elif defined(__aarch64__) - __asm__ __volatile__ ("yield"); + __asm__ __volatile__ ("yield"); #endif + } + if ((++cloops % loopLimit) == 0) { fastlock_sleep(lock, tid, ticketT.u, myticket); diff --git a/src/fastlock.h b/src/fastlock.h index 0809c8bcd..ede6d4e16 100644 --- a/src/fastlock.h +++ b/src/fastlock.h @@ -6,10 +6,16 @@ extern "C" { #endif +typedef int (*spin_worker)(); + /* Begin C API */ struct fastlock; void fastlock_init(struct fastlock *lock, const char *name); -void fastlock_lock(struct fastlock *lock); +#ifdef __cplusplus +void fastlock_lock(struct fastlock *lock, spin_worker worker = nullptr); +#else +void fastlock_lock(struct fastlock *lock, spin_worker worker); +#endif int fastlock_trylock(struct fastlock *lock, int fWeak); void fastlock_unlock(struct fastlock *lock); void fastlock_free(struct fastlock *lock); @@ -56,9 +62,9 @@ struct fastlock fastlock_init(this, name); } - inline void lock() + inline void lock(spin_worker worker = nullptr) { - fastlock_lock(this); + fastlock_lock(this, worker); } inline bool try_lock(bool fWeak = false) diff --git a/src/fastlock_x64.asm b/src/fastlock_x64.asm index 
26c302434..5fa8e0ade 100644 --- a/src/fastlock_x64.asm +++ b/src/fastlock_x64.asm @@ -22,14 +22,22 @@ fastlock_lock: # [rdi+64] ... # uint16_t active # uint16_t avail + # + # RSI points to a spin function to call, or NULL # First get our TID and put it in ecx - push rdi # we need our struct pointer (also balance the stack for the call) - .cfi_adjust_cfa_offset 8 + sub rsp, 24 # We only use 16 bytes, but we also need the stack aligned + .cfi_adjust_cfa_offset 24 + mov [rsp], rdi # we need our struct pointer (also balance the stack for the call) + mov [rsp+8], rsi # backup the spin function + call gettid # get our thread ID (TLS is nasty in ASM so don't bother inlining) mov esi, eax # back it up in esi - pop rdi # get our pointer back - .cfi_adjust_cfa_offset -8 + + mov rdi, [rsp] # Restore spin struct + mov r8, [rsp+8] # restore the function (in a different register) + add rsp, 24 + .cfi_adjust_cfa_offset -24 cmp [rdi], esi # Is the TID we got back the owner of the lock? je .LLocked # Don't spin in that case @@ -47,6 +55,8 @@ fastlock_lock: # ax now contains the ticket # OK Start the wait loop xor ecx, ecx + test r8, r8 + jnz .LLoopFunction .ALIGN 16 .LLoop: mov edx, [rdi+64] @@ -83,6 +93,40 @@ fastlock_lock: mov [rdi], esi # lock->m_pidOwner = gettid() inc dword ptr [rdi+4] # lock->m_depth++ ret + +.LLoopFunction: + sub rsp, 40 + .cfi_adjust_cfa_offset 40 + xor ecx, ecx + mov [rsp], rcx + mov [rsp+8], r8 + mov [rsp+16], rdi + mov [rsp+24], rsi + mov [rsp+32], eax +.LLoopFunctionCore: + mov edx, [rdi+64] + cmp dx, ax + je .LExitLoopFunction + mov r8, [rsp+8] + call r8 + test eax, eax + jz .LExitLoopFunctionForNormal + mov eax, [rsp+32] # restore clobbered eax + mov rdi, [rsp+16] + jmp .LLoopFunctionCore +.LExitLoopFunction: + mov rsi, [rsp+24] + add rsp, 40 + .cfi_adjust_cfa_offset -40 + jmp .LLocked +.LExitLoopFunctionForNormal: + xor ecx, ecx + mov rdi, [rsp+16] + mov rsi, [rsp+24] + mov eax, [rsp+32] + add rsp, 40 + .cfi_adjust_cfa_offset 40 + jmp .LLoop .cfi_endproc .ALIGN 16 diff --git a/src/server.cpp b/src/server.cpp index fdb8885b1..f1dd30d0d 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1508,13 +1508,12 @@ void tryResizeHashTables(int dbid) { * table will use two tables for a long time. So we try to use 1 millisecond * of CPU time at every call of this function to perform some rehashing. * - * The function returns 1 if some rehashing was performed, otherwise 0 + * The function returns the number of rehashes if some rehashing was performed, otherwise 0 * is returned. */ int incrementallyRehash(int dbid) { /* Keys dictionary */ if (dictIsRehashing(g_pserver->db[dbid].dict)) { - dictRehashMilliseconds(g_pserver->db[dbid].dict,1); - return 1; /* already used our millisecond for this loop... */ + return dictRehashMilliseconds(g_pserver->db[dbid].dict,1); } return 0; } @@ -1772,12 +1771,22 @@ bool expireOwnKeys() return false; } +int hash_spin_worker() { + auto ctl = serverTL->rehashCtl; + if (ctl->hashIdx < ctl->queue.size()) { + ctl->queue[ctl->hashIdx].hash = dictHashKey(ctl->dict, dictGetKey(ctl->queue[ctl->hashIdx].de)); + ctl->hashIdx++; + return true; + } else { + return false; + } +} + /* This function handles 'background' operations we are required to do * incrementally in Redis databases, such as active key expiring, resizing, * rehashing. */ -dictAsyncRehashCtl* databasesCron(bool fMainThread) { - dictAsyncRehashCtl *ctl = nullptr; - +void databasesCron(bool fMainThread) { + static int rehashes_per_ms = 0; if (fMainThread) { /* Expire keys by random sampling. 
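The .LLoopFunction path is the interesting part of this patch: while a thread waits for its ticket to come up, it calls the supplied spin worker (which hashes rehash-queue entries) instead of burning cycles on pause, and it degrades to the normal wait loop once the worker reports no work left. A C++ sketch of that shape against a toy ticket lock; this is illustrative only, since the real fastlock also sleeps on a futex via fastlock_sleep after a loop limit:

    #include <atomic>

    typedef int (*spin_worker)();   // returns nonzero while it has work

    struct TicketLock {             // toy stand-in for struct fastlock
        std::atomic<unsigned> avail{0}, active{0};

        void lock(spin_worker worker = nullptr) {
            unsigned my = avail.fetch_add(1, std::memory_order_relaxed);
            while (active.load(std::memory_order_acquire) != my) {
                // Do useful work while we wait; once the worker runs dry,
                // fall back to a plain spin (real code pauses/sleeps).
                if (worker == nullptr || !worker())
                    while (active.load(std::memory_order_acquire) != my) {}
            }
        }
        void unlock() { active.fetch_add(1, std::memory_order_release); }
    };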
Not required for slaves * as master will synthesize DELs for us. */ @@ -1819,24 +1828,42 @@ dictAsyncRehashCtl* databasesCron(bool fMainThread) { /* Rehash */ if (g_pserver->activerehashing) { for (j = 0; j < dbs_per_call; j++) { - ctl = dictRehashAsyncStart(g_pserver->db[rehash_db].pdict); - if (ctl) + if (serverTL->rehashCtl != nullptr) { + if (dictRehashSomeAsync(serverTL->rehashCtl, 5)) { + break; + } else { + dictCompleteRehashAsync(serverTL->rehashCtl); + serverTL->rehashCtl = nullptr; + } + } + if (rehashes_per_ms > 0) + serverTL->rehashCtl = dictRehashAsyncStart(g_pserver->db[rehash_db].dict, rehashes_per_ms); + if (serverTL->rehashCtl) break; - int work_done = fMainThread && incrementallyRehash(rehash_db); - if (work_done) { - /* If the function did some work, stop here, we'll do - * more at the next cron loop. */ - break; - } else { - /* If this db didn't need rehash, we'll try the next one. */ - rehash_db++; - rehash_db %= cserver.dbnum; + + if (fMainThread) { + // We only synchronously rehash on the main thread, otherwise we'll cause too much latency + rehashes_per_ms = incrementallyRehash(rehash_db); + if (rehashes_per_ms > 0) { + /* If the function did some work, stop here, we'll do + * more at the next cron loop. */ + serverLog(LL_WARNING, "Rehashes per ms: %d", rehashes_per_ms); + break; + } else { + /* If this db didn't need rehash, we'll try the next one. */ + rehash_db++; + rehash_db %= cserver.dbnum; + } } } } } - return ctl; + if (serverTL->rehashCtl) { + setAeLockSetThreadSpinWorker(hash_spin_worker); + } else { + setAeLockSetThreadSpinWorker(nullptr); + } } /* We take a cached value of the unix time in the global state because with @@ -2076,14 +2103,7 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { clientsCron(IDX_EVENT_LOOP_MAIN); /* Handle background operations on Redis databases. */ - auto asyncRehash = databasesCron(true /* fMainThread */); - - if (asyncRehash) { - aeReleaseLock(); - dictRehashAsync(asyncRehash); - aeAcquireLock(); - dictCompleteRehashAsync(asyncRehash); - } + databasesCron(true /* fMainThread */); /* Start a scheduled AOF rewrite if this was requested by the user while * a BGSAVE was in progress. */ @@ -2234,14 +2254,7 @@ int serverCronLite(struct aeEventLoop *eventLoop, long long id, void *clientData } /* Handle background operations on Redis databases. 
*/ - auto asyncRehash = databasesCron(false /* fMainThread */); - - if (asyncRehash) { - aeReleaseLock(); - dictRehashAsync(asyncRehash); - aeAcquireLock(); - dictCompleteRehashAsync(asyncRehash); - } + databasesCron(false /* fMainThread */); /* Unpause clients if enough time has elapsed */ unpauseClientsIfNecessary(); diff --git a/src/server.h b/src/server.h index 0da8b84f3..ea1356889 100644 --- a/src/server.h +++ b/src/server.h @@ -1393,6 +1393,7 @@ struct redisServerThreadVars { long unsigned commandsExecuted = 0; bool fRetrySetAofEvent = false; std::vector vecclientsProcess; + dictAsyncRehashCtl *rehashCtl = nullptr; }; struct redisMaster { From ba006abe029c6766fd9dbcccbe03808134f294ce Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 27 Jan 2021 00:47:22 +0000 Subject: [PATCH 24/40] Ensure rehash completes even when we're in a long running task Former-commit-id: f107746e90f7a8ff3c7094145ee1ad438911e8c2 --- src/dict.cpp | 112 ++++++++++++++++++++++++++------------------- src/dict.h | 4 +- src/fastlock.cpp | 1 + src/networking.cpp | 10 ++++ src/server.cpp | 44 ++++++++++-------- 5 files changed, 103 insertions(+), 68 deletions(-) diff --git a/src/dict.cpp b/src/dict.cpp index e882fff2a..c5d89eea9 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -275,6 +275,7 @@ void dictRehashAsync(dictAsyncRehashCtl *ctl) { wi.hash = dictHashKey(ctl->dict, dictGetKey(wi.de)); } ctl->hashIdx = ctl->queue.size(); + ctl->done = true; } bool dictRehashSomeAsync(dictAsyncRehashCtl *ctl, size_t hashes) { @@ -284,71 +285,88 @@ bool dictRehashSomeAsync(dictAsyncRehashCtl *ctl, size_t hashes) { wi.hash = dictHashKey(ctl->dict, dictGetKey(wi.de)); } + if (ctl->hashIdx == ctl->queue.size()) ctl->done = true; return ctl->hashIdx < ctl->queue.size(); } -void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl) { +void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl, bool fFree) { dict *d = ctl->dict; + assert(ctl->done); + // Unlink ourselves + bool fUnlinked = false; dictAsyncRehashCtl **pprev = &d->asyncdata; + while (*pprev != nullptr) { if (*pprev == ctl) { *pprev = ctl->next; + fUnlinked = true; break; } pprev = &((*pprev)->next); } - // Was the dictionary free'd while we were in flight? - if (ctl->release) { - if (d->asyncdata != nullptr) - d->asyncdata->release = true; - else - dictRelease(d); - delete ctl; - return; - } - - if (d->ht[0].table != nullptr) { // can be null if we're cleared during the rehash - for (auto &wi : ctl->queue) { - // We need to remove it from the source hash table, and store it in the dest. - // Note that it may have been deleted in the meantime and therefore not exist. 
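Patch 24 is best read with the data structure in mind: several dictAsyncRehashCtl batches can now be in flight for one dict, chained through the next pointer. Completion therefore has to unlink its own node from that chain, and when an older batch is still pending, entries on the deferred-free list may still be visible to it, so the list is spliced onto the next batch instead of being freed. A compact sketch of that unlink-and-hand-off step, with a template parameter standing in for the real ctl type (assumed to expose next and a deGCList whose nodes link through ->next):

    template <typename Ctl>
    bool unlinkAndHandOff(Ctl *&head, Ctl *ctl) {
        Ctl **pp = &head;
        while (*pp != nullptr && *pp != ctl) pp = &(*pp)->next;
        if (*pp == nullptr) return false;   // already unlinked, e.g. by
                                            // processEventsWhileBlocked
        *pp = ctl->next;                    // unlink ourselves
        if (ctl->next != nullptr && ctl->deGCList != nullptr) {
            // An older batch may still read these entries: append our
            // garbage list to its list rather than freeing now.
            auto **tail = &ctl->next->deGCList;
            while (*tail != nullptr) tail = &(*tail)->next;
            *tail = ctl->deGCList;
            ctl->deGCList = nullptr;
        }
        return true;
    }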
- // In this case it will be in the garbage collection list - - dictEntry **pdePrev = &d->ht[0].table[wi.hash & d->ht[0].sizemask]; - while (*pdePrev != nullptr && *pdePrev != wi.de) { - pdePrev = &((*pdePrev)->next); - } - if (*pdePrev != nullptr) { // The element may be NULL if its in the GC list - assert(*pdePrev == wi.de); - *pdePrev = wi.de->next; - // Now link it to the dest hash table - wi.de->next = d->ht[1].table[wi.hash & d->ht[1].sizemask]; - d->ht[1].table[wi.hash & d->ht[1].sizemask] = wi.de; - d->ht[0].used--; - d->ht[1].used++; - } + if (fUnlinked) { + if (ctl->next != nullptr && ctl->deGCList != nullptr) { + // An older work item may depend on our free list, so pass our free list to them + dictEntry **deGC = &ctl->next->deGCList; + while (*deGC != nullptr) deGC = &((*deGC)->next); + *deGC = ctl->deGCList; + ctl->deGCList = nullptr; } } - // Now free anyting in the GC list - while (ctl->deGCList != nullptr) { - auto next = ctl->deGCList->next; - dictFreeKey(d, ctl->deGCList); - dictFreeVal(d, ctl->deGCList); - zfree(ctl->deGCList); - ctl->deGCList = next; + if (fUnlinked && !ctl->release) { + if (d->ht[0].table != nullptr) { // can be null if we're cleared during the rehash + for (auto &wi : ctl->queue) { + // We need to remove it from the source hash table, and store it in the dest. + // Note that it may have been deleted in the meantime and therefore not exist. + // In this case it will be in the garbage collection list + + dictEntry **pdePrev = &d->ht[0].table[wi.hash & d->ht[0].sizemask]; + while (*pdePrev != nullptr && *pdePrev != wi.de) { + pdePrev = &((*pdePrev)->next); + } + if (*pdePrev != nullptr) { // The element may be NULL if its in the GC list + assert(*pdePrev == wi.de); + *pdePrev = wi.de->next; + // Now link it to the dest hash table + wi.de->next = d->ht[1].table[wi.hash & d->ht[1].sizemask]; + d->ht[1].table[wi.hash & d->ht[1].sizemask] = wi.de; + d->ht[0].used--; + d->ht[1].used++; + } + } + } + + /* Check if we already rehashed the whole table... */ + if (d->ht[0].used == 0 && d->asyncdata == nullptr) { + zfree(d->ht[0].table); + d->ht[0] = d->ht[1]; + _dictReset(&d->ht[1]); + d->rehashidx = -1; + } } - /* Check if we already rehashed the whole table... */ - if (d->ht[0].used == 0 && d->asyncdata == nullptr) { - zfree(d->ht[0].table); - d->ht[0] = d->ht[1]; - _dictReset(&d->ht[1]); - d->rehashidx = -1; - } + if (fFree) { + while (ctl->deGCList != nullptr) { + auto next = ctl->deGCList->next; + dictFreeKey(d, ctl->deGCList); + dictFreeVal(d, ctl->deGCList); + zfree(ctl->deGCList); + ctl->deGCList = next; + } - delete ctl; + // Was the dictionary free'd while we were in flight? 
+ if (ctl->release) { + if (d->asyncdata != nullptr) + d->asyncdata->release = true; + else + dictRelease(d); + } + + delete ctl; + } } long long timeInMilliseconds(void) { @@ -506,7 +524,7 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { else d->ht[table].table[idx] = he->next; if (!nofree) { - if (table == 0 && d->asyncdata != nullptr) { + if (table == 0 && d->asyncdata != nullptr && idx < d->rehashidx) { he->next = d->asyncdata->deGCList; d->asyncdata->deGCList = he->next; } else { @@ -584,7 +602,7 @@ int _dictClear(dict *d, dictht *ht, void(callback)(void *)) { if ((he = ht->table[i]) == NULL) continue; while(he) { nextHe = he->next; - if (d->asyncdata) { + if (d->asyncdata && i < d->rehashidx) { he->next = d->asyncdata->deGCList; d->asyncdata->deGCList = he; } else { diff --git a/src/dict.h b/src/dict.h index 2f6c9677a..9abba00eb 100644 --- a/src/dict.h +++ b/src/dict.h @@ -37,6 +37,7 @@ #ifdef __cplusplus #include +#include extern "C" { #endif @@ -99,6 +100,7 @@ struct dictAsyncRehashCtl { size_t hashIdx = 0; bool release = false; dictAsyncRehashCtl *next = nullptr; + std::atomic done { false }; dictAsyncRehashCtl(struct dict *d, dictAsyncRehashCtl *next) : dict(d), next(next) { queue.reserve(c_targetQueueSize); @@ -222,7 +224,7 @@ dictEntry **dictFindEntryRefByPtrAndHash(dict *d, const void *oldptr, uint64_t h dictAsyncRehashCtl *dictRehashAsyncStart(dict *d, int buckets = dictAsyncRehashCtl::c_targetQueueSize); void dictRehashAsync(dictAsyncRehashCtl *ctl); bool dictRehashSomeAsync(dictAsyncRehashCtl *ctl, size_t hashes); -void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl); +void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl, bool fFree); /* Hash table types */ extern dictType dictTypeHeapStringCopyKey; diff --git a/src/fastlock.cpp b/src/fastlock.cpp index 61cd1752e..8c1c5f6e4 100644 --- a/src/fastlock.cpp +++ b/src/fastlock.cpp @@ -356,6 +356,7 @@ extern "C" void fastlock_lock(struct fastlock *lock, spin_worker worker) __atomic_load(&g_fHighCpuPressure, &fHighPressure, __ATOMIC_RELAXED); unsigned loopLimit = fHighPressure ? 
0x10000 : 0x100000; + // WARNING:::: THIS DOESN"T MATCH ASM for (;;) { __atomic_load(&lock->m_ticket.u, &ticketT.u, __ATOMIC_ACQUIRE); diff --git a/src/networking.cpp b/src/networking.cpp index dc4e56e92..36470c0db 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -3486,6 +3486,16 @@ void processEventsWhileBlocked(int iel) { locker.arm(nullptr); locker.release(); + // Try to complete any async rehashes (this would normally happen in dbCron, but that won't run here) + for (int idb = 0; idb < cserver.dbnum; ++idb) { + redisDb *db = &g_pserver->db[idb]; + while (db->pdict->asyncdata != nullptr) { + if (!db->pdict->asyncdata->done) + break; + dictCompleteRehashAsync(db->pdict->asyncdata, false /*fFree*/); + } + } + // Restore it so the calling code is not confused if (fReplBacklog) { g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; diff --git a/src/server.cpp b/src/server.cpp index f1dd30d0d..be140c48c 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1773,19 +1773,14 @@ bool expireOwnKeys() int hash_spin_worker() { auto ctl = serverTL->rehashCtl; - if (ctl->hashIdx < ctl->queue.size()) { - ctl->queue[ctl->hashIdx].hash = dictHashKey(ctl->dict, dictGetKey(ctl->queue[ctl->hashIdx].de)); - ctl->hashIdx++; - return true; - } else { - return false; - } + return dictRehashSomeAsync(ctl, 1); } /* This function handles 'background' operations we are required to do * incrementally in Redis databases, such as active key expiring, resizing, * rehashing. */ void databasesCron(bool fMainThread) { + serverAssert(GlobalLocksAcquired()); static int rehashes_per_ms = 0; if (fMainThread) { /* Expire keys by random sampling. Not required for slaves @@ -1832,29 +1827,38 @@ void databasesCron(bool fMainThread) { if (dictRehashSomeAsync(serverTL->rehashCtl, 5)) { break; } else { - dictCompleteRehashAsync(serverTL->rehashCtl); + dictCompleteRehashAsync(serverTL->rehashCtl, true /*fFree*/); serverTL->rehashCtl = nullptr; } } + + serverAssert(serverTL->rehashCtl == nullptr); if (rehashes_per_ms > 0) serverTL->rehashCtl = dictRehashAsyncStart(g_pserver->db[rehash_db].dict, rehashes_per_ms); if (serverTL->rehashCtl) break; - if (fMainThread) { - // We only synchronously rehash on the main thread, otherwise we'll cause too much latency - rehashes_per_ms = incrementallyRehash(rehash_db); - if (rehashes_per_ms > 0) { - /* If the function did some work, stop here, we'll do - * more at the next cron loop. */ - serverLog(LL_WARNING, "Rehashes per ms: %d", rehashes_per_ms); - break; - } else { - /* If this db didn't need rehash, we'll try the next one. */ - rehash_db++; - rehash_db %= cserver.dbnum; + // Before starting anything new, can we end the rehash of a blocked thread? + if (g_pserver->db[rehash_db].pdict->asyncdata != nullptr) { + auto asyncdata = g_pserver->db[rehash_db].pdict->asyncdata; + if (asyncdata->done) { + dictCompleteRehashAsync(asyncdata, false /*fFree*/); // Don't free because we don't own the pointer + serverAssert(g_pserver->db[rehash_db].pdict->asyncdata != asyncdata); + break; // completion can be expensive, don't do anything else } } + + rehashes_per_ms = incrementallyRehash(rehash_db); + if (rehashes_per_ms > 0) { + /* If the function did some work, stop here, we'll do + * more at the next cron loop. */ + serverLog(LL_WARNING, "Rehashes per ms: %d", rehashes_per_ms); + break; + } else if (g_pserver->db[rehash_db].pdict->asyncdata == nullptr) { + /* If this db didn't need rehash and we have none in flight, we'll try the next one. 
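The control flow taking shape in databasesCron is calibrate-then-delegate: incrementallyRehash() now reports how many buckets one millisecond of synchronous rehashing moved, and that figure sizes the asynchronous batches handed to dictRehashAsyncStart(). A sketch of the calibration probe itself (the helper name and callback are illustrative):

    #include <chrono>

    // Run the synchronous single-bucket step for ~1ms and report the
    // throughput; 0 means nothing needed rehashing. The caller feeds the
    // result to dictRehashAsyncStart() as the batch size.
    template <typename StepFn>   // StepFn moves one bucket, returns false when done
    int calibrateRehashesPerMs(StepFn step) {
        using namespace std::chrono;
        auto deadline = steady_clock::now() + milliseconds(1);
        int buckets = 0;
        while (steady_clock::now() < deadline && step())
            ++buckets;
        return buckets;
    }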
*/ + rehash_db++; + rehash_db %= cserver.dbnum; + } } } } From 27552cb307911c87bf658f1cb2b95fa2a4a597a0 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 27 Jan 2021 05:05:05 +0000 Subject: [PATCH 25/40] Ensure the C lock implementation behaves the same as the ASM Former-commit-id: b5ddc11c46d2eabd28fae4c69927c356dd18bf6e --- src/fastlock.cpp | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/fastlock.cpp b/src/fastlock.cpp index 8c1c5f6e4..6e6b6e4d2 100644 --- a/src/fastlock.cpp +++ b/src/fastlock.cpp @@ -356,26 +356,32 @@ extern "C" void fastlock_lock(struct fastlock *lock, spin_worker worker) __atomic_load(&g_fHighCpuPressure, &fHighPressure, __ATOMIC_RELAXED); unsigned loopLimit = fHighPressure ? 0x10000 : 0x100000; - // WARNING:::: THIS DOESN"T MATCH ASM - for (;;) - { - __atomic_load(&lock->m_ticket.u, &ticketT.u, __ATOMIC_ACQUIRE); - if ((ticketT.u & 0xffff) == myticket) - break; + if (worker != nullptr) { + for (;;) { + __atomic_load(&lock->m_ticket.u, &ticketT.u, __ATOMIC_ACQUIRE); + if ((ticketT.u & 0xffff) == myticket) + break; + if (!worker()) + goto LNormalLoop; + } + } else { +LNormalLoop: + for (;;) + { + __atomic_load(&lock->m_ticket.u, &ticketT.u, __ATOMIC_ACQUIRE); + if ((ticketT.u & 0xffff) == myticket) + break; - if (worker != nullptr) { - worker(); - } else { #if defined(__i386__) || defined(__amd64__) __asm__ __volatile__ ("pause"); #elif defined(__aarch64__) __asm__ __volatile__ ("yield"); #endif - } - if ((++cloops % loopLimit) == 0) - { - fastlock_sleep(lock, tid, ticketT.u, myticket); + if ((++cloops % loopLimit) == 0) + { + fastlock_sleep(lock, tid, ticketT.u, myticket); + } } } From 1b3dc3d422ccb7e7cfd87451a4b41b0f20159df7 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 27 Jan 2021 06:37:10 +0000 Subject: [PATCH 26/40] The server should recalibrate periodically. Also reduce log noise Former-commit-id: 8d8e1810d49da6aa921d2327cb4ea250c2b5b234 --- src/networking.cpp | 6 +++--- src/server.cpp | 18 ++++++++++++------ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 36470c0db..8e5683f65 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -3489,10 +3489,10 @@ void processEventsWhileBlocked(int iel) { // Try to complete any async rehashes (this would normally happen in dbCron, but that won't run here) for (int idb = 0; idb < cserver.dbnum; ++idb) { redisDb *db = &g_pserver->db[idb]; - while (db->pdict->asyncdata != nullptr) { - if (!db->pdict->asyncdata->done) + while (db->dict->asyncdata != nullptr) { + if (!db->dict->asyncdata->done) break; - dictCompleteRehashAsync(db->pdict->asyncdata, false /*fFree*/); + dictCompleteRehashAsync(db->dict->asyncdata, false /*fFree*/); } } diff --git a/src/server.cpp b/src/server.cpp index be140c48c..d95e207dc 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1782,6 +1782,7 @@ int hash_spin_worker() { void databasesCron(bool fMainThread) { serverAssert(GlobalLocksAcquired()); static int rehashes_per_ms = 0; + static int async_rehashes = 0; if (fMainThread) { /* Expire keys by random sampling. Not required for slaves * as master will synthesize DELs for us. */ @@ -1833,28 +1834,33 @@ void databasesCron(bool fMainThread) { } serverAssert(serverTL->rehashCtl == nullptr); - if (rehashes_per_ms > 0) + /* Are we async rehashing? And if so is it time to re-calibrate? 
*/ + /* The recalibration limit is a prime number to ensure balancing across threads */ + if (rehashes_per_ms > 0 && async_rehashes < 131) { serverTL->rehashCtl = dictRehashAsyncStart(g_pserver->db[rehash_db].dict, rehashes_per_ms); + ++async_rehashes; + } if (serverTL->rehashCtl) break; // Before starting anything new, can we end the rehash of a blocked thread? - if (g_pserver->db[rehash_db].pdict->asyncdata != nullptr) { - auto asyncdata = g_pserver->db[rehash_db].pdict->asyncdata; + if (g_pserver->db[rehash_db].dict->asyncdata != nullptr) { + auto asyncdata = g_pserver->db[rehash_db].dict->asyncdata; if (asyncdata->done) { dictCompleteRehashAsync(asyncdata, false /*fFree*/); // Don't free because we don't own the pointer - serverAssert(g_pserver->db[rehash_db].pdict->asyncdata != asyncdata); + serverAssert(g_pserver->db[rehash_db].dict->asyncdata != asyncdata); break; // completion can be expensive, don't do anything else } } rehashes_per_ms = incrementallyRehash(rehash_db); + async_rehashes = 0; if (rehashes_per_ms > 0) { /* If the function did some work, stop here, we'll do * more at the next cron loop. */ - serverLog(LL_WARNING, "Rehashes per ms: %d", rehashes_per_ms); + serverLog(LL_VERBOSE, "Calibrated rehashes per ms: %d", rehashes_per_ms); break; - } else if (g_pserver->db[rehash_db].pdict->asyncdata == nullptr) { + } else if (g_pserver->db[rehash_db].dict->asyncdata == nullptr) { /* If this db didn't need rehash and we have none in flight, we'll try the next one. */ rehash_db++; rehash_db %= cserver.dbnum; From 662fc28fdc8e1617589ae529176cf2adcd28a3e6 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 27 Jan 2021 06:50:01 +0000 Subject: [PATCH 27/40] Disable multithreaded rehash when active defrag is enabled. The two are not compatible Former-commit-id: 56addcd17262ffbaefea26c0097cde53c616ece2 --- src/server.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/server.cpp b/src/server.cpp index d95e207dc..4e4418b02 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1836,7 +1836,7 @@ void databasesCron(bool fMainThread) { serverAssert(serverTL->rehashCtl == nullptr); /* Are we async rehashing? And if so is it time to re-calibrate? */ /* The recalibration limit is a prime number to ensure balancing across threads */ - if (rehashes_per_ms > 0 && async_rehashes < 131) { + if (rehashes_per_ms > 0 && async_rehashes < 131 && cserver.active_defrag_enabled) { serverTL->rehashCtl = dictRehashAsyncStart(g_pserver->db[rehash_db].dict, rehashes_per_ms); ++async_rehashes; } @@ -1858,7 +1858,9 @@ void databasesCron(bool fMainThread) { if (rehashes_per_ms > 0) { /* If the function did some work, stop here, we'll do * more at the next cron loop. */ - serverLog(LL_VERBOSE, "Calibrated rehashes per ms: %d", rehashes_per_ms); + if (!cserver.active_defrag_enabled) { + serverLog(LL_VERBOSE, "Calibrated rehashes per ms: %d", rehashes_per_ms); + } break; } else if (g_pserver->db[rehash_db].dict->asyncdata == nullptr) { /* If this db didn't need rehash and we have none in flight, we'll try the next one. 
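Putting these patches together, the per-database decision order in the cron becomes: start a new async batch while the calibration is still fresh (recalibrating every 131 batches, a prime so multiple server threads do not recalibrate in lockstep); else finish a batch another thread has already hashed; else recalibrate synchronously on this thread. A condensed, compilable sketch of that ordering, with stand-in types and callbacks for dictRehashAsyncStart, dictCompleteRehashAsync and incrementallyRehash:

    struct Batch { bool done = false; Batch *next = nullptr; };
    struct Dict  { Batch *asyncdata = nullptr; };

    // Returns the batch this thread should keep hashing, or nullptr.
    template <typename Start, typename Complete, typename Calibrate>
    Batch *scheduleRehash(Dict &d, int &perMs, int &batches,
                          Start startBatch, Complete completeBatch,
                          Calibrate calibrate) {
        if (perMs > 0 && batches < 131) {        // calibration still fresh
            ++batches;
            if (Batch *b = startBatch(d, perMs)) return b;
        }
        if (d.asyncdata && d.asyncdata->done) {  // finish another thread's work
            completeBatch(d.asyncdata);          // expensive: do nothing else
            return nullptr;
        }
        perMs = calibrate(d);                    // 1ms synchronous probe
        batches = 0;
        return nullptr;
    }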
*/ From 4efab08466b45729ace98d8a9c5f8ecb7753a219 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 8 Feb 2021 00:10:38 +0000 Subject: [PATCH 28/40] Fix bug in fastlock stack metadata Former-commit-id: 241d1bd7e1ed64885adbb07653a5c7e8ef882607 --- src/fastlock_x64.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastlock_x64.asm b/src/fastlock_x64.asm index 5fa8e0ade..9e17f8995 100644 --- a/src/fastlock_x64.asm +++ b/src/fastlock_x64.asm @@ -125,7 +125,7 @@ fastlock_lock: mov rsi, [rsp+24] mov eax, [rsp+32] add rsp, 40 - .cfi_adjust_cfa_offset 40 + .cfi_adjust_cfa_offset -40 jmp .LLoop .cfi_endproc From 3de15c16ac89f369d38a09aa6dc3746e009b1364 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 8 Feb 2021 19:02:01 +0000 Subject: [PATCH 29/40] Fix deadlock with disconnecting client and storage provider Former-commit-id: 0831745323e425f463322e8c0dc27fc25854868e --- src/server.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/server.cpp b/src/server.cpp index 137976437..5e566f957 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2592,6 +2592,18 @@ void beforeSleep(struct aeEventLoop *eventLoop) { int aof_state = g_pserver->aof_state; + mstime_t commit_latency; + latencyStartMonitor(commit_latency); + if (g_pserver->m_pstorageFactory != nullptr) + { + locker.disarm(); + for (redisDb *db : vecdb) + db->commitChanges(); + locker.arm(); + } + latencyEndMonitor(commit_latency); + latencyAddSampleIfNeeded("storage-commit", commit_latency); + /* We try to handle writes at the end so we don't have to reacquire the lock, but if there is a pending async close we need to ensure the writes happen first so perform it here */ @@ -2602,16 +2614,6 @@ void beforeSleep(struct aeEventLoop *eventLoop) { locker.arm(); fSentReplies = true; } - - mstime_t commit_latency; - latencyStartMonitor(commit_latency); - if (g_pserver->m_pstorageFactory != nullptr) - { - for (redisDb *db : vecdb) - db->commitChanges(); - } - latencyEndMonitor(commit_latency); - latencyAddSampleIfNeeded("storage-commit", commit_latency); if (!serverTL->gcEpoch.isReset()) g_pserver->garbageCollector.endEpoch(serverTL->gcEpoch, true /*fNoFree*/); From e5e13502196741a8ffe83fb0d0a8668e10e0638b Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 9 Feb 2021 02:41:44 +0000 Subject: [PATCH 30/40] Fix polarity issue in async rehash Former-commit-id: 9eefba49d4fcde7f12929929d7aeb36f5186a63d --- src/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server.cpp b/src/server.cpp index 5e566f957..e9e71b46e 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1950,7 +1950,7 @@ void databasesCron(bool fMainThread) { ::dict *dict = g_pserver->db[rehash_db]->dictUnsafeKeyOnly(); /* Are we async rehashing? And if so is it time to re-calibrate? 
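Patch 29 above is an ordering fix worth dwelling on: storage-provider commits are hoisted ahead of the reply flush in beforeSleep(), and they run with the global lock disarmed, so a disconnecting client that holds the lock can no longer deadlock against a blocking commit. The shape, reduced to a sketch with std::mutex standing in for the AeLocker:

    #include <mutex>
    #include <vector>

    struct Db { void commitChanges() { /* blocking storage I/O */ } };

    // Commit every database with the big lock released, then retake it
    // before touching shared state again: the post-patch-29 ordering.
    void commitOutsideLock(std::unique_lock<std::mutex> &locker,
                           std::vector<Db*> &vecdb) {
        locker.unlock();               // "disarm": I/O may block
        for (Db *db : vecdb)
            db->commitChanges();
        locker.lock();                 // "arm" again
    }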
*/ /* The recalibration limit is a prime number to ensure balancing across threads */ - if (rehashes_per_ms > 0 && async_rehashes < 131 && cserver.active_defrag_enabled) { + if (rehashes_per_ms > 0 && async_rehashes < 131 && !cserver.active_defrag_enabled) { serverTL->rehashCtl = dictRehashAsyncStart(dict, rehashes_per_ms); ++async_rehashes; } From ef471e6e53c3e2abe1514706282d963dc74e8991 Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 11 Feb 2021 00:50:21 +0000 Subject: [PATCH 31/40] Enable prefetch with storage providers Former-commit-id: 21f9c52c446ad1b460a340c3d873839b2355728f --- src/aof.cpp | 7 +- src/blocked.cpp | 2 +- src/db.cpp | 17 ++-- src/networking.cpp | 190 ++++++++++++++++++++++------------- src/replication.cpp | 11 +- src/server.cpp | 4 +- src/server.h | 65 ++++++++++-- tests/unit/introspection.tcl | 2 +- 8 files changed, 205 insertions(+), 93 deletions(-) diff --git a/src/aof.cpp b/src/aof.cpp index ab3aca2c6..e8f77e879 100644 --- a/src/aof.cpp +++ b/src/aof.cpp @@ -741,7 +741,7 @@ void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int a /* In Redis commands are always executed in the context of a client, so in * order to load the append only file we need to create a fake client. */ struct client *createAOFClient(void) { - struct client *c =(client*) zmalloc(sizeof(*c), MALLOC_LOCAL); + struct client *c = new client(); selectDb(c,0); c->id = CLIENT_ID_AOF; /* So modules can identify it's the AOF client. */ @@ -752,7 +752,6 @@ struct client *createAOFClient(void) { c->querybuf_peak = 0; c->argc = 0; c->argv = NULL; - c->argv_len_sum = 0; c->bufpos = 0; c->flags = 0; c->fPendingAsyncWrite = FALSE; @@ -783,7 +782,7 @@ void freeFakeClientArgv(struct client *c) { for (j = 0; j < c->argc; j++) decrRefCount(c->argv[j]); zfree(c->argv); - c->argv_len_sum = 0; + c->argv_len_sumActive = 0; } void freeFakeClient(struct client *c) { @@ -793,7 +792,7 @@ void freeFakeClient(struct client *c) { freeClientMultiState(c); fastlock_unlock(&c->lock); fastlock_free(&c->lock); - zfree(c); + delete c; } /* Replay the append log file. On success C_OK is returned. On non fatal diff --git a/src/blocked.cpp b/src/blocked.cpp index d4b7a4804..085090b50 100644 --- a/src/blocked.cpp +++ b/src/blocked.cpp @@ -121,7 +121,7 @@ void processUnblockedClients(int iel) { * the code is conceptually more correct this way. 
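A small but load-bearing change in patch 31: createAOFClient (and createClient below it) switch from zmalloc to new/delete. The client struct now carries non-trivial C++ members, such as the vecqueuedcmd vector and a fastlock with an in-class initializer, and raw malloc-style allocation would leave those unconstructed. A two-line illustration of why:

    #include <vector>

    struct Client { std::vector<int> queued; };  // non-trivial member

    int main() {
        // (Client*)malloc(sizeof(Client)) would leave 'queued' as garbage,
        // which is undefined behavior on first use. 'new' runs constructors.
        Client *c = new Client();
        c->queued.push_back(1);
        delete c;                                // and 'delete' the destructors
    }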
*/ if (!(c->flags & CLIENT_BLOCKED)) { if (c->querybuf && sdslen(c->querybuf) > 0) { - processInputBuffer(c, CMD_CALL_FULL); + processInputBuffer(c, true /*fParse*/, CMD_CALL_FULL); } } fastlock_unlock(&c->lock); diff --git a/src/db.cpp b/src/db.cpp index 57e29c7d1..14c292b6f 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -2959,24 +2959,27 @@ int dbnumFromDb(redisDb *db) serverPanic("invalid database pointer"); } -void redisDbPersistentData::prefetchKeysAsync(AeLocker &lock, client *c) +void redisDbPersistentData::prefetchKeysAsync(parsed_command &command) { if (m_spstorage == nullptr) return; + AeLocker lock; + std::vector veckeys; - lock.arm(c); - getKeysResult* result = nullptr; - int numkeys = getKeysFromCommand(c->cmd, c->argv, c->argc, result); + lock.arm(nullptr); + getKeysResult result = GETKEYS_RESULT_INIT; + auto cmd = lookupCommand(szFromObj(command.argv[0])); + int numkeys = getKeysFromCommand(cmd, command.argv, command.argc, &result); for (int ikey = 0; ikey < numkeys; ++ikey) { - robj *objKey = c->argv[result->keys[ikey]]; + robj *objKey = command.argv[result.keys[ikey]]; if (this->find_cached_threadsafe(szFromObj(objKey)) == nullptr) veckeys.push_back(objKey); } lock.disarm(); - getKeysFreeResult(result); + getKeysFreeResult(&result); std::vector>> vecInserts; for (robj *objKey : veckeys) @@ -2997,7 +3000,7 @@ void redisDbPersistentData::prefetchKeysAsync(AeLocker &lock, client *c) vecInserts.emplace_back(sharedKey, o, std::move(spexpire)); } - lock.arm(c); + lock.arm(nullptr); for (auto &tuple : vecInserts) { sds sharedKey = std::get<0>(tuple); diff --git a/src/networking.cpp b/src/networking.cpp index 80214963d..be50373c9 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -102,7 +102,7 @@ void linkClient(client *c) { } client *createClient(connection *conn, int iel) { - client *c = (client*)zmalloc(sizeof(client), MALLOC_LOCAL); + client *c = new client; serverAssert(conn == nullptr || (iel == (serverTL - g_pserver->rgthreadvar))); c->iel = iel; @@ -124,7 +124,6 @@ client *createClient(connection *conn, int iel) { uint64_t client_id; client_id = g_pserver->next_client_id.fetch_add(1); c->iel = iel; - fastlock_init(&c->lock, "client"); c->id = client_id; c->resp = 2; c->conn = conn; @@ -137,7 +136,6 @@ client *createClient(connection *conn, int iel) { c->reqtype = 0; c->argc = 0; c->argv = NULL; - c->argv_len_sum = 0; c->cmd = c->lastcmd = NULL; c->puser = DefaultUser; c->multibulklen = 0; @@ -157,6 +155,7 @@ client *createClient(connection *conn, int iel) { c->reploff = 0; c->reploff_skipped = 0; c->read_reploff = 0; + c->reploff_cmd = 0; c->repl_ack_off = 0; c->repl_ack_time = 0; c->slave_listening_port = 0; @@ -203,6 +202,13 @@ client *createClient(connection *conn, int iel) { return c; } +size_t client::argv_len_sum() const { + size_t sum = 0; + for (auto &cmd : vecqueuedcmd) + sum += cmd.argv_len_sum; + return sum + argv_len_sumActive; +} + /* This function puts the client in the queue of clients that should write * their output buffers to the socket. 
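prefetchKeysAsync follows a three-phase shape that recurs in this series: identify cache misses under the lock, do the slow storage reads with the lock dropped, then re-acquire it to publish the results. A self-contained sketch, with Storage and Cache as stand-ins for the real providers:

    #include <mutex>
    #include <string>
    #include <utility>
    #include <vector>

    struct Storage { std::string get(const std::string &) { return {}; } };
    struct Cache {
        bool contains(const std::string &) const { return false; }
        void insert(std::string, std::string) {}
    };

    void prefetch(std::mutex &biglock, Storage &disk, Cache &cache,
                  const std::vector<std::string> &cmdKeys) {
        std::vector<std::string> misses;
        {
            std::lock_guard<std::mutex> g(biglock);   // phase 1: find misses
            for (const auto &k : cmdKeys)
                if (!cache.contains(k)) misses.push_back(k);
        }
        std::vector<std::pair<std::string, std::string>> loaded;
        for (const auto &k : misses)                  // phase 2: I/O, no lock
            loaded.emplace_back(k, disk.get(k));
        std::lock_guard<std::mutex> g(biglock);       // phase 3: publish
        for (auto &kv : loaded)
            cache.insert(std::move(kv.first), std::move(kv.second));
    }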
Note that it does not *yet* install * the write handler, to start clients are put in a queue of clients that need @@ -1343,7 +1349,7 @@ static void freeClientArgv(client *c) { decrRefCount(c->argv[j]); c->argc = 0; c->cmd = NULL; - c->argv_len_sum = 0; + c->argv_len_sumActive = 0; } void disconnectSlavesExcept(unsigned char *uuid) @@ -1574,12 +1580,12 @@ bool freeClient(client *c) { zfree(c->replyAsync); if (c->name) decrRefCount(c->name); zfree(c->argv); - c->argv_len_sum = 0; + c->argv_len_sumActive = 0; freeClientMultiState(c); sdsfree(c->peerid); ulock.unlock(); fastlock_free(&c->lock); - zfree(c); + delete c; return true; } @@ -1918,9 +1924,6 @@ void resetClient(client *c) { redisCommandProc *prevcmd = c->cmd ? c->cmd->proc : NULL; freeClientArgv(c); - c->reqtype = 0; - c->multibulklen = 0; - c->bulklen = -1; /* We clear the ASKING flag as well if we are not inside a MULTI, and * if what we just executed is not the ASKING command itself. */ @@ -2042,16 +2045,13 @@ int processInlineBuffer(client *c) { /* Setup argv array on client structure */ if (argc) { - if (c->argv) zfree(c->argv); - c->argv = (robj**)zmalloc(sizeof(robj*)*argc, MALLOC_LOCAL); - c->argv_len_sum = 0; - } - - /* Create redis objects for all arguments. */ - for (c->argc = 0, j = 0; j < argc; j++) { - c->argv[c->argc] = createObject(OBJ_STRING,argv[j]); - c->argc++; - c->argv_len_sum += sdslen(argv[j]); + /* Create redis objects for all arguments. */ + c->vecqueuedcmd.emplace_back(argc); + auto &cmd = c->vecqueuedcmd.back(); + for (cmd.argc = 0, j = 0; j < argc; j++) { + cmd.argv[cmd.argc++] = createObject(OBJ_STRING,argv[j]); + cmd.argv_len_sum += sdslen(argv[j]); + } } sds_free(argv); return C_OK; @@ -2141,9 +2141,7 @@ int processMultibulkBuffer(client *c) { c->multibulklen = ll; /* Setup argv array on client structure */ - if (c->argv) zfree(c->argv); - c->argv = (robj**)zmalloc(sizeof(robj*)*c->multibulklen, MALLOC_LOCAL); - c->argv_len_sum = 0; + c->vecqueuedcmd.emplace_back(c->multibulklen); } serverAssertWithInfo(c,NULL,c->multibulklen > 0); @@ -2211,21 +2209,22 @@ int processMultibulkBuffer(client *c) { /* Optimization: if the buffer contains JUST our bulk element * instead of creating a new object by *copying* the sds we * just use the current sds string. */ + auto &cmd = c->vecqueuedcmd.back(); if (c->qb_pos == 0 && c->bulklen >= PROTO_MBULK_BIG_ARG && sdslen(c->querybuf) == (size_t)(c->bulklen+2)) { - c->argv[c->argc++] = createObject(OBJ_STRING,c->querybuf); - c->argv_len_sum += c->bulklen; + cmd.argv[cmd.argc++] = createObject(OBJ_STRING,c->querybuf); + cmd.argv_len_sum += c->bulklen; sdsIncrLen(c->querybuf,-2); /* remove CRLF */ /* Assume that if we saw a fat argument we'll see another one * likely... */ c->querybuf = sdsnewlen(SDS_NOINIT,c->bulklen+2); sdsclear(c->querybuf); } else { - c->argv[c->argc++] = + cmd.argv[cmd.argc++] = createStringObject(c->querybuf+c->qb_pos,c->bulklen); - c->argv_len_sum += c->bulklen; + cmd.argv_len_sum += c->bulklen; c->qb_pos += c->bulklen+2; } c->bulklen = -1; @@ -2249,7 +2248,8 @@ void commandProcessed(client *c, int flags) { long long prev_offset = c->reploff; if (c->flags & CLIENT_MASTER && !(c->flags & CLIENT_MULTI)) { /* Update the applied replication offset of our master. 
*/ - c->reploff = c->read_reploff - sdslen(c->querybuf) + c->qb_pos; + serverAssert(c->reploff <= c->reploff_cmd); + c->reploff = c->reploff_cmd; } /* Don't reset the client structure for clients blocked in a @@ -2306,34 +2306,34 @@ int processCommandAndResetClient(client *c, int flags) { return deadclient ? C_ERR : C_OK; } -/* This function is called every time, in the client structure 'c', there is - * more query buffer to process, because we read more data from the socket - * or because a client was blocked and later reactivated, so there could be - * pending query buffer, already representing a full command, to process. */ -void processInputBuffer(client *c, int callFlags) { - AssertCorrectThread(c); +static bool FClientReady(client *c) { + /* Return if clients are paused. */ + if (!(c->flags & CLIENT_SLAVE) && clientsArePaused()) return false; + + /* Immediately abort if the client is in the middle of something. */ + if (c->flags & CLIENT_BLOCKED) return false; + + /* Don't process input from the master while there is a busy script + * condition on the replica. We want just to accumulate the replication + * stream (instead of replying -BUSY like we do with other clients) and + * later resume the processing. */ + if (g_pserver->lua_timedout && c->flags & CLIENT_MASTER) return false; + + /* CLIENT_CLOSE_AFTER_REPLY closes the connection once the reply is + * written to the client. Make sure to not let the reply grow after + * this flag has been set (i.e. don't process more commands). + * + * The same applies for clients we want to terminate ASAP. */ + if (c->flags & (CLIENT_CLOSE_AFTER_REPLY|CLIENT_CLOSE_ASAP)) return false; + + return true; +} + +void parseClientCommandBuffer(client *c) { + if (!FClientReady(c)) + return; - /* Keep processing while there is something in the input buffer */ - while(c->qb_pos < sdslen(c->querybuf)) { - /* Return if clients are paused. */ - if (!(c->flags & CLIENT_SLAVE) && clientsArePaused()) break; - - /* Immediately abort if the client is in the middle of something. */ - if (c->flags & CLIENT_BLOCKED) break; - - /* Don't process input from the master while there is a busy script - * condition on the replica. We want just to accumulate the replication - * stream (instead of replying -BUSY like we do with other clients) and - * later resume the processing. */ - if (g_pserver->lua_timedout && c->flags & CLIENT_MASTER) break; - - /* CLIENT_CLOSE_AFTER_REPLY closes the connection once the reply is - * written to the client. Make sure to not let the reply grow after - * this flag has been set (i.e. don't process more commands). - * - * The same applies for clients we want to terminate ASAP. */ - if (c->flags & (CLIENT_CLOSE_AFTER_REPLY|CLIENT_CLOSE_ASAP)) break; - + while(c->qb_pos < sdslen(c->querybuf)) { /* Determine request type when unknown. 
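The reploff change here is subtle. Previously the applied master offset could be derived from what was left in the query buffer (read_reploff minus unparsed bytes) because parsing and execution were in lockstep. Once commands are parsed ahead into vecqueuedcmd, that formula would credit the master for commands that are parsed but not yet executed, so each queued command snapshots its own offset at parse time and commandProcessed publishes it only after execution. A worked example in comment form:

    // Suppose read_reploff = 100 and two complete commands were parsed:
    //   cmd A ends at stream offset 60  -> A.reploff = 60
    //   cmd B ends at stream offset 100 -> B.reploff = 100
    // After executing only A, the old buffer-based formula reports 100
    // (the buffer is fully consumed), wrongly acknowledging B. Storing
    // the offset per command reports 60, then 100 once B actually runs.
    struct QueuedCmd { long long reploff; };
    long long appliedOffsetAfter(const QueuedCmd &executed) {
        return executed.reploff;   // what c->reploff_cmd feeds into c->reploff
    }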
*/ if (!c->reqtype) { if (c->querybuf[c->qb_pos] == '*') { @@ -2343,6 +2343,7 @@ void processInputBuffer(client *c, int callFlags) { } } + size_t cqueries = c->vecqueuedcmd.size(); if (c->reqtype == PROTO_REQ_INLINE) { if (processInlineBuffer(c) != C_OK) break; } else if (c->reqtype == PROTO_REQ_MULTIBULK) { @@ -2350,6 +2351,61 @@ void processInputBuffer(client *c, int callFlags) { } else { serverPanic("Unknown request type"); } + if (!c->vecqueuedcmd.empty() && (c->vecqueuedcmd.back().argc <= 0 || c->vecqueuedcmd.back().argv == nullptr)) { + c->vecqueuedcmd.pop_back(); + } else if (!c->vecqueuedcmd.empty()) { + if (c->flags & CLIENT_MASTER) c->vecqueuedcmd.back().reploff = c->read_reploff - sdslen(c->querybuf) + c->qb_pos; + serverAssert(c->vecqueuedcmd.back().reploff >= 0); + } + + /* Prefetch if we have a storage provider and we're not in the global lock */ + if (cqueries < c->vecqueuedcmd.size() && g_pserver->m_pstorageFactory != nullptr && !GlobalLocksAcquired()) { + auto &query = c->vecqueuedcmd.back(); + if (query.argc > 0 && query.argc == query.argcMax) { + c->db->prefetchKeysAsync(query); + } + } + c->reqtype = 0; + c->multibulklen = 0; + c->bulklen = -1; + c->reqtype = 0; + } + + /* Trim to pos */ + if (c->qb_pos) { + sdsrange(c->querybuf,c->qb_pos,-1); + c->qb_pos = 0; + } +} + +/* This function is called every time, in the client structure 'c', there is + * more query buffer to process, because we read more data from the socket + * or because a client was blocked and later reactivated, so there could be + * pending query buffer, already representing a full command, to process. */ +void processInputBuffer(client *c, bool fParse, int callFlags) { + AssertCorrectThread(c); + + if (fParse) + parseClientCommandBuffer(c); + + /* Keep processing while there is something in the input buffer */ + while (!c->vecqueuedcmd.empty()) { + /* Return if we're still parsing this command */ + auto &cmd = c->vecqueuedcmd.front(); + if (cmd.argc != cmd.argcMax) break; + + if (!FClientReady(c)) break; + + zfree(c->argv); + c->argc = cmd.argc; + c->argv = cmd.argv; + cmd.argv = nullptr; + c->argv_len_sumActive = cmd.argv_len_sum; + cmd.argv_len_sum = 0; + c->reploff_cmd = cmd.reploff; + serverAssert(c->argv != nullptr); + + c->vecqueuedcmd.erase(c->vecqueuedcmd.begin()); /* Multibulk processing could see a <= 0 length. */ if (c->argc == 0) { @@ -2364,12 +2420,6 @@ void processInputBuffer(client *c, int callFlags) { } } } - - /* Trim to pos */ - if (c->qb_pos) { - sdsrange(c->querybuf,c->qb_pos,-1); - c->qb_pos = 0; - } } void readQueryFromClient(connection *conn) { @@ -2450,6 +2500,8 @@ void readQueryFromClient(connection *conn) { return; } + parseClientCommandBuffer(c); + serverTL->vecclientsProcess.push_back(c); } @@ -2461,7 +2513,7 @@ void processClients() /* There is more data in the client input buffer, continue parsing it * in case to check if there is a full command to execute. */ std::unique_lock ul(c->lock); - processInputBuffer(c, CMD_CALL_FULL); + processInputBuffer(c, false /*fParse*/, CMD_CALL_FULL); } if (listLength(serverTL->clients_pending_asyncwrite)) @@ -2568,7 +2620,7 @@ sds catClientInfoString(sds s, client *client) { /* For efficiency (less work keeping track of the argv memory), it doesn't include the used memory * i.e. unused sds space and internal fragmentation, just the string length. but this is enough to * spot problematic clients. 
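The refactor around this hunk splits what used to be one loop into parseClientCommandBuffer, which may run before the global lock is taken (enabling the storage prefetch above), and processInputBuffer, which only consumes fully parsed commands. A minimal sketch of the split; ParsedCmd stands in for parsed_command:

    #include <deque>
    #include <string>
    #include <vector>

    struct ParsedCmd { std::vector<std::string> argv; bool complete = false; };

    struct Conn {
        std::string buf;               // raw query buffer
        std::deque<ParsedCmd> queued;  // stands in for vecqueuedcmd
    };

    void parseBuffer(Conn &c) {
        // Lock-free phase: tokenize c.buf, append ParsedCmd entries,
        // marking 'complete' only once all arguments have arrived.
    }

    template <typename Exec>
    void processQueued(Conn &c, Exec execute) {  // runs with the lock held
        while (!c.queued.empty() && c.queued.front().complete) {
            execute(c.queued.front());
            c.queued.pop_front();
        }
    }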
*/ - total_mem += client->argv_len_sum; + total_mem += client->argv_len_sum(); if (client->argv) total_mem += zmalloc_size(client->argv); @@ -2587,7 +2639,7 @@ sds catClientInfoString(sds s, client *client) { (client->flags & CLIENT_MULTI) ? client->mstate.count : -1, (unsigned long long) sdslen(client->querybuf), (unsigned long long) sdsavail(client->querybuf), - (unsigned long long) client->argv_len_sum, + (unsigned long long) client->argv_len_sum(), (unsigned long long) client->bufpos, (unsigned long long) listLength(client->reply), (unsigned long long) obufmem, /* should not include client->buf since we want to see 0 for static clients. */ @@ -3144,10 +3196,10 @@ void rewriteClientCommandVector(client *c, int argc, ...) { /* Replace argv and argc with our new versions. */ c->argv = argv; c->argc = argc; - c->argv_len_sum = 0; + c->argv_len_sumActive = 0; for (j = 0; j < c->argc; j++) if (c->argv[j]) - c->argv_len_sum += getStringObjectLen(c->argv[j]); + c->argv_len_sumActive += getStringObjectLen(c->argv[j]); c->cmd = lookupCommandOrOriginal((sds)ptrFromObj(c->argv[0])); serverAssertWithInfo(c,NULL,c->cmd != NULL); va_end(ap); @@ -3160,10 +3212,10 @@ void replaceClientCommandVector(client *c, int argc, robj **argv) { zfree(c->argv); c->argv = argv; c->argc = argc; - c->argv_len_sum = 0; + c->argv_len_sumActive = 0; for (j = 0; j < c->argc; j++) if (c->argv[j]) - c->argv_len_sum += getStringObjectLen(c->argv[j]); + c->argv_len_sumActive += getStringObjectLen(c->argv[j]); c->cmd = lookupCommandOrOriginal((sds)ptrFromObj(c->argv[0])); serverAssertWithInfo(c,NULL,c->cmd != NULL); } @@ -3188,8 +3240,8 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { c->argv[i] = NULL; } oldval = c->argv[i]; - if (oldval) c->argv_len_sum -= getStringObjectLen(oldval); - if (newval) c->argv_len_sum += getStringObjectLen(newval); + if (oldval) c->argv_len_sumActive -= getStringObjectLen(oldval); + if (newval) c->argv_len_sumActive += getStringObjectLen(newval); c->argv[i] = newval; incrRefCount(newval); if (oldval) decrRefCount(oldval); diff --git a/src/replication.cpp b/src/replication.cpp index 1f87c77a4..76c949e4b 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -3432,6 +3432,13 @@ void replicationCacheMaster(redisMaster *mi, client *c) { * offsets, including pending transactions, already populated arguments, * pending outputs to the master. 
*/ sdsclear(mi->master->querybuf); + if (!mi->master->vecqueuedcmd.empty()) { + // Clear out everything except for partially parsed commands (which we'll cache) + auto cmd = std::move(mi->master->vecqueuedcmd.front()); + mi->master->vecqueuedcmd.clear(); + if (cmd.argc != cmd.argcMax) + mi->master->vecqueuedcmd.emplace_back(std::move(cmd)); + } sdsclear(mi->master->pending_querybuf); mi->master->read_reploff = mi->master->reploff; if (c->flags & CLIENT_MULTI) discardTransaction(c); @@ -4307,10 +4314,12 @@ void replicaReplayCommand(client *c) cFake->authenticated = c->authenticated; cFake->puser = c->puser; cFake->querybuf = sdscatsds(cFake->querybuf,(sds)ptrFromObj(c->argv[2])); + cFake->read_reploff = sdslen(cFake->querybuf); + cFake->reploff = 0; selectDb(cFake, c->db->id); auto ccmdPrev = serverTL->commandsExecuted; cFake->flags |= CLIENT_MASTER | CLIENT_PREVENT_REPL_PROP; - processInputBuffer(cFake, (CMD_CALL_FULL & (~CMD_CALL_PROPAGATE))); + processInputBuffer(cFake, true /*fParse*/, (CMD_CALL_FULL & (~CMD_CALL_PROPAGATE))); cFake->flags &= ~(CLIENT_MASTER | CLIENT_PREVENT_REPL_PROP); bool fExec = ccmdPrev != serverTL->commandsExecuted; cFake->lock.unlock(); diff --git a/src/server.cpp b/src/server.cpp index e9e71b46e..eda89bc0e 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1749,7 +1749,7 @@ size_t ClientsPeakMemInput[CLIENTS_PEAK_MEM_USAGE_SLOTS]; size_t ClientsPeakMemOutput[CLIENTS_PEAK_MEM_USAGE_SLOTS]; int clientsCronTrackExpansiveClients(client *c) { - size_t in_usage = sdsZmallocSize(c->querybuf) + c->argv_len_sum; + size_t in_usage = sdsZmallocSize(c->querybuf) + c->argv_len_sum(); size_t out_usage = getClientOutputBufferMemoryUsage(c); int i = g_pserver->unixtime % CLIENTS_PEAK_MEM_USAGE_SLOTS; int zeroidx = (i+1) % CLIENTS_PEAK_MEM_USAGE_SLOTS; @@ -1786,7 +1786,7 @@ int clientsCronTrackClientsMemUsage(client *c) { mem += getClientOutputBufferMemoryUsage(c); mem += sdsZmallocSize(c->querybuf); mem += zmalloc_size(c); - mem += c->argv_len_sum; + mem += c->argv_len_sum(); if (c->argv) mem += zmalloc_size(c->argv); /* Now that we have the memory used by the client, remove the old * value from the old category, and add it back. */ diff --git a/src/server.h b/src/server.h index 3f06c9f8b..7d2611c37 100644 --- a/src/server.h +++ b/src/server.h @@ -1136,7 +1136,7 @@ public: bool removeCachedValue(const char *key); void removeAllCachedValues(); - void prefetchKeysAsync(class AeLocker &locker, client *c); + void prefetchKeysAsync(struct parsed_command &command); bool FSnapshot() const { return m_spdbSnapshotHOLDER != nullptr; } @@ -1443,7 +1443,53 @@ typedef struct { need more reserved IDs use UINT64_MAX-1, -2, ... and so forth. */ -typedef struct client { +struct parsed_command { + robj** argv = nullptr; + int argc = 0; + int argcMax; + long long reploff = 0; + size_t argv_len_sum = 0; /* Sum of lengths of objects in argv list. 
*/ + + parsed_command(int maxargs) { + argv = (robj**)zmalloc(sizeof(robj*)*maxargs); + argcMax = maxargs; + } + + parsed_command &operator=(parsed_command &&o) { + argv = o.argv; + argc = o.argc; + argcMax = o.argcMax; + reploff = o.reploff; + o.argv = nullptr; + o.argc = 0; + o.argcMax = 0; + o.reploff = 0; + return *this; + } + + parsed_command(parsed_command &o) = delete; + parsed_command(parsed_command &&o) { + argv = o.argv; + argc = o.argc; + argcMax = o.argcMax; + reploff = o.reploff; + o.argv = nullptr; + o.argc = 0; + o.argcMax = 0; + o.reploff = 0; + } + + ~parsed_command() { + if (argv != nullptr) { + for (int i = 0; i < argc; ++i) { + decrRefCount(argv[i]); + } + zfree(argv); + } + } +}; + +struct client { uint64_t id; /* Client incremental unique ID. */ connection *conn; int resp; /* RESP protocol version. Can be 2 or 3. */ @@ -1456,9 +1502,6 @@ typedef struct client { replication stream that we are receiving from the master. */ size_t querybuf_peak; /* Recent (100ms or more) peak of querybuf size. */ - int argc; /* Num of arguments of current command. */ - robj **argv; /* Arguments of current command. */ - size_t argv_len_sum; /* Sum of lengths of objects in argv list. */ struct redisCommand *cmd, *lastcmd; /* Last command executed. */ user *puser; /* User associated with this connection. If the user is set to NULL the connection can do @@ -1488,6 +1531,7 @@ typedef struct client { long long read_reploff; /* Read replication offset if this is a master. */ long long reploff; /* Applied replication offset if this is a master. */ long long reploff_skipped; /* Repl backlog we did not send to this client */ + long long reploff_cmd; /* The replication offset of the executing command, reploff gets set to this after the execution completes */ long long repl_ack_off; /* Replication ack offset, if this is a replica. */ long long repl_ack_time;/* Replication ack time, if this is a replica. 
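One detail of parsed_command worth calling out: the move operations null out the source's argv and argc before returning. The destructor decrRefCounts every argument it still owns, so a moved-from command must be left empty or the refcounts would be dropped twice, once by the queue entry and once by the executing client. An illustrative usage snippet, hypothetical and assuming the server.h context above:

    // Hypothetical illustration, not code from the patch:
    parsed_command a(2);
    a.argv[a.argc++] = createObject(OBJ_STRING, sdsnew("GET"));
    a.argv[a.argc++] = createObject(OBJ_STRING, sdsnew("key"));

    parsed_command b(std::move(a));   // b now owns both robj references
    // a.argv == nullptr, a.argc == 0: ~parsed_command on 'a' frees nothing,
    // so each robj is decrRefCount'd exactly once, by ~parsed_command on 'b'.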
*/ long long psync_initial_offset; /* FULLRESYNC reply offset other slaves @@ -1545,12 +1589,17 @@ typedef struct client { uint64_t mvccCheckpoint = 0; // the MVCC checkpoint of our last write int iel; /* the event loop index we're registered with */ - struct fastlock lock; + struct fastlock lock {"client"}; int master_error; + std::vector vecqueuedcmd; + int argc; + robj **argv; + size_t argv_len_sumActive = 0; // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); -} client; + size_t argv_len_sum() const; +}; struct saveparam { time_t seconds; @@ -2516,7 +2565,7 @@ void setDeferredMapLen(client *c, void *node, long length); void setDeferredSetLen(client *c, void *node, long length); void setDeferredAttributeLen(client *c, void *node, long length); void setDeferredPushLen(client *c, void *node, long length); -void processInputBuffer(client *c, int callFlags); +void processInputBuffer(client *c, bool fParse, int callFlags); void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask); void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask); void acceptTLSHandler(aeEventLoop *el, int fd, void *privdata, int mask); diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index 278495931..54d9dbbcd 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -1,7 +1,7 @@ start_server {tags {"introspection"}} { test {CLIENT LIST} { r client list - } {*addr=*:* fd=* age=* idle=* flags=N db=9 sub=0 psub=0 multi=-1 qbuf=26 qbuf-free=* argv-mem=* obl=0 oll=0 omem=0 tot-mem=* events=r cmd=client*} + } {*addr=*:* fd=* age=* idle=* flags=N db=9 sub=0 psub=0 multi=-1 qbuf=0 qbuf-free=* argv-mem=* obl=0 oll=0 omem=0 tot-mem=* events=r cmd=client*} test {MONITOR can log executed commands} { set rd [redis_deferring_client] From c43bb8e8b5d447a936d7e14bc835f579beb0aee6 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 12 Feb 2021 19:38:57 +0000 Subject: [PATCH 32/40] Fix deadlock in storage prefetch Former-commit-id: 9e6162de08248f7726b97b73aee2f23808ff4e7b --- src/db.cpp | 6 +++--- src/networking.cpp | 2 +- src/server.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index 14c292b6f..ee79536d6 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -2959,7 +2959,7 @@ int dbnumFromDb(redisDb *db) serverPanic("invalid database pointer"); } -void redisDbPersistentData::prefetchKeysAsync(parsed_command &command) +void redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command) { if (m_spstorage == nullptr) return; @@ -2967,7 +2967,7 @@ void redisDbPersistentData::prefetchKeysAsync(parsed_command &command) AeLocker lock; std::vector veckeys; - lock.arm(nullptr); + lock.arm(c); getKeysResult result = GETKEYS_RESULT_INIT; auto cmd = lookupCommand(szFromObj(command.argv[0])); int numkeys = getKeysFromCommand(cmd, command.argv, command.argc, &result); @@ -3000,7 +3000,7 @@ void redisDbPersistentData::prefetchKeysAsync(parsed_command &command) vecInserts.emplace_back(sharedKey, o, std::move(spexpire)); } - lock.arm(nullptr); + lock.arm(c); for (auto &tuple : vecInserts) { sds sharedKey = std::get<0>(tuple); diff --git a/src/networking.cpp b/src/networking.cpp index be50373c9..15aa6f43a 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -2362,7 +2362,7 @@ void parseClientCommandBuffer(client *c) { if (cqueries < c->vecqueuedcmd.size() && g_pserver->m_pstorageFactory != nullptr && !GlobalLocksAcquired()) { auto &query = 
From c43bb8e8b5d447a936d7e14bc835f579beb0aee6 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Fri, 12 Feb 2021 19:38:57 +0000
Subject: [PATCH 32/40] Fix deadlock in storage prefetch

Former-commit-id: 9e6162de08248f7726b97b73aee2f23808ff4e7b
---
 src/db.cpp         | 6 +++---
 src/networking.cpp | 2 +-
 src/server.h       | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/db.cpp b/src/db.cpp
index 14c292b6f..ee79536d6 100644
--- a/src/db.cpp
+++ b/src/db.cpp
@@ -2959,7 +2959,7 @@ int dbnumFromDb(redisDb *db)
     serverPanic("invalid database pointer");
 }
 
-void redisDbPersistentData::prefetchKeysAsync(parsed_command &command)
+void redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command)
 {
     if (m_spstorage == nullptr)
         return;
@@ -2967,7 +2967,7 @@ void redisDbPersistentData::prefetchKeysAsync(parsed_command &command)
 
     AeLocker lock;
     std::vector veckeys;
-    lock.arm(nullptr);
+    lock.arm(c);
     getKeysResult result = GETKEYS_RESULT_INIT;
     auto cmd = lookupCommand(szFromObj(command.argv[0]));
     int numkeys = getKeysFromCommand(cmd, command.argv, command.argc, &result);
@@ -3000,7 +3000,7 @@ void redisDbPersistentData::prefetchKeysAsync(parsed_command &command)
         vecInserts.emplace_back(sharedKey, o, std::move(spexpire));
     }
 
-    lock.arm(nullptr);
+    lock.arm(c);
     for (auto &tuple : vecInserts) {
         sds sharedKey = std::get<0>(tuple);
diff --git a/src/networking.cpp b/src/networking.cpp
index be50373c9..15aa6f43a 100644
--- a/src/networking.cpp
+++ b/src/networking.cpp
@@ -2362,7 +2362,7 @@ void parseClientCommandBuffer(client *c) {
     if (cqueries < c->vecqueuedcmd.size() && g_pserver->m_pstorageFactory != nullptr && !GlobalLocksAcquired()) {
         auto &query = c->vecqueuedcmd.back();
         if (query.argc > 0 && query.argc == query.argcMax) {
-            c->db->prefetchKeysAsync(query);
+            c->db->prefetchKeysAsync(c, query);
         }
     }
     c->reqtype = 0;
diff --git a/src/server.h b/src/server.h
index 7d2611c37..941770450 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1136,7 +1136,7 @@ public:
     bool removeCachedValue(const char *key);
     void removeAllCachedValues();
 
-    void prefetchKeysAsync(struct parsed_command &command);
+    void prefetchKeysAsync(client *c, struct parsed_command &command);
 
     bool FSnapshot() const { return m_spdbSnapshotHOLDER != nullptr; }
 
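The whole deadlock fix is in the lock.arm(nullptr) to lock.arm(c) change: the prefetch path runs while the calling thread already holds the client's fastlock, so taking the global AE lock without telling the locker about the held client lock inverts the acquisition order other threads use. A minimal model of the protocol such a locker can follow; the mutexes and names here are illustrative, not the KeyDB AeLocker implementation:

    #include <mutex>

    std::mutex global_lock;        // stands in for the AE/global lock

    struct Client {
        std::mutex lock;           // stands in for the per-client fastlock
    };

    // arm(nullptr) while holding c->lock would be inner-then-outer acquisition,
    // which deadlocks against any thread acquiring outer-then-inner. Passing
    // the client lets the locker restore a single global order: drop the inner
    // lock, take the outer lock, then retake the inner lock.
    void arm(Client *c) {
        if (c != nullptr) {
            c->lock.unlock();      // caller held this on entry
            global_lock.lock();
            c->lock.lock();        // order is now global -> client everywhere
        } else {
            global_lock.lock();
        }
    }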
From 3ac42ec80c3c3bb4b8b61e17c3fda338abb0ed40 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Tue, 16 Feb 2021 21:07:43 +0000
Subject: [PATCH 33/40] Disable async rehash with single threads; it causes slowdowns because the rehash never completes

Former-commit-id: 5d08dbdf76c0fd1e0cfcf86b97ef3e656f0e4f5d
---
 src/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/server.cpp b/src/server.cpp
index eda89bc0e..a006d4da0 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -1950,7 +1950,7 @@ void databasesCron(bool fMainThread) {
             ::dict *dict = g_pserver->db[rehash_db]->dictUnsafeKeyOnly();
             /* Are we async rehashing? And if so is it time to re-calibrate? */
             /* The recalibration limit is a prime number to ensure balancing across threads */
-            if (rehashes_per_ms > 0 && async_rehashes < 131 && !cserver.active_defrag_enabled) {
+            if (rehashes_per_ms > 0 && async_rehashes < 131 && !cserver.active_defrag_enabled && cserver.cthreads > 1) {
                 serverTL->rehashCtl = dictRehashAsyncStart(dict, rehashes_per_ms);
                 ++async_rehashes;
             }

From 87fc0184154eaa8a0bd8426f3fbddd33c71dea04 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Fri, 19 Feb 2021 05:17:41 +0000
Subject: [PATCH 34/40] Implement maxstorage config

Former-commit-id: 79e07859dfec14edf2a39646013e1a82db033d4f
---
 src/config.cpp | 1 +
 src/evict.cpp  | 5 +++++
 src/server.h   | 1 +
 3 files changed, 7 insertions(+)

diff --git a/src/config.cpp b/src/config.cpp
index a5efb2f10..ca9181788 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -2619,6 +2619,7 @@ standardConfig configs[] = {
 
     /* Unsigned Long Long configs */
     createULongLongConfig("maxmemory", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, g_pserver->maxmemory, 0, MEMORY_CONFIG, NULL, updateMaxmemory),
+    createULongLongConfig("maxstorage", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, g_pserver->maxstorage, 0, MEMORY_CONFIG, NULL, NULL),
 
     /* Size_t configs */
     createSizeTConfig("hash-max-ziplist-entries", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, g_pserver->hash_max_ziplist_entries, 512, INTEGER_CONFIG, NULL, NULL),
diff --git a/src/evict.cpp b/src/evict.cpp
index d4b0c1f12..31cadeae5 100644
--- a/src/evict.cpp
+++ b/src/evict.cpp
@@ -500,6 +500,9 @@ int freeMemoryIfNeeded(bool fQuickCycle, bool fPreSnapshot) {
     int ckeysFailed = 0;
     int keys_freed = 0;
 
+    if (g_pserver->maxstorage && g_pserver->m_pstorageFactory != nullptr && g_pserver->m_pstorageFactory->totalDiskspaceUsed() >= g_pserver->maxstorage)
+        goto cant_free_storage;
+
     /* When clients are paused the dataset should be static not just from the
      * POV of clients not being able to write, but also from the POV of
      * expires and evictions of keys not being performed. */
@@ -726,8 +729,10 @@ cant_free:
         latencyEndMonitor(lazyfree_latency);
         latencyAddSampleIfNeeded("eviction-lazyfree",lazyfree_latency);
     }
+
     latencyEndMonitor(latency);
     latencyAddSampleIfNeeded("eviction-cycle",latency);
+cant_free_storage:
     return result;
 }
 
diff --git a/src/server.h b/src/server.h
index 941770450..6fa6807c7 100644
--- a/src/server.h
+++ b/src/server.h
@@ -2230,6 +2230,7 @@ struct redisServer {
     /* Limits */
     unsigned int maxclients;        /* Max number of simultaneous clients */
     unsigned long long maxmemory;   /* Max number of memory bytes to use */
+    unsigned long long maxstorage;  /* Max number of bytes to use in a storage provider */
     int maxmemory_policy;           /* Policy for key eviction */
     int maxmemory_samples;          /* Precision of random sampling */
     int lfu_log_factor;             /* LFU logarithmic counter factor. */
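maxstorage mirrors maxmemory (0 means unlimited) but caps the disk footprint of a storage provider; the interesting part of the patch is the early-out in freeMemoryIfNeeded, which skips past both the eviction loop and its latency sampling once the provider reports the cap is hit. A compressed sketch of that control flow, with stand-in types for the server state:

    #include <cstdint>

    struct StorageFactory {
        uint64_t diskUsed = 0;
        uint64_t totalDiskspaceUsed() const { return diskUsed; }
    };

    struct Server {
        uint64_t maxstorage = 0;                // 0 = unlimited, like maxmemory
        StorageFactory *storageFactory = nullptr;
    };

    int freeMemoryIfNeeded(Server *srv) {
        int result = -1;                        // C_ERR until eviction succeeds

        // The new gate: a full storage provider means nothing below can help,
        // so jump past the eviction loop and the eviction latency bookkeeping,
        // which only applies to memory eviction (hence the dedicated label).
        if (srv->maxstorage && srv->storageFactory != nullptr &&
            srv->storageFactory->totalDiskspaceUsed() >= srv->maxstorage)
            goto cant_free_storage;

        // ... sample keys, evict, record "eviction-cycle" latency ...
        result = 0;                             // C_OK

    cant_free_storage:
        return result;
    }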
From ca13fda90fad7c68d2153b2ddc9d14e46f72f48c Mon Sep 17 00:00:00 2001
From: John Sully
Date: Mon, 22 Feb 2021 09:14:24 +0000
Subject: [PATCH 35/40] Fix issue where finding random keys is slow due to not shrinking the hash table.

Former-commit-id: fd05010cdcf9d6a6187ca2e18bc55adbaa680a02
---
 src/dict.cpp        | 19 +++++++++++++------
 src/dict.h          |  2 +-
 src/replication.cpp |  1 +
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/src/dict.cpp b/src/dict.cpp
index 819921dbb..0bf7e6c36 100644
--- a/src/dict.cpp
+++ b/src/dict.cpp
@@ -142,15 +142,15 @@ int dictResize(dict *d)
     minimal = d->ht[0].used;
     if (minimal < DICT_HT_INITIAL_SIZE)
         minimal = DICT_HT_INITIAL_SIZE;
-    return dictExpand(d, minimal);
+    return dictExpand(d, minimal, false /*fShrink*/);
 }
 
 /* Expand or create the hash table */
-int dictExpand(dict *d, unsigned long size)
+int dictExpand(dict *d, unsigned long size, bool fShrink)
 {
     /* the size is invalid if it is smaller than the number of
      * elements already inside the hash table */
-    if (dictIsRehashing(d) || d->ht[0].used > size)
+    if (dictIsRehashing(d) || (d->ht[0].used > size && !fShrink) || size == 0)
         return DICT_ERR;
 
     dictht n; /* the new hash table */
@@ -264,7 +264,7 @@ int dictMerge(dict *dst, dict *src)
         return DICT_OK;
     }
 
-    dictExpand(dst, dictSize(dst)+dictSize(src)); // start dst rehashing if necessary
+    dictExpand(dst, dictSize(dst)+dictSize(src), false /* fShrink */); // start dst rehashing if necessary
     auto &htDst = dictIsRehashing(dst) ? dst->ht[1] : dst->ht[0];
     for (int iht = 0; iht < 2; ++iht)
     {
@@ -683,6 +683,8 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) {
         }
         if (!dictIsRehashing(d)) break;
     }
+
+    _dictExpandIfNeeded(d);
     return NULL; /* not found */
 }
 
@@ -1269,7 +1271,7 @@ static int _dictExpandIfNeeded(dict *d)
     if (dictIsRehashing(d)) return DICT_OK;
 
     /* If the hash table is empty expand it to the initial size. */
-    if (d->ht[0].size == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE);
+    if (d->ht[0].size == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE, false /*fShrink*/);
 
     /* If we reached the 1:1 ratio, and we are allowed to resize the hash
      * table (global setting) or we should avoid it but the ratio between
      * elements/buckets is over the "safe" threshold, we resize doubling
      * the number of buckets. */
@@ -1279,7 +1281,12 @@ static int _dictExpandIfNeeded(dict *d)
         (dict_can_resize ||
          d->ht[0].used/d->ht[0].size > dict_force_resize_ratio))
     {
-        return dictExpand(d, d->ht[0].used*2);
+        return dictExpand(d, d->ht[0].used*2, false /*fShrink*/);
+    }
+    else if (d->ht[0].used > 0 && d->ht[0].used * 16 < d->ht[0].size && dict_can_resize)
+    {
+        // If the dictionary has shrunk a lot we'll need to shrink the hash table instead
+        return dictExpand(d, d->ht[0].used*2, true /*fShrink*/);
     }
     return DICT_OK;
 }
 
diff --git a/src/dict.h b/src/dict.h
index 0be8e1c53..54519996b 100644
--- a/src/dict.h
+++ b/src/dict.h
@@ -187,7 +187,7 @@ typedef void (dictScanBucketFunction)(void *privdata, dictEntry **bucketref);
 
 /* API */
 dict *dictCreate(dictType *type, void *privDataPtr);
-int dictExpand(dict *d, unsigned long size);
+int dictExpand(dict *d, unsigned long size, bool fShrink = false);
 int dictAdd(dict *d, void *key, void *val);
 dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing);
 dictEntry *dictAddOrFind(dict *d, void *key);
diff --git a/src/replication.cpp b/src/replication.cpp
index 76c949e4b..a6c13c044 100644
--- a/src/replication.cpp
+++ b/src/replication.cpp
@@ -1902,6 +1902,7 @@ void readSyncBulkPayload(connection *conn) {
     int use_diskless_load = useDisklessLoad();
    const dbBackup *diskless_load_backup = NULL;
     rdbSaveInfo rsi = RDB_SAVE_INFO_INIT;
+    rsi.fForceSetKey = !!g_pserver->fActiveReplica;
     int empty_db_flags = g_pserver->repl_slave_lazy_flush ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS;
     off_t left;
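Commands like RANDOMKEY sample random buckets, so a table that once held millions of keys but now holds a few thousand forces the sampler through long runs of empty slots; the fix adds a shrink arm to the resize policy and re-checks it on the delete path. Distilled to its decision logic, as a model of the policy rather than the dict.cpp source:

    #include <cstddef>

    enum class Resize { None, Grow, Shrink };

    // Grow at a load factor of 1 (or force-grow past the force-resize ratio);
    // shrink once the table is less than 1/16th full. Both arms size the new
    // table from used*2, landing the dictionary at roughly 50% load.
    Resize nextResize(size_t used, size_t size, bool can_resize, size_t force_ratio = 5) {
        if (size == 0)
            return Resize::Grow;                 // empty: create at initial size
        if (used >= size && (can_resize || used / size > force_ratio))
            return Resize::Grow;
        if (used > 0 && used * 16 < size && can_resize)
            return Resize::Shrink;
        return Resize::None;
    }

Calling this from dictGenericDelete is what makes shrinking actually happen: deletes are the only operation that moves a table toward the 1/16 threshold, so the insert-driven check alone would never fire.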
From 53b7b83af6c0a2de0000a8b526b9a1aa240ceb9e Mon Sep 17 00:00:00 2001
From: christian
Date: Fri, 26 Feb 2021 00:50:22 +0000
Subject: [PATCH 36/40] Offload updating cached time to dedicated thread

Former-commit-id: 9bfc8a43952481b5b54a7b051d44b8bece4a18dd
---
 src/blocked.cpp |  1 -
 src/rdb.cpp     |  4 ----
 src/server.cpp  | 27 +++++++++++++++------------
 src/server.h    |  3 ++-
 4 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/src/blocked.cpp b/src/blocked.cpp
index 085090b50..a37dcca58 100644
--- a/src/blocked.cpp
+++ b/src/blocked.cpp
@@ -525,7 +525,6 @@ void handleClientsBlockedOnKeys(void) {
              * lookup, invalidating the first one.
              * See https://github.com/antirez/redis/pull/6554. */
             serverTL->fixed_time_expire++;
-            updateCachedTime(0);
 
             /* Serve clients blocked on the key. */
             robj *o = lookupKeyWrite(rl->db,rl->key);
diff --git a/src/rdb.cpp b/src/rdb.cpp
index 3dad6e989..aefc730c0 100644
--- a/src/rdb.cpp
+++ b/src/rdb.cpp
@@ -2341,10 +2341,6 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) {
         (g_pserver->loading_process_events_interval_keys &&
          (r->keys_since_last_callback >= g_pserver->loading_process_events_interval_keys)))
     {
-        /* The DB can take some non trivial amount of time to load. Update
-         * our cached time since it is used to create and update the last
-         * interaction time with clients and for other important things. */
-        updateCachedTime(0);
         listIter li;
         listNode *ln;
         listRewind(g_pserver->masters, &li);
diff --git a/src/server.cpp b/src/server.cpp
index a006d4da0..f0510c9b6 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -2002,7 +2002,7 @@ void databasesCron(bool fMainThread) {
  * info or not using the 'update_daylight_info' argument. Normally we update
  * such info only when calling this function from serverCron() but not when
  * calling it from call(). */
-void updateCachedTime(int update_daylight_info) {
+void updateCachedTime() {
     long long t = ustime();
     __atomic_store(&g_pserver->ustime, &t, __ATOMIC_RELAXED);
     t /= 1000;
@@ -2015,12 +2015,10 @@ void updateCachedTime(int update_daylight_info) {
      * context is safe since we will never fork() while here, in the main
      * thread. The logging function will call a thread safe version of
      * localtime that has no locks. */
-    if (update_daylight_info) {
-        struct tm tm;
-        time_t ut = g_pserver->unixtime;
-        localtime_r(&ut,&tm);
-        __atomic_store(&g_pserver->daylight_active, &tm.tm_isdst, __ATOMIC_RELAXED);
-    }
+    struct tm tm;
+    time_t ut = g_pserver->unixtime;
+    localtime_r(&ut,&tm);
+    __atomic_store(&g_pserver->daylight_active, &tm.tm_isdst, __ATOMIC_RELAXED);
 }
 
 void checkChildrenDone(void) {
@@ -2172,9 +2170,6 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
      * handler if we don't return here fast enough. */
    if (g_pserver->watchdog_period) watchdogScheduleSignal(g_pserver->watchdog_period);
 
-    /* Update the time cache. */
-    updateCachedTime(1);
-
     /* Unpause clients if enough time has elapsed */
     unpauseClientsIfNecessary();
@@ -2812,7 +2807,7 @@ void initMasterInfo(redisMaster *master)
 void initServerConfig(void) {
     int j;
 
-    updateCachedTime(true);
+    updateCachedTime();
     getRandomHexChars(g_pserver->runid,CONFIG_RUN_ID_SIZE);
     g_pserver->runid[CONFIG_RUN_ID_SIZE] = '\0';
     changeReplicationId();
@@ -3915,7 +3910,6 @@ void call(client *c, int flags) {
 
     /* Call the command. */
     dirty = g_pserver->dirty;
-    updateCachedTime(0);
     incrementMvccTstamp();
     start = g_pserver->ustime;
     try {
@@ -6072,6 +6066,13 @@ void OnTerminate()
     serverPanic("std::teminate() called");
 }
 
+void *timeThreadMain(void*) {
+    while (true) {
+        updateCachedTime();
+        usleep(1);
+    }
+}
+
 void *workerThreadMain(void *parg)
 {
     int iel = (int)((int64_t)parg);
@@ -6419,6 +6420,8 @@ int main(int argc, char **argv) {
     setOOMScoreAdj(-1);
 
     serverAssert(cserver.cthreads > 0 && cserver.cthreads <= MAX_EVENT_LOOPS);
+    pthread_create(&cserver.time_thread_id, nullptr, timeThreadMain, nullptr);
+
     pthread_attr_t tattr;
     pthread_attr_init(&tattr);
     pthread_attr_setstacksize(&tattr, 1 << 23); // 8 MB
diff --git a/src/server.h b/src/server.h
index 6fa6807c7..1ecf838ea 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1920,6 +1920,7 @@ struct redisServerConst {
     pid_t pid;                  /* Main process pid. */
     time_t stat_starttime;      /* Server start time */
     pthread_t main_thread_id;   /* Main thread id */
+    pthread_t time_thread_id;
     char *configfile;           /* Absolute config file path, or NULL */
     char *executable;           /* Absolute executable file path. */
     char **exec_argv;           /* Executable argv vector (copy). */
@@ -2965,7 +2966,7 @@ void populateCommandTable(void);
 void resetCommandTableStats(void);
 void adjustOpenFilesLimit(void);
 void closeListeningSockets(int unlink_unix_socket);
-void updateCachedTime(int update_daylight_info);
+void updateCachedTime();
 void resetServerStats(void);
 void activeDefragCycle(void);
 unsigned int getLRUClock(void);
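With a dedicated thread refreshing the clock, every consumer of the time becomes a plain atomic read, and serverCron()/call() no longer pay for gettimeofday() and localtime_r(). A self-contained sketch of the pattern; the names are illustrative:

    #include <atomic>
    #include <chrono>
    #include <thread>

    std::atomic<long long> cached_ustime{0};   // microseconds, like g_pserver->ustime

    void timeThreadMain() {
        for (;;) {
            auto now = std::chrono::duration_cast<std::chrono::microseconds>(
                std::chrono::system_clock::now().time_since_epoch()).count();
            cached_ustime.store(now, std::memory_order_relaxed);
            std::this_thread::sleep_for(std::chrono::microseconds(1)); // ~usleep(1)
        }
    }

    // Hot paths read the cache instead of making a syscall.
    long long ustimeCached() {
        return cached_ustime.load(std::memory_order_relaxed);
    }

    int main() {
        std::thread(timeThreadMain).detach();  // the dedicated clock thread
        long long t = ustimeCached();          // cheap, and at most ~1us stale
        (void)t;
    }

The trade is explicit: one thread spins at usleep(1) granularity, and readers accept a timestamp up to a microsecond or so stale, which is ample for interaction timestamps and command durations.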
From 4f632962c17740f9aebe9e853c1c7b369640eb3a Mon Sep 17 00:00:00 2001
From: John Sully
Date: Fri, 26 Feb 2021 01:03:10 +0000
Subject: [PATCH 37/40] eliminate syscall in call()

Former-commit-id: 3ee111a2e50bc29818ba85ae4fb786171d695c37
---
 src/server.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/server.cpp b/src/server.cpp
index f0510c9b6..4dcca26bb 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -3911,6 +3911,7 @@ void call(client *c, int flags) {
     /* Call the command. */
     dirty = g_pserver->dirty;
     incrementMvccTstamp();
+    __atomic_load(&g_pserver->ustime, &start, __ATOMIC_SEQ_CST);
     start = g_pserver->ustime;
     try {
         c->cmd->proc(c);
@@ -3922,7 +3923,9 @@ void call(client *c, int flags) {
         addReplyError(c, sz);
     }
     serverTL->commandsExecuted++;
-    duration = ustime()-start;
+    ustime_t end;
+    __atomic_load(&g_pserver->ustime, &end, __ATOMIC_SEQ_CST);
+    duration = end-start;
     dirty = g_pserver->dirty-dirty;
     if (dirty < 0) dirty = 0;
 

From 6ee9d69763f337e402b5c5dd3d3e985e4a53f002 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Fri, 26 Feb 2021 01:28:05 +0000
Subject: [PATCH 38/40] Eliminate the need for an mfence by tricking the CPU into ordering the futex read

Former-commit-id: 340e6f5bc94cd1c3b0c6fb6da833e8504acaf23a
---
 src/fastlock_x64.asm | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/fastlock_x64.asm b/src/fastlock_x64.asm
index 9e17f8995..3fecfa439 100644
--- a/src/fastlock_x64.asm
+++ b/src/fastlock_x64.asm
@@ -190,10 +190,11 @@ fastlock_unlock:
     mov dword ptr [rdi], -1       # pidOwner = -1 (we don't own it anymore)
     mov esi, [rdi+64]             # get current active (this one)
     inc esi                       # bump it to the next thread
+    sfence                        # ensure whatever was written in the lock is visible
     mov word ptr [rdi+64], si     # give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
-    mfence                        # sync other threads
     # At this point the lock is removed, however we must wake up any pending futexs
-    mov edx, [rdi+64+4]           # load the futex mask
+    mov rdx, [rdi+64]             # load the futex mask, note we intentionally also read the ticket we just wrote to ensure this is ordered with the above mov
+    shr rdx, 32                   # isolate the mask
     bt edx, esi                   # is the next thread waiting on a futex?
     jc unlock_futex               # unlock the futex if necessary
     ret                           # if not we're done.
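The asm change above leans on an x86 guarantee: a load is never reordered ahead of an earlier store to an overlapping address. Because the active ticket and the futex wait mask share one 8-byte word, the 64-bit mask load necessarily completes after the 16-bit ticket store, so the full mfence can go. A C++ model of the idea; the layout and names are illustrative, and the authoritative version is the assembly in the patch (ticket wrap-around is ignored here):

    #include <atomic>
    #include <cstdint>

    // Bits 0-15: active ticket. Bits 32-63: mask of tickets with sleeping waiters.
    std::atomic<uint64_t> ticket_and_mask{0};

    // Release the lock by bumping the ticket (the asm instead does a plain
    // 16-bit store, relying on the spinlock to serialize ticket writers), then
    // read the whole word again. Because that load overlaps the location just
    // written, it cannot be satisfied from a value predating our release, so
    // it sees any waiter bit set by a thread that observed the old ticket.
    bool unlockNeedsFutexWake() {
        uint64_t prev = ticket_and_mask.fetch_add(1, std::memory_order_release);
        uint16_t next = uint16_t(prev) + 1;
        uint32_t mask = uint32_t(ticket_and_mask.load(std::memory_order_relaxed) >> 32);
        return (mask >> (next & 31)) & 1u;     // mirrors the asm's `bt edx, esi`
    }

The C++ memory model gives this same-word coherence for free; the point of the asm trick is cost, since the overlapping load is at worst a store-forwarding stall where mfence was a full pipeline drain.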
From bbb310f419cd4afd79d7c7906762b3c9653d0377 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Fri, 26 Feb 2021 05:40:56 +0000
Subject: [PATCH 39/40] Eliminate needless lock

Former-commit-id: 60f972d463f202edb33ff9a25bc2bd3e2558105c
---
 src/db.cpp     | 4 ++--
 src/server.cpp | 2 --
 src/server.h   | 4 ++--
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/db.cpp b/src/db.cpp
index ee79536d6..20e8ac014 100644
--- a/src/db.cpp
+++ b/src/db.cpp
@@ -2834,9 +2834,9 @@ bool redisDbPersistentData::removeCachedValue(const char *key)
 
 void redisDbPersistentData::trackChanges(bool fBulk)
 {
-    m_fTrackingChanges++;
+    m_fTrackingChanges.fetch_add(1, std::memory_order_relaxed);
     if (fBulk)
-        m_fAllChanged++;
+        m_fAllChanged.fetch_add(1, std::memory_order_acq_rel);
 }
 
 void redisDbPersistentData::removeAllCachedValues()
diff --git a/src/server.cpp b/src/server.cpp
index 4dcca26bb..1bc9ab175 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -2650,10 +2650,8 @@ void afterSleep(struct aeEventLoop *eventLoop) {
             serverAssert(serverTL->gcEpoch.isReset());
             serverTL->gcEpoch = g_pserver->garbageCollector.startEpoch();
 
-            aeAcquireLock();
             for (int idb = 0; idb < cserver.dbnum; ++idb)
                 g_pserver->db[idb]->trackChanges(false);
-            aeReleaseLock();
         }
 }
 
diff --git a/src/server.h b/src/server.h
index 1ecf838ea..8e29ac196 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1172,8 +1172,8 @@ private:
     // Keyspace
     dict *m_pdict = nullptr;            /* The keyspace for this DB */
     dict *m_pdictTombstone = nullptr;   /* Track deletes when we have a snapshot */
-    int m_fTrackingChanges = 0;     // Note: Stack based
-    int m_fAllChanged = 0;
+    std::atomic<int> m_fTrackingChanges {0};    // Note: Stack based
+    std::atomic<int> m_fAllChanged {0};
     std::set m_setchanged;
     size_t m_cnewKeysPending = 0;
     std::shared_ptr m_spstorage = nullptr;

From 877ad1e4b25fdca623c53ce80d129ecea4a41967 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Fri, 26 Feb 2021 06:06:58 +0000
Subject: [PATCH 40/40] Don't complain about unclean shutdowns with an empty database

Former-commit-id: 99f5c02e87062552eaa2f26e960eb7c9dd977c84
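With the counters made atomic, afterSleep() can tick trackChanges() on every database without first serializing on the global AE lock, which is the entire content of patch 39. The shape of the change in isolation, with the field types assumed per the diff; relaxed ordering suffices for the plain counter, while the acq_rel on the bulk flag mirrors the patch, which wants that flag to order against the snapshot machinery that consumes it:

    #include <atomic>

    struct PersistentData {
        std::atomic<int> trackingChanges{0};   // was: int, guarded by the AE lock
        std::atomic<int> allChanged{0};

        void trackChanges(bool fBulk) {
            trackingChanges.fetch_add(1, std::memory_order_relaxed);
            if (fBulk)
                allChanged.fetch_add(1, std::memory_order_acq_rel);
        }
    };

    // afterSleep() can now do this from any worker thread, lock-free:
    //     for (int idb = 0; idb < dbnum; ++idb)
    //         db[idb]->trackChanges(false);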