From 05fe41b33a3b1ebc25a5c17748a5c00a3e0a2984 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Mon, 26 Apr 2021 22:13:32 +0000 Subject: [PATCH 01/22] Primitive implementation of bypassing client buffer, stats are all messed up and print statements everywhere Former-commit-id: 8ae310fb0f7b53add826f76891da333b63860001 --- src/networking.cpp | 159 +++++++++++++++++++++++++++++++++++++++----- src/replication.cpp | 156 +++++++++++++++++++++++++++++++++++++++++++ src/server.h | 11 +++ 3 files changed, 308 insertions(+), 18 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 574a8bc6c..18ab382bd 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -224,6 +224,7 @@ void clientInstallWriteHandler(client *c) { (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) { + // serverLog(LL_NOTICE, "we installing boyz"); AssertCorrectThread(c); serverAssert(c->lock.fOwnLock()); /* Here instead of installing the write handler, we just flag the @@ -301,7 +302,7 @@ int prepareClientToWrite(client *c) { /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ - if (!fAsync && !clientHasPendingReplies(c)) clientInstallWriteHandler(c); + if (!fAsync && !clientHasPendingReplies(c) && c->repl_curr_idx == -1) clientInstallWriteHandler(c); if (fAsync && !(c->fPendingAsyncWrite)) clientInstallAsyncWriteHandler(c); /* Authorize the caller to queue in the output buffer of this client. */ @@ -1676,15 +1677,33 @@ int writeToClient(client *c, int handler_installed) { std::unique_locklock)> lock(c->lock); + /* if this is a write to a replica, it's coming straight from the replication backlog */ + long long repl_backlog_idx = g_pserver->repl_backlog_idx; + + bool wroteFromClientBuffer = false; /* True if you wrote from the client buffer in this function call */ + while(clientHasPendingReplies(c)) { + wroteFromClientBuffer = true; + if (c->flags & CLIENT_SLAVE && listLength(c->reply) % 10 == 0){ + + serverLog(LL_NOTICE, "-----------------------------------------"); + serverLog(LL_NOTICE, "replica w/ pending replies, with a reply list size of: %lu", listLength(c->reply)); + serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + serverLog(LL_NOTICE, "-----------------------------------------"); + + } if (c->bufpos > 0) { + // serverLog(LL_NOTICE, "Sending reply %d", x); + // serverLog(LL_NOTICE, "SUSSUS AMOGUS, %ld", c->bufpos); nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); if (nwritten <= 0) break; c->sentlen += nwritten; totwritten += nwritten; /* If the buffer was sent, set bufpos to zero to continue with - * the remainder of the reply. */ + * the remainder of the reply. 
*/ + // serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); if ((int)c->sentlen == c->bufpos) { c->bufpos = 0; c->sentlen = 0; @@ -1714,23 +1733,112 @@ int writeToClient(client *c, int handler_installed) { } } /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT - * bytes, in a single threaded server it's a good idea to serve - * other clients as well, even if a very large request comes from - * super fast link that is always able to accept data (in real world - * scenario think about 'KEYS *' against the loopback interface). - * - * However if we are over the maxmemory limit we ignore that and - * just deliver as much data as it is possible to deliver. - * - * Moreover, we also send as much as possible if the client is - * a replica or a monitor (otherwise, on high-speed traffic, the - * replication/output buffer will grow indefinitely) */ + * bytes, in a single threaded server it's a good idea to serve + * other clients as well, even if a very large request comes from + * super fast link that is always able to accept data (in real world + * scenario think about 'KEYS *' against the loopback interface). + * + * However if we are over the maxmemory limit we ignore that and + * just deliver as much data as it is possible to deliver. + * + * Moreover, we also send as much as possible if the client is + * a replica or a monitor (otherwise, on high-speed traffic, the + * replication/output buffer will grow indefinitely) */ if (totwritten > NET_MAX_WRITES_PER_EVENT && (g_pserver->maxmemory == 0 || - zmalloc_used_memory() < g_pserver->maxmemory) && + zmalloc_used_memory() < g_pserver->maxmemory) && !(c->flags & CLIENT_SLAVE)) break; } - + + /* If there are no more pending replies, then we have transmitted the RDB. + * This means further replication commands will be taken straight from the + * replication backlog from now on. */ + if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c)){ + if (!c->transmittedRDB) + serverLog(LL_NOTICE, "---------->>>>>>>> TRANSMISSION OF THE RDB HAS COMPLETED <<<<<<<<----------"); + c->transmittedRDB = true; + } + + /* For replicas, we don't store all the information in the client buffer + * Most of the time (aside from immediately after synchronizing), we read + * from the replication backlog directly */ + if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1 && c->transmittedRDB){ + /* copy global variables into local scope so if they change in between we don't care */ + long long repl_backlog_size = g_pserver->repl_backlog_size; + long long nwrittenPart2 = 0; + + ssize_t nrequested; /* The number of bytes requested to write */ + /* normal case with no wrap around */ + if (repl_backlog_idx >= c->repl_curr_idx){ + nrequested = repl_backlog_idx - c->repl_curr_idx; + nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_idx - c->repl_curr_idx); + /* wrap around case, v. rare */ + /* also v. 
buggy so there's that */ + } else { + serverLog(LL_NOTICE, "WRAP CASE"); + serverLog(LL_NOTICE, "-----------------------------------------"); + serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + serverLog(LL_NOTICE, "actually written: %ld", nwritten); + serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); + serverLog(LL_NOTICE, "nwritten: %ld", nwritten); + serverLog(LL_NOTICE, "-----------------------------------------"); + + nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; + nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); + /* only attempt wrapping if we write the correct number of bytes */ + if (nwritten == repl_backlog_size - c->repl_curr_idx){ + serverLog(LL_NOTICE, "SECOND STAGE"); + serverLog(LL_NOTICE, "-----------------------------------------"); + serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + serverLog(LL_NOTICE, "actually written: %ld", nwritten); + serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); + serverLog(LL_NOTICE, "-----------------------------------------"); + + long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); + if (nwrittenPart2 != -1) + nwritten += nwrittenPart2; + + serverLog(LL_NOTICE, "nwrittenPart2: %lld", nwrittenPart2); + serverLog(LL_NOTICE, "-----------------------------------------"); + } else { + serverLog(LL_NOTICE, "SUPER SHORT"); + } + + } + + /* only update the replica's current index if bytes were sent */ + + // if (nrequested != nwritten){ + serverLog(LL_NOTICE, "-----------------------------------------"); + serverLog(LL_NOTICE, "AFTER THE FACT"); + serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + serverLog(LL_NOTICE, "actually written: %ld", nwritten); + serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + serverLog(LL_NOTICE, "-----------------------------------------"); + // } + + + if (nwritten == nrequested){ + c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ + } + else if (nwritten > 0) + c->repl_curr_idx = (c->repl_curr_idx + nwritten) % repl_backlog_size; + + serverAssert(c->repl_curr_idx < repl_backlog_size); + + /* only increment bytes if an error didn't occur */ + if (nwritten > 0){ + totwritten += nwritten; + c->repl_curr_off += nwritten; + } + + /* If the second part of a write didn't go through, we still need to register that */ + if (nwrittenPart2 == -1) nwritten = -1; + } + g_pserver->stat_net_output_bytes += totwritten; if (nwritten == -1) { if (connGetState(c->conn) == CONN_STATE_CONNECTED) { @@ -1750,7 +1858,7 @@ int writeToClient(client *c, int handler_installed) { * We just rely on data / pings received for timeout detection. 
*/ if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } - if (!clientHasPendingReplies(c)) { + if (!clientHasPendingReplies(c) && c->repl_curr_idx == -1) { c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -1904,6 +2012,12 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* Don't write to clients that are going to be closed anyway. */ if (c->flags & CLIENT_CLOSE_ASAP) continue; + // if (c->flags & CLIENT_SLAVE){ + // if(clientHasPendingReplies(c)) + // serverLog(LL_NOTICE, "somehow the client buffer has these values: %s", c->buf); + // serverLog(LL_NOTICE, "LOL"); + // } + /* Try to write buffers to the client socket. */ if (writeToClient(c,0) == C_ERR) { @@ -1920,7 +2034,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. */ - if (clientHasPendingReplies(c)) { + if (clientHasPendingReplies(c) || c->repl_curr_idx != -1) { if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) freeClientAsync(c); } @@ -3268,6 +3382,13 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { } } +/* In the case of a replica client, it is possible (and very likely) + * that writes to said replica are using data from the replication backlog + * as opposed to it's own internal buffer, this number should keep track of that */ +unsigned long getClientReplicationBacklogSharedUsage(client *c) { + return (c->repl_curr_idx == -1 && c->flags & CLIENT_SLAVE) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; +} + /* This function returns the number of bytes that Redis is * using to store the reply still not read by the client. * @@ -3276,9 +3397,11 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { * enforcing the client output length limits. */ unsigned long getClientOutputBufferMemoryUsage(client *c) { unsigned long list_item_size = sizeof(listNode) + sizeof(clientReplyBlock); - return c->reply_bytes + (list_item_size*listLength(c->reply)) + (c->replyAsync ? c->replyAsync->size : 0); + return c->reply_bytes + (list_item_size*listLength(c->reply)) + (c->replyAsync ? c->replyAsync->size : 0) + getClientReplicationBacklogSharedUsage(c); } + + /* Get the class of a client, used in order to enforce limits to different * classes of clients. * diff --git a/src/replication.cpp b/src/replication.cpp index 2533bae52..ccb538a69 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -47,6 +47,9 @@ #include #include +#define BYPASS_BUFFER +// #define BYPASS_PSYNC + void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, connection *conn); void replicationSendAck(redisMaster *mi); @@ -59,6 +62,18 @@ static void propagateMasterStaleKeys(); * the instance is configured to have no persistence. */ int RDBGeneratedByReplication = 0; +void resizeReplicationBacklogForClients(long long newsize); + +void setReplIdx(client *c, long long idx, long long off){ + if (prepareClientToWrite(c) != C_OK) return; + // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); + // serverLog(LL_NOTICE, "What is this value? 
%lld", c->repl_curr_idx); + if (c->repl_curr_idx == -1){ + c->repl_curr_idx = idx; + c->repl_curr_off = off; + } +} + /* --------------------------- Utility functions ---------------------------- */ /* Return the pointer to a string representing the replica ip:listening_port @@ -213,6 +228,8 @@ void resizeReplicationBacklog(long long newsize) { newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; if (g_pserver->repl_backlog_size == newsize) return; + serverLog(LL_NOTICE, "WE HAD TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); + if (g_pserver->repl_backlog != NULL) { /* What we actually do is to flush the old buffer and realloc a new * empty one. It will refill with new data incrementally. @@ -253,6 +270,80 @@ void resizeReplicationBacklog(long long newsize) { g_pserver->repl_backlog_size = newsize; } + +/* The above but for when clients need extra replication backlog because ??? */ +void resizeReplicationBacklogForClients(long long newsize) { + if (newsize < CONFIG_REPL_BACKLOG_MIN_SIZE) + newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; + if (g_pserver->repl_backlog_size == newsize) return; + + serverLog(LL_NOTICE, "WE HAD TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); + /* get the critical client size, i.e. the size of the data unflushed to clients */ + long long earliest_off = LONG_LONG_MAX; + long long earliest_idx = -1; + listIter li; + listNode *ln; + listRewind(g_pserver->slaves, &li); + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + if (replica->repl_curr_off != -1 && replica->repl_curr_off < earliest_off){ + earliest_off = replica->repl_curr_off; + earliest_idx = replica->repl_curr_idx; + } + } + + if (g_pserver->repl_backlog != NULL) { + /* What we actually do is to flush the old buffer and realloc a new + * empty one. It will refill with new data incrementally. + * The reason is that copying a few gigabytes adds latency and even + * worse often we need to alloc additional space before freeing the + * old buffer. 
*/ + + if (earliest_idx >= 0) { + // We need to keep critical data so we can't shrink less than the hot data in the buffer + newsize = std::max(newsize, g_pserver->master_repl_offset - earliest_off); + char *backlog = (char*)zmalloc(newsize); + g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - earliest_off; + + if (g_pserver->repl_backlog_idx >= earliest_idx) { + auto cbActiveBacklog = g_pserver->repl_backlog_idx - earliest_idx; + memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbActiveBacklog); + serverLog(LL_NOTICE, "g_pserver->master_repl_offset: %lld, earliest_off: %lld, g_pserver->repl_backlog_idx: %lld, earliest_idx: %lld", + g_pserver->master_repl_offset, earliest_off, g_pserver->repl_backlog_idx, earliest_idx); + serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); + } else { + auto cbPhase1 = g_pserver->repl_backlog_size - earliest_idx; + memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbPhase1); + memcpy(backlog + cbPhase1, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); + auto cbActiveBacklog = cbPhase1 + g_pserver->repl_backlog_idx; + serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); + } + zfree(g_pserver->repl_backlog); + g_pserver->repl_backlog = backlog; + g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; + listRewind(g_pserver->slaves, &li); + /* Go through the clients and update their replication indicies */ + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + if (replica->repl_curr_idx != -1){ + replica->repl_curr_idx -= earliest_idx; + if (replica->repl_curr_idx < 0) + replica->repl_curr_idx += g_pserver->repl_backlog_size; + } + } + g_pserver->repl_batch_idxStart = 0; + } else { + zfree(g_pserver->repl_backlog); + g_pserver->repl_backlog = (char*)zmalloc(newsize); + g_pserver->repl_backlog_histlen = 0; + g_pserver->repl_backlog_idx = 0; + /* Next byte we have is... the next since the buffer is empty. */ + g_pserver->repl_backlog_off = g_pserver->master_repl_offset+1; + } + } + g_pserver->repl_backlog_size = newsize; +} + void freeReplicationBacklog(void) { serverAssert(GlobalLocksAcquired()); listIter li; @@ -683,6 +774,10 @@ long long addReplyReplicationBacklog(client *c, long long offset) { * split the reply in two parts if we are cross-boundary. */ len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); + serverLog(LL_NOTICE, "Coming through from the addReplicationBacklog"); +#ifdef BYPASS_PSYNC + setReplIdx(c, j, offset); +#else while(len) { long long thislen = ((g_pserver->repl_backlog_size - j) < len) ? @@ -693,6 +788,8 @@ long long addReplyReplicationBacklog(client *c, long long offset) { len -= thislen; j = 0; } +#endif + serverLog(LL_NOTICE, "rdb transmitted? %d, pending replies? %d", c->transmittedRDB, clientHasPendingReplies(c)); return g_pserver->repl_backlog_histlen - skip; } @@ -731,6 +828,8 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) { * a SELECT statement in the replication stream. */ g_pserver->replicaseldb = -1; + serverLog(LL_NOTICE, "We are setting up here lad"); + /* Don't send this reply to slaves that approached us with * the old SYNC command. 
*/ if (!(replica->flags & CLIENT_PRE_PSYNC)) { @@ -989,6 +1088,7 @@ void syncCommand(client *c) { if (!strcasecmp((const char*)ptrFromObj(c->argv[0]),"psync")) { if (masterTryPartialResynchronization(c) == C_OK) { g_pserver->stat_sync_partial_ok++; + // c->repl_curr_idx = g_pserver->repl_backlog_idx; return; /* No full resync needed, return. */ } else { char *master_replid = (char*)ptrFromObj(c->argv[1]); @@ -1016,6 +1116,7 @@ void syncCommand(client *c) { connDisableTcpNoDelay(c->conn); /* Non critical if it fails. */ c->repldbfd = -1; c->flags |= CLIENT_SLAVE; + // c->repl_curr_idx = g_pserver->repl_backlog_idx; listAddNodeTail(g_pserver->slaves,c); /* Create the replication backlog if needed. */ @@ -1035,6 +1136,7 @@ void syncCommand(client *c) { if (g_pserver->FRdbSaveInProgress() && g_pserver->rdb_child_type == RDB_CHILD_TYPE_DISK) { + serverLog(LL_NOTICE, "case 1"); /* Ok a background save is in progress. Let's check if it is a good * one for replication, i.e. if there is another replica that is * registering differences since the server forked to save. */ @@ -1066,6 +1168,7 @@ void syncCommand(client *c) { } else if (g_pserver->FRdbSaveInProgress() && g_pserver->rdb_child_type == RDB_CHILD_TYPE_SOCKET) { + serverLog(LL_NOTICE, "case 2"); /* There is an RDB child process but it is writing directly to * children sockets. We need to wait for the next BGSAVE * in order to synchronize. */ @@ -1073,6 +1176,7 @@ void syncCommand(client *c) { /* CASE 3: There is no BGSAVE is progress. */ } else { + serverLog(LL_NOTICE, "case 3"); if (g_pserver->repl_diskless_sync && (c->slave_capa & SLAVE_CAPA_EOF)) { /* Diskless replication RDB child is created inside * replicationCron() since we want to delay its start a @@ -1278,6 +1382,7 @@ void replconfCommand(client *c) { * 3) Update the count of "good replicas". */ void putSlaveOnline(client *replica) { replica->replstate = SLAVE_STATE_ONLINE; + replica->repl_put_online_on_ack = 0; replica->repl_ack_time = g_pserver->unixtime; /* Prevent false timeout. */ if (connSetWriteHandler(replica->conn, sendReplyToClient, true) == C_ERR) { @@ -1415,11 +1520,13 @@ void sendBulkToSlave(connection *conn) { replica->repldboff += nwritten; g_pserver->stat_net_output_bytes += nwritten; + // replica->repl_curr_idx = g_pserver->repl_backlog_idx; if (replica->repldboff == replica->repldbsize) { close(replica->repldbfd); replica->repldbfd = -1; connSetWriteHandler(replica->conn,NULL); putSlaveOnline(replica); + serverLog(LL_NOTICE, "ABOUT TO DIE HERE"); } } @@ -4450,6 +4557,21 @@ void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long } void _clientAsyncReplyBufferReserve(client *c, size_t len); + +/* Has the end of the replication backlog overflowed past the beginning? */ +bool replOverflowHasOccured(){ + if (g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart){ + long long repl_idx_difference = g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart ? 
+ g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart : + (g_pserver->repl_backlog_size + g_pserver->repl_backlog_idx) - g_pserver->repl_batch_idxStart; + + return g_pserver->master_repl_offset - g_pserver->repl_batch_offStart > repl_idx_difference; + } + return false; +} + +thread_local int transmittedCount = 0; + void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); @@ -4463,11 +4585,31 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); + serverAssert(!replOverflowHasOccured()); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); + +#if 0 + // check for potential overflow first while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); + // serverLog(LL_NOTICE, "replica state: %d", replica->replstate); + + if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; + if (replica->flags & CLIENT_CLOSE_ASAP) continue; + if (replica->repl_curr_idx == -1) continue; + + std::unique_lock ul(replica->lock, std::defer_lock); + if (FCorrectThread(replica)) + ul.lock(); + else + fAsyncWrite = true; +#endif + + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + // serverLog(LL_NOTICE, "replica state: %d", replica->replstate); if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; @@ -4478,6 +4620,19 @@ void flushReplBacklogToClients() else fAsyncWrite = true; + +#ifdef BYPASS_BUFFER + /* If we are online and the RDB has been sent, there is no need to feed the client buffer + * We will send our replies directly from the replication backlog instead */ + if (replica->replstate == SLAVE_STATE_ONLINE && replica->transmittedRDB){ + setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); + continue; + } +#else + if (replica->replstate == SLAVE_STATE_ONLINE){ + // serverLog(LL_NOTICE, "would be calling this garbage function w/ offset: %lld", g_pserver->repl_batch_idxStart); + } +#endif if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { long long cbCopy = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; serverAssert((g_pserver->master_repl_offset - g_pserver->repl_batch_offStart) == cbCopy); @@ -4491,6 +4646,7 @@ void flushReplBacklogToClients() _clientAsyncReplyBufferReserve(replica, cbPhase1 + g_pserver->repl_backlog_idx); addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1); addReplyProto(replica, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); + serverAssert((cbPhase1 + g_pserver->repl_backlog_idx) == (g_pserver->master_repl_offset - g_pserver->repl_batch_offStart)); } } diff --git a/src/server.h b/src/server.h index 878154bc5..cfd6c34a0 100644 --- a/src/server.h +++ b/src/server.h @@ -1516,6 +1516,8 @@ struct client { long long psync_initial_offset; /* FULLRESYNC reply offset other slaves copying this replica output buffer should use. */ + long long repl_curr_idx = -1; /* Replication index sent, if this is a replica */ + long long repl_curr_off = -1; char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). 
*/ int slave_listening_port; /* As configured with: REPLCONF listening-port */ char slave_ip[NET_IP_STR_LEN]; /* Optionally given by REPLCONF ip-address */ @@ -1575,6 +1577,9 @@ struct client { robj **argv; size_t argv_len_sumActive = 0; + bool transmittedRDB = false; /* Have we finished transmitting the RDB to this replica? */ + /* If so, we can read from the replication backlog instead of the client buffer */ + // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); size_t argv_len_sum() const; @@ -3470,6 +3475,8 @@ void mixDigest(unsigned char *digest, const void *ptr, size_t len); void xorDigest(unsigned char *digest, const void *ptr, size_t len); int populateCommandTableParseFlags(struct redisCommand *c, const char *strflags); + + int moduleGILAcquiredByModule(void); extern int g_fInCrash; static inline int GlobalLocksAcquired(void) // Used in asserts to verify all global locks are correctly acquired for a server-thread to operate @@ -3526,6 +3533,8 @@ void tlsInit(void); void tlsInitThread(); int tlsConfigure(redisTLSContextConfig *ctx_config); +int prepareClientToWrite(client *c); + class ShutdownException {}; @@ -3538,3 +3547,5 @@ class ShutdownException int iAmMaster(void); #endif + + From d8367a92b2bea452d39561c201085a377c2021a6 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 29 Apr 2021 17:01:06 +0000 Subject: [PATCH 02/22] Updated resize logic Former-commit-id: e6d892ef21b7fc6f51433f32b01198038e555419 --- src/networking.cpp | 104 +++++++++++++++---------------------- src/replication.cpp | 123 ++++++++++++++++++++++++++++++++++++++------ src/server.cpp | 3 +- 3 files changed, 151 insertions(+), 79 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 18ab382bd..cac58ff07 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1680,22 +1680,8 @@ int writeToClient(client *c, int handler_installed) { /* if this is a write to a replica, it's coming straight from the replication backlog */ long long repl_backlog_idx = g_pserver->repl_backlog_idx; - bool wroteFromClientBuffer = false; /* True if you wrote from the client buffer in this function call */ - while(clientHasPendingReplies(c)) { - wroteFromClientBuffer = true; - if (c->flags & CLIENT_SLAVE && listLength(c->reply) % 10 == 0){ - - serverLog(LL_NOTICE, "-----------------------------------------"); - serverLog(LL_NOTICE, "replica w/ pending replies, with a reply list size of: %lu", listLength(c->reply)); - serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - serverLog(LL_NOTICE, "-----------------------------------------"); - - } if (c->bufpos > 0) { - // serverLog(LL_NOTICE, "Sending reply %d", x); - // serverLog(LL_NOTICE, "SUSSUS AMOGUS, %ld", c->bufpos); nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); if (nwritten <= 0) break; c->sentlen += nwritten; @@ -1753,9 +1739,7 @@ int writeToClient(client *c, int handler_installed) { /* If there are no more pending replies, then we have transmitted the RDB. * This means further replication commands will be taken straight from the * replication backlog from now on. 
*/ - if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c)){ - if (!c->transmittedRDB) - serverLog(LL_NOTICE, "---------->>>>>>>> TRANSMISSION OF THE RDB HAS COMPLETED <<<<<<<<----------"); + if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ c->transmittedRDB = true; } @@ -1775,49 +1759,27 @@ int writeToClient(client *c, int handler_installed) { /* wrap around case, v. rare */ /* also v. buggy so there's that */ } else { - serverLog(LL_NOTICE, "WRAP CASE"); - serverLog(LL_NOTICE, "-----------------------------------------"); - serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - serverLog(LL_NOTICE, "actually written: %ld", nwritten); - serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); - serverLog(LL_NOTICE, "nwritten: %ld", nwritten); - serverLog(LL_NOTICE, "-----------------------------------------"); - nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); /* only attempt wrapping if we write the correct number of bytes */ if (nwritten == repl_backlog_size - c->repl_curr_idx){ - serverLog(LL_NOTICE, "SECOND STAGE"); - serverLog(LL_NOTICE, "-----------------------------------------"); - serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - serverLog(LL_NOTICE, "actually written: %ld", nwritten); - serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); - serverLog(LL_NOTICE, "-----------------------------------------"); - long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); if (nwrittenPart2 != -1) nwritten += nwrittenPart2; - serverLog(LL_NOTICE, "nwrittenPart2: %lld", nwrittenPart2); - serverLog(LL_NOTICE, "-----------------------------------------"); - } else { - serverLog(LL_NOTICE, "SUPER SHORT"); - } - + } } /* only update the replica's current index if bytes were sent */ // if (nrequested != nwritten){ - serverLog(LL_NOTICE, "-----------------------------------------"); - serverLog(LL_NOTICE, "AFTER THE FACT"); - serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - serverLog(LL_NOTICE, "actually written: %ld", nwritten); - serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - serverLog(LL_NOTICE, "-----------------------------------------"); + // serverLog(LL_NOTICE, "-----------------------------------------"); + // serverLog(LL_NOTICE, "AFTER THE FACT"); + // serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + // serverLog(LL_NOTICE, "actually written: %ld", nwritten); + // serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + // serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + // 
serverLog(LL_NOTICE, "-----------------------------------------"); // } @@ -1902,25 +1864,36 @@ void ProcessPendingAsyncWrites() serverAssert(c->fPendingAsyncWrite); if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_CLOSE_AFTER_REPLY)) { - zfree(c->replyAsync); - c->replyAsync = nullptr; + if (c->replyAsync != nullptr){ + zfree(c->replyAsync); + c->replyAsync = nullptr; + } c->fPendingAsyncWrite = FALSE; continue; } - int size = c->replyAsync->used; + /* since writes from master to replica can come directly from the replication backlog, + * writes may have been signalled without having been copied to the replyAsync buffer, + * thus causing the buffer to be NULL */ + if (c->replyAsync != nullptr){ + int size = c->replyAsync->used; - if (listLength(c->reply) == 0 && size <= (PROTO_REPLY_CHUNK_BYTES - c->bufpos)) { - memcpy(c->buf + c->bufpos, c->replyAsync->buf(), size); - c->bufpos += size; - } else { - c->reply_bytes += c->replyAsync->size; - listAddNodeTail(c->reply, c->replyAsync); + if (listLength(c->reply) == 0 && size <= (PROTO_REPLY_CHUNK_BYTES - c->bufpos)) { + memcpy(c->buf + c->bufpos, c->replyAsync->buf(), size); + c->bufpos += size; + } else { + c->reply_bytes += c->replyAsync->size; + listAddNodeTail(c->reply, c->replyAsync); + c->replyAsync = nullptr; + } + + zfree(c->replyAsync); c->replyAsync = nullptr; + } else { + /* Only replicas should have empty async reply buffers */ + serverAssert(c->flags & CLIENT_SLAVE); } - zfree(c->replyAsync); - c->replyAsync = nullptr; c->fPendingAsyncWrite = FALSE; // Now install the write event handler @@ -1935,17 +1908,17 @@ void ProcessPendingAsyncWrites() { ae_flags |= AE_BARRIER; } - + if (!((c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack)))) continue; - + asyncCloseClientOnOutputBufferLimitReached(c); if (c->flags & CLIENT_CLOSE_ASAP) continue; // we will never write this so don't post an op - + std::atomic_thread_fence(std::memory_order_seq_cst); - + if (FCorrectThread(c)) { prepareClientToWrite(c); // queue an event @@ -3386,7 +3359,12 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { * that writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long getClientReplicationBacklogSharedUsage(client *c) { - return (c->repl_curr_idx == -1 && c->flags & CLIENT_SLAVE) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; + if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1){ + // serverLog(LL_NOTICE, "repl_backlog_size %lld, repl_backlog_idx %lld, master_repl_offset %lld, repl_curr_idx %lld, repl_curr_off %lld", + // g_pserver->repl_backlog_size, g_pserver->repl_backlog_idx, g_pserver->master_repl_offset, c->repl_curr_idx, c->repl_curr_off); + } + + return (!(c->flags & CLIENT_SLAVE) || c->repl_curr_idx == -1) ? 
0 : g_pserver->master_repl_offset - c->repl_curr_off; } /* This function returns the number of bytes that Redis is diff --git a/src/replication.cpp b/src/replication.cpp index ccb538a69..ef33fbfd9 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -47,8 +47,8 @@ #include #include -#define BYPASS_BUFFER -// #define BYPASS_PSYNC +// #define BYPASS_BUFFER +// #define RESIZE_BACKLOG void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, connection *conn); @@ -57,6 +57,30 @@ void putSlaveOnline(client *replica); int cancelReplicationHandshake(redisMaster *mi); static void propagateMasterStaleKeys(); +/* gets the lowest offset amongst all of the replicas */ +long long getLowestOffsetAmongReplicas(){ + serverAssert(GlobalLocksAcquired()); + long long min_offset = LONG_LONG_MAX; + listIter li; + listNode *ln; + listRewind(g_pserver->slaves, &li); + // check for potential overflow first + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + + if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; + if (replica->flags & CLIENT_CLOSE_ASAP) continue; + if (replica->repl_curr_idx == -1) continue; + + std::unique_lock ul(replica->lock, std::defer_lock); + if (FCorrectThread(replica)) + ul.lock(); + + min_offset = std::min(min_offset, replica->repl_curr_off); + } + /* return -1 if no other minimum was found */ + return min_offset == LONG_LONG_MAX ? -1 : min_offset; +} /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case * the instance is configured to have no persistence. */ @@ -67,11 +91,13 @@ void resizeReplicationBacklogForClients(long long newsize); void setReplIdx(client *c, long long idx, long long off){ if (prepareClientToWrite(c) != C_OK) return; // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); - // serverLog(LL_NOTICE, "What is this value? %lld", c->repl_curr_idx); + // serverLog(LL_NOTICE, "Repl Index started at: %lld", c->repl_curr_idx); if (c->repl_curr_idx == -1){ c->repl_curr_idx = idx; c->repl_curr_off = off; } + // serverLog(LL_NOTICE, "Repl Index has become: %lld", c->repl_curr_idx); + } /* --------------------------- Utility functions ---------------------------- */ @@ -277,7 +303,7 @@ void resizeReplicationBacklogForClients(long long newsize) { newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; if (g_pserver->repl_backlog_size == newsize) return; - serverLog(LL_NOTICE, "WE HAD TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); + serverLog(LL_NOTICE, "WE HAVE TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); /* get the critical client size, i.e. 
the size of the data unflushed to clients */ long long earliest_off = LONG_LONG_MAX; long long earliest_idx = -1; @@ -290,6 +316,20 @@ void resizeReplicationBacklogForClients(long long newsize) { earliest_off = replica->repl_curr_off; earliest_idx = replica->repl_curr_idx; } + serverLog(LL_NOTICE, "repl_curr_idx: %lld, earlistidx: %lld", replica->repl_curr_idx, earliest_idx); + } + serverLog(LL_NOTICE, "We are starting with: master_repl_offset: %lld, repl_batch_offStart: %lld, earliest_off: %lld, " + "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, earliest_idx: %lld, repl_backlog_size: %lld", + g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, earliest_off, + g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, earliest_idx, g_pserver->repl_backlog_size + ); + + long long new_off = 0, new_idx = 0; + + /* if no earliest offset is found amongst the clients, they are all up to date with the flushed index */ + if (earliest_off == LONG_LONG_MAX && earliest_idx == -1){ + earliest_idx = g_pserver->repl_batch_idxStart; + earliest_off = g_pserver->repl_batch_offStart; } if (g_pserver->repl_backlog != NULL) { @@ -330,8 +370,11 @@ void resizeReplicationBacklogForClients(long long newsize) { if (replica->repl_curr_idx < 0) replica->repl_curr_idx += g_pserver->repl_backlog_size; } + new_idx = replica->repl_curr_idx; } - g_pserver->repl_batch_idxStart = 0; + g_pserver->repl_batch_idxStart -= earliest_idx; + if (g_pserver->repl_batch_idxStart < 0) + g_pserver->repl_batch_idxStart += g_pserver->repl_backlog_size; } else { zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = (char*)zmalloc(newsize); @@ -342,6 +385,12 @@ void resizeReplicationBacklogForClients(long long newsize) { } } g_pserver->repl_backlog_size = newsize; + + serverLog(LL_NOTICE, "We are ending with: master_repl_offset: %lld, repl_batch_offStart: %lld, new_off: %lld, " + "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, new_idx: %lld, repl_backlog_size: %lld", + g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, new_off, + g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, new_idx, g_pserver->repl_backlog_size + ); } void freeReplicationBacklog(void) { @@ -367,20 +416,41 @@ void feedReplicationBacklog(const void *ptr, size_t len) { const unsigned char *p = (const unsigned char*)ptr; if (g_pserver->repl_batch_idxStart >= 0) { - long long minimumsize = g_pserver->master_repl_offset + len - g_pserver->repl_batch_offStart+1; + /* we are lower bounded by the lower client offset or the offStart if all the clients are up to date */ + long long lower_bound = getLowestOffsetAmongReplicas(); + if (lower_bound == -1) + lower_bound = g_pserver->repl_batch_offStart; + long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { flushReplBacklogToClients(); - minimumsize = g_pserver->master_repl_offset + len - g_pserver->repl_batch_offStart+1; + minimumsize = g_pserver->master_repl_offset + len - lower_bound +1; if (minimumsize > g_pserver->repl_backlog_size) { // This is an emergency overflow, we better resize to fit long long newsize = std::max(g_pserver->repl_backlog_size*2, minimumsize); serverLog(LL_WARNING, "Replication backlog is too small, resizing to: %lld", newsize); - resizeReplicationBacklog(newsize); + resizeReplicationBacklogForClients(newsize); } } +#ifdef RESIZE_BACKLOG + long long lowest_replica_offset = getLowestOffsetAmongReplicas(); + minimumsize = g_pserver->master_repl_offset + len - lowest_replica_offset; + if 
(lowest_replica_offset != -1 && minimumsize > g_pserver->repl_backlog_size){ + serverLog(LL_WARNING, "THE REPLICATION BACKLOG SIZE IS TOO SMALL, THIS IS A PROBLEM"); + long long oldsize = g_pserver->repl_backlog_size; + resizeReplicationBacklogForClients(std::max(g_pserver->repl_backlog_size * 2, minimumsize)); + serverLog(LL_WARNING, "changed size from %lld to %lld", oldsize, g_pserver->repl_backlog_size); + flushReplBacklogToClients(); + } +#endif } + // serverLog(LL_NOTICE, "Pt2 start with: master_repl_offset: %lld, repl_batch_offStart: %lld, " + // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", + // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, + // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size + // ); + g_pserver->master_repl_offset += len; /* This is a circular buffer, so write as much data we can at every @@ -395,12 +465,23 @@ void feedReplicationBacklog(const void *ptr, size_t len) { len -= thislen; p += thislen; g_pserver->repl_backlog_histlen += thislen; + // serverLog(LL_NOTICE, "Pt2 intermediate with: master_repl_offset: %lld, repl_batch_offStart: %lld, " + // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", + // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, + // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size + // ); } if (g_pserver->repl_backlog_histlen > g_pserver->repl_backlog_size) g_pserver->repl_backlog_histlen = g_pserver->repl_backlog_size; /* Set the offset of the first byte we have in the backlog. */ g_pserver->repl_backlog_off = g_pserver->master_repl_offset - g_pserver->repl_backlog_histlen + 1; + + // serverLog(LL_NOTICE, "Pt2 end with: master_repl_offset: %lld, repl_batch_offStart: %lld, " + // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", + // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, + // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size + // ); } /* Wrapper for feedReplicationBacklog() that takes Redis string objects @@ -774,7 +855,6 @@ long long addReplyReplicationBacklog(client *c, long long offset) { * split the reply in two parts if we are cross-boundary. */ len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); - serverLog(LL_NOTICE, "Coming through from the addReplicationBacklog"); #ifdef BYPASS_PSYNC setReplIdx(c, j, offset); #else @@ -789,7 +869,6 @@ long long addReplyReplicationBacklog(client *c, long long offset) { j = 0; } #endif - serverLog(LL_NOTICE, "rdb transmitted? %d, pending replies? %d", c->transmittedRDB, clientHasPendingReplies(c)); return g_pserver->repl_backlog_histlen - skip; } @@ -1520,13 +1599,11 @@ void sendBulkToSlave(connection *conn) { replica->repldboff += nwritten; g_pserver->stat_net_output_bytes += nwritten; - // replica->repl_curr_idx = g_pserver->repl_backlog_idx; if (replica->repldboff == replica->repldbsize) { close(replica->repldbfd); replica->repldbfd = -1; connSetWriteHandler(replica->conn,NULL); putSlaveOnline(replica); - serverLog(LL_NOTICE, "ABOUT TO DIE HERE"); } } @@ -4560,6 +4637,7 @@ void _clientAsyncReplyBufferReserve(client *c, size_t len); /* Has the end of the replication backlog overflowed past the beginning? 
*/ bool replOverflowHasOccured(){ + if (g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart){ long long repl_idx_difference = g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart ? g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart : @@ -4575,8 +4653,13 @@ thread_local int transmittedCount = 0; void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); - if (g_pserver->repl_batch_offStart < 0) + if (g_pserver->repl_batch_offStart < 0){ + if (getLowestOffsetAmongReplicas() == -1){ + serverLog(LL_NOTICE, "this is a case i probably have to handle"); + } return; + } + if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { bool fAsyncWrite = false; @@ -4585,7 +4668,7 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); - serverAssert(!replOverflowHasOccured()); + // serverAssert(!replOverflowHasOccured()); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); @@ -4605,11 +4688,21 @@ void flushReplBacklogToClients() ul.lock(); else fAsyncWrite = true; + + if (g_pserver->master_repl_offset - replica->repl_curr_off > g_pserver->repl_backlog_size){ + serverLog(LL_WARNING, "THE REPLICATION BACKLOG SIZE IS TOO SMALL, THIS IS A PROBLEM"); + long long oldsize = g_pserver->repl_backlog_size; + resizeReplicationBacklogForClients(std::max(g_pserver->repl_backlog_size * 2, g_pserver->master_repl_offset - replica->repl_curr_off)); + serverLog(LL_WARNING, "changing size from %lld to %lld", oldsize, g_pserver->repl_backlog_size); + } + + } + + listRewind(g_pserver->slaves, &li); #endif while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); - // serverLog(LL_NOTICE, "replica state: %d", replica->replstate); if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; diff --git a/src/server.cpp b/src/server.cpp index 3d547f748..9664a4a6b 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1796,6 +1796,7 @@ int clientsCronTrackClientsMemUsage(client *c) { mem += zmalloc_size(c); mem += c->argv_len_sum(); if (c->argv) mem += zmalloc_size(c->argv); + // serverLog(LL_NOTICE, "Mem here is : %lu", mem); /* Now that we have the memory used by the client, remove the old * value from the old category, and add it back. */ g_pserver->stat_clients_type_memory[c->client_cron_last_memory_type] -= @@ -1854,7 +1855,7 @@ void clientsCron(int iel) { while(listLength(g_pserver->clients) && iterations--) { client *c; listNode *head; - + // serverLog(LL_NOTICE, "we are at iteration: %d", iterations); /* Rotate the list, take the current head, process. * This way if the client must be removed from the list it's the * first element and we don't incur into O(N) computation. 
*/ From 7ef58a333f9331e8fd144163626ed7a6ccaa1a59 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 29 Apr 2021 18:51:30 +0000 Subject: [PATCH 03/22] Performance optimizations Former-commit-id: 7fd83d467784d293f7da78b74f9b9763ce387238 --- src/replication.cpp | 71 ++------------------------------------------- 1 file changed, 3 insertions(+), 68 deletions(-) diff --git a/src/replication.cpp b/src/replication.cpp index ef33fbfd9..1bae2773a 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -47,8 +47,7 @@ #include #include -// #define BYPASS_BUFFER -// #define RESIZE_BACKLOG +#define BYPASS_BUFFER void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, connection *conn); @@ -89,10 +88,10 @@ int RDBGeneratedByReplication = 0; void resizeReplicationBacklogForClients(long long newsize); void setReplIdx(client *c, long long idx, long long off){ - if (prepareClientToWrite(c) != C_OK) return; // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); // serverLog(LL_NOTICE, "Repl Index started at: %lld", c->repl_curr_idx); if (c->repl_curr_idx == -1){ + if (prepareClientToWrite(c) != C_OK) return; c->repl_curr_idx = idx; c->repl_curr_off = off; } @@ -432,17 +431,6 @@ void feedReplicationBacklog(const void *ptr, size_t len) { resizeReplicationBacklogForClients(newsize); } } -#ifdef RESIZE_BACKLOG - long long lowest_replica_offset = getLowestOffsetAmongReplicas(); - minimumsize = g_pserver->master_repl_offset + len - lowest_replica_offset; - if (lowest_replica_offset != -1 && minimumsize > g_pserver->repl_backlog_size){ - serverLog(LL_WARNING, "THE REPLICATION BACKLOG SIZE IS TOO SMALL, THIS IS A PROBLEM"); - long long oldsize = g_pserver->repl_backlog_size; - resizeReplicationBacklogForClients(std::max(g_pserver->repl_backlog_size * 2, minimumsize)); - serverLog(LL_WARNING, "changed size from %lld to %lld", oldsize, g_pserver->repl_backlog_size); - flushReplBacklogToClients(); - } -#endif } // serverLog(LL_NOTICE, "Pt2 start with: master_repl_offset: %lld, repl_batch_offStart: %lld, " @@ -4635,30 +4623,11 @@ void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long void _clientAsyncReplyBufferReserve(client *c, size_t len); -/* Has the end of the replication backlog overflowed past the beginning? */ -bool replOverflowHasOccured(){ - - if (g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart){ - long long repl_idx_difference = g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart ? 
- g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart : - (g_pserver->repl_backlog_size + g_pserver->repl_backlog_idx) - g_pserver->repl_batch_idxStart; - - return g_pserver->master_repl_offset - g_pserver->repl_batch_offStart > repl_idx_difference; - } - return false; -} - -thread_local int transmittedCount = 0; - void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); - if (g_pserver->repl_batch_offStart < 0){ - if (getLowestOffsetAmongReplicas() == -1){ - serverLog(LL_NOTICE, "this is a case i probably have to handle"); - } + if (g_pserver->repl_batch_offStart < 0) return; - } if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { @@ -4668,39 +4637,9 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); - // serverAssert(!replOverflowHasOccured()); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); - -#if 0 - // check for potential overflow first - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - // serverLog(LL_NOTICE, "replica state: %d", replica->replstate); - - if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; - if (replica->flags & CLIENT_CLOSE_ASAP) continue; - if (replica->repl_curr_idx == -1) continue; - - std::unique_lock ul(replica->lock, std::defer_lock); - if (FCorrectThread(replica)) - ul.lock(); - else - fAsyncWrite = true; - - if (g_pserver->master_repl_offset - replica->repl_curr_off > g_pserver->repl_backlog_size){ - serverLog(LL_WARNING, "THE REPLICATION BACKLOG SIZE IS TOO SMALL, THIS IS A PROBLEM"); - long long oldsize = g_pserver->repl_backlog_size; - resizeReplicationBacklogForClients(std::max(g_pserver->repl_backlog_size * 2, g_pserver->master_repl_offset - replica->repl_curr_off)); - serverLog(LL_WARNING, "changing size from %lld to %lld", oldsize, g_pserver->repl_backlog_size); - } - - } - - listRewind(g_pserver->slaves, &li); -#endif - while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); @@ -4721,10 +4660,6 @@ void flushReplBacklogToClients() setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); continue; } -#else - if (replica->replstate == SLAVE_STATE_ONLINE){ - // serverLog(LL_NOTICE, "would be calling this garbage function w/ offset: %lld", g_pserver->repl_batch_idxStart); - } #endif if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { long long cbCopy = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; From f6305ed15bca84719504890f85dd0f1297e05365 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Mon, 3 May 2021 16:29:11 +0000 Subject: [PATCH 04/22] Now tracks memory and resizes 'accurately', need to fix cluster Former-commit-id: 5f0e01cc199427ab6dfd7f8f28321f6a1f34fd1c --- src/config.cpp | 1 + src/evict.cpp | 10 +++++++++- src/networking.cpp | 20 +++++++++++++------- src/replication.cpp | 16 ++++++++++++++++ 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index 9d7f14007..b546ef607 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -2347,6 +2347,7 @@ static int updateReplBacklogSize(long long val, long long prev, const char **err UNUSED(err); g_pserver->repl_backlog_size = prev; resizeReplicationBacklog(val); + g_pserver->repl_backlog_config_size = g_pserver->repl_backlog_size; return 1; } diff --git a/src/evict.cpp b/src/evict.cpp index 31cadeae5..36837e17d 
100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -392,9 +392,16 @@ size_t freeMemoryGetNotCountedMemory(void) { while((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); std::unique_lock(replica->lock); - overhead += getClientOutputBufferMemoryUsage(replica); + /* we don't wish to multiple count the replication backlog shared usage */ + overhead += (getClientOutputBufferMemoryUsage(replica) - getClientReplicationBacklogSharedUsage(replica)); } } + + /* also don't count the replication backlog memory + * that's where the replication clients get their memory from */ + overhead += (g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size); + + if (g_pserver->aof_state != AOF_OFF) { overhead += sdsalloc(g_pserver->aof_buf)+aofRewriteBufferSize(); } @@ -516,6 +523,7 @@ int freeMemoryIfNeeded(bool fQuickCycle, bool fPreSnapshot) { if (g_pserver->maxmemory_policy == MAXMEMORY_NO_EVICTION) goto cant_free; /* We need to free memory, but policy forbids. */ + serverLog(LL_NOTICE, "evicting i guess lol, the overhead was %ld, the repl_backlog_size, %lld", freeMemoryGetNotCountedMemory(), g_pserver->repl_backlog_size); while (mem_freed < mem_tofree) { int j, k, i; static unsigned int next_db = 0; diff --git a/src/networking.cpp b/src/networking.cpp index cac58ff07..c51a02a1d 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -224,7 +224,6 @@ void clientInstallWriteHandler(client *c) { (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) { - // serverLog(LL_NOTICE, "we installing boyz"); AssertCorrectThread(c); serverAssert(c->lock.fOwnLock()); /* Here instead of installing the write handler, we just flag the @@ -1801,6 +1800,9 @@ int writeToClient(client *c, int handler_installed) { if (nwrittenPart2 == -1) nwritten = -1; } + if (c->flags & CLIENT_SLAVE && handler_installed) + serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); + g_pserver->stat_net_output_bytes += totwritten; if (nwritten == -1) { if (connGetState(c->conn) == CONN_STATE_CONNECTED) { @@ -1821,6 +1823,11 @@ int writeToClient(client *c, int handler_installed) { if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } if (!clientHasPendingReplies(c) && c->repl_curr_idx == -1) { + if(c->flags & CLIENT_SLAVE && handler_installed){ + serverLog(LL_NOTICE, "Uninstalling handler"); + serverLog(LL_NOTICE, "handler repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + } c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -1836,6 +1843,7 @@ int writeToClient(client *c, int handler_installed) { /* Write event handler. Just send data to the client. 
*/ void sendReplyToClient(connection *conn) { client *c = (client*)connGetPrivateData(conn); + serverLog(LL_NOTICE, "called the sendreplytoclient"); if (writeToClient(c,1) == C_ERR) { AeLocker ae; @@ -1970,6 +1978,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { auto vec = std::move(g_pserver->rgthreadvar[iel].clients_pending_write); processed += (int)vec.size(); + // serverLog(LL_NOTICE, "entered handleClientsWithPendingWrites"); for (client *c : vec) { serverAssertDebug(FCorrectThread(c)); @@ -2008,8 +2017,10 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. */ if (clientHasPendingReplies(c) || c->repl_curr_idx != -1) { - if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) + serverLog(LL_NOTICE, "Setting a write handler for later"); + if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) { freeClientAsync(c); + } } } @@ -3359,11 +3370,6 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { * that writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long getClientReplicationBacklogSharedUsage(client *c) { - if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1){ - // serverLog(LL_NOTICE, "repl_backlog_size %lld, repl_backlog_idx %lld, master_repl_offset %lld, repl_curr_idx %lld, repl_curr_off %lld", - // g_pserver->repl_backlog_size, g_pserver->repl_backlog_idx, g_pserver->master_repl_offset, c->repl_curr_idx, c->repl_curr_off); - } - return (!(c->flags & CLIENT_SLAVE) || c->repl_curr_idx == -1) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; } diff --git a/src/replication.cpp b/src/replication.cpp index 1bae2773a..60f25052a 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -4684,5 +4684,21 @@ void flushReplBacklogToClients() // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; + } else if (getLowestOffsetAmongReplicas() != -1){ + listIter li; + listNode *ln; + listRewind(g_pserver->slaves, &li); + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + + std::unique_lock ul(replica->lock, std::defer_lock); + if (FCorrectThread(replica)) + ul.lock(); + + /* try to force prepare client to write i guess? 
*/ + if (replica->repl_curr_idx != -1){ + if (prepareClientToWrite(replica) != C_OK) continue; + } + } } } From 33a7b52899a10432e1f9085027ed9e30c07dda32 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Mon, 3 May 2021 16:49:09 +0000 Subject: [PATCH 05/22] Forgot to add server.h in last commit Former-commit-id: 34fa6119c9a3f1533cc3e6e5d118dc6424a70891 --- src/server.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/server.h b/src/server.h index cfd6c34a0..6c5265fbd 100644 --- a/src/server.h +++ b/src/server.h @@ -2411,6 +2411,9 @@ struct redisServer { uint16_t rglockSamples[s_lockContentionSamples]; unsigned ilockRingHead = 0; + long long repl_backlog_config_size = 1024*1024; /* This is a hack to ignore the resizing of the replication backlog + when using it as a defacto for the client buffer */ + bool FRdbSaveInProgress() const { return rdbThreadVars.fRdbThreadActive; } }; @@ -2657,6 +2660,7 @@ sds getAllClientsInfoString(int type); void rewriteClientCommandVector(client *c, int argc, ...); void rewriteClientCommandArgument(client *c, int i, robj *newval); void replaceClientCommandVector(client *c, int argc, robj **argv); +unsigned long getClientReplicationBacklogSharedUsage(client *c); unsigned long getClientOutputBufferMemoryUsage(client *c); int freeClientsInAsyncFreeQueue(int iel); void asyncCloseClientOnOutputBufferLimitReached(client *c); From eb35d7e9ec36d73e0aa8fa2bdb0eb7bb808e4627 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 5 May 2021 16:37:02 +0000 Subject: [PATCH 06/22] Updated maxmemory tests to account for overhead in new replication backlog behaviour Former-commit-id: 4cd197959693dfe4d1497c3f703cf6aaa27d34ad --- tests/unit/maxmemory.tcl | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl index 414733d1e..23879c38a 100644 --- a/tests/unit/maxmemory.tcl +++ b/tests/unit/maxmemory.tcl @@ -33,7 +33,8 @@ start_server {tags {"maxmemory"}} { # Get the current memory limit and calculate a new limit. # We just add 100k to the current memory size so that it is # fast for us to reach that limit. - set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used+100*1024}] r config set maxmemory $limit r config set maxmemory-policy $policy @@ -42,7 +43,7 @@ start_server {tags {"maxmemory"}} { while 1 { r setex [randomKey] 10000 x incr numkeys - if {[s used_memory]+4096 > $limit} { + if {[expr {[s used_memory] - $overhead + 4096}] > $limit} { assert {$numkeys > 10} break } @@ -52,7 +53,8 @@ start_server {tags {"maxmemory"}} { for {set j 0} {$j < $numkeys} {incr j} { r setex [randomKey] 10000 x } - assert {[s used_memory] < ($limit+4096)} + set used_amt [expr [s used_memory] - $overhead] + assert {$used_amt < ($limit+4096)} } } @@ -65,7 +67,8 @@ start_server {tags {"maxmemory"}} { # Get the current memory limit and calculate a new limit. # We just add 100k to the current memory size so that it is # fast for us to reach that limit. 
- set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used+100*1024}] r config set maxmemory $limit r config set maxmemory-policy $policy @@ -74,7 +77,7 @@ start_server {tags {"maxmemory"}} { while 1 { r set [randomKey] x incr numkeys - if {[s used_memory]+4096 > $limit} { + if {[expr [s used_memory] - $overhead]+4096 > $limit} { assert {$numkeys > 10} break } @@ -91,7 +94,7 @@ start_server {tags {"maxmemory"}} { } } if {[string match allkeys-* $policy]} { - assert {[s used_memory] < ($limit+4096)} + assert {[expr [s used_memory] - $overhead] < ($limit+4096)} } else { assert {$err == 1} } @@ -107,7 +110,8 @@ start_server {tags {"maxmemory"}} { # Get the current memory limit and calculate a new limit. # We just add 100k to the current memory size so that it is # fast for us to reach that limit. - set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used+100*1024}] r config set maxmemory $limit r config set maxmemory-policy $policy @@ -121,7 +125,7 @@ start_server {tags {"maxmemory"}} { } else { r set "key:$numkeys" x } - if {[s used_memory]+4096 > $limit} { + if {[expr [s used_memory] - $overhead]+4096 > $limit} { assert {$numkeys > 10} break } @@ -135,7 +139,7 @@ start_server {tags {"maxmemory"}} { catch {r setex "foo:$j" 10000 x} } # We should still be under the limit. - assert {[s used_memory] < ($limit+4096)} + assert {[expr [s used_memory] - $overhead] < ($limit+4096)} # However all our non volatile keys should be here. for {set j 0} {$j < $numkeys} {incr j 2} { assert {[r exists "key:$j"]} @@ -284,7 +288,8 @@ start_server {tags {"maxmemory"} overrides {server-threads 1}} { # we need to make sure to evict keynames of a total size of more than # 16kb since the (PROTO_REPLY_CHUNK_BYTES), only after that the # invalidation messages have a chance to trigger further eviction. - set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used - 40000}] r config set maxmemory $limit From f6a714db2658a4b8baa924e62dd0ab2c6a7adb9f Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 5 May 2021 17:04:08 +0000 Subject: [PATCH 07/22] Updated overhead calculation to only use repl_backlog_size Former-commit-id: 6f93c7eb44d84bb143b4ad4fff3c6a5436ebaaf7 --- src/evict.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/evict.cpp b/src/evict.cpp index 36837e17d..e7f0a10ef 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -399,8 +399,8 @@ size_t freeMemoryGetNotCountedMemory(void) { /* also don't count the replication backlog memory * that's where the replication clients get their memory from */ - overhead += (g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size); - + // overhead += (g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size); + overhead += g_pserver->repl_backlog_size; if (g_pserver->aof_state != AOF_OFF) { overhead += sdsalloc(g_pserver->aof_buf)+aofRewriteBufferSize(); From 7ff2fb716a4a92bf78a06a888fec82f246889c74 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 5 May 2021 22:26:36 +0000 Subject: [PATCH 08/22] Fixed data race? 
Seems to be passing multithreaded test cases now Former-commit-id: cb13edd1200c1230fa7e313d69c69e06129951d3 --- src/networking.cpp | 2 +- src/replication.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index c51a02a1d..6f4aa6268 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1782,7 +1782,7 @@ int writeToClient(client *c, int handler_installed) { // } - if (nwritten == nrequested){ + if (nwritten == nrequested && g_pserver->repl_backlog_idx == c->repl_curr_idx){ c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ } else if (nwritten > 0) diff --git a/src/replication.cpp b/src/replication.cpp index 60f25052a..d3df6d12a 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -90,7 +90,7 @@ void resizeReplicationBacklogForClients(long long newsize); void setReplIdx(client *c, long long idx, long long off){ // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); // serverLog(LL_NOTICE, "Repl Index started at: %lld", c->repl_curr_idx); - if (c->repl_curr_idx == -1){ + if (c->repl_curr_idx == -1 && off >= c->repl_curr_off){ if (prepareClientToWrite(c) != C_OK) return; c->repl_curr_idx = idx; c->repl_curr_off = off; From 4fd76c47911f506909e54a138bf8f72b0fea8687 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Fri, 21 May 2021 17:05:55 +0000 Subject: [PATCH 09/22] Fixed single threaded for real this time, need to add synchronization for multi threaded Former-commit-id: 4d858dac1a503f4d518477212ba585069af22574 --- src/networking.cpp | 8 +++++--- src/replication.cpp | 5 +++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 6f4aa6268..c39d8ce42 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1676,8 +1676,7 @@ int writeToClient(client *c, int handler_installed) { std::unique_locklock)> lock(c->lock); - /* if this is a write to a replica, it's coming straight from the replication backlog */ - long long repl_backlog_idx = g_pserver->repl_backlog_idx; + while(clientHasPendingReplies(c)) { if (c->bufpos > 0) { @@ -1742,6 +1741,9 @@ int writeToClient(client *c, int handler_installed) { c->transmittedRDB = true; } + /* if this is a write to a replica, it's coming straight from the replication backlog */ + long long repl_backlog_idx = g_pserver->repl_backlog_idx; + /* For replicas, we don't store all the information in the client buffer * Most of the time (aside from immediately after synchronizing), we read * from the replication backlog directly */ @@ -1782,7 +1784,7 @@ int writeToClient(client *c, int handler_installed) { // } - if (nwritten == nrequested && g_pserver->repl_backlog_idx == c->repl_curr_idx){ + if (nwritten == nrequested && g_pserver->repl_backlog_idx == repl_backlog_idx){ c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ } else if (nwritten > 0) diff --git a/src/replication.cpp b/src/replication.cpp index d3df6d12a..1d4e01289 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -3059,6 +3059,11 @@ void syncWithMaster(connection *conn) { if (psync_result == PSYNC_CONTINUE) { serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Master accepted a Partial Resynchronization."); + /* Reset the bulklen information in case it is lingering from the last connection + * The partial sync will start from the beginning of a command so these should be reset */ + mi->master->reqtype = 0; + mi->master->multibulklen = 0; + mi->master->bulklen = -1; if 
(cserver.supervised_mode == SUPERVISED_SYSTEMD) { redisCommunicateSystemd("STATUS=MASTER <-> REPLICA sync: Partial Resynchronization accepted. Ready to accept connections.\n"); redisCommunicateSystemd("READY=1\n"); From 6080ee8f2f33fd21de8dfa9c103ba569759bc127 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 26 May 2021 20:10:33 +0000 Subject: [PATCH 10/22] Added transmitted RDB lock Former-commit-id: 4b32167afc85742d85ff9b47b2c2e0b6b02e140a --- src/networking.cpp | 13 +++++++++++-- src/replication.cpp | 15 ++++++++++----- src/server.h | 2 ++ 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index c39d8ce42..176693501 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -319,6 +319,7 @@ void _clientAsyncReplyBufferReserve(client *c, size_t len) { clientReplyBlock *replyNew = (clientReplyBlock*)zmalloc(sizeof(clientReplyBlock) + newsize); replyNew->size = zmalloc_usable(replyNew) - sizeof(clientReplyBlock); replyNew->used = 0; + std::unique_lock tRDBLock (c->transmittedRDBLock); c->replyAsync = replyNew; } @@ -332,6 +333,7 @@ int _addReplyToBuffer(client *c, const char *s, size_t len) { if (fAsync) { serverAssert(GlobalLocksAcquired()); + std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync == nullptr || (c->replyAsync->size - c->replyAsync->used) < len) { if (c->replyAsync == nullptr) { @@ -1737,9 +1739,14 @@ int writeToClient(client *c, int handler_installed) { /* If there are no more pending replies, then we have transmitted the RDB. * This means further replication commands will be taken straight from the * replication backlog from now on. */ + + std::unique_lock tRDBLock (c->transmittedRDBLock); + if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ c->transmittedRDB = true; } + bool transmittedRDB = c->transmittedRDB; + tRDBLock.unlock(); /* if this is a write to a replica, it's coming straight from the replication backlog */ long long repl_backlog_idx = g_pserver->repl_backlog_idx; @@ -1747,7 +1754,7 @@ int writeToClient(client *c, int handler_installed) { /* For replicas, we don't store all the information in the client buffer * Most of the time (aside from immediately after synchronizing), we read * from the replication backlog directly */ - if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1 && c->transmittedRDB){ + if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1 && transmittedRDB){ /* copy global variables into local scope so if they change in between we don't care */ long long repl_backlog_size = g_pserver->repl_backlog_size; long long nwrittenPart2 = 0; @@ -1874,6 +1881,7 @@ void ProcessPendingAsyncWrites() serverAssert(c->fPendingAsyncWrite); if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_CLOSE_AFTER_REPLY)) { + std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync != nullptr){ zfree(c->replyAsync); c->replyAsync = nullptr; @@ -1885,6 +1893,7 @@ void ProcessPendingAsyncWrites() /* since writes from master to replica can come directly from the replication backlog, * writes may have been signalled without having been copied to the replyAsync buffer, * thus causing the buffer to be NULL */ + std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync != nullptr){ int size = c->replyAsync->used; @@ -1905,7 +1914,7 @@ void ProcessPendingAsyncWrites() } c->fPendingAsyncWrite = FALSE; - + tRDBLock.unlock(); // Now install the write event handler int ae_flags = AE_WRITABLE|AE_WRITE_THREADSAFE; /* For the 
fsync=always policy, we want that a given FD is never diff --git a/src/replication.cpp b/src/replication.cpp index 1d4e01289..ad79f4887 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -441,6 +441,8 @@ void feedReplicationBacklog(const void *ptr, size_t len) { g_pserver->master_repl_offset += len; + + /* This is a circular buffer, so write as much data we can at every * iteration and rewind the "idx" index if we reach the limit. */ while(len) { @@ -4659,11 +4661,14 @@ void flushReplBacklogToClients() #ifdef BYPASS_BUFFER - /* If we are online and the RDB has been sent, there is no need to feed the client buffer - * We will send our replies directly from the replication backlog instead */ - if (replica->replstate == SLAVE_STATE_ONLINE && replica->transmittedRDB){ - setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); - continue; + { + /* If we are online and the RDB has been sent, there is no need to feed the client buffer + * We will send our replies directly from the replication backlog instead */ + std::unique_lock tRDBLock (replica->transmittedRDBLock); + if (replica->replstate == SLAVE_STATE_ONLINE && replica->transmittedRDB){ + setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); + continue; + } } #endif if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { diff --git a/src/server.h b/src/server.h index 6c5265fbd..14005e7d5 100644 --- a/src/server.h +++ b/src/server.h @@ -1582,6 +1582,7 @@ struct client { // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); + fastlock transmittedRDBLock {"transmittedRDB"}; size_t argv_len_sum() const; }; @@ -2228,6 +2229,7 @@ struct redisServer { that is the next byte will'll write to.*/ long long repl_backlog_off; /* Replication "master offset" of first byte in the replication backlog buffer.*/ + fastlock repl_backlog_lock {"replication backlog"}; time_t repl_backlog_time_limit; /* Time without slaves after the backlog gets released. */ time_t repl_no_slaves_since; /* We have no slaves since that time. From bf120245faa6867db1b464f319ee3944c017ad28 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 27 May 2021 18:57:23 +0000 Subject: [PATCH 11/22] Added more synchronization and fixed some data races Former-commit-id: 183e015dac6f85df1c94d0761e89bc23d9f53319 --- src/multi.cpp | 2 + src/networking.cpp | 141 +++++++++++++++++++++++--------------------- src/replication.cpp | 57 ++++++++---------- src/server.cpp | 1 + src/server.h | 3 + 5 files changed, 105 insertions(+), 99 deletions(-) diff --git a/src/multi.cpp b/src/multi.cpp index 9df72383d..9fd5206fb 100644 --- a/src/multi.cpp +++ b/src/multi.cpp @@ -237,6 +237,8 @@ void execCommand(client *c) { * backlog with the final EXEC. 
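Patches 10 and 11 introduce repl_backlog_lock and settle on a simple discipline: any code path that appends to the replication backlog (regular command propagation, the master stream feed, the EXEC propagation in the multi.cpp hunk here) first refreshes the lowest replica offset and then holds the lock for the duration of the append. A minimal sketch of that pattern, with std::mutex standing in for fastlock and a trivial byte-by-byte copy in place of the real chunked write:

    #include <cstddef>
    #include <mutex>
    #include <vector>

    // Sketch only: std::mutex stands in for the fastlock guarding the backlog.
    struct BacklogView {
        std::mutex lock;
        std::vector<char> buf = std::vector<char>(1 << 20);
        long long idx = 0;         // next write position (circular)
        long long master_off = 0;  // global replication offset
    };

    // Writers append only while holding the lock, mirroring feedReplicationBacklog().
    void feedBacklog(BacklogView &bl, const char *p, size_t len) {
        std::unique_lock<std::mutex> guard(bl.lock);
        for (size_t i = 0; i < len; ++i) {
            bl.buf[(size_t)bl.idx] = p[i];
            bl.idx = (bl.idx + 1) % (long long)bl.buf.size();
        }
        bl.master_off += (long long)len;
    }
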
*/ if (g_pserver->repl_backlog && was_master && !is_master) { const char *execcmd = "*1\r\n$4\r\nEXEC\r\n"; + updateLowestOffsetAmongReplicas(); + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); feedReplicationBacklog(execcmd,strlen(execcmd)); } } diff --git a/src/networking.cpp b/src/networking.cpp index 176693501..caefd6d1e 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -125,6 +125,7 @@ client *createClient(connection *conn, int iel) { client_id = g_pserver->next_client_id.fetch_add(1); c->iel = iel; c->id = client_id; + sprintf(c->lock.szName, "client %lu", client_id); c->resp = 2; c->conn = conn; c->name = NULL; @@ -1677,8 +1678,7 @@ int writeToClient(client *c, int handler_installed) { serverAssertDebug(FCorrectThread(c)); std::unique_locklock)> lock(c->lock); - - + // serverLog(LL_NOTICE, "acq client"); while(clientHasPendingReplies(c)) { if (c->bufpos > 0) { @@ -1736,82 +1736,87 @@ int writeToClient(client *c, int handler_installed) { !(c->flags & CLIENT_SLAVE)) break; } - /* If there are no more pending replies, then we have transmitted the RDB. - * This means further replication commands will be taken straight from the - * replication backlog from now on. */ + /* We can only directly read from the replication backlog if the client + is a replica, so only attempt to do so if that's the case. */ + if (c->flags & CLIENT_SLAVE) { + /* If there are no more pending replies, then we have transmitted the RDB. + * This means further replication commands will be taken straight from the + * replication backlog from now on. */ + std::unique_lock tRDBLock (c->transmittedRDBLock); - std::unique_lock tRDBLock (c->transmittedRDBLock); + if (c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ + c->transmittedRDB = true; + } + bool transmittedRDB = c->transmittedRDB; + tRDBLock.unlock(); - if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ - c->transmittedRDB = true; - } - bool transmittedRDB = c->transmittedRDB; - tRDBLock.unlock(); + /* For replicas, we don't store all the information in the client buffer + * Most of the time (aside from immediately after synchronizing), we read + * from the replication backlog directly */ + if (c->repl_curr_idx != -1 && transmittedRDB){ + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); - /* if this is a write to a replica, it's coming straight from the replication backlog */ - long long repl_backlog_idx = g_pserver->repl_backlog_idx; + /* copy global variables into local scope so if they change in between we don't care */ + long long repl_backlog_idx = g_pserver->repl_backlog_idx; + long long repl_backlog_size = g_pserver->repl_backlog_size; + long long nwrittenPart2 = 0; - /* For replicas, we don't store all the information in the client buffer - * Most of the time (aside from immediately after synchronizing), we read - * from the replication backlog directly */ - if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1 && transmittedRDB){ - /* copy global variables into local scope so if they change in between we don't care */ - long long repl_backlog_size = g_pserver->repl_backlog_size; - long long nwrittenPart2 = 0; + ssize_t nrequested; /* The number of bytes requested to write */ + /* normal case with no wrap around */ + if (repl_backlog_idx >= c->repl_curr_idx){ + nrequested = repl_backlog_idx - c->repl_curr_idx; + nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, 
repl_backlog_idx - c->repl_curr_idx); + /* wrap around case, v. rare */ + /* also v. buggy so there's that */ + } else { + nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; + nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); + /* only attempt wrapping if we write the correct number of bytes */ + if (nwritten == repl_backlog_size - c->repl_curr_idx){ + long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); + if (nwrittenPart2 != -1) + nwritten += nwrittenPart2; - ssize_t nrequested; /* The number of bytes requested to write */ - /* normal case with no wrap around */ - if (repl_backlog_idx >= c->repl_curr_idx){ - nrequested = repl_backlog_idx - c->repl_curr_idx; - nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_idx - c->repl_curr_idx); - /* wrap around case, v. rare */ - /* also v. buggy so there's that */ - } else { - nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; - nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); - /* only attempt wrapping if we write the correct number of bytes */ - if (nwritten == repl_backlog_size - c->repl_curr_idx){ - long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); - if (nwrittenPart2 != -1) - nwritten += nwrittenPart2; + } + } - } + /* only update the replica's current index if bytes were sent */ + + // if (nrequested != nwritten){ + // serverLog(LL_NOTICE, "-----------------------------------------"); + // serverLog(LL_NOTICE, "AFTER THE FACT"); + // serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + // serverLog(LL_NOTICE, "actually written: %ld", nwritten); + // serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + // serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + // serverLog(LL_NOTICE, "-----------------------------------------"); + // } + + + if (nwritten == nrequested && g_pserver->repl_backlog_idx == repl_backlog_idx){ + c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ + } + else if (nwritten > 0) + c->repl_curr_idx = (c->repl_curr_idx + nwritten) % repl_backlog_size; + + serverAssert(c->repl_curr_idx < repl_backlog_size); + + /* only increment bytes if an error didn't occur */ + if (nwritten > 0){ + totwritten += nwritten; + c->repl_curr_off += nwritten; + } + + /* If the second part of a write didn't go through, we still need to register that */ + if (nwrittenPart2 == -1) nwritten = -1; } - /* only update the replica's current index if bytes were sent */ + if (c->flags & CLIENT_SLAVE && handler_installed) + serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); - // if (nrequested != nwritten){ - // serverLog(LL_NOTICE, "-----------------------------------------"); - // serverLog(LL_NOTICE, "AFTER THE FACT"); - // serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - // serverLog(LL_NOTICE, "actually written: %ld", nwritten); - // serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - // serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); 
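The send path above has two cases: normally the unsent region of the circular backlog is contiguous and one connWrite() suffices, but when the write index has wrapped past the replica's index the region is split into a tail segment and a head segment. A compact sketch of that logic, with a generic send callback standing in for connWrite():

    #include <cstddef>
    #include <functional>

    // Sketch only: sendFn stands in for connWrite(); it returns bytes written or -1.
    using SendFn = std::function<long long(const char *, size_t)>;

    // Send the backlog bytes between curr_idx and end_idx of a circular buffer of `size` bytes.
    long long sendBacklogRange(const char *backlog, long long size,
                               long long curr_idx, long long end_idx, SendFn sendFn) {
        if (end_idx >= curr_idx)  // common contiguous case
            return sendFn(backlog + curr_idx, (size_t)(end_idx - curr_idx));

        // Rare wrap-around case: tail of the buffer first, then the head.
        long long n = sendFn(backlog + curr_idx, (size_t)(size - curr_idx));
        if (n != size - curr_idx) return n;  // short write or error: let the caller retry later
        long long n2 = sendFn(backlog, (size_t)end_idx);
        return (n2 == -1) ? -1 : n + n2;
    }
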
- // serverLog(LL_NOTICE, "-----------------------------------------"); - // } - - - if (nwritten == nrequested && g_pserver->repl_backlog_idx == repl_backlog_idx){ - c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ - } - else if (nwritten > 0) - c->repl_curr_idx = (c->repl_curr_idx + nwritten) % repl_backlog_size; - - serverAssert(c->repl_curr_idx < repl_backlog_size); - - /* only increment bytes if an error didn't occur */ - if (nwritten > 0){ - totwritten += nwritten; - c->repl_curr_off += nwritten; - } - - /* If the second part of a write didn't go through, we still need to register that */ - if (nwrittenPart2 == -1) nwritten = -1; } - if (c->flags & CLIENT_SLAVE && handler_installed) - serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); - + // serverLog(LL_NOTICE, "rel client"); g_pserver->stat_net_output_bytes += totwritten; if (nwritten == -1) { if (connGetState(c->conn) == CONN_STATE_CONNECTED) { @@ -1834,7 +1839,7 @@ int writeToClient(client *c, int handler_installed) { if (!clientHasPendingReplies(c) && c->repl_curr_idx == -1) { if(c->flags & CLIENT_SLAVE && handler_installed){ serverLog(LL_NOTICE, "Uninstalling handler"); - serverLog(LL_NOTICE, "handler repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "handler repl_curr_idx: %lld, repl_backlog_size: %lld", c->repl_curr_idx, g_pserver->repl_backlog_size); serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); } c->sentlen = 0; diff --git a/src/replication.cpp b/src/replication.cpp index ad79f4887..d1181bdf4 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -56,9 +56,11 @@ void putSlaveOnline(client *replica); int cancelReplicationHandshake(redisMaster *mi); static void propagateMasterStaleKeys(); -/* gets the lowest offset amongst all of the replicas */ -long long getLowestOffsetAmongReplicas(){ +/* gets the lowest offset amongst all of the replicas and stores it globally*/ +void updateLowestOffsetAmongReplicas(){ serverAssert(GlobalLocksAcquired()); + serverAssert(!g_pserver->repl_backlog_lock.fOwnLock()); + // serverLog(LL_NOTICE, "off- have repl"); long long min_offset = LONG_LONG_MAX; listIter li; listNode *ln; @@ -69,16 +71,15 @@ long long getLowestOffsetAmongReplicas(){ if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; - if (replica->repl_curr_idx == -1) continue; - std::unique_lock ul(replica->lock, std::defer_lock); - if (FCorrectThread(replica)) - ul.lock(); + std::unique_lock ul(replica->lock); + // serverLog(LL_NOTICE, "off- acq client"); - min_offset = std::min(min_offset, replica->repl_curr_off); + min_offset = std::min(min_offset, replica->repl_curr_off); + // serverLog(LL_NOTICE, "off- rel client"); } /* return -1 if no other minimum was found */ - return min_offset == LONG_LONG_MAX ? -1 : min_offset; + g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? -1 : min_offset, std::memory_order_seq_cst); } /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case @@ -412,11 +413,12 @@ void freeReplicationBacklog(void) { * the backlog without incrementing the offset. 
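updateLowestOffsetAmongReplicas() above replaces the old per-call minimum scan with a cached value: it walks the replicas that can still consume data, takes the smallest repl_curr_off, and publishes it through an atomic so backlog writers can size the buffer without re-walking the list. A simplified sketch (plain records instead of client objects, and without the per-client locking the real function performs):

    #include <algorithm>
    #include <atomic>
    #include <climits>
    #include <vector>

    // Sketch only: minimal replica records in place of the client struct.
    struct ReplicaOffsetView {
        bool waiting_bgsave = false;  // SLAVE_STATE_WAIT_BGSAVE_START
        bool closing = false;         // CLIENT_CLOSE_ASAP
        long long repl_curr_off = -1;
    };

    // Store the lowest offset still needed by any eligible replica, or -1 if there is none.
    void updateLowestOffset(const std::vector<ReplicaOffsetView> &replicas,
                            std::atomic<long long> &lowest) {
        long long min_off = LLONG_MAX;
        for (const auto &r : replicas) {
            if (r.waiting_bgsave || r.closing) continue;
            min_off = std::min(min_off, r.repl_curr_off);
        }
        lowest.store(min_off == LLONG_MAX ? -1 : min_off, std::memory_order_seq_cst);
    }
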
*/ void feedReplicationBacklog(const void *ptr, size_t len) { serverAssert(GlobalLocksAcquired()); + serverAssert(g_pserver->repl_backlog_lock.fOwnLock()); const unsigned char *p = (const unsigned char*)ptr; if (g_pserver->repl_batch_idxStart >= 0) { /* we are lower bounded by the lower client offset or the offStart if all the clients are up to date */ - long long lower_bound = getLowestOffsetAmongReplicas(); + long long lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); if (lower_bound == -1) lower_bound = g_pserver->repl_batch_offStart; long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; @@ -441,10 +443,9 @@ void feedReplicationBacklog(const void *ptr, size_t len) { g_pserver->master_repl_offset += len; - - /* This is a circular buffer, so write as much data we can at every * iteration and rewind the "idx" index if we reach the limit. */ + while(len) { size_t thislen = g_pserver->repl_backlog_size - g_pserver->repl_backlog_idx; if (thislen > len) thislen = len; @@ -598,6 +599,8 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) serverAssert(!(listLength(slaves) != 0 && g_pserver->repl_backlog == NULL)); bool fSendRaw = !g_pserver->fActiveReplica; + updateLowestOffsetAmongReplicas(); + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); /* Send SELECT command to every replica if needed. */ if (g_pserver->replicaseldb != dictid) { @@ -619,7 +622,9 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) /* Add the SELECT command into the backlog. */ /* We don't do this for advanced replication because this will be done later when it adds the whole RREPLAY command */ - if (g_pserver->repl_backlog && fSendRaw) feedReplicationBacklogWithObject(selectcmd); + if (g_pserver->repl_backlog && fSendRaw) { + feedReplicationBacklogWithObject(selectcmd); + } if (dictid < 0 || dictid >= PROTO_SHARED_SELECT_CMDS) decrRefCount(selectcmd); @@ -632,7 +637,6 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) if (fSendRaw) { char aux[LONG_STR_SIZE+3]; - /* Add the multi bulk reply length. 
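feedReplicationBacklog() now bounds the backlog from the slowest replica's published offset rather than from per-client indices: the minimum it must preserve is everything from that offset up to and including the bytes about to be appended. A sketch of that bound; the grow-on-demand call at the end is an assumption about how the value is consumed, since the hunk does not show it:

    // Sketch only: the size the backlog must retain before appending `len` more bytes.
    long long requiredBacklogSize(long long master_repl_offset, long long len,
                                  long long repl_lowest_off, long long batch_off_start) {
        long long lower_bound = repl_lowest_off;          // slowest replica, or -1 if none
        if (lower_bound == -1) lower_bound = batch_off_start;
        // Everything from the slowest replica through the end of this append, inclusive.
        return master_repl_offset + len - lower_bound + 1;
    }

    // Assumed usage (not shown in the hunk): grow on demand before the copy.
    // if (requiredBacklogSize(...) > g_pserver->repl_backlog_size)
    //     resizeReplicationBacklogForClients(requiredBacklogSize(...));
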
*/ aux[0] = '*'; int multilen = ll2string(aux+1,sizeof(aux)-1,argc); @@ -759,7 +763,11 @@ void replicationFeedSlavesFromMasterStream(char *buf, size_t buflen) { printf("\n"); } - if (g_pserver->repl_backlog) feedReplicationBacklog(buf,buflen); + if (g_pserver->repl_backlog){ + updateLowestOffsetAmongReplicas(); + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + feedReplicationBacklog(buf,buflen); + } } void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, int argc) { @@ -4662,6 +4670,9 @@ void flushReplBacklogToClients() #ifdef BYPASS_BUFFER { + std::unique_lock asyncUl(replica->lock, std::defer_lock); + if (!FCorrectThread(replica)) + asyncUl.lock(); /* If we are online and the RDB has been sent, there is no need to feed the client buffer * We will send our replies directly from the replication backlog instead */ std::unique_lock tRDBLock (replica->transmittedRDBLock); @@ -4694,21 +4705,5 @@ void flushReplBacklogToClients() // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; - } else if (getLowestOffsetAmongReplicas() != -1){ - listIter li; - listNode *ln; - listRewind(g_pserver->slaves, &li); - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - - std::unique_lock ul(replica->lock, std::defer_lock); - if (FCorrectThread(replica)) - ul.lock(); - - /* try to force prepare client to write i guess? */ - if (replica->repl_curr_idx != -1){ - if (prepareClientToWrite(replica) != C_OK) continue; - } - } - } + } } diff --git a/src/server.cpp b/src/server.cpp index 9664a4a6b..439e1aeff 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2924,6 +2924,7 @@ void initServerConfig(void) { g_pserver->enable_multimaster = CONFIG_DEFAULT_ENABLE_MULTIMASTER; g_pserver->repl_syncio_timeout = CONFIG_REPL_SYNCIO_TIMEOUT; g_pserver->master_repl_offset = 0; + g_pserver->repl_lowest_off.store(-1, std::memory_order_seq_cst); /* Replication partial resync backlog */ g_pserver->repl_backlog = NULL; diff --git a/src/server.h b/src/server.h index 14005e7d5..da1fce52e 100644 --- a/src/server.h +++ b/src/server.h @@ -2241,6 +2241,8 @@ struct redisServer { int repl_diskless_load; /* Slave parse RDB directly from the socket. * see REPL_DISKLESS_LOAD_* enum */ int repl_diskless_sync_delay; /* Delay to start a diskless repl BGSAVE. 
*/ + std::atomic repl_lowest_off; /* The lowest offset amongst all clients + Updated before calls to feed the replication backlog */ /* Replication (replica) */ list *masters; int enable_multimaster; @@ -2838,6 +2840,7 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, void rdbPipeWriteHandlerConnRemoved(struct connection *conn); void replicationNotifyLoadedKey(redisDb *db, robj_roptr key, robj_roptr val, long long expire); void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long long expire); +void updateLowestOffsetAmongReplicas(void); /* Generic persistence functions */ void startLoadingFile(FILE* fp, const char * filename, int rdbflags); From 2a6848a65a513926d3da6608d334351ed6878089 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 2 Jun 2021 23:41:36 +0000 Subject: [PATCH 12/22] Sync works single threaded properly, passes all but one testcase (which hangs) Former-commit-id: 9a6ca3a5d906b9d87fe70652d218decbb2775ac1 --- src/Makefile | 2 +- src/networking.cpp | 165 ++++++++++++++++++++++++++------------------ src/replication.cpp | 106 +++++++++++----------------- src/server.h | 9 +-- 4 files changed, 145 insertions(+), 137 deletions(-) diff --git a/src/Makefile b/src/Makefile index 966ce4400..a0ee5fe2a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -15,7 +15,7 @@ release_hdr := $(shell sh -c './mkreleasehdr.sh') uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not') -OPTIMIZATION?=-O2 -flto +OPTIMIZATION?=-O2 DEPENDENCY_TARGETS=hiredis linenoise lua rocksdb NODEPS:=clean distclean diff --git a/src/networking.cpp b/src/networking.cpp index caefd6d1e..80120d0ca 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -146,6 +146,7 @@ client *createClient(connection *conn, int iel) { c->flags = 0; c->fPendingAsyncWrite = FALSE; c->fPendingAsyncWriteHandler = FALSE; + c->fPendingReplicaWrite = FALSE; c->ctime = c->lastinteraction = g_pserver->unixtime; /* If the default user does not require authentication, the user is * directly authenticated. */ @@ -221,6 +222,10 @@ void clientInstallWriteHandler(client *c) { /* Schedule the client to write the output buffers to the socket only * if not already done and, for slaves, if the replica can actually receive * writes at this stage. */ + + if (c->flags & CLIENT_SLAVE) + serverLog(LL_NOTICE, "installing write handler"); + if (!(c->flags & CLIENT_PENDING_WRITE) && (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) @@ -272,6 +277,9 @@ void clientInstallAsyncWriteHandler(client *c) { int prepareClientToWrite(client *c) { bool fAsync = !FCorrectThread(c); // Not async if we're on the right thread + if (c->flags & CLIENT_SLAVE) + serverLog(LL_NOTICE, "got into prepareClientToWrite"); + if (!fAsync) { serverAssert(c->conn == nullptr || c->lock.fOwnLock()); } else { @@ -302,7 +310,7 @@ int prepareClientToWrite(client *c) { /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ - if (!fAsync && !clientHasPendingReplies(c) && c->repl_curr_idx == -1) clientInstallWriteHandler(c); + if (!fAsync && !clientHasPendingReplies(c) && !c->fPendingReplicaWrite) clientInstallWriteHandler(c); if (fAsync && !(c->fPendingAsyncWrite)) clientInstallAsyncWriteHandler(c); /* Authorize the caller to queue in the output buffer of this client. 
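Patch 12 replaces the repl_curr_idx == -1 test with an explicit fPendingReplicaWrite flag, so prepareClientToWrite() only installs the synchronous write handler when a client has neither buffered replies nor an outstanding backlog write, and cross-thread callers fall back to the async handler as before. A condensed sketch of that decision, with plain booleans standing in for the client flags:

    // Sketch only: when prepareClientToWrite() should schedule which handler.
    struct WriteFlagsView {
        bool has_pending_replies;     // clientHasPendingReplies(c)
        bool pending_replica_write;   // c->fPendingReplicaWrite
        bool cross_thread;            // caller is not on the client's thread
        bool pending_async_write;     // c->fPendingAsyncWrite
    };

    enum class WriteSetup { InstallHandler, InstallAsyncHandler, Nothing };

    WriteSetup decideWriteSetup(const WriteFlagsView &c) {
        if (!c.cross_thread && !c.has_pending_replies && !c.pending_replica_write)
            return WriteSetup::InstallHandler;       // nothing queued yet: schedule a flush
        if (c.cross_thread && !c.pending_async_write)
            return WriteSetup::InstallAsyncHandler;  // cross-thread write: defer to owner thread
        return WriteSetup::Nothing;                  // already scheduled
    }
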
*/ @@ -320,7 +328,6 @@ void _clientAsyncReplyBufferReserve(client *c, size_t len) { clientReplyBlock *replyNew = (clientReplyBlock*)zmalloc(sizeof(clientReplyBlock) + newsize); replyNew->size = zmalloc_usable(replyNew) - sizeof(clientReplyBlock); replyNew->used = 0; - std::unique_lock tRDBLock (c->transmittedRDBLock); c->replyAsync = replyNew; } @@ -334,7 +341,6 @@ int _addReplyToBuffer(client *c, const char *s, size_t len) { if (fAsync) { serverAssert(GlobalLocksAcquired()); - std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync == nullptr || (c->replyAsync->size - c->replyAsync->used) < len) { if (c->replyAsync == nullptr) { @@ -1661,6 +1667,16 @@ client *lookupClientByID(uint64_t id) { return (c == raxNotFound) ? NULL : c; } +/* Compute the corresponding index from a replication backlog offset + * by taking the distance between the input offset and the replication backlog offset + * and applying that to the replication backlog index, wrapping around if the index + * becomes negative. + * TODO: Rewrite comment for new logic */ +long long getReplIndexFromOffset(long long offset){ + long long index = (offset - g_pserver->repl_backlog_start) % g_pserver->repl_backlog_size; + return index; +} + /* Write data in output buffers to client. Return C_OK if the client * is still valid after the call, C_ERR if it was freed because of some * error. If handler_installed is set, it will attempt to clear the @@ -1680,7 +1696,11 @@ int writeToClient(client *c, int handler_installed) { std::unique_locklock)> lock(c->lock); // serverLog(LL_NOTICE, "acq client"); + if (c->flags & CLIENT_SLAVE) + serverLog(LL_NOTICE, "writeToClient has happened"); + while(clientHasPendingReplies(c)) { + serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR); if (c->bufpos > 0) { nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); if (nwritten <= 0) break; @@ -1739,80 +1759,67 @@ int writeToClient(client *c, int handler_installed) { /* We can only directly read from the replication backlog if the client is a replica, so only attempt to do so if that's the case. */ if (c->flags & CLIENT_SLAVE) { - /* If there are no more pending replies, then we have transmitted the RDB. - * This means further replication commands will be taken straight from the - * replication backlog from now on. */ - std::unique_lock tRDBLock (c->transmittedRDBLock); - - if (c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ - c->transmittedRDB = true; - } - bool transmittedRDB = c->transmittedRDB; - tRDBLock.unlock(); - /* For replicas, we don't store all the information in the client buffer - * Most of the time (aside from immediately after synchronizing), we read - * from the replication backlog directly */ - if (c->repl_curr_idx != -1 && transmittedRDB){ - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + * We always read from the replication backlog directly */ + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); - /* copy global variables into local scope so if they change in between we don't care */ - long long repl_backlog_idx = g_pserver->repl_backlog_idx; - long long repl_backlog_size = g_pserver->repl_backlog_size; + /* Right now, we're bringing in the offStart into the scope + * If repl_batch_offStart is equal to -1, that means the mechanism is disabled + * which implies there is no data to flush and that the global offset is accurate */ + long long offStart = g_pserver->repl_batch_offStart == -1 ? 
g_pserver->master_repl_offset : g_pserver->repl_batch_offStart; + long long idxStart = getReplIndexFromOffset(offStart); + if (g_pserver->repl_batch_offStart != -1) + serverAssert(idxStart == g_pserver->repl_batch_idxStart); + else + serverAssert(idxStart == g_pserver->repl_backlog_idx); + + if (c->repl_curr_off != -1 && c->repl_curr_off != offStart){ + serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", + c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); + + long long curr_idx = getReplIndexFromOffset(c->repl_curr_off); long long nwrittenPart2 = 0; - - ssize_t nrequested; /* The number of bytes requested to write */ /* normal case with no wrap around */ - if (repl_backlog_idx >= c->repl_curr_idx){ - nrequested = repl_backlog_idx - c->repl_curr_idx; - nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_idx - c->repl_curr_idx); + if (idxStart >= curr_idx){ + nwritten = connWrite(c->conn, g_pserver->repl_backlog + curr_idx, idxStart - curr_idx); /* wrap around case, v. rare */ /* also v. buggy so there's that */ } else { - nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; - nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); + serverLog(LL_NOTICE, "ROAD OF RESISTANCE"); + nwritten = connWrite(c->conn, g_pserver->repl_backlog + curr_idx, g_pserver->repl_backlog_size - curr_idx); /* only attempt wrapping if we write the correct number of bytes */ - if (nwritten == repl_backlog_size - c->repl_curr_idx){ - long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); + if (nwritten == g_pserver->repl_backlog_size - curr_idx){ + long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, idxStart); if (nwrittenPart2 != -1) nwritten += nwrittenPart2; - } } - /* only update the replica's current index if bytes were sent */ - - // if (nrequested != nwritten){ - // serverLog(LL_NOTICE, "-----------------------------------------"); - // serverLog(LL_NOTICE, "AFTER THE FACT"); - // serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - // serverLog(LL_NOTICE, "actually written: %ld", nwritten); - // serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - // serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - // serverLog(LL_NOTICE, "-----------------------------------------"); - // } - - - if (nwritten == nrequested && g_pserver->repl_backlog_idx == repl_backlog_idx){ - c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ - } - else if (nwritten > 0) - c->repl_curr_idx = (c->repl_curr_idx + nwritten) % repl_backlog_size; - - serverAssert(c->repl_curr_idx < repl_backlog_size); - /* only increment bytes if an error didn't occur */ if (nwritten > 0){ totwritten += nwritten; c->repl_curr_off += nwritten; + if (1){ + serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", + c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); + } + serverAssert(c->repl_curr_off <= offStart); + /* If the client offset matches the global offset, we wrote all we needed to, + * in which case, there is no pending write */ + if (c->repl_curr_off == offStart){ + 
serverLog(LL_NOTICE, "good, %lld", offStart); + c->fPendingReplicaWrite = false; + } else { + serverLog(LL_NOTICE, "mismatch between repl_curr_off (%lld) and offStart (%lld)", c->repl_curr_off, offStart); + } } /* If the second part of a write didn't go through, we still need to register that */ if (nwrittenPart2 == -1) nwritten = -1; } - if (c->flags & CLIENT_SLAVE && handler_installed) - serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); + // if (c->flags & CLIENT_SLAVE && handler_installed) + // serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); } @@ -1836,12 +1843,12 @@ int writeToClient(client *c, int handler_installed) { * We just rely on data / pings received for timeout detection. */ if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } - if (!clientHasPendingReplies(c) && c->repl_curr_idx == -1) { - if(c->flags & CLIENT_SLAVE && handler_installed){ - serverLog(LL_NOTICE, "Uninstalling handler"); - serverLog(LL_NOTICE, "handler repl_curr_idx: %lld, repl_backlog_size: %lld", c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - } + if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) { + // if(c->flags & CLIENT_SLAVE && handler_installed){ + // serverLog(LL_NOTICE, "Uninstalling handler"); + // serverLog(LL_NOTICE, "handler repl_curr_idx: %lld, repl_backlog_size: %lld", c->repl_curr_idx, g_pserver->repl_backlog_size); + // serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + // } c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -1857,7 +1864,7 @@ int writeToClient(client *c, int handler_installed) { /* Write event handler. Just send data to the client. */ void sendReplyToClient(connection *conn) { client *c = (client*)connGetPrivateData(conn); - serverLog(LL_NOTICE, "called the sendreplytoclient"); + // serverLog(LL_NOTICE, "called the sendreplytoclient"); if (writeToClient(c,1) == C_ERR) { AeLocker ae; @@ -1886,7 +1893,6 @@ void ProcessPendingAsyncWrites() serverAssert(c->fPendingAsyncWrite); if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_CLOSE_AFTER_REPLY)) { - std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync != nullptr){ zfree(c->replyAsync); c->replyAsync = nullptr; @@ -1898,7 +1904,6 @@ void ProcessPendingAsyncWrites() /* since writes from master to replica can come directly from the replication backlog, * writes may have been signalled without having been copied to the replyAsync buffer, * thus causing the buffer to be NULL */ - std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync != nullptr){ int size = c->replyAsync->used; @@ -1919,7 +1924,6 @@ void ProcessPendingAsyncWrites() } c->fPendingAsyncWrite = FALSE; - tRDBLock.unlock(); // Now install the write event handler int ae_flags = AE_WRITABLE|AE_WRITE_THREADSAFE; /* For the fsync=always policy, we want that a given FD is never @@ -2032,8 +2036,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. 
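Once writeToClient() sends from the backlog it advances the replica's repl_curr_off by the bytes actually written and clears fPendingReplicaWrite only when the replica has caught up to the offset the backlog had been flushed to; the write handler then comes off, and handleClientsWithPendingWrites() reinstalls it whenever either buffered replies or a pending backlog write remain. A small sketch of that bookkeeping:

    // Sketch only: advancing a replica after sending `sent` bytes toward `off_start`,
    // where off_start is the flushed backlog offset captured at the start of the write.
    void onBacklogBytesSent(long long &repl_curr_off, bool &pending_replica_write,
                            long long sent, long long off_start) {
        if (sent <= 0) return;               // error or nothing written: caller handles it
        repl_curr_off += sent;
        // Invariant from the patch: a replica never runs ahead of the flushed offset.
        if (repl_curr_off == off_start)
            pending_replica_write = false;   // caught up: the write handler can be removed
    }
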
*/ - if (clientHasPendingReplies(c) || c->repl_curr_idx != -1) { - serverLog(LL_NOTICE, "Setting a write handler for later"); + if (clientHasPendingReplies(c) || c->fPendingReplicaWrite) { if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) { freeClientAsync(c); } @@ -2214,6 +2217,34 @@ static void setProtocolError(const char *errstr, client *c) { c->flags |= (CLIENT_CLOSE_AFTER_REPLY|CLIENT_PROTOCOL_ERROR); } +static void printQueryBuffer(client *c) { + if (cserver.verbosity <= LL_VERBOSE || c->flags & CLIENT_MASTER) { + sds client = catClientInfoString(sdsempty(),c); + + /* Sample some protocol to given an idea about what was inside. */ + char buf[PROTO_DUMP_LEN*2]; + if (sdslen(c->querybuf)-c->qb_pos < PROTO_DUMP_LEN) { + snprintf(buf,sizeof(buf),"%s", c->querybuf+c->qb_pos); + } else { + snprintf(buf,sizeof(buf),"%.*s (... more %zu bytes ...) %.*s", PROTO_DUMP_LEN/2, c->querybuf+c->qb_pos, sdslen(c->querybuf)-c->qb_pos-PROTO_DUMP_LEN, PROTO_DUMP_LEN/2, c->querybuf+sdslen(c->querybuf)-PROTO_DUMP_LEN/2); + } + + /* Remove non printable chars. */ + char *p = buf; + while (*p != '\0') { + if (!isprint(*p)) *p = '.'; + p++; + } + + /* Log all the client and protocol info. */ + int loglevel = (c->flags & CLIENT_MASTER) ? LL_WARNING : + LL_VERBOSE; + serverLog(loglevel, + "Query buffer from client %lu: %s. %s", c->id, client, buf); + sdsfree(client); + } +} + /* Process the query buffer for client 'c', setting up the client argument * vector for command execution. Returns C_OK if after running the function * the client has a well-formed ready to be processed command, otherwise @@ -2468,6 +2499,8 @@ void parseClientCommandBuffer(client *c) { } size_t cqueriesStart = c->vecqueuedcmd.size(); + // if (c->flags & CLIENT_MASTER) + // printQueryBuffer(c); if (c->reqtype == PROTO_REQ_INLINE) { if (processInlineBuffer(c) != C_OK) break; } else if (c->reqtype == PROTO_REQ_MULTIBULK) { diff --git a/src/replication.cpp b/src/replication.cpp index d1181bdf4..97638e833 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -88,18 +88,6 @@ int RDBGeneratedByReplication = 0; void resizeReplicationBacklogForClients(long long newsize); -void setReplIdx(client *c, long long idx, long long off){ - // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); - // serverLog(LL_NOTICE, "Repl Index started at: %lld", c->repl_curr_idx); - if (c->repl_curr_idx == -1 && off >= c->repl_curr_off){ - if (prepareClientToWrite(c) != C_OK) return; - c->repl_curr_idx = idx; - c->repl_curr_off = off; - } - // serverLog(LL_NOTICE, "Repl Index has become: %lld", c->repl_curr_idx); - -} - /* --------------------------- Utility functions ---------------------------- */ /* Return the pointer to a string representing the replica ip:listening_port @@ -232,6 +220,7 @@ void createReplicationBacklog(void) { g_pserver->repl_backlog = (char*)zmalloc(g_pserver->repl_backlog_size, MALLOC_LOCAL); g_pserver->repl_backlog_histlen = 0; g_pserver->repl_backlog_idx = 0; + g_pserver->repl_backlog_start = g_pserver->master_repl_offset; /* We don't have any data inside our buffer, but virtually the first * byte we have is the next byte that will be generated for the @@ -284,6 +273,7 @@ void resizeReplicationBacklog(long long newsize) { g_pserver->repl_backlog = backlog; g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; g_pserver->repl_batch_idxStart = 0; + g_pserver->repl_backlog_start = g_pserver->master_repl_offset; } else { 
zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = (char*)zmalloc(newsize); @@ -296,6 +286,7 @@ void resizeReplicationBacklog(long long newsize) { g_pserver->repl_backlog_size = newsize; } +long long getReplIndexFromOffset(long long offset); /* The above but for when clients need extra replication backlog because ??? */ void resizeReplicationBacklogForClients(long long newsize) { @@ -305,32 +296,8 @@ void resizeReplicationBacklogForClients(long long newsize) { serverLog(LL_NOTICE, "WE HAVE TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); /* get the critical client size, i.e. the size of the data unflushed to clients */ - long long earliest_off = LONG_LONG_MAX; - long long earliest_idx = -1; - listIter li; - listNode *ln; - listRewind(g_pserver->slaves, &li); - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - if (replica->repl_curr_off != -1 && replica->repl_curr_off < earliest_off){ - earliest_off = replica->repl_curr_off; - earliest_idx = replica->repl_curr_idx; - } - serverLog(LL_NOTICE, "repl_curr_idx: %lld, earlistidx: %lld", replica->repl_curr_idx, earliest_idx); - } - serverLog(LL_NOTICE, "We are starting with: master_repl_offset: %lld, repl_batch_offStart: %lld, earliest_off: %lld, " - "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, earliest_idx: %lld, repl_backlog_size: %lld", - g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, earliest_off, - g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, earliest_idx, g_pserver->repl_backlog_size - ); + long long earliest_off = g_pserver->repl_lowest_off.load(); - long long new_off = 0, new_idx = 0; - - /* if no earliest offset is found amongst the clients, they are all up to date with the flushed index */ - if (earliest_off == LONG_LONG_MAX && earliest_idx == -1){ - earliest_idx = g_pserver->repl_batch_idxStart; - earliest_off = g_pserver->repl_batch_offStart; - } if (g_pserver->repl_backlog != NULL) { /* What we actually do is to flush the old buffer and realloc a new @@ -339,17 +306,18 @@ void resizeReplicationBacklogForClients(long long newsize) { * worse often we need to alloc additional space before freeing the * old buffer. 
*/ - if (earliest_idx >= 0) { + if (earliest_off != -1) { // We need to keep critical data so we can't shrink less than the hot data in the buffer newsize = std::max(newsize, g_pserver->master_repl_offset - earliest_off); char *backlog = (char*)zmalloc(newsize); g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - earliest_off; + long long earliest_idx = getReplIndexFromOffset(earliest_off); if (g_pserver->repl_backlog_idx >= earliest_idx) { auto cbActiveBacklog = g_pserver->repl_backlog_idx - earliest_idx; memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbActiveBacklog); - serverLog(LL_NOTICE, "g_pserver->master_repl_offset: %lld, earliest_off: %lld, g_pserver->repl_backlog_idx: %lld, earliest_idx: %lld", - g_pserver->master_repl_offset, earliest_off, g_pserver->repl_backlog_idx, earliest_idx); + serverLog(LL_NOTICE, "g_pserver->master_repl_offset: %lld, earliest_off: %lld, g_pserver->repl_backlog_idx: %lld, earliest_idx: %lld, repl_backlog_start: %lld", + g_pserver->master_repl_offset, earliest_off, g_pserver->repl_backlog_idx, earliest_idx, g_pserver->repl_backlog_start); serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); } else { auto cbPhase1 = g_pserver->repl_backlog_size - earliest_idx; @@ -361,20 +329,10 @@ void resizeReplicationBacklogForClients(long long newsize) { zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = backlog; g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; - listRewind(g_pserver->slaves, &li); - /* Go through the clients and update their replication indicies */ - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - if (replica->repl_curr_idx != -1){ - replica->repl_curr_idx -= earliest_idx; - if (replica->repl_curr_idx < 0) - replica->repl_curr_idx += g_pserver->repl_backlog_size; - } - new_idx = replica->repl_curr_idx; - } g_pserver->repl_batch_idxStart -= earliest_idx; if (g_pserver->repl_batch_idxStart < 0) g_pserver->repl_batch_idxStart += g_pserver->repl_backlog_size; + g_pserver->repl_backlog_start = earliest_off; } else { zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = (char*)zmalloc(newsize); @@ -382,14 +340,15 @@ void resizeReplicationBacklogForClients(long long newsize) { g_pserver->repl_backlog_idx = 0; /* Next byte we have is... the next since the buffer is empty. 
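With repl_backlog_start recorded, indices no longer have to be tracked per client; they are derived from offsets on demand (index = (offset - start) % size), and a resize only needs to carry over the region that the slowest replica still requires. A compact sketch of the offset-to-index mapping and of copying the still-needed region into a new buffer (stand-alone helpers, not the exact resize routine):

    #include <cstring>

    // Sketch only: offset -> index mapping enabled by recording repl_backlog_start.
    long long replIndexFromOffset(long long offset, long long backlog_start, long long backlog_size) {
        return (offset - backlog_start) % backlog_size;
    }

    // Copy the still-needed region [earliest_idx, old_idx) of the old circular buffer
    // into the start of a new buffer; returns the number of bytes preserved.
    long long copyActiveRegion(const char *old_buf, long long old_size, long long old_idx,
                               long long earliest_idx, char *new_buf) {
        if (old_idx >= earliest_idx) {                      // contiguous region
            long long n = old_idx - earliest_idx;
            std::memcpy(new_buf, old_buf + earliest_idx, (size_t)n);
            return n;
        }
        long long tail = old_size - earliest_idx;           // wrapped: tail first, then head
        std::memcpy(new_buf, old_buf + earliest_idx, (size_t)tail);
        std::memcpy(new_buf + tail, old_buf, (size_t)old_idx);
        return tail + old_idx;
    }
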
*/ g_pserver->repl_backlog_off = g_pserver->master_repl_offset+1; + g_pserver->repl_backlog_start = g_pserver->master_repl_offset; } } g_pserver->repl_backlog_size = newsize; serverLog(LL_NOTICE, "We are ending with: master_repl_offset: %lld, repl_batch_offStart: %lld, new_off: %lld, " "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, new_idx: %lld, repl_backlog_size: %lld", - g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, new_off, - g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, new_idx, g_pserver->repl_backlog_size + g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, 0LL, + g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, 0LL, g_pserver->repl_backlog_size ); } @@ -456,11 +415,6 @@ void feedReplicationBacklog(const void *ptr, size_t len) { len -= thislen; p += thislen; g_pserver->repl_backlog_histlen += thislen; - // serverLog(LL_NOTICE, "Pt2 intermediate with: master_repl_offset: %lld, repl_batch_offStart: %lld, " - // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", - // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, - // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size - // ); } if (g_pserver->repl_backlog_histlen > g_pserver->repl_backlog_size) g_pserver->repl_backlog_histlen = g_pserver->repl_backlog_size; @@ -722,7 +676,7 @@ void replicationFeedSlaves(list *replicas, int dictid, robj **argv, int argc) { void showLatestBacklog(void) { if (g_pserver->repl_backlog == NULL) return; - long long dumplen = 256; + long long dumplen = 1024; if (g_pserver->repl_backlog_histlen < dumplen) dumplen = g_pserver->repl_backlog_histlen; @@ -813,7 +767,7 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, } decrRefCount(cmdobj); } - +#define BYPASS_PSYNC /* Feed the replica 'c' with the replication backlog starting from the * specified 'offset' up to the end of the backlog. */ long long addReplyReplicationBacklog(client *c, long long offset) { @@ -854,7 +808,8 @@ long long addReplyReplicationBacklog(client *c, long long offset) { len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); #ifdef BYPASS_PSYNC - setReplIdx(c, j, offset); + c->repl_curr_off = offset - 1; + serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", c->id, getClientPeerId(c), c->repl_curr_off); #else while(len) { long long thislen = @@ -900,6 +855,11 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) { replica->psync_initial_offset = offset; replica->replstate = SLAVE_STATE_WAIT_BGSAVE_END; + + replica->repl_curr_off = offset; + + serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); + /* We are going to accumulate the incremental changes for this * replica as well. Set replicaseldb to -1 in order to force to re-emit * a SELECT statement in the replication stream. */ @@ -2006,7 +1966,6 @@ void replicationCreateMasterClient(redisMaster *mi, connection *conn, int dbid) mi->master->reploff_skipped = 0; mi->master->read_reploff = mi->master->reploff; mi->master->puser = NULL; /* This client can do everything. 
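Under BYPASS_PSYNC a partial resynchronization no longer copies the requested span of the backlog into the replica's reply buffer; it simply records the replica's offset just before the requested byte, and a follow-up patch in this series additionally queues the pending write, so the normal writeToClient() path streams the data from the backlog. A sketch of the two behaviours side by side (copyBacklogSpanToReplyBuffer is a hypothetical stand-in for the old addReply loop):

    // Sketch only: partial resync setup with and without the bypass.
    struct ResyncTargetView {
        long long repl_curr_off = -1;
        bool pending_replica_write = false;
    };

    void setupPartialResync(ResyncTargetView &replica, long long requested_offset, bool bypass_psync) {
        if (bypass_psync) {
            // Position the replica just before the requested offset; the backlog send
            // path delivers everything from there. The real code also calls
            // prepareClientToWrite() so the write actually gets scheduled.
            replica.repl_curr_off = requested_offset - 1;
            replica.pending_replica_write = true;
        } else {
            // Legacy behaviour: copy the span out of the backlog into the reply buffer.
            // copyBacklogSpanToReplyBuffer(replica, requested_offset);  // hypothetical helper
        }
    }
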
*/ - memcpy(mi->master->uuid, mi->master_uuid, UUID_BINARY_LEN); memset(mi->master_uuid, 0, UUID_BINARY_LEN); // make sure people don't use this temp storage buffer @@ -4652,12 +4611,17 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); + serverLog(LL_NOTICE, "the master repl offset is %lld", g_pserver->master_repl_offset); + showLatestBacklog(); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); + // serverLog(LL_NOTICE, "client %lu is in the party", replica->id); + + // serverLog(LL_NOTICE, "is there a write pending for %lu, %d", replica->id, replica->fPendingReplicaWrite); if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; @@ -4675,11 +4639,21 @@ void flushReplBacklogToClients() asyncUl.lock(); /* If we are online and the RDB has been sent, there is no need to feed the client buffer * We will send our replies directly from the replication backlog instead */ - std::unique_lock tRDBLock (replica->transmittedRDBLock); - if (replica->replstate == SLAVE_STATE_ONLINE && replica->transmittedRDB){ - setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); - continue; + if (replica->repl_curr_off == -1){ + replica->repl_curr_off = g_pserver->repl_batch_offStart; + + serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); + } + + /* Only if the there isn't already a pending write do we prepare the client to write */ + if (!replica->fPendingReplicaWrite){ + serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); + prepareClientToWrite(replica); + replica->fPendingReplicaWrite = true; + } + + continue; } #endif if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { diff --git a/src/server.h b/src/server.h index da1fce52e..9fdf5e0ef 100644 --- a/src/server.h +++ b/src/server.h @@ -1516,8 +1516,11 @@ struct client { long long psync_initial_offset; /* FULLRESYNC reply offset other slaves copying this replica output buffer should use. */ + long long repl_curr_idx = -1; /* Replication index sent, if this is a replica */ long long repl_curr_off = -1; + int fPendingReplicaWrite; + char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ int slave_listening_port; /* As configured with: REPLCONF listening-port */ char slave_ip[NET_IP_STR_LEN]; /* Optionally given by REPLCONF ip-address */ @@ -1577,12 +1580,8 @@ struct client { robj **argv; size_t argv_len_sumActive = 0; - bool transmittedRDB = false; /* Have we finished transmitting the RDB to this replica? 
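flushReplBacklogToClients() becomes the place where a replica is attached to the backlog stream: a replica that has never been positioned is pointed at the start of the current batch, and at most one pending backlog write is queued for it per flush. A minimal sketch of that per-replica step (prepareClientToWrite() is elided):

    // Sketch only: per-replica bookkeeping done while flushing the backlog.
    struct ReplicaFlushView {
        long long repl_curr_off = -1;
        bool pending_replica_write = false;
    };

    void onBacklogFlush(ReplicaFlushView &r, long long batch_off_start) {
        // A replica that has never been positioned starts at the batch's first offset.
        if (r.repl_curr_off == -1)
            r.repl_curr_off = batch_off_start;
        // Queue exactly one pending write; the real code calls prepareClientToWrite() first.
        if (!r.pending_replica_write)
            r.pending_replica_write = true;
    }
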
*/ - /* If so, we can read from the replication backlog instead of the client buffer */ - // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); - fastlock transmittedRDBLock {"transmittedRDB"}; size_t argv_len_sum() const; }; @@ -2229,6 +2228,8 @@ struct redisServer { that is the next byte will'll write to.*/ long long repl_backlog_off; /* Replication "master offset" of first byte in the replication backlog buffer.*/ + long long repl_backlog_start; /* Used to compute indicies from offsets + basically, index = (offset - start) % size */ fastlock repl_backlog_lock {"replication backlog"}; time_t repl_backlog_time_limit; /* Time without slaves after the backlog gets released. */ From 2e9c7aed031f5822ddbe955803b6a09c6c1a9aca Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 3 Jun 2021 20:44:32 +0000 Subject: [PATCH 13/22] Single threaded tests work now Former-commit-id: 0e760d7c71231c7f52102909a31fc8db1b3e2860 --- src/networking.cpp | 2 +- src/replication.cpp | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/networking.cpp b/src/networking.cpp index 80120d0ca..e8ede3338 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -3419,7 +3419,7 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { * that writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long getClientReplicationBacklogSharedUsage(client *c) { - return (!(c->flags & CLIENT_SLAVE) || c->repl_curr_idx == -1) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; + return (!(c->flags & CLIENT_SLAVE) || !c->fPendingReplicaWrite ) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; } /* This function returns the number of bytes that Redis is diff --git a/src/replication.cpp b/src/replication.cpp index 97638e833..a7a2aa79e 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -810,6 +810,10 @@ long long addReplyReplicationBacklog(client *c, long long offset) { #ifdef BYPASS_PSYNC c->repl_curr_off = offset - 1; serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", c->id, getClientPeerId(c), c->repl_curr_off); + + /* Force the partial sync to be queued */ + prepareClientToWrite(c); + c->fPendingReplicaWrite = true; #else while(len) { long long thislen = From 667d2763c0df3ca48b52949a365d3237dbcc0c52 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 3 Jun 2021 21:47:33 +0000 Subject: [PATCH 14/22] Removed unused variables Former-commit-id: 48663bc480f7279a94c68aeebdd9721ca64f7038 --- src/config.cpp | 1 - src/evict.cpp | 1 - src/replication.cpp | 2 -- src/server.h | 6 +----- 4 files changed, 1 insertion(+), 9 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index b546ef607..9d7f14007 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -2347,7 +2347,6 @@ static int updateReplBacklogSize(long long val, long long prev, const char **err UNUSED(err); g_pserver->repl_backlog_size = prev; resizeReplicationBacklog(val); - g_pserver->repl_backlog_config_size = g_pserver->repl_backlog_size; return 1; } diff --git a/src/evict.cpp b/src/evict.cpp index e7f0a10ef..7ec223f6d 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -399,7 +399,6 @@ size_t freeMemoryGetNotCountedMemory(void) { /* also don't count the replication backlog memory * that's where the replication clients get their memory from */ - // overhead += (g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size); 
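For illustration, the mapping that the new repl_backlog_start field enables can be sketched on its own, with the state passed in explicitly rather than read from g_pserver (the helper name here is hypothetical):

    /* Minimal sketch of "index = (offset - start) % size" from the comment above,
     * assuming offsets at or beyond the recorded start. */
    long long replIndexFromOffset(long long offset, long long start, long long size) {
        return (offset - start) % size;
    }
    /* e.g. with start = 1000 and size = 64: offset 1000 -> 0, 1063 -> 63, 1064 -> 0 again. */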
overhead += g_pserver->repl_backlog_size; if (g_pserver->aof_state != AOF_OFF) { diff --git a/src/replication.cpp b/src/replication.cpp index a7a2aa79e..3a48963ab 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -1129,7 +1129,6 @@ void syncCommand(client *c) { if (!strcasecmp((const char*)ptrFromObj(c->argv[0]),"psync")) { if (masterTryPartialResynchronization(c) == C_OK) { g_pserver->stat_sync_partial_ok++; - // c->repl_curr_idx = g_pserver->repl_backlog_idx; return; /* No full resync needed, return. */ } else { char *master_replid = (char*)ptrFromObj(c->argv[1]); @@ -1157,7 +1156,6 @@ void syncCommand(client *c) { connDisableTcpNoDelay(c->conn); /* Non critical if it fails. */ c->repldbfd = -1; c->flags |= CLIENT_SLAVE; - // c->repl_curr_idx = g_pserver->repl_backlog_idx; listAddNodeTail(g_pserver->slaves,c); /* Create the replication backlog if needed. */ diff --git a/src/server.h b/src/server.h index 9fdf5e0ef..2aba985ed 100644 --- a/src/server.h +++ b/src/server.h @@ -1517,8 +1517,7 @@ struct client { copying this replica output buffer should use. */ - long long repl_curr_idx = -1; /* Replication index sent, if this is a replica */ - long long repl_curr_off = -1; + long long repl_curr_off = -1; /* Replication offset of the client, only if it's a replica*/ int fPendingReplicaWrite; char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ @@ -2416,9 +2415,6 @@ struct redisServer { uint16_t rglockSamples[s_lockContentionSamples]; unsigned ilockRingHead = 0; - long long repl_backlog_config_size = 1024*1024; /* This is a hack to ignore the resizing of the replication backlog - when using it as a defacto for the client buffer */ - bool FRdbSaveInProgress() const { return rdbThreadVars.fRdbThreadActive; } }; From da0b7a3900ba50b37a2e3ac0cac1196aa19d734d Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Fri, 4 Jun 2021 20:09:47 +0000 Subject: [PATCH 15/22] Seems to pass multithreaded test cases, thank the lord Former-commit-id: 6cbf70cfff5735f3d4ef2e980945b4b1a1f85971 --- src/networking.cpp | 19 +++++++++---------- src/replication.cpp | 15 +++++++++------ src/server.h | 1 + 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index e8ede3338..cead76998 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -277,8 +277,9 @@ void clientInstallAsyncWriteHandler(client *c) { int prepareClientToWrite(client *c) { bool fAsync = !FCorrectThread(c); // Not async if we're on the right thread - if (c->flags & CLIENT_SLAVE) + if (c->flags & CLIENT_SLAVE){ serverLog(LL_NOTICE, "got into prepareClientToWrite"); + } if (!fAsync) { serverAssert(c->conn == nullptr || c->lock.fOwnLock()); @@ -1758,7 +1759,7 @@ int writeToClient(client *c, int handler_installed) { /* We can only directly read from the replication backlog if the client is a replica, so only attempt to do so if that's the case. 
*/ - if (c->flags & CLIENT_SLAVE) { + if (c->flags & CLIENT_SLAVE && !(c->flags & CLIENT_MONITOR)) { /* For replicas, we don't store all the information in the client buffer * We always read from the replication backlog directly */ std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); @@ -1766,14 +1767,12 @@ int writeToClient(client *c, int handler_installed) { /* Right now, we're bringing in the offStart into the scope * If repl_batch_offStart is equal to -1, that means the mechanism is disabled * which implies there is no data to flush and that the global offset is accurate */ - long long offStart = g_pserver->repl_batch_offStart == -1 ? g_pserver->master_repl_offset : g_pserver->repl_batch_offStart; + // long long offStart = g_pserver->repl_batch_offStart == -1 ? g_pserver->master_repl_offset : g_pserver->repl_batch_offStart; + long long offStart = c->repl_end_off; long long idxStart = getReplIndexFromOffset(offStart); - if (g_pserver->repl_batch_offStart != -1) - serverAssert(idxStart == g_pserver->repl_batch_idxStart); - else - serverAssert(idxStart == g_pserver->repl_backlog_idx); - - if (c->repl_curr_off != -1 && c->repl_curr_off != offStart){ + + serverAssert(c->repl_curr_off != -1); + if (c->repl_curr_off != offStart){ serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); @@ -1846,7 +1845,7 @@ int writeToClient(client *c, int handler_installed) { if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) { // if(c->flags & CLIENT_SLAVE && handler_installed){ // serverLog(LL_NOTICE, "Uninstalling handler"); - // serverLog(LL_NOTICE, "handler repl_curr_idx: %lld, repl_backlog_size: %lld", c->repl_curr_idx, g_pserver->repl_backlog_size); + // serverLog(LL_NOTICE, "repl_backlog_size: %lld", g_pserver->repl_backlog_size); // serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); // } c->sentlen = 0; diff --git a/src/replication.cpp b/src/replication.cpp index 3a48963ab..96bf161f9 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -382,7 +382,9 @@ void feedReplicationBacklog(const void *ptr, size_t len) { lower_bound = g_pserver->repl_batch_offStart; long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { + g_pserver->repl_backlog_lock.unlock(); flushReplBacklogToClients(); + g_pserver->repl_backlog_lock.lock(); minimumsize = g_pserver->master_repl_offset + len - lower_bound +1; if (minimumsize > g_pserver->repl_backlog_size) { @@ -809,6 +811,7 @@ long long addReplyReplicationBacklog(client *c, long long offset) { serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); #ifdef BYPASS_PSYNC c->repl_curr_off = offset - 1; + c->repl_end_off = g_pserver->master_repl_offset; serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", c->id, getClientPeerId(c), c->repl_curr_off); /* Force the partial sync to be queued */ @@ -861,6 +864,7 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) { replica->replstate = SLAVE_STATE_WAIT_BGSAVE_END; replica->repl_curr_off = offset; + replica->repl_end_off = g_pserver->master_repl_offset; serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); @@ -4634,19 +4638,18 @@ void flushReplBacklogToClients() 
fAsyncWrite = true; + /* If we are online and the RDB has been sent, there is no need to feed the client buffer + * We will send our replies directly from the replication backlog instead */ #ifdef BYPASS_BUFFER { std::unique_lock asyncUl(replica->lock, std::defer_lock); if (!FCorrectThread(replica)) asyncUl.lock(); - /* If we are online and the RDB has been sent, there is no need to feed the client buffer - * We will send our replies directly from the replication backlog instead */ - if (replica->repl_curr_off == -1){ - replica->repl_curr_off = g_pserver->repl_batch_offStart; - serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); + /* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */ + serverAssert(replica->repl_curr_off != -1); - } + replica->repl_end_off = g_pserver->master_repl_offset; /* Only if the there isn't already a pending write do we prepare the client to write */ if (!replica->fPendingReplicaWrite){ diff --git a/src/server.h b/src/server.h index 2aba985ed..64a2ca515 100644 --- a/src/server.h +++ b/src/server.h @@ -1518,6 +1518,7 @@ struct client { should use. */ long long repl_curr_off = -1; /* Replication offset of the client, only if it's a replica*/ + long long repl_end_off = -1; /* Replication offset to write to */ int fPendingReplicaWrite; char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ From 9db8556e91d46c5e2fb7f96ea5fb3880d56274aa Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Tue, 8 Jun 2021 23:10:53 +0000 Subject: [PATCH 16/22] Cleaned up code a bit, need to rewrite some comments to reflect new behaviour Former-commit-id: 850ec766cd71614ce9e61c12414545cd212d3878 --- src/evict.cpp | 1 - src/networking.cpp | 108 ++++---------------------- src/replication.cpp | 179 +++++++------------------------------------- src/server.cpp | 2 - src/server.h | 1 - 5 files changed, 43 insertions(+), 248 deletions(-) diff --git a/src/evict.cpp b/src/evict.cpp index 7ec223f6d..54153dc27 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -522,7 +522,6 @@ int freeMemoryIfNeeded(bool fQuickCycle, bool fPreSnapshot) { if (g_pserver->maxmemory_policy == MAXMEMORY_NO_EVICTION) goto cant_free; /* We need to free memory, but policy forbids. */ - serverLog(LL_NOTICE, "evicting i guess lol, the overhead was %ld, the repl_backlog_size, %lld", freeMemoryGetNotCountedMemory(), g_pserver->repl_backlog_size); while (mem_freed < mem_tofree) { int j, k, i; static unsigned int next_db = 0; diff --git a/src/networking.cpp b/src/networking.cpp index cead76998..aba1f1705 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -223,9 +223,6 @@ void clientInstallWriteHandler(client *c) { * if not already done and, for slaves, if the replica can actually receive * writes at this stage. 
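With repl_curr_off and repl_end_off both kept on the client, each replica carries its own window into the shared backlog. A minimal sketch of that idea, using hypothetical names rather than the real client struct:

    struct ReplicaWindow {
        long long curr_off = -1;  /* last offset already written to this replica's socket */
        long long end_off  = -1;  /* offset the replica is currently allowed to read up to */

        bool hasPendingWrite() const { return curr_off != -1 && curr_off < end_off; }
        long long pendingBytes() const { return hasPendingWrite() ? end_off - curr_off : 0; }
    };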
*/ - if (c->flags & CLIENT_SLAVE) - serverLog(LL_NOTICE, "installing write handler"); - if (!(c->flags & CLIENT_PENDING_WRITE) && (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) @@ -277,10 +274,6 @@ void clientInstallAsyncWriteHandler(client *c) { int prepareClientToWrite(client *c) { bool fAsync = !FCorrectThread(c); // Not async if we're on the right thread - if (c->flags & CLIENT_SLAVE){ - serverLog(LL_NOTICE, "got into prepareClientToWrite"); - } - if (!fAsync) { serverAssert(c->conn == nullptr || c->lock.fOwnLock()); } else { @@ -1695,10 +1688,6 @@ int writeToClient(client *c, int handler_installed) { serverAssertDebug(FCorrectThread(c)); std::unique_locklock)> lock(c->lock); - // serverLog(LL_NOTICE, "acq client"); - - if (c->flags & CLIENT_SLAVE) - serverLog(LL_NOTICE, "writeToClient has happened"); while(clientHasPendingReplies(c)) { serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR); @@ -1710,7 +1699,6 @@ int writeToClient(client *c, int handler_installed) { /* If the buffer was sent, set bufpos to zero to continue with * the remainder of the reply. */ - // serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); if ((int)c->sentlen == c->bufpos) { c->bufpos = 0; c->sentlen = 0; @@ -1764,33 +1752,24 @@ int writeToClient(client *c, int handler_installed) { * We always read from the replication backlog directly */ std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); - /* Right now, we're bringing in the offStart into the scope - * If repl_batch_offStart is equal to -1, that means the mechanism is disabled - * which implies there is no data to flush and that the global offset is accurate */ - // long long offStart = g_pserver->repl_batch_offStart == -1 ? g_pserver->master_repl_offset : g_pserver->repl_batch_offStart; - long long offStart = c->repl_end_off; - long long idxStart = getReplIndexFromOffset(offStart); + long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); serverAssert(c->repl_curr_off != -1); - if (c->repl_curr_off != offStart){ - serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", - c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); - - long long curr_idx = getReplIndexFromOffset(c->repl_curr_off); - long long nwrittenPart2 = 0; + if (c->repl_curr_off != c->repl_end_off){ + long long repl_curr_idx = getReplIndexFromOffset(c->repl_curr_off); + long long nwritten2ndStage = 0; /* How much was written from the start of the replication backlog + * in the event of a wrap around write */ /* normal case with no wrap around */ - if (idxStart >= curr_idx){ - nwritten = connWrite(c->conn, g_pserver->repl_backlog + curr_idx, idxStart - curr_idx); - /* wrap around case, v. rare */ - /* also v. 
buggy so there's that */ + if (repl_end_idx >= repl_curr_idx){ + nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, repl_end_idx - repl_curr_idx); + /* wrap around case */ } else { - serverLog(LL_NOTICE, "ROAD OF RESISTANCE"); - nwritten = connWrite(c->conn, g_pserver->repl_backlog + curr_idx, g_pserver->repl_backlog_size - curr_idx); + nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, g_pserver->repl_backlog_size - repl_curr_idx); /* only attempt wrapping if we write the correct number of bytes */ - if (nwritten == g_pserver->repl_backlog_size - curr_idx){ - long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, idxStart); - if (nwrittenPart2 != -1) - nwritten += nwrittenPart2; + if (nwritten == g_pserver->repl_backlog_size - repl_curr_idx){ + nwritten2ndStage = connWrite(c->conn, g_pserver->repl_backlog, repl_end_idx); + if (nwritten2ndStage != -1) + nwritten += nwritten2ndStage; } } @@ -1798,31 +1777,19 @@ int writeToClient(client *c, int handler_installed) { if (nwritten > 0){ totwritten += nwritten; c->repl_curr_off += nwritten; - if (1){ - serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", - c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); - } - serverAssert(c->repl_curr_off <= offStart); + serverAssert(c->repl_curr_off <= c->repl_end_off); /* If the client offset matches the global offset, we wrote all we needed to, * in which case, there is no pending write */ - if (c->repl_curr_off == offStart){ - serverLog(LL_NOTICE, "good, %lld", offStart); + if (c->repl_curr_off == c->repl_end_off){ c->fPendingReplicaWrite = false; - } else { - serverLog(LL_NOTICE, "mismatch between repl_curr_off (%lld) and offStart (%lld)", c->repl_curr_off, offStart); } } /* If the second part of a write didn't go through, we still need to register that */ - if (nwrittenPart2 == -1) nwritten = -1; + if (nwritten2ndStage == -1) nwritten = -1; } - - // if (c->flags & CLIENT_SLAVE && handler_installed) - // serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); - } - // serverLog(LL_NOTICE, "rel client"); g_pserver->stat_net_output_bytes += totwritten; if (nwritten == -1) { if (connGetState(c->conn) == CONN_STATE_CONNECTED) { @@ -1843,11 +1810,6 @@ int writeToClient(client *c, int handler_installed) { if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) { - // if(c->flags & CLIENT_SLAVE && handler_installed){ - // serverLog(LL_NOTICE, "Uninstalling handler"); - // serverLog(LL_NOTICE, "repl_backlog_size: %lld", g_pserver->repl_backlog_size); - // serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - // } c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -1863,7 +1825,6 @@ int writeToClient(client *c, int handler_installed) { /* Write event handler. Just send data to the client. 
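The wrap-around branch above is the core of the bypass: the range [repl_curr_idx, repl_end_idx) may straddle the physical end of the backlog, in which case it is sent in two stages. A standalone sketch of the same copy, with a hypothetical helper name and connWrite as used elsewhere in this patch:

    /* Sketch: send ring[curr_idx, end_idx) over conn; the range may wrap.
     * Returns bytes written, or -1 on error, mirroring writeToClient above. */
    ssize_t writeRingRange(connection *conn, const char *ring, long long ring_size,
                           long long curr_idx, long long end_idx) {
        if (end_idx >= curr_idx)                        /* contiguous case */
            return connWrite(conn, ring + curr_idx, end_idx - curr_idx);
        /* wrapped case: tail of the ring first, then the head */
        ssize_t n = connWrite(conn, ring + curr_idx, ring_size - curr_idx);
        if (n != ring_size - curr_idx) return n;        /* short write or error: stop here */
        ssize_t n2 = connWrite(conn, ring, end_idx);
        return (n2 == -1) ? -1 : n + n2;
    }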
*/ void sendReplyToClient(connection *conn) { client *c = (client*)connGetPrivateData(conn); - // serverLog(LL_NOTICE, "called the sendreplytoclient"); if (writeToClient(c,1) == C_ERR) { AeLocker ae; @@ -1997,7 +1958,6 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { auto vec = std::move(g_pserver->rgthreadvar[iel].clients_pending_write); processed += (int)vec.size(); - // serverLog(LL_NOTICE, "entered handleClientsWithPendingWrites"); for (client *c : vec) { serverAssertDebug(FCorrectThread(c)); @@ -2013,12 +1973,6 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* Don't write to clients that are going to be closed anyway. */ if (c->flags & CLIENT_CLOSE_ASAP) continue; - // if (c->flags & CLIENT_SLAVE){ - // if(clientHasPendingReplies(c)) - // serverLog(LL_NOTICE, "somehow the client buffer has these values: %s", c->buf); - // serverLog(LL_NOTICE, "LOL"); - // } - /* Try to write buffers to the client socket. */ if (writeToClient(c,0) == C_ERR) { @@ -2216,34 +2170,6 @@ static void setProtocolError(const char *errstr, client *c) { c->flags |= (CLIENT_CLOSE_AFTER_REPLY|CLIENT_PROTOCOL_ERROR); } -static void printQueryBuffer(client *c) { - if (cserver.verbosity <= LL_VERBOSE || c->flags & CLIENT_MASTER) { - sds client = catClientInfoString(sdsempty(),c); - - /* Sample some protocol to given an idea about what was inside. */ - char buf[PROTO_DUMP_LEN*2]; - if (sdslen(c->querybuf)-c->qb_pos < PROTO_DUMP_LEN) { - snprintf(buf,sizeof(buf),"%s", c->querybuf+c->qb_pos); - } else { - snprintf(buf,sizeof(buf),"%.*s (... more %zu bytes ...) %.*s", PROTO_DUMP_LEN/2, c->querybuf+c->qb_pos, sdslen(c->querybuf)-c->qb_pos-PROTO_DUMP_LEN, PROTO_DUMP_LEN/2, c->querybuf+sdslen(c->querybuf)-PROTO_DUMP_LEN/2); - } - - /* Remove non printable chars. */ - char *p = buf; - while (*p != '\0') { - if (!isprint(*p)) *p = '.'; - p++; - } - - /* Log all the client and protocol info. */ - int loglevel = (c->flags & CLIENT_MASTER) ? LL_WARNING : - LL_VERBOSE; - serverLog(loglevel, - "Query buffer from client %lu: %s. %s", c->id, client, buf); - sdsfree(client); - } -} - /* Process the query buffer for client 'c', setting up the client argument * vector for command execution. Returns C_OK if after running the function * the client has a well-formed ready to be processed command, otherwise @@ -2498,8 +2424,6 @@ void parseClientCommandBuffer(client *c) { } size_t cqueriesStart = c->vecqueuedcmd.size(); - // if (c->flags & CLIENT_MASTER) - // printQueryBuffer(c); if (c->reqtype == PROTO_REQ_INLINE) { if (processInlineBuffer(c) != C_OK) break; } else if (c->reqtype == PROTO_REQ_MULTIBULK) { diff --git a/src/replication.cpp b/src/replication.cpp index 96bf161f9..ebdb8af78 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -60,7 +60,6 @@ static void propagateMasterStaleKeys(); void updateLowestOffsetAmongReplicas(){ serverAssert(GlobalLocksAcquired()); serverAssert(!g_pserver->repl_backlog_lock.fOwnLock()); - // serverLog(LL_NOTICE, "off- have repl"); long long min_offset = LONG_LONG_MAX; listIter li; listNode *ln; @@ -73,14 +72,13 @@ void updateLowestOffsetAmongReplicas(){ if (replica->flags & CLIENT_CLOSE_ASAP) continue; std::unique_lock ul(replica->lock); - // serverLog(LL_NOTICE, "off- acq client"); - min_offset = std::min(min_offset, replica->repl_curr_off); - // serverLog(LL_NOTICE, "off- rel client"); + min_offset = std::min(min_offset, replica->repl_curr_off); } /* return -1 if no other minimum was found */ g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? 
-1 : min_offset, std::memory_order_seq_cst); } + /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case * the instance is configured to have no persistence. */ @@ -232,6 +230,8 @@ void createReplicationBacklog(void) { g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; } +long long getReplIndexFromOffset(long long offset); + /* This function is called when the user modifies the replication backlog * size at runtime. It is up to the function to both update the * g_pserver->repl_backlog_size and to resize the buffer and setup it so that @@ -243,8 +243,6 @@ void resizeReplicationBacklog(long long newsize) { newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; if (g_pserver->repl_backlog_size == newsize) return; - serverLog(LL_NOTICE, "WE HAD TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); - if (g_pserver->repl_backlog != NULL) { /* What we actually do is to flush the old buffer and realloc a new * empty one. It will refill with new data incrementally. @@ -252,59 +250,8 @@ void resizeReplicationBacklog(long long newsize) { * worse often we need to alloc additional space before freeing the * old buffer. */ - if (g_pserver->repl_batch_idxStart >= 0) { - // We need to keep critical data so we can't shrink less than the hot data in the buffer - newsize = std::max(newsize, g_pserver->master_repl_offset - g_pserver->repl_batch_offStart); - char *backlog = (char*)zmalloc(newsize); - g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - g_pserver->repl_batch_offStart; - - if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { - auto cbActiveBacklog = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; - memcpy(backlog, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbActiveBacklog); - serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); - } else { - auto cbPhase1 = g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart; - memcpy(backlog, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1); - memcpy(backlog + cbPhase1, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); - auto cbActiveBacklog = cbPhase1 + g_pserver->repl_backlog_idx; - serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); - } - zfree(g_pserver->repl_backlog); - g_pserver->repl_backlog = backlog; - g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; - g_pserver->repl_batch_idxStart = 0; - g_pserver->repl_backlog_start = g_pserver->master_repl_offset; - } else { - zfree(g_pserver->repl_backlog); - g_pserver->repl_backlog = (char*)zmalloc(newsize); - g_pserver->repl_backlog_histlen = 0; - g_pserver->repl_backlog_idx = 0; - /* Next byte we have is... the next since the buffer is empty. */ - g_pserver->repl_backlog_off = g_pserver->master_repl_offset+1; - } - } - g_pserver->repl_backlog_size = newsize; -} - -long long getReplIndexFromOffset(long long offset); - -/* The above but for when clients need extra replication backlog because ??? */ -void resizeReplicationBacklogForClients(long long newsize) { - if (newsize < CONFIG_REPL_BACKLOG_MIN_SIZE) - newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; - if (g_pserver->repl_backlog_size == newsize) return; - - serverLog(LL_NOTICE, "WE HAVE TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); - /* get the critical client size, i.e. 
the size of the data unflushed to clients */ - long long earliest_off = g_pserver->repl_lowest_off.load(); - - - if (g_pserver->repl_backlog != NULL) { - /* What we actually do is to flush the old buffer and realloc a new - * empty one. It will refill with new data incrementally. - * The reason is that copying a few gigabytes adds latency and even - * worse often we need to alloc additional space before freeing the - * old buffer. */ + /* get the critical client size, i.e. the size of the data unflushed to clients */ + long long earliest_off = g_pserver->repl_lowest_off.load(); if (earliest_off != -1) { // We need to keep critical data so we can't shrink less than the hot data in the buffer @@ -316,8 +263,6 @@ void resizeReplicationBacklogForClients(long long newsize) { if (g_pserver->repl_backlog_idx >= earliest_idx) { auto cbActiveBacklog = g_pserver->repl_backlog_idx - earliest_idx; memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbActiveBacklog); - serverLog(LL_NOTICE, "g_pserver->master_repl_offset: %lld, earliest_off: %lld, g_pserver->repl_backlog_idx: %lld, earliest_idx: %lld, repl_backlog_start: %lld", - g_pserver->master_repl_offset, earliest_off, g_pserver->repl_backlog_idx, earliest_idx, g_pserver->repl_backlog_start); serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); } else { auto cbPhase1 = g_pserver->repl_backlog_size - earliest_idx; @@ -344,14 +289,9 @@ void resizeReplicationBacklogForClients(long long newsize) { } } g_pserver->repl_backlog_size = newsize; - - serverLog(LL_NOTICE, "We are ending with: master_repl_offset: %lld, repl_batch_offStart: %lld, new_off: %lld, " - "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, new_idx: %lld, repl_backlog_size: %lld", - g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, 0LL, - g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, 0LL, g_pserver->repl_backlog_size - ); } + void freeReplicationBacklog(void) { serverAssert(GlobalLocksAcquired()); listIter li; @@ -391,17 +331,11 @@ void feedReplicationBacklog(const void *ptr, size_t len) { // This is an emergency overflow, we better resize to fit long long newsize = std::max(g_pserver->repl_backlog_size*2, minimumsize); serverLog(LL_WARNING, "Replication backlog is too small, resizing to: %lld", newsize); - resizeReplicationBacklogForClients(newsize); + resizeReplicationBacklog(newsize); } } } - // serverLog(LL_NOTICE, "Pt2 start with: master_repl_offset: %lld, repl_batch_offStart: %lld, " - // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", - // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, - // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size - // ); - g_pserver->master_repl_offset += len; /* This is a circular buffer, so write as much data we can at every @@ -423,12 +357,6 @@ void feedReplicationBacklog(const void *ptr, size_t len) { /* Set the offset of the first byte we have in the backlog. 
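The feed path itself is a plain circular-buffer append; reduced to its essentials (buffer state passed in, hypothetical helper name, <cstring> assumed for memcpy) it looks like:

    /* Sketch: append len bytes at idx in a ring of 'size' bytes, chunking at the
     * physical end of the buffer the way feedReplicationBacklog does above.
     * Returns the new write index. */
    long long ringAppend(char *ring, long long size, long long idx,
                         const void *ptr, size_t len) {
        const char *p = (const char*)ptr;
        while (len) {
            size_t thislen = (size_t)(size - idx);  /* room before the physical end */
            if (thislen > len) thislen = len;
            memcpy(ring + idx, p, thislen);
            idx += (long long)thislen;
            if (idx == size) idx = 0;               /* wrap back to the start */
            len -= thislen;
            p += thislen;
        }
        return idx;
    }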
*/ g_pserver->repl_backlog_off = g_pserver->master_repl_offset - g_pserver->repl_backlog_histlen + 1; - - // serverLog(LL_NOTICE, "Pt2 end with: master_repl_offset: %lld, repl_batch_offStart: %lld, " - // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", - // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, - // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size - // ); } /* Wrapper for feedReplicationBacklog() that takes Redis string objects @@ -578,9 +506,7 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) /* Add the SELECT command into the backlog. */ /* We don't do this for advanced replication because this will be done later when it adds the whole RREPLAY command */ - if (g_pserver->repl_backlog && fSendRaw) { - feedReplicationBacklogWithObject(selectcmd); - } + if (g_pserver->repl_backlog && fSendRaw) feedReplicationBacklogWithObject(selectcmd); if (dictid < 0 || dictid >= PROTO_SHARED_SELECT_CMDS) decrRefCount(selectcmd); @@ -678,7 +604,7 @@ void replicationFeedSlaves(list *replicas, int dictid, robj **argv, int argc) { void showLatestBacklog(void) { if (g_pserver->repl_backlog == NULL) return; - long long dumplen = 1024; + long long dumplen = 256; if (g_pserver->repl_backlog_histlen < dumplen) dumplen = g_pserver->repl_backlog_histlen; @@ -769,7 +695,9 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, } decrRefCount(cmdobj); } -#define BYPASS_PSYNC + +int prepareClientToWrite(client *c); + /* Feed the replica 'c' with the replication backlog starting from the * specified 'offset' up to the end of the backlog. */ long long addReplyReplicationBacklog(client *c, long long offset) { @@ -809,26 +737,14 @@ long long addReplyReplicationBacklog(client *c, long long offset) { * split the reply in two parts if we are cross-boundary. */ len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); -#ifdef BYPASS_PSYNC + c->repl_curr_off = offset - 1; c->repl_end_off = g_pserver->master_repl_offset; - serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", c->id, getClientPeerId(c), c->repl_curr_off); /* Force the partial sync to be queued */ prepareClientToWrite(c); - c->fPendingReplicaWrite = true; -#else - while(len) { - long long thislen = - ((g_pserver->repl_backlog_size - j) < len) ? - (g_pserver->repl_backlog_size - j) : len; + c->fPendingReplicaWrite = true; - serverLog(LL_DEBUG, "[PSYNC] addReply() length: %lld", thislen); - addReplySds(c,sdsnewlen(g_pserver->repl_backlog + j, thislen)); - len -= thislen; - j = 0; - } -#endif return g_pserver->repl_backlog_histlen - skip; } @@ -866,15 +782,11 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) { replica->repl_curr_off = offset; replica->repl_end_off = g_pserver->master_repl_offset; - serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); - /* We are going to accumulate the incremental changes for this * replica as well. Set replicaseldb to -1 in order to force to re-emit * a SELECT statement in the replication stream. */ g_pserver->replicaseldb = -1; - serverLog(LL_NOTICE, "We are setting up here lad"); - /* Don't send this reply to slaves that approached us with * the old SYNC command. 
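After this change a partial resync (the addReplyReplicationBacklog hunk above) no longer copies backlog data into the reply list; it just records where the replica is and queues a drain. A minimal sketch of that bookkeeping (hypothetical helper; the offset a replica sends in PSYNC is the first byte it still needs, hence the - 1):

    /* Sketch of the partial-resync setup above. */
    void setupPartialSync(client *c, long long psync_offset, long long master_off) {
        c->repl_curr_off = psync_offset - 1;   /* last byte the replica already has */
        c->repl_end_off  = master_off;         /* it may read up to the current master offset */
        prepareClientToWrite(c);               /* queue it; writeToClient drains from the ring */
        c->fPendingReplicaWrite = true;        /* later patches derive this from curr != end */
    }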
*/ if (!(replica->flags & CLIENT_PRE_PSYNC)) { @@ -1179,7 +1091,6 @@ void syncCommand(client *c) { if (g_pserver->FRdbSaveInProgress() && g_pserver->rdb_child_type == RDB_CHILD_TYPE_DISK) { - serverLog(LL_NOTICE, "case 1"); /* Ok a background save is in progress. Let's check if it is a good * one for replication, i.e. if there is another replica that is * registering differences since the server forked to save. */ @@ -1211,7 +1122,6 @@ void syncCommand(client *c) { } else if (g_pserver->FRdbSaveInProgress() && g_pserver->rdb_child_type == RDB_CHILD_TYPE_SOCKET) { - serverLog(LL_NOTICE, "case 2"); /* There is an RDB child process but it is writing directly to * children sockets. We need to wait for the next BGSAVE * in order to synchronize. */ @@ -1219,7 +1129,6 @@ void syncCommand(client *c) { /* CASE 3: There is no BGSAVE is progress. */ } else { - serverLog(LL_NOTICE, "case 3"); if (g_pserver->repl_diskless_sync && (c->slave_capa & SLAVE_CAPA_EOF)) { /* Diskless replication RDB child is created inside * replicationCron() since we want to delay its start a @@ -4606,9 +4515,10 @@ void _clientAsyncReplyBufferReserve(client *c, size_t len); void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); + /* If we have the repl backlog lock, we will deadlock */ + serverAssert(!g_pserver->repl_backlog_lock.fOwnLock()); if (g_pserver->repl_batch_offStart < 0) return; - if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { bool fAsyncWrite = false; @@ -4617,66 +4527,31 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); - serverLog(LL_NOTICE, "the master repl offset is %lld", g_pserver->master_repl_offset); - showLatestBacklog(); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); - // serverLog(LL_NOTICE, "client %lu is in the party", replica->id); - - // serverLog(LL_NOTICE, "is there a write pending for %lu, %d", replica->id, replica->fPendingReplicaWrite); if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; - std::unique_lock ul(replica->lock, std::defer_lock); - if (FCorrectThread(replica)) - ul.lock(); - else + std::unique_lock ul(replica->lock); + if (!FCorrectThread(replica)) fAsyncWrite = true; - - /* If we are online and the RDB has been sent, there is no need to feed the client buffer - * We will send our replies directly from the replication backlog instead */ -#ifdef BYPASS_BUFFER - { - std::unique_lock asyncUl(replica->lock, std::defer_lock); - if (!FCorrectThread(replica)) - asyncUl.lock(); + /* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */ + serverAssert(replica->repl_curr_off != -1); - /* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */ - serverAssert(replica->repl_curr_off != -1); + replica->repl_end_off = g_pserver->master_repl_offset; - replica->repl_end_off = g_pserver->master_repl_offset; - - /* Only if the there isn't already a pending write do we prepare the client to write */ - if (!replica->fPendingReplicaWrite){ - serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); - prepareClientToWrite(replica); - replica->fPendingReplicaWrite = true; - } - - continue; + /* Only if the there isn't already a pending write do we prepare the 
client to write */ + if (!replica->fPendingReplicaWrite){ + serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); + prepareClientToWrite(replica); + replica->fPendingReplicaWrite = true; } -#endif - if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { - long long cbCopy = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; - serverAssert((g_pserver->master_repl_offset - g_pserver->repl_batch_offStart) == cbCopy); - serverAssert((g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart) >= (cbCopy)); - serverAssert((g_pserver->repl_batch_idxStart + cbCopy) <= g_pserver->repl_backlog_size); - - addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbCopy); - } else { - auto cbPhase1 = g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart; - if (fAsyncWrite) - _clientAsyncReplyBufferReserve(replica, cbPhase1 + g_pserver->repl_backlog_idx); - addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1); - addReplyProto(replica, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); - serverAssert((cbPhase1 + g_pserver->repl_backlog_idx) == (g_pserver->master_repl_offset - g_pserver->repl_batch_offStart)); - } } if (fAsyncWrite) ProcessPendingAsyncWrites(); diff --git a/src/server.cpp b/src/server.cpp index 439e1aeff..362569bfa 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1796,7 +1796,6 @@ int clientsCronTrackClientsMemUsage(client *c) { mem += zmalloc_size(c); mem += c->argv_len_sum(); if (c->argv) mem += zmalloc_size(c->argv); - // serverLog(LL_NOTICE, "Mem here is : %lu", mem); /* Now that we have the memory used by the client, remove the old * value from the old category, and add it back. */ g_pserver->stat_clients_type_memory[c->client_cron_last_memory_type] -= @@ -1855,7 +1854,6 @@ void clientsCron(int iel) { while(listLength(g_pserver->clients) && iterations--) { client *c; listNode *head; - // serverLog(LL_NOTICE, "we are at iteration: %d", iterations); /* Rotate the list, take the current head, process. * This way if the client must be removed from the list it's the * first element and we don't incur into O(N) computation. 
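Stripped of locking and thread handling, the flush loop above now copies no bytes at all; it only widens each replica's readable window and schedules a drain. A rough sketch of the per-replica work, keeping the same field names:

    /* Sketch of flushReplBacklogToClients' per-replica step (skip conditions,
     * locking and async handling elided). */
    void flushSketch(std::vector<client*> &replicas, long long master_repl_offset) {
        for (client *replica : replicas) {
            /* the real code asserts repl_curr_off != -1 here */
            replica->repl_end_off = master_repl_offset;    /* extend the readable window */
            if (!replica->fPendingReplicaWrite) {          /* schedule a drain if none queued */
                prepareClientToWrite(replica);
                replica->fPendingReplicaWrite = true;
            }
        }
    }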
*/ diff --git a/src/server.h b/src/server.h index 64a2ca515..0fcd8f5ef 100644 --- a/src/server.h +++ b/src/server.h @@ -3540,7 +3540,6 @@ void tlsInit(void); void tlsInitThread(); int tlsConfigure(redisTLSContextConfig *ctx_config); -int prepareClientToWrite(client *c); class ShutdownException From 80dddab0c4e587332b497cbe8157f39dcc417eb4 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 14 Jun 2021 19:30:49 +0000 Subject: [PATCH 17/22] Relaxed locking, should run faster now Former-commit-id: 5cec4d026dc1766b9ecbade6ec4b9d0e75a94e0f --- src/multi.cpp | 1 - src/networking.cpp | 6 ++++++ src/replication.cpp | 18 ++++++++++-------- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/multi.cpp b/src/multi.cpp index f74748e90..589dba589 100644 --- a/src/multi.cpp +++ b/src/multi.cpp @@ -268,7 +268,6 @@ void execCommand(client *c) { if (g_pserver->repl_backlog && was_master && !is_master) { const char *execcmd = "*1\r\n$4\r\nEXEC\r\n"; updateLowestOffsetAmongReplicas(); - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); feedReplicationBacklog(execcmd,strlen(execcmd)); } afterPropagateExec(); diff --git a/src/networking.cpp b/src/networking.cpp index d8d91751d..07312a9ee 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1856,6 +1856,8 @@ int writeToClient(client *c, int handler_installed) { * We always read from the replication backlog directly */ std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + // serverLog(LL_NOTICE, "written to handler"); + long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); serverAssert(c->repl_curr_off != -1); @@ -1884,8 +1886,12 @@ int writeToClient(client *c, int handler_installed) { serverAssert(c->repl_curr_off <= c->repl_end_off); /* If the client offset matches the global offset, we wrote all we needed to, * in which case, there is no pending write */ + if (c->repl_curr_off == c->repl_end_off){ + // serverLog(LL_NOTICE, "Successfully wrote up until %lld", c->repl_end_off); c->fPendingReplicaWrite = false; + } else { + // serverLog(LL_NOTICE, "Wrote to %lld out of %lld", c->repl_curr_off, c->repl_end_off); } } diff --git a/src/replication.cpp b/src/replication.cpp index d10bac99a..a5f9c3acf 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -241,6 +241,8 @@ void resizeReplicationBacklog(long long newsize) { newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; if (g_pserver->repl_backlog_size == newsize) return; + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + if (g_pserver->repl_backlog != NULL) { /* What we actually do is to flush the old buffer and realloc a new * empty one. It will refill with new data incrementally. @@ -310,9 +312,9 @@ void freeReplicationBacklog(void) { * the backlog without incrementing the offset. 
*/ void feedReplicationBacklog(const void *ptr, size_t len) { serverAssert(GlobalLocksAcquired()); - serverAssert(g_pserver->repl_backlog_lock.fOwnLock()); const unsigned char *p = (const unsigned char*)ptr; + if (g_pserver->repl_batch_idxStart >= 0) { /* we are lower bounded by the lower client offset or the offStart if all the clients are up to date */ long long lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); @@ -320,10 +322,11 @@ void feedReplicationBacklog(const void *ptr, size_t len) { lower_bound = g_pserver->repl_batch_offStart; long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { - g_pserver->repl_backlog_lock.unlock(); flushReplBacklogToClients(); - g_pserver->repl_backlog_lock.lock(); - minimumsize = g_pserver->master_repl_offset + len - lower_bound +1; + minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; + + serverLog(LL_NOTICE, "minimumsize: %lld, g_pserver->master_repl_offset: %lld, len: %lu, lower_bound: %lld", + minimumsize, g_pserver->master_repl_offset, len, lower_bound); if (minimumsize > g_pserver->repl_backlog_size) { // This is an emergency overflow, we better resize to fit @@ -492,7 +495,6 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) bool fSendRaw = !g_pserver->fActiveReplica; updateLowestOffsetAmongReplicas(); - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); /* Send SELECT command to every replica if needed. */ if (g_pserver->replicaseldb != dictid) { @@ -655,7 +657,6 @@ void replicationFeedSlavesFromMasterStream(char *buf, size_t buflen) { if (g_pserver->repl_backlog){ updateLowestOffsetAmongReplicas(); - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); feedReplicationBacklog(buf,buflen); } } @@ -750,7 +751,7 @@ long long addReplyReplicationBacklog(client *c, long long offset) { serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); c->repl_curr_off = offset - 1; - serverLog(LL_NOTICE, "Client %s, replica offset %lld in psync", replicationGetSlaveName(c), c->repl_curr_off); + // serverLog(LL_NOTICE, "Client %s, replica offset %lld in psync", replicationGetSlaveName(c), c->repl_curr_off); c->repl_end_off = g_pserver->master_repl_offset; /* Force the partial sync to be queued */ @@ -4988,7 +4989,7 @@ void flushReplBacklogToClients() if (!canFeedReplicaReplBuffer(replica)) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; - serverLog(LL_NOTICE, "Client %s, replica offset %lld", replicationGetSlaveName(replica), replica->repl_curr_off); + // serverLog(LL_NOTICE, "Client %s, replica offset %lld", replicationGetSlaveName(replica), replica->repl_curr_off); std::unique_lock ul(replica->lock); if (!FCorrectThread(replica)) @@ -5013,6 +5014,7 @@ void flushReplBacklogToClients() // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; + updateLowestOffsetAmongReplicas(); } } From 6a65b8bbaa318429c69dadd852f62fb6364414fd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 15 Jun 2021 23:13:49 +0000 Subject: [PATCH 18/22] Optimized use of repl_lowest_off to reduce lock contention Former-commit-id: 30a957e5399fe94675f0b6d2d34c24112d5a9734 --- src/multi.cpp | 1 - src/replication.cpp | 34 ++++++++-------------------------- 2 files changed, 8 insertions(+), 27 deletions(-) diff --git a/src/multi.cpp 
b/src/multi.cpp index 589dba589..1b91a05a0 100644 --- a/src/multi.cpp +++ b/src/multi.cpp @@ -267,7 +267,6 @@ void execCommand(client *c) { * backlog with the final EXEC. */ if (g_pserver->repl_backlog && was_master && !is_master) { const char *execcmd = "*1\r\n$4\r\nEXEC\r\n"; - updateLowestOffsetAmongReplicas(); feedReplicationBacklog(execcmd,strlen(execcmd)); } afterPropagateExec(); diff --git a/src/replication.cpp b/src/replication.cpp index a5f9c3acf..cb0b562b1 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -56,29 +56,6 @@ void putSlaveOnline(client *replica); int cancelReplicationHandshake(redisMaster *mi, int reconnect); static void propagateMasterStaleKeys(); -/* gets the lowest offset amongst all of the replicas and stores it globally*/ -void updateLowestOffsetAmongReplicas(){ - serverAssert(GlobalLocksAcquired()); - serverAssert(!g_pserver->repl_backlog_lock.fOwnLock()); - long long min_offset = LONG_LONG_MAX; - listIter li; - listNode *ln; - listRewind(g_pserver->slaves, &li); - // check for potential overflow first - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - - if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; - if (replica->flags & CLIENT_CLOSE_ASAP) continue; - - std::unique_lock ul(replica->lock); - - min_offset = std::min(min_offset, replica->repl_curr_off); - } - /* return -1 if no other minimum was found */ - g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? -1 : min_offset, std::memory_order_seq_cst); -} - /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case * the instance is configured to have no persistence. */ @@ -323,6 +300,10 @@ void feedReplicationBacklog(const void *ptr, size_t len) { long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { flushReplBacklogToClients(); + lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); + if (lower_bound == -1) + lower_bound = g_pserver->repl_batch_offStart; + minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; serverLog(LL_NOTICE, "minimumsize: %lld, g_pserver->master_repl_offset: %lld, len: %lu, lower_bound: %lld", @@ -494,7 +475,6 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) serverAssert(!(listLength(slaves) != 0 && g_pserver->repl_backlog == NULL)); bool fSendRaw = !g_pserver->fActiveReplica; - updateLowestOffsetAmongReplicas(); /* Send SELECT command to every replica if needed. 
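The sizing check in feedReplicationBacklog above can be stated on its own: the backlog must never overwrite data that the slowest replica still has to read. A minimal sketch of the required-size computation (hypothetical helper name):

    /* Lower bound is the lowest replica offset when one exists (repl_lowest_off),
     * otherwise the start of the current batch. */
    long long requiredBacklogSize(long long master_off, size_t len,
                                  long long lowest_replica_off, long long batch_off_start) {
        long long lower_bound = (lowest_replica_off != -1) ? lowest_replica_off
                                                           : batch_off_start;
        return master_off + (long long)len - lower_bound + 1;
    }
    /* If this exceeds repl_backlog_size the code first flushes (which can raise the
     * lower bound) and only then grows to max(2 * current size, required size). */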
*/ if (g_pserver->replicaseldb != dictid) { @@ -656,7 +636,6 @@ void replicationFeedSlavesFromMasterStream(char *buf, size_t buflen) { } if (g_pserver->repl_backlog){ - updateLowestOffsetAmongReplicas(); feedReplicationBacklog(buf,buflen); } } @@ -4975,6 +4954,7 @@ void flushReplBacklogToClients() if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { bool fAsyncWrite = false; + long long min_offset = LONG_LONG_MAX; // Ensure no overflow serverAssert(g_pserver->repl_batch_offStart < g_pserver->master_repl_offset); serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); @@ -4998,6 +4978,8 @@ void flushReplBacklogToClients() /* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */ serverAssert(replica->repl_curr_off != -1); + min_offset = std::min(min_offset, replica->repl_curr_off); + replica->repl_end_off = g_pserver->master_repl_offset; /* Only if the there isn't already a pending write do we prepare the client to write */ @@ -5014,7 +4996,7 @@ void flushReplBacklogToClients() // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; - updateLowestOffsetAmongReplicas(); + g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? -1 : min_offset, std::memory_order_seq_cst); } } From 29f4c661799107ed6db8168ecb297b1e0b64f575 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 16 Jun 2021 19:41:55 +0000 Subject: [PATCH 19/22] More code cleanup Former-commit-id: 8e9962b9b7b9093399451bf93d30e5b5d26e3d33 --- src/evict.cpp | 2 ++ src/networking.cpp | 52 +++++++++++++++------------------------------ src/replication.cpp | 51 ++++++++++++++++++-------------------------- src/server.h | 14 ++++++------ 4 files changed, 47 insertions(+), 72 deletions(-) diff --git a/src/evict.cpp b/src/evict.cpp index ba426f0ee..d336bc8b8 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -354,6 +354,8 @@ unsigned long LFUDecrAndReturn(robj_roptr o) { return counter; } +unsigned long getClientReplicationBacklogSharedUsage(client *c); + /* We don't want to count AOF buffers and slaves output buffers as * used memory: the eviction should use mostly data size. This function * returns the sum of AOF and slaves buffer. */ diff --git a/src/networking.cpp b/src/networking.cpp index 07312a9ee..767fe9c2b 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1765,15 +1765,7 @@ client *lookupClientByID(uint64_t id) { return (c == raxNotFound) ? NULL : c; } -/* Compute the corresponding index from a replication backlog offset - * by taking the distance between the input offset and the replication backlog offset - * and applying that to the replication backlog index, wrapping around if the index - * becomes negative. - * TODO: Rewrite comment for new logic */ -long long getReplIndexFromOffset(long long offset){ - long long index = (offset - g_pserver->repl_backlog_start) % g_pserver->repl_backlog_size; - return index; -} +long long getReplIndexFromOffset(long long offset); /* Write data in output buffers to client. 
Return C_OK if the client * is still valid after the call, C_ERR if it was freed because of some @@ -1832,35 +1824,31 @@ int writeToClient(client *c, int handler_installed) { } } /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT - * bytes, in a single threaded server it's a good idea to serve - * other clients as well, even if a very large request comes from - * super fast link that is always able to accept data (in real world - * scenario think about 'KEYS *' against the loopback interface). - * - * However if we are over the maxmemory limit we ignore that and - * just deliver as much data as it is possible to deliver. - * - * Moreover, we also send as much as possible if the client is - * a replica or a monitor (otherwise, on high-speed traffic, the - * replication/output buffer will grow indefinitely) */ + * bytes, in a single threaded server it's a good idea to serve + * other clients as well, even if a very large request comes from + * super fast link that is always able to accept data (in real world + * scenario think about 'KEYS *' against the loopback interface). + * + * However if we are over the maxmemory limit we ignore that and + * just deliver as much data as it is possible to deliver. + * + * Moreover, we also send as much as possible if the client is + * a replica or a monitor (otherwise, on high-speed traffic, the + * replication/output buffer will grow indefinitely) */ if (totwritten > NET_MAX_WRITES_PER_EVENT && (g_pserver->maxmemory == 0 || - zmalloc_used_memory() < g_pserver->maxmemory) && + zmalloc_used_memory() < g_pserver->maxmemory) && !(c->flags & CLIENT_SLAVE)) break; } /* We can only directly read from the replication backlog if the client is a replica, so only attempt to do so if that's the case. */ if (c->flags & CLIENT_SLAVE && !(c->flags & CLIENT_MONITOR)) { - /* For replicas, we don't store all the information in the client buffer - * We always read from the replication backlog directly */ + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); - - // serverLog(LL_NOTICE, "written to handler"); - long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); - serverAssert(c->repl_curr_off != -1); + if (c->repl_curr_off != c->repl_end_off){ long long repl_curr_idx = getReplIndexFromOffset(c->repl_curr_off); long long nwritten2ndStage = 0; /* How much was written from the start of the replication backlog @@ -1884,14 +1872,9 @@ int writeToClient(client *c, int handler_installed) { totwritten += nwritten; c->repl_curr_off += nwritten; serverAssert(c->repl_curr_off <= c->repl_end_off); - /* If the client offset matches the global offset, we wrote all we needed to, - * in which case, there is no pending write */ - + /* If the client's current offset matches the last offset it can read from, there is no pending write */ if (c->repl_curr_off == c->repl_end_off){ - // serverLog(LL_NOTICE, "Successfully wrote up until %lld", c->repl_end_off); c->fPendingReplicaWrite = false; - } else { - // serverLog(LL_NOTICE, "Wrote to %lld out of %lld", c->repl_curr_off, c->repl_end_off); } } @@ -3719,8 +3702,7 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { } } -/* In the case of a replica client, it is possible (and very likely) - * that writes to said replica are using data from the replication backlog +/* In the case of a replica client, writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long 
getClientReplicationBacklogSharedUsage(client *c) { return (!(c->flags & CLIENT_SLAVE) || !c->fPendingReplicaWrite ) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; diff --git a/src/replication.cpp b/src/replication.cpp index cb0b562b1..b9465680e 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -47,8 +47,6 @@ #include #include -#define BYPASS_BUFFER - void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, connection *conn); void replicationSendAck(redisMaster *mi); @@ -61,8 +59,6 @@ static void propagateMasterStaleKeys(); * the instance is configured to have no persistence. */ int RDBGeneratedByReplication = 0; -void resizeReplicationBacklogForClients(long long newsize); - /* --------------------------- Utility functions ---------------------------- */ /* Return the pointer to a string representing the replica ip:listening_port @@ -205,7 +201,14 @@ void createReplicationBacklog(void) { g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; } -long long getReplIndexFromOffset(long long offset); +/* Compute the corresponding index from a replication backlog offset + * Since this computation needs the size of the replication backlog, + * you need to have the repl_backlog_lock in order to call it */ +long long getReplIndexFromOffset(long long offset){ + serverAssert(g_pserver->repl_backlog_lock.fOwnLock()); + long long index = (offset - g_pserver->repl_backlog_start) % g_pserver->repl_backlog_size; + return index; +} /* This function is called when the user modifies the replication backlog * size at runtime. It is up to the function to both update the @@ -293,7 +296,7 @@ void feedReplicationBacklog(const void *ptr, size_t len) { if (g_pserver->repl_batch_idxStart >= 0) { - /* we are lower bounded by the lower client offset or the offStart if all the clients are up to date */ + /* We are lower bounded by the lowest replica offset, or the batch offset start if not applicable */ long long lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); if (lower_bound == -1) lower_bound = g_pserver->repl_batch_offStart; @@ -306,9 +309,6 @@ void feedReplicationBacklog(const void *ptr, size_t len) { minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; - serverLog(LL_NOTICE, "minimumsize: %lld, g_pserver->master_repl_offset: %lld, len: %lu, lower_bound: %lld", - minimumsize, g_pserver->master_repl_offset, len, lower_bound); - if (minimumsize > g_pserver->repl_backlog_size) { // This is an emergency overflow, we better resize to fit long long newsize = std::max(g_pserver->repl_backlog_size*2, minimumsize); @@ -635,9 +635,7 @@ void replicationFeedSlavesFromMasterStream(char *buf, size_t buflen) { printf("\n"); } - if (g_pserver->repl_backlog){ - feedReplicationBacklog(buf,buflen); - } + if (g_pserver->repl_backlog) feedReplicationBacklog(buf,buflen); } void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, int argc) { @@ -689,13 +687,12 @@ int prepareClientToWrite(client *c); /* Feed the replica 'c' with the replication backlog starting from the * specified 'offset' up to the end of the backlog. 
*/ long long addReplyReplicationBacklog(client *c, long long offset) { - long long j, skip, len; + long long skip, len; serverLog(LL_DEBUG, "[PSYNC] Replica request offset: %lld", offset); if (g_pserver->repl_backlog_histlen == 0) { serverLog(LL_DEBUG, "[PSYNC] Backlog history len is zero"); - serverLog(LL_NOTICE, "REOAD TO RESIST"); c->repl_curr_off = g_pserver->master_repl_offset; c->repl_end_off = g_pserver->master_repl_offset; return 0; @@ -714,30 +711,20 @@ long long addReplyReplicationBacklog(client *c, long long offset) { skip = offset - g_pserver->repl_backlog_off; serverLog(LL_DEBUG, "[PSYNC] Skipping: %lld", skip); - /* Point j to the oldest byte, that is actually our - * g_pserver->repl_backlog_off byte. */ - j = (g_pserver->repl_backlog_idx + - (g_pserver->repl_backlog_size-g_pserver->repl_backlog_histlen)) % - g_pserver->repl_backlog_size; - serverLog(LL_DEBUG, "[PSYNC] Index of first byte: %lld", j); - - /* Discard the amount of data to seek to the specified 'offset'. */ - j = (j + skip) % g_pserver->repl_backlog_size; - - /* Feed replica with data. Since it is a circular buffer we have to - * split the reply in two parts if we are cross-boundary. */ len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); + /* Set the start and end offsets for the replica so that a future + * writeToClient will send the backlog from the given offset to + * the current end of the backlog to said replica */ c->repl_curr_off = offset - 1; - // serverLog(LL_NOTICE, "Client %s, replica offset %lld in psync", replicationGetSlaveName(c), c->repl_curr_off); c->repl_end_off = g_pserver->master_repl_offset; /* Force the partial sync to be queued */ prepareClientToWrite(c); c->fPendingReplicaWrite = true; - return g_pserver->repl_backlog_histlen - skip; + return len; } /* Return the offset to provide as reply to the PSYNC command received @@ -4963,14 +4950,18 @@ void flushReplBacklogToClients() listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); + /* We don't actually write any data in this function since we send data + * directly from the replication backlog to replicas in writeToClient. + * + * What we do however, is set the end offset of each replica here. This way, + * future calls to writeToClient will know up to where in the replication + * backlog is valid for writing. */ while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); if (!canFeedReplicaReplBuffer(replica)) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; - // serverLog(LL_NOTICE, "Client %s, replica offset %lld", replicationGetSlaveName(replica), replica->repl_curr_off); - std::unique_lock ul(replica->lock); if (!FCorrectThread(replica)) fAsyncWrite = true; diff --git a/src/server.h b/src/server.h index cb3973969..0d6f766ce 100644 --- a/src/server.h +++ b/src/server.h @@ -1590,9 +1590,11 @@ struct client { copying this replica output buffer should use. */ - long long repl_curr_off = -1; /* Replication offset of the client, only if it's a replica*/ - long long repl_end_off = -1; /* Replication offset to write to */ - int fPendingReplicaWrite; + long long repl_curr_off = -1;/* Replication offset of the replica, also where in the backlog we need to start from + * when sending data to this replica. */ + long long repl_end_off = -1; /* Replication offset to write to, stored in the replica, as opposed to using the global offset + * to prevent needing the global lock */ + int fPendingReplicaWrite; /* Is there a write queued for this replica? 
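Given these fields, the memory a replica "uses" is no longer a private output buffer but a slice of the shared backlog, which is what getClientReplicationBacklogSharedUsage reports. A minimal sketch with the inputs made explicit:

    /* Sketch: a replica with a queued write pins the backlog bytes between its
     * current offset and the master's offset; everyone else pins nothing. */
    unsigned long sharedBacklogUsage(bool is_slave, bool has_pending_write,
                                     long long master_off, long long replica_curr_off) {
        if (!is_slave || !has_pending_write) return 0;
        return (unsigned long)(master_off - replica_curr_off);
    }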
*/ char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ int slave_listening_port; /* As configured with: REPLCONF listening-port */ @@ -2375,8 +2377,8 @@ struct redisServer { int repl_diskless_load; /* Slave parse RDB directly from the socket. * see REPL_DISKLESS_LOAD_* enum */ int repl_diskless_sync_delay; /* Delay to start a diskless repl BGSAVE. */ - std::atomic repl_lowest_off; /* The lowest offset amongst all clients - Updated before calls to feed the replication backlog */ + std::atomic repl_lowest_off; /* The lowest offset amongst all replicas + -1 if there are no replicas */ /* Replication (replica) */ list *masters; int enable_multimaster; @@ -2825,7 +2827,6 @@ sds getAllClientsInfoString(int type); void rewriteClientCommandVector(client *c, int argc, ...); void rewriteClientCommandArgument(client *c, int i, robj *newval); void replaceClientCommandVector(client *c, int argc, robj **argv); -unsigned long getClientReplicationBacklogSharedUsage(client *c); unsigned long getClientOutputBufferMemoryUsage(client *c); int freeClientsInAsyncFreeQueue(int iel); void asyncCloseClientOnOutputBufferLimitReached(client *c); @@ -3017,7 +3018,6 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, void rdbPipeWriteHandlerConnRemoved(struct connection *conn); void replicationNotifyLoadedKey(redisDb *db, robj_roptr key, robj_roptr val, long long expire); void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long long expire); -void updateLowestOffsetAmongReplicas(void); void clearFailoverState(void); void updateFailoverStatus(void); void abortFailover(redisMaster *mi, const char *err); From 815ebe1e6b0b7ad13db30dc342e8d6cb92330651 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 25 Jun 2021 01:54:38 +0000 Subject: [PATCH 20/22] Remove fPendingReplicaWrite flag which can instead be calculated on demand Former-commit-id: ae26afd13f955eb230b5c2cab20ec90db9b714ad --- src/networking.cpp | 128 +++++++++++++++++++++----------------------- src/replication.cpp | 8 +-- src/server.h | 5 +- 3 files changed, 67 insertions(+), 74 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 767fe9c2b..690b03a51 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -158,7 +158,6 @@ client *createClient(connection *conn, int iel) { c->flags = 0; c->fPendingAsyncWrite = FALSE; c->fPendingAsyncWriteHandler = FALSE; - c->fPendingReplicaWrite = FALSE; c->ctime = c->lastinteraction = g_pserver->unixtime; /* If the default user does not require authentication, the user is * directly authenticated. */ @@ -318,7 +317,7 @@ int prepareClientToWrite(client *c) { /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ - if (!fAsync && !clientHasPendingReplies(c) && !c->fPendingReplicaWrite) clientInstallWriteHandler(c); + if (!fAsync && (c->flags & CLIENT_SLAVE || !clientHasPendingReplies(c))) clientInstallWriteHandler(c); if (fAsync && !(c->fPendingAsyncWrite)) clientInstallAsyncWriteHandler(c); /* Authorize the caller to queue in the output buffer of this client. */ @@ -1132,7 +1131,7 @@ void copyClientOutputBuffer(client *dst, client *src) { /* Return true if the specified client has pending reply buffers to write to * the socket. 
*/ int clientHasPendingReplies(client *c) { - return (c->bufpos || listLength(c->reply)); + return (c->bufpos || listLength(c->reply) || c->FPendingReplicaWrite()); } static std::atomic rgacceptsInFlight[MAX_EVENT_LOOPS]; @@ -1785,66 +1784,9 @@ int writeToClient(client *c, int handler_installed) { std::unique_locklock)> lock(c->lock); - while(clientHasPendingReplies(c)) { - serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR); - if (c->bufpos > 0) { - nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); - if (nwritten <= 0) break; - c->sentlen += nwritten; - totwritten += nwritten; - - /* If the buffer was sent, set bufpos to zero to continue with - * the remainder of the reply. */ - if ((int)c->sentlen == c->bufpos) { - c->bufpos = 0; - c->sentlen = 0; - } - } else { - o = (clientReplyBlock*)listNodeValue(listFirst(c->reply)); - if (o->used == 0) { - c->reply_bytes -= o->size; - listDelNode(c->reply,listFirst(c->reply)); - continue; - } - - nwritten = connWrite(c->conn, o->buf() + c->sentlen, o->used - c->sentlen); - if (nwritten <= 0) break; - c->sentlen += nwritten; - totwritten += nwritten; - - /* If we fully sent the object on head go to the next one */ - if (c->sentlen == o->used) { - c->reply_bytes -= o->size; - listDelNode(c->reply,listFirst(c->reply)); - c->sentlen = 0; - /* If there are no longer objects in the list, we expect - * the count of reply bytes to be exactly zero. */ - if (listLength(c->reply) == 0) - serverAssert(c->reply_bytes == 0); - } - } - /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT - * bytes, in a single threaded server it's a good idea to serve - * other clients as well, even if a very large request comes from - * super fast link that is always able to accept data (in real world - * scenario think about 'KEYS *' against the loopback interface). - * - * However if we are over the maxmemory limit we ignore that and - * just deliver as much data as it is possible to deliver. - * - * Moreover, we also send as much as possible if the client is - * a replica or a monitor (otherwise, on high-speed traffic, the - * replication/output buffer will grow indefinitely) */ - if (totwritten > NET_MAX_WRITES_PER_EVENT && - (g_pserver->maxmemory == 0 || - zmalloc_used_memory() < g_pserver->maxmemory) && - !(c->flags & CLIENT_SLAVE)) break; - } - /* We can only directly read from the replication backlog if the client is a replica, so only attempt to do so if that's the case. 
*/ if (c->flags & CLIENT_SLAVE && !(c->flags & CLIENT_MONITOR)) { - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); serverAssert(c->repl_curr_off != -1); @@ -1872,15 +1814,67 @@ int writeToClient(client *c, int handler_installed) { totwritten += nwritten; c->repl_curr_off += nwritten; serverAssert(c->repl_curr_off <= c->repl_end_off); - /* If the client's current offset matches the last offset it can read from, there is no pending write */ - if (c->repl_curr_off == c->repl_end_off){ - c->fPendingReplicaWrite = false; - } } /* If the second part of a write didn't go through, we still need to register that */ if (nwritten2ndStage == -1) nwritten = -1; } + } else { + while(clientHasPendingReplies(c)) { + serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR); + if (c->bufpos > 0) { + nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); + if (nwritten <= 0) break; + c->sentlen += nwritten; + totwritten += nwritten; + + /* If the buffer was sent, set bufpos to zero to continue with + * the remainder of the reply. */ + if ((int)c->sentlen == c->bufpos) { + c->bufpos = 0; + c->sentlen = 0; + } + } else { + o = (clientReplyBlock*)listNodeValue(listFirst(c->reply)); + if (o->used == 0) { + c->reply_bytes -= o->size; + listDelNode(c->reply,listFirst(c->reply)); + continue; + } + + nwritten = connWrite(c->conn, o->buf() + c->sentlen, o->used - c->sentlen); + if (nwritten <= 0) break; + c->sentlen += nwritten; + totwritten += nwritten; + + /* If we fully sent the object on head go to the next one */ + if (c->sentlen == o->used) { + c->reply_bytes -= o->size; + listDelNode(c->reply,listFirst(c->reply)); + c->sentlen = 0; + /* If there are no longer objects in the list, we expect + * the count of reply bytes to be exactly zero. */ + if (listLength(c->reply) == 0) + serverAssert(c->reply_bytes == 0); + } + } + /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT + * bytes, in a single threaded server it's a good idea to serve + * other clients as well, even if a very large request comes from + * super fast link that is always able to accept data (in real world + * scenario think about 'KEYS *' against the loopback interface). + * + * However if we are over the maxmemory limit we ignore that and + * just deliver as much data as it is possible to deliver. + * + * Moreover, we also send as much as possible if the client is + * a replica or a monitor (otherwise, on high-speed traffic, the + * replication/output buffer will grow indefinitely) */ + if (totwritten > NET_MAX_WRITES_PER_EVENT && + (g_pserver->maxmemory == 0 || + zmalloc_used_memory() < g_pserver->maxmemory) && + !(c->flags & CLIENT_SLAVE)) break; + } } g_pserver->stat_net_output_bytes += totwritten; @@ -1900,7 +1894,7 @@ int writeToClient(client *c, int handler_installed) { * We just rely on data / pings received for timeout detection. */ if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } - if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) { + if (!clientHasPendingReplies(c)) { c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -2080,7 +2074,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. 
*/ - if (clientHasPendingReplies(c) || c->fPendingReplicaWrite) { + if (clientHasPendingReplies(c)) { if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) { freeClientAsync(c); } @@ -3705,7 +3699,7 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { /* In the case of a replica client, writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long getClientReplicationBacklogSharedUsage(client *c) { - return (!(c->flags & CLIENT_SLAVE) || !c->fPendingReplicaWrite ) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; + return (!(c->flags & CLIENT_SLAVE) || !c->FPendingReplicaWrite() ) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; } /* This function returns the number of bytes that Redis is diff --git a/src/replication.cpp b/src/replication.cpp index b9465680e..94b35e314 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -722,7 +722,6 @@ long long addReplyReplicationBacklog(client *c, long long offset) { /* Force the partial sync to be queued */ prepareClientToWrite(c); - c->fPendingReplicaWrite = true; return len; } @@ -4974,11 +4973,8 @@ void flushReplBacklogToClients() replica->repl_end_off = g_pserver->master_repl_offset; /* Only if the there isn't already a pending write do we prepare the client to write */ - if (!replica->fPendingReplicaWrite){ - serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); - prepareClientToWrite(replica); - replica->fPendingReplicaWrite = true; - } + serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); + prepareClientToWrite(replica); } if (fAsyncWrite) diff --git a/src/server.h b/src/server.h index 0d6f766ce..07608632e 100644 --- a/src/server.h +++ b/src/server.h @@ -1594,7 +1594,6 @@ struct client { * when sending data to this replica. */ long long repl_end_off = -1; /* Replication offset to write to, stored in the replica, as opposed to using the global offset * to prevent needing the global lock */ - int fPendingReplicaWrite; /* Is there a write queued for this replica? */ char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ int slave_listening_port; /* As configured with: REPLCONF listening-port */ @@ -1657,6 +1656,10 @@ struct client { robj **argv; size_t argv_len_sumActive = 0; + bool FPendingReplicaWrite() const { + return repl_curr_off != repl_end_off; + } + // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); size_t argv_len_sum() const; From e6a82692b7be9d62a619f9968e0f9ae5f90ca71e Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 25 Jun 2021 02:31:17 +0000 Subject: [PATCH 21/22] Avoid holding the lockPendingWrite for too long and deadlocking due to lock inversion Former-commit-id: a4b49fbec60e2333a4407d24383ae204d5d2b413 --- src/networking.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/networking.cpp b/src/networking.cpp index 690b03a51..5ced371d1 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -2018,7 +2018,6 @@ void ProcessPendingAsyncWrites() * need to use a syscall in order to install the writable event handler, * get it called, and so forth. 
*/ int handleClientsWithPendingWrites(int iel, int aof_state) { - std::unique_lock lockf(g_pserver->rgthreadvar[iel].lockPendingWrite); int processed = 0; serverAssert(iel == (serverTL - g_pserver->rgthreadvar)); @@ -2041,7 +2040,9 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { ae_flags |= AE_BARRIER; } + std::unique_lock lockf(g_pserver->rgthreadvar[iel].lockPendingWrite); auto vec = std::move(g_pserver->rgthreadvar[iel].clients_pending_write); + lockf.unlock(); processed += (int)vec.size(); for (client *c : vec) { From 5949e253cab606c0bd7616e00c42e7ebcfca872a Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 25 Jun 2021 02:46:32 +0000 Subject: [PATCH 22/22] remove unnecessary newline Former-commit-id: 532af9cd0286ac6ece6f401c42aea18e36d16f7c --- src/replication.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/replication.cpp b/src/replication.cpp index 94b35e314..e9a503167 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -4975,7 +4975,6 @@ void flushReplBacklogToClients() /* Only if the there isn't already a pending write do we prepare the client to write */ serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); prepareClientToWrite(replica); - } if (fAsyncWrite) ProcessPendingAsyncWrites();
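
The offset-to-index mapping in getReplIndexFromOffset and the skip/len arithmetic in addReplyReplicationBacklog can be modelled in isolation. The following is a minimal standalone sketch, not the server code: BacklogModel, its field names, and the values in main are hypothetical, and it assumes a backlog whose byte at index 0 corresponds to the absolute offset backlog_start.

#include <cassert>
#include <cstdio>

/* Hypothetical standalone model of the replication backlog arithmetic.
 * The field names loosely mirror the patch, but this is not the server struct. */
struct BacklogModel {
    long long backlog_start;      /* absolute repl offset stored at buffer index 0 */
    long long backlog_size;       /* size of the circular buffer in bytes */
    long long backlog_off;        /* oldest absolute offset still held in the buffer */
    long long backlog_histlen;    /* number of valid bytes currently in the buffer */
    long long master_repl_offset; /* absolute offset of the newest byte produced */

    /* Map an absolute replication offset to an index in the circular buffer,
     * the same modular reduction the patch performs. */
    long long indexFromOffset(long long offset) const {
        return (offset - backlog_start) % backlog_size;
    }

    /* Given a replica's requested PSYNC offset, compute how many bytes of
     * history must be skipped and how many remain to send. */
    long long replyLenForPsync(long long offset) const {
        assert(offset >= backlog_off);
        long long skip = offset - backlog_off;
        return backlog_histlen - skip;
    }
};

int main() {
    BacklogModel b{/*backlog_start*/ 0, /*backlog_size*/ 1 << 20,
                   /*backlog_off*/ 5000, /*backlog_histlen*/ 7000,
                   /*master_repl_offset*/ 12000};
    printf("index of offset 9000: %lld\n", b.indexFromOffset(9000));
    printf("bytes to send for PSYNC at 9000: %lld\n", b.replyLenForPsync(9000));
    return 0;
}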
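
Patch 20's central change is that a replica's pending write is no longer tracked with a flag but derived from the two offsets the client already stores. A minimal sketch of that idea, under the assumption that one side only advances the end offset (as flushReplBacklogToClients does) and the other only advances the current offset by bytes actually written (as writeToClient does); ReplicaCursor and the helper names are hypothetical.

#include <algorithm>
#include <cassert>

/* A replica has a pending write exactly when the offset it has sent up to
 * lags the offset it is allowed to send up to, so no boolean flag is needed. */
struct ReplicaCursor {
    long long repl_curr_off = -1; /* last offset already sent to this replica */
    long long repl_end_off  = -1; /* offset the replica may be fed up to */

    bool FPendingReplicaWrite() const {
        return repl_curr_off != repl_end_off;
    }
};

/* Feeding side (analogous to flushReplBacklogToClients): only advance the
 * end offset, never copy data into a per-client buffer. */
void markNewBacklogData(ReplicaCursor &r, long long master_repl_offset) {
    r.repl_end_off = master_repl_offset;
}

/* Writing side (analogous to the backlog branch of writeToClient): advance
 * the current offset by however many bytes the socket accepted. */
void recordBytesWritten(ReplicaCursor &r, long long nwritten) {
    r.repl_curr_off = std::min(r.repl_curr_off + nwritten, r.repl_end_off);
}

int main() {
    ReplicaCursor r{100, 100};
    assert(!r.FPendingReplicaWrite());
    markNewBacklogData(r, 160);   /* 60 new bytes appended to the backlog */
    assert(r.FPendingReplicaWrite());
    recordBytesWritten(r, 25);    /* partial write: still pending */
    assert(r.FPendingReplicaWrite());
    recordBytesWritten(r, 35);    /* remainder of the range: no longer pending */
    assert(!r.FPendingReplicaWrite());
    return 0;
}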
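
The backlog branch of writeToClient sends at most two segments when the index range wraps around the end of the circular buffer. The sketch below isolates just that split; segmentsToWrite and the demo buffer are hypothetical stand-ins, and a real caller would hand each segment to connWrite in order.

#include <cstddef>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

/* Given a circular buffer of 'size' bytes and an index range [curr_idx, end_idx),
 * return the one or two contiguous (start, len) segments that cover it.
 * Two segments are only needed when the range wraps past the end of the buffer. */
std::vector<std::pair<long long, long long>>
segmentsToWrite(long long curr_idx, long long end_idx, long long size) {
    std::vector<std::pair<long long, long long>> segs;
    if (end_idx >= curr_idx) {                  /* common case: no wrap */
        if (end_idx > curr_idx) segs.push_back({curr_idx, end_idx - curr_idx});
    } else {                                    /* rare case: wrap around */
        segs.push_back({curr_idx, size - curr_idx});
        if (end_idx > 0) segs.push_back({0, end_idx});
    }
    return segs;
}

int main() {
    const std::string backlog = "0123456789";   /* stand-in for repl_backlog */
    /* Range starts near the tail and wraps to the head: indices 7,8,9 then 0,1,2. */
    for (auto seg : segmentsToWrite(7, 3, (long long)backlog.size()))
        printf("write %lld bytes starting at index %lld: %s\n",
               seg.second, seg.first,
               backlog.substr((size_t)seg.first, (size_t)seg.second).c_str());
    return 0;
}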
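
Patch 21 narrows the scope of lockPendingWrite to the std::move of the pending-writes vector, so the per-client locks taken afterwards cannot invert against it. A standalone sketch of that pattern, with hypothetical names (pending_mutex, FakeClient, drainPendingClients) standing in for the server's per-thread state.

#include <cstdio>
#include <mutex>
#include <string>
#include <utility>
#include <vector>

/* Hypothetical stand-ins for the per-thread pending-write list and the
 * per-client lock that writeToClient would normally take. */
struct FakeClient {
    std::mutex lock;
    std::string name;
};

std::mutex pending_mutex;                  /* analogous to lockPendingWrite */
std::vector<FakeClient*> pending_clients;  /* analogous to clients_pending_write */

void drainPendingClients() {
    /* Hold the list lock only long enough to steal the vector, then release it
     * before taking any per-client lock; this ordering prevents the inversion. */
    std::unique_lock<std::mutex> lockf(pending_mutex);
    auto vec = std::move(pending_clients);
    pending_clients.clear();               /* leave the moved-from vector in a known state */
    lockf.unlock();

    for (FakeClient *c : vec) {
        std::unique_lock<std::mutex> cl(c->lock);
        printf("flushing output for client %s\n", c->name.c_str());
    }
}

int main() {
    FakeClient a, b;
    a.name = "replica-1";
    b.name = "replica-2";
    {
        std::unique_lock<std::mutex> lockf(pending_mutex);
        pending_clients.push_back(&a);
        pending_clients.push_back(&b);
    }
    drainPendingClients();
    return 0;
}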