Merge branch 'keydbpro' into keydbpro_collab

Former-commit-id: ecc69952dfd1f145e1aff12bca56a4b4e102d669
2021-06-25 06:21:58 +00:00 · 2021-06-25 06:21:58 +00:00 · d55bcf23bd
commit d55bcf23bd
parent 7523c29cec ae94ce3438
9 changed files with 251 additions and 99 deletions
--- a/src/Makefile
+++ b/src/Makefile
@ -15,7 +15,7 @@
 release_hdr := $(shell sh -c './mkreleasehdr.sh')
 uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
 uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
-OPTIMIZATION?=-O2
+OPTIMIZATION?=-O2 -flto
 DEPENDENCY_TARGETS=hiredis linenoise lua hdr_histogram rocksdb
 NODEPS:=clean distclean

@ -349,9 +349,9 @@ endif

 REDIS_SERVER_NAME=keydb-server$(PROG_SUFFIX)
 REDIS_SENTINEL_NAME=keydb-sentinel$(PROG_SUFFIX)
-REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o t_nhash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o acl.o storage.o rdb-s3.o fastlock.o new.o tracking.o cron.o connection.o tls.o sha256.o motd.o timeout.o setcpuaffinity.o AsyncWorkQueue.o snapshot.o storage/rocksdb.o storage/rocksdbfactory.o storage/teststorageprovider.o keydbutils.o StorageCache.o monotonic.o cli_common.o mt19937-64.o $(ASM_OBJ)
+REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o t_nhash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o acl.o storage.o rdb-s3.o fastlock.o new.o tracking.o cron.o connection.o tls.o sha256.o motd_server.o timeout.o setcpuaffinity.o AsyncWorkQueue.o snapshot.o storage/rocksdb.o storage/rocksdbfactory.o storage/teststorageprovider.o keydbutils.o StorageCache.o monotonic.o cli_common.o mt19937-64.o $(ASM_OBJ)
 REDIS_CLI_NAME=keydb-cli$(PROG_SUFFIX)
-REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o redis-cli-cpphelper.o zmalloc.o release.o anet.o ae.o crcspeed.o crc64.o siphash.o crc16.o storage-lite.o fastlock.o motd.o monotonic.o cli_common.o mt19937-64.o $(ASM_OBJ)
+REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o redis-cli-cpphelper.o zmalloc.o release.o anet.o ae.o crcspeed.o crc64.o siphash.o crc16.o storage-lite.o fastlock.o motd_client.o monotonic.o cli_common.o mt19937-64.o $(ASM_OBJ)
 REDIS_BENCHMARK_NAME=keydb-benchmark$(PROG_SUFFIX)
 REDIS_BENCHMARK_OBJ=ae.o anet.o redis-benchmark.o adlist.o dict.o zmalloc.o release.o crcspeed.o crc64.o siphash.o redis-benchmark.o storage-lite.o fastlock.o new.o monotonic.o cli_common.o mt19937-64.o $(ASM_OBJ)
 REDIS_CHECK_RDB_NAME=keydb-check-rdb$(PROG_SUFFIX)
@ -435,6 +435,12 @@ DEP = $(REDIS_SERVER_OBJ:%.o=%.d) $(REDIS_CLI_OBJ:%.o=%.d) $(REDIS_BENCHMARK_OBJ
 # Because the jemalloc.h header is generated as a part of the jemalloc build,
 # building it should complete before building any other object. Instead of
 # depending on a single artifact, build all dependencies first.
+motd_client.o: motd.cpp .make-prerequisites
+	$(REDIS_CXX) -MMD -o motd_client.o -c $< -DCLIENT -fno-lto
+
+motd_server.o: motd.cpp .make-prerequisites
+	$(REDIS_CXX) -MMD -o motd_server.o -c $< -DSERVER
+
 %.o: %.c .make-prerequisites
 	$(REDIS_CC) -MMD -o $@ -c $<

--- a/src/db.cpp
+++ b/src/db.cpp
@ -2726,6 +2726,8 @@ void redisDbPersistentData::ensure(const char *sdsKey, dictEntry **pde)
    serverAssert(sdsKey != nullptr);
    serverAssert(FImplies(*pde != nullptr, dictGetVal(*pde) != nullptr));    // early versions set a NULL object, this is no longer valid
    serverAssert(m_refCount == 0);
+    if (m_pdbSnapshot == nullptr && g_pserver->m_pstorageFactory == nullptr)
+        return;
    std::unique_lock<fastlock> ul(g_expireLock);

    // First see if the key can be obtained from a snapshot
--- a/src/evict.cpp
+++ b/src/evict.cpp
@ -355,6 +355,8 @@ unsigned long LFUDecrAndReturn(robj_roptr o) {
    return counter;
 }

+unsigned long getClientReplicationBacklogSharedUsage(client *c);
+
 /* We don't want to count AOF buffers and slaves output buffers as
 * used memory: the eviction should use mostly data size. This function
 * returns the sum of AOF and slaves buffer. */
@ -371,9 +373,15 @@ size_t freeMemoryGetNotCountedMemory(void) {
        while((ln = listNext(&li))) {
            client *replica = (client*)listNodeValue(ln);
            std::unique_lock<fastlock>(replica->lock);
-            overhead += getClientOutputBufferMemoryUsage(replica);
+            /* we don't wish to multiple count the replication backlog shared usage */
+            overhead += (getClientOutputBufferMemoryUsage(replica) - getClientReplicationBacklogSharedUsage(replica));
        }
    }
+
+    /* also don't count the replication backlog memory
+     * that's where the replication clients get their memory from */
+    overhead += g_pserver->repl_backlog_size;
+
    if (g_pserver->aof_state != AOF_OFF) {
        overhead += sdsalloc(g_pserver->aof_buf)+aofRewriteBufferSize();
    }
--- a/src/motd.cpp
+++ b/src/motd.cpp
@ -1,7 +1,11 @@
+#ifdef CLIENT
 extern "C" {
 #include <sdscompat.h>
 #include <sds.h>
 }
+#else
+#include "sds.h"
+#endif
 #include <cstring>
 #include <unistd.h>
 #include <sys/types.h>
@ -15,6 +19,7 @@ extern "C" {
 #ifdef MOTD
 #include <curl/curl.h> 

+#ifdef CLIENT
 extern "C" {
 __attribute__ ((weak)) hisds hi_sdscatlen(hisds s, const void *t, size_t len) {
    return sdscatlen(s, t, len);
@ -23,6 +28,7 @@ __attribute__ ((weak)) hisds hi_sdscat(hisds s, const char *t) {
    return sdscat(s, t);
 }
 }
+#endif

 static const char *szMotdCachePath()
 {
--- a/src/networking.cpp
+++ b/src/networking.cpp
@ -136,6 +136,7 @@ client *createClient(connection *conn, int iel) {
    client_id = g_pserver->next_client_id.fetch_add(1);
    c->iel = iel;
    c->id = client_id;
+    sprintf(c->lock.szName, "client %lu", client_id);
    c->resp = 2;
    c->conn = conn;
    c->name = NULL;
@ -157,6 +158,7 @@ client *createClient(connection *conn, int iel) {
    c->flags = 0;
    c->fPendingAsyncWrite = FALSE;
    c->fPendingAsyncWriteHandler = FALSE;
+    c->fPendingReplicaWrite = FALSE;
    c->ctime = c->lastinteraction = g_pserver->unixtime;
    /* If the default user does not require authentication, the user is
     * directly authenticated. */
@ -234,6 +236,7 @@ void clientInstallWriteHandler(client *c) {
    /* Schedule the client to write the output buffers to the socket only
     * if not already done and, for slaves, if the replica can actually receive
     * writes at this stage. */
+
    if (!(c->flags & CLIENT_PENDING_WRITE) &&
        (c->replstate == REPL_STATE_NONE ||
         (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack)))
@ -315,7 +318,7 @@ int prepareClientToWrite(client *c) {

    /* Schedule the client to write the output buffers to the socket, unless
     * it should already be setup to do so (it has already pending data). */
-    if (!fAsync && !clientHasPendingReplies(c)) clientInstallWriteHandler(c);
+    if (!fAsync && !clientHasPendingReplies(c) && !c->fPendingReplicaWrite) clientInstallWriteHandler(c);
    if (fAsync && !(c->fPendingAsyncWrite)) clientInstallAsyncWriteHandler(c);

    /* Authorize the caller to queue in the output buffer of this client. */
@ -1764,6 +1767,8 @@ client *lookupClientByID(uint64_t id) {
    return (c == raxNotFound) ? NULL : c;
 }

+long long getReplIndexFromOffset(long long offset);
+
 /* Write data in output buffers to client. Return C_OK if the client
 * is still valid after the call, C_ERR if it was freed because of some
 * error.  If handler_installed is set, it will attempt to clear the
@ -1783,6 +1788,7 @@ int writeToClient(client *c, int handler_installed) {
    std::unique_lock<decltype(c->lock)> lock(c->lock);

    while(clientHasPendingReplies(c)) {
+        serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR);
        if (c->bufpos > 0) {
            nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen);
            if (nwritten <= 0) break;
@ -1790,7 +1796,7 @@ int writeToClient(client *c, int handler_installed) {
            totwritten += nwritten;

            /* If the buffer was sent, set bufpos to zero to continue with
-             * the remainder of the reply. */
+            * the remainder of the reply. */
            if ((int)c->sentlen == c->bufpos) {
                c->bufpos = 0;
                c->sentlen = 0;
@ -1837,6 +1843,48 @@ int writeToClient(client *c, int handler_installed) {
            !(c->flags & CLIENT_SLAVE)) break;
    }

+    /* We can only directly read from the replication backlog if the client 
+       is a replica, so only attempt to do so if that's the case. */
+    if (c->flags & CLIENT_SLAVE && !(c->flags & CLIENT_MONITOR)) {
+
+        std::unique_lock<fastlock> repl_backlog_lock (g_pserver->repl_backlog_lock);
+        long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off);
+        serverAssert(c->repl_curr_off != -1);
+
+        if (c->repl_curr_off != c->repl_end_off){
+            long long repl_curr_idx = getReplIndexFromOffset(c->repl_curr_off); 
+            long long nwritten2ndStage = 0; /* How much was written from the start of the replication backlog
+                                             * in the event of a wrap around write */
+            /* normal case with no wrap around */
+            if (repl_end_idx >= repl_curr_idx){
+                nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, repl_end_idx - repl_curr_idx);
+            /* wrap around case */
+            } else {
+                nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, g_pserver->repl_backlog_size - repl_curr_idx);
+                /* only attempt wrapping if we write the correct number of bytes */
+                if (nwritten == g_pserver->repl_backlog_size - repl_curr_idx){
+                    nwritten2ndStage = connWrite(c->conn, g_pserver->repl_backlog, repl_end_idx);
+                    if (nwritten2ndStage != -1)
+                        nwritten += nwritten2ndStage;
+                }                
+            }
+
+            /* only increment bytes if an error didn't occur */
+            if (nwritten > 0){
+                totwritten += nwritten;
+                c->repl_curr_off += nwritten;
+                serverAssert(c->repl_curr_off <= c->repl_end_off);
+                /* If the client's current offset matches the last offset it can read from, there is no pending write */
+                if (c->repl_curr_off == c->repl_end_off){
+                    c->fPendingReplicaWrite = false;
+                }
+            }
+
+            /* If the second part of a write didn't go through, we still need to register that */
+            if (nwritten2ndStage == -1) nwritten = -1;
+        }
+    }
+
    g_pserver->stat_net_output_bytes += totwritten;
    if (nwritten == -1) {
        if (connGetState(c->conn) != CONN_STATE_CONNECTED) {
@ -1854,7 +1902,7 @@ int writeToClient(client *c, int handler_installed) {
         * We just rely on data / pings received for timeout detection. */
        if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime;
    }
-    if (!clientHasPendingReplies(c)) {
+    if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) {
        c->sentlen = 0;
        if (handler_installed) connSetWriteHandler(c->conn, NULL);

@ -1898,27 +1946,37 @@ void ProcessPendingAsyncWrites()
        serverAssert(c->fPendingAsyncWrite);
        if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_CLOSE_AFTER_REPLY))
        {
-            zfree(c->replyAsync);
-            c->replyAsync = nullptr;
+            if (c->replyAsync != nullptr){
+                zfree(c->replyAsync);
+                c->replyAsync = nullptr;
+            }
            c->fPendingAsyncWrite = FALSE;
            continue;
        }

-        int size = c->replyAsync->used;
+        /* since writes from master to replica can come directly from the replication backlog,
+         * writes may have been signalled without having been copied to the replyAsync buffer,
+         * thus causing the buffer to be NULL */ 
+        if (c->replyAsync != nullptr){
+            int size = c->replyAsync->used;

-        if (listLength(c->reply) == 0 && size <= (PROTO_REPLY_CHUNK_BYTES - c->bufpos)) {
-            memcpy(c->buf + c->bufpos, c->replyAsync->buf(), size);
-            c->bufpos += size;
-        } else {
-            c->reply_bytes += c->replyAsync->size;
-            listAddNodeTail(c->reply, c->replyAsync);
+            if (listLength(c->reply) == 0 && size <= (PROTO_REPLY_CHUNK_BYTES - c->bufpos)) {
+                memcpy(c->buf + c->bufpos, c->replyAsync->buf(), size);
+                c->bufpos += size;
+            } else {
+                c->reply_bytes += c->replyAsync->size;
+                listAddNodeTail(c->reply, c->replyAsync);
+                c->replyAsync = nullptr;
+            }
+
+            zfree(c->replyAsync);
            c->replyAsync = nullptr;
+        } else {
+            /* Only replicas should have empty async reply buffers */
+            serverAssert(c->flags & CLIENT_SLAVE);
        }

-        zfree(c->replyAsync);
-        c->replyAsync = nullptr;
        c->fPendingAsyncWrite = FALSE;
-
        // Now install the write event handler
        int ae_flags = AE_WRITABLE|AE_WRITE_THREADSAFE;
        /* For the fsync=always policy, we want that a given FD is never
@ -2024,9 +2082,10 @@ int handleClientsWithPendingWrites(int iel, int aof_state) {

        /* If after the synchronous writes above we still have data to
        * output to the client, we need to install the writable handler. */
-        if (clientHasPendingReplies(c)) {
-            if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) 
+        if (clientHasPendingReplies(c) || c->fPendingReplicaWrite) {
+            if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) {
                freeClientAsync(c);
+            }
        }
    }

@ -3647,6 +3706,12 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) {
    }
 }

+/* In the case of a replica client, writes to said replica are using data from the replication backlog
+ * as opposed to it's own internal buffer, this number should keep track of that */
+unsigned long getClientReplicationBacklogSharedUsage(client *c) {
+    return (!(c->flags & CLIENT_SLAVE) || !c->fPendingReplicaWrite ) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off;
+}
+
 /* This function returns the number of bytes that Redis is
 * using to store the reply still not read by the client.
 *
@ -3655,9 +3720,11 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) {
 * enforcing the client output length limits. */
 unsigned long getClientOutputBufferMemoryUsage(client *c) {
    unsigned long list_item_size = sizeof(listNode) + sizeof(clientReplyBlock);
-    return c->reply_bytes + (list_item_size*listLength(c->reply)) + (c->replyAsync ? c->replyAsync->size : 0);
+    return c->reply_bytes + (list_item_size*listLength(c->reply)) + (c->replyAsync ? c->replyAsync->size : 0) + getClientReplicationBacklogSharedUsage(c);
 }

+
+
 /* Get the class of a client, used in order to enforce limits to different
 * classes of clients.
 *
--- a/src/replication.cpp
+++ b/src/replication.cpp
@ -189,6 +189,7 @@ void createReplicationBacklog(void) {
    g_pserver->repl_backlog = (char*)zmalloc(g_pserver->repl_backlog_size, MALLOC_LOCAL);
    g_pserver->repl_backlog_histlen = 0;
    g_pserver->repl_backlog_idx = 0;
+    g_pserver->repl_backlog_start = g_pserver->master_repl_offset;

    /* We don't have any data inside our buffer, but virtually the first
     * byte we have is the next byte that will be generated for the
@ -200,6 +201,15 @@ void createReplicationBacklog(void) {
    g_pserver->repl_batch_offStart = g_pserver->master_repl_offset;
 }

+/* Compute the corresponding index from a replication backlog offset 
+ * Since this computation needs the size of the replication backlog, 
+ * you need to have the repl_backlog_lock in order to call it */
+long long getReplIndexFromOffset(long long offset){
+    serverAssert(g_pserver->repl_backlog_lock.fOwnLock());
+    long long index = (offset - g_pserver->repl_backlog_start) % g_pserver->repl_backlog_size;
+    return index;
+}
+
 /* This function is called when the user modifies the replication backlog
 * size at runtime. It is up to the function to both update the
 * g_pserver->repl_backlog_size and to resize the buffer and setup it so that
@ -211,6 +221,8 @@ void resizeReplicationBacklog(long long newsize) {
        newsize = CONFIG_REPL_BACKLOG_MIN_SIZE;
    if (g_pserver->repl_backlog_size == newsize) return;

+    std::unique_lock<fastlock> repl_backlog_lock (g_pserver->repl_backlog_lock);
+
    if (g_pserver->repl_backlog != NULL) {
        /* What we actually do is to flush the old buffer and realloc a new
         * empty one. It will refill with new data incrementally.
@ -218,19 +230,23 @@ void resizeReplicationBacklog(long long newsize) {
         * worse often we need to alloc additional space before freeing the
         * old buffer. */

-        if (g_pserver->repl_batch_idxStart >= 0) {
-            // We need to keep critical data so we can't shrink less than the hot data in the buffer
-            newsize = std::max(newsize, g_pserver->master_repl_offset - g_pserver->repl_batch_offStart);
-            char *backlog = (char*)zmalloc(newsize);
-            g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - g_pserver->repl_batch_offStart;
+        /* get the critical client size, i.e. the size of the data unflushed to clients */
+        long long earliest_off = g_pserver->repl_lowest_off.load();

-            if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) {
-                auto cbActiveBacklog = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart;
-                memcpy(backlog, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbActiveBacklog);
+        if (earliest_off != -1) {
+            // We need to keep critical data so we can't shrink less than the hot data in the buffer
+            newsize = std::max(newsize, g_pserver->master_repl_offset - earliest_off);
+            char *backlog = (char*)zmalloc(newsize);
+            g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - earliest_off;
+            long long earliest_idx = getReplIndexFromOffset(earliest_off);
+
+            if (g_pserver->repl_backlog_idx >= earliest_idx) {
+                auto cbActiveBacklog = g_pserver->repl_backlog_idx - earliest_idx;
+                memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbActiveBacklog);
                serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog);
            } else {
-                auto cbPhase1 = g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart;
-                memcpy(backlog, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1);
+                auto cbPhase1 = g_pserver->repl_backlog_size - earliest_idx;
+                memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbPhase1);
                memcpy(backlog + cbPhase1, g_pserver->repl_backlog, g_pserver->repl_backlog_idx);
                auto cbActiveBacklog = cbPhase1 + g_pserver->repl_backlog_idx;
                serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog);
@ -238,7 +254,10 @@ void resizeReplicationBacklog(long long newsize) {
            zfree(g_pserver->repl_backlog);
            g_pserver->repl_backlog = backlog;
            g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen;
-            g_pserver->repl_batch_idxStart = 0;
+            g_pserver->repl_batch_idxStart -= earliest_idx;
+            if (g_pserver->repl_batch_idxStart < 0)
+                g_pserver->repl_batch_idxStart += g_pserver->repl_backlog_size;
+            g_pserver->repl_backlog_start = earliest_off;
        } else {
            zfree(g_pserver->repl_backlog);
            g_pserver->repl_backlog = (char*)zmalloc(newsize);
@ -246,11 +265,13 @@ void resizeReplicationBacklog(long long newsize) {
            g_pserver->repl_backlog_idx = 0;
            /* Next byte we have is... the next since the buffer is empty. */
            g_pserver->repl_backlog_off = g_pserver->master_repl_offset+1;
+            g_pserver->repl_backlog_start = g_pserver->master_repl_offset;
        }
    }
    g_pserver->repl_backlog_size = newsize;
 }

+
 void freeReplicationBacklog(void) {
    serverAssert(GlobalLocksAcquired());
    listIter li;
@ -273,12 +294,20 @@ void feedReplicationBacklog(const void *ptr, size_t len) {
    serverAssert(GlobalLocksAcquired());
    const unsigned char *p = (const unsigned char*)ptr;

+
    if (g_pserver->repl_batch_idxStart >= 0) {
-        long long minimumsize = g_pserver->master_repl_offset + len - g_pserver->repl_batch_offStart+1;
+        /* We are lower bounded by the lowest replica offset, or the batch offset start if not applicable */
+        long long lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst);
+        if (lower_bound == -1)
+            lower_bound = g_pserver->repl_batch_offStart;
+        long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1;
        if (minimumsize > g_pserver->repl_backlog_size) {
            flushReplBacklogToClients();
-            serverAssert(g_pserver->master_repl_offset == g_pserver->repl_batch_offStart);
-            minimumsize = g_pserver->master_repl_offset + len - g_pserver->repl_batch_offStart+1;
+            lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst);
+            if (lower_bound == -1)
+                lower_bound = g_pserver->repl_batch_offStart;
+
+            minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1;

            if (minimumsize > g_pserver->repl_backlog_size && minimumsize < (long long)cserver.client_obuf_limits[CLIENT_TYPE_SLAVE].hard_limit_bytes) {
                // This is an emergency overflow, we better resize to fit
@ -293,6 +322,7 @@ void feedReplicationBacklog(const void *ptr, size_t len) {

    /* This is a circular buffer, so write as much data we can at every
     * iteration and rewind the "idx" index if we reach the limit. */
+    
    while(len) {
        size_t thislen = g_pserver->repl_backlog_size - g_pserver->repl_backlog_idx;
        if (thislen > len) thislen = len;
@ -479,7 +509,6 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc)
        if (fSendRaw)
        {
            char aux[LONG_STR_SIZE+3];
-
            /* Add the multi bulk reply length. */
            aux[0] = '*';
            int multilen = ll2string(aux+1,sizeof(aux)-1,argc);
@ -653,15 +682,19 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv,
    decrRefCount(cmdobj);
 }

+int prepareClientToWrite(client *c);
+
 /* Feed the replica 'c' with the replication backlog starting from the
 * specified 'offset' up to the end of the backlog. */
 long long addReplyReplicationBacklog(client *c, long long offset) {
-    long long j, skip, len;
+    long long skip, len;

    serverLog(LL_DEBUG, "[PSYNC] Replica request offset: %lld", offset);

    if (g_pserver->repl_backlog_histlen == 0) {
        serverLog(LL_DEBUG, "[PSYNC] Backlog history len is zero");
+        c->repl_curr_off = g_pserver->master_repl_offset;
+        c->repl_end_off = g_pserver->master_repl_offset;
        return 0;
    }

@ -678,31 +711,20 @@ long long addReplyReplicationBacklog(client *c, long long offset) {
    skip = offset - g_pserver->repl_backlog_off;
    serverLog(LL_DEBUG, "[PSYNC] Skipping: %lld", skip);

-    /* Point j to the oldest byte, that is actually our
-     * g_pserver->repl_backlog_off byte. */
-    j = (g_pserver->repl_backlog_idx +
-        (g_pserver->repl_backlog_size-g_pserver->repl_backlog_histlen)) %
-        g_pserver->repl_backlog_size;
-    serverLog(LL_DEBUG, "[PSYNC] Index of first byte: %lld", j);
-
-    /* Discard the amount of data to seek to the specified 'offset'. */
-    j = (j + skip) % g_pserver->repl_backlog_size;
-
-    /* Feed replica with data. Since it is a circular buffer we have to
-     * split the reply in two parts if we are cross-boundary. */
    len = g_pserver->repl_backlog_histlen - skip;
    serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len);
-    while(len) {
-        long long thislen =
-            ((g_pserver->repl_backlog_size - j) < len) ?
-            (g_pserver->repl_backlog_size - j) : len;

-        serverLog(LL_DEBUG, "[PSYNC] addReply() length: %lld", thislen);
-        addReplySds(c,sdsnewlen(g_pserver->repl_backlog + j, thislen));
-        len -= thislen;
-        j = 0;
-    }
-    return g_pserver->repl_backlog_histlen - skip;
+    /* Set the start and end offsets for the replica so that a future
+     * writeToClient will send the backlog from the given offset to 
+     * the current end of the backlog to said replica */
+    c->repl_curr_off = offset - 1;
+    c->repl_end_off = g_pserver->master_repl_offset;
+
+    /* Force the partial sync to be queued */
+    prepareClientToWrite(c);
+    c->fPendingReplicaWrite = true; 
+
+    return len;
 }

 /* Return the offset to provide as reply to the PSYNC command received
@ -735,6 +757,10 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) {

    replica->psync_initial_offset = offset;
    replica->replstate = SLAVE_STATE_WAIT_BGSAVE_END;
+
+    replica->repl_curr_off = offset;
+    replica->repl_end_off = g_pserver->master_repl_offset;
+
    /* We are going to accumulate the incremental changes for this
     * replica as well. Set replicaseldb to -1 in order to force to re-emit
     * a SELECT statement in the replication stream. */
@ -1357,6 +1383,7 @@ void replconfCommand(client *c) {
 * 4) Update the count of "good replicas". */
 void putSlaveOnline(client *replica) {
    replica->replstate = SLAVE_STATE_ONLINE;
+    
    replica->repl_put_online_on_ack = 0;
    replica->repl_ack_time = g_pserver->unixtime; /* Prevent false timeout. */

@ -3059,6 +3086,11 @@ void syncWithMaster(connection *conn) {

    if (psync_result == PSYNC_CONTINUE) {
        serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Master accepted a Partial Resynchronization.");
+        /* Reset the bulklen information in case it is lingering from the last connection
+         * The partial sync will start from the beginning of a command so these should be reset */
+        mi->master->reqtype = 0;
+        mi->master->multibulklen = 0;
+        mi->master->bulklen = -1;
        if (cserver.supervised_mode == SUPERVISED_SYSTEMD) {
            redisCommunicateSystemd("STATUS=MASTER <-> REPLICA sync: Partial Resynchronization accepted. Ready to accept connections in read-write mode.\n");
        }
@ -4898,15 +4930,19 @@ void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long
 }

 void _clientAsyncReplyBufferReserve(client *c, size_t len);
+
 void flushReplBacklogToClients()
 {
    serverAssert(GlobalLocksAcquired());
+    /* If we have the repl backlog lock, we will deadlock */
+    serverAssert(!g_pserver->repl_backlog_lock.fOwnLock());
    if (g_pserver->repl_batch_offStart < 0)
        return;
    
    if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) {
        bool fAsyncWrite = false;
-        
+        long long min_offset = LONG_LONG_MAX;
+        // Ensure no overflow
        serverAssert(g_pserver->repl_batch_offStart < g_pserver->master_repl_offset);
        if (g_pserver->master_repl_offset - g_pserver->repl_batch_offStart > g_pserver->repl_backlog_size) {
            // We overflowed
@ -4930,33 +4966,36 @@ void flushReplBacklogToClients()
        listIter li;
        listNode *ln;
        listRewind(g_pserver->slaves, &li);
+        /* We don't actually write any data in this function since we send data 
+         * directly from the replication backlog to replicas in writeToClient.
+         * 
+         * What we do however, is set the end offset of each replica here. This way, 
+         * future calls to writeToClient will know up to where in the replication
+         * backlog is valid for writing. */  
        while ((ln = listNext(&li))) {
            client *replica = (client*)listNodeValue(ln);

            if (!canFeedReplicaReplBuffer(replica)) continue;
            if (replica->flags & CLIENT_CLOSE_ASAP) continue;

-            std::unique_lock<fastlock> ul(replica->lock, std::defer_lock);
-            if (FCorrectThread(replica))
-                ul.lock();
-            else
+            std::unique_lock<fastlock> ul(replica->lock);
+            if (!FCorrectThread(replica))
                fAsyncWrite = true;

-            if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) {
-                long long cbCopy = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart;
-                serverAssert((g_pserver->master_repl_offset - g_pserver->repl_batch_offStart) == cbCopy);
-                serverAssert((g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart) >= (cbCopy));
-                serverAssert((g_pserver->repl_batch_idxStart + cbCopy) <= g_pserver->repl_backlog_size);
+            /* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */
+            serverAssert(replica->repl_curr_off != -1);

-                addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbCopy);
-            } else {
-                auto cbPhase1 = g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart;
-                if (fAsyncWrite)
-                    _clientAsyncReplyBufferReserve(replica, cbPhase1 + g_pserver->repl_backlog_idx);
-                addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1);
-                addReplyProto(replica, g_pserver->repl_backlog, g_pserver->repl_backlog_idx);
-                serverAssert((cbPhase1 + g_pserver->repl_backlog_idx) == (g_pserver->master_repl_offset - g_pserver->repl_batch_offStart));
+            min_offset = std::min(min_offset, replica->repl_curr_off);      
+
+            replica->repl_end_off = g_pserver->master_repl_offset;
+
+            /* Only if the there isn't already a pending write do we prepare the client to write */
+            if (!replica->fPendingReplicaWrite){
+                serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset);
+                prepareClientToWrite(replica);
+                replica->fPendingReplicaWrite = true; 
            }
+
        }
        if (fAsyncWrite)
            ProcessPendingAsyncWrites();
@ -4965,6 +5004,7 @@ LDone:
        // This may be called multiple times per "frame" so update with our progress flushing to clients
        g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx;
        g_pserver->repl_batch_offStart = g_pserver->master_repl_offset;
+        g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? -1 : min_offset, std::memory_order_seq_cst);
    } 
 }

--- a/src/server.cpp
+++ b/src/server.cpp
@ -2021,7 +2021,6 @@ void clientsCron(int iel) {
    while(listLength(g_pserver->clients) && iterations--) {
        client *c;
        listNode *head;
-
        /* Rotate the list, take the current head, process.
         * This way if the client must be removed from the list it's the
         * first element and we don't incur into O(N) computation. */
@ -3261,6 +3260,7 @@ void initServerConfig(void) {
    g_pserver->enable_multimaster = CONFIG_DEFAULT_ENABLE_MULTIMASTER;
    g_pserver->repl_syncio_timeout = CONFIG_REPL_SYNCIO_TIMEOUT;
    g_pserver->master_repl_offset = 0;
+    g_pserver->repl_lowest_off.store(-1, std::memory_order_seq_cst);

    /* Replication partial resync backlog */
    g_pserver->repl_backlog = NULL;
@ -7012,9 +7012,10 @@ void OnTerminate()
 void wakeTimeThread() {
    updateCachedTime();
    std::lock_guard<std::mutex> lock(time_thread_mutex);
+    if (sleeping_threads >= cserver.cthreads)
+        time_thread_cv.notify_one();
    sleeping_threads--;
    serverAssert(sleeping_threads >= 0);
-    time_thread_cv.notify_one();
 }

 void *timeThreadMain(void*) {
--- a/src/server.h
+++ b/src/server.h
@ -1589,6 +1589,13 @@ struct client {
    long long psync_initial_offset; /* FULLRESYNC reply offset other slaves
                                       copying this replica output buffer
                                       should use. */
+                                       
+    long long repl_curr_off = -1;/* Replication offset of the replica, also where in the backlog we need to start from
+                                  * when sending data to this replica. */
+    long long repl_end_off = -1; /* Replication offset to write to, stored in the replica, as opposed to using the global offset 
+                                  * to prevent needing the global lock */
+    int fPendingReplicaWrite;    /* Is there a write queued for this replica? */
+
    char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */
    int slave_listening_port; /* As configured with: REPLCONF listening-port */
    char *slave_addr;       /* Optionally given by REPLCONF ip-address */
@ -2357,6 +2364,9 @@ struct redisServer {
                                       that is the next byte will'll write to.*/
    long long repl_backlog_off;     /* Replication "master offset" of first
                                       byte in the replication backlog buffer.*/
+    long long repl_backlog_start;   /* Used to compute indicies from offsets
+                                       basically, index = (offset - start) % size */
+    fastlock repl_backlog_lock {"replication backlog"};
    time_t repl_backlog_time_limit; /* Time without slaves after the backlog
                                       gets released. */
    time_t repl_no_slaves_since;    /* We have no slaves since that time.
@ -2368,6 +2378,8 @@ struct redisServer {
    int repl_diskless_load;         /* Slave parse RDB directly from the socket.
                                     * see REPL_DISKLESS_LOAD_* enum */
    int repl_diskless_sync_delay;   /* Delay to start a diskless repl BGSAVE. */
+    std::atomic <long long> repl_lowest_off; /* The lowest offset amongst all replicas
+                                                -1 if there are no replicas */
    /* Replication (replica) */
    list *masters;
    int enable_multimaster; 
@ -3713,6 +3725,8 @@ void mixDigest(unsigned char *digest, const void *ptr, size_t len);
 void xorDigest(unsigned char *digest, const void *ptr, size_t len);
 int populateCommandTableParseFlags(struct redisCommand *c, const char *strflags);

+
+
 int moduleGILAcquiredByModule(void);
 extern int g_fInCrash;
 static inline int GlobalLocksAcquired(void)  // Used in asserts to verify all global locks are correctly acquired for a server-thread to operate
@ -3780,6 +3794,7 @@ void tlsCleanup(void);
 int tlsConfigure(redisTLSContextConfig *ctx_config);


+
 class ShutdownException
 {};

@ -3791,3 +3806,5 @@ class ShutdownException
 int iAmMaster(void);

 #endif
+
+
--- a/tests/unit/maxmemory.tcl
+++ b/tests/unit/maxmemory.tcl
@ -33,7 +33,8 @@ start_server {tags {"maxmemory"}} {
            # Get the current memory limit and calculate a new limit.
            # We just add 100k to the current memory size so that it is
            # fast for us to reach that limit.
-            set used [s used_memory]
+            set overhead [s mem_not_counted_for_evict]
+            set used [expr [s used_memory] - $overhead]
            set limit [expr {$used+100*1024}]
            r config set maxmemory $limit
            r config set maxmemory-policy $policy
@ -42,7 +43,7 @@ start_server {tags {"maxmemory"}} {
            while 1 {
                r setex [randomKey] 10000 x
                incr numkeys
-                if {[s used_memory]+4096 > $limit} {
+                if {[expr {[s used_memory] - $overhead + 4096}] > $limit} {
                    assert {$numkeys > 10}
                    break
                }
@ -52,7 +53,8 @@ start_server {tags {"maxmemory"}} {
            for {set j 0} {$j < $numkeys} {incr j} {
                r setex [randomKey] 10000 x
            }
-            assert {[s used_memory] < ($limit+4096)}
+            set used_amt [expr [s used_memory] - $overhead]
+            assert {$used_amt < ($limit+4096)}
        }
    }

@ -65,7 +67,8 @@ start_server {tags {"maxmemory"}} {
            # Get the current memory limit and calculate a new limit.
            # We just add 100k to the current memory size so that it is
            # fast for us to reach that limit.
-            set used [s used_memory]
+            set overhead [s mem_not_counted_for_evict]
+            set used [expr [s used_memory] - $overhead]
            set limit [expr {$used+100*1024}]
            r config set maxmemory $limit
            r config set maxmemory-policy $policy
@ -74,7 +77,7 @@ start_server {tags {"maxmemory"}} {
            while 1 {
                r set [randomKey] x
                incr numkeys
-                if {[s used_memory]+4096 > $limit} {
+                if {[expr [s used_memory] - $overhead]+4096 > $limit} {
                    assert {$numkeys > 10}
                    break
                }
@ -91,7 +94,7 @@ start_server {tags {"maxmemory"}} {
                }
            }
            if {[string match allkeys-* $policy]} {
-                assert {[s used_memory] < ($limit+4096)}
+                assert {[expr [s used_memory] - $overhead] < ($limit+4096)}
            } else {
                assert {$err == 1}
            }
@ -107,7 +110,8 @@ start_server {tags {"maxmemory"}} {
            # Get the current memory limit and calculate a new limit.
            # We just add 100k to the current memory size so that it is
            # fast for us to reach that limit.
-            set used [s used_memory]
+            set overhead [s mem_not_counted_for_evict]
+            set used [expr [s used_memory] - $overhead]
            set limit [expr {$used+100*1024}]
            r config set maxmemory $limit
            r config set maxmemory-policy $policy
@ -121,7 +125,7 @@ start_server {tags {"maxmemory"}} {
                } else {
                    r set "key:$numkeys" x
                }
-                if {[s used_memory]+4096 > $limit} {
+                if {[expr [s used_memory] - $overhead]+4096 > $limit} {
                    assert {$numkeys > 10}
                    break
                }
@ -135,7 +139,7 @@ start_server {tags {"maxmemory"}} {
                catch {r setex "foo:$j" 10000 x}
            }
            # We should still be under the limit.
-            assert {[s used_memory] < ($limit+4096)}
+            assert {[expr [s used_memory] - $overhead] < ($limit+4096)}
            # However all our non volatile keys should be here.
            for {set j 0} {$j < $numkeys} {incr j 2} {
                assert {[r exists "key:$j"]}
@ -305,7 +309,8 @@ start_server {tags {"maxmemory"} overrides {server-threads 1}} {
        # we need to make sure to evict keynames of a total size of more than
        # 16kb since the (PROTO_REPLY_CHUNK_BYTES), only after that the
        # invalidation messages have a chance to trigger further eviction.
-        set used [s used_memory]
+        set overhead [s mem_not_counted_for_evict]
+        set used [expr [s used_memory] - $overhead]
        set limit [expr {$used - 40000}]
        r config set maxmemory $limit