From 05fe41b33a3b1ebc25a5c17748a5c00a3e0a2984 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Mon, 26 Apr 2021 22:13:32 +0000 Subject: [PATCH 01/22] Primitive implementation of bypassing client buffer, stats are all messed up and print statements everywhere Former-commit-id: 8ae310fb0f7b53add826f76891da333b63860001 --- src/networking.cpp | 159 +++++++++++++++++++++++++++++++++++++++----- src/replication.cpp | 156 +++++++++++++++++++++++++++++++++++++++++++ src/server.h | 11 +++ 3 files changed, 308 insertions(+), 18 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 574a8bc6c..18ab382bd 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -224,6 +224,7 @@ void clientInstallWriteHandler(client *c) { (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) { + // serverLog(LL_NOTICE, "we installing boyz"); AssertCorrectThread(c); serverAssert(c->lock.fOwnLock()); /* Here instead of installing the write handler, we just flag the @@ -301,7 +302,7 @@ int prepareClientToWrite(client *c) { /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ - if (!fAsync && !clientHasPendingReplies(c)) clientInstallWriteHandler(c); + if (!fAsync && !clientHasPendingReplies(c) && c->repl_curr_idx == -1) clientInstallWriteHandler(c); if (fAsync && !(c->fPendingAsyncWrite)) clientInstallAsyncWriteHandler(c); /* Authorize the caller to queue in the output buffer of this client. */ @@ -1676,15 +1677,33 @@ int writeToClient(client *c, int handler_installed) { std::unique_locklock)> lock(c->lock); + /* if this is a write to a replica, it's coming straight from the replication backlog */ + long long repl_backlog_idx = g_pserver->repl_backlog_idx; + + bool wroteFromClientBuffer = false; /* True if you wrote from the client buffer in this function call */ + while(clientHasPendingReplies(c)) { + wroteFromClientBuffer = true; + if (c->flags & CLIENT_SLAVE && listLength(c->reply) % 10 == 0){ + + serverLog(LL_NOTICE, "-----------------------------------------"); + serverLog(LL_NOTICE, "replica w/ pending replies, with a reply list size of: %lu", listLength(c->reply)); + serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + serverLog(LL_NOTICE, "-----------------------------------------"); + + } if (c->bufpos > 0) { + // serverLog(LL_NOTICE, "Sending reply %d", x); + // serverLog(LL_NOTICE, "SUSSUS AMOGUS, %ld", c->bufpos); nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); if (nwritten <= 0) break; c->sentlen += nwritten; totwritten += nwritten; /* If the buffer was sent, set bufpos to zero to continue with - * the remainder of the reply. */ + * the remainder of the reply. 
*/ + // serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); if ((int)c->sentlen == c->bufpos) { c->bufpos = 0; c->sentlen = 0; @@ -1714,23 +1733,112 @@ int writeToClient(client *c, int handler_installed) { } } /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT - * bytes, in a single threaded server it's a good idea to serve - * other clients as well, even if a very large request comes from - * super fast link that is always able to accept data (in real world - * scenario think about 'KEYS *' against the loopback interface). - * - * However if we are over the maxmemory limit we ignore that and - * just deliver as much data as it is possible to deliver. - * - * Moreover, we also send as much as possible if the client is - * a replica or a monitor (otherwise, on high-speed traffic, the - * replication/output buffer will grow indefinitely) */ + * bytes, in a single threaded server it's a good idea to serve + * other clients as well, even if a very large request comes from + * super fast link that is always able to accept data (in real world + * scenario think about 'KEYS *' against the loopback interface). + * + * However if we are over the maxmemory limit we ignore that and + * just deliver as much data as it is possible to deliver. + * + * Moreover, we also send as much as possible if the client is + * a replica or a monitor (otherwise, on high-speed traffic, the + * replication/output buffer will grow indefinitely) */ if (totwritten > NET_MAX_WRITES_PER_EVENT && (g_pserver->maxmemory == 0 || - zmalloc_used_memory() < g_pserver->maxmemory) && + zmalloc_used_memory() < g_pserver->maxmemory) && !(c->flags & CLIENT_SLAVE)) break; } - + + /* If there are no more pending replies, then we have transmitted the RDB. + * This means further replication commands will be taken straight from the + * replication backlog from now on. */ + if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c)){ + if (!c->transmittedRDB) + serverLog(LL_NOTICE, "---------->>>>>>>> TRANSMISSION OF THE RDB HAS COMPLETED <<<<<<<<----------"); + c->transmittedRDB = true; + } + + /* For replicas, we don't store all the information in the client buffer + * Most of the time (aside from immediately after synchronizing), we read + * from the replication backlog directly */ + if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1 && c->transmittedRDB){ + /* copy global variables into local scope so if they change in between we don't care */ + long long repl_backlog_size = g_pserver->repl_backlog_size; + long long nwrittenPart2 = 0; + + ssize_t nrequested; /* The number of bytes requested to write */ + /* normal case with no wrap around */ + if (repl_backlog_idx >= c->repl_curr_idx){ + nrequested = repl_backlog_idx - c->repl_curr_idx; + nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_idx - c->repl_curr_idx); + /* wrap around case, v. rare */ + /* also v. 
buggy so there's that */ + } else { + serverLog(LL_NOTICE, "WRAP CASE"); + serverLog(LL_NOTICE, "-----------------------------------------"); + serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + serverLog(LL_NOTICE, "actually written: %ld", nwritten); + serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); + serverLog(LL_NOTICE, "nwritten: %ld", nwritten); + serverLog(LL_NOTICE, "-----------------------------------------"); + + nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; + nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); + /* only attempt wrapping if we write the correct number of bytes */ + if (nwritten == repl_backlog_size - c->repl_curr_idx){ + serverLog(LL_NOTICE, "SECOND STAGE"); + serverLog(LL_NOTICE, "-----------------------------------------"); + serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + serverLog(LL_NOTICE, "actually written: %ld", nwritten); + serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); + serverLog(LL_NOTICE, "-----------------------------------------"); + + long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); + if (nwrittenPart2 != -1) + nwritten += nwrittenPart2; + + serverLog(LL_NOTICE, "nwrittenPart2: %lld", nwrittenPart2); + serverLog(LL_NOTICE, "-----------------------------------------"); + } else { + serverLog(LL_NOTICE, "SUPER SHORT"); + } + + } + + /* only update the replica's current index if bytes were sent */ + + // if (nrequested != nwritten){ + serverLog(LL_NOTICE, "-----------------------------------------"); + serverLog(LL_NOTICE, "AFTER THE FACT"); + serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + serverLog(LL_NOTICE, "actually written: %ld", nwritten); + serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + serverLog(LL_NOTICE, "-----------------------------------------"); + // } + + + if (nwritten == nrequested){ + c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ + } + else if (nwritten > 0) + c->repl_curr_idx = (c->repl_curr_idx + nwritten) % repl_backlog_size; + + serverAssert(c->repl_curr_idx < repl_backlog_size); + + /* only increment bytes if an error didn't occur */ + if (nwritten > 0){ + totwritten += nwritten; + c->repl_curr_off += nwritten; + } + + /* If the second part of a write didn't go through, we still need to register that */ + if (nwrittenPart2 == -1) nwritten = -1; + } + g_pserver->stat_net_output_bytes += totwritten; if (nwritten == -1) { if (connGetState(c->conn) == CONN_STATE_CONNECTED) { @@ -1750,7 +1858,7 @@ int writeToClient(client *c, int handler_installed) { * We just rely on data / pings received for timeout detection. 
*/ if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } - if (!clientHasPendingReplies(c)) { + if (!clientHasPendingReplies(c) && c->repl_curr_idx == -1) { c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -1904,6 +2012,12 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* Don't write to clients that are going to be closed anyway. */ if (c->flags & CLIENT_CLOSE_ASAP) continue; + // if (c->flags & CLIENT_SLAVE){ + // if(clientHasPendingReplies(c)) + // serverLog(LL_NOTICE, "somehow the client buffer has these values: %s", c->buf); + // serverLog(LL_NOTICE, "LOL"); + // } + /* Try to write buffers to the client socket. */ if (writeToClient(c,0) == C_ERR) { @@ -1920,7 +2034,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. */ - if (clientHasPendingReplies(c)) { + if (clientHasPendingReplies(c) || c->repl_curr_idx != -1) { if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) freeClientAsync(c); } @@ -3268,6 +3382,13 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { } } +/* In the case of a replica client, it is possible (and very likely) + * that writes to said replica are using data from the replication backlog + * as opposed to it's own internal buffer, this number should keep track of that */ +unsigned long getClientReplicationBacklogSharedUsage(client *c) { + return (c->repl_curr_idx == -1 && c->flags & CLIENT_SLAVE) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; +} + /* This function returns the number of bytes that Redis is * using to store the reply still not read by the client. * @@ -3276,9 +3397,11 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { * enforcing the client output length limits. */ unsigned long getClientOutputBufferMemoryUsage(client *c) { unsigned long list_item_size = sizeof(listNode) + sizeof(clientReplyBlock); - return c->reply_bytes + (list_item_size*listLength(c->reply)) + (c->replyAsync ? c->replyAsync->size : 0); + return c->reply_bytes + (list_item_size*listLength(c->reply)) + (c->replyAsync ? c->replyAsync->size : 0) + getClientReplicationBacklogSharedUsage(c); } + + /* Get the class of a client, used in order to enforce limits to different * classes of clients. * diff --git a/src/replication.cpp b/src/replication.cpp index 2533bae52..ccb538a69 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -47,6 +47,9 @@ #include #include +#define BYPASS_BUFFER +// #define BYPASS_PSYNC + void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, connection *conn); void replicationSendAck(redisMaster *mi); @@ -59,6 +62,18 @@ static void propagateMasterStaleKeys(); * the instance is configured to have no persistence. */ int RDBGeneratedByReplication = 0; +void resizeReplicationBacklogForClients(long long newsize); + +void setReplIdx(client *c, long long idx, long long off){ + if (prepareClientToWrite(c) != C_OK) return; + // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); + // serverLog(LL_NOTICE, "What is this value? 
%lld", c->repl_curr_idx); + if (c->repl_curr_idx == -1){ + c->repl_curr_idx = idx; + c->repl_curr_off = off; + } +} + /* --------------------------- Utility functions ---------------------------- */ /* Return the pointer to a string representing the replica ip:listening_port @@ -213,6 +228,8 @@ void resizeReplicationBacklog(long long newsize) { newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; if (g_pserver->repl_backlog_size == newsize) return; + serverLog(LL_NOTICE, "WE HAD TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); + if (g_pserver->repl_backlog != NULL) { /* What we actually do is to flush the old buffer and realloc a new * empty one. It will refill with new data incrementally. @@ -253,6 +270,80 @@ void resizeReplicationBacklog(long long newsize) { g_pserver->repl_backlog_size = newsize; } + +/* The above but for when clients need extra replication backlog because ??? */ +void resizeReplicationBacklogForClients(long long newsize) { + if (newsize < CONFIG_REPL_BACKLOG_MIN_SIZE) + newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; + if (g_pserver->repl_backlog_size == newsize) return; + + serverLog(LL_NOTICE, "WE HAD TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); + /* get the critical client size, i.e. the size of the data unflushed to clients */ + long long earliest_off = LONG_LONG_MAX; + long long earliest_idx = -1; + listIter li; + listNode *ln; + listRewind(g_pserver->slaves, &li); + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + if (replica->repl_curr_off != -1 && replica->repl_curr_off < earliest_off){ + earliest_off = replica->repl_curr_off; + earliest_idx = replica->repl_curr_idx; + } + } + + if (g_pserver->repl_backlog != NULL) { + /* What we actually do is to flush the old buffer and realloc a new + * empty one. It will refill with new data incrementally. + * The reason is that copying a few gigabytes adds latency and even + * worse often we need to alloc additional space before freeing the + * old buffer. 
*/ + + if (earliest_idx >= 0) { + // We need to keep critical data so we can't shrink less than the hot data in the buffer + newsize = std::max(newsize, g_pserver->master_repl_offset - earliest_off); + char *backlog = (char*)zmalloc(newsize); + g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - earliest_off; + + if (g_pserver->repl_backlog_idx >= earliest_idx) { + auto cbActiveBacklog = g_pserver->repl_backlog_idx - earliest_idx; + memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbActiveBacklog); + serverLog(LL_NOTICE, "g_pserver->master_repl_offset: %lld, earliest_off: %lld, g_pserver->repl_backlog_idx: %lld, earliest_idx: %lld", + g_pserver->master_repl_offset, earliest_off, g_pserver->repl_backlog_idx, earliest_idx); + serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); + } else { + auto cbPhase1 = g_pserver->repl_backlog_size - earliest_idx; + memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbPhase1); + memcpy(backlog + cbPhase1, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); + auto cbActiveBacklog = cbPhase1 + g_pserver->repl_backlog_idx; + serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); + } + zfree(g_pserver->repl_backlog); + g_pserver->repl_backlog = backlog; + g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; + listRewind(g_pserver->slaves, &li); + /* Go through the clients and update their replication indicies */ + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + if (replica->repl_curr_idx != -1){ + replica->repl_curr_idx -= earliest_idx; + if (replica->repl_curr_idx < 0) + replica->repl_curr_idx += g_pserver->repl_backlog_size; + } + } + g_pserver->repl_batch_idxStart = 0; + } else { + zfree(g_pserver->repl_backlog); + g_pserver->repl_backlog = (char*)zmalloc(newsize); + g_pserver->repl_backlog_histlen = 0; + g_pserver->repl_backlog_idx = 0; + /* Next byte we have is... the next since the buffer is empty. */ + g_pserver->repl_backlog_off = g_pserver->master_repl_offset+1; + } + } + g_pserver->repl_backlog_size = newsize; +} + void freeReplicationBacklog(void) { serverAssert(GlobalLocksAcquired()); listIter li; @@ -683,6 +774,10 @@ long long addReplyReplicationBacklog(client *c, long long offset) { * split the reply in two parts if we are cross-boundary. */ len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); + serverLog(LL_NOTICE, "Coming through from the addReplicationBacklog"); +#ifdef BYPASS_PSYNC + setReplIdx(c, j, offset); +#else while(len) { long long thislen = ((g_pserver->repl_backlog_size - j) < len) ? @@ -693,6 +788,8 @@ long long addReplyReplicationBacklog(client *c, long long offset) { len -= thislen; j = 0; } +#endif + serverLog(LL_NOTICE, "rdb transmitted? %d, pending replies? %d", c->transmittedRDB, clientHasPendingReplies(c)); return g_pserver->repl_backlog_histlen - skip; } @@ -731,6 +828,8 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) { * a SELECT statement in the replication stream. */ g_pserver->replicaseldb = -1; + serverLog(LL_NOTICE, "We are setting up here lad"); + /* Don't send this reply to slaves that approached us with * the old SYNC command. 
*/ if (!(replica->flags & CLIENT_PRE_PSYNC)) { @@ -989,6 +1088,7 @@ void syncCommand(client *c) { if (!strcasecmp((const char*)ptrFromObj(c->argv[0]),"psync")) { if (masterTryPartialResynchronization(c) == C_OK) { g_pserver->stat_sync_partial_ok++; + // c->repl_curr_idx = g_pserver->repl_backlog_idx; return; /* No full resync needed, return. */ } else { char *master_replid = (char*)ptrFromObj(c->argv[1]); @@ -1016,6 +1116,7 @@ void syncCommand(client *c) { connDisableTcpNoDelay(c->conn); /* Non critical if it fails. */ c->repldbfd = -1; c->flags |= CLIENT_SLAVE; + // c->repl_curr_idx = g_pserver->repl_backlog_idx; listAddNodeTail(g_pserver->slaves,c); /* Create the replication backlog if needed. */ @@ -1035,6 +1136,7 @@ void syncCommand(client *c) { if (g_pserver->FRdbSaveInProgress() && g_pserver->rdb_child_type == RDB_CHILD_TYPE_DISK) { + serverLog(LL_NOTICE, "case 1"); /* Ok a background save is in progress. Let's check if it is a good * one for replication, i.e. if there is another replica that is * registering differences since the server forked to save. */ @@ -1066,6 +1168,7 @@ void syncCommand(client *c) { } else if (g_pserver->FRdbSaveInProgress() && g_pserver->rdb_child_type == RDB_CHILD_TYPE_SOCKET) { + serverLog(LL_NOTICE, "case 2"); /* There is an RDB child process but it is writing directly to * children sockets. We need to wait for the next BGSAVE * in order to synchronize. */ @@ -1073,6 +1176,7 @@ void syncCommand(client *c) { /* CASE 3: There is no BGSAVE is progress. */ } else { + serverLog(LL_NOTICE, "case 3"); if (g_pserver->repl_diskless_sync && (c->slave_capa & SLAVE_CAPA_EOF)) { /* Diskless replication RDB child is created inside * replicationCron() since we want to delay its start a @@ -1278,6 +1382,7 @@ void replconfCommand(client *c) { * 3) Update the count of "good replicas". */ void putSlaveOnline(client *replica) { replica->replstate = SLAVE_STATE_ONLINE; + replica->repl_put_online_on_ack = 0; replica->repl_ack_time = g_pserver->unixtime; /* Prevent false timeout. */ if (connSetWriteHandler(replica->conn, sendReplyToClient, true) == C_ERR) { @@ -1415,11 +1520,13 @@ void sendBulkToSlave(connection *conn) { replica->repldboff += nwritten; g_pserver->stat_net_output_bytes += nwritten; + // replica->repl_curr_idx = g_pserver->repl_backlog_idx; if (replica->repldboff == replica->repldbsize) { close(replica->repldbfd); replica->repldbfd = -1; connSetWriteHandler(replica->conn,NULL); putSlaveOnline(replica); + serverLog(LL_NOTICE, "ABOUT TO DIE HERE"); } } @@ -4450,6 +4557,21 @@ void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long } void _clientAsyncReplyBufferReserve(client *c, size_t len); + +/* Has the end of the replication backlog overflowed past the beginning? */ +bool replOverflowHasOccured(){ + if (g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart){ + long long repl_idx_difference = g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart ? 
+ g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart : + (g_pserver->repl_backlog_size + g_pserver->repl_backlog_idx) - g_pserver->repl_batch_idxStart; + + return g_pserver->master_repl_offset - g_pserver->repl_batch_offStart > repl_idx_difference; + } + return false; +} + +thread_local int transmittedCount = 0; + void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); @@ -4463,11 +4585,31 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); + serverAssert(!replOverflowHasOccured()); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); + +#if 0 + // check for potential overflow first while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); + // serverLog(LL_NOTICE, "replica state: %d", replica->replstate); + + if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; + if (replica->flags & CLIENT_CLOSE_ASAP) continue; + if (replica->repl_curr_idx == -1) continue; + + std::unique_lock ul(replica->lock, std::defer_lock); + if (FCorrectThread(replica)) + ul.lock(); + else + fAsyncWrite = true; +#endif + + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + // serverLog(LL_NOTICE, "replica state: %d", replica->replstate); if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; @@ -4478,6 +4620,19 @@ void flushReplBacklogToClients() else fAsyncWrite = true; + +#ifdef BYPASS_BUFFER + /* If we are online and the RDB has been sent, there is no need to feed the client buffer + * We will send our replies directly from the replication backlog instead */ + if (replica->replstate == SLAVE_STATE_ONLINE && replica->transmittedRDB){ + setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); + continue; + } +#else + if (replica->replstate == SLAVE_STATE_ONLINE){ + // serverLog(LL_NOTICE, "would be calling this garbage function w/ offset: %lld", g_pserver->repl_batch_idxStart); + } +#endif if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { long long cbCopy = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; serverAssert((g_pserver->master_repl_offset - g_pserver->repl_batch_offStart) == cbCopy); @@ -4491,6 +4646,7 @@ void flushReplBacklogToClients() _clientAsyncReplyBufferReserve(replica, cbPhase1 + g_pserver->repl_backlog_idx); addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1); addReplyProto(replica, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); + serverAssert((cbPhase1 + g_pserver->repl_backlog_idx) == (g_pserver->master_repl_offset - g_pserver->repl_batch_offStart)); } } diff --git a/src/server.h b/src/server.h index 878154bc5..cfd6c34a0 100644 --- a/src/server.h +++ b/src/server.h @@ -1516,6 +1516,8 @@ struct client { long long psync_initial_offset; /* FULLRESYNC reply offset other slaves copying this replica output buffer should use. */ + long long repl_curr_idx = -1; /* Replication index sent, if this is a replica */ + long long repl_curr_off = -1; char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). 
*/ int slave_listening_port; /* As configured with: REPLCONF listening-port */ char slave_ip[NET_IP_STR_LEN]; /* Optionally given by REPLCONF ip-address */ @@ -1575,6 +1577,9 @@ struct client { robj **argv; size_t argv_len_sumActive = 0; + bool transmittedRDB = false; /* Have we finished transmitting the RDB to this replica? */ + /* If so, we can read from the replication backlog instead of the client buffer */ + // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); size_t argv_len_sum() const; @@ -3470,6 +3475,8 @@ void mixDigest(unsigned char *digest, const void *ptr, size_t len); void xorDigest(unsigned char *digest, const void *ptr, size_t len); int populateCommandTableParseFlags(struct redisCommand *c, const char *strflags); + + int moduleGILAcquiredByModule(void); extern int g_fInCrash; static inline int GlobalLocksAcquired(void) // Used in asserts to verify all global locks are correctly acquired for a server-thread to operate @@ -3526,6 +3533,8 @@ void tlsInit(void); void tlsInitThread(); int tlsConfigure(redisTLSContextConfig *ctx_config); +int prepareClientToWrite(client *c); + class ShutdownException {}; @@ -3538,3 +3547,5 @@ class ShutdownException int iAmMaster(void); #endif + + From d8367a92b2bea452d39561c201085a377c2021a6 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 29 Apr 2021 17:01:06 +0000 Subject: [PATCH 02/22] Updated resize logic Former-commit-id: e6d892ef21b7fc6f51433f32b01198038e555419 --- src/networking.cpp | 104 +++++++++++++++---------------------- src/replication.cpp | 123 ++++++++++++++++++++++++++++++++++++++------ src/server.cpp | 3 +- 3 files changed, 151 insertions(+), 79 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 18ab382bd..cac58ff07 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1680,22 +1680,8 @@ int writeToClient(client *c, int handler_installed) { /* if this is a write to a replica, it's coming straight from the replication backlog */ long long repl_backlog_idx = g_pserver->repl_backlog_idx; - bool wroteFromClientBuffer = false; /* True if you wrote from the client buffer in this function call */ - while(clientHasPendingReplies(c)) { - wroteFromClientBuffer = true; - if (c->flags & CLIENT_SLAVE && listLength(c->reply) % 10 == 0){ - - serverLog(LL_NOTICE, "-----------------------------------------"); - serverLog(LL_NOTICE, "replica w/ pending replies, with a reply list size of: %lu", listLength(c->reply)); - serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - serverLog(LL_NOTICE, "-----------------------------------------"); - - } if (c->bufpos > 0) { - // serverLog(LL_NOTICE, "Sending reply %d", x); - // serverLog(LL_NOTICE, "SUSSUS AMOGUS, %ld", c->bufpos); nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); if (nwritten <= 0) break; c->sentlen += nwritten; @@ -1753,9 +1739,7 @@ int writeToClient(client *c, int handler_installed) { /* If there are no more pending replies, then we have transmitted the RDB. * This means further replication commands will be taken straight from the * replication backlog from now on. 
*/ - if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c)){ - if (!c->transmittedRDB) - serverLog(LL_NOTICE, "---------->>>>>>>> TRANSMISSION OF THE RDB HAS COMPLETED <<<<<<<<----------"); + if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ c->transmittedRDB = true; } @@ -1775,49 +1759,27 @@ int writeToClient(client *c, int handler_installed) { /* wrap around case, v. rare */ /* also v. buggy so there's that */ } else { - serverLog(LL_NOTICE, "WRAP CASE"); - serverLog(LL_NOTICE, "-----------------------------------------"); - serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - serverLog(LL_NOTICE, "actually written: %ld", nwritten); - serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); - serverLog(LL_NOTICE, "nwritten: %ld", nwritten); - serverLog(LL_NOTICE, "-----------------------------------------"); - nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); /* only attempt wrapping if we write the correct number of bytes */ if (nwritten == repl_backlog_size - c->repl_curr_idx){ - serverLog(LL_NOTICE, "SECOND STAGE"); - serverLog(LL_NOTICE, "-----------------------------------------"); - serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - serverLog(LL_NOTICE, "actually written: %ld", nwritten); - serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); - serverLog(LL_NOTICE, "-----------------------------------------"); - long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); if (nwrittenPart2 != -1) nwritten += nwrittenPart2; - serverLog(LL_NOTICE, "nwrittenPart2: %lld", nwrittenPart2); - serverLog(LL_NOTICE, "-----------------------------------------"); - } else { - serverLog(LL_NOTICE, "SUPER SHORT"); - } - + } } /* only update the replica's current index if bytes were sent */ // if (nrequested != nwritten){ - serverLog(LL_NOTICE, "-----------------------------------------"); - serverLog(LL_NOTICE, "AFTER THE FACT"); - serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - serverLog(LL_NOTICE, "actually written: %ld", nwritten); - serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - serverLog(LL_NOTICE, "-----------------------------------------"); + // serverLog(LL_NOTICE, "-----------------------------------------"); + // serverLog(LL_NOTICE, "AFTER THE FACT"); + // serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + // serverLog(LL_NOTICE, "actually written: %ld", nwritten); + // serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + // serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + // 
serverLog(LL_NOTICE, "-----------------------------------------"); // } @@ -1902,25 +1864,36 @@ void ProcessPendingAsyncWrites() serverAssert(c->fPendingAsyncWrite); if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_CLOSE_AFTER_REPLY)) { - zfree(c->replyAsync); - c->replyAsync = nullptr; + if (c->replyAsync != nullptr){ + zfree(c->replyAsync); + c->replyAsync = nullptr; + } c->fPendingAsyncWrite = FALSE; continue; } - int size = c->replyAsync->used; + /* since writes from master to replica can come directly from the replication backlog, + * writes may have been signalled without having been copied to the replyAsync buffer, + * thus causing the buffer to be NULL */ + if (c->replyAsync != nullptr){ + int size = c->replyAsync->used; - if (listLength(c->reply) == 0 && size <= (PROTO_REPLY_CHUNK_BYTES - c->bufpos)) { - memcpy(c->buf + c->bufpos, c->replyAsync->buf(), size); - c->bufpos += size; - } else { - c->reply_bytes += c->replyAsync->size; - listAddNodeTail(c->reply, c->replyAsync); + if (listLength(c->reply) == 0 && size <= (PROTO_REPLY_CHUNK_BYTES - c->bufpos)) { + memcpy(c->buf + c->bufpos, c->replyAsync->buf(), size); + c->bufpos += size; + } else { + c->reply_bytes += c->replyAsync->size; + listAddNodeTail(c->reply, c->replyAsync); + c->replyAsync = nullptr; + } + + zfree(c->replyAsync); c->replyAsync = nullptr; + } else { + /* Only replicas should have empty async reply buffers */ + serverAssert(c->flags & CLIENT_SLAVE); } - zfree(c->replyAsync); - c->replyAsync = nullptr; c->fPendingAsyncWrite = FALSE; // Now install the write event handler @@ -1935,17 +1908,17 @@ void ProcessPendingAsyncWrites() { ae_flags |= AE_BARRIER; } - + if (!((c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack)))) continue; - + asyncCloseClientOnOutputBufferLimitReached(c); if (c->flags & CLIENT_CLOSE_ASAP) continue; // we will never write this so don't post an op - + std::atomic_thread_fence(std::memory_order_seq_cst); - + if (FCorrectThread(c)) { prepareClientToWrite(c); // queue an event @@ -3386,7 +3359,12 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { * that writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long getClientReplicationBacklogSharedUsage(client *c) { - return (c->repl_curr_idx == -1 && c->flags & CLIENT_SLAVE) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; + if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1){ + // serverLog(LL_NOTICE, "repl_backlog_size %lld, repl_backlog_idx %lld, master_repl_offset %lld, repl_curr_idx %lld, repl_curr_off %lld", + // g_pserver->repl_backlog_size, g_pserver->repl_backlog_idx, g_pserver->master_repl_offset, c->repl_curr_idx, c->repl_curr_off); + } + + return (!(c->flags & CLIENT_SLAVE) || c->repl_curr_idx == -1) ? 
0 : g_pserver->master_repl_offset - c->repl_curr_off; } /* This function returns the number of bytes that Redis is diff --git a/src/replication.cpp b/src/replication.cpp index ccb538a69..ef33fbfd9 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -47,8 +47,8 @@ #include #include -#define BYPASS_BUFFER -// #define BYPASS_PSYNC +// #define BYPASS_BUFFER +// #define RESIZE_BACKLOG void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, connection *conn); @@ -57,6 +57,30 @@ void putSlaveOnline(client *replica); int cancelReplicationHandshake(redisMaster *mi); static void propagateMasterStaleKeys(); +/* gets the lowest offset amongst all of the replicas */ +long long getLowestOffsetAmongReplicas(){ + serverAssert(GlobalLocksAcquired()); + long long min_offset = LONG_LONG_MAX; + listIter li; + listNode *ln; + listRewind(g_pserver->slaves, &li); + // check for potential overflow first + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + + if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; + if (replica->flags & CLIENT_CLOSE_ASAP) continue; + if (replica->repl_curr_idx == -1) continue; + + std::unique_lock ul(replica->lock, std::defer_lock); + if (FCorrectThread(replica)) + ul.lock(); + + min_offset = std::min(min_offset, replica->repl_curr_off); + } + /* return -1 if no other minimum was found */ + return min_offset == LONG_LONG_MAX ? -1 : min_offset; +} /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case * the instance is configured to have no persistence. */ @@ -67,11 +91,13 @@ void resizeReplicationBacklogForClients(long long newsize); void setReplIdx(client *c, long long idx, long long off){ if (prepareClientToWrite(c) != C_OK) return; // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); - // serverLog(LL_NOTICE, "What is this value? %lld", c->repl_curr_idx); + // serverLog(LL_NOTICE, "Repl Index started at: %lld", c->repl_curr_idx); if (c->repl_curr_idx == -1){ c->repl_curr_idx = idx; c->repl_curr_off = off; } + // serverLog(LL_NOTICE, "Repl Index has become: %lld", c->repl_curr_idx); + } /* --------------------------- Utility functions ---------------------------- */ @@ -277,7 +303,7 @@ void resizeReplicationBacklogForClients(long long newsize) { newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; if (g_pserver->repl_backlog_size == newsize) return; - serverLog(LL_NOTICE, "WE HAD TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); + serverLog(LL_NOTICE, "WE HAVE TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); /* get the critical client size, i.e. 
the size of the data unflushed to clients */ long long earliest_off = LONG_LONG_MAX; long long earliest_idx = -1; @@ -290,6 +316,20 @@ void resizeReplicationBacklogForClients(long long newsize) { earliest_off = replica->repl_curr_off; earliest_idx = replica->repl_curr_idx; } + serverLog(LL_NOTICE, "repl_curr_idx: %lld, earlistidx: %lld", replica->repl_curr_idx, earliest_idx); + } + serverLog(LL_NOTICE, "We are starting with: master_repl_offset: %lld, repl_batch_offStart: %lld, earliest_off: %lld, " + "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, earliest_idx: %lld, repl_backlog_size: %lld", + g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, earliest_off, + g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, earliest_idx, g_pserver->repl_backlog_size + ); + + long long new_off = 0, new_idx = 0; + + /* if no earliest offset is found amongst the clients, they are all up to date with the flushed index */ + if (earliest_off == LONG_LONG_MAX && earliest_idx == -1){ + earliest_idx = g_pserver->repl_batch_idxStart; + earliest_off = g_pserver->repl_batch_offStart; } if (g_pserver->repl_backlog != NULL) { @@ -330,8 +370,11 @@ void resizeReplicationBacklogForClients(long long newsize) { if (replica->repl_curr_idx < 0) replica->repl_curr_idx += g_pserver->repl_backlog_size; } + new_idx = replica->repl_curr_idx; } - g_pserver->repl_batch_idxStart = 0; + g_pserver->repl_batch_idxStart -= earliest_idx; + if (g_pserver->repl_batch_idxStart < 0) + g_pserver->repl_batch_idxStart += g_pserver->repl_backlog_size; } else { zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = (char*)zmalloc(newsize); @@ -342,6 +385,12 @@ void resizeReplicationBacklogForClients(long long newsize) { } } g_pserver->repl_backlog_size = newsize; + + serverLog(LL_NOTICE, "We are ending with: master_repl_offset: %lld, repl_batch_offStart: %lld, new_off: %lld, " + "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, new_idx: %lld, repl_backlog_size: %lld", + g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, new_off, + g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, new_idx, g_pserver->repl_backlog_size + ); } void freeReplicationBacklog(void) { @@ -367,20 +416,41 @@ void feedReplicationBacklog(const void *ptr, size_t len) { const unsigned char *p = (const unsigned char*)ptr; if (g_pserver->repl_batch_idxStart >= 0) { - long long minimumsize = g_pserver->master_repl_offset + len - g_pserver->repl_batch_offStart+1; + /* we are lower bounded by the lower client offset or the offStart if all the clients are up to date */ + long long lower_bound = getLowestOffsetAmongReplicas(); + if (lower_bound == -1) + lower_bound = g_pserver->repl_batch_offStart; + long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { flushReplBacklogToClients(); - minimumsize = g_pserver->master_repl_offset + len - g_pserver->repl_batch_offStart+1; + minimumsize = g_pserver->master_repl_offset + len - lower_bound +1; if (minimumsize > g_pserver->repl_backlog_size) { // This is an emergency overflow, we better resize to fit long long newsize = std::max(g_pserver->repl_backlog_size*2, minimumsize); serverLog(LL_WARNING, "Replication backlog is too small, resizing to: %lld", newsize); - resizeReplicationBacklog(newsize); + resizeReplicationBacklogForClients(newsize); } } +#ifdef RESIZE_BACKLOG + long long lowest_replica_offset = getLowestOffsetAmongReplicas(); + minimumsize = g_pserver->master_repl_offset + len - lowest_replica_offset; + if 
(lowest_replica_offset != -1 && minimumsize > g_pserver->repl_backlog_size){ + serverLog(LL_WARNING, "THE REPLICATION BACKLOG SIZE IS TOO SMALL, THIS IS A PROBLEM"); + long long oldsize = g_pserver->repl_backlog_size; + resizeReplicationBacklogForClients(std::max(g_pserver->repl_backlog_size * 2, minimumsize)); + serverLog(LL_WARNING, "changed size from %lld to %lld", oldsize, g_pserver->repl_backlog_size); + flushReplBacklogToClients(); + } +#endif } + // serverLog(LL_NOTICE, "Pt2 start with: master_repl_offset: %lld, repl_batch_offStart: %lld, " + // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", + // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, + // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size + // ); + g_pserver->master_repl_offset += len; /* This is a circular buffer, so write as much data we can at every @@ -395,12 +465,23 @@ void feedReplicationBacklog(const void *ptr, size_t len) { len -= thislen; p += thislen; g_pserver->repl_backlog_histlen += thislen; + // serverLog(LL_NOTICE, "Pt2 intermediate with: master_repl_offset: %lld, repl_batch_offStart: %lld, " + // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", + // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, + // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size + // ); } if (g_pserver->repl_backlog_histlen > g_pserver->repl_backlog_size) g_pserver->repl_backlog_histlen = g_pserver->repl_backlog_size; /* Set the offset of the first byte we have in the backlog. */ g_pserver->repl_backlog_off = g_pserver->master_repl_offset - g_pserver->repl_backlog_histlen + 1; + + // serverLog(LL_NOTICE, "Pt2 end with: master_repl_offset: %lld, repl_batch_offStart: %lld, " + // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", + // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, + // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size + // ); } /* Wrapper for feedReplicationBacklog() that takes Redis string objects @@ -774,7 +855,6 @@ long long addReplyReplicationBacklog(client *c, long long offset) { * split the reply in two parts if we are cross-boundary. */ len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); - serverLog(LL_NOTICE, "Coming through from the addReplicationBacklog"); #ifdef BYPASS_PSYNC setReplIdx(c, j, offset); #else @@ -789,7 +869,6 @@ long long addReplyReplicationBacklog(client *c, long long offset) { j = 0; } #endif - serverLog(LL_NOTICE, "rdb transmitted? %d, pending replies? %d", c->transmittedRDB, clientHasPendingReplies(c)); return g_pserver->repl_backlog_histlen - skip; } @@ -1520,13 +1599,11 @@ void sendBulkToSlave(connection *conn) { replica->repldboff += nwritten; g_pserver->stat_net_output_bytes += nwritten; - // replica->repl_curr_idx = g_pserver->repl_backlog_idx; if (replica->repldboff == replica->repldbsize) { close(replica->repldbfd); replica->repldbfd = -1; connSetWriteHandler(replica->conn,NULL); putSlaveOnline(replica); - serverLog(LL_NOTICE, "ABOUT TO DIE HERE"); } } @@ -4560,6 +4637,7 @@ void _clientAsyncReplyBufferReserve(client *c, size_t len); /* Has the end of the replication backlog overflowed past the beginning? 
*/ bool replOverflowHasOccured(){ + if (g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart){ long long repl_idx_difference = g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart ? g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart : @@ -4575,8 +4653,13 @@ thread_local int transmittedCount = 0; void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); - if (g_pserver->repl_batch_offStart < 0) + if (g_pserver->repl_batch_offStart < 0){ + if (getLowestOffsetAmongReplicas() == -1){ + serverLog(LL_NOTICE, "this is a case i probably have to handle"); + } return; + } + if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { bool fAsyncWrite = false; @@ -4585,7 +4668,7 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); - serverAssert(!replOverflowHasOccured()); + // serverAssert(!replOverflowHasOccured()); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); @@ -4605,11 +4688,21 @@ void flushReplBacklogToClients() ul.lock(); else fAsyncWrite = true; + + if (g_pserver->master_repl_offset - replica->repl_curr_off > g_pserver->repl_backlog_size){ + serverLog(LL_WARNING, "THE REPLICATION BACKLOG SIZE IS TOO SMALL, THIS IS A PROBLEM"); + long long oldsize = g_pserver->repl_backlog_size; + resizeReplicationBacklogForClients(std::max(g_pserver->repl_backlog_size * 2, g_pserver->master_repl_offset - replica->repl_curr_off)); + serverLog(LL_WARNING, "changing size from %lld to %lld", oldsize, g_pserver->repl_backlog_size); + } + + } + + listRewind(g_pserver->slaves, &li); #endif while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); - // serverLog(LL_NOTICE, "replica state: %d", replica->replstate); if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; diff --git a/src/server.cpp b/src/server.cpp index 3d547f748..9664a4a6b 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1796,6 +1796,7 @@ int clientsCronTrackClientsMemUsage(client *c) { mem += zmalloc_size(c); mem += c->argv_len_sum(); if (c->argv) mem += zmalloc_size(c->argv); + // serverLog(LL_NOTICE, "Mem here is : %lu", mem); /* Now that we have the memory used by the client, remove the old * value from the old category, and add it back. */ g_pserver->stat_clients_type_memory[c->client_cron_last_memory_type] -= @@ -1854,7 +1855,7 @@ void clientsCron(int iel) { while(listLength(g_pserver->clients) && iterations--) { client *c; listNode *head; - + // serverLog(LL_NOTICE, "we are at iteration: %d", iterations); /* Rotate the list, take the current head, process. * This way if the client must be removed from the list it's the * first element and we don't incur into O(N) computation. 
*/ From 7ef58a333f9331e8fd144163626ed7a6ccaa1a59 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 29 Apr 2021 18:51:30 +0000 Subject: [PATCH 03/22] Performance optimizations Former-commit-id: 7fd83d467784d293f7da78b74f9b9763ce387238 --- src/replication.cpp | 71 ++------------------------------------------- 1 file changed, 3 insertions(+), 68 deletions(-) diff --git a/src/replication.cpp b/src/replication.cpp index ef33fbfd9..1bae2773a 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -47,8 +47,7 @@ #include #include -// #define BYPASS_BUFFER -// #define RESIZE_BACKLOG +#define BYPASS_BUFFER void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, connection *conn); @@ -89,10 +88,10 @@ int RDBGeneratedByReplication = 0; void resizeReplicationBacklogForClients(long long newsize); void setReplIdx(client *c, long long idx, long long off){ - if (prepareClientToWrite(c) != C_OK) return; // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); // serverLog(LL_NOTICE, "Repl Index started at: %lld", c->repl_curr_idx); if (c->repl_curr_idx == -1){ + if (prepareClientToWrite(c) != C_OK) return; c->repl_curr_idx = idx; c->repl_curr_off = off; } @@ -432,17 +431,6 @@ void feedReplicationBacklog(const void *ptr, size_t len) { resizeReplicationBacklogForClients(newsize); } } -#ifdef RESIZE_BACKLOG - long long lowest_replica_offset = getLowestOffsetAmongReplicas(); - minimumsize = g_pserver->master_repl_offset + len - lowest_replica_offset; - if (lowest_replica_offset != -1 && minimumsize > g_pserver->repl_backlog_size){ - serverLog(LL_WARNING, "THE REPLICATION BACKLOG SIZE IS TOO SMALL, THIS IS A PROBLEM"); - long long oldsize = g_pserver->repl_backlog_size; - resizeReplicationBacklogForClients(std::max(g_pserver->repl_backlog_size * 2, minimumsize)); - serverLog(LL_WARNING, "changed size from %lld to %lld", oldsize, g_pserver->repl_backlog_size); - flushReplBacklogToClients(); - } -#endif } // serverLog(LL_NOTICE, "Pt2 start with: master_repl_offset: %lld, repl_batch_offStart: %lld, " @@ -4635,30 +4623,11 @@ void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long void _clientAsyncReplyBufferReserve(client *c, size_t len); -/* Has the end of the replication backlog overflowed past the beginning? */ -bool replOverflowHasOccured(){ - - if (g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart){ - long long repl_idx_difference = g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart ? 
- g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart : - (g_pserver->repl_backlog_size + g_pserver->repl_backlog_idx) - g_pserver->repl_batch_idxStart; - - return g_pserver->master_repl_offset - g_pserver->repl_batch_offStart > repl_idx_difference; - } - return false; -} - -thread_local int transmittedCount = 0; - void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); - if (g_pserver->repl_batch_offStart < 0){ - if (getLowestOffsetAmongReplicas() == -1){ - serverLog(LL_NOTICE, "this is a case i probably have to handle"); - } + if (g_pserver->repl_batch_offStart < 0) return; - } if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { @@ -4668,39 +4637,9 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); - // serverAssert(!replOverflowHasOccured()); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); - -#if 0 - // check for potential overflow first - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - // serverLog(LL_NOTICE, "replica state: %d", replica->replstate); - - if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; - if (replica->flags & CLIENT_CLOSE_ASAP) continue; - if (replica->repl_curr_idx == -1) continue; - - std::unique_lock ul(replica->lock, std::defer_lock); - if (FCorrectThread(replica)) - ul.lock(); - else - fAsyncWrite = true; - - if (g_pserver->master_repl_offset - replica->repl_curr_off > g_pserver->repl_backlog_size){ - serverLog(LL_WARNING, "THE REPLICATION BACKLOG SIZE IS TOO SMALL, THIS IS A PROBLEM"); - long long oldsize = g_pserver->repl_backlog_size; - resizeReplicationBacklogForClients(std::max(g_pserver->repl_backlog_size * 2, g_pserver->master_repl_offset - replica->repl_curr_off)); - serverLog(LL_WARNING, "changing size from %lld to %lld", oldsize, g_pserver->repl_backlog_size); - } - - } - - listRewind(g_pserver->slaves, &li); -#endif - while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); @@ -4721,10 +4660,6 @@ void flushReplBacklogToClients() setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); continue; } -#else - if (replica->replstate == SLAVE_STATE_ONLINE){ - // serverLog(LL_NOTICE, "would be calling this garbage function w/ offset: %lld", g_pserver->repl_batch_idxStart); - } #endif if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { long long cbCopy = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; From f6305ed15bca84719504890f85dd0f1297e05365 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Mon, 3 May 2021 16:29:11 +0000 Subject: [PATCH 04/22] Now tracks memory and resizes 'accurately', need to fix cluster Former-commit-id: 5f0e01cc199427ab6dfd7f8f28321f6a1f34fd1c --- src/config.cpp | 1 + src/evict.cpp | 10 +++++++++- src/networking.cpp | 20 +++++++++++++------- src/replication.cpp | 16 ++++++++++++++++ 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index 9d7f14007..b546ef607 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -2347,6 +2347,7 @@ static int updateReplBacklogSize(long long val, long long prev, const char **err UNUSED(err); g_pserver->repl_backlog_size = prev; resizeReplicationBacklog(val); + g_pserver->repl_backlog_config_size = g_pserver->repl_backlog_size; return 1; } diff --git a/src/evict.cpp b/src/evict.cpp index 31cadeae5..36837e17d 
100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -392,9 +392,16 @@ size_t freeMemoryGetNotCountedMemory(void) { while((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); std::unique_lock(replica->lock); - overhead += getClientOutputBufferMemoryUsage(replica); + /* we don't wish to multiple count the replication backlog shared usage */ + overhead += (getClientOutputBufferMemoryUsage(replica) - getClientReplicationBacklogSharedUsage(replica)); } } + + /* also don't count the replication backlog memory + * that's where the replication clients get their memory from */ + overhead += (g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size); + + if (g_pserver->aof_state != AOF_OFF) { overhead += sdsalloc(g_pserver->aof_buf)+aofRewriteBufferSize(); } @@ -516,6 +523,7 @@ int freeMemoryIfNeeded(bool fQuickCycle, bool fPreSnapshot) { if (g_pserver->maxmemory_policy == MAXMEMORY_NO_EVICTION) goto cant_free; /* We need to free memory, but policy forbids. */ + serverLog(LL_NOTICE, "evicting i guess lol, the overhead was %ld, the repl_backlog_size, %lld", freeMemoryGetNotCountedMemory(), g_pserver->repl_backlog_size); while (mem_freed < mem_tofree) { int j, k, i; static unsigned int next_db = 0; diff --git a/src/networking.cpp b/src/networking.cpp index cac58ff07..c51a02a1d 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -224,7 +224,6 @@ void clientInstallWriteHandler(client *c) { (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) { - // serverLog(LL_NOTICE, "we installing boyz"); AssertCorrectThread(c); serverAssert(c->lock.fOwnLock()); /* Here instead of installing the write handler, we just flag the @@ -1801,6 +1800,9 @@ int writeToClient(client *c, int handler_installed) { if (nwrittenPart2 == -1) nwritten = -1; } + if (c->flags & CLIENT_SLAVE && handler_installed) + serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); + g_pserver->stat_net_output_bytes += totwritten; if (nwritten == -1) { if (connGetState(c->conn) == CONN_STATE_CONNECTED) { @@ -1821,6 +1823,11 @@ int writeToClient(client *c, int handler_installed) { if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } if (!clientHasPendingReplies(c) && c->repl_curr_idx == -1) { + if(c->flags & CLIENT_SLAVE && handler_installed){ + serverLog(LL_NOTICE, "Uninstalling handler"); + serverLog(LL_NOTICE, "handler repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + } c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -1836,6 +1843,7 @@ int writeToClient(client *c, int handler_installed) { /* Write event handler. Just send data to the client. 
*/ void sendReplyToClient(connection *conn) { client *c = (client*)connGetPrivateData(conn); + serverLog(LL_NOTICE, "called the sendreplytoclient"); if (writeToClient(c,1) == C_ERR) { AeLocker ae; @@ -1970,6 +1978,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { auto vec = std::move(g_pserver->rgthreadvar[iel].clients_pending_write); processed += (int)vec.size(); + // serverLog(LL_NOTICE, "entered handleClientsWithPendingWrites"); for (client *c : vec) { serverAssertDebug(FCorrectThread(c)); @@ -2008,8 +2017,10 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. */ if (clientHasPendingReplies(c) || c->repl_curr_idx != -1) { - if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) + serverLog(LL_NOTICE, "Setting a write handler for later"); + if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) { freeClientAsync(c); + } } } @@ -3359,11 +3370,6 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { * that writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long getClientReplicationBacklogSharedUsage(client *c) { - if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1){ - // serverLog(LL_NOTICE, "repl_backlog_size %lld, repl_backlog_idx %lld, master_repl_offset %lld, repl_curr_idx %lld, repl_curr_off %lld", - // g_pserver->repl_backlog_size, g_pserver->repl_backlog_idx, g_pserver->master_repl_offset, c->repl_curr_idx, c->repl_curr_off); - } - return (!(c->flags & CLIENT_SLAVE) || c->repl_curr_idx == -1) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; } diff --git a/src/replication.cpp b/src/replication.cpp index 1bae2773a..60f25052a 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -4684,5 +4684,21 @@ void flushReplBacklogToClients() // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; + } else if (getLowestOffsetAmongReplicas() != -1){ + listIter li; + listNode *ln; + listRewind(g_pserver->slaves, &li); + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + + std::unique_lock ul(replica->lock, std::defer_lock); + if (FCorrectThread(replica)) + ul.lock(); + + /* try to force prepare client to write i guess? 
*/ + if (replica->repl_curr_idx != -1){ + if (prepareClientToWrite(replica) != C_OK) continue; + } + } } } From 33a7b52899a10432e1f9085027ed9e30c07dda32 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Mon, 3 May 2021 16:49:09 +0000 Subject: [PATCH 05/22] Forgot to add server.h in last commit Former-commit-id: 34fa6119c9a3f1533cc3e6e5d118dc6424a70891 --- src/server.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/server.h b/src/server.h index cfd6c34a0..6c5265fbd 100644 --- a/src/server.h +++ b/src/server.h @@ -2411,6 +2411,9 @@ struct redisServer { uint16_t rglockSamples[s_lockContentionSamples]; unsigned ilockRingHead = 0; + long long repl_backlog_config_size = 1024*1024; /* This is a hack to ignore the resizing of the replication backlog + when using it as a defacto for the client buffer */ + bool FRdbSaveInProgress() const { return rdbThreadVars.fRdbThreadActive; } }; @@ -2657,6 +2660,7 @@ sds getAllClientsInfoString(int type); void rewriteClientCommandVector(client *c, int argc, ...); void rewriteClientCommandArgument(client *c, int i, robj *newval); void replaceClientCommandVector(client *c, int argc, robj **argv); +unsigned long getClientReplicationBacklogSharedUsage(client *c); unsigned long getClientOutputBufferMemoryUsage(client *c); int freeClientsInAsyncFreeQueue(int iel); void asyncCloseClientOnOutputBufferLimitReached(client *c); From eb35d7e9ec36d73e0aa8fa2bdb0eb7bb808e4627 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 5 May 2021 16:37:02 +0000 Subject: [PATCH 06/22] Updated maxmemory tests to account for overhead in new replication backlog behaviour Former-commit-id: 4cd197959693dfe4d1497c3f703cf6aaa27d34ad --- tests/unit/maxmemory.tcl | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl index 414733d1e..23879c38a 100644 --- a/tests/unit/maxmemory.tcl +++ b/tests/unit/maxmemory.tcl @@ -33,7 +33,8 @@ start_server {tags {"maxmemory"}} { # Get the current memory limit and calculate a new limit. # We just add 100k to the current memory size so that it is # fast for us to reach that limit. - set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used+100*1024}] r config set maxmemory $limit r config set maxmemory-policy $policy @@ -42,7 +43,7 @@ start_server {tags {"maxmemory"}} { while 1 { r setex [randomKey] 10000 x incr numkeys - if {[s used_memory]+4096 > $limit} { + if {[expr {[s used_memory] - $overhead + 4096}] > $limit} { assert {$numkeys > 10} break } @@ -52,7 +53,8 @@ start_server {tags {"maxmemory"}} { for {set j 0} {$j < $numkeys} {incr j} { r setex [randomKey] 10000 x } - assert {[s used_memory] < ($limit+4096)} + set used_amt [expr [s used_memory] - $overhead] + assert {$used_amt < ($limit+4096)} } } @@ -65,7 +67,8 @@ start_server {tags {"maxmemory"}} { # Get the current memory limit and calculate a new limit. # We just add 100k to the current memory size so that it is # fast for us to reach that limit. 
- set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used+100*1024}] r config set maxmemory $limit r config set maxmemory-policy $policy @@ -74,7 +77,7 @@ start_server {tags {"maxmemory"}} { while 1 { r set [randomKey] x incr numkeys - if {[s used_memory]+4096 > $limit} { + if {[expr [s used_memory] - $overhead]+4096 > $limit} { assert {$numkeys > 10} break } @@ -91,7 +94,7 @@ start_server {tags {"maxmemory"}} { } } if {[string match allkeys-* $policy]} { - assert {[s used_memory] < ($limit+4096)} + assert {[expr [s used_memory] - $overhead] < ($limit+4096)} } else { assert {$err == 1} } @@ -107,7 +110,8 @@ start_server {tags {"maxmemory"}} { # Get the current memory limit and calculate a new limit. # We just add 100k to the current memory size so that it is # fast for us to reach that limit. - set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used+100*1024}] r config set maxmemory $limit r config set maxmemory-policy $policy @@ -121,7 +125,7 @@ start_server {tags {"maxmemory"}} { } else { r set "key:$numkeys" x } - if {[s used_memory]+4096 > $limit} { + if {[expr [s used_memory] - $overhead]+4096 > $limit} { assert {$numkeys > 10} break } @@ -135,7 +139,7 @@ start_server {tags {"maxmemory"}} { catch {r setex "foo:$j" 10000 x} } # We should still be under the limit. - assert {[s used_memory] < ($limit+4096)} + assert {[expr [s used_memory] - $overhead] < ($limit+4096)} # However all our non volatile keys should be here. for {set j 0} {$j < $numkeys} {incr j 2} { assert {[r exists "key:$j"]} @@ -284,7 +288,8 @@ start_server {tags {"maxmemory"} overrides {server-threads 1}} { # we need to make sure to evict keynames of a total size of more than # 16kb since the (PROTO_REPLY_CHUNK_BYTES), only after that the # invalidation messages have a chance to trigger further eviction. - set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used - 40000}] r config set maxmemory $limit From f6a714db2658a4b8baa924e62dd0ab2c6a7adb9f Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 5 May 2021 17:04:08 +0000 Subject: [PATCH 07/22] Updated overhead calculation to only use repl_backlog_size Former-commit-id: 6f93c7eb44d84bb143b4ad4fff3c6a5436ebaaf7 --- src/evict.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/evict.cpp b/src/evict.cpp index 36837e17d..e7f0a10ef 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -399,8 +399,8 @@ size_t freeMemoryGetNotCountedMemory(void) { /* also don't count the replication backlog memory * that's where the replication clients get their memory from */ - overhead += (g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size); - + // overhead += (g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size); + overhead += g_pserver->repl_backlog_size; if (g_pserver->aof_state != AOF_OFF) { overhead += sdsalloc(g_pserver->aof_buf)+aofRewriteBufferSize(); From 7ff2fb716a4a92bf78a06a888fec82f246889c74 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 5 May 2021 22:26:36 +0000 Subject: [PATCH 08/22] Fixed data race? 
Seems to be passing multithreaded test cases now Former-commit-id: cb13edd1200c1230fa7e313d69c69e06129951d3 --- src/networking.cpp | 2 +- src/replication.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index c51a02a1d..6f4aa6268 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1782,7 +1782,7 @@ int writeToClient(client *c, int handler_installed) { // } - if (nwritten == nrequested){ + if (nwritten == nrequested && g_pserver->repl_backlog_idx == c->repl_curr_idx){ c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ } else if (nwritten > 0) diff --git a/src/replication.cpp b/src/replication.cpp index 60f25052a..d3df6d12a 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -90,7 +90,7 @@ void resizeReplicationBacklogForClients(long long newsize); void setReplIdx(client *c, long long idx, long long off){ // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); // serverLog(LL_NOTICE, "Repl Index started at: %lld", c->repl_curr_idx); - if (c->repl_curr_idx == -1){ + if (c->repl_curr_idx == -1 && off >= c->repl_curr_off){ if (prepareClientToWrite(c) != C_OK) return; c->repl_curr_idx = idx; c->repl_curr_off = off; From 4fd76c47911f506909e54a138bf8f72b0fea8687 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Fri, 21 May 2021 17:05:55 +0000 Subject: [PATCH 09/22] Fixed single threaded for real this time, need to add synchronization for multi threaded Former-commit-id: 4d858dac1a503f4d518477212ba585069af22574 --- src/networking.cpp | 8 +++++--- src/replication.cpp | 5 +++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 6f4aa6268..c39d8ce42 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1676,8 +1676,7 @@ int writeToClient(client *c, int handler_installed) { std::unique_locklock)> lock(c->lock); - /* if this is a write to a replica, it's coming straight from the replication backlog */ - long long repl_backlog_idx = g_pserver->repl_backlog_idx; + while(clientHasPendingReplies(c)) { if (c->bufpos > 0) { @@ -1742,6 +1741,9 @@ int writeToClient(client *c, int handler_installed) { c->transmittedRDB = true; } + /* if this is a write to a replica, it's coming straight from the replication backlog */ + long long repl_backlog_idx = g_pserver->repl_backlog_idx; + /* For replicas, we don't store all the information in the client buffer * Most of the time (aside from immediately after synchronizing), we read * from the replication backlog directly */ @@ -1782,7 +1784,7 @@ int writeToClient(client *c, int handler_installed) { // } - if (nwritten == nrequested && g_pserver->repl_backlog_idx == c->repl_curr_idx){ + if (nwritten == nrequested && g_pserver->repl_backlog_idx == repl_backlog_idx){ c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ } else if (nwritten > 0) diff --git a/src/replication.cpp b/src/replication.cpp index d3df6d12a..1d4e01289 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -3059,6 +3059,11 @@ void syncWithMaster(connection *conn) { if (psync_result == PSYNC_CONTINUE) { serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Master accepted a Partial Resynchronization."); + /* Reset the bulklen information in case it is lingering from the last connection + * The partial sync will start from the beginning of a command so these should be reset */ + mi->master->reqtype = 0; + mi->master->multibulklen = 0; + mi->master->bulklen = -1; if 
(cserver.supervised_mode == SUPERVISED_SYSTEMD) { redisCommunicateSystemd("STATUS=MASTER <-> REPLICA sync: Partial Resynchronization accepted. Ready to accept connections.\n"); redisCommunicateSystemd("READY=1\n"); From 6080ee8f2f33fd21de8dfa9c103ba569759bc127 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 26 May 2021 20:10:33 +0000 Subject: [PATCH 10/22] Added transmitted RDB lock Former-commit-id: 4b32167afc85742d85ff9b47b2c2e0b6b02e140a --- src/networking.cpp | 13 +++++++++++-- src/replication.cpp | 15 ++++++++++----- src/server.h | 2 ++ 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index c39d8ce42..176693501 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -319,6 +319,7 @@ void _clientAsyncReplyBufferReserve(client *c, size_t len) { clientReplyBlock *replyNew = (clientReplyBlock*)zmalloc(sizeof(clientReplyBlock) + newsize); replyNew->size = zmalloc_usable(replyNew) - sizeof(clientReplyBlock); replyNew->used = 0; + std::unique_lock tRDBLock (c->transmittedRDBLock); c->replyAsync = replyNew; } @@ -332,6 +333,7 @@ int _addReplyToBuffer(client *c, const char *s, size_t len) { if (fAsync) { serverAssert(GlobalLocksAcquired()); + std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync == nullptr || (c->replyAsync->size - c->replyAsync->used) < len) { if (c->replyAsync == nullptr) { @@ -1737,9 +1739,14 @@ int writeToClient(client *c, int handler_installed) { /* If there are no more pending replies, then we have transmitted the RDB. * This means further replication commands will be taken straight from the * replication backlog from now on. */ + + std::unique_lock tRDBLock (c->transmittedRDBLock); + if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ c->transmittedRDB = true; } + bool transmittedRDB = c->transmittedRDB; + tRDBLock.unlock(); /* if this is a write to a replica, it's coming straight from the replication backlog */ long long repl_backlog_idx = g_pserver->repl_backlog_idx; @@ -1747,7 +1754,7 @@ int writeToClient(client *c, int handler_installed) { /* For replicas, we don't store all the information in the client buffer * Most of the time (aside from immediately after synchronizing), we read * from the replication backlog directly */ - if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1 && c->transmittedRDB){ + if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1 && transmittedRDB){ /* copy global variables into local scope so if they change in between we don't care */ long long repl_backlog_size = g_pserver->repl_backlog_size; long long nwrittenPart2 = 0; @@ -1874,6 +1881,7 @@ void ProcessPendingAsyncWrites() serverAssert(c->fPendingAsyncWrite); if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_CLOSE_AFTER_REPLY)) { + std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync != nullptr){ zfree(c->replyAsync); c->replyAsync = nullptr; @@ -1885,6 +1893,7 @@ void ProcessPendingAsyncWrites() /* since writes from master to replica can come directly from the replication backlog, * writes may have been signalled without having been copied to the replyAsync buffer, * thus causing the buffer to be NULL */ + std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync != nullptr){ int size = c->replyAsync->used; @@ -1905,7 +1914,7 @@ void ProcessPendingAsyncWrites() } c->fPendingAsyncWrite = FALSE; - + tRDBLock.unlock(); // Now install the write event handler int ae_flags = AE_WRITABLE|AE_WRITE_THREADSAFE; /* For the 
fsync=always policy, we want that a given FD is never diff --git a/src/replication.cpp b/src/replication.cpp index 1d4e01289..ad79f4887 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -441,6 +441,8 @@ void feedReplicationBacklog(const void *ptr, size_t len) { g_pserver->master_repl_offset += len; + + /* This is a circular buffer, so write as much data we can at every * iteration and rewind the "idx" index if we reach the limit. */ while(len) { @@ -4659,11 +4661,14 @@ void flushReplBacklogToClients() #ifdef BYPASS_BUFFER - /* If we are online and the RDB has been sent, there is no need to feed the client buffer - * We will send our replies directly from the replication backlog instead */ - if (replica->replstate == SLAVE_STATE_ONLINE && replica->transmittedRDB){ - setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); - continue; + { + /* If we are online and the RDB has been sent, there is no need to feed the client buffer + * We will send our replies directly from the replication backlog instead */ + std::unique_lock tRDBLock (replica->transmittedRDBLock); + if (replica->replstate == SLAVE_STATE_ONLINE && replica->transmittedRDB){ + setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); + continue; + } } #endif if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { diff --git a/src/server.h b/src/server.h index 6c5265fbd..14005e7d5 100644 --- a/src/server.h +++ b/src/server.h @@ -1582,6 +1582,7 @@ struct client { // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); + fastlock transmittedRDBLock {"transmittedRDB"}; size_t argv_len_sum() const; }; @@ -2228,6 +2229,7 @@ struct redisServer { that is the next byte will'll write to.*/ long long repl_backlog_off; /* Replication "master offset" of first byte in the replication backlog buffer.*/ + fastlock repl_backlog_lock {"replication backlog"}; time_t repl_backlog_time_limit; /* Time without slaves after the backlog gets released. */ time_t repl_no_slaves_since; /* We have no slaves since that time. From bf120245faa6867db1b464f319ee3944c017ad28 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 27 May 2021 18:57:23 +0000 Subject: [PATCH 11/22] Added more synchronization and fixed some data races Former-commit-id: 183e015dac6f85df1c94d0761e89bc23d9f53319 --- src/multi.cpp | 2 + src/networking.cpp | 141 +++++++++++++++++++++++--------------------- src/replication.cpp | 57 ++++++++---------- src/server.cpp | 1 + src/server.h | 3 + 5 files changed, 105 insertions(+), 99 deletions(-) diff --git a/src/multi.cpp b/src/multi.cpp index 9df72383d..9fd5206fb 100644 --- a/src/multi.cpp +++ b/src/multi.cpp @@ -237,6 +237,8 @@ void execCommand(client *c) { * backlog with the final EXEC. 
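Patches 10 and 11 introduce repl_backlog_lock and settle on a simple discipline: any code path that appends to the replication backlog (regular command propagation, the master stream feed, the EXEC propagation in the multi.cpp hunk here) first refreshes the lowest replica offset and then holds the lock for the duration of the append. A minimal sketch of that pattern, with std::mutex standing in for fastlock and a trivial byte-by-byte copy in place of the real chunked write:

    #include <cstddef>
    #include <mutex>
    #include <vector>

    // Sketch only: std::mutex stands in for the fastlock guarding the backlog.
    struct BacklogView {
        std::mutex lock;
        std::vector<char> buf = std::vector<char>(1 << 20);
        long long idx = 0;         // next write position (circular)
        long long master_off = 0;  // global replication offset
    };

    // Writers append only while holding the lock, mirroring feedReplicationBacklog().
    void feedBacklog(BacklogView &bl, const char *p, size_t len) {
        std::unique_lock<std::mutex> guard(bl.lock);
        for (size_t i = 0; i < len; ++i) {
            bl.buf[(size_t)bl.idx] = p[i];
            bl.idx = (bl.idx + 1) % (long long)bl.buf.size();
        }
        bl.master_off += (long long)len;
    }
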
*/ if (g_pserver->repl_backlog && was_master && !is_master) { const char *execcmd = "*1\r\n$4\r\nEXEC\r\n"; + updateLowestOffsetAmongReplicas(); + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); feedReplicationBacklog(execcmd,strlen(execcmd)); } } diff --git a/src/networking.cpp b/src/networking.cpp index 176693501..caefd6d1e 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -125,6 +125,7 @@ client *createClient(connection *conn, int iel) { client_id = g_pserver->next_client_id.fetch_add(1); c->iel = iel; c->id = client_id; + sprintf(c->lock.szName, "client %lu", client_id); c->resp = 2; c->conn = conn; c->name = NULL; @@ -1677,8 +1678,7 @@ int writeToClient(client *c, int handler_installed) { serverAssertDebug(FCorrectThread(c)); std::unique_locklock)> lock(c->lock); - - + // serverLog(LL_NOTICE, "acq client"); while(clientHasPendingReplies(c)) { if (c->bufpos > 0) { @@ -1736,82 +1736,87 @@ int writeToClient(client *c, int handler_installed) { !(c->flags & CLIENT_SLAVE)) break; } - /* If there are no more pending replies, then we have transmitted the RDB. - * This means further replication commands will be taken straight from the - * replication backlog from now on. */ + /* We can only directly read from the replication backlog if the client + is a replica, so only attempt to do so if that's the case. */ + if (c->flags & CLIENT_SLAVE) { + /* If there are no more pending replies, then we have transmitted the RDB. + * This means further replication commands will be taken straight from the + * replication backlog from now on. */ + std::unique_lock tRDBLock (c->transmittedRDBLock); - std::unique_lock tRDBLock (c->transmittedRDBLock); + if (c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ + c->transmittedRDB = true; + } + bool transmittedRDB = c->transmittedRDB; + tRDBLock.unlock(); - if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ - c->transmittedRDB = true; - } - bool transmittedRDB = c->transmittedRDB; - tRDBLock.unlock(); + /* For replicas, we don't store all the information in the client buffer + * Most of the time (aside from immediately after synchronizing), we read + * from the replication backlog directly */ + if (c->repl_curr_idx != -1 && transmittedRDB){ + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); - /* if this is a write to a replica, it's coming straight from the replication backlog */ - long long repl_backlog_idx = g_pserver->repl_backlog_idx; + /* copy global variables into local scope so if they change in between we don't care */ + long long repl_backlog_idx = g_pserver->repl_backlog_idx; + long long repl_backlog_size = g_pserver->repl_backlog_size; + long long nwrittenPart2 = 0; - /* For replicas, we don't store all the information in the client buffer - * Most of the time (aside from immediately after synchronizing), we read - * from the replication backlog directly */ - if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1 && transmittedRDB){ - /* copy global variables into local scope so if they change in between we don't care */ - long long repl_backlog_size = g_pserver->repl_backlog_size; - long long nwrittenPart2 = 0; + ssize_t nrequested; /* The number of bytes requested to write */ + /* normal case with no wrap around */ + if (repl_backlog_idx >= c->repl_curr_idx){ + nrequested = repl_backlog_idx - c->repl_curr_idx; + nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, 
repl_backlog_idx - c->repl_curr_idx); + /* wrap around case, v. rare */ + /* also v. buggy so there's that */ + } else { + nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; + nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); + /* only attempt wrapping if we write the correct number of bytes */ + if (nwritten == repl_backlog_size - c->repl_curr_idx){ + long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); + if (nwrittenPart2 != -1) + nwritten += nwrittenPart2; - ssize_t nrequested; /* The number of bytes requested to write */ - /* normal case with no wrap around */ - if (repl_backlog_idx >= c->repl_curr_idx){ - nrequested = repl_backlog_idx - c->repl_curr_idx; - nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_idx - c->repl_curr_idx); - /* wrap around case, v. rare */ - /* also v. buggy so there's that */ - } else { - nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; - nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); - /* only attempt wrapping if we write the correct number of bytes */ - if (nwritten == repl_backlog_size - c->repl_curr_idx){ - long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); - if (nwrittenPart2 != -1) - nwritten += nwrittenPart2; + } + } - } + /* only update the replica's current index if bytes were sent */ + + // if (nrequested != nwritten){ + // serverLog(LL_NOTICE, "-----------------------------------------"); + // serverLog(LL_NOTICE, "AFTER THE FACT"); + // serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + // serverLog(LL_NOTICE, "actually written: %ld", nwritten); + // serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + // serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + // serverLog(LL_NOTICE, "-----------------------------------------"); + // } + + + if (nwritten == nrequested && g_pserver->repl_backlog_idx == repl_backlog_idx){ + c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ + } + else if (nwritten > 0) + c->repl_curr_idx = (c->repl_curr_idx + nwritten) % repl_backlog_size; + + serverAssert(c->repl_curr_idx < repl_backlog_size); + + /* only increment bytes if an error didn't occur */ + if (nwritten > 0){ + totwritten += nwritten; + c->repl_curr_off += nwritten; + } + + /* If the second part of a write didn't go through, we still need to register that */ + if (nwrittenPart2 == -1) nwritten = -1; } - /* only update the replica's current index if bytes were sent */ + if (c->flags & CLIENT_SLAVE && handler_installed) + serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); - // if (nrequested != nwritten){ - // serverLog(LL_NOTICE, "-----------------------------------------"); - // serverLog(LL_NOTICE, "AFTER THE FACT"); - // serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - // serverLog(LL_NOTICE, "actually written: %ld", nwritten); - // serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - // serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); 
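The send path above has two cases: normally the unsent region of the circular backlog is contiguous and one connWrite() suffices, but when the write index has wrapped past the replica's index the region is split into a tail segment and a head segment. A compact sketch of that logic, with a generic send callback standing in for connWrite():

    #include <cstddef>
    #include <functional>

    // Sketch only: sendFn stands in for connWrite(); it returns bytes written or -1.
    using SendFn = std::function<long long(const char *, size_t)>;

    // Send the backlog bytes between curr_idx and end_idx of a circular buffer of `size` bytes.
    long long sendBacklogRange(const char *backlog, long long size,
                               long long curr_idx, long long end_idx, SendFn sendFn) {
        if (end_idx >= curr_idx)  // common contiguous case
            return sendFn(backlog + curr_idx, (size_t)(end_idx - curr_idx));

        // Rare wrap-around case: tail of the buffer first, then the head.
        long long n = sendFn(backlog + curr_idx, (size_t)(size - curr_idx));
        if (n != size - curr_idx) return n;  // short write or error: let the caller retry later
        long long n2 = sendFn(backlog, (size_t)end_idx);
        return (n2 == -1) ? -1 : n + n2;
    }
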
- // serverLog(LL_NOTICE, "-----------------------------------------"); - // } - - - if (nwritten == nrequested && g_pserver->repl_backlog_idx == repl_backlog_idx){ - c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ - } - else if (nwritten > 0) - c->repl_curr_idx = (c->repl_curr_idx + nwritten) % repl_backlog_size; - - serverAssert(c->repl_curr_idx < repl_backlog_size); - - /* only increment bytes if an error didn't occur */ - if (nwritten > 0){ - totwritten += nwritten; - c->repl_curr_off += nwritten; - } - - /* If the second part of a write didn't go through, we still need to register that */ - if (nwrittenPart2 == -1) nwritten = -1; } - if (c->flags & CLIENT_SLAVE && handler_installed) - serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); - + // serverLog(LL_NOTICE, "rel client"); g_pserver->stat_net_output_bytes += totwritten; if (nwritten == -1) { if (connGetState(c->conn) == CONN_STATE_CONNECTED) { @@ -1834,7 +1839,7 @@ int writeToClient(client *c, int handler_installed) { if (!clientHasPendingReplies(c) && c->repl_curr_idx == -1) { if(c->flags & CLIENT_SLAVE && handler_installed){ serverLog(LL_NOTICE, "Uninstalling handler"); - serverLog(LL_NOTICE, "handler repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "handler repl_curr_idx: %lld, repl_backlog_size: %lld", c->repl_curr_idx, g_pserver->repl_backlog_size); serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); } c->sentlen = 0; diff --git a/src/replication.cpp b/src/replication.cpp index ad79f4887..d1181bdf4 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -56,9 +56,11 @@ void putSlaveOnline(client *replica); int cancelReplicationHandshake(redisMaster *mi); static void propagateMasterStaleKeys(); -/* gets the lowest offset amongst all of the replicas */ -long long getLowestOffsetAmongReplicas(){ +/* gets the lowest offset amongst all of the replicas and stores it globally*/ +void updateLowestOffsetAmongReplicas(){ serverAssert(GlobalLocksAcquired()); + serverAssert(!g_pserver->repl_backlog_lock.fOwnLock()); + // serverLog(LL_NOTICE, "off- have repl"); long long min_offset = LONG_LONG_MAX; listIter li; listNode *ln; @@ -69,16 +71,15 @@ long long getLowestOffsetAmongReplicas(){ if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; - if (replica->repl_curr_idx == -1) continue; - std::unique_lock ul(replica->lock, std::defer_lock); - if (FCorrectThread(replica)) - ul.lock(); + std::unique_lock ul(replica->lock); + // serverLog(LL_NOTICE, "off- acq client"); - min_offset = std::min(min_offset, replica->repl_curr_off); + min_offset = std::min(min_offset, replica->repl_curr_off); + // serverLog(LL_NOTICE, "off- rel client"); } /* return -1 if no other minimum was found */ - return min_offset == LONG_LONG_MAX ? -1 : min_offset; + g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? -1 : min_offset, std::memory_order_seq_cst); } /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case @@ -412,11 +413,12 @@ void freeReplicationBacklog(void) { * the backlog without incrementing the offset. 
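updateLowestOffsetAmongReplicas() above replaces the old per-call minimum scan with a cached value: it walks the replicas that can still consume data, takes the smallest repl_curr_off, and publishes it through an atomic so backlog writers can size the buffer without re-walking the list. A simplified sketch (plain records instead of client objects, and without the per-client locking the real function performs):

    #include <algorithm>
    #include <atomic>
    #include <climits>
    #include <vector>

    // Sketch only: minimal replica records in place of the client struct.
    struct ReplicaOffsetView {
        bool waiting_bgsave = false;  // SLAVE_STATE_WAIT_BGSAVE_START
        bool closing = false;         // CLIENT_CLOSE_ASAP
        long long repl_curr_off = -1;
    };

    // Store the lowest offset still needed by any eligible replica, or -1 if there is none.
    void updateLowestOffset(const std::vector<ReplicaOffsetView> &replicas,
                            std::atomic<long long> &lowest) {
        long long min_off = LLONG_MAX;
        for (const auto &r : replicas) {
            if (r.waiting_bgsave || r.closing) continue;
            min_off = std::min(min_off, r.repl_curr_off);
        }
        lowest.store(min_off == LLONG_MAX ? -1 : min_off, std::memory_order_seq_cst);
    }
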
*/ void feedReplicationBacklog(const void *ptr, size_t len) { serverAssert(GlobalLocksAcquired()); + serverAssert(g_pserver->repl_backlog_lock.fOwnLock()); const unsigned char *p = (const unsigned char*)ptr; if (g_pserver->repl_batch_idxStart >= 0) { /* we are lower bounded by the lower client offset or the offStart if all the clients are up to date */ - long long lower_bound = getLowestOffsetAmongReplicas(); + long long lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); if (lower_bound == -1) lower_bound = g_pserver->repl_batch_offStart; long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; @@ -441,10 +443,9 @@ void feedReplicationBacklog(const void *ptr, size_t len) { g_pserver->master_repl_offset += len; - - /* This is a circular buffer, so write as much data we can at every * iteration and rewind the "idx" index if we reach the limit. */ + while(len) { size_t thislen = g_pserver->repl_backlog_size - g_pserver->repl_backlog_idx; if (thislen > len) thislen = len; @@ -598,6 +599,8 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) serverAssert(!(listLength(slaves) != 0 && g_pserver->repl_backlog == NULL)); bool fSendRaw = !g_pserver->fActiveReplica; + updateLowestOffsetAmongReplicas(); + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); /* Send SELECT command to every replica if needed. */ if (g_pserver->replicaseldb != dictid) { @@ -619,7 +622,9 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) /* Add the SELECT command into the backlog. */ /* We don't do this for advanced replication because this will be done later when it adds the whole RREPLAY command */ - if (g_pserver->repl_backlog && fSendRaw) feedReplicationBacklogWithObject(selectcmd); + if (g_pserver->repl_backlog && fSendRaw) { + feedReplicationBacklogWithObject(selectcmd); + } if (dictid < 0 || dictid >= PROTO_SHARED_SELECT_CMDS) decrRefCount(selectcmd); @@ -632,7 +637,6 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) if (fSendRaw) { char aux[LONG_STR_SIZE+3]; - /* Add the multi bulk reply length. 
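feedReplicationBacklog() now bounds the backlog from the slowest replica's published offset rather than from per-client indices: the minimum it must preserve is everything from that offset up to and including the bytes about to be appended. A sketch of that bound; the grow-on-demand call at the end is an assumption about how the value is consumed, since the hunk does not show it:

    // Sketch only: the size the backlog must retain before appending `len` more bytes.
    long long requiredBacklogSize(long long master_repl_offset, long long len,
                                  long long repl_lowest_off, long long batch_off_start) {
        long long lower_bound = repl_lowest_off;          // slowest replica, or -1 if none
        if (lower_bound == -1) lower_bound = batch_off_start;
        // Everything from the slowest replica through the end of this append, inclusive.
        return master_repl_offset + len - lower_bound + 1;
    }

    // Assumed usage (not shown in the hunk): grow on demand before the copy.
    // if (requiredBacklogSize(...) > g_pserver->repl_backlog_size)
    //     resizeReplicationBacklogForClients(requiredBacklogSize(...));
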
*/ aux[0] = '*'; int multilen = ll2string(aux+1,sizeof(aux)-1,argc); @@ -759,7 +763,11 @@ void replicationFeedSlavesFromMasterStream(char *buf, size_t buflen) { printf("\n"); } - if (g_pserver->repl_backlog) feedReplicationBacklog(buf,buflen); + if (g_pserver->repl_backlog){ + updateLowestOffsetAmongReplicas(); + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + feedReplicationBacklog(buf,buflen); + } } void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, int argc) { @@ -4662,6 +4670,9 @@ void flushReplBacklogToClients() #ifdef BYPASS_BUFFER { + std::unique_lock asyncUl(replica->lock, std::defer_lock); + if (!FCorrectThread(replica)) + asyncUl.lock(); /* If we are online and the RDB has been sent, there is no need to feed the client buffer * We will send our replies directly from the replication backlog instead */ std::unique_lock tRDBLock (replica->transmittedRDBLock); @@ -4694,21 +4705,5 @@ void flushReplBacklogToClients() // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; - } else if (getLowestOffsetAmongReplicas() != -1){ - listIter li; - listNode *ln; - listRewind(g_pserver->slaves, &li); - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - - std::unique_lock ul(replica->lock, std::defer_lock); - if (FCorrectThread(replica)) - ul.lock(); - - /* try to force prepare client to write i guess? */ - if (replica->repl_curr_idx != -1){ - if (prepareClientToWrite(replica) != C_OK) continue; - } - } - } + } } diff --git a/src/server.cpp b/src/server.cpp index 9664a4a6b..439e1aeff 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2924,6 +2924,7 @@ void initServerConfig(void) { g_pserver->enable_multimaster = CONFIG_DEFAULT_ENABLE_MULTIMASTER; g_pserver->repl_syncio_timeout = CONFIG_REPL_SYNCIO_TIMEOUT; g_pserver->master_repl_offset = 0; + g_pserver->repl_lowest_off.store(-1, std::memory_order_seq_cst); /* Replication partial resync backlog */ g_pserver->repl_backlog = NULL; diff --git a/src/server.h b/src/server.h index 14005e7d5..da1fce52e 100644 --- a/src/server.h +++ b/src/server.h @@ -2241,6 +2241,8 @@ struct redisServer { int repl_diskless_load; /* Slave parse RDB directly from the socket. * see REPL_DISKLESS_LOAD_* enum */ int repl_diskless_sync_delay; /* Delay to start a diskless repl BGSAVE. 
*/ + std::atomic repl_lowest_off; /* The lowest offset amongst all clients + Updated before calls to feed the replication backlog */ /* Replication (replica) */ list *masters; int enable_multimaster; @@ -2838,6 +2840,7 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, void rdbPipeWriteHandlerConnRemoved(struct connection *conn); void replicationNotifyLoadedKey(redisDb *db, robj_roptr key, robj_roptr val, long long expire); void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long long expire); +void updateLowestOffsetAmongReplicas(void); /* Generic persistence functions */ void startLoadingFile(FILE* fp, const char * filename, int rdbflags); From 2a6848a65a513926d3da6608d334351ed6878089 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 2 Jun 2021 23:41:36 +0000 Subject: [PATCH 12/22] Sync works single threaded properly, passes all but one testcase (which hangs) Former-commit-id: 9a6ca3a5d906b9d87fe70652d218decbb2775ac1 --- src/Makefile | 2 +- src/networking.cpp | 165 ++++++++++++++++++++++++++------------------ src/replication.cpp | 106 +++++++++++----------------- src/server.h | 9 +-- 4 files changed, 145 insertions(+), 137 deletions(-) diff --git a/src/Makefile b/src/Makefile index 966ce4400..a0ee5fe2a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -15,7 +15,7 @@ release_hdr := $(shell sh -c './mkreleasehdr.sh') uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not') -OPTIMIZATION?=-O2 -flto +OPTIMIZATION?=-O2 DEPENDENCY_TARGETS=hiredis linenoise lua rocksdb NODEPS:=clean distclean diff --git a/src/networking.cpp b/src/networking.cpp index caefd6d1e..80120d0ca 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -146,6 +146,7 @@ client *createClient(connection *conn, int iel) { c->flags = 0; c->fPendingAsyncWrite = FALSE; c->fPendingAsyncWriteHandler = FALSE; + c->fPendingReplicaWrite = FALSE; c->ctime = c->lastinteraction = g_pserver->unixtime; /* If the default user does not require authentication, the user is * directly authenticated. */ @@ -221,6 +222,10 @@ void clientInstallWriteHandler(client *c) { /* Schedule the client to write the output buffers to the socket only * if not already done and, for slaves, if the replica can actually receive * writes at this stage. */ + + if (c->flags & CLIENT_SLAVE) + serverLog(LL_NOTICE, "installing write handler"); + if (!(c->flags & CLIENT_PENDING_WRITE) && (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) @@ -272,6 +277,9 @@ void clientInstallAsyncWriteHandler(client *c) { int prepareClientToWrite(client *c) { bool fAsync = !FCorrectThread(c); // Not async if we're on the right thread + if (c->flags & CLIENT_SLAVE) + serverLog(LL_NOTICE, "got into prepareClientToWrite"); + if (!fAsync) { serverAssert(c->conn == nullptr || c->lock.fOwnLock()); } else { @@ -302,7 +310,7 @@ int prepareClientToWrite(client *c) { /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ - if (!fAsync && !clientHasPendingReplies(c) && c->repl_curr_idx == -1) clientInstallWriteHandler(c); + if (!fAsync && !clientHasPendingReplies(c) && !c->fPendingReplicaWrite) clientInstallWriteHandler(c); if (fAsync && !(c->fPendingAsyncWrite)) clientInstallAsyncWriteHandler(c); /* Authorize the caller to queue in the output buffer of this client. 
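Patch 12 replaces the repl_curr_idx == -1 test with an explicit fPendingReplicaWrite flag, so prepareClientToWrite() only installs the synchronous write handler when a client has neither buffered replies nor an outstanding backlog write, and cross-thread callers fall back to the async handler as before. A condensed sketch of that decision, with plain booleans standing in for the client flags:

    // Sketch only: when prepareClientToWrite() should schedule which handler.
    struct WriteFlagsView {
        bool has_pending_replies;     // clientHasPendingReplies(c)
        bool pending_replica_write;   // c->fPendingReplicaWrite
        bool cross_thread;            // caller is not on the client's thread
        bool pending_async_write;     // c->fPendingAsyncWrite
    };

    enum class WriteSetup { InstallHandler, InstallAsyncHandler, Nothing };

    WriteSetup decideWriteSetup(const WriteFlagsView &c) {
        if (!c.cross_thread && !c.has_pending_replies && !c.pending_replica_write)
            return WriteSetup::InstallHandler;       // nothing queued yet: schedule a flush
        if (c.cross_thread && !c.pending_async_write)
            return WriteSetup::InstallAsyncHandler;  // cross-thread write: defer to owner thread
        return WriteSetup::Nothing;                  // already scheduled
    }
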
*/ @@ -320,7 +328,6 @@ void _clientAsyncReplyBufferReserve(client *c, size_t len) { clientReplyBlock *replyNew = (clientReplyBlock*)zmalloc(sizeof(clientReplyBlock) + newsize); replyNew->size = zmalloc_usable(replyNew) - sizeof(clientReplyBlock); replyNew->used = 0; - std::unique_lock tRDBLock (c->transmittedRDBLock); c->replyAsync = replyNew; } @@ -334,7 +341,6 @@ int _addReplyToBuffer(client *c, const char *s, size_t len) { if (fAsync) { serverAssert(GlobalLocksAcquired()); - std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync == nullptr || (c->replyAsync->size - c->replyAsync->used) < len) { if (c->replyAsync == nullptr) { @@ -1661,6 +1667,16 @@ client *lookupClientByID(uint64_t id) { return (c == raxNotFound) ? NULL : c; } +/* Compute the corresponding index from a replication backlog offset + * by taking the distance between the input offset and the replication backlog offset + * and applying that to the replication backlog index, wrapping around if the index + * becomes negative. + * TODO: Rewrite comment for new logic */ +long long getReplIndexFromOffset(long long offset){ + long long index = (offset - g_pserver->repl_backlog_start) % g_pserver->repl_backlog_size; + return index; +} + /* Write data in output buffers to client. Return C_OK if the client * is still valid after the call, C_ERR if it was freed because of some * error. If handler_installed is set, it will attempt to clear the @@ -1680,7 +1696,11 @@ int writeToClient(client *c, int handler_installed) { std::unique_locklock)> lock(c->lock); // serverLog(LL_NOTICE, "acq client"); + if (c->flags & CLIENT_SLAVE) + serverLog(LL_NOTICE, "writeToClient has happened"); + while(clientHasPendingReplies(c)) { + serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR); if (c->bufpos > 0) { nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); if (nwritten <= 0) break; @@ -1739,80 +1759,67 @@ int writeToClient(client *c, int handler_installed) { /* We can only directly read from the replication backlog if the client is a replica, so only attempt to do so if that's the case. */ if (c->flags & CLIENT_SLAVE) { - /* If there are no more pending replies, then we have transmitted the RDB. - * This means further replication commands will be taken straight from the - * replication backlog from now on. */ - std::unique_lock tRDBLock (c->transmittedRDBLock); - - if (c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ - c->transmittedRDB = true; - } - bool transmittedRDB = c->transmittedRDB; - tRDBLock.unlock(); - /* For replicas, we don't store all the information in the client buffer - * Most of the time (aside from immediately after synchronizing), we read - * from the replication backlog directly */ - if (c->repl_curr_idx != -1 && transmittedRDB){ - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + * We always read from the replication backlog directly */ + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); - /* copy global variables into local scope so if they change in between we don't care */ - long long repl_backlog_idx = g_pserver->repl_backlog_idx; - long long repl_backlog_size = g_pserver->repl_backlog_size; + /* Right now, we're bringing in the offStart into the scope + * If repl_batch_offStart is equal to -1, that means the mechanism is disabled + * which implies there is no data to flush and that the global offset is accurate */ + long long offStart = g_pserver->repl_batch_offStart == -1 ? 
g_pserver->master_repl_offset : g_pserver->repl_batch_offStart; + long long idxStart = getReplIndexFromOffset(offStart); + if (g_pserver->repl_batch_offStart != -1) + serverAssert(idxStart == g_pserver->repl_batch_idxStart); + else + serverAssert(idxStart == g_pserver->repl_backlog_idx); + + if (c->repl_curr_off != -1 && c->repl_curr_off != offStart){ + serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", + c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); + + long long curr_idx = getReplIndexFromOffset(c->repl_curr_off); long long nwrittenPart2 = 0; - - ssize_t nrequested; /* The number of bytes requested to write */ /* normal case with no wrap around */ - if (repl_backlog_idx >= c->repl_curr_idx){ - nrequested = repl_backlog_idx - c->repl_curr_idx; - nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_idx - c->repl_curr_idx); + if (idxStart >= curr_idx){ + nwritten = connWrite(c->conn, g_pserver->repl_backlog + curr_idx, idxStart - curr_idx); /* wrap around case, v. rare */ /* also v. buggy so there's that */ } else { - nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; - nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); + serverLog(LL_NOTICE, "ROAD OF RESISTANCE"); + nwritten = connWrite(c->conn, g_pserver->repl_backlog + curr_idx, g_pserver->repl_backlog_size - curr_idx); /* only attempt wrapping if we write the correct number of bytes */ - if (nwritten == repl_backlog_size - c->repl_curr_idx){ - long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); + if (nwritten == g_pserver->repl_backlog_size - curr_idx){ + long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, idxStart); if (nwrittenPart2 != -1) nwritten += nwrittenPart2; - } } - /* only update the replica's current index if bytes were sent */ - - // if (nrequested != nwritten){ - // serverLog(LL_NOTICE, "-----------------------------------------"); - // serverLog(LL_NOTICE, "AFTER THE FACT"); - // serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - // serverLog(LL_NOTICE, "actually written: %ld", nwritten); - // serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - // serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - // serverLog(LL_NOTICE, "-----------------------------------------"); - // } - - - if (nwritten == nrequested && g_pserver->repl_backlog_idx == repl_backlog_idx){ - c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ - } - else if (nwritten > 0) - c->repl_curr_idx = (c->repl_curr_idx + nwritten) % repl_backlog_size; - - serverAssert(c->repl_curr_idx < repl_backlog_size); - /* only increment bytes if an error didn't occur */ if (nwritten > 0){ totwritten += nwritten; c->repl_curr_off += nwritten; + if (1){ + serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", + c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); + } + serverAssert(c->repl_curr_off <= offStart); + /* If the client offset matches the global offset, we wrote all we needed to, + * in which case, there is no pending write */ + if (c->repl_curr_off == offStart){ + 
serverLog(LL_NOTICE, "good, %lld", offStart); + c->fPendingReplicaWrite = false; + } else { + serverLog(LL_NOTICE, "mismatch between repl_curr_off (%lld) and offStart (%lld)", c->repl_curr_off, offStart); + } } /* If the second part of a write didn't go through, we still need to register that */ if (nwrittenPart2 == -1) nwritten = -1; } - if (c->flags & CLIENT_SLAVE && handler_installed) - serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); + // if (c->flags & CLIENT_SLAVE && handler_installed) + // serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); } @@ -1836,12 +1843,12 @@ int writeToClient(client *c, int handler_installed) { * We just rely on data / pings received for timeout detection. */ if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } - if (!clientHasPendingReplies(c) && c->repl_curr_idx == -1) { - if(c->flags & CLIENT_SLAVE && handler_installed){ - serverLog(LL_NOTICE, "Uninstalling handler"); - serverLog(LL_NOTICE, "handler repl_curr_idx: %lld, repl_backlog_size: %lld", c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - } + if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) { + // if(c->flags & CLIENT_SLAVE && handler_installed){ + // serverLog(LL_NOTICE, "Uninstalling handler"); + // serverLog(LL_NOTICE, "handler repl_curr_idx: %lld, repl_backlog_size: %lld", c->repl_curr_idx, g_pserver->repl_backlog_size); + // serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + // } c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -1857,7 +1864,7 @@ int writeToClient(client *c, int handler_installed) { /* Write event handler. Just send data to the client. */ void sendReplyToClient(connection *conn) { client *c = (client*)connGetPrivateData(conn); - serverLog(LL_NOTICE, "called the sendreplytoclient"); + // serverLog(LL_NOTICE, "called the sendreplytoclient"); if (writeToClient(c,1) == C_ERR) { AeLocker ae; @@ -1886,7 +1893,6 @@ void ProcessPendingAsyncWrites() serverAssert(c->fPendingAsyncWrite); if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_CLOSE_AFTER_REPLY)) { - std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync != nullptr){ zfree(c->replyAsync); c->replyAsync = nullptr; @@ -1898,7 +1904,6 @@ void ProcessPendingAsyncWrites() /* since writes from master to replica can come directly from the replication backlog, * writes may have been signalled without having been copied to the replyAsync buffer, * thus causing the buffer to be NULL */ - std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync != nullptr){ int size = c->replyAsync->used; @@ -1919,7 +1924,6 @@ void ProcessPendingAsyncWrites() } c->fPendingAsyncWrite = FALSE; - tRDBLock.unlock(); // Now install the write event handler int ae_flags = AE_WRITABLE|AE_WRITE_THREADSAFE; /* For the fsync=always policy, we want that a given FD is never @@ -2032,8 +2036,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. 
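Once writeToClient() sends from the backlog it advances the replica's repl_curr_off by the bytes actually written and clears fPendingReplicaWrite only when the replica has caught up to the offset the backlog had been flushed to; the write handler then comes off, and handleClientsWithPendingWrites() reinstalls it whenever either buffered replies or a pending backlog write remain. A small sketch of that bookkeeping:

    // Sketch only: advancing a replica after sending `sent` bytes toward `off_start`,
    // where off_start is the flushed backlog offset captured at the start of the write.
    void onBacklogBytesSent(long long &repl_curr_off, bool &pending_replica_write,
                            long long sent, long long off_start) {
        if (sent <= 0) return;               // error or nothing written: caller handles it
        repl_curr_off += sent;
        // Invariant from the patch: a replica never runs ahead of the flushed offset.
        if (repl_curr_off == off_start)
            pending_replica_write = false;   // caught up: the write handler can be removed
    }
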
*/ - if (clientHasPendingReplies(c) || c->repl_curr_idx != -1) { - serverLog(LL_NOTICE, "Setting a write handler for later"); + if (clientHasPendingReplies(c) || c->fPendingReplicaWrite) { if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) { freeClientAsync(c); } @@ -2214,6 +2217,34 @@ static void setProtocolError(const char *errstr, client *c) { c->flags |= (CLIENT_CLOSE_AFTER_REPLY|CLIENT_PROTOCOL_ERROR); } +static void printQueryBuffer(client *c) { + if (cserver.verbosity <= LL_VERBOSE || c->flags & CLIENT_MASTER) { + sds client = catClientInfoString(sdsempty(),c); + + /* Sample some protocol to given an idea about what was inside. */ + char buf[PROTO_DUMP_LEN*2]; + if (sdslen(c->querybuf)-c->qb_pos < PROTO_DUMP_LEN) { + snprintf(buf,sizeof(buf),"%s", c->querybuf+c->qb_pos); + } else { + snprintf(buf,sizeof(buf),"%.*s (... more %zu bytes ...) %.*s", PROTO_DUMP_LEN/2, c->querybuf+c->qb_pos, sdslen(c->querybuf)-c->qb_pos-PROTO_DUMP_LEN, PROTO_DUMP_LEN/2, c->querybuf+sdslen(c->querybuf)-PROTO_DUMP_LEN/2); + } + + /* Remove non printable chars. */ + char *p = buf; + while (*p != '\0') { + if (!isprint(*p)) *p = '.'; + p++; + } + + /* Log all the client and protocol info. */ + int loglevel = (c->flags & CLIENT_MASTER) ? LL_WARNING : + LL_VERBOSE; + serverLog(loglevel, + "Query buffer from client %lu: %s. %s", c->id, client, buf); + sdsfree(client); + } +} + /* Process the query buffer for client 'c', setting up the client argument * vector for command execution. Returns C_OK if after running the function * the client has a well-formed ready to be processed command, otherwise @@ -2468,6 +2499,8 @@ void parseClientCommandBuffer(client *c) { } size_t cqueriesStart = c->vecqueuedcmd.size(); + // if (c->flags & CLIENT_MASTER) + // printQueryBuffer(c); if (c->reqtype == PROTO_REQ_INLINE) { if (processInlineBuffer(c) != C_OK) break; } else if (c->reqtype == PROTO_REQ_MULTIBULK) { diff --git a/src/replication.cpp b/src/replication.cpp index d1181bdf4..97638e833 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -88,18 +88,6 @@ int RDBGeneratedByReplication = 0; void resizeReplicationBacklogForClients(long long newsize); -void setReplIdx(client *c, long long idx, long long off){ - // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); - // serverLog(LL_NOTICE, "Repl Index started at: %lld", c->repl_curr_idx); - if (c->repl_curr_idx == -1 && off >= c->repl_curr_off){ - if (prepareClientToWrite(c) != C_OK) return; - c->repl_curr_idx = idx; - c->repl_curr_off = off; - } - // serverLog(LL_NOTICE, "Repl Index has become: %lld", c->repl_curr_idx); - -} - /* --------------------------- Utility functions ---------------------------- */ /* Return the pointer to a string representing the replica ip:listening_port @@ -232,6 +220,7 @@ void createReplicationBacklog(void) { g_pserver->repl_backlog = (char*)zmalloc(g_pserver->repl_backlog_size, MALLOC_LOCAL); g_pserver->repl_backlog_histlen = 0; g_pserver->repl_backlog_idx = 0; + g_pserver->repl_backlog_start = g_pserver->master_repl_offset; /* We don't have any data inside our buffer, but virtually the first * byte we have is the next byte that will be generated for the @@ -284,6 +273,7 @@ void resizeReplicationBacklog(long long newsize) { g_pserver->repl_backlog = backlog; g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; g_pserver->repl_batch_idxStart = 0; + g_pserver->repl_backlog_start = g_pserver->master_repl_offset; } else { 
zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = (char*)zmalloc(newsize); @@ -296,6 +286,7 @@ void resizeReplicationBacklog(long long newsize) { g_pserver->repl_backlog_size = newsize; } +long long getReplIndexFromOffset(long long offset); /* The above but for when clients need extra replication backlog because ??? */ void resizeReplicationBacklogForClients(long long newsize) { @@ -305,32 +296,8 @@ void resizeReplicationBacklogForClients(long long newsize) { serverLog(LL_NOTICE, "WE HAVE TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); /* get the critical client size, i.e. the size of the data unflushed to clients */ - long long earliest_off = LONG_LONG_MAX; - long long earliest_idx = -1; - listIter li; - listNode *ln; - listRewind(g_pserver->slaves, &li); - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - if (replica->repl_curr_off != -1 && replica->repl_curr_off < earliest_off){ - earliest_off = replica->repl_curr_off; - earliest_idx = replica->repl_curr_idx; - } - serverLog(LL_NOTICE, "repl_curr_idx: %lld, earlistidx: %lld", replica->repl_curr_idx, earliest_idx); - } - serverLog(LL_NOTICE, "We are starting with: master_repl_offset: %lld, repl_batch_offStart: %lld, earliest_off: %lld, " - "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, earliest_idx: %lld, repl_backlog_size: %lld", - g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, earliest_off, - g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, earliest_idx, g_pserver->repl_backlog_size - ); + long long earliest_off = g_pserver->repl_lowest_off.load(); - long long new_off = 0, new_idx = 0; - - /* if no earliest offset is found amongst the clients, they are all up to date with the flushed index */ - if (earliest_off == LONG_LONG_MAX && earliest_idx == -1){ - earliest_idx = g_pserver->repl_batch_idxStart; - earliest_off = g_pserver->repl_batch_offStart; - } if (g_pserver->repl_backlog != NULL) { /* What we actually do is to flush the old buffer and realloc a new @@ -339,17 +306,18 @@ void resizeReplicationBacklogForClients(long long newsize) { * worse often we need to alloc additional space before freeing the * old buffer. 
*/ - if (earliest_idx >= 0) { + if (earliest_off != -1) { // We need to keep critical data so we can't shrink less than the hot data in the buffer newsize = std::max(newsize, g_pserver->master_repl_offset - earliest_off); char *backlog = (char*)zmalloc(newsize); g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - earliest_off; + long long earliest_idx = getReplIndexFromOffset(earliest_off); if (g_pserver->repl_backlog_idx >= earliest_idx) { auto cbActiveBacklog = g_pserver->repl_backlog_idx - earliest_idx; memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbActiveBacklog); - serverLog(LL_NOTICE, "g_pserver->master_repl_offset: %lld, earliest_off: %lld, g_pserver->repl_backlog_idx: %lld, earliest_idx: %lld", - g_pserver->master_repl_offset, earliest_off, g_pserver->repl_backlog_idx, earliest_idx); + serverLog(LL_NOTICE, "g_pserver->master_repl_offset: %lld, earliest_off: %lld, g_pserver->repl_backlog_idx: %lld, earliest_idx: %lld, repl_backlog_start: %lld", + g_pserver->master_repl_offset, earliest_off, g_pserver->repl_backlog_idx, earliest_idx, g_pserver->repl_backlog_start); serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); } else { auto cbPhase1 = g_pserver->repl_backlog_size - earliest_idx; @@ -361,20 +329,10 @@ void resizeReplicationBacklogForClients(long long newsize) { zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = backlog; g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; - listRewind(g_pserver->slaves, &li); - /* Go through the clients and update their replication indicies */ - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - if (replica->repl_curr_idx != -1){ - replica->repl_curr_idx -= earliest_idx; - if (replica->repl_curr_idx < 0) - replica->repl_curr_idx += g_pserver->repl_backlog_size; - } - new_idx = replica->repl_curr_idx; - } g_pserver->repl_batch_idxStart -= earliest_idx; if (g_pserver->repl_batch_idxStart < 0) g_pserver->repl_batch_idxStart += g_pserver->repl_backlog_size; + g_pserver->repl_backlog_start = earliest_off; } else { zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = (char*)zmalloc(newsize); @@ -382,14 +340,15 @@ void resizeReplicationBacklogForClients(long long newsize) { g_pserver->repl_backlog_idx = 0; /* Next byte we have is... the next since the buffer is empty. 
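With repl_backlog_start recorded, indices no longer have to be tracked per client; they are derived from offsets on demand (index = (offset - start) % size), and a resize only needs to carry over the region that the slowest replica still requires. A compact sketch of the offset-to-index mapping and of copying the still-needed region into a new buffer (stand-alone helpers, not the exact resize routine):

    #include <cstring>

    // Sketch only: offset -> index mapping enabled by recording repl_backlog_start.
    long long replIndexFromOffset(long long offset, long long backlog_start, long long backlog_size) {
        return (offset - backlog_start) % backlog_size;
    }

    // Copy the still-needed region [earliest_idx, old_idx) of the old circular buffer
    // into the start of a new buffer; returns the number of bytes preserved.
    long long copyActiveRegion(const char *old_buf, long long old_size, long long old_idx,
                               long long earliest_idx, char *new_buf) {
        if (old_idx >= earliest_idx) {                      // contiguous region
            long long n = old_idx - earliest_idx;
            std::memcpy(new_buf, old_buf + earliest_idx, (size_t)n);
            return n;
        }
        long long tail = old_size - earliest_idx;           // wrapped: tail first, then head
        std::memcpy(new_buf, old_buf + earliest_idx, (size_t)tail);
        std::memcpy(new_buf + tail, old_buf, (size_t)old_idx);
        return tail + old_idx;
    }
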
*/ g_pserver->repl_backlog_off = g_pserver->master_repl_offset+1; + g_pserver->repl_backlog_start = g_pserver->master_repl_offset; } } g_pserver->repl_backlog_size = newsize; serverLog(LL_NOTICE, "We are ending with: master_repl_offset: %lld, repl_batch_offStart: %lld, new_off: %lld, " "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, new_idx: %lld, repl_backlog_size: %lld", - g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, new_off, - g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, new_idx, g_pserver->repl_backlog_size + g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, 0LL, + g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, 0LL, g_pserver->repl_backlog_size ); } @@ -456,11 +415,6 @@ void feedReplicationBacklog(const void *ptr, size_t len) { len -= thislen; p += thislen; g_pserver->repl_backlog_histlen += thislen; - // serverLog(LL_NOTICE, "Pt2 intermediate with: master_repl_offset: %lld, repl_batch_offStart: %lld, " - // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", - // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, - // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size - // ); } if (g_pserver->repl_backlog_histlen > g_pserver->repl_backlog_size) g_pserver->repl_backlog_histlen = g_pserver->repl_backlog_size; @@ -722,7 +676,7 @@ void replicationFeedSlaves(list *replicas, int dictid, robj **argv, int argc) { void showLatestBacklog(void) { if (g_pserver->repl_backlog == NULL) return; - long long dumplen = 256; + long long dumplen = 1024; if (g_pserver->repl_backlog_histlen < dumplen) dumplen = g_pserver->repl_backlog_histlen; @@ -813,7 +767,7 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, } decrRefCount(cmdobj); } - +#define BYPASS_PSYNC /* Feed the replica 'c' with the replication backlog starting from the * specified 'offset' up to the end of the backlog. */ long long addReplyReplicationBacklog(client *c, long long offset) { @@ -854,7 +808,8 @@ long long addReplyReplicationBacklog(client *c, long long offset) { len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); #ifdef BYPASS_PSYNC - setReplIdx(c, j, offset); + c->repl_curr_off = offset - 1; + serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", c->id, getClientPeerId(c), c->repl_curr_off); #else while(len) { long long thislen = @@ -900,6 +855,11 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) { replica->psync_initial_offset = offset; replica->replstate = SLAVE_STATE_WAIT_BGSAVE_END; + + replica->repl_curr_off = offset; + + serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); + /* We are going to accumulate the incremental changes for this * replica as well. Set replicaseldb to -1 in order to force to re-emit * a SELECT statement in the replication stream. */ @@ -2006,7 +1966,6 @@ void replicationCreateMasterClient(redisMaster *mi, connection *conn, int dbid) mi->master->reploff_skipped = 0; mi->master->read_reploff = mi->master->reploff; mi->master->puser = NULL; /* This client can do everything. 
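Under BYPASS_PSYNC a partial resynchronization no longer copies the requested span of the backlog into the replica's reply buffer; it simply records the replica's offset just before the requested byte, and a follow-up patch in this series additionally queues the pending write, so the normal writeToClient() path streams the data from the backlog. A sketch of the two behaviours side by side (copyBacklogSpanToReplyBuffer is a hypothetical stand-in for the old addReply loop):

    // Sketch only: partial resync setup with and without the bypass.
    struct ResyncTargetView {
        long long repl_curr_off = -1;
        bool pending_replica_write = false;
    };

    void setupPartialResync(ResyncTargetView &replica, long long requested_offset, bool bypass_psync) {
        if (bypass_psync) {
            // Position the replica just before the requested offset; the backlog send
            // path delivers everything from there. The real code also calls
            // prepareClientToWrite() so the write actually gets scheduled.
            replica.repl_curr_off = requested_offset - 1;
            replica.pending_replica_write = true;
        } else {
            // Legacy behaviour: copy the span out of the backlog into the reply buffer.
            // copyBacklogSpanToReplyBuffer(replica, requested_offset);  // hypothetical helper
        }
    }
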
*/ - memcpy(mi->master->uuid, mi->master_uuid, UUID_BINARY_LEN); memset(mi->master_uuid, 0, UUID_BINARY_LEN); // make sure people don't use this temp storage buffer @@ -4652,12 +4611,17 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); + serverLog(LL_NOTICE, "the master repl offset is %lld", g_pserver->master_repl_offset); + showLatestBacklog(); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); + // serverLog(LL_NOTICE, "client %lu is in the party", replica->id); + + // serverLog(LL_NOTICE, "is there a write pending for %lu, %d", replica->id, replica->fPendingReplicaWrite); if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; @@ -4675,11 +4639,21 @@ void flushReplBacklogToClients() asyncUl.lock(); /* If we are online and the RDB has been sent, there is no need to feed the client buffer * We will send our replies directly from the replication backlog instead */ - std::unique_lock tRDBLock (replica->transmittedRDBLock); - if (replica->replstate == SLAVE_STATE_ONLINE && replica->transmittedRDB){ - setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); - continue; + if (replica->repl_curr_off == -1){ + replica->repl_curr_off = g_pserver->repl_batch_offStart; + + serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); + } + + /* Only if the there isn't already a pending write do we prepare the client to write */ + if (!replica->fPendingReplicaWrite){ + serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); + prepareClientToWrite(replica); + replica->fPendingReplicaWrite = true; + } + + continue; } #endif if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { diff --git a/src/server.h b/src/server.h index da1fce52e..9fdf5e0ef 100644 --- a/src/server.h +++ b/src/server.h @@ -1516,8 +1516,11 @@ struct client { long long psync_initial_offset; /* FULLRESYNC reply offset other slaves copying this replica output buffer should use. */ + long long repl_curr_idx = -1; /* Replication index sent, if this is a replica */ long long repl_curr_off = -1; + int fPendingReplicaWrite; + char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ int slave_listening_port; /* As configured with: REPLCONF listening-port */ char slave_ip[NET_IP_STR_LEN]; /* Optionally given by REPLCONF ip-address */ @@ -1577,12 +1580,8 @@ struct client { robj **argv; size_t argv_len_sumActive = 0; - bool transmittedRDB = false; /* Have we finished transmitting the RDB to this replica? 
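flushReplBacklogToClients() becomes the place where a replica is attached to the backlog stream: a replica that has never been positioned is pointed at the start of the current batch, and at most one pending backlog write is queued for it per flush. A minimal sketch of that per-replica step (prepareClientToWrite() is elided):

    // Sketch only: per-replica bookkeeping done while flushing the backlog.
    struct ReplicaFlushView {
        long long repl_curr_off = -1;
        bool pending_replica_write = false;
    };

    void onBacklogFlush(ReplicaFlushView &r, long long batch_off_start) {
        // A replica that has never been positioned starts at the batch's first offset.
        if (r.repl_curr_off == -1)
            r.repl_curr_off = batch_off_start;
        // Queue exactly one pending write; the real code calls prepareClientToWrite() first.
        if (!r.pending_replica_write)
            r.pending_replica_write = true;
    }
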
*/ - /* If so, we can read from the replication backlog instead of the client buffer */ - // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); - fastlock transmittedRDBLock {"transmittedRDB"}; size_t argv_len_sum() const; }; @@ -2229,6 +2228,8 @@ struct redisServer { that is the next byte will'll write to.*/ long long repl_backlog_off; /* Replication "master offset" of first byte in the replication backlog buffer.*/ + long long repl_backlog_start; /* Used to compute indicies from offsets + basically, index = (offset - start) % size */ fastlock repl_backlog_lock {"replication backlog"}; time_t repl_backlog_time_limit; /* Time without slaves after the backlog gets released. */ From 2e9c7aed031f5822ddbe955803b6a09c6c1a9aca Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 3 Jun 2021 20:44:32 +0000 Subject: [PATCH 13/22] Single threaded tests work now Former-commit-id: 0e760d7c71231c7f52102909a31fc8db1b3e2860 --- src/networking.cpp | 2 +- src/replication.cpp | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/networking.cpp b/src/networking.cpp index 80120d0ca..e8ede3338 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -3419,7 +3419,7 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { * that writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long getClientReplicationBacklogSharedUsage(client *c) { - return (!(c->flags & CLIENT_SLAVE) || c->repl_curr_idx == -1) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; + return (!(c->flags & CLIENT_SLAVE) || !c->fPendingReplicaWrite ) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; } /* This function returns the number of bytes that Redis is diff --git a/src/replication.cpp b/src/replication.cpp index 97638e833..a7a2aa79e 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -810,6 +810,10 @@ long long addReplyReplicationBacklog(client *c, long long offset) { #ifdef BYPASS_PSYNC c->repl_curr_off = offset - 1; serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", c->id, getClientPeerId(c), c->repl_curr_off); + + /* Force the partial sync to be queued */ + prepareClientToWrite(c); + c->fPendingReplicaWrite = true; #else while(len) { long long thislen = From 667d2763c0df3ca48b52949a365d3237dbcc0c52 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 3 Jun 2021 21:47:33 +0000 Subject: [PATCH 14/22] Removed unused variables Former-commit-id: 48663bc480f7279a94c68aeebdd9721ca64f7038 --- src/config.cpp | 1 - src/evict.cpp | 1 - src/replication.cpp | 2 -- src/server.h | 6 +----- 4 files changed, 1 insertion(+), 9 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index b546ef607..9d7f14007 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -2347,7 +2347,6 @@ static int updateReplBacklogSize(long long val, long long prev, const char **err UNUSED(err); g_pserver->repl_backlog_size = prev; resizeReplicationBacklog(val); - g_pserver->repl_backlog_config_size = g_pserver->repl_backlog_size; return 1; } diff --git a/src/evict.cpp b/src/evict.cpp index e7f0a10ef..7ec223f6d 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -399,7 +399,6 @@ size_t freeMemoryGetNotCountedMemory(void) { /* also don't count the replication backlog memory * that's where the replication clients get their memory from */ - // overhead += (g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size); 
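For illustration, the mapping that the new repl_backlog_start field enables can be sketched on its own, with the state passed in explicitly rather than read from g_pserver (the helper name here is hypothetical):

    /* Minimal sketch of "index = (offset - start) % size" from the comment above,
     * assuming offsets at or beyond the recorded start. */
    long long replIndexFromOffset(long long offset, long long start, long long size) {
        return (offset - start) % size;
    }
    /* e.g. with start = 1000 and size = 64: offset 1000 -> 0, 1063 -> 63, 1064 -> 0 again. */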
overhead += g_pserver->repl_backlog_size; if (g_pserver->aof_state != AOF_OFF) { diff --git a/src/replication.cpp b/src/replication.cpp index a7a2aa79e..3a48963ab 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -1129,7 +1129,6 @@ void syncCommand(client *c) { if (!strcasecmp((const char*)ptrFromObj(c->argv[0]),"psync")) { if (masterTryPartialResynchronization(c) == C_OK) { g_pserver->stat_sync_partial_ok++; - // c->repl_curr_idx = g_pserver->repl_backlog_idx; return; /* No full resync needed, return. */ } else { char *master_replid = (char*)ptrFromObj(c->argv[1]); @@ -1157,7 +1156,6 @@ void syncCommand(client *c) { connDisableTcpNoDelay(c->conn); /* Non critical if it fails. */ c->repldbfd = -1; c->flags |= CLIENT_SLAVE; - // c->repl_curr_idx = g_pserver->repl_backlog_idx; listAddNodeTail(g_pserver->slaves,c); /* Create the replication backlog if needed. */ diff --git a/src/server.h b/src/server.h index 9fdf5e0ef..2aba985ed 100644 --- a/src/server.h +++ b/src/server.h @@ -1517,8 +1517,7 @@ struct client { copying this replica output buffer should use. */ - long long repl_curr_idx = -1; /* Replication index sent, if this is a replica */ - long long repl_curr_off = -1; + long long repl_curr_off = -1; /* Replication offset of the client, only if it's a replica*/ int fPendingReplicaWrite; char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ @@ -2416,9 +2415,6 @@ struct redisServer { uint16_t rglockSamples[s_lockContentionSamples]; unsigned ilockRingHead = 0; - long long repl_backlog_config_size = 1024*1024; /* This is a hack to ignore the resizing of the replication backlog - when using it as a defacto for the client buffer */ - bool FRdbSaveInProgress() const { return rdbThreadVars.fRdbThreadActive; } }; From da0b7a3900ba50b37a2e3ac0cac1196aa19d734d Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Fri, 4 Jun 2021 20:09:47 +0000 Subject: [PATCH 15/22] Seems to pass multithreaded test cases, thank the lord Former-commit-id: 6cbf70cfff5735f3d4ef2e980945b4b1a1f85971 --- src/networking.cpp | 19 +++++++++---------- src/replication.cpp | 15 +++++++++------ src/server.h | 1 + 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index e8ede3338..cead76998 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -277,8 +277,9 @@ void clientInstallAsyncWriteHandler(client *c) { int prepareClientToWrite(client *c) { bool fAsync = !FCorrectThread(c); // Not async if we're on the right thread - if (c->flags & CLIENT_SLAVE) + if (c->flags & CLIENT_SLAVE){ serverLog(LL_NOTICE, "got into prepareClientToWrite"); + } if (!fAsync) { serverAssert(c->conn == nullptr || c->lock.fOwnLock()); @@ -1758,7 +1759,7 @@ int writeToClient(client *c, int handler_installed) { /* We can only directly read from the replication backlog if the client is a replica, so only attempt to do so if that's the case. 
*/ - if (c->flags & CLIENT_SLAVE) { + if (c->flags & CLIENT_SLAVE && !(c->flags & CLIENT_MONITOR)) { /* For replicas, we don't store all the information in the client buffer * We always read from the replication backlog directly */ std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); @@ -1766,14 +1767,12 @@ int writeToClient(client *c, int handler_installed) { /* Right now, we're bringing in the offStart into the scope * If repl_batch_offStart is equal to -1, that means the mechanism is disabled * which implies there is no data to flush and that the global offset is accurate */ - long long offStart = g_pserver->repl_batch_offStart == -1 ? g_pserver->master_repl_offset : g_pserver->repl_batch_offStart; + // long long offStart = g_pserver->repl_batch_offStart == -1 ? g_pserver->master_repl_offset : g_pserver->repl_batch_offStart; + long long offStart = c->repl_end_off; long long idxStart = getReplIndexFromOffset(offStart); - if (g_pserver->repl_batch_offStart != -1) - serverAssert(idxStart == g_pserver->repl_batch_idxStart); - else - serverAssert(idxStart == g_pserver->repl_backlog_idx); - - if (c->repl_curr_off != -1 && c->repl_curr_off != offStart){ + + serverAssert(c->repl_curr_off != -1); + if (c->repl_curr_off != offStart){ serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); @@ -1846,7 +1845,7 @@ int writeToClient(client *c, int handler_installed) { if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) { // if(c->flags & CLIENT_SLAVE && handler_installed){ // serverLog(LL_NOTICE, "Uninstalling handler"); - // serverLog(LL_NOTICE, "handler repl_curr_idx: %lld, repl_backlog_size: %lld", c->repl_curr_idx, g_pserver->repl_backlog_size); + // serverLog(LL_NOTICE, "repl_backlog_size: %lld", g_pserver->repl_backlog_size); // serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); // } c->sentlen = 0; diff --git a/src/replication.cpp b/src/replication.cpp index 3a48963ab..96bf161f9 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -382,7 +382,9 @@ void feedReplicationBacklog(const void *ptr, size_t len) { lower_bound = g_pserver->repl_batch_offStart; long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { + g_pserver->repl_backlog_lock.unlock(); flushReplBacklogToClients(); + g_pserver->repl_backlog_lock.lock(); minimumsize = g_pserver->master_repl_offset + len - lower_bound +1; if (minimumsize > g_pserver->repl_backlog_size) { @@ -809,6 +811,7 @@ long long addReplyReplicationBacklog(client *c, long long offset) { serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); #ifdef BYPASS_PSYNC c->repl_curr_off = offset - 1; + c->repl_end_off = g_pserver->master_repl_offset; serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", c->id, getClientPeerId(c), c->repl_curr_off); /* Force the partial sync to be queued */ @@ -861,6 +864,7 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) { replica->replstate = SLAVE_STATE_WAIT_BGSAVE_END; replica->repl_curr_off = offset; + replica->repl_end_off = g_pserver->master_repl_offset; serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); @@ -4634,19 +4638,18 @@ void flushReplBacklogToClients() 
fAsyncWrite = true; + /* If we are online and the RDB has been sent, there is no need to feed the client buffer + * We will send our replies directly from the replication backlog instead */ #ifdef BYPASS_BUFFER { std::unique_lock asyncUl(replica->lock, std::defer_lock); if (!FCorrectThread(replica)) asyncUl.lock(); - /* If we are online and the RDB has been sent, there is no need to feed the client buffer - * We will send our replies directly from the replication backlog instead */ - if (replica->repl_curr_off == -1){ - replica->repl_curr_off = g_pserver->repl_batch_offStart; - serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); + /* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */ + serverAssert(replica->repl_curr_off != -1); - } + replica->repl_end_off = g_pserver->master_repl_offset; /* Only if the there isn't already a pending write do we prepare the client to write */ if (!replica->fPendingReplicaWrite){ diff --git a/src/server.h b/src/server.h index 2aba985ed..64a2ca515 100644 --- a/src/server.h +++ b/src/server.h @@ -1518,6 +1518,7 @@ struct client { should use. */ long long repl_curr_off = -1; /* Replication offset of the client, only if it's a replica*/ + long long repl_end_off = -1; /* Replication offset to write to */ int fPendingReplicaWrite; char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ From 9db8556e91d46c5e2fb7f96ea5fb3880d56274aa Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Tue, 8 Jun 2021 23:10:53 +0000 Subject: [PATCH 16/22] Cleaned up code a bit, need to rewrite some comments to reflect new behaviour Former-commit-id: 850ec766cd71614ce9e61c12414545cd212d3878 --- src/evict.cpp | 1 - src/networking.cpp | 108 ++++---------------------- src/replication.cpp | 179 +++++++------------------------------------- src/server.cpp | 2 - src/server.h | 1 - 5 files changed, 43 insertions(+), 248 deletions(-) diff --git a/src/evict.cpp b/src/evict.cpp index 7ec223f6d..54153dc27 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -522,7 +522,6 @@ int freeMemoryIfNeeded(bool fQuickCycle, bool fPreSnapshot) { if (g_pserver->maxmemory_policy == MAXMEMORY_NO_EVICTION) goto cant_free; /* We need to free memory, but policy forbids. */ - serverLog(LL_NOTICE, "evicting i guess lol, the overhead was %ld, the repl_backlog_size, %lld", freeMemoryGetNotCountedMemory(), g_pserver->repl_backlog_size); while (mem_freed < mem_tofree) { int j, k, i; static unsigned int next_db = 0; diff --git a/src/networking.cpp b/src/networking.cpp index cead76998..aba1f1705 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -223,9 +223,6 @@ void clientInstallWriteHandler(client *c) { * if not already done and, for slaves, if the replica can actually receive * writes at this stage. 
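With repl_curr_off and repl_end_off both kept on the client, each replica carries its own window into the shared backlog. A minimal sketch of that idea, using hypothetical names rather than the real client struct:

    struct ReplicaWindow {
        long long curr_off = -1;  /* last offset already written to this replica's socket */
        long long end_off  = -1;  /* offset the replica is currently allowed to read up to */

        bool hasPendingWrite() const { return curr_off != -1 && curr_off < end_off; }
        long long pendingBytes() const { return hasPendingWrite() ? end_off - curr_off : 0; }
    };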
*/ - if (c->flags & CLIENT_SLAVE) - serverLog(LL_NOTICE, "installing write handler"); - if (!(c->flags & CLIENT_PENDING_WRITE) && (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) @@ -277,10 +274,6 @@ void clientInstallAsyncWriteHandler(client *c) { int prepareClientToWrite(client *c) { bool fAsync = !FCorrectThread(c); // Not async if we're on the right thread - if (c->flags & CLIENT_SLAVE){ - serverLog(LL_NOTICE, "got into prepareClientToWrite"); - } - if (!fAsync) { serverAssert(c->conn == nullptr || c->lock.fOwnLock()); } else { @@ -1695,10 +1688,6 @@ int writeToClient(client *c, int handler_installed) { serverAssertDebug(FCorrectThread(c)); std::unique_locklock)> lock(c->lock); - // serverLog(LL_NOTICE, "acq client"); - - if (c->flags & CLIENT_SLAVE) - serverLog(LL_NOTICE, "writeToClient has happened"); while(clientHasPendingReplies(c)) { serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR); @@ -1710,7 +1699,6 @@ int writeToClient(client *c, int handler_installed) { /* If the buffer was sent, set bufpos to zero to continue with * the remainder of the reply. */ - // serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); if ((int)c->sentlen == c->bufpos) { c->bufpos = 0; c->sentlen = 0; @@ -1764,33 +1752,24 @@ int writeToClient(client *c, int handler_installed) { * We always read from the replication backlog directly */ std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); - /* Right now, we're bringing in the offStart into the scope - * If repl_batch_offStart is equal to -1, that means the mechanism is disabled - * which implies there is no data to flush and that the global offset is accurate */ - // long long offStart = g_pserver->repl_batch_offStart == -1 ? g_pserver->master_repl_offset : g_pserver->repl_batch_offStart; - long long offStart = c->repl_end_off; - long long idxStart = getReplIndexFromOffset(offStart); + long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); serverAssert(c->repl_curr_off != -1); - if (c->repl_curr_off != offStart){ - serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", - c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); - - long long curr_idx = getReplIndexFromOffset(c->repl_curr_off); - long long nwrittenPart2 = 0; + if (c->repl_curr_off != c->repl_end_off){ + long long repl_curr_idx = getReplIndexFromOffset(c->repl_curr_off); + long long nwritten2ndStage = 0; /* How much was written from the start of the replication backlog + * in the event of a wrap around write */ /* normal case with no wrap around */ - if (idxStart >= curr_idx){ - nwritten = connWrite(c->conn, g_pserver->repl_backlog + curr_idx, idxStart - curr_idx); - /* wrap around case, v. rare */ - /* also v. 
buggy so there's that */ + if (repl_end_idx >= repl_curr_idx){ + nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, repl_end_idx - repl_curr_idx); + /* wrap around case */ } else { - serverLog(LL_NOTICE, "ROAD OF RESISTANCE"); - nwritten = connWrite(c->conn, g_pserver->repl_backlog + curr_idx, g_pserver->repl_backlog_size - curr_idx); + nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, g_pserver->repl_backlog_size - repl_curr_idx); /* only attempt wrapping if we write the correct number of bytes */ - if (nwritten == g_pserver->repl_backlog_size - curr_idx){ - long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, idxStart); - if (nwrittenPart2 != -1) - nwritten += nwrittenPart2; + if (nwritten == g_pserver->repl_backlog_size - repl_curr_idx){ + nwritten2ndStage = connWrite(c->conn, g_pserver->repl_backlog, repl_end_idx); + if (nwritten2ndStage != -1) + nwritten += nwritten2ndStage; } } @@ -1798,31 +1777,19 @@ int writeToClient(client *c, int handler_installed) { if (nwritten > 0){ totwritten += nwritten; c->repl_curr_off += nwritten; - if (1){ - serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", - c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); - } - serverAssert(c->repl_curr_off <= offStart); + serverAssert(c->repl_curr_off <= c->repl_end_off); /* If the client offset matches the global offset, we wrote all we needed to, * in which case, there is no pending write */ - if (c->repl_curr_off == offStart){ - serverLog(LL_NOTICE, "good, %lld", offStart); + if (c->repl_curr_off == c->repl_end_off){ c->fPendingReplicaWrite = false; - } else { - serverLog(LL_NOTICE, "mismatch between repl_curr_off (%lld) and offStart (%lld)", c->repl_curr_off, offStart); } } /* If the second part of a write didn't go through, we still need to register that */ - if (nwrittenPart2 == -1) nwritten = -1; + if (nwritten2ndStage == -1) nwritten = -1; } - - // if (c->flags & CLIENT_SLAVE && handler_installed) - // serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); - } - // serverLog(LL_NOTICE, "rel client"); g_pserver->stat_net_output_bytes += totwritten; if (nwritten == -1) { if (connGetState(c->conn) == CONN_STATE_CONNECTED) { @@ -1843,11 +1810,6 @@ int writeToClient(client *c, int handler_installed) { if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) { - // if(c->flags & CLIENT_SLAVE && handler_installed){ - // serverLog(LL_NOTICE, "Uninstalling handler"); - // serverLog(LL_NOTICE, "repl_backlog_size: %lld", g_pserver->repl_backlog_size); - // serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - // } c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -1863,7 +1825,6 @@ int writeToClient(client *c, int handler_installed) { /* Write event handler. Just send data to the client. 
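The wrap-around branch above is the core of the bypass: the range [repl_curr_idx, repl_end_idx) may straddle the physical end of the backlog, in which case it is sent in two stages. A standalone sketch of the same copy, with a hypothetical helper name and connWrite as used elsewhere in this patch:

    /* Sketch: send ring[curr_idx, end_idx) over conn; the range may wrap.
     * Returns bytes written, or -1 on error, mirroring writeToClient above. */
    ssize_t writeRingRange(connection *conn, const char *ring, long long ring_size,
                           long long curr_idx, long long end_idx) {
        if (end_idx >= curr_idx)                        /* contiguous case */
            return connWrite(conn, ring + curr_idx, end_idx - curr_idx);
        /* wrapped case: tail of the ring first, then the head */
        ssize_t n = connWrite(conn, ring + curr_idx, ring_size - curr_idx);
        if (n != ring_size - curr_idx) return n;        /* short write or error: stop here */
        ssize_t n2 = connWrite(conn, ring, end_idx);
        return (n2 == -1) ? -1 : n + n2;
    }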
*/ void sendReplyToClient(connection *conn) { client *c = (client*)connGetPrivateData(conn); - // serverLog(LL_NOTICE, "called the sendreplytoclient"); if (writeToClient(c,1) == C_ERR) { AeLocker ae; @@ -1997,7 +1958,6 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { auto vec = std::move(g_pserver->rgthreadvar[iel].clients_pending_write); processed += (int)vec.size(); - // serverLog(LL_NOTICE, "entered handleClientsWithPendingWrites"); for (client *c : vec) { serverAssertDebug(FCorrectThread(c)); @@ -2013,12 +1973,6 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* Don't write to clients that are going to be closed anyway. */ if (c->flags & CLIENT_CLOSE_ASAP) continue; - // if (c->flags & CLIENT_SLAVE){ - // if(clientHasPendingReplies(c)) - // serverLog(LL_NOTICE, "somehow the client buffer has these values: %s", c->buf); - // serverLog(LL_NOTICE, "LOL"); - // } - /* Try to write buffers to the client socket. */ if (writeToClient(c,0) == C_ERR) { @@ -2216,34 +2170,6 @@ static void setProtocolError(const char *errstr, client *c) { c->flags |= (CLIENT_CLOSE_AFTER_REPLY|CLIENT_PROTOCOL_ERROR); } -static void printQueryBuffer(client *c) { - if (cserver.verbosity <= LL_VERBOSE || c->flags & CLIENT_MASTER) { - sds client = catClientInfoString(sdsempty(),c); - - /* Sample some protocol to given an idea about what was inside. */ - char buf[PROTO_DUMP_LEN*2]; - if (sdslen(c->querybuf)-c->qb_pos < PROTO_DUMP_LEN) { - snprintf(buf,sizeof(buf),"%s", c->querybuf+c->qb_pos); - } else { - snprintf(buf,sizeof(buf),"%.*s (... more %zu bytes ...) %.*s", PROTO_DUMP_LEN/2, c->querybuf+c->qb_pos, sdslen(c->querybuf)-c->qb_pos-PROTO_DUMP_LEN, PROTO_DUMP_LEN/2, c->querybuf+sdslen(c->querybuf)-PROTO_DUMP_LEN/2); - } - - /* Remove non printable chars. */ - char *p = buf; - while (*p != '\0') { - if (!isprint(*p)) *p = '.'; - p++; - } - - /* Log all the client and protocol info. */ - int loglevel = (c->flags & CLIENT_MASTER) ? LL_WARNING : - LL_VERBOSE; - serverLog(loglevel, - "Query buffer from client %lu: %s. %s", c->id, client, buf); - sdsfree(client); - } -} - /* Process the query buffer for client 'c', setting up the client argument * vector for command execution. Returns C_OK if after running the function * the client has a well-formed ready to be processed command, otherwise @@ -2498,8 +2424,6 @@ void parseClientCommandBuffer(client *c) { } size_t cqueriesStart = c->vecqueuedcmd.size(); - // if (c->flags & CLIENT_MASTER) - // printQueryBuffer(c); if (c->reqtype == PROTO_REQ_INLINE) { if (processInlineBuffer(c) != C_OK) break; } else if (c->reqtype == PROTO_REQ_MULTIBULK) { diff --git a/src/replication.cpp b/src/replication.cpp index 96bf161f9..ebdb8af78 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -60,7 +60,6 @@ static void propagateMasterStaleKeys(); void updateLowestOffsetAmongReplicas(){ serverAssert(GlobalLocksAcquired()); serverAssert(!g_pserver->repl_backlog_lock.fOwnLock()); - // serverLog(LL_NOTICE, "off- have repl"); long long min_offset = LONG_LONG_MAX; listIter li; listNode *ln; @@ -73,14 +72,13 @@ void updateLowestOffsetAmongReplicas(){ if (replica->flags & CLIENT_CLOSE_ASAP) continue; std::unique_lock ul(replica->lock); - // serverLog(LL_NOTICE, "off- acq client"); - min_offset = std::min(min_offset, replica->repl_curr_off); - // serverLog(LL_NOTICE, "off- rel client"); + min_offset = std::min(min_offset, replica->repl_curr_off); } /* return -1 if no other minimum was found */ g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? 
-1 : min_offset, std::memory_order_seq_cst); } + /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case * the instance is configured to have no persistence. */ @@ -232,6 +230,8 @@ void createReplicationBacklog(void) { g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; } +long long getReplIndexFromOffset(long long offset); + /* This function is called when the user modifies the replication backlog * size at runtime. It is up to the function to both update the * g_pserver->repl_backlog_size and to resize the buffer and setup it so that @@ -243,8 +243,6 @@ void resizeReplicationBacklog(long long newsize) { newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; if (g_pserver->repl_backlog_size == newsize) return; - serverLog(LL_NOTICE, "WE HAD TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); - if (g_pserver->repl_backlog != NULL) { /* What we actually do is to flush the old buffer and realloc a new * empty one. It will refill with new data incrementally. @@ -252,59 +250,8 @@ void resizeReplicationBacklog(long long newsize) { * worse often we need to alloc additional space before freeing the * old buffer. */ - if (g_pserver->repl_batch_idxStart >= 0) { - // We need to keep critical data so we can't shrink less than the hot data in the buffer - newsize = std::max(newsize, g_pserver->master_repl_offset - g_pserver->repl_batch_offStart); - char *backlog = (char*)zmalloc(newsize); - g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - g_pserver->repl_batch_offStart; - - if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { - auto cbActiveBacklog = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; - memcpy(backlog, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbActiveBacklog); - serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); - } else { - auto cbPhase1 = g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart; - memcpy(backlog, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1); - memcpy(backlog + cbPhase1, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); - auto cbActiveBacklog = cbPhase1 + g_pserver->repl_backlog_idx; - serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); - } - zfree(g_pserver->repl_backlog); - g_pserver->repl_backlog = backlog; - g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; - g_pserver->repl_batch_idxStart = 0; - g_pserver->repl_backlog_start = g_pserver->master_repl_offset; - } else { - zfree(g_pserver->repl_backlog); - g_pserver->repl_backlog = (char*)zmalloc(newsize); - g_pserver->repl_backlog_histlen = 0; - g_pserver->repl_backlog_idx = 0; - /* Next byte we have is... the next since the buffer is empty. */ - g_pserver->repl_backlog_off = g_pserver->master_repl_offset+1; - } - } - g_pserver->repl_backlog_size = newsize; -} - -long long getReplIndexFromOffset(long long offset); - -/* The above but for when clients need extra replication backlog because ??? */ -void resizeReplicationBacklogForClients(long long newsize) { - if (newsize < CONFIG_REPL_BACKLOG_MIN_SIZE) - newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; - if (g_pserver->repl_backlog_size == newsize) return; - - serverLog(LL_NOTICE, "WE HAVE TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); - /* get the critical client size, i.e. 
the size of the data unflushed to clients */ - long long earliest_off = g_pserver->repl_lowest_off.load(); - - - if (g_pserver->repl_backlog != NULL) { - /* What we actually do is to flush the old buffer and realloc a new - * empty one. It will refill with new data incrementally. - * The reason is that copying a few gigabytes adds latency and even - * worse often we need to alloc additional space before freeing the - * old buffer. */ + /* get the critical client size, i.e. the size of the data unflushed to clients */ + long long earliest_off = g_pserver->repl_lowest_off.load(); if (earliest_off != -1) { // We need to keep critical data so we can't shrink less than the hot data in the buffer @@ -316,8 +263,6 @@ void resizeReplicationBacklogForClients(long long newsize) { if (g_pserver->repl_backlog_idx >= earliest_idx) { auto cbActiveBacklog = g_pserver->repl_backlog_idx - earliest_idx; memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbActiveBacklog); - serverLog(LL_NOTICE, "g_pserver->master_repl_offset: %lld, earliest_off: %lld, g_pserver->repl_backlog_idx: %lld, earliest_idx: %lld, repl_backlog_start: %lld", - g_pserver->master_repl_offset, earliest_off, g_pserver->repl_backlog_idx, earliest_idx, g_pserver->repl_backlog_start); serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); } else { auto cbPhase1 = g_pserver->repl_backlog_size - earliest_idx; @@ -344,14 +289,9 @@ void resizeReplicationBacklogForClients(long long newsize) { } } g_pserver->repl_backlog_size = newsize; - - serverLog(LL_NOTICE, "We are ending with: master_repl_offset: %lld, repl_batch_offStart: %lld, new_off: %lld, " - "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, new_idx: %lld, repl_backlog_size: %lld", - g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, 0LL, - g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, 0LL, g_pserver->repl_backlog_size - ); } + void freeReplicationBacklog(void) { serverAssert(GlobalLocksAcquired()); listIter li; @@ -391,17 +331,11 @@ void feedReplicationBacklog(const void *ptr, size_t len) { // This is an emergency overflow, we better resize to fit long long newsize = std::max(g_pserver->repl_backlog_size*2, minimumsize); serverLog(LL_WARNING, "Replication backlog is too small, resizing to: %lld", newsize); - resizeReplicationBacklogForClients(newsize); + resizeReplicationBacklog(newsize); } } } - // serverLog(LL_NOTICE, "Pt2 start with: master_repl_offset: %lld, repl_batch_offStart: %lld, " - // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", - // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, - // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size - // ); - g_pserver->master_repl_offset += len; /* This is a circular buffer, so write as much data we can at every @@ -423,12 +357,6 @@ void feedReplicationBacklog(const void *ptr, size_t len) { /* Set the offset of the first byte we have in the backlog. 
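The feed path itself is a plain circular-buffer append; reduced to its essentials (buffer state passed in, hypothetical helper name, <cstring> assumed for memcpy) it looks like:

    /* Sketch: append len bytes at idx in a ring of 'size' bytes, chunking at the
     * physical end of the buffer the way feedReplicationBacklog does above.
     * Returns the new write index. */
    long long ringAppend(char *ring, long long size, long long idx,
                         const void *ptr, size_t len) {
        const char *p = (const char*)ptr;
        while (len) {
            size_t thislen = (size_t)(size - idx);  /* room before the physical end */
            if (thislen > len) thislen = len;
            memcpy(ring + idx, p, thislen);
            idx += (long long)thislen;
            if (idx == size) idx = 0;               /* wrap back to the start */
            len -= thislen;
            p += thislen;
        }
        return idx;
    }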
*/ g_pserver->repl_backlog_off = g_pserver->master_repl_offset - g_pserver->repl_backlog_histlen + 1; - - // serverLog(LL_NOTICE, "Pt2 end with: master_repl_offset: %lld, repl_batch_offStart: %lld, " - // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", - // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, - // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size - // ); } /* Wrapper for feedReplicationBacklog() that takes Redis string objects @@ -578,9 +506,7 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) /* Add the SELECT command into the backlog. */ /* We don't do this for advanced replication because this will be done later when it adds the whole RREPLAY command */ - if (g_pserver->repl_backlog && fSendRaw) { - feedReplicationBacklogWithObject(selectcmd); - } + if (g_pserver->repl_backlog && fSendRaw) feedReplicationBacklogWithObject(selectcmd); if (dictid < 0 || dictid >= PROTO_SHARED_SELECT_CMDS) decrRefCount(selectcmd); @@ -678,7 +604,7 @@ void replicationFeedSlaves(list *replicas, int dictid, robj **argv, int argc) { void showLatestBacklog(void) { if (g_pserver->repl_backlog == NULL) return; - long long dumplen = 1024; + long long dumplen = 256; if (g_pserver->repl_backlog_histlen < dumplen) dumplen = g_pserver->repl_backlog_histlen; @@ -769,7 +695,9 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, } decrRefCount(cmdobj); } -#define BYPASS_PSYNC + +int prepareClientToWrite(client *c); + /* Feed the replica 'c' with the replication backlog starting from the * specified 'offset' up to the end of the backlog. */ long long addReplyReplicationBacklog(client *c, long long offset) { @@ -809,26 +737,14 @@ long long addReplyReplicationBacklog(client *c, long long offset) { * split the reply in two parts if we are cross-boundary. */ len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); -#ifdef BYPASS_PSYNC + c->repl_curr_off = offset - 1; c->repl_end_off = g_pserver->master_repl_offset; - serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", c->id, getClientPeerId(c), c->repl_curr_off); /* Force the partial sync to be queued */ prepareClientToWrite(c); - c->fPendingReplicaWrite = true; -#else - while(len) { - long long thislen = - ((g_pserver->repl_backlog_size - j) < len) ? - (g_pserver->repl_backlog_size - j) : len; + c->fPendingReplicaWrite = true; - serverLog(LL_DEBUG, "[PSYNC] addReply() length: %lld", thislen); - addReplySds(c,sdsnewlen(g_pserver->repl_backlog + j, thislen)); - len -= thislen; - j = 0; - } -#endif return g_pserver->repl_backlog_histlen - skip; } @@ -866,15 +782,11 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) { replica->repl_curr_off = offset; replica->repl_end_off = g_pserver->master_repl_offset; - serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); - /* We are going to accumulate the incremental changes for this * replica as well. Set replicaseldb to -1 in order to force to re-emit * a SELECT statement in the replication stream. */ g_pserver->replicaseldb = -1; - serverLog(LL_NOTICE, "We are setting up here lad"); - /* Don't send this reply to slaves that approached us with * the old SYNC command. 
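After this change a partial resync (the addReplyReplicationBacklog hunk above) no longer copies backlog data into the reply list; it just records where the replica is and queues a drain. A minimal sketch of that bookkeeping (hypothetical helper; the offset a replica sends in PSYNC is the first byte it still needs, hence the - 1):

    /* Sketch of the partial-resync setup above. */
    void setupPartialSync(client *c, long long psync_offset, long long master_off) {
        c->repl_curr_off = psync_offset - 1;   /* last byte the replica already has */
        c->repl_end_off  = master_off;         /* it may read up to the current master offset */
        prepareClientToWrite(c);               /* queue it; writeToClient drains from the ring */
        c->fPendingReplicaWrite = true;        /* later patches derive this from curr != end */
    }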
*/ if (!(replica->flags & CLIENT_PRE_PSYNC)) { @@ -1179,7 +1091,6 @@ void syncCommand(client *c) { if (g_pserver->FRdbSaveInProgress() && g_pserver->rdb_child_type == RDB_CHILD_TYPE_DISK) { - serverLog(LL_NOTICE, "case 1"); /* Ok a background save is in progress. Let's check if it is a good * one for replication, i.e. if there is another replica that is * registering differences since the server forked to save. */ @@ -1211,7 +1122,6 @@ void syncCommand(client *c) { } else if (g_pserver->FRdbSaveInProgress() && g_pserver->rdb_child_type == RDB_CHILD_TYPE_SOCKET) { - serverLog(LL_NOTICE, "case 2"); /* There is an RDB child process but it is writing directly to * children sockets. We need to wait for the next BGSAVE * in order to synchronize. */ @@ -1219,7 +1129,6 @@ void syncCommand(client *c) { /* CASE 3: There is no BGSAVE is progress. */ } else { - serverLog(LL_NOTICE, "case 3"); if (g_pserver->repl_diskless_sync && (c->slave_capa & SLAVE_CAPA_EOF)) { /* Diskless replication RDB child is created inside * replicationCron() since we want to delay its start a @@ -4606,9 +4515,10 @@ void _clientAsyncReplyBufferReserve(client *c, size_t len); void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); + /* If we have the repl backlog lock, we will deadlock */ + serverAssert(!g_pserver->repl_backlog_lock.fOwnLock()); if (g_pserver->repl_batch_offStart < 0) return; - if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { bool fAsyncWrite = false; @@ -4617,66 +4527,31 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); - serverLog(LL_NOTICE, "the master repl offset is %lld", g_pserver->master_repl_offset); - showLatestBacklog(); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); - // serverLog(LL_NOTICE, "client %lu is in the party", replica->id); - - // serverLog(LL_NOTICE, "is there a write pending for %lu, %d", replica->id, replica->fPendingReplicaWrite); if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; - std::unique_lock ul(replica->lock, std::defer_lock); - if (FCorrectThread(replica)) - ul.lock(); - else + std::unique_lock ul(replica->lock); + if (!FCorrectThread(replica)) fAsyncWrite = true; - - /* If we are online and the RDB has been sent, there is no need to feed the client buffer - * We will send our replies directly from the replication backlog instead */ -#ifdef BYPASS_BUFFER - { - std::unique_lock asyncUl(replica->lock, std::defer_lock); - if (!FCorrectThread(replica)) - asyncUl.lock(); + /* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */ + serverAssert(replica->repl_curr_off != -1); - /* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */ - serverAssert(replica->repl_curr_off != -1); + replica->repl_end_off = g_pserver->master_repl_offset; - replica->repl_end_off = g_pserver->master_repl_offset; - - /* Only if the there isn't already a pending write do we prepare the client to write */ - if (!replica->fPendingReplicaWrite){ - serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); - prepareClientToWrite(replica); - replica->fPendingReplicaWrite = true; - } - - continue; + /* Only if the there isn't already a pending write do we prepare the 
client to write */ + if (!replica->fPendingReplicaWrite){ + serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); + prepareClientToWrite(replica); + replica->fPendingReplicaWrite = true; } -#endif - if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { - long long cbCopy = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; - serverAssert((g_pserver->master_repl_offset - g_pserver->repl_batch_offStart) == cbCopy); - serverAssert((g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart) >= (cbCopy)); - serverAssert((g_pserver->repl_batch_idxStart + cbCopy) <= g_pserver->repl_backlog_size); - - addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbCopy); - } else { - auto cbPhase1 = g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart; - if (fAsyncWrite) - _clientAsyncReplyBufferReserve(replica, cbPhase1 + g_pserver->repl_backlog_idx); - addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1); - addReplyProto(replica, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); - serverAssert((cbPhase1 + g_pserver->repl_backlog_idx) == (g_pserver->master_repl_offset - g_pserver->repl_batch_offStart)); - } } if (fAsyncWrite) ProcessPendingAsyncWrites(); diff --git a/src/server.cpp b/src/server.cpp index 439e1aeff..362569bfa 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1796,7 +1796,6 @@ int clientsCronTrackClientsMemUsage(client *c) { mem += zmalloc_size(c); mem += c->argv_len_sum(); if (c->argv) mem += zmalloc_size(c->argv); - // serverLog(LL_NOTICE, "Mem here is : %lu", mem); /* Now that we have the memory used by the client, remove the old * value from the old category, and add it back. */ g_pserver->stat_clients_type_memory[c->client_cron_last_memory_type] -= @@ -1855,7 +1854,6 @@ void clientsCron(int iel) { while(listLength(g_pserver->clients) && iterations--) { client *c; listNode *head; - // serverLog(LL_NOTICE, "we are at iteration: %d", iterations); /* Rotate the list, take the current head, process. * This way if the client must be removed from the list it's the * first element and we don't incur into O(N) computation. 
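Stripped of locking and thread handling, the flush loop above now copies no bytes at all; it only widens each replica's readable window and schedules a drain. A rough sketch of the per-replica work, keeping the same field names:

    /* Sketch of flushReplBacklogToClients' per-replica step (skip conditions,
     * locking and async handling elided). */
    void flushSketch(std::vector<client*> &replicas, long long master_repl_offset) {
        for (client *replica : replicas) {
            /* the real code asserts repl_curr_off != -1 here */
            replica->repl_end_off = master_repl_offset;    /* extend the readable window */
            if (!replica->fPendingReplicaWrite) {          /* schedule a drain if none queued */
                prepareClientToWrite(replica);
                replica->fPendingReplicaWrite = true;
            }
        }
    }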
*/ diff --git a/src/server.h b/src/server.h index 64a2ca515..0fcd8f5ef 100644 --- a/src/server.h +++ b/src/server.h @@ -3540,7 +3540,6 @@ void tlsInit(void); void tlsInitThread(); int tlsConfigure(redisTLSContextConfig *ctx_config); -int prepareClientToWrite(client *c); class ShutdownException From 80dddab0c4e587332b497cbe8157f39dcc417eb4 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 14 Jun 2021 19:30:49 +0000 Subject: [PATCH 17/22] Relaxed locking, should run faster now Former-commit-id: 5cec4d026dc1766b9ecbade6ec4b9d0e75a94e0f --- src/multi.cpp | 1 - src/networking.cpp | 6 ++++++ src/replication.cpp | 18 ++++++++++-------- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/multi.cpp b/src/multi.cpp index f74748e90..589dba589 100644 --- a/src/multi.cpp +++ b/src/multi.cpp @@ -268,7 +268,6 @@ void execCommand(client *c) { if (g_pserver->repl_backlog && was_master && !is_master) { const char *execcmd = "*1\r\n$4\r\nEXEC\r\n"; updateLowestOffsetAmongReplicas(); - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); feedReplicationBacklog(execcmd,strlen(execcmd)); } afterPropagateExec(); diff --git a/src/networking.cpp b/src/networking.cpp index d8d91751d..07312a9ee 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1856,6 +1856,8 @@ int writeToClient(client *c, int handler_installed) { * We always read from the replication backlog directly */ std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + // serverLog(LL_NOTICE, "written to handler"); + long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); serverAssert(c->repl_curr_off != -1); @@ -1884,8 +1886,12 @@ int writeToClient(client *c, int handler_installed) { serverAssert(c->repl_curr_off <= c->repl_end_off); /* If the client offset matches the global offset, we wrote all we needed to, * in which case, there is no pending write */ + if (c->repl_curr_off == c->repl_end_off){ + // serverLog(LL_NOTICE, "Successfully wrote up until %lld", c->repl_end_off); c->fPendingReplicaWrite = false; + } else { + // serverLog(LL_NOTICE, "Wrote to %lld out of %lld", c->repl_curr_off, c->repl_end_off); } } diff --git a/src/replication.cpp b/src/replication.cpp index d10bac99a..a5f9c3acf 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -241,6 +241,8 @@ void resizeReplicationBacklog(long long newsize) { newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; if (g_pserver->repl_backlog_size == newsize) return; + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + if (g_pserver->repl_backlog != NULL) { /* What we actually do is to flush the old buffer and realloc a new * empty one. It will refill with new data incrementally. @@ -310,9 +312,9 @@ void freeReplicationBacklog(void) { * the backlog without incrementing the offset. 
*/ void feedReplicationBacklog(const void *ptr, size_t len) { serverAssert(GlobalLocksAcquired()); - serverAssert(g_pserver->repl_backlog_lock.fOwnLock()); const unsigned char *p = (const unsigned char*)ptr; + if (g_pserver->repl_batch_idxStart >= 0) { /* we are lower bounded by the lower client offset or the offStart if all the clients are up to date */ long long lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); @@ -320,10 +322,11 @@ void feedReplicationBacklog(const void *ptr, size_t len) { lower_bound = g_pserver->repl_batch_offStart; long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { - g_pserver->repl_backlog_lock.unlock(); flushReplBacklogToClients(); - g_pserver->repl_backlog_lock.lock(); - minimumsize = g_pserver->master_repl_offset + len - lower_bound +1; + minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; + + serverLog(LL_NOTICE, "minimumsize: %lld, g_pserver->master_repl_offset: %lld, len: %lu, lower_bound: %lld", + minimumsize, g_pserver->master_repl_offset, len, lower_bound); if (minimumsize > g_pserver->repl_backlog_size) { // This is an emergency overflow, we better resize to fit @@ -492,7 +495,6 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) bool fSendRaw = !g_pserver->fActiveReplica; updateLowestOffsetAmongReplicas(); - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); /* Send SELECT command to every replica if needed. */ if (g_pserver->replicaseldb != dictid) { @@ -655,7 +657,6 @@ void replicationFeedSlavesFromMasterStream(char *buf, size_t buflen) { if (g_pserver->repl_backlog){ updateLowestOffsetAmongReplicas(); - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); feedReplicationBacklog(buf,buflen); } } @@ -750,7 +751,7 @@ long long addReplyReplicationBacklog(client *c, long long offset) { serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); c->repl_curr_off = offset - 1; - serverLog(LL_NOTICE, "Client %s, replica offset %lld in psync", replicationGetSlaveName(c), c->repl_curr_off); + // serverLog(LL_NOTICE, "Client %s, replica offset %lld in psync", replicationGetSlaveName(c), c->repl_curr_off); c->repl_end_off = g_pserver->master_repl_offset; /* Force the partial sync to be queued */ @@ -4988,7 +4989,7 @@ void flushReplBacklogToClients() if (!canFeedReplicaReplBuffer(replica)) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; - serverLog(LL_NOTICE, "Client %s, replica offset %lld", replicationGetSlaveName(replica), replica->repl_curr_off); + // serverLog(LL_NOTICE, "Client %s, replica offset %lld", replicationGetSlaveName(replica), replica->repl_curr_off); std::unique_lock ul(replica->lock); if (!FCorrectThread(replica)) @@ -5013,6 +5014,7 @@ void flushReplBacklogToClients() // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; + updateLowestOffsetAmongReplicas(); } } From 6a65b8bbaa318429c69dadd852f62fb6364414fd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 15 Jun 2021 23:13:49 +0000 Subject: [PATCH 18/22] Optimized use of repl_lowest_off to reduce lock contention Former-commit-id: 30a957e5399fe94675f0b6d2d34c24112d5a9734 --- src/multi.cpp | 1 - src/replication.cpp | 34 ++++++++-------------------------- 2 files changed, 8 insertions(+), 27 deletions(-) diff --git a/src/multi.cpp 
b/src/multi.cpp index 589dba589..1b91a05a0 100644 --- a/src/multi.cpp +++ b/src/multi.cpp @@ -267,7 +267,6 @@ void execCommand(client *c) { * backlog with the final EXEC. */ if (g_pserver->repl_backlog && was_master && !is_master) { const char *execcmd = "*1\r\n$4\r\nEXEC\r\n"; - updateLowestOffsetAmongReplicas(); feedReplicationBacklog(execcmd,strlen(execcmd)); } afterPropagateExec(); diff --git a/src/replication.cpp b/src/replication.cpp index a5f9c3acf..cb0b562b1 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -56,29 +56,6 @@ void putSlaveOnline(client *replica); int cancelReplicationHandshake(redisMaster *mi, int reconnect); static void propagateMasterStaleKeys(); -/* gets the lowest offset amongst all of the replicas and stores it globally*/ -void updateLowestOffsetAmongReplicas(){ - serverAssert(GlobalLocksAcquired()); - serverAssert(!g_pserver->repl_backlog_lock.fOwnLock()); - long long min_offset = LONG_LONG_MAX; - listIter li; - listNode *ln; - listRewind(g_pserver->slaves, &li); - // check for potential overflow first - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - - if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; - if (replica->flags & CLIENT_CLOSE_ASAP) continue; - - std::unique_lock ul(replica->lock); - - min_offset = std::min(min_offset, replica->repl_curr_off); - } - /* return -1 if no other minimum was found */ - g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? -1 : min_offset, std::memory_order_seq_cst); -} - /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case * the instance is configured to have no persistence. */ @@ -323,6 +300,10 @@ void feedReplicationBacklog(const void *ptr, size_t len) { long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { flushReplBacklogToClients(); + lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); + if (lower_bound == -1) + lower_bound = g_pserver->repl_batch_offStart; + minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; serverLog(LL_NOTICE, "minimumsize: %lld, g_pserver->master_repl_offset: %lld, len: %lu, lower_bound: %lld", @@ -494,7 +475,6 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) serverAssert(!(listLength(slaves) != 0 && g_pserver->repl_backlog == NULL)); bool fSendRaw = !g_pserver->fActiveReplica; - updateLowestOffsetAmongReplicas(); /* Send SELECT command to every replica if needed. 
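The sizing check in feedReplicationBacklog above can be stated on its own: the backlog must never overwrite data that the slowest replica still has to read. A minimal sketch of the required-size computation (hypothetical helper name):

    /* Lower bound is the lowest replica offset when one exists (repl_lowest_off),
     * otherwise the start of the current batch. */
    long long requiredBacklogSize(long long master_off, size_t len,
                                  long long lowest_replica_off, long long batch_off_start) {
        long long lower_bound = (lowest_replica_off != -1) ? lowest_replica_off
                                                           : batch_off_start;
        return master_off + (long long)len - lower_bound + 1;
    }
    /* If this exceeds repl_backlog_size the code first flushes (which can raise the
     * lower bound) and only then grows to max(2 * current size, required size). */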
*/ if (g_pserver->replicaseldb != dictid) { @@ -656,7 +636,6 @@ void replicationFeedSlavesFromMasterStream(char *buf, size_t buflen) { } if (g_pserver->repl_backlog){ - updateLowestOffsetAmongReplicas(); feedReplicationBacklog(buf,buflen); } } @@ -4975,6 +4954,7 @@ void flushReplBacklogToClients() if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { bool fAsyncWrite = false; + long long min_offset = LONG_LONG_MAX; // Ensure no overflow serverAssert(g_pserver->repl_batch_offStart < g_pserver->master_repl_offset); serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); @@ -4998,6 +4978,8 @@ void flushReplBacklogToClients() /* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */ serverAssert(replica->repl_curr_off != -1); + min_offset = std::min(min_offset, replica->repl_curr_off); + replica->repl_end_off = g_pserver->master_repl_offset; /* Only if the there isn't already a pending write do we prepare the client to write */ @@ -5014,7 +4996,7 @@ void flushReplBacklogToClients() // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; - updateLowestOffsetAmongReplicas(); + g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? -1 : min_offset, std::memory_order_seq_cst); } } From 29f4c661799107ed6db8168ecb297b1e0b64f575 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 16 Jun 2021 19:41:55 +0000 Subject: [PATCH 19/22] More code cleanup Former-commit-id: 8e9962b9b7b9093399451bf93d30e5b5d26e3d33 --- src/evict.cpp | 2 ++ src/networking.cpp | 52 +++++++++++++++------------------------------ src/replication.cpp | 51 ++++++++++++++++++-------------------------- src/server.h | 14 ++++++------ 4 files changed, 47 insertions(+), 72 deletions(-) diff --git a/src/evict.cpp b/src/evict.cpp index ba426f0ee..d336bc8b8 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -354,6 +354,8 @@ unsigned long LFUDecrAndReturn(robj_roptr o) { return counter; } +unsigned long getClientReplicationBacklogSharedUsage(client *c); + /* We don't want to count AOF buffers and slaves output buffers as * used memory: the eviction should use mostly data size. This function * returns the sum of AOF and slaves buffer. */ diff --git a/src/networking.cpp b/src/networking.cpp index 07312a9ee..767fe9c2b 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1765,15 +1765,7 @@ client *lookupClientByID(uint64_t id) { return (c == raxNotFound) ? NULL : c; } -/* Compute the corresponding index from a replication backlog offset - * by taking the distance between the input offset and the replication backlog offset - * and applying that to the replication backlog index, wrapping around if the index - * becomes negative. - * TODO: Rewrite comment for new logic */ -long long getReplIndexFromOffset(long long offset){ - long long index = (offset - g_pserver->repl_backlog_start) % g_pserver->repl_backlog_size; - return index; -} +long long getReplIndexFromOffset(long long offset); /* Write data in output buffers to client. 
Return C_OK if the client * is still valid after the call, C_ERR if it was freed because of some @@ -1832,35 +1824,31 @@ int writeToClient(client *c, int handler_installed) { } } /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT - * bytes, in a single threaded server it's a good idea to serve - * other clients as well, even if a very large request comes from - * super fast link that is always able to accept data (in real world - * scenario think about 'KEYS *' against the loopback interface). - * - * However if we are over the maxmemory limit we ignore that and - * just deliver as much data as it is possible to deliver. - * - * Moreover, we also send as much as possible if the client is - * a replica or a monitor (otherwise, on high-speed traffic, the - * replication/output buffer will grow indefinitely) */ + * bytes, in a single threaded server it's a good idea to serve + * other clients as well, even if a very large request comes from + * super fast link that is always able to accept data (in real world + * scenario think about 'KEYS *' against the loopback interface). + * + * However if we are over the maxmemory limit we ignore that and + * just deliver as much data as it is possible to deliver. + * + * Moreover, we also send as much as possible if the client is + * a replica or a monitor (otherwise, on high-speed traffic, the + * replication/output buffer will grow indefinitely) */ if (totwritten > NET_MAX_WRITES_PER_EVENT && (g_pserver->maxmemory == 0 || - zmalloc_used_memory() < g_pserver->maxmemory) && + zmalloc_used_memory() < g_pserver->maxmemory) && !(c->flags & CLIENT_SLAVE)) break; } /* We can only directly read from the replication backlog if the client is a replica, so only attempt to do so if that's the case. */ if (c->flags & CLIENT_SLAVE && !(c->flags & CLIENT_MONITOR)) { - /* For replicas, we don't store all the information in the client buffer - * We always read from the replication backlog directly */ + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); - - // serverLog(LL_NOTICE, "written to handler"); - long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); - serverAssert(c->repl_curr_off != -1); + if (c->repl_curr_off != c->repl_end_off){ long long repl_curr_idx = getReplIndexFromOffset(c->repl_curr_off); long long nwritten2ndStage = 0; /* How much was written from the start of the replication backlog @@ -1884,14 +1872,9 @@ int writeToClient(client *c, int handler_installed) { totwritten += nwritten; c->repl_curr_off += nwritten; serverAssert(c->repl_curr_off <= c->repl_end_off); - /* If the client offset matches the global offset, we wrote all we needed to, - * in which case, there is no pending write */ - + /* If the client's current offset matches the last offset it can read from, there is no pending write */ if (c->repl_curr_off == c->repl_end_off){ - // serverLog(LL_NOTICE, "Successfully wrote up until %lld", c->repl_end_off); c->fPendingReplicaWrite = false; - } else { - // serverLog(LL_NOTICE, "Wrote to %lld out of %lld", c->repl_curr_off, c->repl_end_off); } } @@ -3719,8 +3702,7 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { } } -/* In the case of a replica client, it is possible (and very likely) - * that writes to said replica are using data from the replication backlog +/* In the case of a replica client, writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long 
getClientReplicationBacklogSharedUsage(client *c) { return (!(c->flags & CLIENT_SLAVE) || !c->fPendingReplicaWrite ) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; diff --git a/src/replication.cpp b/src/replication.cpp index cb0b562b1..b9465680e 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -47,8 +47,6 @@ #include #include -#define BYPASS_BUFFER - void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, connection *conn); void replicationSendAck(redisMaster *mi); @@ -61,8 +59,6 @@ static void propagateMasterStaleKeys(); * the instance is configured to have no persistence. */ int RDBGeneratedByReplication = 0; -void resizeReplicationBacklogForClients(long long newsize); - /* --------------------------- Utility functions ---------------------------- */ /* Return the pointer to a string representing the replica ip:listening_port @@ -205,7 +201,14 @@ void createReplicationBacklog(void) { g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; } -long long getReplIndexFromOffset(long long offset); +/* Compute the corresponding index from a replication backlog offset + * Since this computation needs the size of the replication backlog, + * you need to have the repl_backlog_lock in order to call it */ +long long getReplIndexFromOffset(long long offset){ + serverAssert(g_pserver->repl_backlog_lock.fOwnLock()); + long long index = (offset - g_pserver->repl_backlog_start) % g_pserver->repl_backlog_size; + return index; +} /* This function is called when the user modifies the replication backlog * size at runtime. It is up to the function to both update the @@ -293,7 +296,7 @@ void feedReplicationBacklog(const void *ptr, size_t len) { if (g_pserver->repl_batch_idxStart >= 0) { - /* we are lower bounded by the lower client offset or the offStart if all the clients are up to date */ + /* We are lower bounded by the lowest replica offset, or the batch offset start if not applicable */ long long lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); if (lower_bound == -1) lower_bound = g_pserver->repl_batch_offStart; @@ -306,9 +309,6 @@ void feedReplicationBacklog(const void *ptr, size_t len) { minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; - serverLog(LL_NOTICE, "minimumsize: %lld, g_pserver->master_repl_offset: %lld, len: %lu, lower_bound: %lld", - minimumsize, g_pserver->master_repl_offset, len, lower_bound); - if (minimumsize > g_pserver->repl_backlog_size) { // This is an emergency overflow, we better resize to fit long long newsize = std::max(g_pserver->repl_backlog_size*2, minimumsize); @@ -635,9 +635,7 @@ void replicationFeedSlavesFromMasterStream(char *buf, size_t buflen) { printf("\n"); } - if (g_pserver->repl_backlog){ - feedReplicationBacklog(buf,buflen); - } + if (g_pserver->repl_backlog) feedReplicationBacklog(buf,buflen); } void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, int argc) { @@ -689,13 +687,12 @@ int prepareClientToWrite(client *c); /* Feed the replica 'c' with the replication backlog starting from the * specified 'offset' up to the end of the backlog. 
*/ long long addReplyReplicationBacklog(client *c, long long offset) { - long long j, skip, len; + long long skip, len; serverLog(LL_DEBUG, "[PSYNC] Replica request offset: %lld", offset); if (g_pserver->repl_backlog_histlen == 0) { serverLog(LL_DEBUG, "[PSYNC] Backlog history len is zero"); - serverLog(LL_NOTICE, "REOAD TO RESIST"); c->repl_curr_off = g_pserver->master_repl_offset; c->repl_end_off = g_pserver->master_repl_offset; return 0; @@ -714,30 +711,20 @@ long long addReplyReplicationBacklog(client *c, long long offset) { skip = offset - g_pserver->repl_backlog_off; serverLog(LL_DEBUG, "[PSYNC] Skipping: %lld", skip); - /* Point j to the oldest byte, that is actually our - * g_pserver->repl_backlog_off byte. */ - j = (g_pserver->repl_backlog_idx + - (g_pserver->repl_backlog_size-g_pserver->repl_backlog_histlen)) % - g_pserver->repl_backlog_size; - serverLog(LL_DEBUG, "[PSYNC] Index of first byte: %lld", j); - - /* Discard the amount of data to seek to the specified 'offset'. */ - j = (j + skip) % g_pserver->repl_backlog_size; - - /* Feed replica with data. Since it is a circular buffer we have to - * split the reply in two parts if we are cross-boundary. */ len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); + /* Set the start and end offsets for the replica so that a future + * writeToClient will send the backlog from the given offset to + * the current end of the backlog to said replica */ c->repl_curr_off = offset - 1; - // serverLog(LL_NOTICE, "Client %s, replica offset %lld in psync", replicationGetSlaveName(c), c->repl_curr_off); c->repl_end_off = g_pserver->master_repl_offset; /* Force the partial sync to be queued */ prepareClientToWrite(c); c->fPendingReplicaWrite = true; - return g_pserver->repl_backlog_histlen - skip; + return len; } /* Return the offset to provide as reply to the PSYNC command received @@ -4963,14 +4950,18 @@ void flushReplBacklogToClients() listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); + /* We don't actually write any data in this function since we send data + * directly from the replication backlog to replicas in writeToClient. + * + * What we do however, is set the end offset of each replica here. This way, + * future calls to writeToClient will know up to where in the replication + * backlog is valid for writing. */ while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); if (!canFeedReplicaReplBuffer(replica)) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; - // serverLog(LL_NOTICE, "Client %s, replica offset %lld", replicationGetSlaveName(replica), replica->repl_curr_off); - std::unique_lock ul(replica->lock); if (!FCorrectThread(replica)) fAsyncWrite = true; diff --git a/src/server.h b/src/server.h index cb3973969..0d6f766ce 100644 --- a/src/server.h +++ b/src/server.h @@ -1590,9 +1590,11 @@ struct client { copying this replica output buffer should use. */ - long long repl_curr_off = -1; /* Replication offset of the client, only if it's a replica*/ - long long repl_end_off = -1; /* Replication offset to write to */ - int fPendingReplicaWrite; + long long repl_curr_off = -1;/* Replication offset of the replica, also where in the backlog we need to start from + * when sending data to this replica. */ + long long repl_end_off = -1; /* Replication offset to write to, stored in the replica, as opposed to using the global offset + * to prevent needing the global lock */ + int fPendingReplicaWrite; /* Is there a write queued for this replica? 
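Given these fields, the memory a replica "uses" is no longer a private output buffer but a slice of the shared backlog, which is what getClientReplicationBacklogSharedUsage reports. A minimal sketch with the inputs made explicit:

    /* Sketch: a replica with a queued write pins the backlog bytes between its
     * current offset and the master's offset; everyone else pins nothing. */
    unsigned long sharedBacklogUsage(bool is_slave, bool has_pending_write,
                                     long long master_off, long long replica_curr_off) {
        if (!is_slave || !has_pending_write) return 0;
        return (unsigned long)(master_off - replica_curr_off);
    }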
*/ char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ int slave_listening_port; /* As configured with: REPLCONF listening-port */ @@ -2375,8 +2377,8 @@ struct redisServer { int repl_diskless_load; /* Slave parse RDB directly from the socket. * see REPL_DISKLESS_LOAD_* enum */ int repl_diskless_sync_delay; /* Delay to start a diskless repl BGSAVE. */ - std::atomic repl_lowest_off; /* The lowest offset amongst all clients - Updated before calls to feed the replication backlog */ + std::atomic repl_lowest_off; /* The lowest offset amongst all replicas + -1 if there are no replicas */ /* Replication (replica) */ list *masters; int enable_multimaster; @@ -2825,7 +2827,6 @@ sds getAllClientsInfoString(int type); void rewriteClientCommandVector(client *c, int argc, ...); void rewriteClientCommandArgument(client *c, int i, robj *newval); void replaceClientCommandVector(client *c, int argc, robj **argv); -unsigned long getClientReplicationBacklogSharedUsage(client *c); unsigned long getClientOutputBufferMemoryUsage(client *c); int freeClientsInAsyncFreeQueue(int iel); void asyncCloseClientOnOutputBufferLimitReached(client *c); @@ -3017,7 +3018,6 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, void rdbPipeWriteHandlerConnRemoved(struct connection *conn); void replicationNotifyLoadedKey(redisDb *db, robj_roptr key, robj_roptr val, long long expire); void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long long expire); -void updateLowestOffsetAmongReplicas(void); void clearFailoverState(void); void updateFailoverStatus(void); void abortFailover(redisMaster *mi, const char *err); From 815ebe1e6b0b7ad13db30dc342e8d6cb92330651 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 25 Jun 2021 01:54:38 +0000 Subject: [PATCH 20/22] Remove fPendingReplicaWrite flag which can instead be calculated on demand Former-commit-id: ae26afd13f955eb230b5c2cab20ec90db9b714ad --- src/networking.cpp | 128 +++++++++++++++++++++----------------------- src/replication.cpp | 8 +-- src/server.h | 5 +- 3 files changed, 67 insertions(+), 74 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 767fe9c2b..690b03a51 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -158,7 +158,6 @@ client *createClient(connection *conn, int iel) { c->flags = 0; c->fPendingAsyncWrite = FALSE; c->fPendingAsyncWriteHandler = FALSE; - c->fPendingReplicaWrite = FALSE; c->ctime = c->lastinteraction = g_pserver->unixtime; /* If the default user does not require authentication, the user is * directly authenticated. */ @@ -318,7 +317,7 @@ int prepareClientToWrite(client *c) { /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ - if (!fAsync && !clientHasPendingReplies(c) && !c->fPendingReplicaWrite) clientInstallWriteHandler(c); + if (!fAsync && (c->flags & CLIENT_SLAVE || !clientHasPendingReplies(c))) clientInstallWriteHandler(c); if (fAsync && !(c->fPendingAsyncWrite)) clientInstallAsyncWriteHandler(c); /* Authorize the caller to queue in the output buffer of this client. */ @@ -1132,7 +1131,7 @@ void copyClientOutputBuffer(client *dst, client *src) { /* Return true if the specified client has pending reply buffers to write to * the socket. 
*/ int clientHasPendingReplies(client *c) { - return (c->bufpos || listLength(c->reply)); + return (c->bufpos || listLength(c->reply) || c->FPendingReplicaWrite()); } static std::atomic rgacceptsInFlight[MAX_EVENT_LOOPS]; @@ -1785,66 +1784,9 @@ int writeToClient(client *c, int handler_installed) { std::unique_locklock)> lock(c->lock); - while(clientHasPendingReplies(c)) { - serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR); - if (c->bufpos > 0) { - nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); - if (nwritten <= 0) break; - c->sentlen += nwritten; - totwritten += nwritten; - - /* If the buffer was sent, set bufpos to zero to continue with - * the remainder of the reply. */ - if ((int)c->sentlen == c->bufpos) { - c->bufpos = 0; - c->sentlen = 0; - } - } else { - o = (clientReplyBlock*)listNodeValue(listFirst(c->reply)); - if (o->used == 0) { - c->reply_bytes -= o->size; - listDelNode(c->reply,listFirst(c->reply)); - continue; - } - - nwritten = connWrite(c->conn, o->buf() + c->sentlen, o->used - c->sentlen); - if (nwritten <= 0) break; - c->sentlen += nwritten; - totwritten += nwritten; - - /* If we fully sent the object on head go to the next one */ - if (c->sentlen == o->used) { - c->reply_bytes -= o->size; - listDelNode(c->reply,listFirst(c->reply)); - c->sentlen = 0; - /* If there are no longer objects in the list, we expect - * the count of reply bytes to be exactly zero. */ - if (listLength(c->reply) == 0) - serverAssert(c->reply_bytes == 0); - } - } - /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT - * bytes, in a single threaded server it's a good idea to serve - * other clients as well, even if a very large request comes from - * super fast link that is always able to accept data (in real world - * scenario think about 'KEYS *' against the loopback interface). - * - * However if we are over the maxmemory limit we ignore that and - * just deliver as much data as it is possible to deliver. - * - * Moreover, we also send as much as possible if the client is - * a replica or a monitor (otherwise, on high-speed traffic, the - * replication/output buffer will grow indefinitely) */ - if (totwritten > NET_MAX_WRITES_PER_EVENT && - (g_pserver->maxmemory == 0 || - zmalloc_used_memory() < g_pserver->maxmemory) && - !(c->flags & CLIENT_SLAVE)) break; - } - /* We can only directly read from the replication backlog if the client is a replica, so only attempt to do so if that's the case. 
*/ if (c->flags & CLIENT_SLAVE && !(c->flags & CLIENT_MONITOR)) { - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); serverAssert(c->repl_curr_off != -1); @@ -1872,15 +1814,67 @@ int writeToClient(client *c, int handler_installed) { totwritten += nwritten; c->repl_curr_off += nwritten; serverAssert(c->repl_curr_off <= c->repl_end_off); - /* If the client's current offset matches the last offset it can read from, there is no pending write */ - if (c->repl_curr_off == c->repl_end_off){ - c->fPendingReplicaWrite = false; - } } /* If the second part of a write didn't go through, we still need to register that */ if (nwritten2ndStage == -1) nwritten = -1; } + } else { + while(clientHasPendingReplies(c)) { + serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR); + if (c->bufpos > 0) { + nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); + if (nwritten <= 0) break; + c->sentlen += nwritten; + totwritten += nwritten; + + /* If the buffer was sent, set bufpos to zero to continue with + * the remainder of the reply. */ + if ((int)c->sentlen == c->bufpos) { + c->bufpos = 0; + c->sentlen = 0; + } + } else { + o = (clientReplyBlock*)listNodeValue(listFirst(c->reply)); + if (o->used == 0) { + c->reply_bytes -= o->size; + listDelNode(c->reply,listFirst(c->reply)); + continue; + } + + nwritten = connWrite(c->conn, o->buf() + c->sentlen, o->used - c->sentlen); + if (nwritten <= 0) break; + c->sentlen += nwritten; + totwritten += nwritten; + + /* If we fully sent the object on head go to the next one */ + if (c->sentlen == o->used) { + c->reply_bytes -= o->size; + listDelNode(c->reply,listFirst(c->reply)); + c->sentlen = 0; + /* If there are no longer objects in the list, we expect + * the count of reply bytes to be exactly zero. */ + if (listLength(c->reply) == 0) + serverAssert(c->reply_bytes == 0); + } + } + /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT + * bytes, in a single threaded server it's a good idea to serve + * other clients as well, even if a very large request comes from + * super fast link that is always able to accept data (in real world + * scenario think about 'KEYS *' against the loopback interface). + * + * However if we are over the maxmemory limit we ignore that and + * just deliver as much data as it is possible to deliver. + * + * Moreover, we also send as much as possible if the client is + * a replica or a monitor (otherwise, on high-speed traffic, the + * replication/output buffer will grow indefinitely) */ + if (totwritten > NET_MAX_WRITES_PER_EVENT && + (g_pserver->maxmemory == 0 || + zmalloc_used_memory() < g_pserver->maxmemory) && + !(c->flags & CLIENT_SLAVE)) break; + } } g_pserver->stat_net_output_bytes += totwritten; @@ -1900,7 +1894,7 @@ int writeToClient(client *c, int handler_installed) { * We just rely on data / pings received for timeout detection. */ if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } - if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) { + if (!clientHasPendingReplies(c)) { c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -2080,7 +2074,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. 
*/ - if (clientHasPendingReplies(c) || c->fPendingReplicaWrite) { + if (clientHasPendingReplies(c)) { if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) { freeClientAsync(c); } @@ -3705,7 +3699,7 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { /* In the case of a replica client, writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long getClientReplicationBacklogSharedUsage(client *c) { - return (!(c->flags & CLIENT_SLAVE) || !c->fPendingReplicaWrite ) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; + return (!(c->flags & CLIENT_SLAVE) || !c->FPendingReplicaWrite() ) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; } /* This function returns the number of bytes that Redis is diff --git a/src/replication.cpp b/src/replication.cpp index b9465680e..94b35e314 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -722,7 +722,6 @@ long long addReplyReplicationBacklog(client *c, long long offset) { /* Force the partial sync to be queued */ prepareClientToWrite(c); - c->fPendingReplicaWrite = true; return len; } @@ -4974,11 +4973,8 @@ void flushReplBacklogToClients() replica->repl_end_off = g_pserver->master_repl_offset; /* Only if the there isn't already a pending write do we prepare the client to write */ - if (!replica->fPendingReplicaWrite){ - serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); - prepareClientToWrite(replica); - replica->fPendingReplicaWrite = true; - } + serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); + prepareClientToWrite(replica); } if (fAsyncWrite) diff --git a/src/server.h b/src/server.h index 0d6f766ce..07608632e 100644 --- a/src/server.h +++ b/src/server.h @@ -1594,7 +1594,6 @@ struct client { * when sending data to this replica. */ long long repl_end_off = -1; /* Replication offset to write to, stored in the replica, as opposed to using the global offset * to prevent needing the global lock */ - int fPendingReplicaWrite; /* Is there a write queued for this replica? */ char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ int slave_listening_port; /* As configured with: REPLCONF listening-port */ @@ -1657,6 +1656,10 @@ struct client { robj **argv; size_t argv_len_sumActive = 0; + bool FPendingReplicaWrite() const { + return repl_curr_off != repl_end_off; + } + // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); size_t argv_len_sum() const; From e6a82692b7be9d62a619f9968e0f9ae5f90ca71e Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 25 Jun 2021 02:31:17 +0000 Subject: [PATCH 21/22] Avoid holding the lockPendingWrite for too long and deadlocking due to lock inversion Former-commit-id: a4b49fbec60e2333a4407d24383ae204d5d2b413 --- src/networking.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/networking.cpp b/src/networking.cpp index 690b03a51..5ced371d1 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -2018,7 +2018,6 @@ void ProcessPendingAsyncWrites() * need to use a syscall in order to install the writable event handler, * get it called, and so forth. 
*/ int handleClientsWithPendingWrites(int iel, int aof_state) { - std::unique_lock lockf(g_pserver->rgthreadvar[iel].lockPendingWrite); int processed = 0; serverAssert(iel == (serverTL - g_pserver->rgthreadvar)); @@ -2041,7 +2040,9 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { ae_flags |= AE_BARRIER; } + std::unique_lock lockf(g_pserver->rgthreadvar[iel].lockPendingWrite); auto vec = std::move(g_pserver->rgthreadvar[iel].clients_pending_write); + lockf.unlock(); processed += (int)vec.size(); for (client *c : vec) { From 5949e253cab606c0bd7616e00c42e7ebcfca872a Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 25 Jun 2021 02:46:32 +0000 Subject: [PATCH 22/22] remove unnecessary newline Former-commit-id: 532af9cd0286ac6ece6f401c42aea18e36d16f7c --- src/replication.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/replication.cpp b/src/replication.cpp index 94b35e314..e9a503167 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -4975,7 +4975,6 @@ void flushReplBacklogToClients() /* Only if the there isn't already a pending write do we prepare the client to write */ serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); prepareClientToWrite(replica); - } if (fAsyncWrite) ProcessPendingAsyncWrites();
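
The offset-to-index mapping in getReplIndexFromOffset and the skip/len arithmetic in addReplyReplicationBacklog can be modelled in isolation. The following is a minimal standalone sketch, not the server code: BacklogModel, its field names, and the values in main are hypothetical, and it assumes a backlog whose byte at index 0 corresponds to the absolute offset backlog_start.

#include <cassert>
#include <cstdio>

/* Hypothetical standalone model of the replication backlog arithmetic.
 * The field names loosely mirror the patch, but this is not the server struct. */
struct BacklogModel {
    long long backlog_start;      /* absolute repl offset stored at buffer index 0 */
    long long backlog_size;       /* size of the circular buffer in bytes */
    long long backlog_off;        /* oldest absolute offset still held in the buffer */
    long long backlog_histlen;    /* number of valid bytes currently in the buffer */
    long long master_repl_offset; /* absolute offset of the newest byte produced */

    /* Map an absolute replication offset to an index in the circular buffer,
     * the same modular reduction the patch performs. */
    long long indexFromOffset(long long offset) const {
        return (offset - backlog_start) % backlog_size;
    }

    /* Given a replica's requested PSYNC offset, compute how many bytes of
     * history must be skipped and how many remain to send. */
    long long replyLenForPsync(long long offset) const {
        assert(offset >= backlog_off);
        long long skip = offset - backlog_off;
        return backlog_histlen - skip;
    }
};

int main() {
    BacklogModel b{/*backlog_start*/ 0, /*backlog_size*/ 1 << 20,
                   /*backlog_off*/ 5000, /*backlog_histlen*/ 7000,
                   /*master_repl_offset*/ 12000};
    printf("index of offset 9000: %lld\n", b.indexFromOffset(9000));
    printf("bytes to send for PSYNC at 9000: %lld\n", b.replyLenForPsync(9000));
    return 0;
}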
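
Patch 20's central change is that a replica's pending write is no longer tracked with a flag but derived from the two offsets the client already stores. A minimal sketch of that idea, under the assumption that one side only advances the end offset (as flushReplBacklogToClients does) and the other only advances the current offset by bytes actually written (as writeToClient does); ReplicaCursor and the helper names are hypothetical.

#include <algorithm>
#include <cassert>

/* A replica has a pending write exactly when the offset it has sent up to
 * lags the offset it is allowed to send up to, so no boolean flag is needed. */
struct ReplicaCursor {
    long long repl_curr_off = -1; /* last offset already sent to this replica */
    long long repl_end_off  = -1; /* offset the replica may be fed up to */

    bool FPendingReplicaWrite() const {
        return repl_curr_off != repl_end_off;
    }
};

/* Feeding side (analogous to flushReplBacklogToClients): only advance the
 * end offset, never copy data into a per-client buffer. */
void markNewBacklogData(ReplicaCursor &r, long long master_repl_offset) {
    r.repl_end_off = master_repl_offset;
}

/* Writing side (analogous to the backlog branch of writeToClient): advance
 * the current offset by however many bytes the socket accepted. */
void recordBytesWritten(ReplicaCursor &r, long long nwritten) {
    r.repl_curr_off = std::min(r.repl_curr_off + nwritten, r.repl_end_off);
}

int main() {
    ReplicaCursor r{100, 100};
    assert(!r.FPendingReplicaWrite());
    markNewBacklogData(r, 160);   /* 60 new bytes appended to the backlog */
    assert(r.FPendingReplicaWrite());
    recordBytesWritten(r, 25);    /* partial write: still pending */
    assert(r.FPendingReplicaWrite());
    recordBytesWritten(r, 35);    /* remainder of the range: no longer pending */
    assert(!r.FPendingReplicaWrite());
    return 0;
}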
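
The backlog branch of writeToClient sends at most two segments when the index range wraps around the end of the circular buffer. The sketch below isolates just that split; segmentsToWrite and the demo buffer are hypothetical stand-ins, and a real caller would hand each segment to connWrite in order.

#include <cstddef>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

/* Given a circular buffer of 'size' bytes and an index range [curr_idx, end_idx),
 * return the one or two contiguous (start, len) segments that cover it.
 * Two segments are only needed when the range wraps past the end of the buffer. */
std::vector<std::pair<long long, long long>>
segmentsToWrite(long long curr_idx, long long end_idx, long long size) {
    std::vector<std::pair<long long, long long>> segs;
    if (end_idx >= curr_idx) {                  /* common case: no wrap */
        if (end_idx > curr_idx) segs.push_back({curr_idx, end_idx - curr_idx});
    } else {                                    /* rare case: wrap around */
        segs.push_back({curr_idx, size - curr_idx});
        if (end_idx > 0) segs.push_back({0, end_idx});
    }
    return segs;
}

int main() {
    const std::string backlog = "0123456789";   /* stand-in for repl_backlog */
    /* Range starts near the tail and wraps to the head: indices 7,8,9 then 0,1,2. */
    for (auto seg : segmentsToWrite(7, 3, (long long)backlog.size()))
        printf("write %lld bytes starting at index %lld: %s\n",
               seg.second, seg.first,
               backlog.substr((size_t)seg.first, (size_t)seg.second).c_str());
    return 0;
}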
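
Patch 21 narrows the scope of lockPendingWrite to the std::move of the pending-writes vector, so the per-client locks taken afterwards cannot invert against it. A standalone sketch of that pattern, with hypothetical names (pending_mutex, FakeClient, drainPendingClients) standing in for the server's per-thread state.

#include <cstdio>
#include <mutex>
#include <string>
#include <utility>
#include <vector>

/* Hypothetical stand-ins for the per-thread pending-write list and the
 * per-client lock that writeToClient would normally take. */
struct FakeClient {
    std::mutex lock;
    std::string name;
};

std::mutex pending_mutex;                  /* analogous to lockPendingWrite */
std::vector<FakeClient*> pending_clients;  /* analogous to clients_pending_write */

void drainPendingClients() {
    /* Hold the list lock only long enough to steal the vector, then release it
     * before taking any per-client lock; this ordering prevents the inversion. */
    std::unique_lock<std::mutex> lockf(pending_mutex);
    auto vec = std::move(pending_clients);
    pending_clients.clear();               /* leave the moved-from vector in a known state */
    lockf.unlock();

    for (FakeClient *c : vec) {
        std::unique_lock<std::mutex> cl(c->lock);
        printf("flushing output for client %s\n", c->name.c_str());
    }
}

int main() {
    FakeClient a, b;
    a.name = "replica-1";
    b.name = "replica-2";
    {
        std::unique_lock<std::mutex> lockf(pending_mutex);
        pending_clients.push_back(&a);
        pending_clients.push_back(&b);
    }
    drainPendingClients();
    return 0;
}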