diff --git a/src/replication.cpp b/src/replication.cpp index 94d4a7fbe..a7a95ed01 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -219,6 +219,7 @@ void feedReplicationBacklog(const void *ptr, size_t len) { const unsigned char *p = (const unsigned char*)ptr; g_pserver->master_repl_offset += len; + g_pserver->master_repl_meaningful_offset = g_pserver->master_repl_offset; /* This is a circular buffer, so write as much data we can at every * iteration and rewind the "idx" index if we reach the limit. */ @@ -2066,6 +2067,7 @@ void readSyncBulkPayload(connection *conn) { * we are starting a new history. */ memcpy(g_pserver->replid,mi->master->replid,sizeof(g_pserver->replid)); g_pserver->master_repl_offset = mi->master->reploff; + g_pserver->master_repl_meaningful_offset = mi->master->reploff; } clearReplicationId2(); @@ -3159,12 +3161,43 @@ void replicationCacheMaster(redisMaster *mi, client *c) { * current offset if no data was lost during the failover. So we use our * current replication ID and offset in order to synthesize a cached master. */ void replicationCacheMasterUsingMyself(redisMaster *mi) { + serverLog(LL_NOTICE, + "Before turning into a replica, using my own master parameters " + "to synthesize a cached master: I may be able to synchronize with " + "the new master with just a partial transfer."); + if (mi->cached_master != nullptr) { // This can happen on first load of the RDB, the master we created in config load is stale freeClient(mi->cached_master); } + /* This will be used to populate the field server.master->reploff + * by replicationCreateMasterClient(). We'll later set the created + * master as server.cached_master, so the replica will use such + * offset for PSYNC. */ + mi->master_initial_offset = g_pserver->master_repl_offset; + + /* However if the "meaningful" offset, that is the offset without + * the final PINGs in the stream, is different, use this instead: + * often when the master is no longer reachable, replicas will never + * receive the PINGs, however the master will end with an incremented + * offset because of the PINGs and will not be able to incrementally + * PSYNC with the new master. */ + if (g_pserver->master_repl_offset > g_pserver->master_repl_meaningful_offset) { + long long delta = g_pserver->master_repl_offset - + g_pserver->master_repl_meaningful_offset; + serverLog(LL_NOTICE, + "Using the meaningful offset %lld instead of %lld to exclude " + "the final PINGs (%lld bytes difference)", + g_pserver->master_repl_meaningful_offset, + g_pserver->master_repl_offset, + delta); + mi->master_initial_offset = g_pserver->master_repl_meaningful_offset; + g_pserver->repl_backlog_histlen -= delta; + if (g_pserver->repl_backlog_histlen < 0) g_pserver->repl_backlog_histlen = 0; + } + /* The master client we create can be set to any DBID, because * the new master will start its replication stream with SELECT. */ mi->master_initial_offset = g_pserver->master_repl_offset; @@ -3178,7 +3211,6 @@ void replicationCacheMasterUsingMyself(redisMaster *mi) { unlinkClient(mi->master); mi->cached_master = mi->master; mi->master = NULL; - serverLog(LL_NOTICE,"Before turning into a replica, using my master parameters to synthesize a cached master: I may be able to synchronize with the new master with just a partial transfer."); } /* Free a cached master, called when there are no longer the conditions for @@ -3583,10 +3615,18 @@ void replicationCron(void) { clientsArePaused(); if (!manual_failover_in_progress) { + long long before_ping = g_pserver->master_repl_meaningful_offset; ping_argv[0] = createStringObject("PING",4); replicationFeedSlaves(g_pserver->slaves, g_pserver->replicaseldb, ping_argv, 1); decrRefCount(ping_argv[0]); + /* The server.master_repl_meaningful_offset variable represents + * the offset of the replication stream without the pending PINGs. + * This is useful to set the right replication offset for PSYNC + * when the master is turned into a replica. Otherwise pending + * PINGs may not allow it to perform an incremental sync with the + * new master. */ + g_pserver->master_repl_meaningful_offset = before_ping; } } diff --git a/src/server.cpp b/src/server.cpp index 5f0a72d69..000bdd76f 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2475,6 +2475,7 @@ void initServerConfig(void) { g_pserver->enable_multimaster = CONFIG_DEFAULT_ENABLE_MULTIMASTER; g_pserver->repl_syncio_timeout = CONFIG_REPL_SYNCIO_TIMEOUT; g_pserver->master_repl_offset = 0; + g_pserver->master_repl_meaningful_offset = 0; /* Replication partial resync backlog */ g_pserver->repl_backlog = NULL; @@ -4626,6 +4627,7 @@ sds genRedisInfoString(const char *section) { "master_replid:%s\r\n" "master_replid2:%s\r\n" "master_repl_offset:%lld\r\n" + "master_repl_meaningful_offset:%lld\r\n" "second_repl_offset:%lld\r\n" "repl_backlog_active:%d\r\n" "repl_backlog_size:%lld\r\n" @@ -4634,6 +4636,7 @@ sds genRedisInfoString(const char *section) { g_pserver->replid, g_pserver->replid2, g_pserver->master_repl_offset, + g_pserver->master_repl_meaningful_offset, g_pserver->second_replid_offset, g_pserver->repl_backlog != NULL, g_pserver->repl_backlog_size, @@ -5027,6 +5030,7 @@ void loadDataFromDisk(void) { { memcpy(g_pserver->replid,rsi.repl_id,sizeof(g_pserver->replid)); g_pserver->master_repl_offset = rsi.repl_offset; + g_pserver->master_repl_meaningful_offset = rsi.repl_offset; listIter li; listNode *ln; diff --git a/src/server.h b/src/server.h index b9bd9b7c4..80e6f1f1b 100644 --- a/src/server.h +++ b/src/server.h @@ -1792,6 +1792,7 @@ struct redisServer { char replid[CONFIG_RUN_ID_SIZE+1]; /* My current replication ID. */ char replid2[CONFIG_RUN_ID_SIZE+1]; /* replid inherited from master*/ long long master_repl_offset; /* My current replication offset */ + long long master_repl_meaningful_offset; /* Offset minus latest PINGs. */ long long second_replid_offset; /* Accept offsets up to this for replid2. */ int replicaseldb; /* Last SELECTed DB in replication output */ int repl_ping_slave_period; /* Master pings the replica every N seconds */