From c38bc83f472c73b537a1b913ec14dc3eae0e33d1 Mon Sep 17 00:00:00 2001
From: antirez <antirez@gmail.com>
Date: Tue, 24 Mar 2020 11:02:40 +0100
Subject: [PATCH] PSYNC2: meaningful offset implemented.

A very commonly signaled operational problem with Redis master-replicas
sets is that, once the master becomes unavailable for some reason,
especially because of network problems, many times it wont be able to
perform a partial resynchronization with the new master, once it rejoins
the partition, for the following reason:

1. The master becomes isolated, however it keeps sending PINGs to the
replicas. Such PINGs will never be received since the link connection is
actually already severed.
2. On the other side, one of the replicas will turn into the new master,
setting its secondary replication ID offset to the one of the last
command received from the old master: this offset will not include the
PINGs sent by the master once the link was already disconnected.
3. When the master rejoins the partion and is turned into a replica, its
offset will be too advanced because of the PINGs, so a PSYNC will fail,
and a full synchronization will be required.

Related to issue #7002 and other discussion we had in the past around
this problem.


Former-commit-id: 5d6e8fe3e3e43162f0c57f580b6e8432274fca30
---
 src/replication.cpp | 42 +++++++++++++++++++++++++++++++++++++++++-
 src/server.cpp      |  4 ++++
 src/server.h        |  1 +
 3 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/src/replication.cpp b/src/replication.cpp
index 94d4a7fbe..a7a95ed01 100644
--- a/src/replication.cpp
+++ b/src/replication.cpp
@@ -219,6 +219,7 @@ void feedReplicationBacklog(const void *ptr, size_t len) {
     const unsigned char *p = (const unsigned char*)ptr;
 
     g_pserver->master_repl_offset += len;
+    g_pserver->master_repl_meaningful_offset = g_pserver->master_repl_offset;
 
     /* This is a circular buffer, so write as much data we can at every
      * iteration and rewind the "idx" index if we reach the limit. */
@@ -2066,6 +2067,7 @@ void readSyncBulkPayload(connection *conn) {
         * we are starting a new history. */
         memcpy(g_pserver->replid,mi->master->replid,sizeof(g_pserver->replid));
         g_pserver->master_repl_offset = mi->master->reploff;
+        g_pserver->master_repl_meaningful_offset = mi->master->reploff;
     }
     clearReplicationId2();
 
@@ -3159,12 +3161,43 @@ void replicationCacheMaster(redisMaster *mi, client *c) {
  * current offset if no data was lost during the failover. So we use our
  * current replication ID and offset in order to synthesize a cached master. */
 void replicationCacheMasterUsingMyself(redisMaster *mi) {
+    serverLog(LL_NOTICE,
+        "Before turning into a replica, using my own master parameters "
+        "to synthesize a cached master: I may be able to synchronize with "
+        "the new master with just a partial transfer.");
+
     if (mi->cached_master != nullptr)
     {
         // This can happen on first load of the RDB, the master we created in config load is stale
         freeClient(mi->cached_master);
     }
 
+    /* This will be used to populate the field server.master->reploff
+     * by replicationCreateMasterClient(). We'll later set the created
+     * master as server.cached_master, so the replica will use such
+     * offset for PSYNC. */
+    mi->master_initial_offset = g_pserver->master_repl_offset;
+
+    /* However if the "meaningful" offset, that is the offset without
+     * the final PINGs in the stream, is different, use this instead:
+     * often when the master is no longer reachable, replicas will never
+     * receive the PINGs, however the master will end with an incremented
+     * offset because of the PINGs and will not be able to incrementally
+     * PSYNC with the new master. */
+    if (g_pserver->master_repl_offset > g_pserver->master_repl_meaningful_offset) {
+        long long delta = g_pserver->master_repl_offset -
+                          g_pserver->master_repl_meaningful_offset;
+        serverLog(LL_NOTICE,
+            "Using the meaningful offset %lld instead of %lld to exclude "
+            "the final PINGs (%lld bytes difference)",
+                g_pserver->master_repl_meaningful_offset,
+                g_pserver->master_repl_offset,
+                delta);
+        mi->master_initial_offset = g_pserver->master_repl_meaningful_offset;
+        g_pserver->repl_backlog_histlen -= delta;
+        if (g_pserver->repl_backlog_histlen < 0) g_pserver->repl_backlog_histlen = 0;
+    }
+
     /* The master client we create can be set to any DBID, because
      * the new master will start its replication stream with SELECT. */
     mi->master_initial_offset = g_pserver->master_repl_offset;
@@ -3178,7 +3211,6 @@ void replicationCacheMasterUsingMyself(redisMaster *mi) {
     unlinkClient(mi->master);
     mi->cached_master = mi->master;
     mi->master = NULL;
-    serverLog(LL_NOTICE,"Before turning into a replica, using my master parameters to synthesize a cached master: I may be able to synchronize with the new master with just a partial transfer.");
 }
 
 /* Free a cached master, called when there are no longer the conditions for
@@ -3583,10 +3615,18 @@ void replicationCron(void) {
             clientsArePaused();
 
         if (!manual_failover_in_progress) {
+            long long before_ping = g_pserver->master_repl_meaningful_offset;
             ping_argv[0] = createStringObject("PING",4);
             replicationFeedSlaves(g_pserver->slaves, g_pserver->replicaseldb,
                 ping_argv, 1);
             decrRefCount(ping_argv[0]);
+            /* The server.master_repl_meaningful_offset variable represents
+             * the offset of the replication stream without the pending PINGs.
+             * This is useful to set the right replication offset for PSYNC
+             * when the master is turned into a replica. Otherwise pending
+             * PINGs may not allow it to perform an incremental sync with the
+             * new master. */
+            g_pserver->master_repl_meaningful_offset = before_ping;
         }
     }
 
diff --git a/src/server.cpp b/src/server.cpp
index 5f0a72d69..000bdd76f 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -2475,6 +2475,7 @@ void initServerConfig(void) {
     g_pserver->enable_multimaster = CONFIG_DEFAULT_ENABLE_MULTIMASTER;
     g_pserver->repl_syncio_timeout = CONFIG_REPL_SYNCIO_TIMEOUT;
     g_pserver->master_repl_offset = 0;
+    g_pserver->master_repl_meaningful_offset = 0;
 
     /* Replication partial resync backlog */
     g_pserver->repl_backlog = NULL;
@@ -4626,6 +4627,7 @@ sds genRedisInfoString(const char *section) {
             "master_replid:%s\r\n"
             "master_replid2:%s\r\n"
             "master_repl_offset:%lld\r\n"
+            "master_repl_meaningful_offset:%lld\r\n"
             "second_repl_offset:%lld\r\n"
             "repl_backlog_active:%d\r\n"
             "repl_backlog_size:%lld\r\n"
@@ -4634,6 +4636,7 @@ sds genRedisInfoString(const char *section) {
             g_pserver->replid,
             g_pserver->replid2,
             g_pserver->master_repl_offset,
+            g_pserver->master_repl_meaningful_offset,
             g_pserver->second_replid_offset,
             g_pserver->repl_backlog != NULL,
             g_pserver->repl_backlog_size,
@@ -5027,6 +5030,7 @@ void loadDataFromDisk(void) {
             {
                 memcpy(g_pserver->replid,rsi.repl_id,sizeof(g_pserver->replid));
                 g_pserver->master_repl_offset = rsi.repl_offset;
+                g_pserver->master_repl_meaningful_offset = rsi.repl_offset;
                 listIter li;
                 listNode *ln;
                 
diff --git a/src/server.h b/src/server.h
index b9bd9b7c4..80e6f1f1b 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1792,6 +1792,7 @@ struct redisServer {
     char replid[CONFIG_RUN_ID_SIZE+1];  /* My current replication ID. */
     char replid2[CONFIG_RUN_ID_SIZE+1]; /* replid inherited from master*/
     long long master_repl_offset;   /* My current replication offset */
+    long long master_repl_meaningful_offset; /* Offset minus latest PINGs. */
     long long second_replid_offset; /* Accept offsets up to this for replid2. */
     int replicaseldb;                 /* Last SELECTed DB in replication output */
     int repl_ping_slave_period;     /* Master pings the replica every N seconds */