From 7fa56dd773cac7155f7a7768458b1f59d1f39a13 Mon Sep 17 00:00:00 2001
From: WuYunlong <xzsyeb@126.com>
Date: Tue, 27 Oct 2020 14:13:59 +0800
Subject: [PATCH] Speedup cluster failover. (#7948)

This commit deals with manual failover as well as non-manual failover.

We did tests with manual failover as follows:
1, Setup redis cluster which holds 16 partions, each having only
   1 corresponding replica.
2, Write a batch of data to redis cluster and make sure the redis is doing
   a active expire in serverCron.
3, Do a manual failover sequentially to each partions with a time interval
   of 3 minutes.
4, Collect logs and do some computaiton work.

The result:
case    avgTime    maxTime    minTime
C1      95.8ms	   227ms      25ms
C2      47.9ms     96ms       12ms
C3      12.6ms     27ms       7ms

Explanation
case C1: All nodes use the version before optimization
case C2: Masters use the elder version while replicas use the optimized version
case C3: All nodes use the optimized version
failover time: The time between when replica got a `manual failover request` and
               when it `won the failover election`.
avgTime: average failover time
maxTime: maximum failover time
minTime: mimimum failover time
ms: millisecond

Co-authored-by: chendq8 <c.d_q@163.com>
---
 src/cluster.c | 50 +++++++++++++++++++++++++++++++++++---------------
 src/cluster.h |  1 +
 2 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/src/cluster.c b/src/cluster.c
index 248d5c17d..6da71c5da 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -1816,6 +1816,7 @@ int clusterProcessPacket(clusterLink *link) {
             server.cluster->mf_master_offset == 0)
         {
             server.cluster->mf_master_offset = sender->repl_offset;
+            clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER);
             serverLog(LL_WARNING,
                 "Received replication offset for paused "
                 "master manual failover: %lld",
@@ -2160,6 +2161,12 @@ int clusterProcessPacket(clusterLink *link) {
         pauseClients(now+(CLUSTER_MF_TIMEOUT*CLUSTER_MF_PAUSE_MULT));
         serverLog(LL_WARNING,"Manual failover requested by replica %.40s.",
             sender->name);
+        /* We need to send a ping message to the replica, as it would carry
+         * `server.cluster->mf_master_offset`, which means the master paused clients
+         * at offset `server.cluster->mf_master_offset`, so that the replica would
+         * know that it is safe to set its `server.cluster->mf_can_start` to 1 so as
+         * to complete failover as quickly as possible. */
+        clusterSendPing(link, CLUSTERMSG_TYPE_PING);
     } else if (type == CLUSTERMSG_TYPE_UPDATE) {
         clusterNode *n; /* The node the update is about. */
         uint64_t reportedConfigEpoch =
@@ -3434,7 +3441,10 @@ void clusterHandleManualFailover(void) {
         serverLog(LL_WARNING,
             "All master replication stream processed, "
             "manual failover can start.");
+        clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
+        return;
     }
+    clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER);
 }
 
 /* -----------------------------------------------------------------------------
@@ -3709,25 +3719,35 @@ void clusterCron(void) {
  * handlers, or to perform potentially expansive tasks that we need to do
  * a single time before replying to clients. */
 void clusterBeforeSleep(void) {
-    /* Handle failover, this is needed when it is likely that there is already
-     * the quorum from masters in order to react fast. */
-    if (server.cluster->todo_before_sleep & CLUSTER_TODO_HANDLE_FAILOVER)
-        clusterHandleSlaveFailover();
-
-    /* Update the cluster state. */
-    if (server.cluster->todo_before_sleep & CLUSTER_TODO_UPDATE_STATE)
-        clusterUpdateState();
-
-    /* Save the config, possibly using fsync. */
-    if (server.cluster->todo_before_sleep & CLUSTER_TODO_SAVE_CONFIG) {
-        int fsync = server.cluster->todo_before_sleep &
-                    CLUSTER_TODO_FSYNC_CONFIG;
-        clusterSaveConfigOrDie(fsync);
-    }
+    int flags = server.cluster->todo_before_sleep;
 
     /* Reset our flags (not strictly needed since every single function
      * called for flags set should be able to clear its flag). */
     server.cluster->todo_before_sleep = 0;
+
+    if (flags & CLUSTER_TODO_HANDLE_MANUALFAILOVER) {
+        /* Handle manual failover as soon as possible so that won't have a 100ms
+         * as it was handled only in clusterCron */
+        if(nodeIsSlave(myself)) {
+            clusterHandleManualFailover();
+            if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER))
+                clusterHandleSlaveFailover();
+        }
+    } else if (flags & CLUSTER_TODO_HANDLE_FAILOVER) {
+        /* Handle failover, this is needed when it is likely that there is already
+         * the quorum from masters in order to react fast. */
+        clusterHandleSlaveFailover();
+    }
+
+    /* Update the cluster state. */
+    if (flags & CLUSTER_TODO_UPDATE_STATE)
+        clusterUpdateState();
+
+    /* Save the config, possibly using fsync. */
+    if (flags & CLUSTER_TODO_SAVE_CONFIG) {
+        int fsync = flags & CLUSTER_TODO_FSYNC_CONFIG;
+        clusterSaveConfigOrDie(fsync);
+    }
 }
 
 void clusterDoBeforeSleep(int flags) {
diff --git a/src/cluster.h b/src/cluster.h
index 48a111764..95b710383 100644
--- a/src/cluster.h
+++ b/src/cluster.h
@@ -77,6 +77,7 @@ typedef struct clusterLink {
 #define CLUSTER_TODO_UPDATE_STATE (1<<1)
 #define CLUSTER_TODO_SAVE_CONFIG (1<<2)
 #define CLUSTER_TODO_FSYNC_CONFIG (1<<3)
+#define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1<<4)
 
 /* Message types.
  *