From 7fa56dd773cac7155f7a7768458b1f59d1f39a13 Mon Sep 17 00:00:00 2001 From: WuYunlong Date: Tue, 27 Oct 2020 14:13:59 +0800 Subject: [PATCH] Speedup cluster failover. (#7948) This commit deals with manual failover as well as non-manual failover. We did tests with manual failover as follows: 1, Set up a redis cluster which holds 16 partitions, each having only 1 corresponding replica. 2, Write a batch of data to redis cluster and make sure the redis is doing an active expire in serverCron. 3, Do a manual failover sequentially to each partition with a time interval of 3 minutes. 4, Collect logs and do some computation work. The result: case avgTime maxTime minTime C1 95.8ms 227ms 25ms C2 47.9ms 96ms 12ms C3 12.6ms 27ms 7ms Explanation case C1: All nodes use the version before optimization case C2: Masters use the older version while replicas use the optimized version case C3: All nodes use the optimized version failover time: The time between when replica got a `manual failover request` and when it `won the failover election`. 
avgTime: average failover time maxTime: maximum failover time minTime: minimum failover time ms: millisecond Co-authored-by: chendq8 --- src/cluster.c | 50 +++++++++++++++++++++++++++++++++++--------------- src/cluster.h | 1 + 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 248d5c17d..6da71c5da 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1816,6 +1816,7 @@ int clusterProcessPacket(clusterLink *link) { server.cluster->mf_master_offset == 0) { server.cluster->mf_master_offset = sender->repl_offset; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); serverLog(LL_WARNING, "Received replication offset for paused " "master manual failover: %lld", @@ -2160,6 +2161,12 @@ int clusterProcessPacket(clusterLink *link) { pauseClients(now+(CLUSTER_MF_TIMEOUT*CLUSTER_MF_PAUSE_MULT)); serverLog(LL_WARNING,"Manual failover requested by replica %.40s.", sender->name); + /* We need to send a ping message to the replica, as it would carry + * `server.cluster->mf_master_offset`, which means the master paused clients + * at offset `server.cluster->mf_master_offset`, so that the replica would + * know that it is safe to set its `server.cluster->mf_can_start` to 1 so as + * to complete failover as quickly as possible. */ + clusterSendPing(link, CLUSTERMSG_TYPE_PING); } else if (type == CLUSTERMSG_TYPE_UPDATE) { clusterNode *n; /* The node the update is about. */ uint64_t reportedConfigEpoch = @@ -3434,7 +3441,10 @@ void clusterHandleManualFailover(void) { serverLog(LL_WARNING, "All master replication stream processed, " "manual failover can start."); + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); + return; } + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); } /* ----------------------------------------------------------------------------- @@ -3709,25 +3719,35 @@ void clusterCron(void) { * handlers, or to perform potentially expansive tasks that we need to do * a single time before replying to clients. 
*/ void clusterBeforeSleep(void) { - /* Handle failover, this is needed when it is likely that there is already - * the quorum from masters in order to react fast. */ - if (server.cluster->todo_before_sleep & CLUSTER_TODO_HANDLE_FAILOVER) - clusterHandleSlaveFailover(); - - /* Update the cluster state. */ - if (server.cluster->todo_before_sleep & CLUSTER_TODO_UPDATE_STATE) - clusterUpdateState(); - - /* Save the config, possibly using fsync. */ - if (server.cluster->todo_before_sleep & CLUSTER_TODO_SAVE_CONFIG) { - int fsync = server.cluster->todo_before_sleep & - CLUSTER_TODO_FSYNC_CONFIG; - clusterSaveConfigOrDie(fsync); - } + int flags = server.cluster->todo_before_sleep; /* Reset our flags (not strictly needed since every single function * called for flags set should be able to clear its flag). */ server.cluster->todo_before_sleep = 0; + + if (flags & CLUSTER_TODO_HANDLE_MANUALFAILOVER) { + /* Handle manual failover as soon as possible so that won't have a 100ms + * as it was handled only in clusterCron */ + if(nodeIsSlave(myself)) { + clusterHandleManualFailover(); + if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) + clusterHandleSlaveFailover(); + } + } else if (flags & CLUSTER_TODO_HANDLE_FAILOVER) { + /* Handle failover, this is needed when it is likely that there is already + * the quorum from masters in order to react fast. */ + clusterHandleSlaveFailover(); + } + + /* Update the cluster state. */ + if (flags & CLUSTER_TODO_UPDATE_STATE) + clusterUpdateState(); + + /* Save the config, possibly using fsync. 
*/ + if (flags & CLUSTER_TODO_SAVE_CONFIG) { + int fsync = flags & CLUSTER_TODO_FSYNC_CONFIG; + clusterSaveConfigOrDie(fsync); + } } void clusterDoBeforeSleep(int flags) { diff --git a/src/cluster.h b/src/cluster.h index 48a111764..95b710383 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -77,6 +77,7 @@ typedef struct clusterLink { #define CLUSTER_TODO_UPDATE_STATE (1<<1) #define CLUSTER_TODO_SAVE_CONFIG (1<<2) #define CLUSTER_TODO_FSYNC_CONFIG (1<<3) +#define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1<<4) /* Message types. *