Make manual failover reset the on-going election to promote failover (#1274)

If a manual failover timed out, for example because the election did not get enough votes, a new manual failover could not proceed on the replica side because of auth_timeout and auth_retry_time. If we initiate a new manual failover after an election has timed out, the primary is paused, but on the replica side, due to auth_retry_time, the replica does not trigger a new election, so the manual failover eventually times out. With this change, if a manual failover is initiated again while there is an ongoing election, we reset that election so that the replica can start a new election at the manual failover's request. Signed-off-by: Binbin <binloveplay1314@qq.com>
2024-11-22 10:28:59 +08:00 · 2024-11-22 10:28:59 +08:00 · c4be326c32
commit c4be326c32
parent b56eed2479
2 changed files with 65 additions and 2 deletions
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@ -4848,6 +4848,27 @@ void clusterHandleReplicaMigration(int max_replicas) {
 * data loss due to the asynchronous primary-replica replication.
 * -------------------------------------------------------------------------- */

+void manualFailoverCanStart(void) {
+    serverAssert(server.cluster->mf_can_start == 0);
+
+    if (server.cluster->failover_auth_time) {
+        /* A manual failover was requested while a previous election is still
+         * recorded (failover_auth_time != 0), e.g. the user retried after the
+         * previous manual failover timed out. Reset the election: otherwise,
+         * once the old election has timed out (see auth_timeout) but the next
+         * retry is not yet due (see auth_retry_time), the primary would be
+         * paused while the replica could not start a new election, and the
+         * new manual failover would eventually time out as well. */
+        server.cluster->failover_auth_time = 0;
+        serverLog(LL_WARNING,
+                  "Failover election in progress for epoch %llu, but received a new manual failover. "
+                  "Resetting the election.",
+                  (unsigned long long)server.cluster->failover_auth_epoch);
+    }
+
+    server.cluster->mf_can_start = 1;
+}
+
 /* Reset the manual failover state. This works for both primaries and replicas
 * as all the state about manual failover is cleared.
 *
@ -4888,7 +4909,7 @@ void clusterHandleManualFailover(void) {
    if (server.cluster->mf_primary_offset == replicationGetReplicaOffset()) {
        /* Our replication offset matches the primary replication offset
         * announced after clients were paused. We can start the failover. */
-        server.cluster->mf_can_start = 1;
+        manualFailoverCanStart();
        serverLog(LL_NOTICE, "All primary replication stream processed, "
                             "manual failover can start.");
        clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
@ -6785,7 +6806,7 @@ int clusterCommandSpecial(client *c) {
             * primary to agree about the offset. We just failover taking over
             * it without coordination. */
            serverLog(LL_NOTICE, "Forced failover user request accepted (user request from '%s').", client);
-            server.cluster->mf_can_start = 1;
+            manualFailoverCanStart();
            /* We can start a manual failover as soon as possible, setting a flag
             * here so that we don't need to waiting for the cron to kick in. */
            clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER);
--- a/tests/unit/cluster/manual-failover.tcl
+++ b/tests/unit/cluster/manual-failover.tcl
@ -271,3 +271,45 @@ start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval
        wait_for_cluster_propagation
    }
 } ;# start_cluster
+
+start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 15000}} {
+    test "Manual failover will reset the on-going election" {
+        set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST 5
+        set CLUSTER_PACKET_TYPE_NONE -1
+
+        # Let the other primaries drop FAILOVER_AUTH_REQUEST so that the election
+        # does not get enough votes and times out.
+        R 1 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST
+        R 2 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST
+
+        # Trigger the manual failover from the replica.
+        R 3 cluster failover
+
+        # Wait for both the primary and the replica to log the manual failover timeout.
+        wait_for_log_messages 0 {"*Manual failover timed out*"} 0 1000 50
+        wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1000 50
+        set loglines1 [count_log_lines 0]
+        set loglines2 [count_log_lines -3]
+
+        # Undo the packet drop so that the replica can win the next election.
+        R 1 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_NONE
+        R 2 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_NONE
+
+        # Trigger the manual failover from the replica again.
+        R 3 cluster failover
+
+        # Make sure the election is reset.
+        wait_for_log_messages -3 {"*Failover election in progress*Resetting the election*"} $loglines2 1000 50
+
+        # Wait for failover.
+        wait_for_condition 1000 50 {
+            [s -3 role] == "master"
+        } else {
+            fail "No failover detected"
+        }
+
+        # Make sure that the second manual failover does not time out.
+        verify_no_log_message 0 "*Manual failover timed out*" $loglines1
+        verify_no_log_message -3 "*Manual failover timed out*" $loglines2
+    }
+} ;# start_cluster