diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 04a04774f..ee7e4c531 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3135,6 +3135,24 @@ int clusterProcessPacket(clusterLink *link) {
         if (sender_claims_to_be_primary && sender_claimed_config_epoch > sender->configEpoch) {
             sender->configEpoch = sender_claimed_config_epoch;
             clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG);
+
+            if (server.cluster->failover_auth_time && sender->configEpoch >= server.cluster->failover_auth_epoch) {
+                /* Another node has claimed an epoch greater than or equal to ours.
+                 * If we have an ongoing election, reset it because we cannot win
+                 * with an epoch smaller than or equal to the incoming claim. This
+                 * allows us to start a new election as soon as possible. */
+                server.cluster->failover_auth_time = 0;
+                serverLog(LL_WARNING,
+                          "Failover election in progress for epoch %llu, but received a claim from "
+                          "node %.40s (%s) with an equal or higher epoch %llu. Resetting the election "
+                          "since we cannot win an election in the past.",
+                          (unsigned long long)server.cluster->failover_auth_epoch,
+                          sender->name, sender->human_nodename,
+                          (unsigned long long)sender->configEpoch);
+                /* We may be able to start a new election right away, so set a flag
+                 * here to make sure it is checked as soon as possible instead of
+                 * waiting for the next cron cycle. */
+                clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
+            }
         }
         /* Update the replication offset info for this node. */
         sender->repl_offset = ntohu64(hdr->offset);
diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl
index 7bc6a05e9..21c4f4a67 100644
--- a/tests/unit/cluster/failover2.tcl
+++ b/tests/unit/cluster/failover2.tcl
@@ -64,3 +64,36 @@ start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval
         }
     }
 } ;# start_cluster
+
+
+start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} {
+    test "Primaries will not time out when they are elected in the same epoch" {
+        # Because of the random election delay, these nodes may not initiate
+        # the election at the same time (same epoch). But if they do, we make
+        # sure there is no failover timeout.
+
+        # Pause the three primary nodes.
+        pause_process [srv 0 pid]
+        pause_process [srv -1 pid]
+        pause_process [srv -2 pid]
+
+        # Wait for the failovers.
+        wait_for_condition 1000 50 {
+            [s -7 role] == "master" &&
+            [s -8 role] == "master" &&
+            [s -9 role] == "master"
+        } else {
+            fail "No failover detected"
+        }
+
+        # Make sure there is no failover timeout.
+        verify_no_log_message -7 "*Failover attempt expired*" 0
+        verify_no_log_message -8 "*Failover attempt expired*" 0
+        verify_no_log_message -9 "*Failover attempt expired*" 0
+
+        # Resume the paused primary nodes to speed up the test shutdown.
+        resume_process [srv 0 pid]
+        resume_process [srv -1 pid]
+        resume_process [srv -2 pid]
+    }
+} ;# start_cluster
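
Reviewer note: the sketch below restates the decision this patch adds, in isolation from the rest of clusterProcessPacket, so it can be reasoned about (and compiled) on its own. The struct and function names here are illustrative placeholders, not the real clusterState API. The rule it mirrors: if an election is in progress (failover_auth_time != 0) and a sender claims a config epoch greater than or equal to the epoch we are campaigning for, the attempt is abandoned so a new election can start right away instead of waiting for the old one to expire.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical, trimmed-down stand-ins for the cluster state fields touched
 * by this patch. These are NOT the real clusterState definitions. */
typedef struct {
    int64_t failover_auth_time;   /* 0 means no election in progress. */
    uint64_t failover_auth_epoch; /* Epoch this node is trying to win. */
} election_state;

/* Returns true (and clears the election) when an incoming claim makes the
 * current attempt unwinnable, mirroring the new check in the diff:
 * sender->configEpoch >= server.cluster->failover_auth_epoch. */
static bool maybe_reset_election(election_state *st, uint64_t claimed_epoch) {
    if (st->failover_auth_time == 0) return false;             /* No election to reset. */
    if (claimed_epoch < st->failover_auth_epoch) return false; /* We can still win. */
    st->failover_auth_time = 0; /* Abandon the stale attempt... */
    return true;                /* ...and let the caller schedule a retry ASAP. */
}

int main(void) {
    election_state st = {.failover_auth_time = 12345, .failover_auth_epoch = 7};
    printf("claim epoch 6 -> reset? %d\n", maybe_reset_election(&st, 6)); /* 0: lower epoch, keep going */
    st.failover_auth_time = 12345;
    printf("claim epoch 7 -> reset? %d\n", maybe_reset_election(&st, 7)); /* 1: equal epoch, reset */
    return 0;
}

In the actual patch, the "retry as soon as possible" part is carried by clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER), which asks the failover logic to run before the next event-loop sleep rather than waiting for the next cron tick.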