Optimize failover time when the new primary node is down again (#782)
We currently never reset failover_auth_time after setting it. It is
used to check auth_timeout and auth_retry_time, but we should at least
reset it after a successful failover.
Let's assume the following scenario:
1. Two replicas initiate an election.
2. Replica 1 is elected as the new primary, and replica 2 does not get
   enough votes.
3. Replica 1 goes down, i.e. the new primary goes down again shortly
   after the election.
4. Replica 2 notices that the new primary is down and wants to initiate
   a failover, but because the failover_auth_time of the previous round
   has not been reset, it first has to wait for that attempt to time
   out and then for the retry time to pass, which takes about
   cluster-node-timeout * 4 in total; this adds a lot of delay (see the
   sketch below).
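For reference, a minimal sketch of where the factor of four comes from,
following the timeout derivation in clusterHandleSlaveFailover() as I
understand the upstream logic (variable names mirror that code; the
cluster-node-timeout of 5000ms matches the tests below):

    #include <stdio.h>

    /* auth_timeout bounds one election attempt; auth_retry_time is how
     * long a replica waits before scheduling the next attempt. With a
     * stale failover_auth_time, replica 2 has to sit out both. */
    int main(void) {
        long long cluster_node_timeout = 5000;            /* ms */
        long long auth_timeout = cluster_node_timeout * 2;
        if (auth_timeout < 2000) auth_timeout = 2000;
        long long auth_retry_time = auth_timeout * 2;     /* timeout * 4 */
        printf("worst-case extra wait: %lld ms\n", auth_retry_time);
        return 0;                                         /* prints 20000 */
    }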
There is another problem. We add extra random time to
failover_auth_time, e.g. a random delay of up to 500ms plus a
replica-ranking delay of 1s per rank. If replica 2 receives a PONG from
the new primary before sending the FAILOVER_AUTH_REQUEST, that is,
before failover_auth_time is reached, it turns itself back into a
replica. If the new primary goes down again at this point, replica 2
will reuse the previous failover_auth_time to initiate an election
instead of going through the random-delay and replica-ranking logic
again (sketched below), which may lead to unexpected consequences (for
example, a low-ranking replica initiating an election and becoming the
new primary).
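The scheduling logic that gets skipped looks roughly like this (a
simplified sketch based on how I read the upstream code; mstime() and
the rank parameter stand in for the real helpers such as
clusterGetSlaveRank()):

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/time.h>

    /* Current wall-clock time in milliseconds. */
    static long long mstime(void) {
        struct timeval tv;
        gettimeofday(&tv, NULL);
        return (long long)tv.tv_sec * 1000 + tv.tv_usec / 1000;
    }

    /* Fixed 500ms delay (lets the FAIL state propagate), up to 500ms of
     * random jitter, plus 1s per rank so the best replica goes first. */
    static long long schedule_failover_auth_time(int rank) {
        return mstime() + 500 + random() % 500 + (long long)rank * 1000;
    }

    int main(void) {
        for (int rank = 0; rank < 3; rank++)
            printf("rank %d -> auth_time %lld\n", rank,
                   schedule_failover_auth_time(rank));
        return 0;
    }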
That is, we need to reset failover_auth_time at the appropriate time:
when the replica switches to a new primary, we reset it, because the
existing failover_auth_time is already out of date at that point.
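A minimal sketch of that reset (the type and function names here are
illustrative stand-ins, not the actual patch):

    #include <stdio.h>

    /* Stand-in for the relevant slice of clusterState. */
    typedef struct {
        long long failover_auth_time; /* ms; 0 = no election scheduled */
    } clusterStateSketch;

    /* When this replica reconfigures itself to follow the newly elected
     * primary, any election time scheduled during the previous round is
     * stale, so drop it rather than letting it gate the next attempt. */
    static void onSwitchToNewPrimary(clusterStateSketch *cs) {
        cs->failover_auth_time = 0;
    }

    int main(void) {
        clusterStateSketch cs = {.failover_auth_time = 123456};
        onSwitchToNewPrimary(&cs);
        printf("after switch: %lld\n", cs.failover_auth_time); /* 0 */
        return 0;
    }

With the reset in place, the next failure triggers a fresh schedule
(fixed delay, jitter, rank) instead of reusing the expired one.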
---------
Signed-off-by: Binbin <binloveplay1314@qq.com>
2024-07-20 03:27:49 +08:00
# Check the basic monitoring and failover capabilities.
start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} {
    test "Cluster is up" {
        wait_for_cluster_state ok
    }

    test "Cluster is writable" {
        cluster_write_test [srv 0 port]
    }

    set paused_pid [srv 0 pid]
    test "Killing one primary node" {
        pause_process $paused_pid
    }

    test "Wait for failover" {
        wait_for_condition 1000 50 {
            [s -3 role] == "master" || [s -6 role] == "master"
        } else {
            fail "No failover detected"
        }
    }

    test "Killing the new primary node" {
        if {[s -3 role] == "master"} {
            set replica_to_be_primary -6
            set paused_pid2 [srv -3 pid]
        } else {
            set replica_to_be_primary -3
            set paused_pid2 [srv -6 pid]
        }
        pause_process $paused_pid2
    }

    test "Cluster should eventually be up again" {
        for {set j 0} {$j < [llength $::servers]} {incr j} {
            if {[process_is_paused [srv -$j pid]]} continue
            wait_for_condition 1000 50 {
                [CI $j cluster_state] eq "ok"
            } else {
                fail "Cluster node $j cluster_state:[CI $j cluster_state]"
            }
        }
    }

    test "Wait for new failover" {
        wait_for_condition 1000 50 {
            [s $replica_to_be_primary role] == "master"
        } else {
            fail "No failover detected"
        }
    }

    test "Restarting the previously killed primary nodes" {
        resume_process $paused_pid
        resume_process $paused_pid2
    }

    test "Make sure there is no failover timeout" {
        verify_no_log_message -3 "*Failover attempt expired*" 0
        verify_no_log_message -6 "*Failover attempt expired*" 0
    }
} ;# start_cluster

start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} {
    test "Primaries will not time out when they are elected in the same epoch" {
        # Since we have the delay time, these nodes may not initiate the
        # election at the same time (same epoch). But if they do, we make
        # sure there is no failover timeout.

        # Killing three primary nodes.
        pause_process [srv 0 pid]
        pause_process [srv -1 pid]
        pause_process [srv -2 pid]

        # Wait for the failover.
        wait_for_condition 1000 50 {
            [s -7 role] == "master" &&
            [s -8 role] == "master" &&
            [s -9 role] == "master"
        } else {
            fail "No failover detected"
        }

        # Make sure there is no failover timeout.
        verify_no_log_message -7 "*Failover attempt expired*" 0
        verify_no_log_message -8 "*Failover attempt expired*" 0
        verify_no_log_message -9 "*Failover attempt expired*" 0

        # Resume these primary nodes to speed up the shutdown.
        resume_process [srv 0 pid]
        resume_process [srv -1 pid]
        resume_process [srv -2 pid]
    }
} ;# start_cluster