futriix/tests/unit/cluster/failover2.tcl
Binbin 211b250aad
Do election in order based on failed primary rank to avoid voting conflicts (#1018)
When multiple primary nodes fail simultaneously, the cluster cannot recover
within the default effective time (data_age limit). The main reason is that
the votes are cast without any ranking among the replica nodes, which causes
too many epoch conflicts.

Therefore, we introduce ranking based on the failed primary's shard-id. A new
failed_primary_rank variable holds the rank of this instance within the list
of all failed primaries. This rank is used during failover: the failover
election packets are sent in order based on the rank, which effectively
avoids voting conflicts.
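
A minimal sketch of the ranking idea (illustrative only; the helper name and
the exact comparison used by the server are assumptions here): given the
shard-id of my failed primary and the shard-ids of all failed primaries, the
rank is simply my primary's position in the sorted list.

    proc failed_primary_rank {my_shard_id all_failed_shard_ids} {
        # Count how many failed primaries sort before my own primary;
        # that count is my rank (0 for the first, 1 for the second, ...).
        set rank 0
        foreach shard_id [lsort $all_failed_shard_ids] {
            if {$shard_id eq $my_shard_id} break
            incr rank
        }
        return $rank
    }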

If a single primary is down, the behavior is the same as before. If multiple
primaries are down, their replicas' election initiation will be delayed by
500ms per position in the ranking.
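
As a rough sketch of how the delay could compose (the fixed delay, the jitter,
and the per-replica rank step below are assumptions about typical failover
election timing, not exact server values; only the 500ms failed-primary step
comes from the description above):

    proc election_delay_ms {replica_rank failed_primary_rank} {
        set fixed 500                          ;# fixed delay before any election attempt
        set jitter [expr {int(rand() * 500)}]  ;# random jitter to spread replicas out
        # Existing per-replica ranking plus the new per-failed-primary ranking
        # introduced by this change.
        return [expr {$fixed + $jitter + $replica_rank * 1000 + $failed_primary_rank * 500}]
    }

Under this sketch, with three failed primaries, the best replica of the first
gets no extra delay, the second waits an extra 500ms, and the third an extra
1000ms, so their election epochs are far less likely to collide.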

Signed-off-by: Binbin <binloveplay1314@qq.com>
2025-01-11 10:43:18 +08:00


# Check the basic monitoring and failover capabilities.

start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} {

    test "Cluster is up" {
        wait_for_cluster_state ok
    }

    test "Cluster is writable" {
        cluster_write_test [srv 0 port]
    }

    set paused_pid [srv 0 pid]
    test "Killing one primary node" {
        pause_process $paused_pid
    }

    test "Wait for failover" {
        wait_for_condition 1000 50 {
            [s -3 role] == "master" || [s -6 role] == "master"
        } else {
            fail "No failover detected"
        }
    }

    test "Killing the new primary node" {
        if {[s -3 role] == "master"} {
            set replica_to_be_primary -6
            set paused_pid2 [srv -3 pid]
        } else {
            set replica_to_be_primary -3
            set paused_pid2 [srv -6 pid]
        }
        pause_process $paused_pid2
    }

    test "Cluster should eventually be up again" {
        for {set j 0} {$j < [llength $::servers]} {incr j} {
            if {[process_is_paused [srv -$j pid]]} continue
            wait_for_condition 1000 50 {
                [CI $j cluster_state] eq "ok"
            } else {
                fail "Cluster node $j cluster_state:[CI $j cluster_state]"
            }
        }
    }

    test "wait for new failover" {
        wait_for_condition 1000 50 {
            [s $replica_to_be_primary role] == "master"
        } else {
            fail "No failover detected"
        }
    }

    test "Restarting the previously killed primary nodes" {
        resume_process $paused_pid
        resume_process $paused_pid2
    }

    test "Make sure there is no failover timeout" {
        verify_no_log_message -3 "*Failover attempt expired*" 0
        verify_no_log_message -6 "*Failover attempt expired*" 0
    }

} ;# start_cluster
start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} {

    test "Primaries will not time out when they are elected in the same epoch" {
        # Because of the ranking delay, these nodes may not initiate the
        # election at the same time (same epoch). But if they do, make sure
        # there is no failover timeout.

        # Killing three primary nodes.
        pause_process [srv 0 pid]
        pause_process [srv -1 pid]
        pause_process [srv -2 pid]

        # Wait for the failover.
        wait_for_condition 1000 50 {
            [s -7 role] == "master" &&
            [s -8 role] == "master" &&
            [s -9 role] == "master"
        } else {
            fail "No failover detected"
        }

        # Make sure there is no false epoch 0.
        verify_no_log_message -7 "*Failover election in progress for epoch 0*" 0
        verify_no_log_message -8 "*Failover election in progress for epoch 0*" 0
        verify_no_log_message -9 "*Failover election in progress for epoch 0*" 0

        # Make sure there is no failover timeout.
        verify_no_log_message -7 "*Failover attempt expired*" 0
        verify_no_log_message -8 "*Failover attempt expired*" 0
        verify_no_log_message -9 "*Failover attempt expired*" 0

        # Resume these primary nodes to speed up the shutdown.
        resume_process [srv 0 pid]
        resume_process [srv -1 pid]
        resume_process [srv -2 pid]
    }
} ;# start_cluster
run_solo {cluster} {
    start_cluster 32 15 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 15000}} {

        test "Multiple primary nodes are down, rank them based on the failed primary" {
            # Killing these primary nodes.
            for {set j 0} {$j < 15} {incr j} {
                pause_process [srv -$j pid]
            }

            # Make sure that a node starts failover.
            wait_for_condition 1000 100 {
                [s -40 role] == "master"
            } else {
                fail "No failover detected"
            }

            # Wait for the cluster state to become ok.
            for {set j 0} {$j < [llength $::servers]} {incr j} {
                if {[process_is_paused [srv -$j pid]]} continue
                wait_for_condition 1000 100 {
                    [CI $j cluster_state] eq "ok"
                } else {
                    fail "Cluster node $j cluster_state:[CI $j cluster_state]"
                }
            }

            # Resume these primary nodes to speed up the shutdown.
            for {set j 0} {$j < 15} {incr j} {
                resume_process [srv -$j pid]
            }
        }
    } ;# start_cluster
} ;# run_solo