
When multiple primary nodes fail simultaneously, the cluster cannot recover within the default effective time (the data_age limit). The main reason is that the replica nodes vote without any ranking among them, which causes too many epoch conflicts. Therefore, we introduced ranking based on the shard-id of the failed primary. A new failed_primary_rank variable is introduced; it holds the rank of this instance (myself) in the context of the list of all failed primaries. This variable is used during failover: the failover election packets are sent in order based on the rank, which effectively avoids voting conflicts. If a single primary is down, the behavior is the same as before. If multiple primaries are down, their replicas' election initiation times will be delayed by 500ms according to the ranking. Signed-off-by: Binbin <binloveplay1314@qq.com>
136 lines
4.5 KiB
Tcl
136 lines
4.5 KiB
Tcl
# Check the basic monitoring and failover capabilities.
|
|
|
|
# Scenario: pause the node at index 0, wait until node -3 or node -6 reports
# the "master" role, pause that node as well, and then expect the remaining
# one of the pair to be promoted. At the end neither -3 nor -6 may have logged
# a "Failover attempt expired" message.
# NOTE(review): presumably "3 4" requests 3 primaries and 4 replicas, which
# would make -3 and -6 the two replicas of node 0 -- confirm against
# start_cluster's allocation.
start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} {

    test "Cluster is up" {
        wait_for_cluster_state ok
    }

    test "Cluster is writable" {
        cluster_write_test [srv 0 port]
    }

    # Keep the pid around so the process can be resumed near the end.
    set first_paused_pid [srv 0 pid]
    test "Killing one primary node" {
        pause_process $first_paused_pid
    }

    test "Wait for failover" {
        wait_for_condition 1000 50 {
            [s -3 role] eq "master" || [s -6 role] eq "master"
        } else {
            fail "No failover detected"
        }
    }

    test "Killing the new primary node" {
        # Whichever of -3 / -6 was promoted is paused next; the other one is
        # the node expected to win the second election.
        if {[s -3 role] eq "master"} {
            set promoted -3
            set next_candidate -6
        } else {
            set promoted -6
            set next_candidate -3
        }
        set second_paused_pid [srv $promoted pid]
        pause_process $second_paused_pid
    }

    test "Cluster should eventually be up again" {
        # Only nodes whose process is not paused are polled.
        for {set idx 0} {$idx < [llength $::servers]} {incr idx} {
            if {![process_is_paused [srv -$idx pid]]} {
                wait_for_condition 1000 50 {
                    [CI $idx cluster_state] eq "ok"
                } else {
                    fail "Cluster node $idx cluster_state:[CI $idx cluster_state]"
                }
            }
        }
    }

    test "wait for new failover" {
        wait_for_condition 1000 50 {
            [s $next_candidate role] eq "master"
        } else {
            fail "No failover detected"
        }
    }

    test "Restarting the previously killed primary nodes" {
        resume_process $first_paused_pid
        resume_process $second_paused_pid
    }

    test "Make sure there is no failover timeout" {
        verify_no_log_message -3 "*Failover attempt expired*" 0
        verify_no_log_message -6 "*Failover attempt expired*" 0
    }
} ;# start_cluster
|
|
|
|
# Scenario: pause nodes 0, -1 and -2 together and expect nodes -7, -8 and -9
# to all report the "master" role, without any of them logging an election for
# epoch 0 or an expired failover attempt.
start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} {
    # NOTE(review): "then" in the test name below is presumably meant to be
    # "when"; the string is left unchanged because it is the test's identifier.
    test "Primaries will not time out then they are elected in the same epoch" {
        # Because of the election delay these nodes may not start their
        # elections at the same time (same epoch). If they do, there must
        # still be no failover timeout.

        # Killing these primary nodes.
        foreach primary {0 -1 -2} {
            pause_process [srv $primary pid]
        }

        # Wait for the failover.
        wait_for_condition 1000 50 {
            [s -7 role] eq "master" &&
            [s -8 role] eq "master" &&
            [s -9 role] eq "master"
        } else {
            fail "No failover detected"
        }

        # Make sure there is no false epoch 0.
        foreach replica {-7 -8 -9} {
            verify_no_log_message $replica "*Failover election in progress for epoch 0*" 0
        }

        # Make sure there is no failover timeout.
        foreach replica {-7 -8 -9} {
            verify_no_log_message $replica "*Failover attempt expired*" 0
        }

        # Resume the paused primary nodes to speed up the shutdown.
        foreach primary {0 -1 -2} {
            resume_process [srv $primary pid]
        }
    }
} ;# start_cluster
|
|
|
|
# Scenario: pause the nodes at indices 0 through -14 at once, wait for node
# -40 to report the "master" role, and then require every node that is not
# paused to report cluster_state "ok".
# NOTE(review): run_solo presumably keeps this large cluster from running
# alongside other tests -- confirm against the test framework.
run_solo {cluster} {
    start_cluster 32 15 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 15000}} {
        test "Multiple primary nodes are down, rank them based on the failed primary" {
            # Number of nodes, counted from index 0, that get paused.
            set paused_count 15

            # Killing these primary nodes.
            for {set idx 0} {$idx < $paused_count} {incr idx} {
                pause_process [srv -$idx pid]
            }

            # Make sure that a node starts failover.
            wait_for_condition 1000 100 {
                [s -40 role] eq "master"
            } else {
                fail "No failover detected"
            }

            # Wait for the cluster state to become ok on every node whose
            # process is not paused.
            for {set idx 0} {$idx < [llength $::servers]} {incr idx} {
                if {![process_is_paused [srv -$idx pid]]} {
                    wait_for_condition 1000 100 {
                        [CI $idx cluster_state] eq "ok"
                    } else {
                        fail "Cluster node $idx cluster_state:[CI $idx cluster_state]"
                    }
                }
            }

            # Resume the paused primary nodes to speed up the shutdown.
            for {set idx 0} {$idx < $paused_count} {incr idx} {
                resume_process [srv -$idx pid]
            }
        }
    } ;# start_cluster
} ;# run_solo
|