
After #1009, we will reset the election when we receive a claim with an equal or higher epoch, since a node can win an election in the past. But we need to consider the time before the node actually obtains the failover_auth_epoch. The failover_auth_epoch defaults to 0, so before the node actually gets the failover epoch, we might wrongly reset the election. This is probably harmless, but will produce misleading log output and may delay the election by a cron cycle or a beforesleep. Now we only reset the election when a node actually obtains the failover epoch. Signed-off-by: Binbin <binloveplay1314@qq.com>
105 lines
3.3 KiB
Tcl
105 lines
3.3 KiB
Tcl
# Check the basic monitoring and failover capabilities.

start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} {

test "Cluster is up" {
    wait_for_cluster_state ok
}

test "Cluster is writable" {
    cluster_write_test [srv 0 port]
}

# Pause (rather than kill) the primary so it can be resumed later.
set paused_pid [srv 0 pid]
test "Killing one primary node" {
    pause_process $paused_pid
}

test "Wait for failover" {
    # One of the two replicas of the paused primary should be promoted.
    wait_for_condition 1000 50 {
        [s -3 role] == "master" || [s -6 role] == "master"
    } else {
        fail "No failover detected"
    }
}

test "Killing the new primary node" {
    # Pause whichever replica won, and remember the other one, which
    # should take over in the next failover round.
    if {[s -3 role] == "master"} {
        set replica_to_be_primary -6
        set paused_pid2 [srv -3 pid]
    } else {
        set replica_to_be_primary -3
        set paused_pid2 [srv -6 pid]
    }
    pause_process $paused_pid2
}

test "Cluster should eventually be up again" {
    for {set j 0} {$j < [llength $::servers]} {incr j} {
        # Skip the nodes we paused above.
        if {[process_is_paused [srv -$j pid]]} continue
        wait_for_condition 1000 50 {
            [CI $j cluster_state] eq "ok"
        } else {
            fail "Cluster node $j cluster_state:[CI $j cluster_state]"
        }
    }
}

test "wait for new failover" {
    wait_for_condition 1000 50 {
        [s $replica_to_be_primary role] == "master"
    } else {
        fail "No failover detected"
    }
}

test "Restarting the previously killed primary nodes" {
    resume_process $paused_pid
    resume_process $paused_pid2
}

test "Make sure there is no failover timeout" {
    verify_no_log_message -3 "*Failover attempt expired*" 0
    verify_no_log_message -6 "*Failover attempt expired*" 0
}

} ;# start_cluster
start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} {

test "Primaries will not time out then they are elected in the same epoch" {
    # Because of the random election delay, these nodes may not initiate
    # the election at the same time (same epoch). But if they do, we make
    # sure there is no failover timeout.

    # Killing three primary nodes.
    pause_process [srv 0 pid]
    pause_process [srv -1 pid]
    pause_process [srv -2 pid]

    # Wait for the failover
    wait_for_condition 1000 50 {
        [s -7 role] == "master" &&
        [s -8 role] == "master" &&
        [s -9 role] == "master"
    } else {
        fail "No failover detected"
    }

    # Make sure there is no false epoch 0.
    verify_no_log_message -7 "*Failover election in progress for epoch 0*" 0
    verify_no_log_message -8 "*Failover election in progress for epoch 0*" 0
    verify_no_log_message -9 "*Failover election in progress for epoch 0*" 0

    # Make sure there is no failover timeout.
    verify_no_log_message -7 "*Failover attempt expired*" 0
    verify_no_log_message -8 "*Failover attempt expired*" 0
    verify_no_log_message -9 "*Failover attempt expired*" 0

    # Resuming these primary nodes, speed up the shutdown.
    resume_process [srv 0 pid]
    resume_process [srv -1 pid]
    resume_process [srv -2 pid]
}

} ;# start_cluster