
In markNodeAsFailingIfNeeded we count needed_quorum and failures. needed_quorum is half of cluster->size plus one, and cluster->size only counts primary nodes that own slots; but when counting failures we did not check whether the reporting primary owns slots. Only primaries that own slots have the right to vote, so this adds a new clusterNodeIsVotingPrimary helper to formalize that concept.

Release notes: bugfix where nodes not in the quorum group might spuriously mark nodes as failed

---------

Signed-off-by: Binbin <binloveplay1314@qq.com>
Co-authored-by: Ping Xie <pingxie@outlook.com>
Signed-off-by: Ping Xie <pingxie@google.com>
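Conceptually, the fix gates both the self-vote and the accepted failure reports on a single predicate. A minimal sketch of the idea follows; internal names are approximate and it leans on the cluster's internal types, so treat it as illustrative rather than the verbatim patch:

    /* Illustrative sketch only: assumes the internal cluster.h types
     * (clusterNode, server.cluster, myself); names are approximate. */
    static inline int clusterNodeIsVotingPrimary(clusterNode *n) {
        /* Only a primary that actually serves slots may vote on failures. */
        return clusterNodeIsPrimary(n) && n->numslots > 0;
    }

    void markNodeAsFailingIfNeeded(clusterNode *node) {
        /* cluster->size counts only primaries with slots, so the quorum must
         * be reached with votes from that same group. */
        int needed_quorum = (server.cluster->size / 2) + 1;
        int failures;

        if (!nodeTimedOut(node)) return; /* We can still reach it. */
        if (nodeFailed(node)) return;    /* Already marked as FAIL. */

        /* Failure reports are only accepted from voting primaries (the actual
         * check sits where the report is added); count my own vote only if I
         * am a voting primary as well. */
        failures = clusterNodeFailureReportsCount(node);
        if (clusterNodeIsVotingPrimary(myself)) failures++;
        if (failures < needed_quorum) return; /* Not enough voting primaries agree. */

        /* ... flip the node to FAIL and broadcast the new state ... */
    }

The third test below observes this from the outside: with a single slot-owning primary, only its report is accepted, so each slotless primary ends up holding exactly one failure report for the paused replica while the slot-owning primary itself holds none.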
# Test a single primary can mark replica as `fail`
start_cluster 1 1 {tags {external:skip cluster}} {

    test "Verify that single primary marks replica as failed" {
        set primary [srv -0 client]

        set replica1 [srv -1 client]
        set replica1_pid [srv -1 pid]
        set replica1_instance_id [dict get [cluster_get_myself 1] id]

        assert {[lindex [$primary role] 0] eq {master}}
        assert {[lindex [$replica1 role] 0] eq {slave}}

        wait_for_sync $replica1

        # Pause the replica; the single slot-owning primary alone satisfies
        # the quorum ((1 / 2) + 1 = 1) and can mark it as fail.
        pause_process $replica1_pid

        wait_node_marked_fail 0 $replica1_instance_id

        resume_process $replica1_pid
    }
}

# Test multiple primaries wait for a quorum and then mark a replica as `fail`
start_cluster 2 1 {tags {external:skip cluster}} {

    test "Verify that multiple primaries mark replica as failed" {
        set primary1 [srv -0 client]

        set primary2 [srv -1 client]
        set primary2_pid [srv -1 pid]

        set replica1 [srv -2 client]
        set replica1_pid [srv -2 pid]
        set replica1_instance_id [dict get [cluster_get_myself 2] id]

        assert {[lindex [$primary1 role] 0] eq {master}}
        assert {[lindex [$primary2 role] 0] eq {master}}
        assert {[lindex [$replica1 role] 0] eq {slave}}

        wait_for_sync $replica1

        pause_process $replica1_pid

        # Pause other primary to allow time for pfail flag to appear
        pause_process $primary2_pid

        wait_node_marked_pfail 0 $replica1_instance_id

        # Resume other primary and wait for it to show replica as failed
        resume_process $primary2_pid

        wait_node_marked_fail 0 $replica1_instance_id

        resume_process $replica1_pid
    }
}

set old_singledb $::singledb
set ::singledb 1

tags {external:skip tls:skip cluster} {
    set base_conf [list cluster-enabled yes cluster-ping-interval 100 cluster-node-timeout 3000 save ""]
    start_multiple_servers 5 [list overrides $base_conf] {
        test "Only primary with slots has the right to mark a node as failed" {
            set primary_host [srv 0 host]
            set primary_port [srv 0 port]
            set primary_pid [srv 0 pid]
            set primary_id [R 0 CLUSTER MYID]
            set replica_id [R 1 CLUSTER MYID]
            set replica_pid [srv -1 pid]

            # Meet other nodes.
            R 1 CLUSTER MEET $primary_host $primary_port
            R 2 CLUSTER MEET $primary_host $primary_port
            R 3 CLUSTER MEET $primary_host $primary_port
            R 4 CLUSTER MEET $primary_host $primary_port

            # Build a single primary cluster.
            cluster_allocate_slots 1 1
            wait_for_cluster_propagation
            R 1 CLUSTER REPLICATE $primary_id
            wait_for_cluster_propagation
            wait_for_cluster_state "ok"

            # Pause the primary, marking the primary as pfail.
            pause_process $primary_pid
            wait_node_marked_pfail 1 $primary_id
            wait_node_marked_pfail 2 $primary_id
            wait_node_marked_pfail 3 $primary_id
            wait_node_marked_pfail 4 $primary_id

            # Pause the replica, marking the replica as pfail.
            pause_process $replica_pid
            wait_node_marked_pfail 2 $replica_id
            wait_node_marked_pfail 3 $replica_id
            wait_node_marked_pfail 4 $replica_id

            # Resume the primary, marking the replica as fail.
            resume_process $primary_pid
            wait_node_marked_fail 0 $replica_id
            wait_node_marked_fail 2 $replica_id
            wait_node_marked_fail 3 $replica_id
            wait_node_marked_fail 4 $replica_id

            # Check if we got the right failure reports.
            wait_for_condition 1000 50 {
                [R 0 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 0 &&
                [R 2 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1 &&
                [R 3 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1 &&
                [R 4 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1
            } else {
                fail "Cluster COUNT-FAILURE-REPORTS is not right."
            }

            resume_process $replica_pid
        }
    }
}

set ::singledb $old_singledb