When one shard, sole primary node marks potentially failed replica as FAIL instead of PFAIL (#12824)
Fixes issue where a single primary cannot mark a replica as failed in a single-shard cluster.
This commit is contained in:
parent b351a04b1e
commit b3aaa0a136
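For context on why the change works: promoting a node from PFAIL to FAIL requires a quorum of primaries agreeing that it is unreachable, and with a single primary there is nobody else to gossip failure reports, so previously the replica stayed stuck in PFAIL. The sketch below is a simplified, assumed reconstruction of that quorum check, not the verbatim source: markNodeAsFailingIfNeeded, clusterNodeIsMaster and server.cluster->size appear in the diff, but the body and the helper names (nodeTimedOut, nodeFailed, clusterNodeFailureReportsCount, clusterSendFail) are illustrative assumptions.

/* Simplified sketch (assumed, not verbatim source) of the quorum check
 * that decides whether a PFAIL node is escalated to FAIL. */
void markNodeAsFailingIfNeeded(clusterNode *node) {
    int failures;
    /* A majority of primaries must agree; with server.cluster->size == 1
     * the needed quorum is 1. */
    int needed_quorum = (server.cluster->size / 2) + 1;

    if (!nodeTimedOut(node)) return;   /* We must see it as PFAIL ourselves. */
    if (nodeFailed(node)) return;      /* Already FAIL, nothing to do. */

    failures = clusterNodeFailureReportsCount(node);
    /* A primary counts its own PFAIL judgment as one vote, so in a
     * single-primary cluster this alone reaches the quorum of 1. */
    if (clusterNodeIsMaster(myself)) failures++;
    if (failures < needed_quorum) return;

    /* Quorum reached: flip PFAIL to FAIL and broadcast the state change. */
    node->flags &= ~CLUSTER_NODE_PFAIL;
    node->flags |= CLUSTER_NODE_FAIL;
    node->fail_time = mstime();
    clusterSendFail(node->name);
}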
@@ -959,7 +959,7 @@ void clusterInit(void) {
     server.cluster->myself = NULL;
     server.cluster->currentEpoch = 0;
     server.cluster->state = CLUSTER_FAIL;
-    server.cluster->size = 1;
+    server.cluster->size = 0;
     server.cluster->todo_before_sleep = 0;
     server.cluster->nodes = dictCreate(&clusterNodesDictType);
     server.cluster->shards = dictCreate(&clusterSdsToListType);
@@ -4691,10 +4691,13 @@ void clusterCron(void) {
             /* Timeout reached. Set the node as possibly failing if it is
              * not already in this state. */
             if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) {
-                serverLog(LL_DEBUG,"*** NODE %.40s possibly failing",
-                    node->name);
                 node->flags |= CLUSTER_NODE_PFAIL;
                 update_state = 1;
+                if (clusterNodeIsMaster(myself) && server.cluster->size == 1) {
+                    markNodeAsFailingIfNeeded(node);
+                } else {
+                    serverLog(LL_DEBUG,"*** NODE %.40s possibly failing", node->name);
+                }
             }
         }
     }
@@ -199,3 +199,30 @@ proc are_hostnames_propagated {match_string} {
     }
     return 1
 }
+
+proc wait_node_marked_fail {ref_node_index instance_id_to_check} {
+    wait_for_condition 1000 50 {
+        [check_cluster_node_mark fail $ref_node_index $instance_id_to_check]
+    } else {
+        fail "Replica node never marked as FAIL ('fail')"
+    }
+}
+
+proc wait_node_marked_pfail {ref_node_index instance_id_to_check} {
+    wait_for_condition 1000 50 {
+        [check_cluster_node_mark fail\? $ref_node_index $instance_id_to_check]
+    } else {
+        fail "Replica node never marked as PFAIL ('fail?')"
+    }
+}
+
+proc check_cluster_node_mark {flag ref_node_index instance_id_to_check} {
+    set nodes [get_cluster_nodes $ref_node_index]
+
+    foreach n $nodes {
+        if {[dict get $n id] eq $instance_id_to_check} {
+            return [cluster_has_flag $n $flag]
+        }
+    }
+    fail "Unable to find instance id in cluster nodes. ID: $instance_id_to_check"
+}
@@ -104,6 +104,7 @@ set ::all_tests {
    unit/cluster/slot-ownership
    unit/cluster/links
    unit/cluster/cluster-response-tls
+   unit/cluster/failure-marking
}
# Index to the next test to run in the ::all_tests list.
set ::next_test 0
tests/unit/cluster/failure-marking.tcl (new file, 53 lines)
@@ -0,0 +1,53 @@
+# Test a single primary can mark replica as `fail`
+start_cluster 1 1 {tags {external:skip cluster}} {
+
+    test "Verify that single primary marks replica as failed" {
+        set primary [srv -0 client]
+
+        set replica1 [srv -1 client]
+        set replica1_pid [srv -1 pid]
+        set replica1_instance_id [dict get [cluster_get_myself 1] id]
+
+        assert {[lindex [$primary role] 0] eq {master}}
+        assert {[lindex [$replica1 role] 0] eq {slave}}
+
+        wait_for_sync $replica1
+
+        pause_process $replica1_pid
+
+        wait_node_marked_fail 0 $replica1_instance_id
+    }
+}
+
+# Test multiple primaries wait for a quorum and then mark a replica as `fail`
+start_cluster 2 1 {tags {external:skip cluster}} {
+
+    test "Verify that multiple primaries mark replica as failed" {
+        set primary1 [srv -0 client]
+
+        set primary2 [srv -1 client]
+        set primary2_pid [srv -1 pid]
+
+        set replica1 [srv -2 client]
+        set replica1_pid [srv -2 pid]
+        set replica1_instance_id [dict get [cluster_get_myself 2] id]
+
+        assert {[lindex [$primary1 role] 0] eq {master}}
+        assert {[lindex [$primary2 role] 0] eq {master}}
+        assert {[lindex [$replica1 role] 0] eq {slave}}
+
+        wait_for_sync $replica1
+
+        pause_process $replica1_pid
+
+        # Pause other primary to allow time for pfail flag to appear
+        pause_process $primary2_pid
+
+        wait_node_marked_pfail 0 $replica1_instance_id
+
+        # Resume other primary and wait for it to show replica as failed
+        resume_process $primary2_pid
+
+        wait_node_marked_fail 0 $replica1_instance_id
+    }
+}