
Imagine we have a three-shard cluster. If the primary of shard 1 does a CLUSTER RESET HARD, its node name changes, and the other nodes mark it as NOADDR because the sender ID in its PONG no longer matches. From the other nodes' point of view, shard 1 still has a working primary, but one with no address, so the address reported in MOVED replies is invalid and confuses clients. At the same time, the replica will not fail over because its primary is not in the FAIL state, and the cluster looks OK to everyone. This leaves a cluster that appears healthy but has no coverage for shard 1. Obviously the operator should do something like CLUSTER FORGET to remove the node and repair the cluster before using it, but the point here is that we can mark the NOADDR node as FAIL to advance the cluster state. A NOADDR node has no valid address: we will not reconnect to it, we will not send it PING, and we will not gossip it, so it seems reasonable to mark it as FAIL.

Signed-off-by: Binbin <binloveplay1314@qq.com>
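For illustration only, here is a small, self-contained C sketch of the idea. It is not the actual cluster.c change from this PR: the struct, the flag constants, and the markNoaddrNodeAsFailed helper below are made-up names, and only the rule they express comes from the description above (a node that carries NOADDR but not FAIL is escalated straight to FAIL, so the rest of the cluster can converge to cluster_state:fail and the replica is free to fail over once failover is allowed).

/* Illustrative sketch only, not the real Redis/Valkey cluster code. */
#include <stdio.h>

#define NODE_FLAG_NOADDR (1 << 0)   /* node address is unknown */
#define NODE_FLAG_PFAIL  (1 << 1)   /* possible failure (local suspicion) */
#define NODE_FLAG_FAIL   (1 << 2)   /* failure agreed by the cluster */

typedef struct clusterNodeSketch {
    char name[41];                  /* 40-char node ID plus terminator */
    int flags;
} clusterNodeSketch;

/* Hypothetical helper: a NOADDR node cannot be pinged, reconnected,
 * or gossiped, so escalate it directly to FAIL. */
static void markNoaddrNodeAsFailed(clusterNodeSketch *node) {
    if ((node->flags & NODE_FLAG_NOADDR) && !(node->flags & NODE_FLAG_FAIL)) {
        node->flags &= ~NODE_FLAG_PFAIL;
        node->flags |= NODE_FLAG_FAIL;
        printf("Marking node %.40s as failing (no address)\n", node->name);
    }
}

int main(void) {
    clusterNodeSketch n = {"1a2b3c-example-node-id", NODE_FLAG_NOADDR};
    markNoaddrNodeAsFailed(&n);     /* node now carries both NOADDR and FAIL */
    return 0;
}

With the node in FAIL, other nodes report cluster_state:fail instead of handing out a MOVED that points at a dead address, which is exactly what the test below asserts.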
start_cluster 3 3 {tags {external:skip cluster} overrides {cluster-replica-no-failover yes}} {
    test "NOADDR nodes will be marked as FAIL" {
        set primary0_id [R 0 CLUSTER MYID]

        # R 0 is a primary; after a CLUSTER RESET HARD its node name changes,
        # so the other nodes mark it as NOADDR and then FAIL.
        R 0 cluster reset hard
        wait_for_log_messages -1 {"*PONG contains mismatching sender ID*"} 0 1000 10
        wait_for_log_messages -2 {"*PONG contains mismatching sender ID*"} 0 1000 10
        wait_for_condition 1000 10 {
            [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] noaddr] eq 1 &&
            [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] fail] eq 1 &&
            [cluster_has_flag [cluster_get_node_by_id 2 $primary0_id] noaddr] eq 1 &&
            [cluster_has_flag [cluster_get_node_by_id 2 $primary0_id] fail] eq 1
        } else {
            fail "The node is not marked with the correct flag"
        }

        # Since we also set the node to FAIL, the cluster will eventually go down.
        wait_for_condition 1000 50 {
            [CI 1 cluster_state] eq {fail} &&
            [CI 2 cluster_state] eq {fail} &&
            [CI 3 cluster_state] eq {fail} &&
            [CI 4 cluster_state] eq {fail} &&
            [CI 5 cluster_state] eq {fail}
        } else {
            fail "Cluster doesn't fail"
        }

        # key_977613 belongs to slot 0, which is served by R 0.
        # Make sure we get a CLUSTERDOWN error instead of an invalid MOVED.
        assert_error {CLUSTERDOWN*} {R 1 set key_977613 bar}

        # Let replica 3 perform the failover.
        R 3 config set cluster-replica-validity-factor 0
        R 3 config set cluster-replica-no-failover no
        wait_for_condition 1000 50 {
            [CI 1 cluster_state] eq {ok} &&
            [CI 2 cluster_state] eq {ok} &&
            [CI 3 cluster_state] eq {ok} &&
            [CI 4 cluster_state] eq {ok} &&
            [CI 5 cluster_state] eq {ok}
        } else {
            fail "Cluster doesn't stabilize"
        }
    }
} ;# start_cluster