futriix/tests/unit/cluster/cluster-multiple-meets.tcl
Pierre 5f7fe9ef21
Send MEET packet to node if there is no inbound link to fix inconsistency when handshake timed out (#1307)
In some cases, when meeting a new node, if the handshake times out, we
can end up with an inconsistent view of the cluster where the new node
knows about all the nodes in the cluster, but the cluster does not know
about this new node (or vice versa).
To detect this inconsistency, we now check whether a node has an outbound
link but no inbound link; if so, that node probably does not know us.
In that case we (re-)send a MEET packet to the node to perform a new
handshake with it.
If we receive a MEET packet from a known node, we disconnect the
outbound link to force a reconnect and sending of a PING packet so that
the other node recognizes the link as belonging to us. This prevents
cases where a node could send MEET packets in a loop because it thinks
the other node does not have an inbound link.

This fixes the bug described in #1251.

---------

Signed-off-by: Pierre Turin <pieturin@amazon.com>
2024-12-11 17:26:06 -08:00

92 lines
3.7 KiB
Tcl

# Two-node cluster test: node 1 is told to drop PONG packets (and close the
# link on every drop) while it MEETs node 0, so node 0 receives several MEETs.
# The test then kills node 0's links to node 1, waits for node 0 to see node 1
# as connected again, removes the packet filter and expects both nodes to
# converge to cluster state "ok".

# make sure the test infra won't use SELECT
set old_singledb $::singledb
set ::singledb 1

tags {tls:skip external:skip cluster} {
    set base_conf [list cluster-enabled yes]
    start_multiple_servers 2 [list overrides $base_conf] {
        test "Cluster nodes are reachable" {
            for {set id 0} {$id < [llength $::servers]} {incr id} {
                # Every node should be reachable.
                wait_for_condition 1000 50 {
                    ([catch {R $id ping} ping_reply] == 0) &&
                    ($ping_reply eq {PONG})
                } else {
                    catch {R $id ping} err
                    fail "Node #$id keeps replying '$err' to PING."
                }
            }
        }

        test "Before slots allocation, all nodes report cluster failure" {
            wait_for_cluster_state fail
        }

        # Arguments for DEBUG DROP-CLUSTER-PACKET-FILTER below: the PONG value
        # selects the packets to drop, the NONE value is used to undo the drop.
        # NOTE(review): presumably these mirror the server's packet type
        # numbering - confirm against the server source.
        set CLUSTER_PACKET_TYPE_PONG 1
        set CLUSTER_PACKET_TYPE_NONE -1

        test "Cluster nodes haven't met each other" {
            # Each node's node list has exactly one entry at this point.
            assert {[llength [get_cluster_nodes 1]] == 1}
            assert {[llength [get_cluster_nodes 0]] == 1}
        }

        test "Allocate slots" {
            cluster_allocate_slots 2 0 ;# primaries replicas
        }

        test "Multiple MEETs from Node 1 to Node 0 should work" {
            # Make 1 drop the PONG responses to MEET
            R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_PONG
            # It is important to close the connection on drop, otherwise a subsequent MEET won't be sent
            R 1 DEBUG CLOSE-CLUSTER-LINK-ON-PACKET-DROP 1

            R 1 CLUSTER MEET 127.0.0.1 [srv 0 port]

            # Wait for at least a few MEETs to be sent so that we are sure that 1 is dropping the response to MEET.
            wait_for_condition 1000 50 {
                [CI 0 cluster_stats_messages_meet_received] > 1 &&
                [CI 1 cluster_state] eq {fail} && [CI 0 cluster_state] eq {ok}
            } else {
                fail "Cluster node 1 never sent multiple MEETs to 0"
            }

            # 0 will be connected to 1, but 1 won't see that 0 is connected.
            # Use a wait condition rather than a plain assert, which can be
            # flaky - especially if CLUSTER NODES is processed while the link
            # used to send MEET is established.
            wait_for_condition 1000 50 {
                [llength [get_cluster_nodes 1 connected]] == 1
            } else {
                fail "Node 1 recognizes node 0 even though it drops PONGs from node 0"
            }
            assert {[llength [get_cluster_nodes 0]] == 2}

            # Drop incoming and outgoing links from/to 1
            R 0 DEBUG CLUSTERLINK KILL ALL [R 1 CLUSTER MYID]

            # Wait for 0 to know about 1 again after 1 sends a MEET
            wait_for_condition 1000 50 {
                [llength [get_cluster_nodes 0 connected]] == 2
            } else {
                # Distinct message: this step is about node 0 seeing node 1 as
                # connected again, not about the number of MEETs sent.
                fail "Node 0 never recognized node 1 again after its cluster links were killed"
            }

            # Undo packet drop
            R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE
            R 1 DEBUG CLOSE-CLUSTER-LINK-ON-PACKET-DROP 0

            # Both nodes 0 and 1 will turn to cluster state ok, see each other
            # as connected, and agree on the number of MEETs exchanged.
            wait_for_condition 1000 50 {
                [CI 1 cluster_state] eq {ok} && [CI 0 cluster_state] eq {ok} &&
                [llength [get_cluster_nodes 0 connected]] == 2 &&
                [llength [get_cluster_nodes 1 connected]] == 2 &&
                [CI 1 cluster_stats_messages_meet_sent] == [CI 0 cluster_stats_messages_meet_received]
            } else {
                # Report every value the condition checks so a failure shows
                # which clause did not hold. Each backslash-newline inside the
                # quoted string collapses to a single space.
                fail "1 cluster_state:[CI 1 cluster_state], 0 cluster_state: [CI 0 cluster_state],\
                      1 connected nodes: [llength [get_cluster_nodes 1 connected]],\
                      0 connected nodes: [llength [get_cluster_nodes 0 connected]],\
                      1 meet_sent: [CI 1 cluster_stats_messages_meet_sent],\
                      0 meet_received: [CI 0 cluster_stats_messages_meet_received]"
            }
        }
    } ;# stop servers
} ;# tags

# Restore the test infra setting saved at the top of the file.
set ::singledb $old_singledb