futriix/tests/unit/cluster/cluster-multiple-meets.tcl
Sankar eff45f5467
Fix flakiness of cluster-multiple-meets and cluster-reliable-meet (#728)
Tests in cluster-multiple-meets were flaky as reported by @madolson 

*
https://github.com/valkey-io/valkey/actions/runs/9688455588/job/26776953320
*
https://github.com/valkey-io/valkey/actions/runs/9688455588/job/26776953585

I wasn't able to reproduce this locally, but I suspect that the
flakiness is coming from the fact that nodes are reported as "connected"
as long as there is an outgoing link. An outgoing link is created before
MEET is sent out.

Signed-off-by: Sankar <1890648+srgsanky@users.noreply.github.com>
2024-07-01 22:27:38 -07:00

90 lines
3.5 KiB
Tcl

# make sure the test infra won't use SELECT
set old_singledb $::singledb
set ::singledb 1
tags {tls:skip external:skip cluster} {
set base_conf [list cluster-enabled yes]
start_multiple_servers 2 [list overrides $base_conf] {
test "Cluster nodes are reachable" {
for {set id 0} {$id < [llength $::servers]} {incr id} {
# Every node should be reachable.
wait_for_condition 1000 50 {
([catch {R $id ping} ping_reply] == 0) &&
($ping_reply eq {PONG})
} else {
catch {R $id ping} err
fail "Node #$id keeps replying '$err' to PING."
}
}
}
test "Before slots allocation, all nodes report cluster failure" {
wait_for_cluster_state fail
}
set CLUSTER_PACKET_TYPE_PONG 1
set CLUSTER_PACKET_TYPE_NONE -1
test "Cluster nodes haven't met each other" {
assert {[llength [get_cluster_nodes 1]] == 1}
assert {[llength [get_cluster_nodes 0]] == 1}
}
test "Allocate slots" {
cluster_allocate_slots 2 0;# primaries replicas
}
test "Multiple MEETs from Node 1 to Node 0 should work" {
# Make 1 drop the PONG responses to MEET
R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_PONG
# It is important to close the connection on drop, otherwise a subsequent MEET won't be sent
R 1 DEBUG CLOSE-CLUSTER-LINK-ON-PACKET-DROP 1
R 1 CLUSTER MEET 127.0.0.1 [srv 0 port]
# Wait for at least a few MEETs to be sent so that we are sure that 1 is dropping the response to MEET.
wait_for_condition 1000 50 {
[CI 0 cluster_stats_messages_meet_received] > 1 &&
[CI 1 cluster_state] eq {fail} && [CI 0 cluster_state] eq {ok}
} else {
fail "Cluster node 1 never sent multiple MEETs to 0"
}
# 0 will be connected to 1, but 1 won't see that 0 is connected
# Using a wait condition here as an assert can be flaky - especially
# when cluster nodes is processed when the link is established to send MEET.
wait_for_condition 1000 50 {
[llength [get_cluster_nodes 1 connected]] == 1
} else {
fail "Node 1 recognizes node 0 even though it drops PONGs from node 0"
}
assert {[llength [get_cluster_nodes 0 connected]] == 2}
# Drop incoming and outgoing links from/to 1
R 0 DEBUG CLUSTERLINK KILL ALL [R 1 CLUSTER MYID]
# Wait for 0 to know about 1 again after 1 sends a MEET
wait_for_condition 1000 50 {
[llength [get_cluster_nodes 0 connected]] == 2
} else {
fail "Cluster node 1 never sent multiple MEETs to 0"
}
# Undo packet drop
R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE
R 1 DEBUG CLOSE-CLUSTER-LINK-ON-PACKET-DROP 0
# Both a and b will turn to cluster state ok
wait_for_condition 1000 50 {
[CI 1 cluster_state] eq {ok} && [CI 0 cluster_state] eq {ok} &&
[CI 1 cluster_stats_messages_meet_sent] == [CI 0 cluster_stats_messages_meet_received]
} else {
fail "1 cluster_state:[CI 1 cluster_state], 0 cluster_state: [CI 0 cluster_state]"
}
}
} ;# stop servers
} ;# tags
set ::singledb $old_singledb