2024-06-16 20:37:09 -07:00
|
|
|
# Keep the test infrastructure from issuing SELECT: force single-db mode while
# the tests below run, and put the previous setting back once they are done.
set old_singledb $::singledb
set ::singledb 1

tags {tls:skip external:skip cluster} {
    # Packet-type codes; the test below hands them to
    # DEBUG DROP-CLUSTER-PACKET-FILTER.
    set CLUSTER_PACKET_TYPE_PING 0
    set CLUSTER_PACKET_TYPE_PONG 1
    set CLUSTER_PACKET_TYPE_MEET 2
    set CLUSTER_PACKET_TYPE_NONE -1
    set CLUSTER_PACKET_TYPE_ALL -2

    set base_conf [list cluster-enabled yes]

    start_multiple_servers 2 [list overrides $base_conf] {

        test "Cluster nodes are reachable" {
            # Poll every server until it answers PING with PONG.
            for {set id 0} {$id < [llength $::servers]} {incr id} {
                wait_for_condition 1000 50 {
                    ![catch {R $id ping} reply] && $reply eq {PONG}
                } else {
                    # Grab the latest reply (or error) for the failure report.
                    catch {R $id ping} err
                    fail "Node #$id keeps replying '$err' to PING."
                }
            }
        }

        test "Before slots allocation, all nodes report cluster failure" {
            wait_for_cluster_state fail
        }

        test "Cluster nodes haven't met each other" {
            # Each node's node list must contain exactly one entry.
            assert {[llength [get_cluster_nodes 1]] == 1}
            assert {[llength [get_cluster_nodes 0]] == 1}
        }

        test "Allocate slots" {
            cluster_allocate_slots 2 0
        }

        test "MEET is reliable when target drops the initial MEETs" {
            # Simulate a link failure: node 0 drops the first MEET packets.
            R 0 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_MEET
            R 0 DEBUG CLOSE-CLUSTER-LINK-ON-PACKET-DROP 1

            R 1 CLUSTER MEET 127.0.0.1 [srv 0 port]

            # Let several MEETs arrive at node 0 so we know for sure that it
            # is dropping them.
            wait_for_condition 1000 50 {
                [CI 0 cluster_stats_messages_meet_received] >= 3
            } else {
                fail "Cluster node 1 never sent multiple MEETs to 0"
            }

            # The nodes must still be unaware of each other.
            # A wait condition is used instead of a plain assert because the
            # assert can be flaky, notably when the node list is produced
            # while the link for sending MEET is being established.
            wait_for_condition 1000 50 {
                [llength [get_cluster_nodes 1 connected]] == 1
            } else {
                fail "Node 1 recognizes node 0 even though node 0 drops MEETs from node 1"
            }
            assert {[llength [get_cluster_nodes 0 connected]] == 1}

            # Stop dropping packets on node 0.
            R 0 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE

            # With a reliable MEET both nodes end up in cluster state "ok",
            # and every MEET sent by node 1 was received by node 0.
            wait_for_condition 1000 50 {
                [CI 1 cluster_state] eq {ok} && [CI 0 cluster_state] eq {ok} &&
                [CI 0 cluster_stats_messages_meet_received] >= 4 &&
                [CI 1 cluster_stats_messages_meet_sent] == [CI 0 cluster_stats_messages_meet_received]
            } else {
                fail "Unexpected cluster state: node 1 cluster_state:[CI 1 cluster_state], node 0 cluster_state: [CI 0 cluster_state]"
            }
        }
    } ;# stop servers
} ;# tags

set ::singledb $old_singledb
|
|
|
|
|
2024-12-11 17:26:06 -08:00
|
|
|
# Scan the node list reported by node $id and return the "id" field of the
# first entry carrying the "handshake" flag. Returns an empty string when no
# entry has that flag.
proc cluster_get_first_node_in_handshake id {
    foreach entry [get_cluster_nodes $id] {
        if {![cluster_has_flag $entry handshake]} {
            continue
        }
        return [dict get $entry id]
    }
    return {}
}
|
|
|
|
|
|
|
|
# Return 1 when the first $num_nodes nodes (indexes 0 .. num_nodes-1) all know
# each other, 0 otherwise.
#
# Two conditions must hold:
#   1. Every node can look up the ID of every node (itself included) through
#      cluster_get_node_by_id.
#   2. Every node reports exactly 2 * (num_nodes - 1) entries in CLUSTER LINKS
#      (presumably one inbound and one outbound link per peer).
#
# Arguments:
#   num_nodes - number of nodes to check.
proc cluster_nodes_all_know_each_other {num_nodes} {
    # Collect node IDs dynamically
    set node_ids {}
    for {set i 0} {$i < $num_nodes} {incr i} {
        lappend node_ids [dict get [get_myself $i] id]
    }

    # Check if all nodes know each other: each node index must be able to
    # resolve each collected ID. A single pass over (ID, node index) pairs is
    # enough; repeating the pass once per node would only redo the same
    # lookups.
    foreach check_node_id $node_ids {
        for {set node_index 0} {$node_index < $num_nodes} {incr node_index} {
            if {[cluster_get_node_by_id $node_index $check_node_id] eq {}} {
                return 0
            }
        }
    }

    # Verify cluster link counts for each node
    set expected_links [expr {2 * ($num_nodes - 1)}]
    for {set i 0} {$i < $num_nodes} {incr i} {
        if {[llength [R $i CLUSTER LINKS]] != $expected_links} {
            return 0
        }
    }

    return 1
}
|
|
|
|
|
|
|
|
start_cluster 2 0 {tags {external:skip cluster} overrides {cluster-node-timeout 4000 cluster-replica-no-failover yes}} {
    # Packet-type codes; the test below hands them to
    # DEBUG DROP-CLUSTER-PACKET-FILTER.
    set CLUSTER_PACKET_TYPE_PING 0
    set CLUSTER_PACKET_TYPE_PONG 1
    set CLUSTER_PACKET_TYPE_MEET 2
    set CLUSTER_PACKET_TYPE_NONE -1
    set CLUSTER_PACKET_TYPE_ALL -2

    test "Handshake eventually succeeds after node handshake timeout on both sides with inconsistent view of the cluster" {
        set cluster_port [find_available_port $::baseport $::portcount]
        start_server [list overrides [list cluster-enabled yes cluster-node-timeout 4000 cluster-port $cluster_port]] {
            # Scenario: the handshake times out on BOTH sides.
            # Node 1 and Node 2 already know each other; Node 1 is then told
            # to meet Node 0:
            #
            #   Node 1 -- MEET -> Node 0   [Node 0 might learn about Node 2 from the gossip section of the msg]
            #   Node 1 <- PONG -- Node 0   [dropped, so Node 1 will eventually mark the handshake as timed out]
            #   Node 1 <- PING -- Node 0   [dropped, so Node 1 never sends a PONG and Node 0 will eventually
            #                               mark the handshake as timed out]
            #
            # Once the handshake has timed out, all cluster bus messages are
            # let through again. Node 0 should then eventually send a MEET
            # packet to the other nodes and complete the handshake.

            set node0_id [dict get [get_myself 0] id]
            set node1_id [dict get [get_myself 1] id]
            set node2_id [dict get [get_myself 2] id]

            # Node 1 drops every cluster bus message.
            R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_ALL
            # Node 2 drops MEET only, so Node 0 cannot start a handshake with it.
            R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_MEET

            R 1 CLUSTER MEET [srv 0 host] [srv 0 port] $cluster_port

            # Node 0 must show a node in handshake state.
            wait_for_condition 10 400 {
                [cluster_get_first_node_in_handshake 0] ne {}
            } else {
                fail "Node 0 never entered handshake state"
            }

            # Node 0 has to learn about Node 2 via the gossip section of the
            # MEET message. As long as it has not, Node 1's MEET did not
            # gossip about Node 2, so kill the link from Node 1 to Node 0 to
            # force a reconnect and a re-send of the MEET. Give up after 10
            # attempts.
            for {set attempts 0} {[cluster_get_node_by_id 0 $node2_id] eq {}} {incr attempts} {
                if {$attempts == 10} {
                    error "assertion: Retried to meet Node 0 too many times"
                }
                after 100
                # During the handshake the peer is listed under a randomly
                # generated ID, which has to be looked up first.
                R 1 DEBUG CLUSTERLINK KILL ALL [cluster_get_first_node_in_handshake 1]
            }

            # Node 1's handshake must time out ...
            wait_for_condition 50 100 {
                [cluster_get_first_node_in_handshake 1] eq {}
            } else {
                fail "Node 1 never exited handshake state"
            }

            # ... and so must Node 0's.
            wait_for_condition 50 100 {
                [cluster_get_first_node_in_handshake 0] eq {}
            } else {
                fail "Node 0 never exited handshake state"
            }

            # Now Node 0 knows Node 2 (through gossip), while Node 1 and
            # Node 2 do not know Node 0.
            wait_for_condition 50 100 {
                [cluster_get_node_by_id 0 $node2_id] ne {} &&
                [cluster_get_node_by_id 1 $node0_id] eq {} &&
                [cluster_get_node_by_id 2 $node0_id] eq {}
            } else {
                fail "Unexpected CLUSTER NODES output, nodes 1 & 2 should not know node 0."
            }

            # Let all messages through again.
            R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE
            R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE

            # Node 0 has an outbound link to Node 1 & 2 but no inbound link,
            # so it will send them a MEET packet. The handshake should now
            # complete successfully.
            wait_for_condition 50 200 {
                [cluster_nodes_all_know_each_other 3]
            } else {
                fail "Unexpected CLUSTER NODES output, all nodes should know each other."
            }
        } ;# stop Node 0
    } ;# test
} ;# stop cluster
|
|
|
|
|
|
|
|
start_cluster 2 0 {tags {external:skip cluster} overrides {cluster-node-timeout 4000 cluster-replica-no-failover yes}} {
    # Packet-type codes; the test below hands them to
    # DEBUG DROP-CLUSTER-PACKET-FILTER.
    set CLUSTER_PACKET_TYPE_PING 0
    set CLUSTER_PACKET_TYPE_PONG 1
    set CLUSTER_PACKET_TYPE_MEET 2
    set CLUSTER_PACKET_TYPE_NONE -1
    set CLUSTER_PACKET_TYPE_ALL -2

    test "Handshake eventually succeeds after node handshake timeout on one side with inconsistent view of the cluster" {
        set cluster_port [find_available_port $::baseport $::portcount]
        start_server [list overrides [list cluster-enabled yes cluster-node-timeout 4000 cluster-port $cluster_port]] {
            # In this test we will trigger a handshake timeout on one side of the handshake.
            # Node 1 and 2 already know each other, then we make node 0 meet node 1:
            #
            # Node 0 -- MEET -> Node 1
            # Node 0 <- PONG -- Node 1
            # Node 0 <- PING -- Node 1 [Node 0 will mark the handshake as successful]
            # Node 0 -- PONG -> Node 1 [we drop this message, so node 1 will eventually mark the handshake as timed out]
            #
            # After the handshake is timed out, we allow all cluster bus messages to go through.
            # Eventually Node 0 should send a MEET packet to the other nodes to complete the handshake.

            set node0_id [dict get [get_myself 0] id]
            set node1_id [dict get [get_myself 1] id]
            set node2_id [dict get [get_myself 2] id]

            # Drop PONG messages
            R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_PONG
            # Drop MEET cluster bus messages, so that Node 0 cannot start a handshake with Node 2.
            R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_MEET

            # Node 0 meets node 1
            R 0 CLUSTER MEET [srv -1 host] [srv -1 port]

            # Wait for node 0 to know about node 1
            wait_for_condition 50 100 {
                [cluster_get_node_by_id 0 $node1_id] ne {}
            } else {
                fail "Node 0 never learned about node 1"
            }
            # At this point, node 0 knows about node 1 and might know node 2 if node 1 gossiped about it.
            wait_for_condition 50 100 {
                [cluster_get_first_node_in_handshake 0] eq {}
            } else {
                # This check is about node 0, so the message must name node 0
                # to keep a failure distinguishable from the node 1 check below.
                fail "Node 0 never exited handshake state"
            }
            # At this point, from node 0 point of view, the handshake with node 1 succeeded.

            wait_for_condition 50 100 {
                [cluster_get_first_node_in_handshake 1] eq {}
            } else {
                fail "Node 1 never exited handshake state"
            }
            assert {[cluster_get_node_by_id 1 $node0_id] eq {}}
            # At this point, from node 1 point of view, the handshake with node 0 timed out.

            # Allow all messages
            R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE
            R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE

            # Now Node 0 will send a MEET packet to Node 1 & 2 since it has an outbound link to these nodes but no inbound link.
            # Handshake should now complete successfully.
            wait_for_condition 50 200 {
                [cluster_nodes_all_know_each_other 3]
            } else {
                fail "Unexpected CLUSTER NODES output, all nodes should know each other."
            }
        } ;# stop Node 0
    } ;# test
} ;# stop cluster
|