From 73cf0243df5063aa9167c3fc3049d54b2772e29b Mon Sep 17 00:00:00 2001 From: Madelyn Olson <34459052+madolson@users.noreply.github.com> Date: Tue, 20 Jun 2023 18:00:55 -0700 Subject: [PATCH] Make nodename test more consistent (#12330) To determine when everything was stable, we couldn't just query the nodename since they aren't API visible by design. Instead, we were using a proxy piece of information which was bumping the epoch and waiting for everyone to observe that. This works for making source Node 0 and Node 1 had pinged, and Node 0 and Node 2 had pinged, but did not guarantee that Node 1 and Node 2 had pinged. Although unlikely, this can cause this failure message. To fix it I hijacked hostnames and used its validation that it has been propagated, since we know that it is stable. I also noticed while stress testing this sometimes the test took almost 4.5 seconds to finish, which is really close to the current 5 second limit of the log check, so I bumped that up as well just to make it a bit more consistent. --- tests/support/cluster_util.tcl | 15 +++++++++++++++ tests/unit/cluster/hostnames.tcl | 15 --------------- tests/unit/cluster/human-announced-nodename.tcl | 17 ++++++++--------- 3 files changed, 23 insertions(+), 24 deletions(-) diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index 13a6b05d6..7b1a7b5c8 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -175,3 +175,18 @@ proc isolate_node {id} { fail "CLUSTER FORGET was not propagated to all nodes" } } + +# Check if cluster's view of hostnames is consistent +proc are_hostnames_propagated {match_string} { + for {set j 0} {$j < [llength $::servers]} {incr j} { + set cfg [R $j cluster slots] + foreach node $cfg { + for {set i 2} {$i < [llength $node]} {incr i} { + if {! [string match $match_string [lindex [lindex [lindex $node $i] 3] 1]] } { + return 0 + } + } + } + } + return 1 +} diff --git a/tests/unit/cluster/hostnames.tcl b/tests/unit/cluster/hostnames.tcl index 42024587e..f31824062 100644 --- a/tests/unit/cluster/hostnames.tcl +++ b/tests/unit/cluster/hostnames.tcl @@ -1,18 +1,3 @@ -# Check if cluster's view of hostnames is consistent -proc are_hostnames_propagated {match_string} { - for {set j 0} {$j < [llength $::servers]} {incr j} { - set cfg [R $j cluster slots] - foreach node $cfg { - for {set i 2} {$i < [llength $node]} {incr i} { - if {! [string match $match_string [lindex [lindex [lindex $node $i] 3] 1]] } { - return 0 - } - } - } - } - return 1 -} - proc get_slot_field {slot_output shard_id node_id attrib_id} { return [lindex [lindex [lindex $slot_output $shard_id] $node_id] $attrib_id] } diff --git a/tests/unit/cluster/human-announced-nodename.tcl b/tests/unit/cluster/human-announced-nodename.tcl index 5db598238..a595ca632 100644 --- a/tests/unit/cluster/human-announced-nodename.tcl +++ b/tests/unit/cluster/human-announced-nodename.tcl @@ -2,17 +2,16 @@ start_cluster 3 0 {tags {external:skip cluster}} { test "Set cluster human announced nodename and let it propagate" { for {set j 0} {$j < [llength $::servers]} {incr j} { + R $j config set cluster-announce-hostname "host-$j.com" R $j config set cluster-announce-human-nodename "nodename-$j" } - # We wait for everyone to agree on the epoch bump, which means everyone - # has exchanged messages so they know about the nodenames. - R 0 CLUSTER BUMPEPOCH - wait_for_condition 1000 50 { - [CI 0 cluster_current_epoch] == [CI 1 cluster_current_epoch] && - [CI 0 cluster_current_epoch] == [CI 2 cluster_current_epoch] + # We wait for everyone to agree on the hostnames. Since they are gossiped + # the same way as nodenames, it implies everyone knows the nodenames too. + wait_for_condition 50 100 { + [are_hostnames_propagated "host-*.com"] eq 1 } else { - fail "Cluster did not converge" + fail "cluster hostnames were not propagated" } } @@ -22,8 +21,8 @@ start_cluster 3 0 {tags {external:skip cluster}} { # We're going to use a message we will know will be sent, node unreachable, # since it includes the other node gossiping. - wait_for_log_messages -1 {"*Node * (nodename-2) reported node * (nodename-0) as not reachable*"} 0 10 500 - wait_for_log_messages -2 {"*Node * (nodename-1) reported node * (nodename-0) as not reachable*"} 0 10 500 + wait_for_log_messages -1 {"*Node * (nodename-2) reported node * (nodename-0) as not reachable*"} 0 20 500 + wait_for_log_messages -2 {"*Node * (nodename-1) reported node * (nodename-0) as not reachable*"} 0 20 500 resume_process [srv 0 pid] }