Make nodename test more consistent (#12330)

To determine when everything was stable, we couldn't just query the nodename since they aren't API visible by design. Instead, we were using a proxy piece of information which was bumping the epoch and waiting for everyone to observe that. This works for making source Node 0 and Node 1 had pinged, and Node 0 and Node 2 had pinged, but did not guarantee that Node 1 and Node 2 had pinged. Although unlikely, this can cause this failure message. To fix it I hijacked hostnames and used its validation that it has been propagated, since we know that it is stable. I also noticed while stress testing this sometimes the test took almost 4.5 seconds to finish, which is really close to the current 5 second limit of the log check, so I bumped that up as well just to make it a bit more consistent.
2023-06-20 18:00:55 -07:00 · 2023-06-20 18:00:55 -07:00 · 73cf0243df
commit 73cf0243df
parent 20fa156067
3 changed files with 23 additions and 24 deletions
--- a/tests/support/cluster_util.tcl
+++ b/tests/support/cluster_util.tcl
@ -175,3 +175,18 @@ proc isolate_node {id} {
        fail "CLUSTER FORGET was not propagated to all nodes"
    }
 }
+
+# Check if cluster's view of hostnames is consistent
+proc are_hostnames_propagated {match_string} {
+    for {set j 0} {$j < [llength $::servers]} {incr j} {
+        set cfg [R $j cluster slots]
+        foreach node $cfg {
+            for {set i 2} {$i < [llength $node]} {incr i} {
+                if {! [string match $match_string [lindex [lindex [lindex $node $i] 3] 1]] } {
+                    return 0
+                }
+            }
+        }
+    }
+    return 1
+}
--- a/tests/unit/cluster/hostnames.tcl
+++ b/tests/unit/cluster/hostnames.tcl
@ -1,18 +1,3 @@
-# Check if cluster's view of hostnames is consistent
-proc are_hostnames_propagated {match_string} {
-    for {set j 0} {$j < [llength $::servers]} {incr j} {
-        set cfg [R $j cluster slots]
-        foreach node $cfg {
-            for {set i 2} {$i < [llength $node]} {incr i} {
-                if {! [string match $match_string [lindex [lindex [lindex $node $i] 3] 1]] } {
-                    return 0
-                }
-            }
-        }
-    }
-    return 1
-}
-
 proc get_slot_field {slot_output shard_id node_id attrib_id} {
    return [lindex [lindex [lindex $slot_output $shard_id] $node_id] $attrib_id]
 }
--- a/tests/unit/cluster/human-announced-nodename.tcl
+++ b/tests/unit/cluster/human-announced-nodename.tcl
@ -2,17 +2,16 @@
 start_cluster 3 0 {tags {external:skip cluster}} {
    test "Set cluster human announced nodename and let it propagate" {
        for {set j 0} {$j < [llength $::servers]} {incr j} {
+            R $j config set cluster-announce-hostname "host-$j.com"
            R $j config set cluster-announce-human-nodename "nodename-$j"
        }

-        # We wait for everyone to agree on the epoch bump, which means everyone
-        # has exchanged messages so they know about the nodenames.
-        R 0 CLUSTER BUMPEPOCH
-        wait_for_condition 1000 50 {
-            [CI 0 cluster_current_epoch] == [CI 1 cluster_current_epoch] &&
-            [CI 0 cluster_current_epoch] == [CI 2 cluster_current_epoch]
+        # We wait for everyone to agree on the hostnames. Since they are gossiped
+        # the same way as nodenames, it implies everyone knows the nodenames too.
+        wait_for_condition 50 100 {
+            [are_hostnames_propagated "host-*.com"] eq 1
        } else {
-            fail "Cluster did not converge"
+            fail "cluster hostnames were not propagated"
        }
    }

@ -22,8 +21,8 @@ start_cluster 3 0 {tags {external:skip cluster}} {

        # We're going to use a message we will know will be sent, node unreachable,
        # since it includes the other node gossiping.
-        wait_for_log_messages -1 {"*Node * (nodename-2) reported node * (nodename-0) as not reachable*"} 0 10 500
-        wait_for_log_messages -2 {"*Node * (nodename-1) reported node * (nodename-0) as not reachable*"} 0 10 500
+        wait_for_log_messages -1 {"*Node * (nodename-2) reported node * (nodename-0) as not reachable*"} 0 20 500
+        wait_for_log_messages -2 {"*Node * (nodename-1) reported node * (nodename-0) as not reachable*"} 0 20 500
        
        resume_process [srv 0 pid]
    }