From 2b207ee1b3808c5eb5de6879651104044ca162b2 Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Wed, 11 Sep 2024 09:52:34 -0700
Subject: [PATCH] Improve stability of hostnames test (#1016)

Maybe partially resolves https://github.com/valkey-io/valkey/issues/952.

The hostnames test relies on the assumption that node zero and node six
don't communicate with each other, in order to test a bunch of behavior
in the handshake state. Previously this was done by dropping all meet
packets; however, it seems there was some case where node zero sent a
single pong message to node six, which partially initialized the state.
I couldn't track down why this happened, so I adjusted the test to
simply pause node zero instead. This still correctly emulates the state
we want to be in, since we're only testing state on node six, and it
removes the chance of errant messages. The test was failing about 5% of
the time locally, and I wasn't able to reproduce a failure with this new
configuration.

---------

Signed-off-by: Madelyn Olson
---
 tests/unit/cluster/hostnames.tcl | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/unit/cluster/hostnames.tcl b/tests/unit/cluster/hostnames.tcl
index 232c6cf81..9a74fd0d5 100644
--- a/tests/unit/cluster/hostnames.tcl
+++ b/tests/unit/cluster/hostnames.tcl
@@ -132,11 +132,14 @@ test "Verify the nodes configured with prefer hostname only show hostname for ne
         R $j config set cluster-announce-hostname "shard-$j.com"
     }
 
+    # Grab the ID so we have it later for validation
+    set primary_id [R 0 CLUSTER MYID]
+
     # Prevent Node 0 and Node 6 from properly meeting,
     # they'll hang in the handshake phase. This allows us to
     # test the case where we "know" about it but haven't
     # successfully retrieved information about it yet.
-    R 0 DEBUG DROP-CLUSTER-PACKET-FILTER 0
+    pause_process [srv 0 pid]
     R 6 DEBUG DROP-CLUSTER-PACKET-FILTER 0
 
     # Have a replica meet the isolated node
@@ -174,12 +177,11 @@ test "Verify the nodes configured with prefer hostname only show hostname for ne
 
     # Also make sure we know about the isolated master, we
     # just can't reach it.
-    set master_id [R 0 CLUSTER MYID]
-    assert_match "*$master_id*" [R 6 CLUSTER NODES]
+    assert_match "*$primary_id*" [R 6 CLUSTER NODES]
 
     # Stop dropping cluster packets, and make sure everything
     # stabilizes
-    R 0 DEBUG DROP-CLUSTER-PACKET-FILTER -1
+    resume_process [srv 0 pid]
     R 6 DEBUG DROP-CLUSTER-PACKET-FILTER -1
 
     # This operation sometimes spikes to around 5 seconds to resolve the state,
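
The pause_process and resume_process helpers used above come from the
test harness and are not part of this diff. Assuming they simply wrap
SIGSTOP/SIGCONT on the server's pid (a minimal sketch, not necessarily
the harness's exact implementation), they behave roughly like this:

    # Freeze the server process. Unlike DEBUG DROP-CLUSTER-PACKET-FILTER,
    # a stopped process can neither send nor receive cluster bus packets,
    # so no stray pong can leak out of node 0 while it is paused.
    proc pause_process {pid} {
        exec kill -SIGSTOP $pid
    }

    # Let the process pick up where it left off; cluster bus traffic
    # resumes and the cluster converges normally.
    proc resume_process {pid} {
        exec kill -SIGCONT $pid
    }

This is why pausing is a stronger isolation mechanism than the packet
filter: the filter only suppresses selected outbound packet types, while
a SIGSTOP'd node 0 cannot emit anything at all, so node 6 stays cleanly
stuck in the handshake phase until resume_process is called.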