Fix failing cluster tests (#9707)

Fix failures introduced by #9695, which was an attempt to solve failures introduced by #9679. This is an alternative to #9703 (I didn't like the extra argument to kill_instance). Revert #9695: instead of stopping AOF on all terminations, stop it only in the two tests that need it, and do so as part of the test rather than the infra (it was odd that kill_instance used `R` to communicate with the instance). Note that the original purpose of these tests was to trigger a crash, but that upsets valgrind, so in Redis 6.2 I changed them to use SIGTERM; I now rename the tests accordingly (removing "kill" and "crash"). Also add some colors to failures, and the word "FAILED" so that it's searchable. And solve a semi-related race condition in 14-consistency-check.tcl.
2021-10-31 19:22:21 +02:00 · 2021-10-31 19:22:21 +02:00 · 48d54265ce
commit 48d54265ce
parent f26e90be0c
4 changed files with 16 additions and 14 deletions
--- a/tests/cluster/tests/03-failover-loop.tcl
+++ b/tests/cluster/tests/03-failover-loop.tcl
@ -61,7 +61,9 @@ while {[incr iterations -1]} {
        }
    }

-    test "Killing node #$tokill" {
+    test "Terminating node #$tokill" {
+        # Stop AOF so that an initial AOFRW won't prevent the instance from terminating
+        R $tokill config set appendonly no
        kill_instance redis $tokill
    }

--- a/tests/cluster/tests/04-resharding.tcl
+++ b/tests/cluster/tests/04-resharding.tcl
@ -137,8 +137,10 @@ test "Verify $numkeys keys for consistency with logical content" {
    }
 }

-test "Crash and restart all the instances" {
+test "Terminate and restart all the instances" {
    foreach_redis_id id {
+        # Stop AOF so that an initial AOFRW won't prevent the instance from terminating
+        R $id config set appendonly no
        kill_instance redis $id
        restart_instance redis $id
    }
@ -148,7 +150,7 @@ test "Cluster should eventually be up again" {
    assert_cluster_state ok
 }

-test "Verify $numkeys keys after the crash & restart" {
+test "Verify $numkeys keys after the restart" {
    # Check that the Redis Cluster content matches our logical content.
    foreach {key value} [array get content] {
        if {[$cluster lrange $key 0 -1] ne $value} {
--- a/tests/cluster/tests/14-consistency-check.tcl
+++ b/tests/cluster/tests/14-consistency-check.tcl
@ -77,9 +77,10 @@ proc test_slave_load_expired_keys {aof} {
            # we need to wait for the initial AOFRW to be done, otherwise
            # kill_instance (which now uses SIGTERM will fail ("Writing initial AOF, can't exit")
            wait_for_condition 100 10 {
+                [RI $replica_id aof_rewrite_scheduled] eq 0 &&
                [RI $replica_id aof_rewrite_in_progress] eq 0
            } else {
-                fail "keys didn't expire"
+                fail "AOFRW didn't finish"
            }
        } else {
            R $replica_id save
--- a/tests/instances.tcl
+++ b/tests/instances.tcl
@ -204,10 +204,10 @@ proc stop_instance pid {
        incr wait 10

        if {$wait == $max_wait} {
-            puts "Forcing process $pid to crash..."
+            puts [colorstr red "Forcing process $pid to crash..."]
            catch {exec kill -SEGV $pid}
        } elseif {$wait >= $max_wait * 2} {
-            puts "Forcing process $pid to exit..."
+            puts [colorstr red "Forcing process $pid to exit..."]
            catch {exec kill -KILL $pid}
        } elseif {$wait % 1000 == 0} {
            puts "Waiting for process $pid to exit..."
@ -376,10 +376,10 @@ proc test {descr code} {
    if {[catch {set retval [uplevel 1 $code]} error]} {
        incr ::failed
        if {[string match "assertion:*" $error]} {
-            set msg [string range $error 10 end]
+            set msg "FAILED: [string range $error 10 end]"
            puts [colorstr red $msg]
            if {$::pause_on_error} pause_on_error
-            puts "(Jumping to next unit after error)"
+            puts [colorstr red "(Jumping to next unit after error)"]
            return -code continue
        } else {
            # Re-raise, let handler up the stack take care of this.
@ -451,10 +451,10 @@ proc run_tests {} {
 # Print a message and exists with 0 / 1 according to zero or more failures.
 proc end_tests {} {
    if {$::failed == 0 } {
-        puts "GOOD! No errors."
+        puts [colorstr green "GOOD! No errors."]
        exit 0
    } else {
-        puts "WARNING $::failed test(s) failed."
+        puts [colorstr red "WARNING $::failed test(s) failed."]
        exit 1
    }
 }
@ -583,7 +583,7 @@ proc get_instance_id_by_port {type port} {
    fail "Instance $type port $port not found."
 }

-# Kill an instance of the specified type/id with SIGTERM.
+# Kill an instance of the specified type/id with SIGTERM.
 # This function will mark the instance PID as -1 to remember that this instance
 # is no longer running and will remove its PID from the list of pids that
 # we kill at cleanup.
@ -597,9 +597,6 @@ proc kill_instance {type id} {
        error "You tried to kill $type $id twice."
    }

-    # stop appendonly so that the instance won't refuse to go down
-    R $id config set appendonly no
-
    stop_instance $pid
    set_instance_attrib $type $id pid -1
    set_instance_attrib $type $id link you_tried_to_talk_with_killed_instance