Fix failing cluster tests (#9707)

Fix failures introduced by #9695 which was an attempt to solve failures introduced by #9679.
An alternative to #9703 (I didn't like the extra argument to kill_instance).

Reverting #9695.
Instead of stopping AOF on all terminations, stop it only on the two which need it.
Do it as part of the test rather than the infra (it was odd that kill_instance used `R`
to communicate with the instance).

Note that the original purpose of these tests was to trigger a crash, but that upsets
valgrind, so in redis 6.2 I changed it to use SIGTERM. I now rename the tests accordingly
(remove "kill" and "crash").

Also add some colors to failures, and the word "FAILED" so that it's searchable.

And solve a semi-related race condition in 14-consistency-check.tcl.
This commit is contained in:
Oran Agra 2021-10-31 19:22:21 +02:00 committed by GitHub
parent f26e90be0c
commit 48d54265ce
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 16 additions and 14 deletions

View File

@ -61,7 +61,9 @@ while {[incr iterations -1]} {
}
}
test "Killing node #$tokill" {
test "Terminating node #$tokill" {
# Stop AOF so that an initial AOFRW won't prevent the instance from terminating
R $tokill config set appendonly no
kill_instance redis $tokill
}

View File

@ -137,8 +137,10 @@ test "Verify $numkeys keys for consistency with logical content" {
}
}
test "Crash and restart all the instances" {
test "Terminate and restart all the instances" {
foreach_redis_id id {
# Stop AOF so that an initial AOFRW won't prevent the instance from terminating
R $id config set appendonly no
kill_instance redis $id
restart_instance redis $id
}
@ -148,7 +150,7 @@ test "Cluster should eventually be up again" {
assert_cluster_state ok
}
test "Verify $numkeys keys after the crash & restart" {
test "Verify $numkeys keys after the restart" {
# Check that the Redis Cluster content matches our logical content.
foreach {key value} [array get content] {
if {[$cluster lrange $key 0 -1] ne $value} {

View File

@ -77,9 +77,10 @@ proc test_slave_load_expired_keys {aof} {
# we need to wait for the initial AOFRW to be done, otherwise
# kill_instance (which now uses SIGTERM) will fail ("Writing initial AOF, can't exit")
wait_for_condition 100 10 {
[RI $replica_id aof_rewrite_scheduled] eq 0 &&
[RI $replica_id aof_rewrite_in_progress] eq 0
} else {
fail "keys didn't expire"
fail "AOFRW didn't finish"
}
} else {
R $replica_id save

View File

@ -204,10 +204,10 @@ proc stop_instance pid {
incr wait 10
if {$wait == $max_wait} {
puts "Forcing process $pid to crash..."
puts [colorstr red "Forcing process $pid to crash..."]
catch {exec kill -SEGV $pid}
} elseif {$wait >= $max_wait * 2} {
puts "Forcing process $pid to exit..."
puts [colorstr red "Forcing process $pid to exit..."]
catch {exec kill -KILL $pid}
} elseif {$wait % 1000 == 0} {
puts "Waiting for process $pid to exit..."
@ -376,10 +376,10 @@ proc test {descr code} {
if {[catch {set retval [uplevel 1 $code]} error]} {
incr ::failed
if {[string match "assertion:*" $error]} {
set msg [string range $error 10 end]
set msg "FAILED: [string range $error 10 end]"
puts [colorstr red $msg]
if {$::pause_on_error} pause_on_error
puts "(Jumping to next unit after error)"
puts [colorstr red "(Jumping to next unit after error)"]
return -code continue
} else {
# Re-raise, let handler up the stack take care of this.
@ -451,10 +451,10 @@ proc run_tests {} {
# Print a message and exit with 0 / 1 according to zero or more failures.
proc end_tests {} {
if {$::failed == 0 } {
puts "GOOD! No errors."
puts [colorstr green "GOOD! No errors."]
exit 0
} else {
puts "WARNING $::failed test(s) failed."
puts [colorstr red "WARNING $::failed test(s) failed."]
exit 1
}
}
@ -583,7 +583,7 @@ proc get_instance_id_by_port {type port} {
fail "Instance $type port $port not found."
}
# Kill an instance of the specified type/id with SIGTERM.
# Kill an instance of the specified type/id with SIGKILL.
# This function will mark the instance PID as -1 to remember that this instance
# is no longer running and will remove its PID from the list of pids that
# we kill at cleanup.
@ -597,9 +597,6 @@ proc kill_instance {type id} {
error "You tried to kill $type $id twice."
}
# stop appendonly so that the instance won't refuse to go down
R $id config set appendonly no
stop_instance $pid
set_instance_attrib $type $id pid -1
set_instance_attrib $type $id link you_tried_to_talk_with_killed_instance