futriix/tests/cluster/tests/03-failover-loop.tcl
Oran Agra 48d54265ce
Fix failing cluster tests (#9707)
Fix failures introduced by #9695 which was an attempt to solve failures introduced by #9679.
And alternative to #9703 (i didn't like the extra argument to kill_instance).

Reverting #9695.
Instead of stopping AOF on all terminations, stop it only on the two which need it.
Do it as part of the test rather than the infra (it was add that kill_instance used `R`
to communicate to the instance)

Note that the original purpose of these tests was to trigger a crash, but that upsets
valgrind so in redis 6.2 i changed it to use SIGTERM, so i now rename the tests
(remove "kill" and "crash").

Also add some colors to failures, and the word "FAILED" so that it's searchable.

And solve a semi-related race condition in 14-consistency-check.tcl
2021-10-31 19:22:21 +02:00

118 lines
3.4 KiB
Tcl

# Failover stress test.
# In this test a different node is killed in a loop for N
# iterations. The test checks that certain properties
# are preserved across iterations.
source "../tests/includes/init-tests.tcl"
test "Create a 5 nodes cluster" {
create_cluster 5 5
}
test "Cluster is up" {
assert_cluster_state ok
}
set iterations 20
set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]]
while {[incr iterations -1]} {
set tokill [randomInt 10]
set other [expr {($tokill+1)%10}] ; # Some other instance.
set key [randstring 20 20 alpha]
set val [randstring 20 20 alpha]
set role [RI $tokill role]
if {$role eq {master}} {
set slave {}
set myid [dict get [get_myself $tokill] id]
foreach_redis_id id {
if {$id == $tokill} continue
if {[dict get [get_myself $id] slaveof] eq $myid} {
set slave $id
}
}
if {$slave eq {}} {
fail "Unable to retrieve slave's ID for master #$tokill"
}
}
puts "--- Iteration $iterations ---"
if {$role eq {master}} {
test "Wait for slave of #$tokill to sync" {
wait_for_condition 1000 50 {
[string match {*state=online*} [RI $tokill slave0]]
} else {
fail "Slave of node #$tokill is not ok"
}
}
set slave_config_epoch [CI $slave cluster_my_epoch]
}
test "Cluster is writable before failover" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster set $key:$i $val:$i} err
assert {$err eq {OK}}
}
# Wait for the write to propagate to the slave if we
# are going to kill a master.
if {$role eq {master}} {
R $tokill wait 1 20000
}
}
test "Terminating node #$tokill" {
# Stop AOF so that an initial AOFRW won't prevent the instance from terminating
R $tokill config set appendonly no
kill_instance redis $tokill
}
if {$role eq {master}} {
test "Wait failover by #$slave with old epoch $slave_config_epoch" {
wait_for_condition 1000 50 {
[CI $slave cluster_my_epoch] > $slave_config_epoch
} else {
fail "No failover detected, epoch is still [CI $slave cluster_my_epoch]"
}
}
}
test "Cluster should eventually be up again" {
assert_cluster_state ok
}
test "Cluster is writable again" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster set $key:$i:2 $val:$i:2} err
assert {$err eq {OK}}
}
}
test "Restarting node #$tokill" {
restart_instance redis $tokill
}
test "Instance #$tokill is now a slave" {
wait_for_condition 1000 50 {
[RI $tokill role] eq {slave}
} else {
fail "Restarted instance is not a slave"
}
}
test "We can read back the value we set before" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster get $key:$i} err
assert {$err eq "$val:$i"}
catch {$cluster get $key:$i:2} err
assert {$err eq "$val:$i:2"}
}
}
}
test "Post condition: current_epoch >= my_epoch everywhere" {
foreach_redis_id id {
assert {[CI $id cluster_current_epoch] >= [CI $id cluster_my_epoch]}
}
}