futriix-old/tests/cluster/tests/03-failover-loop.tcl
antirez 255a6e32aa Cluster test: test that writes are retained during failovers.
In the test we use WAIT when the master and slave are up, and only later the
partition is created killing the master, so we are sure we don't incur
in failure modes that may lose writes in this test: the goal here is to
make sure that the elected slave was replicating correctly with the
master.
2014-07-18 17:13:46 +02:00

113 lines
3.1 KiB
Tcl

# Failover stress test.
# In this test a different node is killed in a loop for N
# iterations. The test checks that certain properties
# are preseved across iterations.
source "../tests/includes/init-tests.tcl"
test "Create a 5 nodes cluster" {
create_cluster 5 5
}
test "Cluster is up" {
assert_cluster_state ok
}
set iterations 20
set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]]
while {[incr iterations -1]} {
set tokill [randomInt 10]
set other [expr {($tokill+1)%10}] ; # Some other instance.
set key [randstring 20 20 alpha]
set val [randstring 20 20 alpha]
set role [RI $tokill role]
if {$role eq {master}} {
set slave {}
set myid [dict get [get_myself $tokill] id]
foreach_redis_id id {
if {$id == $tokill} continue
if {[dict get [get_myself $id] slaveof] eq $myid} {
set slave $id
}
}
if {$slave eq {}} {
fail "Unable to retrieve slave's ID for master #$tokill"
}
}
puts "--- Iteration $iterations ---"
if {$role eq {master}} {
test "Wait for slave of #$tokill to sync" {
wait_for_condition 1000 50 {
[string match {*state=online*} [RI $tokill slave0]]
} else {
fail "Slave of node #$tokill is not ok"
}
}
set slave_config_epoch [CI $slave cluster_my_epoch]
}
test "Cluster is writable before failover" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster set $key:$i $val:$i} err
assert {$err eq {OK}}
}
# Wait for the write to propagate to the slave
R $tokill wait 1 20000
}
test "Killing node #$tokill" {
kill_instance redis $tokill
}
if {$role eq {master}} {
test "Wait failover by #$slave with old epoch $slave_config_epoch" {
wait_for_condition 1000 50 {
[CI $slave cluster_my_epoch] > $slave_config_epoch
} else {
fail "No failover detected, epoch is still [CI $slave cluster_my_epoch]"
}
}
}
test "Cluster should eventually be up again" {
assert_cluster_state ok
}
test "Cluster is writable again" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster set $key:$i:2 $val:$i:2} err
assert {$err eq {OK}}
}
}
test "Restarting node #$tokill" {
restart_instance redis $tokill
}
test "Instance #$tokill is now a slave" {
wait_for_condition 1000 50 {
[RI $tokill role] eq {slave}
} else {
fail "Restarted instance is not a slave"
}
}
test "We can read back the value we set before" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster get $key:$i} err
assert {$err eq "$val:$i"}
catch {$cluster get $key:$i:2} err
assert {$err eq "$val:$i:2"}
}
}
}
test "Post condition: current_epoch >= my_epoch everywhere" {
foreach_redis_id id {
assert {[CI $id cluster_current_epoch] >= [CI $id cluster_my_epoch]}
}
}