# futriix/tests/unit/cluster/slave-selection.tcl

# Slave selection test
# Check that the failover algorithm picks the slave with the most complete
# replication history.

# Create a cluster with 5 masters and 10 slaves, so that we have 2
# slaves for each master.
start_cluster 5 10 {tags {external:skip cluster}} {
test "Cluster is up" {
wait_for_cluster_state ok
}
test "The first master has actually two slaves" {
    wait_for_condition 1000 50 {
        [llength [lindex [R 0 role] 2]] == 2 &&
        [llength [R 0 cluster replicas [R 0 CLUSTER MYID]]] == 2
    } else {
        fail "replicas didn't connect"
    }
}

test "CLUSTER SLAVES and CLUSTER REPLICAS output is consistent" {
# Because we already have command output that cover CLUSTER REPLICAS elsewhere,
# here we simply judge whether their output is consistent to cover CLUSTER SLAVES.
set myid [R 0 CLUSTER MYID]
R 0 multi
R 0 cluster slaves $myid
R 0 cluster replicas $myid
lassign [R 0 exec] res res2
assert_equal $res $res2
}
test {Slaves of #0 are instance #5 and #10 as expected} {
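    # ROLE on a replica returns the master's port at index 2, so both
    # replicas must point at master #0's port.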
    set port0 [srv 0 port]
    assert {[lindex [R 5 role] 2] == $port0}
    assert {[lindex [R 10 role] 2] == $port0}
}

test "Instance #5 and #10 synced with the master" {
wait_for_condition 1000 50 {
[s -5 master_link_status] eq {up} &&
[s -10 master_link_status] eq {up}
} else {
fail "Instance #5 or #10 master link status is not up"
}
}
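# Cluster client handle used to send the test writes below.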
set cluster [valkey_cluster 127.0.0.1:[srv 0 port]]
test "Slaves are both able to receive and acknowledge writes" {
for {set j 0} {$j < 100} {incr j} {
$cluster set $j $j
}
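    # WAIT 2 60000 blocks until both replicas have acknowledged the writes
    # (or gives up after 60 seconds) and returns the number of acking replicas.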
    assert {[R 0 wait 2 60000] == 2}
}

set paused_pid [srv 0 pid]
test "Write data while slave #10 is paused and can't receive it" {
    # Stall slave #10 with a MULTI/EXEC transaction: during EXEC it sleeps
    # for 10 seconds and then kills its link with the master as soon as it
    # can process commands again.
    R 10 multi
    R 10 debug sleep 10
    R 10 client kill 127.0.0.1:$port0
    R 10 deferred 1
    R 10 exec
    # Write some data the slave can't receive.
    for {set j 0} {$j < 100} {incr j} {
        $cluster set $j $j
    }
    # Prevent the master from accepting new slaves.
    # Use a large pause value since we'll stop it anyway.
    R 0 CLIENT PAUSE 60000

    # Wait for the slave to become available again.
    R 10 deferred 0
    assert {[R 10 read] eq {OK OK}}
    # Stop the master so that a reconnection will not be possible.
    pause_process $paused_pid
}

test "Wait for instance #5 (and not #10) to turn into a master" {
    wait_for_condition 1000 50 {
        [s -5 role] eq {master}
    } else {
        fail "No failover detected"
    }
}

test "Wait for the node #10 to return alive before ending the test" {
    R 10 ping
}

test "Cluster should eventually be up again" {
    for {set j 0} {$j < [llength $::servers]} {incr j} {
        if {[process_is_paused [srv -$j pid]]} continue
        wait_for_condition 1000 50 {
            [CI $j cluster_state] eq "ok"
        } else {
            fail "Cluster node $j cluster_state:[CI $j cluster_state]"
        }
    }
}

test "Node #10 should eventually replicate node #5" {
    set port5 [srv -5 port]
    wait_for_condition 1000 50 {
        ([lindex [R 10 role] 2] == $port5) &&
        ([lindex [R 10 role] 3] eq {connected})
    } else {
        fail "#10 didn't become a slave of #5"
    }
}

} ;# start_cluster

# Create a cluster with 3 masters and 15 slaves, so that we have 5
# slaves for each master.
start_cluster 3 15 {tags {external:skip cluster}} {
test "Cluster is up" {
wait_for_cluster_state ok
}
test "The first master has actually 5 slaves" {
    wait_for_condition 1000 50 {
        [llength [lindex [R 0 role] 2]] == 5
    } else {
        fail "replicas didn't connect"
    }
}

test {Slaves of #0 are instance #3, #6, #9, #12 and #15 as expected} {
    set port0 [srv 0 port]
    assert {[lindex [R 3 role] 2] == $port0}
    assert {[lindex [R 6 role] 2] == $port0}
    assert {[lindex [R 9 role] 2] == $port0}
    assert {[lindex [R 12 role] 2] == $port0}
    assert {[lindex [R 15 role] 2] == $port0}
}

test {Instance #3, #6, #9, #12 and #15 synced with the master} {
    wait_for_condition 1000 50 {
        [s -3 master_link_status] eq {up} &&
        [s -6 master_link_status] eq {up} &&
        [s -9 master_link_status] eq {up} &&
        [s -12 master_link_status] eq {up} &&
        [s -15 master_link_status] eq {up}
    } else {
        fail "Instance #3 or #6 or #9 or #12 or #15 master link status is not up"
    }
}

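# Return true if at least one of the instances in the given dict currently
# reports itself as a master.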
proc master_detected {instances} {
    foreach instance [dict keys $instances] {
        if {[s -$instance role] eq {master}} {
            return true
        }
    }
    return false
}

test "New Master down consecutively" {
    set instances [dict create 0 1 3 1 6 1 9 1 12 1 15 1]
    set loops [expr {[dict size $instances]-1}]
    for {set i 0} {$i < $loops} {incr i} {
        set master_id -1
        foreach instance [dict keys $instances] {
            if {[s -$instance role] eq {master}} {
                set master_id $instance
                break;
            }
        }
        if {$master_id eq -1} {
            fail "no master detected, #loop $i"
        }
        set instances [dict remove $instances $master_id]

        set paused_pid [srv [expr $master_id * -1] pid]
        pause_process $paused_pid

        wait_for_condition 1000 50 {
            [master_detected $instances]
        } else {
            fail "No failover detected when master $master_id fails"
        }

        for {set j 0} {$j < [llength $::servers]} {incr j} {
            if {[process_is_paused [srv -$j pid]]} continue
            wait_for_condition 1000 50 {
                [CI $j cluster_state] eq "ok"
            } else {
                fail "Cluster node $j cluster_state:[CI $j cluster_state]"
            }
        }
    }
}

} ;# start_cluster