start_cluster 5 5 {tags {external:skip cluster}} {

test "Cluster should start ok" {
    wait_for_cluster_state ok
}

test "Cluster is writable" {
    cluster_write_test [srv 0 port]
}
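
# return the id of the first master that already holds keys, or an empty
# string if no such node is found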
proc find_non_empty_master {} {
    set master_id_no {}
    for {set id 0} {$id < [llength $::servers]} {incr id} {
        if {[s -$id role] eq {master} && [R $id dbsize] > 0} {
            set master_id_no $id
break
        }
    }
return $master_id_no
}
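
# wait until master $id reports at least one replica in its ROLE output,
# then return the instance id of the first replica (looked up by its port)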
proc get_one_of_my_replica {id} {
    wait_for_condition 1000 50 {
        [llength [lindex [R $id role] 2]] > 0
    } else {
        fail "replicas didn't connect"
    }
set replica_port [lindex [lindex [lindex [R $id role] 2] 0] 1]
    set replica_id_num [get_instance_id_by_port valkey $replica_port]
    return $replica_id_num
}
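
# write 100 volatile keys (key_expire.100 .. key_expire.199) with the given
# TTL through a cluster-aware client connected to node $id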
proc cluster_write_keys_with_expire {id ttl} {
    set prefix [randstring 20 20 alpha]
    set port [srv -$id port]
    set cluster [valkey_cluster 127.0.0.1:$port]
    for {set j 100} {$j < 200} {incr j} {
        $cluster setex key_expire.$j $ttl $prefix.$j
    }
    $cluster close
}

# make sure that a replica that restarts from persistence will load keys
# that have already expired, which is critical for correct execution of
# commands that arrive from the master
proc test_slave_load_expired_keys {aof} {
    test "Slave expired keys is loaded when restarted: appendonly=$aof" {
        set master_id [find_non_empty_master]
        set replica_id [get_one_of_my_replica $master_id]
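
        # snapshot the initial key counts; master and replica are expected to match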
set master_dbsize_0 [R $master_id dbsize]
        set replica_dbsize_0 [R $replica_id dbsize]
assert_equal $master_dbsize_0 $replica_dbsize_0

        # configure the replica's persistence and rewrite the config file to survive restart
        # note that this needs to be done before populating the volatile keys since
        # that triggers an AOFRW, and we'd rather the AOF file contain 'SET PXAT' commands
        # than an RDB with volatile keys
R $replica_id config set appendonly $aof
        R $replica_id config rewrite

        # fill with 100 keys with 3 second TTL
set data_ttl 3
        cluster_write_keys_with_expire $master_id $data_ttl

        # wait for replica to be in sync with master
wait_for_condition 500 10 {
            [R $replica_id dbsize] eq [R $master_id dbsize]
        } else {
            fail "replica didn't sync"
        }
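
        # the replica should now hold the newly written volatile keys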
set replica_dbsize_1 [R $replica_id dbsize]
assert {$replica_dbsize_1 > $replica_dbsize_0}

        # make replica create persistence file
if {$aof == "yes"} {
            # we need to wait for the initial AOFRW to be done
wait_for_condition 100 10 {
                [s -$replica_id aof_rewrite_scheduled] eq 0 &&
                [s -$replica_id aof_rewrite_in_progress] eq 0
            } else {
                fail "AOFRW didn't finish"
            }
        } else {
            R $replica_id save
}

        # pause the replica (it stays suspended until resumed)
set paused_pid [srv -$replica_id pid]
        pause_process $paused_pid

        # Make sure the master doesn't do active expire (sending DELs to the replica)
R $master_id DEBUG SET-ACTIVE-EXPIRE 0

        # wait for all the keys to get logically expired
after [expr $data_ttl*1000]

        # resume the replica
resume_process $paused_pid

        # make sure the keys are still there
set replica_dbsize_3 [R $replica_id dbsize]
assert {$replica_dbsize_3 > $replica_dbsize_0}

        # restore settings
R $master_id DEBUG SET-ACTIVE-EXPIRE 1

        # wait for the master to expire all keys and the replica to get the DELs
wait_for_condition 500 10 {
            [R $replica_id dbsize] eq $master_dbsize_0
        } else {
            fail "keys didn't expire"
        }
    }
}
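
# run the scenario once with RDB persistence (appendonly no) and once with AOF
# (appendonly yes)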
test_slave_load_expired_keys no
test_slave_load_expired_keys yes
} ;# start_cluster