
Introduce several changes that improve the stability of dual-channel replication and fix compatibility issues. 1. Make dual-channel-replication tests more reliable: use pause instead of forced sleep. 2. Fix race conditions when freeing the RDB client. 3. Check whether sync was stopped during local buffer streaming. 4. Fix the $ENDOFFSET reply format so it also works on 32-bit machines. --------- Signed-off-by: naglera <anagler123@gmail.com> Signed-off-by: Madelyn Olson <madelyneolson@gmail.com> Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
1120 lines
46 KiB
Tcl
1120 lines
46 KiB
Tcl
# Return 1 when the entire contents of the file at path `log` match the glob
# `pattern` (as interpreted by `string match`), 0 otherwise.
#
# The whole file is read into memory and matched as one string, so callers
# normally wrap the text they are looking for in leading/trailing `*`.
proc log_file_matches {log pattern} {
    set handle [open $log r]
    set text [read $handle]
    close $handle
    return [string match $pattern $text]
}
|
|
|
|
# Look up a client id from the output of `$r client list`.
#
# Arguments:
#   r    - command prefix used to talk to the server (invoked as `$r client list`).
#   cmd  - text expected right after `cmd=` in the client's list entry.
#
# Returns the value of the `id=` field of the first entry whose text matches
# the glob `*cmd=<cmd>*` and that carries an `id=` field, or the empty string
# when no such entry exists.
proc get_client_id_by_last_cmd {r cmd} {
    foreach entry [split [$r client list] "\n"] {
        if {![string match *cmd=$cmd* $entry]} {
            continue
        }
        # Entries are space-separated key=value fields; pick out "id=<n>".
        foreach field [split $entry " "] {
            if {[string match id=* $field]} {
                return [lindex [split $field "="] 1]
            }
        }
    }
    return ""
}
|
|
|
|
# Scenario: a replica with dual-channel replication enabled walks through the
# handshake and wait_bgsave states and finally completes the sync.
# The outer server acts as the replica, the inner one as the primary.
start_server {tags {"dual-channel-replication external:skip"}} {
    set replica [srv 0 client]
    set replica_host [srv 0 host]
    set replica_port [srv 0 port]
    set replica_log [srv 0 stdout]
    $replica config set loglevel debug
    start_server {} {
        set primary [srv 0 client]
        set primary_host [srv 0 host]
        set primary_port [srv 0 port]

        # A very large diskless-sync delay keeps the primary waiting before it
        # starts the BGSAVE, so the replica stays in the handshake state long
        # enough to be observed.
        $primary config set repl-diskless-sync yes
        $primary config set repl-diskless-sync-delay 1000
        $primary config set dual-channel-replication-enabled yes
        $primary config set loglevel debug

        # Kick off replication.
        $replica config set dual-channel-replication-enabled yes
        $replica replicaof $primary_host $primary_port

        test "Test dual-channel-replication-enabled replica enters handshake" {
            wait_for_condition 50 1000 {
                [string match "*handshake*" [$replica role]]
            } else {
                fail "Replica does not enter handshake state"
            }
        }

        test "Test dual-channel-replication-enabled enters wait_bgsave" {
            wait_for_condition 50 1000 {
                [string match "*state=wait_bgsave*" [$primary info replication]]
            } else {
                fail "Replica does not enter wait_bgsave state"
            }
        }

        # Drop the artificial delay so the sync can proceed.
        $primary config set repl-diskless-sync-delay 0

        test "Test dual-channel-replication-enabled replica is able to sync" {
            verify_replica_online $primary 0 500
            # Only one replica connection should remain once the sync is done.
            wait_for_condition 50 1000 {
                [string match "*connected_slaves:1*" [$primary info]]
            } else {
                fail "Replica rdb connection is still open"
            }
            # Both sides must converge on the primary's replication offset.
            set offset [status $primary master_repl_offset]
            wait_for_condition 500 100 {
                [string match "*slave0:*,offset=$offset,*" [$primary info replication]] &&
                $offset == [status $replica master_repl_offset]
            } else {
                fail "Replicas and primary offsets were unable to match."
            }
        }
    }
}
|
|
|
|
# Scenario: while a dual-channel sync is running under write load, the
# primary's used_memory must stay close to its pre-sync value; afterwards the
# replica must converge, and a later non-dual-channel sync must still work.
# The outer server acts as the replica, the inner one as the primary.
start_server {tags {"dual-channel-replication external:skip"}} {
    set replica [srv 0 client]
    set replica_host [srv 0 host]
    set replica_port [srv 0 port]
    set replica_log [srv 0 stdout]
    start_server {} {
        set primary [srv 0 client]
        set primary_host [srv 0 host]
        set primary_port [srv 0 port]

        $primary config set rdb-key-save-delay 200
        $primary config set dual-channel-replication-enabled yes
        $replica config set dual-channel-replication-enabled yes
        $replica config set repl-diskless-sync no

        populate 1000 primary 10000
        set load_handle1 [start_one_key_write_load $primary_host $primary_port 100 "mykey1"]
        set load_handle2 [start_one_key_write_load $primary_host $primary_port 100 "mykey2"]
        set load_handle3 [start_one_key_write_load $primary_host $primary_port 100 "mykey3"]

        # Wait for the load generators to start writing.
        wait_for_condition 50 1000 {
            ([$primary get "mykey1"] != "") &&
            ([$primary get "mykey2"] != "") &&
            ([$primary get "mykey3"] != "")
        } else {
            fail "Can't set new keys"
        }

        # Baseline memory usage taken before the sync starts.
        set before_used [s 0 used_memory]

        test "Primary memory usage does not increase during dual-channel-replication sync" {
            $replica replicaof $primary_host $primary_port

            # Verify used_memory stays low through all the sync.
            # 500 retries * 10ms sleep gives the 5 second budget reported below.
            set max_retry 500
            while {$max_retry} {
                # Verify memory.
                # NOTE: the exponent must be written `**`; in Tcl `^` is
                # bitwise XOR and binds looser than `<=`, which previously
                # made this assertion always true.
                set used_memory [s 0 used_memory]
                assert {$used_memory - $before_used <= 1.5 * 10**6}; # ~1/3 of the space
                # Check replica state
                set primary_info [$primary info]
                set replica_info [$replica info]
                if {[string match *slave0:*state=online* $primary_info] &&
                    [string match *master_link_status:up* $replica_info]} {
                    break
                } else {
                    incr max_retry -1
                    after 10
                }
            }
            if {$max_retry == 0} {
                error "assertion:Replica not in sync after 5 seconds"
            }
        }
        stop_write_load $load_handle1
        stop_write_load $load_handle2
        stop_write_load $load_handle3

        test "Steady state after dual channel sync" {
            # With the writers stopped, the replica must catch up on all keys.
            wait_for_condition 50 1000 {
                ([$replica get "mykey1"] eq [$primary get mykey1]) &&
                ([$replica get "mykey2"] eq [$primary get mykey2]) &&
                ([$replica get "mykey3"] eq [$primary get mykey3])
            } else {
                fail "Replica keys did not converge to the primary's values"
            }
        }

        test "Dual channel replication sync doesn't impair subsequent normal syncs" {
            $replica replicaof no one
            $replica config set dual-channel-replication-enabled no
            $primary set newkey newval

            set sync_full [s 0 sync_full]
            set sync_partial [s 0 sync_partial_ok]

            $replica replicaof $primary_host $primary_port
            verify_replica_online $primary 0 500
            # Verify replica used normal sync this time: one more full sync,
            # no additional partial sync.
            assert_equal [expr {$sync_full + 1}] [s 0 sync_full]
            assert_equal [expr {$sync_partial}] [s 0 sync_partial_ok]
            assert [string match *connected_slaves:1* [$primary info]]
        }
    }
}
|
|
|
|
# Scenario: toggle dual-channel-replication-enabled between syncs and check,
# through the primary's sync_full / sync_partial_ok counters, which kind of
# sync was used each time. The tests treat "full + partial both incremented"
# as a dual-channel sync and "only full incremented" as a normal sync.
# The outer server acts as the replica, the inner one as the primary.
start_server {tags {"dual-channel-replication external:skip"}} {
    set replica [srv 0 client]
    set replica_host [srv 0 host]
    set replica_port [srv 0 port]
    set replica_log [srv 0 stdout]
    start_server {} {
        foreach enable {yes no} {
            set primary [srv 0 client]
            set primary_host [srv 0 host]
            set primary_port [srv 0 port]

            $primary config set repl-diskless-sync yes
            # Set primary shared replication buffer size to a bit more than
            # the size of a replication buffer block.
            $primary config set client-output-buffer-limit "replica 1100k 0 0"
            $primary config set dual-channel-replication-enabled $enable
            $replica config set dual-channel-replication-enabled $enable

            test "Toggle dual-channel-replication-enabled: $enable start" {
                # --- First sync: uses the initial value of $enable. ---
                populate 1000 primary 10000
                set prev_sync_full [s 0 sync_full]
                set prev_sync_partial [s 0 sync_partial_ok]

                $replica replicaof $primary_host $primary_port
                verify_replica_online $primary 0 500
                wait_for_sync $replica

                set cur_sync_full [s 0 sync_full]
                set cur_sync_partial [s 0 sync_partial_ok]
                if {$enable eq "yes"} {
                    # Verify that dual channel replication sync was used
                    assert {$cur_sync_full == [expr {$prev_sync_full + 1}]}
                    assert {$cur_sync_partial == [expr {$prev_sync_partial + 1}]}
                } else {
                    # Verify that normal sync was used
                    assert {[s 0 sync_full] == [expr {$prev_sync_full + 1}]}
                    assert {[s 0 sync_partial_ok] == $prev_sync_partial}
                }

                # --- Flip the setting on both nodes (replica first). ---
                $replica replicaof no one
                set flipped [expr {$enable eq "yes" ? "no" : "yes"}]
                $replica config set dual-channel-replication-enabled $flipped
                $primary config set dual-channel-replication-enabled $flipped

                # --- Second sync: uses the flipped value. ---
                # Force replica to full sync next time
                populate 1000 primary 10000
                set prev_sync_full [s 0 sync_full]
                set prev_sync_partial [s 0 sync_partial_ok]

                $replica replicaof $primary_host $primary_port
                verify_replica_online $primary 0 500
                wait_for_sync $replica

                set cur_sync_full [s 0 sync_full]
                set cur_sync_partial [s 0 sync_partial_ok]
                if {$enable eq "yes"} {
                    # Verify that normal sync was used
                    assert {$cur_sync_full == [expr {$prev_sync_full + 1}]}
                    assert {$cur_sync_partial == $prev_sync_partial}
                } else {
                    # Verify that dual channel replication sync was used
                    assert {$cur_sync_full == [expr {$prev_sync_full + 1}]}
                    assert {$cur_sync_partial == [expr {$prev_sync_partial + 1}]}
                }
                $replica replicaof no one
            }
        }
    }
}
|
|
|
|
# Scenarios with one primary and two replicas:
#   * both replicas sync over dual-channel from a single fork,
#   * one replica uses dual-channel while the other does not,
#   * the replica's local replication buffer limit is reached during sync,
#   * dual-channel sync is not used when the primary has diskless sync off.
# Servers are nested replica1 -> replica2 -> primary (innermost).
start_server {tags {"dual-channel-replication external:skip"}} {
    set replica1 [srv 0 client]
    set replica1_host [srv 0 host]
    set replica1_port [srv 0 port]
    set replica1_log [srv 0 stdout]
    start_server {} {
        set replica2 [srv 0 client]
        set replica2_host [srv 0 host]
        set replica2_port [srv 0 port]
        set replica2_log [srv 0 stdout]
        start_server {} {
            set primary [srv 0 client]
            set primary_host [srv 0 host]
            set primary_port [srv 0 port]
            set loglines [count_log_lines -1]

            populate 10000 primary 10
            $primary set key1 val1

            $primary config set repl-diskless-sync yes
            $primary config set repl-diskless-sync-delay 5; # allow both replicas to ask for sync
            $primary config set dual-channel-replication-enabled yes

            $replica1 config set dual-channel-replication-enabled yes
            $replica2 config set dual-channel-replication-enabled yes
            $replica1 config set repl-diskless-sync no
            $replica2 config set repl-diskless-sync no
            $replica1 config set loglevel debug
            $replica2 config set loglevel debug

            test "dual-channel-replication with multiple replicas" {
                $replica1 replicaof $primary_host $primary_port
                $replica2 replicaof $primary_host $primary_port
                verify_replica_online $primary 0 500
                verify_replica_online $primary 1 500

                wait_for_value_to_propegate_to_replica $primary $replica1 "key1"
                wait_for_value_to_propegate_to_replica $primary $replica2 "key1"

                # Both replicas must have been served by one fork.
                assert {[s 0 total_forks] eq "1" }
            }

            # Detach both replicas, then mix the settings: replica1 keeps
            # dual-channel on, replica2 turns it off.
            $replica1 replicaof no one
            $replica2 replicaof no one

            $replica1 config set dual-channel-replication-enabled yes
            $replica2 config set dual-channel-replication-enabled no

            $primary set key2 val2

            test "Test diverse replica sync: dual-channel on/off" {
                $replica1 replicaof $primary_host $primary_port
                $replica2 replicaof $primary_host $primary_port
                verify_replica_online $primary 0 500
                verify_replica_online $primary 1 500
                wait_for_value_to_propegate_to_replica $primary $replica1 "key2"
                wait_for_value_to_propegate_to_replica $primary $replica2 "key2"
                wait_for_condition 50 1000 {
                    [status $replica1 master_link_status] == "up"
                } else {
                    fail "Replica is not synced"
                }
            }

            $replica1 replicaof no one
            $primary set key3 val3

            test "Test replica's buffer limit reached" {
                $primary config set repl-diskless-sync-delay 0
                $primary config set rdb-key-save-delay 500
                # At this point we have about 10k keys in the db.
                # With a 500 microsecond per-key delay the next full sync is
                # expected to take about 5 seconds (10k * 500us), which gives
                # us enough time to fill the replica buffer.
                $replica1 config set dual-channel-replication-enabled yes
                $replica1 config set client-output-buffer-limit "replica 16383 16383 0"

                $replica1 replicaof $primary_host $primary_port
                # Wait for replica to establish psync using main channel
                wait_for_condition 500 1000 {
                    [string match "*slave*,state=bg_transfer*,type=main-channel*" [$primary info replication]]
                } else {
                    fail "replica didn't start sync session in time"
                }

                populate 10000 primary 10; # set ~ 100kb
                # Wait for replica's buffer limit reached
                wait_for_condition 50 1000 {
                    [log_file_matches $replica1_log "*Replication buffer limit reached, stopping buffering*"]
                } else {
                    fail "Replica buffer should fill"
                }
                assert {[s -2 replicas_replication_buffer_size] <= 16385*2}

                # Wait for sync to succeed
                wait_for_condition 50 1000 {
                    [status $replica1 master_link_status] == "up"
                } else {
                    fail "Replica is not synced"
                }
                wait_for_value_to_propegate_to_replica $primary $replica1 "key3"
            }

            $replica1 replicaof no one
            $replica1 config set client-output-buffer-limit "replica 256mb 256mb 0"; # remove repl buffer limitation

            $primary set key4 val4

            test "dual-channel-replication fails when primary diskless disabled" {
                set cur_psync [status $primary sync_partial_ok]
                $primary config set repl-diskless-sync no

                $replica1 config set dual-channel-replication-enabled yes
                $replica1 replicaof $primary_host $primary_port

                # Wait for mitigation and resync
                wait_for_condition 50 1000 {
                    [status $replica1 master_link_status] == "up"
                } else {
                    fail "Replica is not synced"
                }
                wait_for_value_to_propegate_to_replica $primary $replica1 "key4"

                # Verify that we did not use dual-channel-replication sync:
                # the partial-sync counter must be unchanged.
                assert {[status $primary sync_partial_ok] == $cur_psync}
            }
        }
    }
}
|
|
|
|
# Scenario: the replica finishes loading the RDB before its main channel has
# established PSYNC. The primary is paused after fork so the tiny RDB gets
# loaded first; the primary is then resumed and the sync must still complete.
# The outer server acts as the replica, the inner one as the primary.
start_server {tags {"dual-channel-replication external:skip"}} {
    set replica [srv 0 client]
    set replica_host [srv 0 host]
    set replica_port [srv 0 port]
    set replica_log [srv 0 stdout]
    start_server {} {
        set primary [srv 0 client]
        set primary_host [srv 0 host]
        set primary_port [srv 0 port]
        set primary_pid [srv 0 pid]
        # Create small enough db to be loaded before replica establish psync connection
        $primary set key1 val1

        $primary config set repl-diskless-sync yes
        $primary debug pause-after-fork 1
        $primary config set dual-channel-replication-enabled yes

        $replica config set dual-channel-replication-enabled yes
        $replica config set loglevel debug

        test "Test dual-channel-replication sync- psync established after rdb load" {
            $replica replicaof $primary_host $primary_port
            # Let the RDB load finish while the primary is still paused.
            wait_for_log_messages -1 {"*Done loading RDB*"} 0 2000 1
            resume_process $primary_pid

            verify_replica_online $primary 0 500
            wait_for_condition 50 1000 {
                [status $replica master_link_status] == "up"
            } else {
                fail "Replica is not synced"
            }
            wait_for_value_to_propegate_to_replica $primary $replica "key1"
            # Confirm the occurrence of a race condition.
            wait_for_log_messages -1 {"*Dual channel sync - psync established after rdb load*"} 0 2000 1
        }
    }
}
|
|
|
|
# Scenario: while the replica is paused after the primary forks, the primary's
# replication buffers must be allowed to grow past the configured backlog size
# under write load; once the replica is resumed the sync must complete.
# The outer server acts as the replica, the inner one as the primary.
start_server {tags {"dual-channel-replication external:skip"}} {
    set replica [srv 0 client]
    set replica_host [srv 0 host]
    set replica_port [srv 0 port]
    set replica_log [srv 0 stdout]
    set replica_pid [srv 0 pid]
    start_server {} {
        set primary [srv 0 client]
        set primary_host [srv 0 host]
        set primary_port [srv 0 port]
        set backlog_size [expr {10 ** 5}]
        set loglines [count_log_lines -1]

        $primary config set repl-diskless-sync yes
        $primary config set dual-channel-replication-enabled yes
        $primary config set repl-backlog-size $backlog_size
        $primary config set loglevel debug
        $primary config set repl-timeout 10
        $primary config set rdb-key-save-delay 200
        populate 10000 primary 10000

        # Background writers keep the replication stream busy.
        set load_handle1 [start_one_key_write_load $primary_host $primary_port 100 "mykey1"]
        set load_handle2 [start_one_key_write_load $primary_host $primary_port 100 "mykey2"]
        set load_handle3 [start_one_key_write_load $primary_host $primary_port 100 "mykey3"]

        $replica config set dual-channel-replication-enabled yes
        $replica config set loglevel debug
        $replica config set repl-timeout 10
        # Pause replica after primary fork
        $replica debug pause-after-fork 1

        test "dual-channel-replication: Primary COB growth with inactive replica" {
            $replica replicaof $primary_host $primary_port
            # Verify repl backlog can grow
            wait_for_condition 1000 10 {
                [s 0 mem_total_replication_buffers] > [expr {2 * $backlog_size}]
            } else {
                fail "Primary should allow backlog to grow beyond its limits during dual-channel-replication sync handshake"
            }
            resume_process $replica_pid

            verify_replica_online $primary 0 500
            wait_for_condition 50 1000 {
                [status $replica master_link_status] == "up"
            } else {
                fail "Replica is not synced"
            }
        }

        stop_write_load $load_handle1
        stop_write_load $load_handle2
        stop_write_load $load_handle3

    }
}
|
|
|
|
# Scenario: the primary tracks the replication backlog position for a replica
# attached over the RDB channel, once when the backlog is still empty and once
# when a backlog already exists. After each sync no replica may be left in
# the "waiting psync" state.
# Servers are nested replica1 -> replica2 -> primary (innermost).
start_server {tags {"dual-channel-replication external:skip"}} {
    set replica1 [srv 0 client]
    set replica1_host [srv 0 host]
    set replica1_port [srv 0 port]
    set replica1_log [srv 0 stdout]
    set replica1_pid [srv 0 pid]
    start_server {} {
        set replica2 [srv 0 client]
        set replica2_host [srv 0 host]
        set replica2_port [srv 0 port]
        set replica2_log [srv 0 stdout]
        set replica2_pid [srv 0 pid]
        start_server {} {
            set primary [srv 0 client]
            set primary_host [srv 0 host]
            set primary_port [srv 0 port]
            set backlog_size [expr {10 ** 6}]
            set loglines [count_log_lines -1]

            $primary config set repl-diskless-sync yes
            $primary config set dual-channel-replication-enabled yes
            $primary config set repl-backlog-size $backlog_size
            $primary config set loglevel debug
            $primary config set repl-timeout 10
            $primary config set rdb-key-save-delay 10
            populate 1024 primary 16

            set load_handle0 [start_write_load $primary_host $primary_port 20]

            $replica1 config set dual-channel-replication-enabled yes
            $replica2 config set dual-channel-replication-enabled yes
            $replica1 config set loglevel debug
            $replica2 config set loglevel debug
            $replica1 config set repl-timeout 10
            $replica2 config set repl-timeout 10

            # Pause replicas after the primary forks for their sync, so the
            # primary-side log can be inspected before they proceed.
            $replica1 debug pause-after-fork 1
            $replica2 debug pause-after-fork 1
            test "Test dual-channel: primary tracking replica backlog refcount - start with empty backlog" {
                $replica1 replicaof $primary_host $primary_port
                set res [wait_for_log_messages 0 {"*Add rdb replica * no repl-backlog to track*"} $loglines 2000 1]
                set res [wait_for_log_messages 0 {"*Attach replica rdb client*"} $loglines 2000 1]
                # Continue later log searches after the line just matched.
                # (`incr` takes a variable NAME; `incr $loglines` was a no-op
                # on `loglines` itself.)
                set loglines [lindex $res 1]
                incr loglines
                resume_process $replica1_pid
                verify_replica_online $primary 0 700
                wait_for_condition 50 1000 {
                    [status $replica1 master_link_status] == "up"
                } else {
                    fail "Replica is not synced"
                }
                $replica1 replicaof no one
                assert [string match *replicas_waiting_psync:0* [$primary info replication]]
            }

            test "Test dual-channel: primary tracking replica backlog refcount - start with backlog" {
                $replica2 replicaof $primary_host $primary_port
                set res [wait_for_log_messages 0 {"*Add rdb replica * tracking repl-backlog tail*"} $loglines 2000 1]
                # Advance past the matched line (see the note in the test above).
                set loglines [lindex $res 1]
                incr loglines
                resume_process $replica2_pid
                verify_replica_online $primary 0 700
                wait_for_condition 50 1000 {
                    [status $replica2 master_link_status] == "up"
                } else {
                    fail "Replica is not synced"
                }
                assert [string match *replicas_waiting_psync:0* [$primary info replication]]
            }

            stop_write_load $load_handle0
        }
    }
}
|
|
|
|
# Scenarios around the primary's grace period for freeing the RDB-channel
# client (set to 2 seconds below via `debug delay-rdb-client-free-seconds`):
#   * the replica establishes PSYNC within the grace period -> sync succeeds,
#   * the replica stays paused beyond the grace period -> primary drops it.
# Here the OUTER server is the primary; each scenario starts its own replica.
start_server {tags {"dual-channel-replication external:skip"}} {
    set primary [srv 0 client]
    set primary_host [srv 0 host]
    set primary_port [srv 0 port]
    set primary_pid [srv 0 pid]

    $primary config set repl-diskless-sync yes
    $primary config set dual-channel-replication-enabled yes
    $primary config set repl-backlog-size [expr {10 ** 6}]
    $primary config set loglevel debug
    $primary config set repl-timeout 10
    # generate small db
    populate 10 primary 10
    # Pause primary main process after fork
    $primary debug pause-after-fork 1
    # Give replica two second grace period before disconnection
    $primary debug delay-rdb-client-free-seconds 2

    start_server {} {
        set replica [srv 0 client]
        set replica_host [srv 0 host]
        set replica_port [srv 0 port]
        set replica_log [srv 0 stdout]
        set replica_pid [srv 0 pid]
        set loglines [count_log_lines 0]

        set load_handle0 [start_write_load $primary_host $primary_port 20]

        $replica config set dual-channel-replication-enabled yes
        $replica config set loglevel debug
        $replica config set repl-timeout 10

        test "Psync established after rdb load - within grace period" {
            # Test Sequence:
            # 1. Replica initiates synchronization via RDB channel.
            # 2. Primary's main process is suspended.
            # 3. Replica completes RDB loading and pauses before establishing PSYNC connection.
            # 4. Primary resumes operation and detects closed RDB channel.
            # 5. Replica resumes operation.
            # Expected outcome: Primary maintains RDB channel until replica establishes PSYNC connection.
            $replica replicaof $primary_host $primary_port
            wait_for_log_messages 0 {"*Done loading RDB*"} $loglines 2000 1
            pause_process $replica_pid
            resume_process $primary_pid
            wait_for_condition 50 100 {
                [string match {*replicas_waiting_psync:1*} [$primary info replication]]
            } else {
                fail "Primary freed RDB client before psync was established"
            }
            resume_process $replica_pid

            verify_replica_online $primary 0 500
            wait_for_condition 50 100 {
                [string match {*replicas_waiting_psync:0*} [$primary info replication]]
            } else {
                fail "Primary did not free repl buf block after psync establishment"
            }
            $replica replicaof no one
        }
        stop_write_load $load_handle0
    }

    start_server {} {
        set replica [srv 0 client]
        set replica_host [srv 0 host]
        set replica_port [srv 0 port]
        set replica_log [srv 0 stdout]
        set replica_pid [srv 0 pid]
        set loglines [count_log_lines 0]

        set load_handle0 [start_write_load $primary_host $primary_port 20]

        $replica config set dual-channel-replication-enabled yes
        $replica config set loglevel debug
        $replica config set repl-timeout 10

        test "Psync established after RDB load - beyond grace period" {
            # Test Sequence:
            # 1. Replica initiates synchronization via RDB channel.
            # 2. Primary's main process is suspended.
            # 3. Replica completes RDB loading and pauses before establishing PSYNC connection.
            # 4. Primary resumes operation and detects closed RDB channel.
            # Expected outcome: Primary drops the RDB channel after grace period is done.
            $replica replicaof $primary_host $primary_port
            wait_for_log_messages 0 {"*Done loading RDB*"} $loglines 2000 1
            pause_process $replica_pid
            resume_process $primary_pid
            wait_for_condition 50 100 {
                [string match {*replicas_waiting_psync:1*} [$primary info replication]]
            } else {
                fail "Primary should wait before freeing repl block"
            }

            # Sync should fail once the replica ask for PSYNC using main channel
            set res [wait_for_log_messages -1 {"*Replica main channel failed to establish PSYNC within the grace period*"} 0 4000 1]
            wait_for_condition 50 100 {
                [string match {*replicas_waiting_psync:0*} [$primary info replication]]
            } else {
                fail "Primary did not free waiting psync replica after grace period"
            }
            resume_process $replica_pid
        }
        stop_write_load $load_handle0
    }
}
|
|
|
|
# Scenarios where the primary's client output buffer (COB) limit for replicas
# is exceeded during a dual-channel sync:
#   * before the replica's main channel established PSYNC,
#   * while the replica is still loading the RDB.
# In both cases the primary must release the waiting-psync state and the
# replica's later partial resync attempt must be rejected for lack of backlog.
# Here the OUTER server is the primary and the inner one the replica.
start_server {tags {"dual-channel-replication external:skip"}} {
    set primary [srv 0 client]
    set primary_host [srv 0 host]
    set primary_port [srv 0 port]
    set loglines [count_log_lines 0]
    set primary_pid [srv 0 pid]

    $primary config set repl-diskless-sync yes
    $primary config set dual-channel-replication-enabled yes
    $primary config set client-output-buffer-limit "replica 1100k 0 0"
    $primary config set loglevel debug
    # generate small db
    populate 10 primary 10
    start_server {} {
        set replica [srv 0 client]
        set replica_host [srv 0 host]
        set replica_port [srv 0 port]
        set replica_log [srv 0 stdout]
        set replica_pid [srv 0 pid]

        # Three concurrent writers so the replica COB fills quickly.
        set load_handle0 [start_write_load $primary_host $primary_port 20]
        set load_handle1 [start_write_load $primary_host $primary_port 20]
        set load_handle2 [start_write_load $primary_host $primary_port 20]

        $replica config set dual-channel-replication-enabled yes
        $replica config set loglevel debug
        $replica config set repl-timeout 10

        test "Test dual-channel-replication primary gets cob overrun before established psync" {
            # Pause primary main process after fork
            $primary debug pause-after-fork 1
            $replica replicaof $primary_host $primary_port
            wait_for_log_messages 0 {"*Done loading RDB*"} 0 2000 1

            # At this point rdb is loaded but psync hasn't been established yet.
            # Pause the replica so the primary main process will wake up while the
            # replica is unresponsive. We expect the main process to fill the COB and disconnect the replica.
            pause_process $replica_pid
            resume_process $primary_pid
            $primary debug pause-after-fork 0
            wait_for_log_messages -1 {"*Client * closed * for overcoming of output buffer limits.*"} $loglines 2000 1
            wait_for_condition 50 100 {
                [string match {*replicas_waiting_psync:0*} [$primary info replication]]
            } else {
                fail "Primary did not free repl buf block after sync failure"
            }
            resume_process $replica_pid
            set res [wait_for_log_messages -1 {"*Unable to partial resync with replica * for lack of backlog*"} $loglines 20000 1]
            # Element 1 of the result is used as the log position for later searches.
            set loglines [lindex $res 1]
        }
        $replica replicaof no one
        wait_for_condition 500 1000 {
            [s -1 rdb_bgsave_in_progress] eq 0
        } else {
            fail "Primary should abort sync"
        }

        $replica debug pause-after-fork 1
        $primary debug populate 1000 primary 100000
        # Set primary with a slow rdb generation, so that we can easily intercept loading
        # 10ms per key, with 1000 keys is 10 seconds
        $primary config set rdb-key-save-delay 10000

        test "Test dual-channel-replication primary gets cob overrun during replica rdb load" {
            set cur_client_closed_count [s -1 client_output_buffer_limit_disconnections]
            $replica replicaof $primary_host $primary_port
            wait_for_condition 500 1000 {
                [s -1 client_output_buffer_limit_disconnections] > $cur_client_closed_count
            } else {
                fail "Primary should disconnect replica due to COB overrun"
            }

            wait_for_condition 50 100 {
                [string match {*replicas_waiting_psync:0*} [$primary info replication]]
            } else {
                fail "Primary did not free repl buf block after sync failure"
            }
            resume_process $replica_pid
            set res [wait_for_log_messages -1 {"*Unable to partial resync with replica * for lack of backlog*"} $loglines 20000 1]
            # Use element 1 here as well, matching every other call site in
            # this file (this one previously stored element 0).
            set loglines [lindex $res 1]
        }
        stop_write_load $load_handle0
        stop_write_load $load_handle1
        stop_write_load $load_handle2
    }
}
|
|
|
|
# Run the "replica dropped during sync" scenarios twice: once with
# dual-channel replication enabled on all nodes and once with it disabled.
#   * If only one of two syncing replicas goes away, the sync must continue
#     for the remaining one.
#   * If the only syncing replica goes away, the primary must abort the
#     background save.
# Servers are nested primary (outermost) -> replica1 -> replica2.
foreach dualchannel {yes no} {
    start_server {tags {"dual-channel-replication external:skip"}} {
        set primary [srv 0 client]
        set primary_host [srv 0 host]
        set primary_port [srv 0 port]
        set loglines [count_log_lines 0]

        $primary config set repl-diskless-sync yes
        $primary config set loglevel debug
        $primary config set repl-diskless-sync-delay 5

        # Generating RDB will cost 5s(10000 * 0.0005s)
        $primary debug populate 10000 primary 1
        $primary config set rdb-key-save-delay 500

        $primary config set dual-channel-replication-enabled $dualchannel

        start_server {} {
            set replica1 [srv 0 client]
            $replica1 config set dual-channel-replication-enabled $dualchannel
            $replica1 config set loglevel debug
            start_server {} {
                set replica2 [srv 0 client]
                $replica2 config set dual-channel-replication-enabled $dualchannel
                $replica2 config set loglevel debug
                $replica2 config set repl-timeout 60

                set load_handle [start_one_key_write_load $primary_host $primary_port 100 "mykey1"]
                test "Sync should continue if not all slaves dropped dual-channel-replication $dualchannel" {
                    $replica1 replicaof $primary_host $primary_port
                    $replica2 replicaof $primary_host $primary_port

                    wait_for_condition 50 1000 {
                        [status $primary rdb_bgsave_in_progress] == 1
                    } else {
                        fail "Sync did not start"
                    }
                    if {$dualchannel == "yes"} {
                        # Wait for both replicas main conns to establish psync
                        wait_for_condition 50 1000 {
                            [status $primary sync_partial_ok] == 2
                        } else {
                            fail "Replicas main conns didn't establish psync [status $primary sync_partial_ok]"
                        }
                    }

                    # Drop replica1 mid-sync; `catch` because the connection
                    # goes away as part of the shutdown.
                    catch {$replica1 shutdown nosave}
                    wait_for_condition 50 2000 {
                        [status $replica2 master_link_status] == "up" &&
                        [status $primary sync_full] == 2 &&
                        (($dualchannel == "yes" && [status $primary sync_partial_ok] == 2) || $dualchannel == "no")
                    } else {
                        fail "Sync session interrupted\nsync_full:[status $primary sync_full]\nsync_partial_ok:[status $primary sync_partial_ok]"
                    }
                }

                $replica2 replicaof no one

                # Generating RDB will cost 100s(1000000 * 0.0001s)
                $primary debug populate 1000000 primary 1
                $primary config set rdb-key-save-delay 100

                test "Primary abort sync if all slaves dropped dual-channel-replication $dualchannel" {
                    set cur_psync [status $primary sync_partial_ok]
                    $replica2 replicaof $primary_host $primary_port

                    wait_for_condition 50 1000 {
                        [status $primary rdb_bgsave_in_progress] == 1
                    } else {
                        fail "Sync did not start"
                    }
                    if {$dualchannel == "yes"} {
                        # Wait for the replica's main conn to establish psync
                        wait_for_condition 50 1000 {
                            [status $primary sync_partial_ok] == $cur_psync + 1
                        } else {
                            fail "Replicas main conns didn't establish psync [status $primary sync_partial_ok]"
                        }
                    }

                    # Drop the only syncing replica; the bgsave has no
                    # consumer left and must be aborted.
                    catch {$replica2 shutdown nosave}
                    wait_for_condition 50 1000 {
                        [status $primary rdb_bgsave_in_progress] == 0
                    } else {
                        fail "Primary should abort the sync"
                    }
                }
                stop_write_load $load_handle
            }
        }
    }
}
|
|
|
|
# Primary-side handling of a dual-channel full sync whose replica connections
# are killed while the RDB is still being generated: first the main (psync)
# channel, then the rdb channel, and finally a "replconf set-rdb-client-id"
# that names an rdb client which was already killed.
start_server {tags {"dual-channel-replication external:skip"}} {
    set primary [srv 0 client]
    set primary_host [srv 0 host]
    set primary_port [srv 0 port]
    set loglines [count_log_lines 0]

    $primary config set repl-diskless-sync yes
    $primary config set dual-channel-replication-enabled yes
    $primary config set loglevel debug
    $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry

    # Generating RDB will cost 100s(1000000 * 0.0001s)
    $primary debug populate 1000000 primary 1
    $primary config set rdb-key-save-delay 100

    start_server {} {
        set replica [srv 0 client]
        set replica_host [srv 0 host]
        set replica_port [srv 0 port]
        set replica_log [srv 0 stdout]
        set replica_pid [srv 0 pid]

        # Keep writes flowing to the primary for the whole block; stopped
        # after the last test below.
        set load_handle [start_write_load $primary_host $primary_port 20]

        $replica config set dual-channel-replication-enabled yes
        $replica config set loglevel debug
        $replica config set repl-timeout 10
        test "Test dual-channel-replication replica main channel disconnected" {
            $replica replicaof $primary_host $primary_port
            # Wait for sync session to start
            wait_for_condition 500 1000 {
                [string match "*slave*,state=wait_bgsave*,type=rdb-channel*" [$primary info replication]] &&
                [string match "*slave*,state=bg_transfer*,type=main-channel*" [$primary info replication]] &&
                [s -1 rdb_bgsave_in_progress] eq 1
            } else {
                fail "replica didn't start sync session in time"
            }

            $primary debug log "killing replica main connection"
            # The main channel is the client whose last command was "psync".
            set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"]
            assert {$replica_main_conn_id != ""}
            $primary client kill id $replica_main_conn_id
            # Wait for primary to abort the sync
            wait_for_condition 50 1000 {
                [string match {*replicas_waiting_psync:0*} [$primary info replication]]
            } else {
                fail "Primary did not free repl buf block after sync failure"
            }
            wait_for_condition 1000 10 {
                [s -1 rdb_last_bgsave_status] eq "err"
            } else {
                fail "bgsave did not stop in time"
            }
        }

        test "Test dual channel replication slave of no one after main conn kill" {
            $replica replicaof no one
            wait_for_condition 500 1000 {
                [s -1 rdb_bgsave_in_progress] eq 0
            } else {
                fail "Primary should abort sync"
            }
        }

        test "Test dual-channel-replication replica rdb connection disconnected" {
            $replica replicaof $primary_host $primary_port
            # Wait for sync session to start
            wait_for_condition 500 1000 {
                [string match "*slave*,state=wait_bgsave*,type=rdb-channel*" [$primary info replication]] &&
                [string match "*slave*,state=bg_transfer*,type=main-channel*" [$primary info replication]] &&
                [s -1 rdb_bgsave_in_progress] eq 1
            } else {
                fail "replica didn't start sync session in time"
            }

            $primary debug log "killing replica rdb connection"
            # The rdb channel is the client whose last command was "sync".
            set replica_rdb_channel_id [get_client_id_by_last_cmd $primary "sync"]
            assert {$replica_rdb_channel_id != ""}
            $primary client kill id $replica_rdb_channel_id
            # Wait for primary to abort the sync
            wait_for_condition 1000 10 {
                [s -1 rdb_bgsave_in_progress] eq 0 &&
                [s -1 rdb_last_bgsave_status] eq "err"
            } else {
                fail "Primary should abort sync"
            }
        }

        test "Test dual channel replication slave of no one after rdb conn kill" {
            $replica replicaof no one
            wait_for_condition 500 1000 {
                [s -1 rdb_bgsave_in_progress] eq 0
            } else {
                fail "Primary should abort sync"
            }
        }

        test "Test dual-channel-replication primary reject set-rdb-client after client killed" {
            # Ensure replica main channel will not handshake before rdb client is killed
            $replica debug pause-after-fork 1
            $replica replicaof $primary_host $primary_port
            # Wait for sync session to start
            wait_for_condition 500 1000 {
                [string match "*slave*,state=wait_bgsave*,type=rdb-channel*" [$primary info replication]] &&
                [s -1 rdb_bgsave_in_progress] eq 1
            } else {
                fail "replica didn't start sync session in time"
            }

            set replica_rdb_channel_id [get_client_id_by_last_cmd $primary "sync"]
            assert {$replica_rdb_channel_id != ""}
            $primary debug log "killing replica rdb connection $replica_rdb_channel_id"
            $primary client kill id $replica_rdb_channel_id
            # Wait for primary to abort the sync. Uses the same retry budget as
            # the rdb-connection test above, so a primary that never aborts
            # fails the test promptly instead of stalling the whole run.
            wait_for_condition 1000 10 {
                [s -1 rdb_bgsave_in_progress] eq 0 &&
                [string match {*replicas_waiting_psync:0*} [$primary info replication]]
            } else {
                fail "Primary should abort sync"
            }
            # Verify primary reject replconf set-rdb-client-id
            set res [catch {$primary replconf set-rdb-client-id $replica_rdb_channel_id} err]
            assert [string match *ERR* $err]
            resume_process $replica_pid
        }
        stop_write_load $load_handle
    }
}
|
|
|
|
# A replica that asks for a full sync while a socket-target BGSAVE is already
# running cannot join it; it has to wait for the next BGSAVE.
start_server {tags {"dual-channel-replication external:skip"}} {
    set loglines [count_log_lines 0]
    set primary [srv 0 client]
    set primary_host [srv 0 host]
    set primary_port [srv 0 port]

    # Applied one by one, in this order. A diskless-sync delay of 0 means the
    # primary does not wait for other replicas before starting the BGSAVE.
    foreach {option value} {
        repl-diskless-sync yes
        dual-channel-replication-enabled yes
        loglevel debug
        repl-diskless-sync-delay 0
    } {
        $primary config set $option $value
    }

    # Slow RDB generation down: 10000 keys at 0.0001s per key is about 1s.
    $primary debug populate 10000 primary 1
    $primary config set rdb-key-save-delay 100

    start_server {} {
        set replica_1 [srv 0 client]
        set replica_host_1 [srv 0 host]
        set replica_port_1 [srv 0 port]
        set replica_log_1 [srv 0 stdout]

        foreach {option value} {
            dual-channel-replication-enabled yes
            loglevel debug
            repl-timeout 10
        } {
            $replica_1 config set $option $value
        }
        start_server {} {
            set replica_2 [srv 0 client]
            set replica_host_2 [srv 0 host]
            set replica_port_2 [srv 0 port]
            set replica_log_2 [srv 0 stdout]

            set load_handle [start_write_load $primary_host $primary_port 20]

            foreach {option value} {
                dual-channel-replication-enabled yes
                loglevel debug
                repl-timeout 10
            } {
                $replica_2 config set $option $value
            }
            test "Test replica unable to join dual channel replication sync after started" {
                # The first replica triggers the BGSAVE on the primary.
                $replica_1 replicaof $primary_host $primary_port
                wait_for_condition 50 100 {
                    [s -2 rdb_bgsave_in_progress] eq 1
                } else {
                    fail "replica didn't start sync session in time1"
                }

                # The second replica arrives late; the primary must log that
                # it has to wait for the next BGSAVE.
                set late_join_patterns {"*Current BGSAVE has socket target. Waiting for next BGSAVE for SYNC*"}
                $replica_2 replicaof $primary_host $primary_port
                wait_for_log_messages -2 $late_join_patterns $loglines 100 1000

                # Drop the per-key delay so the remaining syncs finish quickly.
                $primary config set rdb-key-save-delay 0

                # Each replica needed its own session: two full syncs and two
                # partial syncs on the primary.
                wait_for_sync $replica_2
                assert {[s -2 sync_partial_ok] eq 2}
                assert {[s -2 sync_full] eq 2}
            }
            stop_write_load $load_handle
        }
    }
}
|
|
|
|
# Replica-side recovery: after either the rdb channel or the main channel is
# killed on the primary during a dual-channel full sync, the replica is
# expected to retry, come back online and catch up with the primary's offset.
start_server {tags {"dual-channel-replication external:skip"}} {
    set primary [srv 0 client]
    set primary_host [srv 0 host]
    set primary_port [srv 0 port]
    set loglines [count_log_lines 0]

    $primary config set repl-diskless-sync yes
    $primary config set dual-channel-replication-enabled yes
    $primary config set loglevel debug
    $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry

    # Generating RDB will cost 1s(10000 * 0.0001s)
    $primary debug populate 10000 primary 1
    $primary config set rdb-key-save-delay 100

    start_server {} {
        set replica [srv 0 client]
        set replica_host [srv 0 host]
        set replica_port [srv 0 port]
        set replica_log [srv 0 stdout]

        $replica config set dual-channel-replication-enabled yes
        $replica config set loglevel debug
        $replica config set repl-timeout 10
        test "Replica recover rdb-connection killed" {
            set load_handle [start_one_key_write_load $primary_host $primary_port 100 "mykey"]
            $replica replicaof $primary_host $primary_port
            # Wait for sync session to start
            wait_for_condition 500 1000 {
                [string match "*slave*,state=wait_bgsave*,type=rdb-channel*" [$primary info replication]] &&
                [string match "*slave*,state=bg_transfer*,type=main-channel*" [$primary info replication]] &&
                [s -1 rdb_bgsave_in_progress] eq 1
            } else {
                fail "replica didn't start sync session in time"
            }

            $primary debug log "killing replica rdb connection"
            # The rdb channel is the client whose last command was "sync".
            set replica_rdb_channel_id [get_client_id_by_last_cmd $primary "sync"]
            assert {$replica_rdb_channel_id != ""}
            $primary client kill id $replica_rdb_channel_id
            # Wait for primary to abort the sync
            wait_for_condition 50 1000 {
                [string match {*replicas_waiting_psync:0*} [$primary info replication]]
            } else {
                fail "Primary did not free repl buf block after sync failure"
            }
            wait_for_condition 1000 10 {
                [s -1 rdb_last_bgsave_status] eq "err"
            } else {
                fail "bgsave did not stop in time"
            }
            # Replica should retry
            verify_replica_online $primary 0 500
            stop_write_load $load_handle
            wait_for_condition 1000 100 {
                [s -1 master_repl_offset] eq [s master_repl_offset]
            } else {
                fail "Replica offset didn't catch up with the primary after too long time"
            }
        }
        # Detach the replica so the next test starts a fresh sync session.
        $replica replicaof no one

        test "Replica recover main-connection killed" {
            set load_handle [start_one_key_write_load $primary_host $primary_port 100 "mykey"]
            $replica replicaof $primary_host $primary_port
            # Wait for sync session to start
            wait_for_condition 500 1000 {
                [string match "*slave*,state=wait_bgsave*,type=rdb-channel*" [$primary info replication]] &&
                [string match "*slave*,state=bg_transfer*,type=main-channel*" [$primary info replication]] &&
                [s -1 rdb_bgsave_in_progress] eq 1
            } else {
                fail "replica didn't start sync session in time"
            }

            $primary debug log "killing replica main connection"
            # The main channel is the client whose last command was "psync".
            # Looking up "sync" here would select the rdb channel instead,
            # because the helper matches *cmd=sync* and "cmd=psync" does not
            # contain "cmd=sync".
            set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"]
            assert {$replica_main_conn_id != ""}
            $primary client kill id $replica_main_conn_id
            # Wait for primary to abort the sync
            wait_for_condition 50 1000 {
                [string match {*replicas_waiting_psync:0*} [$primary info replication]]
            } else {
                fail "Primary did not free repl buf block after sync failure"
            }
            wait_for_condition 1000 10 {
                [s -1 rdb_last_bgsave_status] eq "err"
            } else {
                fail "bgsave did not stop in time"
            }
            # Replica should retry
            verify_replica_online $primary 0 500
            stop_write_load $load_handle
            wait_for_condition 1000 100 {
                [s -1 master_repl_offset] eq [s master_repl_offset]
            } else {
                fail "Replica offset didn't catch up with the primary after too long time"
            }
        }
    }
}
|