From 32a2584e079a1b3c2d1e6649e38239381a73a459 Mon Sep 17 00:00:00 2001 From: YaacovHazan <31382944+YaacovHazan@users.noreply.github.com> Date: Thu, 20 May 2021 15:29:43 +0300 Subject: [PATCH] stabilize tests that involved with load handlers (#8967) When a test stops a 'load handler' by killing the process that is generating the load, some commands that are already in the input buffer might still be processed by the server. This may cause some instability in tests that count on no more commands being processed after we stop the 'load handler'. In this commit, a new proc 'wait_load_handlers_disconnected' is added, to verify that no more commands from any 'load handler' are processed, by checking that the clients that generate the load are disconnected. Also, replace the check of dbsize with wait_for_ofs_sync before comparing debug digest, as it would fail in case the last key the workload wrote was an overridden key (not a new one). Affected tests Race fix: - failover command to specific replica works - Connect multiple replicas at the same time (issue #141), master diskless=$mdl, replica diskless=$sdl - AOF rewrite during write load: RDB preamble=$rdbpre Cleanup and speedup: - Test replication with blocking lists and sorted sets operations - Test replication with parallel clients writing in different DBs - Test replication partial resync: $descr (diskless: $mdl, $sdl, reconnect: $reconnect) --- tests/helpers/bg_block_op.tcl | 1 + tests/helpers/bg_complex_data.tcl | 1 + tests/helpers/gen_write_load.tcl | 1 + tests/integration/block-repl.tcl | 15 ++++----------- tests/integration/failover.tcl | 4 ++++ tests/integration/replication-4.tcl | 17 +++++------------ tests/integration/replication-psync.tcl | 17 +++++------------ tests/integration/replication.tcl | 15 ++++++--------- tests/support/util.tcl | 8 ++++++++ tests/unit/aofrw.tcl | 11 ++--------- 10 files changed, 37 insertions(+), 53 deletions(-) diff --git a/tests/helpers/bg_block_op.tcl b/tests/helpers/bg_block_op.tcl index 
c8b323308..dc4e1a999 100644 --- a/tests/helpers/bg_block_op.tcl +++ b/tests/helpers/bg_block_op.tcl @@ -12,6 +12,7 @@ set ::tlsdir "tests/tls" # blocking. proc bg_block_op {host port db ops tls} { set r [redis $host $port 0 $tls] + $r client setname LOAD_HANDLER $r select $db for {set j 0} {$j < $ops} {incr j} { diff --git a/tests/helpers/bg_complex_data.tcl b/tests/helpers/bg_complex_data.tcl index e888748a7..9c0044e7f 100644 --- a/tests/helpers/bg_complex_data.tcl +++ b/tests/helpers/bg_complex_data.tcl @@ -5,6 +5,7 @@ set ::tlsdir "tests/tls" proc bg_complex_data {host port db ops tls} { set r [redis $host $port 0 $tls] + $r client setname LOAD_HANDLER $r select $db createComplexDataset $r $ops } diff --git a/tests/helpers/gen_write_load.tcl b/tests/helpers/gen_write_load.tcl index cbf6651bd..568f5cde2 100644 --- a/tests/helpers/gen_write_load.tcl +++ b/tests/helpers/gen_write_load.tcl @@ -5,6 +5,7 @@ set ::tlsdir "tests/tls" proc gen_write_load {host port seconds tls} { set start_time [clock seconds] set r [redis $host $port 1 $tls] + $r client setname LOAD_HANDLER $r select 9 while 1 { $r set [expr rand()] [expr rand()] diff --git a/tests/integration/block-repl.tcl b/tests/integration/block-repl.tcl index 07eceb228..7c2ba840d 100644 --- a/tests/integration/block-repl.tcl +++ b/tests/integration/block-repl.tcl @@ -33,14 +33,9 @@ start_server {tags {"repl"}} { stop_bg_block_op $load_handle0 stop_bg_block_op $load_handle1 stop_bg_block_op $load_handle2 - set retry 10 - while {$retry && ([$master debug digest] ne [$slave debug digest])}\ - { - after 1000 - incr retry -1 - } - - if {[$master debug digest] ne [$slave debug digest]} { + wait_for_condition 100 100 { + [$master debug digest] == [$slave debug digest] + } else { set csv1 [csvdump r] set csv2 [csvdump {r -1}] set fd [open /tmp/repldump1.txt w] @@ -49,10 +44,8 @@ start_server {tags {"repl"}} { set fd [open /tmp/repldump2.txt w] puts -nonewline $fd $csv2 close $fd - puts "Master - Replica inconsistency" - 
puts "Run diff -u against /tmp/repldump*.txt for more info" + fail "Master - Replica inconsistency, Run diff -u against /tmp/repldump*.txt for more info" } - assert_equal [r debug digest] [r -1 debug digest] } } } diff --git a/tests/integration/failover.tcl b/tests/integration/failover.tcl index c6818700d..10642eb32 100644 --- a/tests/integration/failover.tcl +++ b/tests/integration/failover.tcl @@ -83,7 +83,11 @@ start_server {} { } else { fail "Failover from node 0 to node 1 did not finish" } + + # stop the write load and make sure no more commands processed stop_write_load $load_handler + wait_load_handlers_disconnected + $node_2 replicaof $node_1_host $node_1_port wait_for_sync $node_0 wait_for_sync $node_2 diff --git a/tests/integration/replication-4.tcl b/tests/integration/replication-4.tcl index 8715ae999..e4ac83e12 100644 --- a/tests/integration/replication-4.tcl +++ b/tests/integration/replication-4.tcl @@ -21,15 +21,9 @@ start_server {tags {"repl network"}} { stop_bg_complex_data $load_handle0 stop_bg_complex_data $load_handle1 stop_bg_complex_data $load_handle2 - set retry 10 - while {$retry && ([$master debug digest] ne [$slave debug digest])}\ - { - after 1000 - incr retry -1 - } - assert {[$master dbsize] > 0} - - if {[$master debug digest] ne [$slave debug digest]} { + wait_for_condition 100 100 { + [$master debug digest] == [$slave debug digest] + } else { set csv1 [csvdump r] set csv2 [csvdump {r -1}] set fd [open /tmp/repldump1.txt w] @@ -38,10 +32,9 @@ start_server {tags {"repl network"}} { set fd [open /tmp/repldump2.txt w] puts -nonewline $fd $csv2 close $fd - puts "Master - Replica inconsistency" - puts "Run diff -u against /tmp/repldump*.txt for more info" + fail "Master - Replica inconsistency, Run diff -u against /tmp/repldump*.txt for more info" } - assert_equal [r debug digest] [r -1 debug digest] + assert {[$master dbsize] > 0} } } } diff --git a/tests/integration/replication-psync.tcl b/tests/integration/replication-psync.tcl index 
3c98723af..08e21d310 100644 --- a/tests/integration/replication-psync.tcl +++ b/tests/integration/replication-psync.tcl @@ -97,15 +97,9 @@ proc test_psync {descr duration backlog_size backlog_ttl delay cond mdl sdl reco fail "Slave still not connected after some time" } - set retry 10 - while {$retry && ([$master debug digest] ne [$slave debug digest])}\ - { - after 1000 - incr retry -1 - } - assert {[$master dbsize] > 0} - - if {[$master debug digest] ne [$slave debug digest]} { + wait_for_condition 100 100 { + [$master debug digest] == [$slave debug digest] + } else { set csv1 [csvdump r] set csv2 [csvdump {r -1}] set fd [open /tmp/repldump1.txt w] @@ -114,10 +108,9 @@ proc test_psync {descr duration backlog_size backlog_ttl delay cond mdl sdl reco set fd [open /tmp/repldump2.txt w] puts -nonewline $fd $csv2 close $fd - puts "Master - Replica inconsistency" - puts "Run diff -u against /tmp/repldump*.txt for more info" + fail "Master - Replica inconsistency, Run diff -u against /tmp/repldump*.txt for more info" } - assert_equal [r debug digest] [r -1 debug digest] + assert {[$master dbsize] > 0} eval $cond } } diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index 7ec59f766..2b96c85f0 100644 --- a/tests/integration/replication.tcl +++ b/tests/integration/replication.tcl @@ -316,15 +316,12 @@ foreach mdl {no yes} { stop_write_load $load_handle3 stop_write_load $load_handle4 - # Make sure that slaves and master have same - # number of keys - wait_for_condition 500 100 { - [$master dbsize] == [[lindex $slaves 0] dbsize] && - [$master dbsize] == [[lindex $slaves 1] dbsize] && - [$master dbsize] == [[lindex $slaves 2] dbsize] - } else { - fail "Different number of keys between master and replica after too long time." 
- } + # Make sure no more commands processed + wait_load_handlers_disconnected + + wait_for_ofs_sync $master [lindex $slaves 0] + wait_for_ofs_sync $master [lindex $slaves 1] + wait_for_ofs_sync $master [lindex $slaves 2] # Check digests set digest [$master debug digest] diff --git a/tests/support/util.tcl b/tests/support/util.tcl index 0eaee1ad0..0f185d57a 100644 --- a/tests/support/util.tcl +++ b/tests/support/util.tcl @@ -504,6 +504,14 @@ proc stop_write_load {handle} { catch {exec /bin/kill -9 $handle} } +proc wait_load_handlers_disconnected {{level 0}} { + wait_for_condition 50 100 { + ![string match {*name=LOAD_HANDLER*} [r $level client list]] + } else { + fail "load_handler(s) still connected after too long time." + } +} + proc K { x y } { set x } # Shuffle a list with Fisher-Yates algorithm. diff --git a/tests/unit/aofrw.tcl b/tests/unit/aofrw.tcl index 1a686a2fa..5bdf87256 100644 --- a/tests/unit/aofrw.tcl +++ b/tests/unit/aofrw.tcl @@ -41,15 +41,8 @@ start_server {tags {"aofrw"}} { stop_write_load $load_handle3 stop_write_load $load_handle4 - # Make sure that we remain the only connected client. - # This step is needed to make sure there are no pending writes - # that will be processed between the two "debug digest" calls. - wait_for_condition 50 100 { - [llength [split [string trim [r client list]] "\n"]] == 1 - } else { - puts [r client list] - fail "Clients generating loads are not disconnecting" - } + # Make sure no more commands processed, before taking debug digest + wait_load_handlers_disconnected # Get the data set digest set d1 [r debug digest]