From 7e5ded2ad0521600ceb57d71b0dbb19bbbc087b7 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sun, 23 Jan 2022 19:54:50 +0800 Subject: [PATCH] Fix timing issue in sentinel CKQUORUM test (#10036) A test failure was reported in Daily CI (test-centos7-tls): `CKQUORUM detects failover authorization cannot be reached`. ``` CKQUORUM detects failover authorization cannot be reached: FAILED: Expected 'invalid command name "OK 4 usable Sentinels. Quorum and failover authorization can be reached"' to match '*NOQUORUM*' ``` It seems that the current sentinel has not yet confirmed that the other sentinels are actually `down` at the moment the test checks the quorum. It takes at least 3 seconds on my machine, so there is a timing issue with the hard-coded `after 5000`. In this commit, we poll `SENTINEL CKQUORUM mymaster` until it returns an error, which ensures that the other sentinels are actually `down` from the point of view of the current sentinel. This solves the timing issue caused by the sentinel monitoring mechanism. --- tests/sentinel/tests/06-ckquorum.tcl | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/sentinel/tests/06-ckquorum.tcl b/tests/sentinel/tests/06-ckquorum.tcl index 4ea4e55d8..36c3dc650 100644 --- a/tests/sentinel/tests/06-ckquorum.tcl +++ b/tests/sentinel/tests/06-ckquorum.tcl @@ -24,9 +24,16 @@ test "CKQUORUM detects failover authorization cannot be reached" { kill_instance sentinel [expr {$i + 1}] } - after 5000 - catch {[S 0 SENTINEL CKQUORUM mymaster]} err - assert_match "*NOQUORUM*" $err + # We need to make sure that other sentinels are in `DOWN` state + # from the point of view of S 0 before we executing `CKQUORUM`. + wait_for_condition 300 50 { + [catch {S 0 SENTINEL CKQUORUM mymaster}] == 1 + } else { + fail "At least $orig_quorum sentinels did not enter the down state." + } + + assert_error "*NOQUORUM*" {S 0 SENTINEL CKQUORUM mymaster} + S 0 SENTINEL SET mymaster quorum $orig_quorum for {set i 0} {$i < $orig_quorum} {incr i} { restart_instance sentinel [expr {$i + 1}]