Binbin fbbfe5d3d3
Print logs when the cluster state changes to fail or the fail reason changes (#1188)
This log allows us to easily distinguish between a lack of full coverage
and a minority partition when the cluster fails. Sometimes it is not easy
to notice the minority partition from within a healthy shard (both primary
and replicas).

We decided not to add a cluster_fail_reason field to cluster info, given
that there are only two reasons and both are well known. If we end up
adding more reasons down the road, we can add the field in the future.

Signed-off-by: Binbin <binloveplay1314@qq.com>
2024-12-02 15:55:24 +08:00

67 lines
2.1 KiB
Tcl

# Cluster info stats: error and command statistics on a two-primary cluster.
start_cluster 2 0 {tags {external:skip cluster}} {

    test "Cluster should start ok" {
        wait_for_cluster_state ok
    }

    # Client handles for the two primaries of this cluster.
    set primary1 [srv 0 "client"]
    set primary2 [srv -1 "client"]

    # Wrapper around cmdrstat that takes the client handle first and the
    # command name second. The result of cmdrstat is the proc result.
    proc cmdstat {instance cmd} {
        cmdrstat $cmd $instance
    }

    # Wrapper around errorrstat that takes the client handle first and the
    # error name second. The result of errorrstat is the proc result.
    proc errorstat {instance cmd} {
        errorrstat $cmd $instance
    }

    test "errorstats: rejected call due to MOVED Redirection" {
        # Start from clean statistics on both primaries.
        $primary1 config resetstat
        $primary2 config resetstat
        assert_match {} [errorstat $primary1 MOVED]
        assert_match {} [errorstat $primary2 MOVED]

        # The same key is written through both primaries: one of them is
        # expected to answer OK and the other to reply with MOVED.
        catch {$primary1 set key b} p1_reply
        catch {$primary2 set key b} p2_reply

        # Work out which primary accepted the write and which redirected it.
        if {$p1_reply ne {OK}} {
            assert_match {MOVED*} $p1_reply
            set accepted_by $primary2
            set redirected_by $primary1
        } else {
            assert_match {MOVED*} $p2_reply
            set accepted_by $primary1
            set redirected_by $primary2
        }

        # Only the redirecting primary should have recorded a MOVED error,
        # and its SET must show up as a rejected call, not an executed one.
        assert_match {} [errorstat $accepted_by MOVED]
        assert_match {*count=1*} [errorstat $redirected_by MOVED]
        assert_match {*calls=0,*,rejected_calls=1,failed_calls=0} [cmdstat $redirected_by set]
    }

} ;# end of start_cluster 2 0
# Fail-reason logging on a three-primary cluster with a 1000 ms node timeout.
start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000}} {
    test "fail reason changed" {
        # Log pattern expected when the cluster fails for lack of full coverage.
        set uncovered_slot_log "*At least one hash slot is not served by any available node*"

        # Pause the first primary so that the cluster fails with
        # not-full-coverage, then wait until the other two nodes report it.
        pause_process [srv 0 pid]
        wait_for_condition 1000 50 {
            [CI 1 cluster_state] eq {fail} && [CI 2 cluster_state] eq {fail}
        } else {
            fail "Cluster doesn't fail"
        }

        # Both nodes that are still running must have logged the reason.
        foreach running_srv {-1 -2} {
            verify_log_message $running_srv $uncovered_slot_log 0
        }

        # Pause a second primary so that the fail reason switches to
        # minority-partition; the last running node should log the change.
        pause_process [srv -1 pid]
        wait_for_log_messages -2 {"*minority partition*"} 0 1000 50

        # Resume both paused primaries and wait for the cluster to recover.
        foreach paused_srv {0 -1} {
            resume_process [srv $paused_srv pid]
        }
        wait_for_cluster_state ok
    }
}