Print logs when the cluster state changes to fail or the fail reason changes (#1188)

This log allows us to easily distinguish between a not-full-coverage
failure and a minority partition when the cluster fails. Sometimes it
is not easy to see that a node is in a minority partition when its
shards look healthy (both primaries and replicas).

We decided not to add a cluster_fail_reason field to CLUSTER INFO,
since there are only two reasons and both are well known. If we end
up adding more reasons down the road, we can add the field then.
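For illustration only (the pid/role/timestamp prefix below is a made-up
sample; the message text comes from clusterLogFailReason in this change),
a not-full-coverage failure would now log roughly:

    1234:M 02 Dec 2024 15:55:24.000 # Cluster state changed: fail
    1234:M 02 Dec 2024 15:55:24.000 # Cluster is currently down: At least one hash slot is not served by any available node. Please check the 'cluster-require-full-coverage' configuration.

while a minority partition would instead log "Cluster is currently
down: I am part of a minority partition."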

Signed-off-by: Binbin <binloveplay1314@qq.com>
Binbin 2024-12-02 15:55:24 +08:00 committed by GitHub
parent 90475af594
commit fbbfe5d3d3
4 changed files with 67 additions and 2 deletions

View File

@@ -12,6 +12,12 @@
#define CLUSTER_FAIL 1 /* The cluster can't work */
#define CLUSTER_NAMELEN 40 /* sha1 hex length */

/* Reason why the cluster state changes to fail. When adding new reasons,
 * make sure to update clusterLogFailReason. */
#define CLUSTER_FAIL_NONE 0
#define CLUSTER_FAIL_NOT_FULL_COVERAGE 1
#define CLUSTER_FAIL_MINORITY_PARTITION 2

/* Redirection errors returned by getNodeByQuery(). */
#define CLUSTER_REDIR_NONE 0 /* Node can serve the request. */
#define CLUSTER_REDIR_CROSS_SLOT 1 /* -CROSSSLOT request. */

View File

@@ -1082,6 +1082,7 @@ void clusterInit(void) {
    server.cluster->myself = NULL;
    server.cluster->currentEpoch = 0;
    server.cluster->state = CLUSTER_FAIL;
    server.cluster->fail_reason = CLUSTER_FAIL_NONE;
    server.cluster->size = 0;
    server.cluster->todo_before_sleep = 0;
    server.cluster->nodes = dictCreate(&clusterNodesDictType);
@@ -4493,7 +4494,7 @@ void clusterLogCantFailover(int reason) {
    case CLUSTER_CANT_FAILOVER_WAITING_DELAY: msg = "Waiting the delay before I can start a new failover."; break;
    case CLUSTER_CANT_FAILOVER_EXPIRED: msg = "Failover attempt expired."; break;
    case CLUSTER_CANT_FAILOVER_WAITING_VOTES: msg = "Waiting for votes, but majority still not reached."; break;
    default: msg = "Unknown reason code."; break;
    default: serverPanic("Unknown cant failover reason code.");
    }
    lastlog_time = time(NULL);
    serverLog(LL_NOTICE, "Currently unable to failover: %s", msg);
@@ -5362,6 +5363,23 @@ void clusterCloseAllSlots(void) {
 * Cluster state evaluation function
 * -------------------------------------------------------------------------- */

void clusterLogFailReason(int reason) {
    if (reason == CLUSTER_FAIL_NONE) return;
    char *msg;
    switch (reason) {
    case CLUSTER_FAIL_NOT_FULL_COVERAGE:
        msg = "At least one hash slot is not served by any available node. "
              "Please check the 'cluster-require-full-coverage' configuration.";
        break;
    case CLUSTER_FAIL_MINORITY_PARTITION:
        msg = "I am part of a minority partition.";
        break;
    default: serverPanic("Unknown fail reason code.");
    }
    serverLog(LL_WARNING, "Cluster is currently down: %s", msg);
}

/* The following are defines that are only used in the evaluation function
 * and are based on heuristics. Actually the main point about the rejoin and
 * writable delay is that they should be a few orders of magnitude larger
@@ -5371,7 +5389,7 @@ void clusterCloseAllSlots(void) {
#define CLUSTER_WRITABLE_DELAY 2000

void clusterUpdateState(void) {
    int j, new_state;
    int j, new_state, new_reason;
    int reachable_primaries = 0;
    static mstime_t among_minority_time;
    static mstime_t first_call_time = 0;
@@ -5392,12 +5410,14 @@
    /* Start assuming the state is OK. We'll turn it into FAIL if there
     * are the right conditions. */
    new_state = CLUSTER_OK;
    new_reason = CLUSTER_FAIL_NONE;

    /* Check if all the slots are covered. */
    if (server.cluster_require_full_coverage) {
        for (j = 0; j < CLUSTER_SLOTS; j++) {
            if (server.cluster->slots[j] == NULL || server.cluster->slots[j]->flags & (CLUSTER_NODE_FAIL)) {
                new_state = CLUSTER_FAIL;
                new_reason = CLUSTER_FAIL_NOT_FULL_COVERAGE;
                break;
            }
        }
@@ -5432,6 +5452,7 @@
        if (reachable_primaries < needed_quorum) {
            new_state = CLUSTER_FAIL;
            new_reason = CLUSTER_FAIL_MINORITY_PARTITION;
            among_minority_time = mstime();
        }
    }
@@ -5455,7 +5476,21 @@
        serverLog(new_state == CLUSTER_OK ? LL_NOTICE : LL_WARNING, "Cluster state changed: %s",
                  new_state == CLUSTER_OK ? "ok" : "fail");
        server.cluster->state = new_state;

        /* Cluster state changes from ok to fail, print a log. */
        if (new_state == CLUSTER_FAIL) {
            clusterLogFailReason(new_reason);
            server.cluster->fail_reason = new_reason;
        }
    }

    /* Cluster state is still fail, but the reason has changed, print a log. */
    if (new_state == CLUSTER_FAIL && new_reason != server.cluster->fail_reason) {
        clusterLogFailReason(new_reason);
        server.cluster->fail_reason = new_reason;
    }

    if (new_state == CLUSTER_OK) server.cluster->fail_reason = CLUSTER_FAIL_NONE;
}
/* This function is called after the node startup in order to verify that data

View File

@@ -370,6 +370,7 @@ struct clusterState {
    clusterNode *myself; /* This node */
    uint64_t currentEpoch;
    int state;           /* CLUSTER_OK, CLUSTER_FAIL, ... */
    int fail_reason;     /* Why the cluster state changes to fail. */
    int size;            /* Num of primary nodes with at least one slot */
    dict *nodes;         /* Hash table of name -> clusterNode structures */
    dict *shards;        /* Hash table of shard_id -> list (of nodes) structures */

View File

@@ -41,3 +41,26 @@ test "errorstats: rejected call due to MOVED Redirection" {
    }
} ;# start_cluster

start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000}} {
    test "fail reason changed" {
        # Kill one primary, so that the cluster fails with not-full-coverage.
        pause_process [srv 0 pid]
        wait_for_condition 1000 50 {
            [CI 1 cluster_state] eq {fail} &&
            [CI 2 cluster_state] eq {fail}
        } else {
            fail "Cluster doesn't fail"
        }
        verify_log_message -1 "*At least one hash slot is not served by any available node*" 0
        verify_log_message -2 "*At least one hash slot is not served by any available node*" 0

        # Kill one more primary, so that the cluster fails with minority-partition.
        pause_process [srv -1 pid]
        wait_for_log_messages -2 {"*minority partition*"} 0 1000 50

        resume_process [srv 0 pid]
        resume_process [srv -1 pid]
        wait_for_cluster_state ok
    }
}