Print logs when the cluster state changes to fail or the fail reason changes (#1188)
This log allows us to easily distinguish between a full-coverage failure and a minority partition when the cluster fails. A minority partition is otherwise hard to spot when the shards themselves look healthy (both primaries and replicas are up). We decided not to add a cluster_fail_reason field to CLUSTER INFO: there are only two reasons, both are well known, and if more show up down the road the field can still be added then.

Signed-off-by: Binbin <binloveplay1314@qq.com>
parent 90475af594
commit fbbfe5d3d3
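With this change, a node that fails now logs the reason right after the state change; a sketch of what the log would look like (pid, role, and timestamp are illustrative, the messages come from the diff below):

    11213:M 20 Nov 2024 10:00:00.000 # Cluster state changed: fail
    11213:M 20 Nov 2024 10:00:00.000 # Cluster is currently down: I am part of a minority partition.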
@@ -12,6 +12,12 @@
 #define CLUSTER_FAIL 1      /* The cluster can't work */
 #define CLUSTER_NAMELEN 40  /* sha1 hex length */
 
+/* Reason why the cluster state changes to fail. When adding new reasons,
+ * make sure to update clusterLogFailReason. */
+#define CLUSTER_FAIL_NONE 0
+#define CLUSTER_FAIL_NOT_FULL_COVERAGE 1
+#define CLUSTER_FAIL_MINORITY_PARTITION 2
+
 /* Redirection errors returned by getNodeByQuery(). */
 #define CLUSTER_REDIR_NONE 0       /* Node can serve the request. */
 #define CLUSTER_REDIR_CROSS_SLOT 1 /* -CROSSSLOT request. */
@@ -1082,6 +1082,7 @@ void clusterInit(void) {
     server.cluster->myself = NULL;
     server.cluster->currentEpoch = 0;
     server.cluster->state = CLUSTER_FAIL;
+    server.cluster->fail_reason = CLUSTER_FAIL_NONE;
     server.cluster->size = 0;
     server.cluster->todo_before_sleep = 0;
     server.cluster->nodes = dictCreate(&clusterNodesDictType);
@@ -4493,7 +4494,7 @@ void clusterLogCantFailover(int reason) {
     case CLUSTER_CANT_FAILOVER_WAITING_DELAY: msg = "Waiting the delay before I can start a new failover."; break;
     case CLUSTER_CANT_FAILOVER_EXPIRED: msg = "Failover attempt expired."; break;
     case CLUSTER_CANT_FAILOVER_WAITING_VOTES: msg = "Waiting for votes, but majority still not reached."; break;
-    default: msg = "Unknown reason code."; break;
+    default: serverPanic("Unknown cant failover reason code.");
     }
     lastlog_time = time(NULL);
     serverLog(LL_NOTICE, "Currently unable to failover: %s", msg);
@@ -5362,6 +5363,23 @@ void clusterCloseAllSlots(void) {
  * Cluster state evaluation function
  * -------------------------------------------------------------------------- */
 
+void clusterLogFailReason(int reason) {
+    if (reason == CLUSTER_FAIL_NONE) return;
+
+    char *msg;
+    switch (reason) {
+    case CLUSTER_FAIL_NOT_FULL_COVERAGE:
+        msg = "At least one hash slot is not served by any available node. "
+              "Please check the 'cluster-require-full-coverage' configuration.";
+        break;
+    case CLUSTER_FAIL_MINORITY_PARTITION:
+        msg = "I am part of a minority partition.";
+        break;
+    default: serverPanic("Unknown fail reason code.");
+    }
+    serverLog(LL_WARNING, "Cluster is currently down: %s", msg);
+}
+
 /* The following are defines that are only used in the evaluation function
  * and are based on heuristics. Actually the main point about the rejoin and
  * writable delay is that they should be a few orders of magnitude larger
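Since the reason is deliberately not exposed as a CLUSTER INFO field, operators get it only from this log; querying a failing node still shows just the state, along these lines (output truncated):

    127.0.0.1:6379> CLUSTER INFO
    cluster_state:fail
    cluster_slots_assigned:16384
    ...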
@@ -5371,7 +5389,7 @@ void clusterCloseAllSlots(void) {
 #define CLUSTER_WRITABLE_DELAY 2000
 
 void clusterUpdateState(void) {
-    int j, new_state;
+    int j, new_state, new_reason;
     int reachable_primaries = 0;
     static mstime_t among_minority_time;
     static mstime_t first_call_time = 0;
@@ -5392,12 +5410,14 @@ void clusterUpdateState(void) {
     /* Start assuming the state is OK. We'll turn it into FAIL if there
      * are the right conditions. */
     new_state = CLUSTER_OK;
+    new_reason = CLUSTER_FAIL_NONE;
 
     /* Check if all the slots are covered. */
     if (server.cluster_require_full_coverage) {
         for (j = 0; j < CLUSTER_SLOTS; j++) {
             if (server.cluster->slots[j] == NULL || server.cluster->slots[j]->flags & (CLUSTER_NODE_FAIL)) {
                 new_state = CLUSTER_FAIL;
+                new_reason = CLUSTER_FAIL_NOT_FULL_COVERAGE;
                 break;
             }
         }
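Note that the coverage check above only runs when cluster-require-full-coverage is enabled, which is the default; a minimal configuration excerpt for reference:

    cluster-enabled yes
    cluster-require-full-coverage yes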
@@ -5432,6 +5452,7 @@ void clusterUpdateState(void) {
 
         if (reachable_primaries < needed_quorum) {
             new_state = CLUSTER_FAIL;
+            new_reason = CLUSTER_FAIL_MINORITY_PARTITION;
             among_minority_time = mstime();
         }
     }
@@ -5455,7 +5476,21 @@ void clusterUpdateState(void) {
         serverLog(new_state == CLUSTER_OK ? LL_NOTICE : LL_WARNING, "Cluster state changed: %s",
                   new_state == CLUSTER_OK ? "ok" : "fail");
         server.cluster->state = new_state;
+
+        /* Cluster state changes from ok to fail, print a log. */
+        if (new_state == CLUSTER_FAIL) {
+            clusterLogFailReason(new_reason);
+            server.cluster->fail_reason = new_reason;
+        }
     }
+
+    /* Cluster state is still fail, but the reason has changed, print a log. */
+    if (new_state == CLUSTER_FAIL && new_reason != server.cluster->fail_reason) {
+        clusterLogFailReason(new_reason);
+        server.cluster->fail_reason = new_reason;
+    }
+
+    if (new_state == CLUSTER_OK) server.cluster->fail_reason = CLUSTER_FAIL_NONE;
 }
 
 /* This function is called after the node startup in order to verify that data
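Taken together, the two new blocks cover both transitions: the first logs the reason when the state flips from ok to fail, the second re-logs when the state stays fail but the reason changes, and resetting fail_reason on recovery ensures the next failure is logged again. In the scenario exercised by the test below (lose slot coverage first, then quorum), the expected sequence is roughly (log prefixes omitted):

    # Cluster state changed: fail
    # Cluster is currently down: At least one hash slot is not served by any available node. Please check the 'cluster-require-full-coverage' configuration.
    # Cluster is currently down: I am part of a minority partition.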
@@ -370,6 +370,7 @@ struct clusterState {
     clusterNode *myself;  /* This node */
     uint64_t currentEpoch;
     int state;            /* CLUSTER_OK, CLUSTER_FAIL, ... */
+    int fail_reason;      /* Why the cluster state changes to fail. */
     int size;             /* Num of primary nodes with at least one slot */
     dict *nodes;          /* Hash table of name -> clusterNode structures */
     dict *shards;         /* Hash table of shard_id -> list (of nodes) structures */
@@ -41,3 +41,26 @@ test "errorstats: rejected call due to MOVED Redirection" {
     }
 } ;# start_cluster
+
+start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000}} {
+    test "fail reason changed" {
+        # Kill one primary, so the cluster fails with not-full-coverage.
+        pause_process [srv 0 pid]
+        wait_for_condition 1000 50 {
+            [CI 1 cluster_state] eq {fail} &&
+            [CI 2 cluster_state] eq {fail}
+        } else {
+            fail "Cluster doesn't fail"
+        }
+        verify_log_message -1 "*At least one hash slot is not served by any available node*" 0
+        verify_log_message -2 "*At least one hash slot is not served by any available node*" 0
+
+        # Kill one more primary, so the cluster fails with minority-partition.
+        pause_process [srv -1 pid]
+        wait_for_log_messages -2 {"*minority partition*"} 0 1000 50
+
+        resume_process [srv 0 pid]
+        resume_process [srv -1 pid]
+        wait_for_cluster_state ok
+    }
+}
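Assuming the test is added to tests/unit/cluster/info.tcl (inferred from the errorstats test in the surrounding hunk, so treat the path as an assumption), it should be runnable with the standard test runner:

    ./runtest --single unit/cluster/info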