Print logs when the cluster state changes to fail or the fail reason changes (#1188)

This log allows us to easily distinguish between a not-full-coverage
failure and a minority partition when the cluster fails. Sometimes it
is not easy to see that a node is in a minority partition when its
shards look healthy (both primaries and replicas).

We decided not to add a cluster_fail_reason field to CLUSTER INFO,
since there are only two reasons and both are well known. If we end
up adding more reasons down the road, we can add the field then.
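For illustration only (the pid/role/timestamp prefix below is a made-up
sample; the message text comes from clusterLogFailReason in this change),
a not-full-coverage failure would now log roughly:

    1234:M 02 Dec 2024 15:55:24.000 # Cluster state changed: fail
    1234:M 02 Dec 2024 15:55:24.000 # Cluster is currently down: At least one hash slot is not served by any available node. Please check the 'cluster-require-full-coverage' configuration.

while a minority partition would instead log "Cluster is currently
down: I am part of a minority partition."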

Signed-off-by: Binbin <binloveplay1314@qq.com>
Binbin 2024-12-02 15:55:24 +08:00 committed by GitHub
parent 90475af594
commit fbbfe5d3d3
4 changed files with 67 additions and 2 deletions

View File

@@ -12,6 +12,12 @@
#define CLUSTER_FAIL 1 /* The cluster can't work */
#define CLUSTER_NAMELEN 40 /* sha1 hex length */

/* Reason why the cluster state changes to fail. When adding new reasons,
 * make sure to update clusterLogFailReason. */
#define CLUSTER_FAIL_NONE 0
#define CLUSTER_FAIL_NOT_FULL_COVERAGE 1
#define CLUSTER_FAIL_MINORITY_PARTITION 2

/* Redirection errors returned by getNodeByQuery(). */
#define CLUSTER_REDIR_NONE 0 /* Node can serve the request. */
#define CLUSTER_REDIR_CROSS_SLOT 1 /* -CROSSSLOT request. */

View File

@@ -1082,6 +1082,7 @@ void clusterInit(void) {
    server.cluster->myself = NULL;
    server.cluster->currentEpoch = 0;
    server.cluster->state = CLUSTER_FAIL;
    server.cluster->fail_reason = CLUSTER_FAIL_NONE;
    server.cluster->size = 0;
    server.cluster->todo_before_sleep = 0;
    server.cluster->nodes = dictCreate(&clusterNodesDictType);
@@ -4493,7 +4494,7 @@ void clusterLogCantFailover(int reason) {
    case CLUSTER_CANT_FAILOVER_WAITING_DELAY: msg = "Waiting the delay before I can start a new failover."; break;
    case CLUSTER_CANT_FAILOVER_EXPIRED: msg = "Failover attempt expired."; break;
    case CLUSTER_CANT_FAILOVER_WAITING_VOTES: msg = "Waiting for votes, but majority still not reached."; break;
    default: msg = "Unknown reason code."; break;
    default: serverPanic("Unknown cant failover reason code.");
    }
    lastlog_time = time(NULL);
    serverLog(LL_NOTICE, "Currently unable to failover: %s", msg);
@@ -5362,6 +5363,23 @@ void clusterCloseAllSlots(void) {
 * Cluster state evaluation function
 * -------------------------------------------------------------------------- */

void clusterLogFailReason(int reason) {
    if (reason == CLUSTER_FAIL_NONE) return;
    char *msg;
    switch (reason) {
    case CLUSTER_FAIL_NOT_FULL_COVERAGE:
        msg = "At least one hash slot is not served by any available node. "
              "Please check the 'cluster-require-full-coverage' configuration.";
        break;
    case CLUSTER_FAIL_MINORITY_PARTITION:
        msg = "I am part of a minority partition.";
        break;
    default: serverPanic("Unknown fail reason code.");
    }
    serverLog(LL_WARNING, "Cluster is currently down: %s", msg);
}

/* The following are defines that are only used in the evaluation function
 * and are based on heuristics. Actually the main point about the rejoin and
 * writable delay is that they should be a few orders of magnitude larger
@@ -5371,7 +5389,7 @@ void clusterCloseAllSlots(void) {
#define CLUSTER_WRITABLE_DELAY 2000

void clusterUpdateState(void) {
    int j, new_state;
    int j, new_state, new_reason;
    int reachable_primaries = 0;
    static mstime_t among_minority_time;
    static mstime_t first_call_time = 0;
@@ -5392,12 +5410,14 @@
    /* Start assuming the state is OK. We'll turn it into FAIL if there
     * are the right conditions. */
    new_state = CLUSTER_OK;
    new_reason = CLUSTER_FAIL_NONE;

    /* Check if all the slots are covered. */
    if (server.cluster_require_full_coverage) {
        for (j = 0; j < CLUSTER_SLOTS; j++) {
            if (server.cluster->slots[j] == NULL || server.cluster->slots[j]->flags & (CLUSTER_NODE_FAIL)) {
                new_state = CLUSTER_FAIL;
                new_reason = CLUSTER_FAIL_NOT_FULL_COVERAGE;
                break;
            }
        }
@@ -5432,6 +5452,7 @@
        if (reachable_primaries < needed_quorum) {
            new_state = CLUSTER_FAIL;
            new_reason = CLUSTER_FAIL_MINORITY_PARTITION;
            among_minority_time = mstime();
        }
    }
@@ -5455,7 +5476,21 @@
        serverLog(new_state == CLUSTER_OK ? LL_NOTICE : LL_WARNING, "Cluster state changed: %s",
                  new_state == CLUSTER_OK ? "ok" : "fail");
        server.cluster->state = new_state;

        /* Cluster state changes from ok to fail, print a log. */
        if (new_state == CLUSTER_FAIL) {
            clusterLogFailReason(new_reason);
            server.cluster->fail_reason = new_reason;
        }
    }

    /* Cluster state is still fail, but the reason has changed, print a log. */
    if (new_state == CLUSTER_FAIL && new_reason != server.cluster->fail_reason) {
        clusterLogFailReason(new_reason);
        server.cluster->fail_reason = new_reason;
    }

    if (new_state == CLUSTER_OK) server.cluster->fail_reason = CLUSTER_FAIL_NONE;
}
/* This function is called after the node startup in order to verify that data

View File

@@ -370,6 +370,7 @@ struct clusterState {
    clusterNode *myself; /* This node */
    uint64_t currentEpoch;
    int state;           /* CLUSTER_OK, CLUSTER_FAIL, ... */
    int fail_reason;     /* Why the cluster state changes to fail. */
    int size;            /* Num of primary nodes with at least one slot */
    dict *nodes;         /* Hash table of name -> clusterNode structures */
    dict *shards;        /* Hash table of shard_id -> list (of nodes) structures */

View File

@@ -41,3 +41,26 @@ test "errorstats: rejected call due to MOVED Redirection" {
    }
} ;# start_cluster

start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000}} {
    test "fail reason changed" {
        # Kill one primary, so that the cluster fails with not-full-coverage.
        pause_process [srv 0 pid]
        wait_for_condition 1000 50 {
            [CI 1 cluster_state] eq {fail} &&
            [CI 2 cluster_state] eq {fail}
        } else {
            fail "Cluster doesn't fail"
        }
        verify_log_message -1 "*At least one hash slot is not served by any available node*" 0
        verify_log_message -2 "*At least one hash slot is not served by any available node*" 0

        # Kill one more primary, so that the cluster fails with minority-partition.
        pause_process [srv -1 pid]
        wait_for_log_messages -2 {"*minority partition*"} 0 1000 50

        resume_process [srv 0 pid]
        resume_process [srv -1 pid]
        wait_for_cluster_state ok
    }
}