From d746556e4fa5a4c6afaee1d869eea4b4f06d203c Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Mon, 17 Jun 2024 11:46:08 +0800
Subject: [PATCH] Only primary with slots has the right to mark a node as
 failed (#634)

In markNodeAsFailingIfNeeded we will count needed_quorum and failures,
needed_quorum is the half the cluster->size and plus one, and
cluster-size
is the size of primary node which contain slots, but when counting
failures, we dit not check if primary has slots.

Only the primary has slots that has the rights to vote, adding a new
clusterNodeIsVotingPrimary to formalize this concept.

Release notes:

bugfix where nodes not in the quorum group might spuriously mark nodes
as failed

---------

Signed-off-by: Binbin <binloveplay1314@qq.com>
Co-authored-by: Ping Xie <pingxie@outlook.com>
Signed-off-by: Ping Xie <pingxie@google.com>
---
 src/cluster.c                          | 24 +++++----
 tests/unit/cluster/failure-marking.tcl | 68 ++++++++++++++++++++++++++
 2 files changed, 83 insertions(+), 9 deletions(-)

diff --git a/src/cluster.c b/src/cluster.c
index 1be4de2c9..a9b1b9a64 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -43,6 +43,12 @@
 #include <math.h>
 #include <ctype.h>
 
+/* Only primaries that own slots have voting rights.
+ * Returns 1 if the node has voting rights, otherwise returns 0. */
+static inline int clusterNodeIsVotingPrimary(clusterNode *n) {
+    return (n->flags & CLUSTER_NODE_MASTER) && n->numslots;
+}
+
 /* A global reference to myself is handy to make code more clear.
  * Myself always points to server.cluster->myself, that is, the clusterNode
  * that represents this node. */
@@ -1961,8 +1967,8 @@ void markNodeAsFailingIfNeeded(clusterNode *node) {
     if (nodeFailed(node)) return; /* Already FAILing. */
 
     failures = clusterNodeFailureReportsCount(node);
-    /* Also count myself as a voter if I'm a master. */
-    if (nodeIsMaster(myself)) failures++;
+    /* Also count myself as a voter if I'm a voting primary. */
+    if (clusterNodeIsVotingPrimary(myself)) failures++;
     if (failures < needed_quorum) return; /* No weak agreement from masters. */
 
     serverLog(LL_NOTICE,
@@ -2005,7 +2011,7 @@ void clearNodeFailureIfNeeded(clusterNode *node) {
      * 1) The FAIL state is old enough.
      * 2) It is yet serving slots from our point of view (not failed over).
      * Apparently no one is going to fix these slots, clear the FAIL flag. */
-    if (nodeIsMaster(node) && node->numslots > 0 &&
+    if (clusterNodeIsVotingPrimary(node) &&
         (now - node->fail_time) >
         (server.cluster_node_timeout * CLUSTER_FAIL_UNDO_TIME_MULT))
     {
@@ -2154,8 +2160,8 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
         node = clusterLookupNode(g->nodename, CLUSTER_NAMELEN);
         if (node) {
             /* We already know this node.
-               Handle failure reports, only when the sender is a master. */
-            if (sender && nodeIsMaster(sender) && node != myself) {
+               Handle failure reports, only when the sender is a voting primary. */
+            if (sender && clusterNodeIsVotingPrimary(sender) && node != myself) {
                 if (flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) {
                     if (clusterNodeAddFailureReport(node,sender)) {
                         serverLog(LL_VERBOSE,
@@ -3198,10 +3204,10 @@ int clusterProcessPacket(clusterLink *link) {
         clusterSendFailoverAuthIfNeeded(sender,hdr);
     } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) {
         if (!sender) return 1;  /* We don't know that node. */
-        /* We consider this vote only if the sender is a master serving
+        /* We consider this vote only if the sender is a primary serving
          * a non zero number of slots, and its currentEpoch is greater or
          * equal to epoch where this node started the election. */
-        if (nodeIsMaster(sender) && sender->numslots > 0 &&
+        if (clusterNodeIsVotingPrimary(sender) &&
             senderCurrentEpoch >= server.cluster->failover_auth_epoch)
         {
             server.cluster->failover_auth_count++;
@@ -4803,7 +4809,7 @@ void clusterCron(void) {
                 node->flags |= CLUSTER_NODE_PFAIL;
                 update_state = 1;
             }
-            if (nodeIsMaster(myself) && server.cluster->size == 1) {
+            if (clusterNodeIsVotingPrimary(myself) && server.cluster->size == 1) {
                 markNodeAsFailingIfNeeded(node);
             } else {
                 serverLog(LL_DEBUG,"*** NODE %.40s possibly failing", node->name);
@@ -5081,7 +5087,7 @@ void clusterUpdateState(void) {
         while((de = dictNext(di)) != NULL) {
             clusterNode *node = dictGetVal(de);
 
-            if (nodeIsMaster(node) && node->numslots) {
+            if (clusterNodeIsVotingPrimary(node)) {
                 server.cluster->size++;
                 if ((node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) == 0)
                     reachable_masters++;
diff --git a/tests/unit/cluster/failure-marking.tcl b/tests/unit/cluster/failure-marking.tcl
index c4746c826..cfed7fff0 100644
--- a/tests/unit/cluster/failure-marking.tcl
+++ b/tests/unit/cluster/failure-marking.tcl
@@ -16,6 +16,8 @@ start_cluster 1 1 {tags {external:skip cluster}} {
         pause_process $replica1_pid
 
         wait_node_marked_fail 0 $replica1_instance_id
+
+        resume_process $replica1_pid
     }
 }
 
@@ -49,5 +51,71 @@ start_cluster 2 1 {tags {external:skip cluster}} {
         resume_process $primary2_pid
 
         wait_node_marked_fail 0 $replica1_instance_id
+
+        resume_process $replica1_pid
     }
 }
+
+set old_singledb $::singledb
+set ::singledb 1
+
+tags {external:skip tls:skip cluster} {
+    set base_conf [list cluster-enabled yes cluster-ping-interval 100 cluster-node-timeout 3000 save ""]
+    start_multiple_servers 5 [list overrides $base_conf] {
+        test "Only primary with slots has the right to mark a node as failed" {
+            set primary_host [srv 0 host]
+            set primary_port [srv 0 port]
+            set primary_pid [srv 0 pid]
+            set primary_id [R 0 CLUSTER MYID]
+            set replica_id [R 1 CLUSTER MYID]
+            set replica_pid [srv -1 pid]
+
+            # Meet others nodes.
+            R 1 CLUSTER MEET $primary_host $primary_port
+            R 2 CLUSTER MEET $primary_host $primary_port
+            R 3 CLUSTER MEET $primary_host $primary_port
+            R 4 CLUSTER MEET $primary_host $primary_port
+
+            # Build a single primary cluster.
+            cluster_allocate_slots 1 1
+            wait_for_cluster_propagation
+            R 1 CLUSTER REPLICATE $primary_id
+            wait_for_cluster_propagation
+            wait_for_cluster_state "ok"
+
+            # Pause the primary, marking the primary as pfail.
+            pause_process $primary_pid
+            wait_node_marked_pfail 1 $primary_id
+            wait_node_marked_pfail 2 $primary_id
+            wait_node_marked_pfail 3 $primary_id
+            wait_node_marked_pfail 4 $primary_id
+
+            # Pause the replica, marking the replica as pfail.
+            pause_process $replica_pid
+            wait_node_marked_pfail 2 $replica_id
+            wait_node_marked_pfail 3 $replica_id
+            wait_node_marked_pfail 4 $replica_id
+
+            # Resume the primary, marking the replica as fail.
+            resume_process $primary_pid
+            wait_node_marked_fail 0 $replica_id
+            wait_node_marked_fail 2 $replica_id
+            wait_node_marked_fail 3 $replica_id
+            wait_node_marked_fail 4 $replica_id
+
+            # Check if we got the right failure reports.
+            wait_for_condition 1000 50 {
+                [R 0 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 0 &&
+                [R 2 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1 &&
+                [R 3 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1 &&
+                [R 4 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1
+            } else {
+                fail "Cluster COUNT-FAILURE-REPORTS is not right."
+            }
+
+            resume_process $replica_pid
+        }
+    }
+}
+
+set ::singledb $old_singledb