From 5d97f5133c271313cd5a172617e8af1fe845e0e2 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Fri, 23 Aug 2024 16:21:53 +0800
Subject: [PATCH] Fix CLUSTER SETSLOT block and unblock error when all replicas
 are down (#879)

In the CLUSTER SETSLOT propagation logic, if the replicas are down, the
client gets blocked during command processing and is later unblocked
with `NOREPLICAS Not enough good replicas to write`.

The reason is that all (or some) replicas are down, but
myself->num_replicas counts every configured replica, so the client
blocks waiting for acks that can never arrive and the wait always
times out.

We should only wait for the online replicas; otherwise the propagation
wait will always time out because there are not enough replicas to ack.
The admin can easily check whether replicas have been down for an
extended period of time. If they decide to move forward anyway, we
should not block them. A replica that failed right before the
replication and was therefore not included in it would also be unlikely
to win an election.

Signed-off-by: Binbin
Co-authored-by: Ping Xie
---
 src/cluster_legacy.c                  | 24 ++++++++++++++--------
 tests/unit/cluster/slot-migration.tcl | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 423ea5aab..33ef7054e 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -6271,7 +6271,7 @@ void clusterCommandSetSlot(client *c) {
      * To mitigate this issue, the following order needs to be enforced for slot
      * migration finalization such that the replicas finalize the slot ownership
      * before the primary:
-     . *
+     *
      * 1. Client C issues SETSLOT n NODE B against node B.
      * 2. Primary B replicates `SETSLOT n NODE B` to all of its replicas (e.g., B', B'').
      * 3. Upon replication completion, primary B executes `SETSLOT n NODE B` and
@@ -6291,17 +6291,25 @@ void clusterCommandSetSlot(client *c) {
          * non-replicated behavior.*/
         listIter li;
         listNode *ln;
-        int legacy_replica_found = 0;
+        int num_eligible_replicas = 0;
         listRewind(server.replicas, &li);
         while ((ln = listNext(&li))) {
            client *r = ln->value;
-            if (r->replica_version < 0x702ff /* 7.2.255 */) {
-                legacy_replica_found++;
-                break;
+
+            /* When the command comes in, the primary only needs to wait for
+             * the online replicas. The admin can easily check whether replicas
+             * have been down for an extended period of time; if they decide to
+             * move forward anyway, we should not block them. A replica that
+             * failed right before the replication and was not included in it
+             * would also be unlikely to win an election.
+             *
+             * 0x702ff is 7.2.255; we only support newer versions here. */
+            if (r->repl_state == REPLICA_STATE_ONLINE && r->replica_version > 0x702ff) {
+                num_eligible_replicas++;
             }
         }
 
-        if (!legacy_replica_found) {
+        if (num_eligible_replicas != 0) {
             forceCommandPropagation(c, PROPAGATE_REPL);
             /* We are a primary and this is the first time we see this `SETSLOT`
              * command. Force-replicate the command to all of our replicas
@@ -6311,7 +6319,7 @@ void clusterCommandSetSlot(client *c) {
              * 2. The repl offset target is set to the primary's current repl offset + 1.
              *    There is no concern of partial replication because replicas always
              *    ack the repl offset at the command boundary. */
-            blockClientForReplicaAck(c, timeout_ms, server.primary_repl_offset + 1, myself->num_replicas, 0);
+            blockClientForReplicaAck(c, timeout_ms, server.primary_repl_offset + 1, num_eligible_replicas, 0);
             /* Mark client as pending command for execution after replication to replicas. */
             c->flag.pending_command = 1;
             replicationRequestAckFromReplicas();
@@ -6320,7 +6328,7 @@ void clusterCommandSetSlot(client *c) {
     }
 
     /* Slot states have been updated on the compatible replicas (if any).
-     * Now exuecte the command on the primary. */
+     * Now execute the command on the primary. */
     if (!strcasecmp(c->argv[3]->ptr, "migrating")) {
         serverLog(LL_NOTICE, "Migrating slot %d to node %.40s (%s)", slot, n->name, n->human_nodename);
         server.cluster->migrating_slots_to[slot] = n;
diff --git a/tests/unit/cluster/slot-migration.tcl b/tests/unit/cluster/slot-migration.tcl
index 030404dfd..d79897196 100644
--- a/tests/unit/cluster/slot-migration.tcl
+++ b/tests/unit/cluster/slot-migration.tcl
@@ -435,3 +435,35 @@ start_cluster 2 0 {tags {tls:skip external:skip cluster regression} overrides {c
         R 0 MIGRATE 127.0.0.1 [lindex [R 1 CONFIG GET port] 1] $stream_name 0 5000
     }
 }
+
+start_cluster 3 6 {tags {external:skip cluster} overrides {cluster-node-timeout 1000} } {
+    test "Slot migration is ok when the replicas are down" {
+        # Kill all replicas of primary 0.
+        assert_equal 2 [s 0 connected_slaves]
+        catch {R 3 shutdown nosave}
+        catch {R 6 shutdown nosave}
+        wait_for_condition 50 100 {
+            [s 0 connected_slaves] == 0
+        } else {
+            fail "The replicas of primary 0 are still connected"
+        }
+
+        # Kill one replica of primary 1.
+        assert_equal 2 [s -1 connected_slaves]
+        catch {R 4 shutdown nosave}
+        wait_for_condition 50 100 {
+            [s -1 connected_slaves] == 1
+        } else {
+            fail "The replica of primary 1 is still connected"
+        }
+
+        # Check that slot migration still works when replicas are down.
+        migrate_slot 0 1 0
+        migrate_slot 0 2 1
+        assert_equal {OK} [R 0 CLUSTER SETSLOT 0 NODE [R 1 CLUSTER MYID]]
+        assert_equal {OK} [R 0 CLUSTER SETSLOT 1 NODE [R 2 CLUSTER MYID]]
+        wait_for_slot_state 0 ""
+        wait_for_slot_state 1 ""
+        wait_for_slot_state 2 ""
+    }
+}
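
Note (not part of the patch): below is a minimal, self-contained sketch of the counting rule the cluster_legacy.c hunk introduces. The replica_t struct and count_eligible_replicas() helper are illustrative stand-ins, not server code; the real implementation walks server.replicas, checks r->repl_state and r->replica_version, and passes the resulting count to blockClientForReplicaAck() instead of myself->num_replicas.

#include <stdio.h>

/* Illustrative stand-in for a replica connection. The real server iterates
 * server.replicas and reads r->repl_state / r->replica_version. */
typedef struct {
    int online;           /* nonzero when the replication link is ONLINE */
    unsigned int version; /* encoded as 0xMMmmpp, e.g. 7.2.255 == 0x702ff */
} replica_t;

/* Count replicas that are both online and newer than 7.2.255 (0x702ff),
 * mirroring the eligibility check added by the patch. */
static int count_eligible_replicas(const replica_t *replicas, int n) {
    int eligible = 0;
    for (int i = 0; i < n; i++) {
        if (replicas[i].online && replicas[i].version > 0x702ff) eligible++;
    }
    return eligible;
}

int main(void) {
    /* Two configured replicas, one of them down: before the patch the primary
     * waited for 2 acks (myself->num_replicas) and the client eventually
     * failed with NOREPLICAS; with the patch it waits for 1 ack. */
    replica_t replicas[] = {
        {.online = 1, .version = 0x80000}, /* 8.0.0, online  */
        {.online = 0, .version = 0x80000}, /* 8.0.0, offline */
    };
    int eligible = count_eligible_replicas(replicas, 2);
    if (eligible != 0) {
        printf("block the client until %d replica ack(s) arrive\n", eligible);
    } else {
        printf("no eligible replicas: run SETSLOT on the primary without waiting\n");
    }
    return 0;
}

With every replica down the count is zero, so the primary skips the replication wait entirely and executes SETSLOT directly, which is the scenario the new slot-migration.tcl test exercises.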