Fix CLUSTER SETSLOT block and unblock error when all replicas are down (#879)
In the CLUSTER SETSLOT propagation logic, if the replicas are down, the client gets blocked during command processing and is later unblocked with the error `NOREPLICAS Not enough good replicas to write`. The reason is that some (or all) of the replicas are down, while `myself->num_replicas` counts all replicas, so the client blocks and always times out. We should only wait for the online replicas; otherwise the wait for propagation will always time out because there are not enough replicas available. The admin can easily check whether any replicas have been down for an extended period of time; if they decide to move forward anyway, we should not block them. If a replica failed right before the replication and was therefore not included in it, it would also be unlikely to win an election. Signed-off-by: Binbin <binloveplay1314@qq.com> Co-authored-by: Ping Xie <pingxie@google.com>
This commit is contained in:
parent
0a11c4a140
commit
5d97f5133c
@ -6271,7 +6271,7 @@ void clusterCommandSetSlot(client *c) {
|
||||
* To mitigate this issue, the following order needs to be enforced for slot
|
||||
* migration finalization such that the replicas finalize the slot ownership
|
||||
* before the primary:
|
||||
. *
|
||||
*
|
||||
* 1. Client C issues SETSLOT n NODE B against node B.
|
||||
* 2. Primary B replicates `SETSLOT n NODE B` to all of its replicas (e.g., B', B'').
|
||||
* 3. Upon replication completion, primary B executes `SETSLOT n NODE B` and
|
||||
@ -6291,17 +6291,25 @@ void clusterCommandSetSlot(client *c) {
|
||||
* non-replicated behavior.*/
|
||||
listIter li;
|
||||
listNode *ln;
|
||||
int legacy_replica_found = 0;
|
||||
int num_eligible_replicas = 0;
|
||||
listRewind(server.replicas, &li);
|
||||
while ((ln = listNext(&li))) {
|
||||
client *r = ln->value;
|
||||
if (r->replica_version < 0x702ff /* 7.2.255 */) {
|
||||
legacy_replica_found++;
|
||||
break;
|
||||
|
||||
/* We think that when the command comes in, the primary only needs to
|
||||
* wait for the online replicas. The admin can easily check if there
|
||||
* are replicas that are down for an extended period of time. If they
|
||||
* decide to move forward anyways, we should not block it. If a replica
|
||||
* failed right before the replication and was not included in the
|
||||
* replication, it would also unlikely win the election.
|
||||
*
|
||||
* And 0x702ff is 7.2.255, we only support new versions in this case. */
|
||||
if (r->repl_state == REPLICA_STATE_ONLINE && r->replica_version > 0x702ff) {
|
||||
num_eligible_replicas++;
|
||||
}
|
||||
}
|
||||
|
||||
if (!legacy_replica_found) {
|
||||
if (num_eligible_replicas != 0) {
|
||||
forceCommandPropagation(c, PROPAGATE_REPL);
|
||||
/* We are a primary and this is the first time we see this `SETSLOT`
|
||||
* command. Force-replicate the command to all of our replicas
|
||||
@ -6311,7 +6319,7 @@ void clusterCommandSetSlot(client *c) {
|
||||
* 2. The repl offset target is set to the primary's current repl offset + 1.
|
||||
* There is no concern of partial replication because replicas always
|
||||
* ack the repl offset at the command boundary. */
|
||||
blockClientForReplicaAck(c, timeout_ms, server.primary_repl_offset + 1, myself->num_replicas, 0);
|
||||
blockClientForReplicaAck(c, timeout_ms, server.primary_repl_offset + 1, num_eligible_replicas, 0);
|
||||
/* Mark client as pending command for execution after replication to replicas. */
|
||||
c->flag.pending_command = 1;
|
||||
replicationRequestAckFromReplicas();
|
||||
@ -6320,7 +6328,7 @@ void clusterCommandSetSlot(client *c) {
|
||||
}
|
||||
|
||||
/* Slot states have been updated on the compatible replicas (if any).
|
||||
* Now exuecte the command on the primary. */
|
||||
* Now execute the command on the primary. */
|
||||
if (!strcasecmp(c->argv[3]->ptr, "migrating")) {
|
||||
serverLog(LL_NOTICE, "Migrating slot %d to node %.40s (%s)", slot, n->name, n->human_nodename);
|
||||
server.cluster->migrating_slots_to[slot] = n;
|
||||
|
@ -435,3 +435,35 @@ start_cluster 2 0 {tags {tls:skip external:skip cluster regression} overrides {c
|
||||
R 0 MIGRATE 127.0.0.1 [lindex [R 1 CONFIG GET port] 1] $stream_name 0 5000
|
||||
}
|
||||
}
|
||||
|
||||
start_cluster 3 6 {tags {external:skip cluster} overrides {cluster-node-timeout 1000} } {
    test "Slot migration is ok when the replicas are down" {
        # Primary 0 starts with two replicas (R 3 and R 6); take both down
        # so it has no online replicas at all.
        assert_equal 2 [s 0 connected_slaves]
        catch {R 3 shutdown nosave}
        catch {R 6 shutdown nosave}
        wait_for_condition 50 100 {
            [s 0 connected_slaves] == 0
        } else {
            fail "The replicas in primary 0 are still connecting"
        }

        # Primary 1 also starts with two replicas; take only one (R 4) down
        # so it is left with a single online replica.
        assert_equal 2 [s -1 connected_slaves]
        catch {R 4 shutdown nosave}
        wait_for_condition 50 100 {
            [s -1 connected_slaves] == 1
        } else {
            fail "The replica in primary 1 is still connecting"
        }

        # Slot finalization must succeed without blocking even though
        # some (or all) replicas of the source primaries are gone.
        migrate_slot 0 1 0
        migrate_slot 0 2 1
        assert_equal {OK} [R 0 CLUSTER SETSLOT 0 NODE [R 1 CLUSTER MYID]]
        assert_equal {OK} [R 0 CLUSTER SETSLOT 1 NODE [R 2 CLUSTER MYID]]
        wait_for_slot_state 0 ""
        wait_for_slot_state 1 ""
        wait_for_slot_state 2 ""
    }
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user