From 1190f25ca7644e5cdea0b5c8c88e07daaccc36aa Mon Sep 17 00:00:00 2001
From: Sankar <1890648+srgsanky@users.noreply.github.com>
Date: Wed, 5 Jul 2023 17:46:23 -0700
Subject: [PATCH] Process loss of slot ownership in cluster bus (#12344)

When a node no longer owns a slot, it clears the bit corresponding to
the slot in the cluster bus messages. The receiving nodes currently
don't record the fact that the sender stopped claiming a slot until
some other node in the cluster starts claiming the slot.

This can cause a slot to go missing during slot migration when the
migration is subjected to an inopportune race with the addition of new
shards or with a failover. This fix forces the receiving nodes to
process the loss of ownership, to avoid spreading incorrect
information.
---
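
A note on the bitmap plumbing: the fix tracks ownership uncertainty as one
bit per slot in the new owner_not_claiming_slot bitmap, using the helpers
declared at the top of cluster.c. As a quick orientation aid (not part of
this diff), the byte/bit indexing these helpers are assumed to use looks
like the following minimal sketch, consistent with the bitmapTestBit()
declaration above:

/* Slot j lives in byte j/8, bit j&7. With CLUSTER_SLOTS == 16384 slots,
 * the bitmap occupies 16384/8 == 2048 bytes. */
int bitmapTestBit(unsigned char *bitmap, int pos) {
    return (bitmap[pos/8] & (1<<(pos&7))) != 0;
}

void bitmapSetBit(unsigned char *bitmap, int pos) {
    bitmap[pos/8] |= 1<<(pos&7);
}

void bitmapClearBit(unsigned char *bitmap, int pos) {
    bitmap[pos/8] &= ~(1<<(pos&7));
}
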
 src/cluster.c                            | 35 ++++++++++++++++++----
 src/cluster.h                            |  7 +++++
 tests/cluster/tests/15-cluster-slots.tcl | 37 ++++++++++++++++++++++++
 3 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/src/cluster.c b/src/cluster.c
index c4bc690bf..a390585f3 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -67,6 +67,8 @@ void clusterSetMaster(clusterNode *n);
 void clusterHandleSlaveFailover(void);
 void clusterHandleSlaveMigration(int max_slaves);
 int bitmapTestBit(unsigned char *bitmap, int pos);
+void bitmapSetBit(unsigned char *bitmap, int pos);
+void bitmapClearBit(unsigned char *bitmap, int pos);
 void clusterDoBeforeSleep(int flags);
 void clusterSendUpdate(clusterLink *link, clusterNode *node);
 void resetManualFailover(void);
@@ -124,6 +126,10 @@ static inline int defaultClientPort(void) {
 #define dictEntryPrevInSlot(de) \
     (((clusterDictEntryMetadata *)dictEntryMetadata(de))->prev)
 
+#define isSlotUnclaimed(slot) \
+    (server.cluster->slots[slot] == NULL || \
+        bitmapTestBit(server.cluster->owner_not_claiming_slot, slot))
+
 #define RCVBUF_INIT_LEN 1024
 #define RCVBUF_MAX_PREALLOC (1<<20) /* 1MB */
 
@@ -2352,7 +2358,10 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc
             sender_slots++;
 
             /* The slot is already bound to the sender of this message. */
-            if (server.cluster->slots[j] == sender) continue;
+            if (server.cluster->slots[j] == sender) {
+                bitmapClearBit(server.cluster->owner_not_claiming_slot, j);
+                continue;
+            }
 
             /* The slot is in importing state, it should be modified only
              * manually via redis-cli (example: a resharding is in progress
@@ -2361,10 +2370,10 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc
             if (server.cluster->importing_slots_from[j]) continue;
 
             /* We rebind the slot to the new node claiming it if:
-             * 1) The slot was unassigned or the new node claims it with a
-             *    greater configEpoch.
+             * 1) The slot was unassigned, or the previous owner no longer owns
+             *    it, or the new node claims it with a greater configEpoch.
              * 2) We are not currently importing the slot. */
-            if (server.cluster->slots[j] == NULL ||
+            if (isSlotUnclaimed(j) ||
                 server.cluster->slots[j]->configEpoch < senderConfigEpoch)
             {
                 /* Was this slot mine, and still contains keys? Mark it as
@@ -2383,10 +2392,20 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc
                 }
                 clusterDelSlot(j);
                 clusterAddSlot(sender,j);
+                bitmapClearBit(server.cluster->owner_not_claiming_slot, j);
                 clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                                      CLUSTER_TODO_UPDATE_STATE|
                                      CLUSTER_TODO_FSYNC_CONFIG);
             }
+        } else if (server.cluster->slots[j] == sender) {
+            /* The slot is currently bound to the sender, but the sender is no
+             * longer claiming it. We don't want to unbind the slot yet, as that
+             * can move the cluster to the FAIL state and also throw client
+             * errors. Keeping the slot bound to the previous owner causes a few
+             * client-side redirects, but won't throw any errors. We keep track
+             * of the uncertainty in ownership to avoid propagating
+             * misinformation about this slot's ownership in UPDATE messages. */
+            bitmapSetBit(server.cluster->owner_not_claiming_slot, j);
         }
     }
 
@@ -3043,7 +3062,7 @@ int clusterProcessPacket(clusterLink *link) {
             for (j = 0; j < CLUSTER_SLOTS; j++) {
                 if (bitmapTestBit(hdr->myslots,j)) {
                     if (server.cluster->slots[j] == sender ||
-                        server.cluster->slots[j] == NULL) continue;
+                        isSlotUnclaimed(j)) continue;
                     if (server.cluster->slots[j]->configEpoch >
                         senderConfigEpoch)
                     {
@@ -3746,6 +3765,10 @@ void clusterSendUpdate(clusterLink *link, clusterNode *node) {
     memcpy(hdr->data.update.nodecfg.nodename,node->name,CLUSTER_NAMELEN);
     hdr->data.update.nodecfg.configEpoch = htonu64(node->configEpoch);
     memcpy(hdr->data.update.nodecfg.slots,node->slots,sizeof(node->slots));
+    for (unsigned int i = 0; i < sizeof(node->slots); i++) {
+        /* Don't advertise slots that the node stopped claiming. */
+        hdr->data.update.nodecfg.slots[i] = hdr->data.update.nodecfg.slots[i] & (~server.cluster->owner_not_claiming_slot[i]);
+    }
     clusterSendMessage(link,msgblock);
 
     clusterMsgSendBlockDecrRefCount(msgblock);
@@ -3951,7 +3974,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
      * slots in the current configuration. */
    for (j = 0; j < CLUSTER_SLOTS; j++) {
         if (bitmapTestBit(claimed_slots, j) == 0) continue;
-        if (server.cluster->slots[j] == NULL ||
+        if (isSlotUnclaimed(j) ||
             server.cluster->slots[j]->configEpoch <= requestConfigEpoch)
         {
             continue;
diff --git a/src/cluster.h b/src/cluster.h
index 0aecec9b3..21c9c4499 100644
--- a/src/cluster.h
+++ b/src/cluster.h
@@ -213,6 +213,13 @@ typedef struct clusterState {
     long long stats_pfail_nodes;    /* Number of nodes in PFAIL status,
                                        excluding nodes without address. */
     unsigned long long stat_cluster_links_buffer_limit_exceeded; /* Total number of cluster links freed due to exceeding buffer limit */
+
+    /* Bit map of slots that are no longer claimed by their owner in cluster
+     * PING messages. During slot migration, the owner stops claiming a slot
+     * after the ownership transfer; set the bit corresponding to a slot when
+     * a node stops claiming it. This prevents spreading incorrect information
+     * (that the source still owns the slot) in UPDATE messages. */
+    unsigned char owner_not_claiming_slot[CLUSTER_SLOTS / 8];
 } clusterState;
 
 /* Redis cluster messages header */
diff --git a/tests/cluster/tests/15-cluster-slots.tcl b/tests/cluster/tests/15-cluster-slots.tcl
index 93b64b408..892e9049b 100644
--- a/tests/cluster/tests/15-cluster-slots.tcl
+++ b/tests/cluster/tests/15-cluster-slots.tcl
@@ -77,6 +77,43 @@ test "slot migration is invalid from primary to replica" {
     assert_match "*Target node is not a master" $err
 }
 
+proc count_bound_slots {n} {
+    set slot_count 0
+    foreach slot_range_mapping [$n cluster slots] {
+        set start_slot [lindex $slot_range_mapping 0]
+        set end_slot [lindex $slot_range_mapping 1]
+        incr slot_count [expr $end_slot - $start_slot + 1]
+    }
+    return $slot_count
+}
+
+test "slot must be unbound on the owner when it is deleted" {
+    set node0 [Rn 0]
+    set node1 [Rn 1]
+    assert {[count_bound_slots $node0] eq 16384}
+    assert {[count_bound_slots $node1] eq 16384}
+
+    set slot_to_delete 0
+    # Delete
+    $node0 CLUSTER DELSLOTS $slot_to_delete
+
+    # Verify
+    # The node that owns the slot must unbind the slot that was deleted
+    wait_for_condition 1000 50 {
+        [count_bound_slots $node0] == 16383
+    } else {
+        fail "Cluster slot deletion was not recorded on the node that owns the slot"
+    }
+
+    # We don't propagate slot deletion across all nodes in the cluster.
+    # This can lead to an extra redirect before the clients find out that the slot is unbound.
+    wait_for_condition 1000 50 {
+        [count_bound_slots $node1] == 16384
+    } else {
+        fail "Cluster slot deletion should not be propagated to all nodes in the cluster"
+    }
+}
+
 if {$::tls} {
     test {CLUSTER SLOTS from non-TLS client in TLS cluster} {
         set slots_tls [R 0 cluster slots]
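
Postscript on the clusterSendUpdate() change: masking the advertised slot
bitmap is a plain per-byte AND-NOT against owner_not_claiming_slot. The
following self-contained sketch (standalone names, not Redis code)
demonstrates that semantics: a slot the node stopped claiming is dropped
from the advertisement, while a slot it still claims survives.

#include <stdio.h>

#define CLUSTER_SLOTS 16384
#define SLOT_BYTES (CLUSTER_SLOTS/8) /* 2048 bytes, one bit per slot */

/* Advertise only the slots still claimed: clear every bit of 'owned'
 * that is also set in 'not_claiming'. */
static void mask_unclaimed(unsigned char *advertised,
                           const unsigned char *owned,
                           const unsigned char *not_claiming) {
    for (int i = 0; i < SLOT_BYTES; i++)
        advertised[i] = owned[i] & ~not_claiming[i];
}

int main(void) {
    unsigned char owned[SLOT_BYTES] = {0};
    unsigned char not_claiming[SLOT_BYTES] = {0};
    unsigned char advertised[SLOT_BYTES];

    owned[5/8] |= 1<<(5&7);        /* the node owns slot 5 ...        */
    not_claiming[5/8] |= 1<<(5&7); /* ... but stopped claiming it     */
    owned[8/8] |= 1<<(8&7);        /* it owns and still claims slot 8 */

    mask_unclaimed(advertised, owned, not_claiming);
    printf("slot 5 advertised: %d\n", (advertised[5/8] >> (5&7)) & 1); /* prints 0 */
    printf("slot 8 advertised: %d\n", (advertised[8/8] >> (8&7)) & 1); /* prints 1 */
    return 0;
}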