From 5a2f4a1e94dac4c74717bea2e3ba2a4697c3ddb6 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Sun, 7 Jan 2024 12:24:41 +0800
Subject: [PATCH] Use shard-id of the master if the replica does not support
 shard-id (#12805)

If there are nodes in the cluster that do not support shard-id, they
will not gossip shard-id. From the perspective of nodes that support shard-id,
the shard-id they hold for such a node is meaningless (since shard-id is
randomly generated when we create a node.)

Nodes that support shard-id will save the shard-id information in nodes.conf.
If the node is restarted according to nodes.conf, the server will report a
corrupted cluster config file error, because auxShardIdSetter will reject
configurations with inconsistent master-replica shard-ids.

A cluster-wide consensus for the node's shard_id is not necessary. The key
is maintaining consistency of the shard_id on each individual 7.2 node.
As the cluster progressively upgrades to version 7.2, we can expect the
shard_ids across all nodes to naturally converge and align.

In this PR, when processing the gossip, if sender is a replica and does not
support shard-id, set the shard_id to the shard_id of its master.

(cherry picked from commit 4cae66f5e803c527c4e6141c06b94670162eca2c)
---
 src/cluster.c | 20 +++++++++++++++++++-
 src/cluster.h |  1 +
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/cluster.c b/src/cluster.c
index a752ad1bf..e4b2d0f53 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -2675,11 +2675,24 @@ void clusterProcessPingExtensions(clusterMsg *hdr, clusterLink *link) {
         /* We know this will be valid since we validated it ahead of time */
         ext = getNextPingExt(ext);
     }
+
     /* If the node did not send us a hostname extension, assume
      * they don't have an announced hostname. Otherwise, we'll
      * set it now. */
     updateAnnouncedHostname(sender, ext_hostname);
     updateAnnouncedHumanNodename(sender, ext_humannodename);
+
+    /* If the node did not send us a shard-id extension, it means the sender
+     * does not support it (old version), node->shard_id is randomly generated.
+     * A cluster-wide consensus for the node's shard_id is not necessary.
+     * The key is maintaining consistency of the shard_id on each individual 7.2 node.
+     * As the cluster progressively upgrades to version 7.2, we can expect the shard_ids
+     * across all nodes to naturally converge and align.
+     *
+     * If sender is a replica, set the shard_id to the shard_id of its master.
+     * Otherwise, we'll set it now. */
+    if (ext_shardid == NULL) ext_shardid = clusterNodeGetMaster(sender)->shard_id;
+
     updateShardId(sender, ext_shardid);
 }
 
@@ -5710,7 +5723,7 @@ void addShardReplyForClusterShards(client *c, list *nodes) {
     addReplyBulkCString(c, "slots");
 
     /* Use slot_info_pairs from the primary only */
-    while (n->slaveof != NULL) n = n->slaveof;
+    n = clusterNodeGetMaster(n);
 
     if (n->slot_info_pairs != NULL) {
         serverAssert((n->slot_info_pairs_count % 2) == 0);
@@ -7644,6 +7657,11 @@ unsigned int countKeysInSlot(unsigned int hashslot) {
     return (*server.db->slots_to_keys).by_slot[hashslot].count;
 }
 
+clusterNode *clusterNodeGetMaster(clusterNode *node) { /* Return the node at the top of node's slaveof chain. */
+    while (node->slaveof != NULL) node = node->slaveof; /* Climb until a node with no slaveof is reached. */
+    return node; /* This is the input node itself when its slaveof is NULL. */
+}
+
 /* -----------------------------------------------------------------------------
  * Operation(s) on channel rax tree.
  * -------------------------------------------------------------------------- */
diff --git a/src/cluster.h b/src/cluster.h
index 21c9c4499..db05ebcab 100644
--- a/src/cluster.h
+++ b/src/cluster.h
@@ -413,6 +413,7 @@ void clusterInit(void);
 void clusterInitListeners(void);
 void clusterCron(void);
 void clusterBeforeSleep(void);
+clusterNode *clusterNodeGetMaster(clusterNode *node);
 clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask);
 int verifyClusterNodeId(const char *name, int length);
 clusterNode *clusterLookupNode(const char *name, int length);