From bcd5c4746baf0dd64a8d45f270856b9e41aa3949 Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 13 Sep 2024 00:02:39 -0700 Subject: [PATCH] Fix replica unable trigger migration when it received CLUSTER SETSLOT in advance (#981) Fix timing issue in evaluating `cluster-allow-replica-migration` for replicas There is a timing bug where the primary and replica have different `cluster-allow-replica-migration` settings. In issue #970, we found that if the replica receives `CLUSTER SETSLOT` before the gossip update, it remains in the original shard. This happens because we only process the `cluster-allow-replica-migration` flag for primaries during `CLUSTER SETSLOT`. This commit fixes the issue by also evaluating this flag for replicas in the `CLUSTER SETSLOT` path, ensuring correct replica migration behavior. Closes #970 --------- Signed-off-by: Binbin Co-authored-by: Ping Xie Signed-off-by: Ping Xie --- src/cluster_legacy.c | 18 +++++++++++++---- src/networking.c | 7 ++++++- src/replication.c | 14 ++++++------- src/server.h | 5 ++++- tests/unit/cluster/replica-migration.tcl | 25 ++++++++++++++++++++++-- tests/unit/cluster/slot-migration.tcl | 2 ++ 6 files changed, 55 insertions(+), 16 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 65d808197..469530adc 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -6224,6 +6224,9 @@ int clusterParseSetSlotCommand(client *c, int *slot_out, clusterNode **node_out, return 0; } + /* If 'myself' is a replica, 'c' must be the primary client. */ + serverAssert(!nodeIsReplica(myself) || c == server.primary); + if ((slot = getSlotOrReply(c, c->argv[2])) == -1) return 0; if (!strcasecmp(c->argv[3]->ptr, "migrating") && c->argc >= 5) { @@ -6412,20 +6415,27 @@ void clusterCommandSetSlot(client *c) { server.cluster->migrating_slots_to[slot] = NULL; } - int slot_was_mine = server.cluster->slots[slot] == myself; + clusterNode *my_primary = clusterNodeGetPrimary(myself); + int slot_was_mine = server.cluster->slots[slot] == my_primary; clusterDelSlot(slot); clusterAddSlot(n, slot); - /* If we are a primary left without slots, we should turn into a - * replica of the new primary. */ - if (slot_was_mine && n != myself && myself->numslots == 0 && server.cluster_allow_replica_migration) { + /* If replica migration is allowed, check if the primary of this shard + * loses its last slot and the shard becomes empty. In this case, we + * should turn into a replica of the new primary. */ + if (server.cluster_allow_replica_migration && slot_was_mine && my_primary->numslots == 0) { + serverAssert(n != my_primary); serverLog(LL_NOTICE, "Lost my last slot during slot migration. Reconfiguring myself " "as a replica of %.40s (%s) in shard %.40s", n->name, n->human_nodename, n->shard_id); + /* `c` is the primary client if `myself` is a replica, prevent it + * from being freed by clusterSetPrimary. */ + if (nodeIsReplica(myself)) protectClient(c); /* We are migrating to a different shard that has a completely different * replication history, so a full sync is required. */ clusterSetPrimary(n, 1, 1); + if (nodeIsReplica(myself)) unprotectClient(c); clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG); } diff --git a/src/networking.c b/src/networking.c index f9e725e16..44a94087c 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1671,7 +1671,7 @@ void freeClient(client *c) { * some unexpected state, by checking its flags. */ if (server.primary && c->flag.primary) { serverLog(LL_NOTICE, "Connection with primary lost."); - if (!(c->flag.protocol_error || c->flag.blocked)) { + if (!c->flag.dont_cache_primary && !(c->flag.protocol_error || c->flag.blocked)) { c->flag.close_asap = 0; c->flag.close_after_reply = 0; replicationCachePrimary(c); @@ -2553,6 +2553,7 @@ void freeSharedQueryBuf(void) { * * * DEBUG RELOAD and similar. * * When a Lua script is in -BUSY state. + * * A cluster replica executing CLUSTER SETSLOT during slot migration. * * So the function will protect the client by doing two things: * @@ -3485,6 +3486,10 @@ void clientCommand(client *c) { const char *help[] = { "CACHING (YES|NO)", " Enable/disable tracking of the keys for next command in OPTIN/OPTOUT modes.", +"CAPA