From 300c6c17aa9cabd98a286b6e0038ce722f9b393a Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 15 Mar 2013 16:39:49 +0100 Subject: [PATCH] Cluster: slaves start failover with a small delay. Redis Cluster can cope with a minority of nodes not being informed about the failure of a master in time for some reason (netsplit or node not functioning properly, blocked, ...). However, waiting a few seconds before starting the failover will make most "normal" failovers simpler, as the FAIL message will propagate before the slave election happens. --- src/cluster.c | 8 +++++++- src/redis.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index 7424f2d9d..68628f066 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1565,10 +1565,16 @@ void clusterCron(void) { } /* If we are a slave and our master is down, but is serving slots, - * call the function that handles the failover. */ + * call the function that handles the failover. + * This function is called with a small delay in order to let the + * FAIL message to propagate after failure detection, this is not + * strictly required but makes 99.99% of failovers mechanically + * simpler. */ if (server.cluster->myself->flags & REDIS_NODE_SLAVE && server.cluster->myself->slaveof && server.cluster->myself->slaveof->flags & REDIS_NODE_FAIL && + (server.unixtime - server.cluster->myself->slaveof->fail_time) > + REDIS_CLUSTER_FAILOVER_DELAY && server.cluster->myself->slaveof->numslots != 0) { clusterHandleSlaveFailover(); diff --git a/src/redis.h b/src/redis.h index 2c662d6d6..0eb746759 100644 --- a/src/redis.h +++ b/src/redis.h @@ -521,6 +521,7 @@ typedef struct redisOpArray { #define REDIS_CLUSTER_FAIL 1 /* The cluster can't work */ #define REDIS_CLUSTER_NAMELEN 40 /* sha1 hex length */ #define REDIS_CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */ +#define REDIS_CLUSTER_FAILOVER_DELAY 5 /* Seconds */ struct clusterNode;