implement FAILOVER command (#8315)
Implement FAILOVER command, which coordinates failover between the server and one of its replicas.
parent 26301897d0
commit 0d18a1e85f
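
A minimal usage sketch of the new command against a primary, assuming a replica reachable at 10.0.0.2:6380 (host, port and timeout are placeholder values, not taken from this change):

    127.0.0.1:6379> FAILOVER TO 10.0.0.2 6380 TIMEOUT 5000
    OK
    127.0.0.1:6379> FAILOVER ABORT
    OK

While a failover is pending, the new master_failover_state field in INFO replication reports its progress; FAILOVER ABORT is the only way to cancel it once started.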
@@ -2680,7 +2680,7 @@ NULL
                    c->argc == 4))
         {
             /* CLIENT PAUSE TIMEOUT [WRITE|ALL] */
-            long long duration;
+            mstime_t end;
             int type = CLIENT_PAUSE_ALL;
             if (c->argc == 4) {
                 if (!strcasecmp(c->argv[3]->ptr,"write")) {
@@ -2694,9 +2694,9 @@ NULL
                 }
             }

-            if (getTimeoutFromObjectOrReply(c,c->argv[2],&duration,
+            if (getTimeoutFromObjectOrReply(c,c->argv[2],&end,
                 UNIT_MILLISECONDS) != C_OK) return;
-            pauseClients(duration, type);
+            pauseClients(end, type);
             addReply(c,shared.ok);
         } else if (!strcasecmp(c->argv[1]->ptr,"tracking") && c->argc >= 3) {
             /* CLIENT TRACKING (on|off) [REDIRECT <id>] [BCAST] [PREFIX first]
@@ -721,6 +721,36 @@ void syncCommand(client *c) {
     /* ignore SYNC if already slave or in monitor mode */
     if (c->flags & CLIENT_SLAVE) return;

+    /* Check if this is a failover request to a replica with the same replid and
+     * become a master if so. */
+    if (c->argc > 3 && !strcasecmp(c->argv[0]->ptr,"psync") &&
+        !strcasecmp(c->argv[3]->ptr,"failover"))
+    {
+        serverLog(LL_WARNING, "Failover request received for replid %s.",
+            (unsigned char *)c->argv[1]->ptr);
+        if (!server.masterhost) {
+            addReplyError(c, "PSYNC FAILOVER can't be sent to a master.");
+            return;
+        }
+
+        if (!strcasecmp(c->argv[1]->ptr,server.replid)) {
+            replicationUnsetMaster();
+            sds client = catClientInfoString(sdsempty(),c);
+            serverLog(LL_NOTICE,
+                "MASTER MODE enabled (failover request from '%s')",client);
+            sdsfree(client);
+        } else {
+            addReplyError(c, "PSYNC FAILOVER replid must match my replid.");
+            return;
+        }
+    }
+
+    /* Don't let replicas sync with us while we're failing over */
+    if (server.failover_state != NO_FAILOVER) {
+        addReplyError(c,"-NOMASTERLINK Can't SYNC while failing over");
+        return;
+    }
+
     /* Refuse SYNC requests if we are a slave but the link with our master
      * is not ok... */
     if (server.masterhost && server.repl_state != REPL_STATE_CONNECTED) {
@@ -2031,8 +2061,15 @@ int slaveTryPartialResynchronization(connection *conn, int read_reply) {
         memcpy(psync_offset,"-1",3);
     }

-    /* Issue the PSYNC command */
-    reply = sendCommand(conn,"PSYNC",psync_replid,psync_offset,NULL);
+    /* Issue the PSYNC command, if this is a master with a failover in
+     * progress then send the failover argument to the replica to cause it
+     * to become a master */
+    if (server.failover_state == FAILOVER_IN_PROGRESS) {
+        reply = sendCommand(conn,"PSYNC",psync_replid,psync_offset,"FAILOVER",NULL);
+    } else {
+        reply = sendCommand(conn,"PSYNC",psync_replid,psync_offset,NULL);
+    }
+
     if (reply != NULL) {
         serverLog(LL_WARNING,"Unable to send PSYNC to master: %s",reply);
         sdsfree(reply);
@@ -2356,6 +2393,7 @@ void syncWithMaster(connection *conn) {
     if (server.repl_state == REPL_STATE_SEND_PSYNC) {
         if (slaveTryPartialResynchronization(conn,0) == PSYNC_WRITE_ERROR) {
             err = sdsnew("Write error sending the PSYNC command.");
+            abortFailover("Write error to failover target");
             goto write_error;
         }
         server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY;
@@ -2373,6 +2411,18 @@ void syncWithMaster(connection *conn) {
     psync_result = slaveTryPartialResynchronization(conn,1);
     if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */

+    /* Check the status of the planned failover. We expect PSYNC_CONTINUE,
+     * but there is nothing technically wrong with a full resync which
+     * could happen in edge cases. */
+    if (server.failover_state == FAILOVER_IN_PROGRESS) {
+        if (psync_result == PSYNC_CONTINUE || psync_result == PSYNC_FULLRESYNC) {
+            clearFailoverState();
+        } else {
+            abortFailover("Failover target rejected psync request");
+            return;
+        }
+    }
+
     /* If the master is in an transient error, we should try to PSYNC
      * from scratch later, so go to the error path. This happens when
      * the server is loading the dataset or is not connected with its
@@ -2678,6 +2728,11 @@ void replicaofCommand(client *c) {
         return;
     }

+    if (server.failover_state != NO_FAILOVER) {
+        addReplyError(c,"REPLICAOF not allowed while failing over.");
+        return;
+    }
+
     /* The special host/port combination "NO" "ONE" turns the instance
      * into a master. Otherwise the new master address is set. */
     if (!strcasecmp(c->argv[1]->ptr,"no") &&
@@ -3211,6 +3266,10 @@ long long replicationGetSlaveOffset(void) {
 void replicationCron(void) {
     static long long replication_cron_loops = 0;

+    /* Check failover status first, to see if we need to start
+     * handling the failover. */
+    updateFailoverStatus();
+
     /* Non blocking connection timeout? */
     if (server.masterhost &&
         (server.repl_state == REPL_STATE_CONNECTING ||
@@ -3268,8 +3327,9 @@ void replicationCron(void) {
      * alter the replication offsets of master and slave, and will no longer
      * match the one stored into 'mf_master_offset' state. */
     int manual_failover_in_progress =
-        server.cluster_enabled &&
-        server.cluster->mf_end &&
+        ((server.cluster_enabled &&
+          server.cluster->mf_end) ||
+        server.failover_end_time) &&
         checkClientPauseTimeoutAndReturnIfPaused();

     if (!manual_failover_in_progress) {
@@ -3423,3 +3483,271 @@ void replicationStartPendingFork(void) {
         }
     }
 }
+
+/* Find replica at IP:PORT from replica list */
+static client *findReplica(char *host, int port) {
+    listIter li;
+    listNode *ln;
+    client *replica;
+
+    listRewind(server.slaves,&li);
+    while((ln = listNext(&li))) {
+        replica = ln->value;
+        char ip[NET_IP_STR_LEN], *replicaip = replica->slave_ip;
+
+        if (replicaip[0] == '\0') {
+            if (connPeerToString(replica->conn, ip, sizeof(ip), NULL) == -1)
+                continue;
+            replicaip = ip;
+        }
+
+        if (!strcasecmp(host, replicaip) &&
+                (port == replica->slave_listening_port))
+            return replica;
+    }
+
+    return NULL;
+}
+
+const char *getFailoverStateString() {
+    switch(server.failover_state) {
+        case NO_FAILOVER: return "no-failover";
+        case FAILOVER_IN_PROGRESS: return "failover-in-progress";
+        case FAILOVER_WAIT_FOR_SYNC: return "waiting-for-sync";
+        default: return "unknown";
+    }
+}
+
+/* Resets the internal failover configuration, this needs
+ * to be called after a failover either succeeds or fails
+ * as it includes the client unpause. */
+void clearFailoverState() {
+    server.failover_end_time = 0;
+    server.force_failover = 0;
+    zfree(server.target_replica_host);
+    server.target_replica_host = NULL;
+    server.target_replica_port = 0;
+    server.failover_state = NO_FAILOVER;
+    unpauseClients();
+}
+
+/* Abort an ongoing failover if one is going on. */
+void abortFailover(const char *err) {
+    if (server.failover_state == NO_FAILOVER) return;
+
+    if (server.target_replica_host) {
+        serverLog(LL_NOTICE,"FAILOVER to %s:%d aborted: %s",
+            server.target_replica_host,server.target_replica_port,err);
+    } else {
+        serverLog(LL_NOTICE,"FAILOVER to any replica aborted: %s",err);
+    }
+    if (server.failover_state == FAILOVER_IN_PROGRESS) {
+        replicationUnsetMaster();
+    }
+    clearFailoverState();
+}
+
+/*
+ * FAILOVER [TO <HOST> <IP> [FORCE]] [ABORT] [TIMEOUT <timeout>]
+ *
+ * This command will coordinate a failover between the master and one
+ * of its replicas. The happy path contains the following steps:
+ * 1) The master will initiate a client pause write, to stop replication
+ *    traffic.
+ * 2) The master will periodically check if any of its replicas has
+ *    consumed the entire replication stream through acks.
+ * 3) Once any replica has caught up, the master will itself become a replica.
+ * 4) The master will send a PSYNC FAILOVER request to the target replica, which
+ *    if accepted will cause the replica to become the new master and start a sync.
+ *
+ * FAILOVER ABORT is the only way to abort a failover command, as replicaof
+ * will be disabled. This may be needed if the failover is unable to progress.
+ *
+ * The optional arguments [TO <HOST> <IP>] allows designating a specific replica
+ * to be failed over to.
+ *
+ * FORCE flag indicates that even if the target replica is not caught up,
+ * failover to it anyway. This must be specified with a timeout and a target
+ * HOST and IP.
+ *
+ * TIMEOUT <timeout> indicates how long should the primary wait for
+ * a replica to sync up before aborting. If not specified, the failover
+ * will attempt forever and must be manually aborted.
+ */
+void failoverCommand(client *c) {
+    if (server.cluster_enabled) {
+        addReplyError(c,"FAILOVER not allowed in cluster mode. "
+                        "Use CLUSTER FAILOVER command instead.");
+        return;
+    }
+
+    /* Handle special case for abort */
+    if ((c->argc == 2) && !strcasecmp(c->argv[1]->ptr,"abort")) {
+        if (server.failover_state == NO_FAILOVER) {
+            addReplyError(c, "No failover in progress.");
+            return;
+        }
+
+        abortFailover("Failover manually aborted");
+        addReply(c,shared.ok);
+        return;
+    }
+
+    long timeout_in_ms = 0;
+    int force_flag = 0;
+    long port = 0;
+    char *host = NULL;
+
+    /* Parse the command for syntax and arguments. */
+    for (int j = 1; j < c->argc; j++) {
+        if (!strcasecmp(c->argv[j]->ptr,"timeout") && (j + 1 < c->argc) &&
+            timeout_in_ms == 0)
+        {
+            if (getLongFromObjectOrReply(c,c->argv[j + 1],
+                        &timeout_in_ms,NULL) != C_OK) return;
+            if (timeout_in_ms <= 0) {
+                addReplyError(c,"FAILOVER timeout must be greater than 0");
+                return;
+            }
+            j++;
+        } else if (!strcasecmp(c->argv[j]->ptr,"to") && (j + 2 < c->argc) &&
+            !host)
+        {
+            if (getLongFromObjectOrReply(c,c->argv[j + 2],&port,NULL) != C_OK)
+                return;
+            host = c->argv[j + 1]->ptr;
+            j += 2;
+        } else if (!strcasecmp(c->argv[j]->ptr,"force") && !force_flag) {
+            force_flag = 1;
+        } else {
+            addReplyErrorObject(c,shared.syntaxerr);
+            return;
+        }
+    }
+
+    if (server.failover_state != NO_FAILOVER) {
+        addReplyError(c,"FAILOVER already in progress.");
+        return;
+    }
+
+    if (server.masterhost) {
+        addReplyError(c,"FAILOVER is not valid when server is a replica.");
+        return;
+    }
+
+    if (listLength(server.slaves) == 0) {
+        addReplyError(c,"FAILOVER requires connected replicas.");
+        return;
+    }
+
+    if (force_flag && (!timeout_in_ms || !host)) {
+        addReplyError(c,"FAILOVER with force option requires both a timeout "
+            "and target HOST and IP.");
+        return;
+    }
+
+    /* If a replica address was provided, validate that it is connected. */
+    if (host) {
+        client *replica = findReplica(host, port);
+
+        if (replica == NULL) {
+            addReplyError(c,"FAILOVER target HOST and IP is not "
+                            "a replica.");
+            return;
+        }
+
+        /* Check if requested replica is online */
+        if (replica->replstate != SLAVE_STATE_ONLINE) {
+            addReplyError(c,"FAILOVER target replica is not online.");
+            return;
+        }
+
+        server.target_replica_host = zstrdup(host);
+        server.target_replica_port = port;
+        serverLog(LL_NOTICE,"FAILOVER requested to %s:%ld.",host,port);
+    } else {
+        serverLog(LL_NOTICE,"FAILOVER requested to any replica.");
+    }
+
+    mstime_t now = mstime();
+    if (timeout_in_ms) {
+        server.failover_end_time = now + timeout_in_ms;
+    }
+
+    server.force_failover = force_flag;
+    server.failover_state = FAILOVER_WAIT_FOR_SYNC;
+    /* Cluster failover will unpause eventually */
+    pauseClients(LLONG_MAX,CLIENT_PAUSE_WRITE);
+    addReply(c,shared.ok);
+}
+
+/* Failover cron function, checks coordinated failover state.
+ *
+ * Implementation note: The current implementation calls replicationSetMaster()
+ * to start the failover request, this has some unintended side effects if the
+ * failover doesn't work like blocked clients will be unblocked and replicas will
+ * be disconnected. This could be optimized further.
+ */
+void updateFailoverStatus(void) {
+    if (server.failover_state != FAILOVER_WAIT_FOR_SYNC) return;
+    mstime_t now = server.mstime;
+
+    /* Check if failover operation has timed out */
+    if (server.failover_end_time && server.failover_end_time <= now) {
+        if (server.force_failover) {
+            serverLog(LL_NOTICE,
+                "FAILOVER to %s:%d time out exceeded, failing over.",
+                server.target_replica_host, server.target_replica_port);
+            server.failover_state = FAILOVER_IN_PROGRESS;
+            /* If timeout has expired force a failover if requested. */
+            replicationSetMaster(server.target_replica_host,
+                server.target_replica_port);
+            return;
+        } else {
+            /* Force was not requested, so timeout. */
+            abortFailover("Replica never caught up before timeout");
+            return;
+        }
+    }
+
+    /* Check to see if the replica has caught up so failover can start */
+    client *replica = NULL;
+    if (server.target_replica_host) {
+        replica = findReplica(server.target_replica_host,
+            server.target_replica_port);
+    } else {
+        listIter li;
+        listNode *ln;
+
+        listRewind(server.slaves,&li);
+        /* Find any replica that has matched our repl_offset */
+        while((ln = listNext(&li))) {
+            replica = ln->value;
+            if (replica->repl_ack_off == server.master_repl_offset) {
+                char ip[NET_IP_STR_LEN], *replicaip = replica->slave_ip;
+
+                if (replicaip[0] == '\0') {
+                    if (connPeerToString(replica->conn,ip,sizeof(ip),NULL) == -1)
+                        continue;
+                    replicaip = ip;
+                }
+
+                /* We are now failing over to this specific node */
+                server.target_replica_host = zstrdup(replicaip);
+                server.target_replica_port = replica->slave_listening_port;
+                break;
+            }
+        }
+    }
+
+    /* We've found a replica that is caught up */
+    if (replica && (replica->repl_ack_off == server.master_repl_offset)) {
+        server.failover_state = FAILOVER_IN_PROGRESS;
+        serverLog(LL_NOTICE,
+            "Failover target %s:%d is synced, failing over.",
+            server.target_replica_host, server.target_replica_port);
+        /* Designated replica is caught up, failover to it. */
+        replicationSetMaster(server.target_replica_host,
+            server.target_replica_port);
+    }
+}
src/server.c (31 changed lines)
@@ -752,7 +752,7 @@ struct redisCommand redisCommandTable[] = {
     "admin no-script",
     0,NULL,0,0,0,0,0,0},

-    {"psync",syncCommand,3,
+    {"psync",syncCommand,-3,
     "admin no-script",
     0,NULL,0,0,0,0,0,0},

@@ -1092,6 +1092,10 @@ struct redisCommand redisCommandTable[] = {

    {"reset",resetCommand,1,
     "no-script ok-stale ok-loading fast @connection",
     0,NULL,0,0,0,0,0,0},

+    {"failover",failoverCommand,-1,
+     "admin no-script ok-stale",
+     0,NULL,0,0,0,0,0,0}
 };

@@ -2185,8 +2189,15 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
     checkClientPauseTimeoutAndReturnIfPaused();

     /* Replication cron function -- used to reconnect to master,
-     * detect transfer failures, start background RDB transfers and so forth. */
-    run_with_period(1000) replicationCron();
+     * detect transfer failures, start background RDB transfers and so forth.
+     *
+     * If Redis is trying to failover then run the replication cron faster so
+     * progress on the handshake happens more quickly. */
+    if (server.failover_state != NO_FAILOVER) {
+        run_with_period(100) replicationCron();
+    } else {
+        run_with_period(1000) replicationCron();
+    }

     /* Run the Redis Cluster cron. */
     run_with_period(100) {

@@ -2397,6 +2408,11 @@ void beforeSleep(struct aeEventLoop *eventLoop) {
         server.get_ack_from_slaves = 0;
     }

+    /* We may have recieved updates from clients about their current offset. NOTE:
+     * this can't be done where the ACK is recieved since failover will disconnect
+     * our clients. */
+    updateFailoverStatus();
+
     /* Send the invalidation messages to clients participating to the
      * client side caching protocol in broadcasting (BCAST) mode. */
     trackingBroadcastInvalidationMessages();

@@ -2651,6 +2667,13 @@ void initServerConfig(void) {
     server.repl_backlog_off = 0;
     server.repl_no_slaves_since = time(NULL);

+    /* Failover related */
+    server.failover_end_time = 0;
+    server.force_failover = 0;
+    server.target_replica_host = NULL;
+    server.target_replica_port = 0;
+    server.failover_state = NO_FAILOVER;
+
     /* Client output buffer limits */
     for (j = 0; j < CLIENT_TYPE_OBUF_COUNT; j++)
         server.client_obuf_limits[j] = clientBufferLimitsDefaults[j];

@@ -4991,6 +5014,7 @@ sds genRedisInfoString(const char *section) {
         }
     }
     info = sdscatprintf(info,
+        "master_failover_state:%s\r\n"
         "master_replid:%s\r\n"
         "master_replid2:%s\r\n"
         "master_repl_offset:%lld\r\n"
@@ -4999,6 +5023,7 @@ sds genRedisInfoString(const char *section) {
         "repl_backlog_size:%lld\r\n"
         "repl_backlog_first_byte_offset:%lld\r\n"
         "repl_backlog_histlen:%lld\r\n",
+        getFailoverStateString(),
         server.replid,
         server.replid2,
         server.master_repl_offset,
src/server.h (21 changed lines)
@@ -320,6 +320,14 @@ typedef enum {
     REPL_STATE_CONNECTED,       /* Connected to master */
 } repl_state;

+/* The state of an in progress coordinated failover */
+typedef enum {
+    NO_FAILOVER = 0,        /* No failover in progress */
+    FAILOVER_WAIT_FOR_SYNC, /* Waiting for target replica to catch up */
+    FAILOVER_IN_PROGRESS    /* Waiting for target replica to accept
+                             * PSYNC FAILOVER request. */
+} failover_state;
+
 /* State of slaves from the POV of the master. Used in client->replstate.
  * In SEND_BULK and ONLINE state the slave receives new updates
  * in its output queue. In the WAIT_BGSAVE states instead the server is waiting

@@ -1577,6 +1585,14 @@ struct redisServer {
     char *bgsave_cpulist; /* cpu affinity list of bgsave process. */
     /* Sentinel config */
     struct sentinelConfig *sentinel_config; /* sentinel config to load at startup time. */
+    /* Coordinate failover info */
+    mstime_t failover_end_time; /* Deadline for failover command. */
+    int force_failover; /* If true then failover will be forced at the
+                         * deadline, otherwise failover is aborted. */
+    char *target_replica_host; /* Failover target host. If null during a
+                                * failover then any replica can be used. */
+    int target_replica_port; /* Failover target port */
+    int failover_state; /* Failover state */
 };

 typedef struct pubsubPattern {

@@ -1997,6 +2013,10 @@ void feedReplicationBacklog(void *ptr, size_t len);
 void showLatestBacklog(void);
 void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, int mask);
 void rdbPipeWriteHandlerConnRemoved(struct connection *conn);
+void clearFailoverState(void);
+void updateFailoverStatus(void);
+void abortFailover(const char *err);
+const char *getFailoverStateString();

 /* Generic persistence functions */
 void startLoadingFile(FILE* fp, char* filename, int rdbflags);

@@ -2637,6 +2657,7 @@ void lolwutCommand(client *c);
 void aclCommand(client *c);
 void stralgoCommand(client *c);
 void resetCommand(client *c);
+void failoverCommand(client *c);

 #if defined(__GNUC__)
 void *calloc(size_t count, size_t size) __attribute__ ((deprecated));
tests/integration/failover.tcl (new file, 290 lines)
@@ -0,0 +1,290 @@
start_server {tags {"failover"}} {
start_server {} {
start_server {} {
    set node_0 [srv 0 client]
    set node_0_host [srv 0 host]
    set node_0_port [srv 0 port]
    set node_0_pid [srv 0 pid]

    set node_1 [srv -1 client]
    set node_1_host [srv -1 host]
    set node_1_port [srv -1 port]
    set node_1_pid [srv -1 pid]

    set node_2 [srv -2 client]
    set node_2_host [srv -2 host]
    set node_2_port [srv -2 port]
    set node_2_pid [srv -2 pid]

    proc assert_digests_match {n1 n2 n3} {
        assert_equal [$n1 debug digest] [$n2 debug digest]
        assert_equal [$n2 debug digest] [$n3 debug digest]
    }

    test {failover command fails without connected replica} {
        catch { $node_0 failover to $node_1_host $node_1_port } err
        if {! [string match "ERR*" $err]} {
            fail "failover command succeeded when replica not connected"
        }
    }

    test {setup replication for following tests} {
        $node_1 replicaof $node_0_host $node_0_port
        $node_2 replicaof $node_0_host $node_0_port
        wait_for_sync $node_1
        wait_for_sync $node_2
    }

    test {failover command fails with invalid host} {
        catch { $node_0 failover to invalidhost $node_1_port } err
        assert_match "ERR*" $err
    }

    test {failover command fails with invalid port} {
        catch { $node_0 failover to $node_1_host invalidport } err
        assert_match "ERR*" $err
    }

    test {failover command fails with just force and timeout} {
        catch { $node_0 FAILOVER FORCE TIMEOUT 100} err
        assert_match "ERR*" $err
    }

    test {failover command fails when sent to a replica} {
        catch { $node_1 failover to $node_1_host $node_1_port } err
        assert_match "ERR*" $err
    }

    test {failover command fails with force without timeout} {
        catch { $node_0 failover to $node_1_host $node_1_port FORCE } err
        assert_match "ERR*" $err
    }

    test {failover command to specific replica works} {
        set initial_psyncs [s -1 sync_partial_ok]
        set initial_syncs [s -1 sync_full]

        # Generate a delta between primary and replica
        set load_handler [start_write_load $node_0_host $node_0_port 5]
        exec kill -SIGSTOP [srv -1 pid]
        wait_for_condition 50 100 {
            [s 0 total_commands_processed] > 100
        } else {
            fail "Node 0 did not accept writes"
        }
        exec kill -SIGCONT [srv -1 pid]

        # Execute the failover
        $node_0 failover to $node_1_host $node_1_port

        # Wait for failover to end
        wait_for_condition 50 100 {
            [s 0 master_failover_state] == "no-failover"
        } else {
            fail "Failover from node 0 to node 1 did not finish"
        }
        stop_write_load $load_handler
        $node_2 replicaof $node_1_host $node_1_port
        wait_for_sync $node_0
        wait_for_sync $node_2

        assert_match *slave* [$node_0 role]
        assert_match *master* [$node_1 role]
        assert_match *slave* [$node_2 role]

        # We should accept psyncs from both nodes
        assert_equal [expr [s -1 sync_partial_ok] - $initial_psyncs] 2
        assert_equal [expr [s -1 sync_full] - $initial_psyncs] 0
        assert_digests_match $node_0 $node_1 $node_2
    }

    test {failover command to any replica works} {
        set initial_psyncs [s -2 sync_partial_ok]
        set initial_syncs [s -2 sync_full]

        wait_for_ofs_sync $node_1 $node_2
        # We stop node 0 to and make sure node 2 is selected
        exec kill -SIGSTOP $node_0_pid
        $node_1 set CASE 1
        $node_1 FAILOVER

        # Wait for failover to end
        wait_for_condition 50 100 {
            [s -1 master_failover_state] == "no-failover"
        } else {
            fail "Failover from node 1 to node 2 did not finish"
        }
        exec kill -SIGCONT $node_0_pid
        $node_0 replicaof $node_2_host $node_2_port

        wait_for_sync $node_0
        wait_for_sync $node_1

        assert_match *slave* [$node_0 role]
        assert_match *slave* [$node_1 role]
        assert_match *master* [$node_2 role]

        # We should accept Psyncs from both nodes
        assert_equal [expr [s -2 sync_partial_ok] - $initial_psyncs] 2
        assert_equal [expr [s -1 sync_full] - $initial_psyncs] 0
        assert_digests_match $node_0 $node_1 $node_2
    }

    test {failover to a replica with force works} {
        set initial_psyncs [s 0 sync_partial_ok]
        set initial_syncs [s 0 sync_full]

        exec kill -SIGSTOP $node_0_pid
        # node 0 will never acknowledge this write
        $node_2 set case 2
        $node_2 failover to $node_0_host $node_0_port TIMEOUT 100 FORCE

        # Wait for node 0 to give up on sync attempt and start failover
        wait_for_condition 50 100 {
            [s -2 master_failover_state] == "failover-in-progress"
        } else {
            fail "Failover from node 2 to node 0 did not timeout"
        }

        # Quick check that everyone is a replica, we never want a
        # state where there are two masters.
        assert_match *slave* [$node_1 role]
        assert_match *slave* [$node_2 role]

        exec kill -SIGCONT $node_0_pid

        # Wait for failover to end
        wait_for_condition 50 100 {
            [s -2 master_failover_state] == "no-failover"
        } else {
            fail "Failover from node 2 to node 0 did not finish"
        }
        $node_1 replicaof $node_0_host $node_0_port

        wait_for_sync $node_1
        wait_for_sync $node_2

        assert_match *master* [$node_0 role]
        assert_match *slave* [$node_1 role]
        assert_match *slave* [$node_2 role]

        assert_equal [count_log_message -2 "time out exceeded, failing over."] 1

        # We should accept both psyncs, although this is the condition we might not
        # since we didn't catch up.
        assert_equal [expr [s 0 sync_partial_ok] - $initial_psyncs] 2
        assert_equal [expr [s 0 sync_full] - $initial_syncs] 0
        assert_digests_match $node_0 $node_1 $node_2
    }

    test {failover with timeout aborts if replica never catches up} {
        set initial_psyncs [s 0 sync_partial_ok]
        set initial_syncs [s 0 sync_full]

        # Stop replica so it never catches up
        exec kill -SIGSTOP [srv -1 pid]
        $node_0 SET CASE 1

        $node_0 failover to [srv -1 host] [srv -1 port] TIMEOUT 500
        # Wait for failover to end
        wait_for_condition 50 20 {
            [s 0 master_failover_state] == "no-failover"
        } else {
            fail "Failover from node_0 to replica did not finish"
        }

        exec kill -SIGCONT [srv -1 pid]

        # We need to make sure the nodes actually sync back up
        wait_for_ofs_sync $node_0 $node_1
        wait_for_ofs_sync $node_0 $node_2

        assert_match *master* [$node_0 role]
        assert_match *slave* [$node_1 role]
        assert_match *slave* [$node_2 role]

        # Since we never caught up, there should be no syncs
        assert_equal [expr [s 0 sync_partial_ok] - $initial_psyncs] 0
        assert_equal [expr [s 0 sync_full] - $initial_syncs] 0
        assert_digests_match $node_0 $node_1 $node_2
    }

    test {failovers can be aborted} {
        set initial_psyncs [s 0 sync_partial_ok]
        set initial_syncs [s 0 sync_full]

        # Stop replica so it never catches up
        exec kill -SIGSTOP [srv -1 pid]
        $node_0 SET CASE 2

        $node_0 failover to [srv -1 host] [srv -1 port] TIMEOUT 60000
        assert_match [s 0 master_failover_state] "waiting-for-sync"

        # Sanity check that read commands are still accepted
        $node_0 GET CASE

        $node_0 failover abort
        assert_match [s 0 master_failover_state] "no-failover"

        exec kill -SIGCONT [srv -1 pid]

        # Just make sure everything is still synced
        wait_for_ofs_sync $node_0 $node_1
        wait_for_ofs_sync $node_0 $node_2

        assert_match *master* [$node_0 role]
        assert_match *slave* [$node_1 role]
        assert_match *slave* [$node_2 role]

        # Since we never caught up, there should be no syncs
        assert_equal [expr [s 0 sync_partial_ok] - $initial_psyncs] 0
        assert_equal [expr [s 0 sync_full] - $initial_syncs] 0
        assert_digests_match $node_0 $node_1 $node_2
    }

    test {failover aborts if target rejects sync request} {
        set initial_psyncs [s 0 sync_partial_ok]
        set initial_syncs [s 0 sync_full]

        # We block psync, so the failover will fail
        $node_1 acl setuser default -psync

        # We pause the target long enough to send a write command
        # during the pause. This write will not be interrupted.
        exec kill -SIGSTOP [srv -1 pid]
        set rd [redis_deferring_client]
        $rd SET FOO BAR
        $node_0 failover to $node_1_host $node_1_port
        exec kill -SIGCONT [srv -1 pid]

        # Wait for failover to end
        wait_for_condition 50 100 {
            [s 0 master_failover_state] == "no-failover"
        } else {
            fail "Failover from node_0 to replica did not finish"
        }

        assert_equal [$rd read] "OK"
        $rd close

        # restore access to psync
        $node_1 acl setuser default +psync

        # We need to make sure the nodes actually sync back up
        wait_for_sync $node_1
        wait_for_sync $node_2

        assert_match *master* [$node_0 role]
        assert_match *slave* [$node_1 role]
        assert_match *slave* [$node_2 role]

        # We will cycle all of our replicas here and force a psync.
        assert_equal [expr [s 0 sync_partial_ok] - $initial_psyncs] 2
        assert_equal [expr [s 0 sync_full] - $initial_syncs] 0

        assert_equal [count_log_message 0 "Failover target rejected psync request"] 1
        assert_digests_match $node_0 $node_1 $node_2
    }
}
}
}
@@ -86,12 +86,10 @@ proc waitForBgrewriteaof r {
 }

 proc wait_for_sync r {
-    while 1 {
-        if {[status $r master_link_status] eq "down"} {
-            after 10
-        } else {
-            break
-        }
+    wait_for_condition 50 100 {
+        [status $r master_link_status] eq "up"
+    } else {
+        fail "replica didn't sync in time"
     }
 }

@@ -52,6 +52,7 @@ set ::all_tests {
     integration/psync2
     integration/psync2-reg
     integration/psync2-pingoff
+    integration/failover
     integration/redis-cli
     integration/redis-benchmark
     unit/pubsub
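
The new suite is registered in the default test list above; to run only it, the runner's existing single-suite selector should work (a sketch, assuming the stock ./runtest wrapper at the repository root):

    ./runtest --single integration/failover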