diff --git a/sentinel.conf b/sentinel.conf index c5341168e..b145ae518 100644 --- a/sentinel.conf +++ b/sentinel.conf @@ -339,3 +339,13 @@ SENTINEL resolve-hostnames no # to retain the hostnames when announced, enable announce-hostnames below. # SENTINEL announce-hostnames no + +# When master-reboot-down-after-period is set to 0, Sentinel does not fail over +# when receiving a -LOADING response from a master. This was the only supported +# behavior before version 7.0. +# +# Otherwise, Sentinel will use this value as the time (in ms) it is willing to +# accept a -LOADING response after a master has been rebooted, before failing +# over. + +SENTINEL master-reboot-down-after-period mymaster 0 diff --git a/src/sentinel.c b/src/sentinel.c index 1db3bc261..297f0591f 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -76,6 +76,7 @@ typedef struct sentinelAddr { #define SRI_RECONF_DONE (1<<10) /* Slave synchronized with new master. */ #define SRI_FORCE_FAILOVER (1<<11) /* Force failover with master up. */ #define SRI_SCRIPT_KILL_SENT (1<<12) /* SCRIPT KILL already sent on -BUSY */ +#define SRI_MASTER_REBOOT (1<<13) /* Master was detected as rebooting */ /* Note: times are in milliseconds. */ #define SENTINEL_PING_PERIOD 1000 @@ -193,6 +194,8 @@ typedef struct sentinelRedisInstance { mstime_t s_down_since_time; /* Subjectively down since time. */ mstime_t o_down_since_time; /* Objectively down since time. */ mstime_t down_after_period; /* Consider it down after that period. */ + mstime_t master_reboot_down_after_period; /* Consider master down after that period. */ + mstime_t master_reboot_since_time; /* Time at which the master reboot was detected. */ mstime_t info_refresh; /* Time at which we received INFO output from it. 
*/ dict *renamed_commands; /* Commands renamed in this instance: Sentinel will use the alternative commands @@ -1294,8 +1297,8 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * ri->last_master_down_reply_time = mstime(); ri->s_down_since_time = 0; ri->o_down_since_time = 0; - ri->down_after_period = master ? master->down_after_period : - sentinel_default_down_after; + ri->down_after_period = master ? master->down_after_period : sentinel_default_down_after; + ri->master_reboot_down_after_period = 0; ri->master_link_down_time = 0; ri->auth_pass = NULL; ri->auth_user = NULL; @@ -1971,6 +1974,13 @@ const char *sentinelHandleConfiguration(char **argv, int argc) { if ((sentinel.announce_hostnames = yesnotoi(argv[1])) == -1) { return "Please specify yes or no for the announce-hostnames option."; } + } else if (!strcasecmp(argv[0],"master-reboot-down-after-period") && argc == 3) { + /* master-reboot-down-after-period */ + ri = sentinelGetMasterByName(argv[1]); + if (!ri) return "No such master with specified name."; + ri->master_reboot_down_after_period = atoi(argv[2]); + if (ri->master_reboot_down_after_period < 0) + return "negative time parameter."; } else { return "Unrecognized sentinel configuration statement."; } @@ -2090,6 +2100,15 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) { /* rewriteConfigMarkAsProcessed is handled after the loop */ } + /* sentinel master-reboot-down-after-period */ + if (master->master_reboot_down_after_period != 0) { + line = sdscatprintf(sdsempty(), + "sentinel master-reboot-down-after-period %s %ld", + master->name, (long) master->master_reboot_down_after_period); + rewriteConfigRewriteLine(state,"sentinel master-reboot-down-after-period",line,1); + /* rewriteConfigMarkAsProcessed is handled after the loop */ + } + /* sentinel config-epoch */ line = sdscatprintf(sdsempty(), "sentinel config-epoch %s %llu", @@ -2214,6 +2233,7 @@ void rewriteConfigSentinelOption(struct 
rewriteConfigState *state) { rewriteConfigMarkAsProcessed(state,"sentinel known-replica"); rewriteConfigMarkAsProcessed(state,"sentinel known-sentinel"); rewriteConfigMarkAsProcessed(state,"sentinel rename-command"); + rewriteConfigMarkAsProcessed(state,"sentinel master-reboot-down-after-period"); } /* This function uses the config rewriting Redis engine in order to persist @@ -2456,6 +2476,12 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { } else { if (strncmp(ri->runid,l+7,40) != 0) { sentinelEvent(LL_NOTICE,"+reboot",ri,"%@"); + + if (ri->flags & SRI_MASTER && ri->master_reboot_down_after_period != 0) { + ri->flags |= SRI_MASTER_REBOOT; + ri->master_reboot_since_time = mstime(); + } + sdsfree(ri->runid); ri->runid = sdsnewlen(l+7,40); } @@ -2723,6 +2749,10 @@ void sentinelPingReplyCallback(redisAsyncContext *c, void *reply, void *privdata { link->last_avail_time = mstime(); link->act_ping_time = 0; /* Flag the pong as received. */ + + if (ri->flags & SRI_MASTER_REBOOT && strncmp(r->str,"PONG",4) == 0) + ri->flags &= ~SRI_MASTER_REBOOT; + } else { /* Send a SCRIPT KILL command if the instance appears to be * down because of a busy script. 
*/ @@ -4255,6 +4285,15 @@ void sentinelSetCommand(client *c) { dictAdd(ri->renamed_commands,oldname,newname); } changes++; + } else if (!strcasecmp(option,"master-reboot-down-after-period") && moreargs > 0) { + /* master-reboot-down-after-period */ + robj *o = c->argv[++j]; + if (getLongLongFromObject(o,&ll) == C_ERR || ll < 0) { + badarg = j; + goto badfmt; + } + ri->master_reboot_down_after_period = ll; + changes++; } else { addReplyErrorFormat(c,"Unknown option or number of arguments for " "SENTINEL SET '%s'", option); @@ -4358,7 +4397,9 @@ void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) { (ri->flags & SRI_MASTER && ri->role_reported == SRI_SLAVE && mstime() - ri->role_reported_time > - (ri->down_after_period+sentinel_info_period*2))) + (ri->down_after_period+sentinel_info_period*2)) || + (ri->flags & SRI_MASTER_REBOOT && + mstime()-ri->master_reboot_since_time > ri->master_reboot_down_after_period)) { /* Is subjectively down */ if ((ri->flags & SRI_S_DOWN) == 0) { diff --git a/tests/sentinel/tests/12-master-reboot.tcl b/tests/sentinel/tests/12-master-reboot.tcl new file mode 100644 index 000000000..1fdd91d6a --- /dev/null +++ b/tests/sentinel/tests/12-master-reboot.tcl @@ -0,0 +1,103 @@ +# Check that Sentinel fails over a master that is slow to reload after a reboot. +source "../tests/includes/init-tests.tcl" + + +if {$::simulate_error} { + test "This test will fail" { + fail "Simulated error" + } +} + + +# Restart an instance with its old configuration, without waiting for it to finish loading its dataset. +proc reboot_instance {type id} { + set dirname "${type}_${id}" + set cfgfile [file join $dirname $type.conf] + set port [get_instance_attrib $type $id port] + + # Execute the instance with its old setup and append the new pid + # file for cleanup. 
+ set pid [exec_instance $type $dirname $cfgfile] + set_instance_attrib $type $id pid $pid + lappend ::pids $pid + + # Check that the instance is running + if {[server_is_up 127.0.0.1 $port 100] == 0} { + set logfile [file join $dirname log.txt] + puts [exec tail $logfile] + abort_sentinel_test "Problems starting $type #$id: ping timeout, maybe server start failed, check $logfile" + } + + # Connect to it with a fresh link + set link [redis 127.0.0.1 $port 0 $::tls] + $link reconnect 1 + set_instance_attrib $type $id link $link +} + + +test "Master reboot in very short time" { + set old_port [RPort $master_id] + set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] + assert {[lindex $addr 1] == $old_port} + + R $master_id debug populate 10000 + R $master_id bgsave + R $master_id config set key-load-delay 1500 + R $master_id config set loading-process-events-interval-bytes 1024 + R $master_id config rewrite + + foreach_sentinel_id id { + S $id SENTINEL SET mymaster master-reboot-down-after-period 5000 + S $id sentinel debug ping-period 500 + S $id sentinel debug ask-period 500 + } + + kill_instance redis $master_id + reboot_instance redis $master_id + + foreach_sentinel_id id { + wait_for_condition 1000 100 { + [lindex [S $id SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] 1] != $old_port + } else { + fail "At least one Sentinel did not receive failover info" + } + } + + set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] + set master_id [get_instance_id_by_port redis [lindex $addr 1]] + + # Make sure the instance has loaded the whole dataset. NOTE(review): $link is only set inside reboot_instance; confirm it is defined in this scope. + while 1 { + catch {[$link ping]} retval + if {[string match {*LOADING*} $retval]} { + after 100 + continue + } else { + break + } + } +} + +test "New master [join $addr {:}] role matches" { + assert {[RI $master_id role] eq {master}} +} + +test "All the other slaves now point to the new master" { + foreach_redis_id id { + if {$id != $master_id && $id != 0} { + wait_for_condition 1000 50 { + [RI $id master_port] == [lindex $addr 
1] + } else { + fail "Redis ID $id not configured to replicate with new master" + } + } + } +} + +test "The old master eventually gets reconfigured as a slave" { + wait_for_condition 1000 50 { + [RI 0 master_port] == [lindex $addr 1] + } else { + fail "Old master not reconfigured as slave of new master" + } +} \ No newline at end of file