Sentinel: use active/last time for ping logic

The PING trigger was improved again by using two fields instead of a
single one to remember when the last ping was sent:

1. The "active" ping is the time at which we sent the last ping that
still received no reply. However, we continue to ping non-replying
instances even if they have an old active ping: the link may be
disconnected and reconnected in the meantime, so the older pings may get
lost even if it's a TCP socket.

2. The "last" ping is the time at which we really sent the last ping
on the wire, and this is used in order to throttle the amount of pings
we send during failures (when no pong is received).

All in all, the failure detector's effectiveness should be identical, but
we avoid flooding instances with pings during failures or when they are
slow.
This commit is contained in:
antirez 2015-05-14 09:56:23 +02:00
parent 3ab49895b4
commit 58d2bb951a

View File

@ -139,10 +139,15 @@ typedef struct instanceLink {
mstime_t pc_last_activity; /* Last time we received any message. */ mstime_t pc_last_activity; /* Last time we received any message. */
mstime_t last_avail_time; /* Last time the instance replied to ping with mstime_t last_avail_time; /* Last time the instance replied to ping with
a reply we consider valid. */ a reply we consider valid. */
mstime_t last_ping_time; /* Last time a pending ping was sent in the mstime_t act_ping_time; /* Time at which the last pending ping (no pong
context of the current command connection received after it) was sent. This field is
with the instance. 0 if still not sent or set to 0 when a pong is received, and set again
if pong already received. */ to the current time if the value is 0 and a new
ping is sent. */
mstime_t last_ping_time; /* Time at which we sent the last ping. This is
only used to avoid sending too many pings
during failure. Idle time is computed using
the act_ping_time field. */
mstime_t last_pong_time; /* Last time the instance replied to ping, mstime_t last_pong_time; /* Last time the instance replied to ping,
whatever the reply was. That's used to check whatever the reply was. That's used to check
if the link is idle and must be reconnected. */ if the link is idle and must be reconnected. */
@ -925,11 +930,12 @@ instanceLink *createInstanceLink(void) {
link->pc_conn_time = 0; link->pc_conn_time = 0;
link->last_reconn_time = 0; link->last_reconn_time = 0;
link->pc_last_activity = 0; link->pc_last_activity = 0;
/* We set the last_ping_time to "now" even if we actually don't have yet /* We set the act_ping_time to "now" even if we actually don't have yet
* a connection with the node, nor we sent a ping. * a connection with the node, nor we sent a ping.
* This is useful to detect a timeout in case we'll not be able to connect * This is useful to detect a timeout in case we'll not be able to connect
* with the node at all. */ * with the node at all. */
link->last_ping_time = mstime(); link->act_ping_time = mstime();
link->last_ping_time = 0;
link->last_avail_time = mstime(); link->last_avail_time = mstime();
link->last_pong_time = mstime(); link->last_pong_time = mstime();
return link; return link;
@ -1344,7 +1350,8 @@ void sentinelResetMaster(sentinelRedisInstance *ri, int flags) {
sdsfree(ri->slave_master_host); sdsfree(ri->slave_master_host);
ri->runid = NULL; ri->runid = NULL;
ri->slave_master_host = NULL; ri->slave_master_host = NULL;
ri->link->last_ping_time = mstime(); ri->link->act_ping_time = mstime();
ri->link->last_ping_time = 0;
ri->link->last_avail_time = mstime(); ri->link->last_avail_time = mstime();
ri->link->last_pong_time = mstime(); ri->link->last_pong_time = mstime();
ri->role_reported_time = mstime(); ri->role_reported_time = mstime();
@ -2199,7 +2206,7 @@ void sentinelPingReplyCallback(redisAsyncContext *c, void *reply, void *privdata
strncmp(r->str,"MASTERDOWN",10) == 0) strncmp(r->str,"MASTERDOWN",10) == 0)
{ {
link->last_avail_time = mstime(); link->last_avail_time = mstime();
link->last_ping_time = 0; /* Flag the pong as received. */ link->act_ping_time = 0; /* Flag the pong as received. */
} else { } else {
/* Send a SCRIPT KILL command if the instance appears to be /* Send a SCRIPT KILL command if the instance appears to be
* down because of a busy script. */ * down because of a busy script. */
@ -2440,20 +2447,31 @@ int sentinelForceHelloUpdateForMaster(sentinelRedisInstance *master) {
return REDIS_OK; return REDIS_OK;
} }
/* Send a PING to the specified instance and refresh the last_ping_time /* Send a PING to the specified instance and refresh the act_ping_time
* if it is zero (that is, if we received a pong for the previous ping). * if it is zero (that is, if we received a pong for the previous ping).
* *
* On error zero is returned, and we can't consider the PING command * On error zero is returned, and we can't consider the PING command
* queued in the connection. */ * queued in the connection. */
int sentinelSendPing(sentinelRedisInstance *ri) { int sentinelSendPing(sentinelRedisInstance *ri) {
static unsigned long long counters[256];
static time_t last;
// printf("(%lld) PING %s\n", mstime(), sentinelGetInstanceTypeString(ri));
counters[ri->flags & (SRI_SLAVE|SRI_MASTER|SRI_SENTINEL)]++;
if (time(NULL)-last >= 5) {
printf("slave: %llu master: %llu sentinel: %llu\n",
counters[SRI_SLAVE], counters[SRI_MASTER], counters[SRI_SENTINEL]);
last = time(NULL);
}
int retval = redisAsyncCommand(ri->link->cc, int retval = redisAsyncCommand(ri->link->cc,
sentinelPingReplyCallback, ri, "PING"); sentinelPingReplyCallback, ri, "PING");
if (retval == REDIS_OK) { if (retval == REDIS_OK) {
ri->link->pending_commands++; ri->link->pending_commands++;
/* We update the ping time only if we received the pong for ri->link->last_ping_time = mstime();
* the previous ping, otherwise we are technically waiting /* We update the active ping time only if we received the pong for
* since the first ping that did not received a reply. */ * the previous ping, otherwise we are technically waiting since the
if (ri->link->last_ping_time == 0) ri->link->last_ping_time = mstime(); * first ping that did not received a reply. */
if (ri->link->act_ping_time == 0)
ri->link->act_ping_time = ri->link->last_ping_time;
return 1; return 1;
} else { } else {
return 0; return 0;
@ -2506,9 +2524,7 @@ void sentinelSendPeriodicCommands(sentinelRedisInstance *ri) {
sentinelInfoReplyCallback, ri, "INFO"); sentinelInfoReplyCallback, ri, "INFO");
if (retval == REDIS_OK) ri->link->pending_commands++; if (retval == REDIS_OK) ri->link->pending_commands++;
} else if ((now - ri->link->last_pong_time) > ping_period && } else if ((now - ri->link->last_pong_time) > ping_period &&
(ri->link->last_ping_time == 0 || (now - ri->link->last_ping_time) > ping_period/2) {
now - ri->link->last_ping_time > ping_period*2))
{
/* Send PING to all the three kinds of instances. */ /* Send PING to all the three kinds of instances. */
sentinelSendPing(ri); sentinelSendPing(ri);
} else if ((now - ri->last_pub_time) > SENTINEL_PUBLISH_PERIOD) { } else if ((now - ri->last_pub_time) > SENTINEL_PUBLISH_PERIOD) {
@ -2592,7 +2608,7 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) {
addReplyBulkCString(c,"last-ping-sent"); addReplyBulkCString(c,"last-ping-sent");
addReplyBulkLongLong(c, addReplyBulkLongLong(c,
ri->link->last_ping_time ? (mstime() - ri->link->last_ping_time) : 0); ri->link->act_ping_time ? (mstime() - ri->link->act_ping_time) : 0);
fields++; fields++;
addReplyBulkCString(c,"last-ok-ping-reply"); addReplyBulkCString(c,"last-ok-ping-reply");
@ -3202,8 +3218,8 @@ void sentinelPublishCommand(redisClient *c) {
void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) { void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) {
mstime_t elapsed = 0; mstime_t elapsed = 0;
if (ri->link->last_ping_time) if (ri->link->act_ping_time)
elapsed = mstime() - ri->link->last_ping_time; elapsed = mstime() - ri->link->act_ping_time;
/* Check if we are in need for a reconnection of one of the /* Check if we are in need for a reconnection of one of the
* links, because we are detecting low activity. * links, because we are detecting low activity.
@ -3214,10 +3230,10 @@ void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) {
if (ri->link->cc && if (ri->link->cc &&
(mstime() - ri->link->cc_conn_time) > (mstime() - ri->link->cc_conn_time) >
SENTINEL_MIN_LINK_RECONNECT_PERIOD && SENTINEL_MIN_LINK_RECONNECT_PERIOD &&
ri->link->last_ping_time != 0 && /* Ther is a pending ping... */ ri->link->act_ping_time != 0 && /* Ther is a pending ping... */
/* The pending ping is delayed, and we did not received /* The pending ping is delayed, and we did not received
* error replies as well. */ * error replies as well. */
(mstime() - ri->link->last_ping_time) > (ri->down_after_period/2) && (mstime() - ri->link->act_ping_time) > (ri->down_after_period/2) &&
(mstime() - ri->link->last_pong_time) > (ri->down_after_period/2)) (mstime() - ri->link->last_pong_time) > (ri->down_after_period/2))
{ {
instanceLinkCloseConnection(ri->link,ri->link->cc); instanceLinkCloseConnection(ri->link,ri->link->cc);