Increase frequency of failover log and emit the status of the election to help debugging (#11665)
This change increases the frequency of the failover log by shortening the interval between messages from 5 minutes to 10 seconds. This log is only emitted when a replica has an outstanding election in progress, and waiting 5 minutes for the next log message makes debugging and alarming on the log messages too slow. The log now also prints the number of votes the replica has received so far, as well as the number of votes it needs to achieve quorum, so that we can track the progress of an election that is running slowly. Co-authored-by: Madelyn Olson <34459052+madolson@users.noreply.github.com>
This commit is contained in:
parent
12826fa38f
commit
395d801a2d
@ -3868,7 +3868,14 @@ void clusterLogCantFailover(int reason) {
|
||||
break;
|
||||
}
|
||||
lastlog_time = time(NULL);
|
||||
serverLog(LL_WARNING,"Currently unable to failover: %s", msg);
|
||||
serverLog(LL_NOTICE,"Currently unable to failover: %s", msg);
|
||||
|
||||
int cur_vote = server.cluster->failover_auth_count;
|
||||
int cur_quorum = (server.cluster->size / 2) + 1;
|
||||
/* Emits a log when an election is in progress and waiting for votes or when the failover attempt expired. */
|
||||
if (reason == CLUSTER_CANT_FAILOVER_WAITING_VOTES || reason == CLUSTER_CANT_FAILOVER_EXPIRED) {
|
||||
serverLog(LL_NOTICE, "Needed quorum: %d. Number of votes received so far: %d", cur_quorum, cur_vote);
|
||||
}
|
||||
}
|
||||
|
||||
/* This function implements the final part of automatic and manual failovers,
|
||||
|
@ -73,7 +73,7 @@ typedef struct clusterLink {
|
||||
#define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2
|
||||
#define CLUSTER_CANT_FAILOVER_EXPIRED 3
|
||||
#define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4
|
||||
#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD (60*5) /* seconds. */
|
||||
#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD (10) /* seconds. */
|
||||
|
||||
/* clusterState todo_before_sleep flags. */
|
||||
#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0)
|
||||
|
Loading…
x
Reference in New Issue
Block a user