Clear outdated failure reports more accurately (#1184)
There are two changes here: 1. The one in clusterNodeCleanupFailureReports: only a primary with slots can submit a failure report, so if the reporting primary becomes a replica, its failure report should be cleared. Otherwise, I guess the stale report may lead to an inaccurate node failure judgment in some network partition cases, and it also affects the CLUSTER COUNT-FAILURE-REPORTS command. 2. The one in clusterProcessGossipSection: it is not that important, but it allows a "node is back online" log to be printed, which helps us troubleshoot problems, although it may conflict with 1 at some points. Signed-off-by: Binbin <binloveplay1314@qq.com>
This commit is contained in:
parent
e48317eb34
commit
ca0b0c662a
@ -1552,9 +1552,14 @@ int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) {
|
||||
* older than the global node timeout. Note that anyway for a node to be
|
||||
* flagged as FAIL we need to have a local PFAIL state that is at least
|
||||
* older than the global node timeout, so we don't just trust the number
|
||||
* of failure reports from other nodes. */
|
||||
* of failure reports from other nodes.
|
||||
*
|
||||
* If the reporting node loses its voting right during this time, we will
|
||||
* also clear its report. */
|
||||
void clusterNodeCleanupFailureReports(clusterNode *node) {
|
||||
list *l = node->fail_reports;
|
||||
if (!listLength(l)) return;
|
||||
|
||||
listNode *ln;
|
||||
listIter li;
|
||||
clusterNodeFailReport *fr;
|
||||
@ -1564,7 +1569,11 @@ void clusterNodeCleanupFailureReports(clusterNode *node) {
|
||||
listRewind(l, &li);
|
||||
while ((ln = listNext(&li)) != NULL) {
|
||||
fr = ln->value;
|
||||
if (now - fr->time > maxtime) listDelNode(l, ln);
|
||||
if (now - fr->time > maxtime) {
|
||||
listDelNode(l, ln);
|
||||
} else if (!clusterNodeIsVotingPrimary(fr->node)) {
|
||||
listDelNode(l, ln);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1581,6 +1590,8 @@ void clusterNodeCleanupFailureReports(clusterNode *node) {
|
||||
* Otherwise 0 is returned. */
|
||||
int clusterNodeDelFailureReport(clusterNode *node, clusterNode *sender) {
|
||||
list *l = node->fail_reports;
|
||||
if (!listLength(l)) return 0;
|
||||
|
||||
listNode *ln;
|
||||
listIter li;
|
||||
clusterNodeFailReport *fr;
|
||||
@ -2254,10 +2265,11 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
|
||||
/* Ignore gossips about self. */
|
||||
if (node && node != myself) {
|
||||
/* We already know this node.
|
||||
Handle failure reports, only when the sender is a voting primary. */
|
||||
if (sender && clusterNodeIsVotingPrimary(sender)) {
|
||||
* Handle failure reports, the report is added only if the sender is a voting primary,
|
||||
* and deletion of a failure report is not restricted. */
|
||||
if (sender) {
|
||||
if (flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_PFAIL)) {
|
||||
if (clusterNodeAddFailureReport(node, sender)) {
|
||||
if (clusterNodeIsVotingPrimary(sender) && clusterNodeAddFailureReport(node, sender)) {
|
||||
serverLog(LL_NOTICE, "Node %.40s (%s) reported node %.40s (%s) as not reachable.", sender->name,
|
||||
sender->human_nodename, node->name, node->human_nodename);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user