From 1980f639b161f46da2944d60f1c2facaf547dc1a Mon Sep 17 00:00:00 2001 From: Yossi Gottlieb Date: Tue, 22 Sep 2020 11:38:52 +0300 Subject: [PATCH] Fix occasional hangs on replication reconnection. (#7830) This happens only on diskless replicas when attempting to reconnect after failing to load an RDB file. It is more likely to occur with larger datasets. After reconnection is initiated, replicationEmptyDbCallback() may get called and try to write to an unconnected socket. This triggered another issue where the connection is put into an error state and the connect handler never gets called. The problem is a regression introduced by commit c17e597. --- src/connection.c | 14 ++++++++++++-- src/replication.c | 3 ++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/connection.c b/src/connection.c index 23b44a314..415cbdf78 100644 --- a/src/connection.c +++ b/src/connection.c @@ -168,7 +168,12 @@ static int connSocketWrite(connection *conn, const void *data, size_t data_len) int ret = write(conn->fd, data, data_len); if (ret < 0 && errno != EAGAIN) { conn->last_errno = errno; - conn->state = CONN_STATE_ERROR; + + /* Don't overwrite the state of a connection that is not already + * connected, not to mess with handler callbacks. + */ + if (conn->state == CONN_STATE_CONNECTED) + conn->state = CONN_STATE_ERROR; } return ret; @@ -180,7 +185,12 @@ static int connSocketRead(connection *conn, void *buf, size_t buf_len) { conn->state = CONN_STATE_CLOSED; } else if (ret < 0 && errno != EAGAIN) { conn->last_errno = errno; - conn->state = CONN_STATE_ERROR; + + /* Don't overwrite the state of a connection that is not already + * connected, not to mess with handler callbacks. + */ + if (conn->state == CONN_STATE_CONNECTED) + conn->state = CONN_STATE_ERROR; } return ret; diff --git a/src/replication.c b/src/replication.c index be05254e8..445ee1970 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1361,7 +1361,8 @@ void replicationSendNewlineToMaster(void) { * the new dataset received by the master. */ void replicationEmptyDbCallback(void *privdata) { UNUSED(privdata); - replicationSendNewlineToMaster(); + if (server.repl_state == REPL_STATE_TRANSFER) + replicationSendNewlineToMaster(); } /* Once we have a link with the master and the synchronization was