Fix occasional hangs on replication reconnection. (#7830)

This happens only on diskless replicas when attempting to reconnect after 
failing to load an RDB file. It is more likely to occur with larger datasets.

After reconnection is initiated, replicationEmptyDbCallback() may get called
and try to write to an unconnected socket. This triggers another issue: the
connection is put into an error state, so the connect handler never gets
called. The problem is a regression introduced by commit cad93ed.

(cherry picked from commit ecd86283ec292c1062f377f5707be57a8a77adb4)
This commit is contained in:
Yossi Gottlieb 2020-09-22 11:38:52 +03:00 committed by Oran Agra
parent 29f6e9fe95
commit 24f258e39c
2 changed files with 14 additions and 3 deletions

View File

@ -168,7 +168,12 @@ static int connSocketWrite(connection *conn, const void *data, size_t data_len)
int ret = write(conn->fd, data, data_len);
if (ret < 0 && errno != EAGAIN) {
conn->last_errno = errno;
conn->state = CONN_STATE_ERROR;
/* Don't overwrite the state of a connection that is not already
* connected, not to mess with handler callbacks.
*/
if (conn->state == CONN_STATE_CONNECTED)
conn->state = CONN_STATE_ERROR;
}
return ret;
@ -180,7 +185,12 @@ static int connSocketRead(connection *conn, void *buf, size_t buf_len) {
conn->state = CONN_STATE_CLOSED;
} else if (ret < 0 && errno != EAGAIN) {
conn->last_errno = errno;
conn->state = CONN_STATE_ERROR;
/* Don't overwrite the state of a connection that is not already
* connected, not to mess with handler callbacks.
*/
if (conn->state == CONN_STATE_CONNECTED)
conn->state = CONN_STATE_ERROR;
}
return ret;

View File

@ -1374,7 +1374,8 @@ void replicationSendNewlineToMaster(void) {
* the new dataset received by the master. */
void replicationEmptyDbCallback(void *privdata) {
UNUSED(privdata);
replicationSendNewlineToMaster();
if (server.repl_state == REPL_STATE_TRANSFER)
replicationSendNewlineToMaster();
}
/* Once we have a link with the master and the synchronization was