Fix occasional hangs on replication reconnection. (#7830)

This happens only on diskless replicas when attempting to reconnect after 
failing to load an RDB file. It is more likely to occur with larger datasets.

After reconnection is initiated, replicationEmptyDbCallback() may get called
and try to write to an unconnected socket. This triggers another issue: the
connection is put into an error state, so the connect handler never gets
called. The problem is a regression introduced by commit cad93ed.
This commit is contained in:
Yossi Gottlieb 2020-09-22 11:38:52 +03:00 committed by GitHub
parent b914d4fc48
commit ecd86283ec
2 changed files with 14 additions and 3 deletions

View File

@ -168,7 +168,12 @@ static int connSocketWrite(connection *conn, const void *data, size_t data_len)
int ret = write(conn->fd, data, data_len);
if (ret < 0 && errno != EAGAIN) {
conn->last_errno = errno;
conn->state = CONN_STATE_ERROR;
/* Don't overwrite the state of a connection that is not already
* connected, not to mess with handler callbacks.
*/
if (conn->state == CONN_STATE_CONNECTED)
conn->state = CONN_STATE_ERROR;
}
return ret;
@ -180,7 +185,12 @@ static int connSocketRead(connection *conn, void *buf, size_t buf_len) {
conn->state = CONN_STATE_CLOSED;
} else if (ret < 0 && errno != EAGAIN) {
conn->last_errno = errno;
conn->state = CONN_STATE_ERROR;
/* Don't overwrite the state of a connection that is not already
* connected, not to mess with handler callbacks.
*/
if (conn->state == CONN_STATE_CONNECTED)
conn->state = CONN_STATE_ERROR;
}
return ret;

View File

@ -1361,7 +1361,8 @@ void replicationSendNewlineToMaster(void) {
* the new dataset received by the master. */
void replicationEmptyDbCallback(void *privdata) {
/* privdata is not otherwise referenced by this callback. */
UNUSED(privdata);
/* NOTE(review): this listing appears to be a diff rendered without its
 * +/- markers, so the unconditional call below is presumably the
 * pre-change line that the guarded call after it replaces -- confirm
 * against the actual file before assuming both calls are present. */
replicationSendNewlineToMaster();
/* Send the newline only while server.repl_state is REPL_STATE_TRANSFER.
 * The commit description states that this callback may run after a
 * reconnection has been initiated, in which case the write would target
 * an unconnected socket. */
if (server.repl_state == REPL_STATE_TRANSFER)
replicationSendNewlineToMaster();
}
/* Once we have a link with the master and the synchronization was