diff --git a/src/rdb.c b/src/rdb.c index d4e44be5d..827ddc4f2 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -2449,6 +2449,18 @@ void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) { server.rdb_child_pid = -1; server.rdb_child_type = RDB_CHILD_TYPE_NONE; server.rdb_save_time_start = -1; + if (server.rdb_child_exit_pipe!=-1) + close(server.rdb_child_exit_pipe); + close(server.rdb_pipe_read); + server.rdb_child_exit_pipe = -1; + server.rdb_pipe_read = -1; + zfree(server.rdb_pipe_conns); + server.rdb_pipe_conns = NULL; + server.rdb_pipe_numconns = 0; + server.rdb_pipe_numconns_writing = 0; + zfree(server.rdb_pipe_buff); + server.rdb_pipe_buff = NULL; + server.rdb_pipe_bufflen = 0; updateSlavesWaitingBgsave((!bysignal && exitcode == 0) ? C_OK : C_ERR, RDB_CHILD_TYPE_SOCKET); } @@ -2484,7 +2496,7 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) { listNode *ln; listIter li; pid_t childpid; - int pipefds[2]; + int pipefds[2], rdb_pipe_write, safe_to_exit_pipe; if (hasActiveChildProcess()) return C_ERR; @@ -2497,10 +2509,20 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) { * of TLS we must let the parent handle a continuous TLS state when the * child terminates and parent takes over. */ if (pipe(pipefds) == -1) return C_ERR; - server.rdb_pipe_read = pipefds[0]; - server.rdb_pipe_write = pipefds[1]; + server.rdb_pipe_read = pipefds[0]; /* read end */ + rdb_pipe_write = pipefds[1]; /* write end */ anetNonBlock(NULL, server.rdb_pipe_read); + /* create another pipe that is used by the parent to signal to the child + * that it can exit. */ + if (pipe(pipefds) == -1) { + close(rdb_pipe_write); + close(server.rdb_pipe_read); + return C_ERR; + } + safe_to_exit_pipe = pipefds[0]; /* read end */ + server.rdb_child_exit_pipe = pipefds[1]; /* write end */ + /* Collect the connections of the replicas we want to transfer * the RDB to, which are i WAIT_BGSAVE_START state. */ server.rdb_pipe_conns = zmalloc(sizeof(connection *)*listLength(server.slaves)); @@ -2519,10 +2541,10 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) { openChildInfoPipe(); if ((childpid = redisFork()) == 0) { /* Child */ - int retval; + int retval, dummy; rio rdb; - rioInitWithFd(&rdb,server.rdb_pipe_write); + rioInitWithFd(&rdb,rdb_pipe_write); redisSetProcTitle("redis-rdb-to-slaves"); redisSetCpuAffinity(server.bgsave_cpulist); @@ -2536,10 +2558,17 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) { } rioFreeFd(&rdb); - close(server.rdb_pipe_write); /* wake up the reader, tell it we're done. */ + /* wake up the reader, tell it we're done. */ + close(rdb_pipe_write); + close(server.rdb_child_exit_pipe); /* close write end so that we can detect the close on the parent. */ + /* hold exit until the parent tells us it's safe. we're not expecting + * to read anything, just get the error when the pipe is closed. */ + dummy = read(safe_to_exit_pipe, pipefds, 1); + UNUSED(dummy); exitFromChild((retval == C_OK) ? 0 : 1); } else { /* Parent */ + close(safe_to_exit_pipe); if (childpid == -1) { serverLog(LL_WARNING,"Can't save in background: fork: %s", strerror(errno)); @@ -2554,7 +2583,7 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) { slave->replstate = SLAVE_STATE_WAIT_BGSAVE_START; } } - close(server.rdb_pipe_write); + close(rdb_pipe_write); close(server.rdb_pipe_read); zfree(server.rdb_pipe_conns); server.rdb_pipe_conns = NULL; @@ -2567,7 +2596,7 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) { server.rdb_save_time_start = time(NULL); server.rdb_child_pid = childpid; server.rdb_child_type = RDB_CHILD_TYPE_SOCKET; - close(server.rdb_pipe_write); /* close write in parent so that it can detect the close on the child. */ + close(rdb_pipe_write); /* close write in parent so that it can detect the close on the child. */ if (aeCreateFileEvent(server.el, server.rdb_pipe_read, AE_READABLE, rdbPipeReadHandler,NULL) == AE_ERR) { serverPanic("Unrecoverable error creating server.rdb_pipe_read file event."); } diff --git a/src/replication.c b/src/replication.c index d629a197f..32c41361c 100644 --- a/src/replication.c +++ b/src/replication.c @@ -912,7 +912,12 @@ void replconfCommand(client *c) { * the slave online when the first ACK is received (which * confirms slave is online and ready to get more data). This * allows for simpler and less CPU intensive EOF detection - * when streaming RDB files. */ + * when streaming RDB files. + * There's a chance the ACK got to us before we detected that the + * bgsave is done (since that depends on cron ticks), so run a + * quick check first (instead of waiting for the next ACK. */ + if (server.rdb_child_pid != -1 && c->replstate == SLAVE_STATE_WAIT_BGSAVE_END) + checkChildrenDone(); if (c->repl_put_online_on_ack && c->replstate == SLAVE_STATE_ONLINE) putSlaveOnline(c); /* Note: this command does not reply anything! */ @@ -1104,24 +1109,6 @@ void rdbPipeWriteHandler(struct connection *conn) { rdbPipeWriteHandlerConnRemoved(conn); } -/* When the the pipe serving diskless rdb transfer is drained (write end was - * closed), we can clean up all the temporary variables, and cleanup after the - * fork child. */ -void RdbPipeCleanup() { - close(server.rdb_pipe_read); - zfree(server.rdb_pipe_conns); - server.rdb_pipe_conns = NULL; - server.rdb_pipe_numconns = 0; - server.rdb_pipe_numconns_writing = 0; - zfree(server.rdb_pipe_buff); - server.rdb_pipe_buff = NULL; - server.rdb_pipe_bufflen = 0; - - /* Since we're avoiding to detect the child exited as long as the pipe is - * not drained, so now is the time to check. */ - checkChildrenDone(); -} - /* Called in diskless master, when there's data to read from the child's rdb pipe */ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, int mask) { UNUSED(mask); @@ -1162,7 +1149,11 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, stillUp++; } serverLog(LL_WARNING,"Diskless rdb transfer, done reading from pipe, %d replicas still up.", stillUp); - RdbPipeCleanup(); + /* Now that the replicas have finished reading, notify the child that it's safe to exit. + * When the server detectes the child has exited, it can mark the replica as online, and + * start streaming the replication buffers. */ + close(server.rdb_child_exit_pipe); + server.rdb_child_exit_pipe = -1; return; } @@ -1203,7 +1194,6 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, if (stillAlive == 0) { serverLog(LL_WARNING,"Diskless rdb transfer, last replica dropped, killing fork child."); killRDBChild(); - RdbPipeCleanup(); } /* Remove the pipe read handler if at least one write handler was set. */ if (server.rdb_pipe_numconns_writing || stillAlive == 0) { diff --git a/src/server.c b/src/server.c index 5e9c85585..aa3e6b21b 100644 --- a/src/server.c +++ b/src/server.c @@ -1785,14 +1785,6 @@ void checkChildrenDone(void) { int statloc; pid_t pid; - /* If we have a diskless rdb child (note that we support only one concurrent - * child), we want to avoid collecting it's exit status and acting on it - * as long as we didn't finish to drain the pipe, since then we're at risk - * of starting a new fork and a new pipe before we're done with the previous - * one. */ - if (server.rdb_child_pid != -1 && server.rdb_pipe_conns) - return; - if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) { int exitcode = WEXITSTATUS(statloc); int bysignal = 0; diff --git a/src/server.h b/src/server.h index 6dac914c6..05b7d1dd7 100644 --- a/src/server.h +++ b/src/server.h @@ -1257,8 +1257,9 @@ struct redisServer { int rdb_child_type; /* Type of save by active child. */ int lastbgsave_status; /* C_OK or C_ERR */ int stop_writes_on_bgsave_err; /* Don't allow writes if can't BGSAVE */ - int rdb_pipe_write; /* RDB pipes used to transfer the rdb */ - int rdb_pipe_read; /* data to the parent process in diskless repl. */ + int rdb_pipe_read; /* RDB pipe used to transfer the rdb data */ + /* to the parent process in diskless repl. */ + int rdb_child_exit_pipe; /* Used by the diskless parent allow child exit. */ connection **rdb_pipe_conns; /* Connections which are currently the */ int rdb_pipe_numconns; /* target of diskless rdb fork child. */ int rdb_pipe_numconns_writing; /* Number of rdb conns with pending writes. */ diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index a2f9e75bd..5438b0eba 100644 --- a/tests/integration/replication.tcl +++ b/tests/integration/replication.tcl @@ -659,6 +659,48 @@ start_server {tags {"repl"}} { } } +test "diskless replication child being killed is collected" { + # when diskless master is waiting for the replica to become writable + # it removes the read event from the rdb pipe so if the child gets killed + # the replica will hung. and the master may not collect the pid with wait3 + start_server {tags {"repl"}} { + set master [srv 0 client] + set master_host [srv 0 host] + set master_port [srv 0 port] + set master_pid [srv 0 pid] + $master config set repl-diskless-sync yes + $master config set repl-diskless-sync-delay 0 + # put enough data in the db that the rdb file will be bigger than the socket buffers + $master debug populate 20000 test 10000 + $master config set rdbcompression no + start_server {} { + set replica [srv 0 client] + set loglines [count_log_lines 0] + $replica config set repl-diskless-load swapdb + $replica config set key-load-delay 1000000 + $replica replicaof $master_host $master_port + + # wait for the replicas to start reading the rdb + wait_for_log_messages 0 {"*Loading DB in memory*"} $loglines 800 10 + + # wait to be sure the eplica is hung and the master is blocked on write + after 500 + + # simulate the OOM killer or anyone else kills the child + set fork_child_pid [get_child_pid -1] + puts "fork child is $fork_child_pid" + exec kill -9 $fork_child_pid + + # wait for the parent to notice the child have exited + wait_for_condition 50 100 { + [s -1 rdb_bgsave_in_progress] == 0 + } else { + fail "rdb child didn't terminate" + } + } + } +} + test {replicaof right after disconnection} { # this is a rare race condition that was reproduced sporadically by the psync2 unit. # see details in #7205 diff --git a/tests/support/util.tcl b/tests/support/util.tcl index 6d11e5520..ed4ff713a 100644 --- a/tests/support/util.tcl +++ b/tests/support/util.tcl @@ -505,3 +505,12 @@ proc populate {num prefix size} { } $rd close } + +proc get_child_pid {idx} { + set pid [srv $idx pid] + set fd [open "|ps --ppid $pid -o pid" "r"] + set child_pid [string trim [lindex [split [read $fd] \n] 1]] + close $fd + + return $child_pid +} diff --git a/tests/unit/oom-score-adj.tcl b/tests/unit/oom-score-adj.tcl index 993004602..8eb09a993 100644 --- a/tests/unit/oom-score-adj.tcl +++ b/tests/unit/oom-score-adj.tcl @@ -14,15 +14,6 @@ if {$system_name eq {linux}} { return $val } - proc get_child_pid {} { - set pid [srv 0 pid] - set fd [open "|ps --ppid $pid -o pid" "r"] - set child_pid [string trim [lindex [split [read $fd] \n] 1]] - close $fd - - return $child_pid - } - test {CONFIG SET oom-score-adj works as expected} { set base [get_oom_score_adj] @@ -47,7 +38,7 @@ if {$system_name eq {linux}} { r config set rdb-key-save-delay 100000 r bgsave - set child_pid [get_child_pid] + set child_pid [get_child_pid 0] assert {[get_oom_score_adj $child_pid] == [expr $base + 30]} }