From 725616534e77362b8c0a0203a7df024a82baffc7 Mon Sep 17 00:00:00 2001
From: Oran Agra <oran@redislabs.com>
Date: Sun, 6 Sep 2020 16:43:57 +0300
Subject: [PATCH] if diskless repl child is killed, make sure to reap the pid
 (#7742)

Starting redis 6.0 and the changes we made to the diskless master to be
suitable for TLS, I made the master avoid reaping (wait3) the pid of the
child until we know all replicas are done reading their rdb.

I did that in order to avoid a state where the rdb_child_pid is -1 but
we don't yet want to start another fork (still busy serving that data to
replicas).

It turns out that the solution used so far was problematic in case the
fork child was being killed (e.g. by the kernel OOM killer), in that
case there's a chance that we currently disabled the read event on the
rdb pipe, since we're waiting for a replica to become writable again.
and in that scenario the master would have never realized the child
exited, and the replica will remain hung too.
Note that there's no mechanism to detect a hung replica while it's in
rdb transfer state.

The solution here is to add another pipe which is used by the parent to
tell the child it is safe to exit. this mean that when the child exits,
for whatever reason, it is safe to reap it.

Besides that, i'm re-introducing an adjustment to REPLCONF ACK which was
part of #6271 (Accelerate diskless master connections) but was dropped
when that PR was rebased after the TLS fork/pipe changes (6fd5ff8).
Now that RdbPipeCleanup no longer calls checkChildrenDone, and the ACK
has chance to detect that the child exited, it should be the one to call
it so that we don't have to wait for cron (server.hz) to do that.
---
 src/rdb.c                         | 45 +++++++++++++++++++++++++------
 src/replication.c                 | 32 ++++++++--------------
 src/server.c                      |  8 ------
 src/server.h                      |  5 ++--
 tests/integration/replication.tcl | 42 +++++++++++++++++++++++++++++
 tests/support/util.tcl            |  9 +++++++
 tests/unit/oom-score-adj.tcl      | 11 +-------
 7 files changed, 103 insertions(+), 49 deletions(-)

diff --git a/src/rdb.c b/src/rdb.c
index d4e44be5d..827ddc4f2 100644
--- a/src/rdb.c
+++ b/src/rdb.c
@@ -2449,6 +2449,18 @@ void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) {
     server.rdb_child_pid = -1;
     server.rdb_child_type = RDB_CHILD_TYPE_NONE;
     server.rdb_save_time_start = -1;
+    if (server.rdb_child_exit_pipe!=-1)
+        close(server.rdb_child_exit_pipe);
+    close(server.rdb_pipe_read);
+    server.rdb_child_exit_pipe = -1;
+    server.rdb_pipe_read = -1;
+    zfree(server.rdb_pipe_conns);
+    server.rdb_pipe_conns = NULL;
+    server.rdb_pipe_numconns = 0;
+    server.rdb_pipe_numconns_writing = 0;
+    zfree(server.rdb_pipe_buff);
+    server.rdb_pipe_buff = NULL;
+    server.rdb_pipe_bufflen = 0;
 
     updateSlavesWaitingBgsave((!bysignal && exitcode == 0) ? C_OK : C_ERR, RDB_CHILD_TYPE_SOCKET);
 }
@@ -2484,7 +2496,7 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) {
     listNode *ln;
     listIter li;
     pid_t childpid;
-    int pipefds[2];
+    int pipefds[2], rdb_pipe_write, safe_to_exit_pipe;
 
     if (hasActiveChildProcess()) return C_ERR;
 
@@ -2497,10 +2509,20 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) {
      * of TLS we must let the parent handle a continuous TLS state when the
      * child terminates and parent takes over. */
     if (pipe(pipefds) == -1) return C_ERR;
-    server.rdb_pipe_read = pipefds[0];
-    server.rdb_pipe_write = pipefds[1];
+    server.rdb_pipe_read = pipefds[0]; /* read end */
+    rdb_pipe_write = pipefds[1]; /* write end */
     anetNonBlock(NULL, server.rdb_pipe_read);
 
+    /* create another pipe that is used by the parent to signal to the child
+     * that it can exit. */
+    if (pipe(pipefds) == -1) {
+        close(rdb_pipe_write);
+        close(server.rdb_pipe_read);
+        return C_ERR;
+    }
+    safe_to_exit_pipe = pipefds[0]; /* read end */
+    server.rdb_child_exit_pipe = pipefds[1]; /* write end */
+
     /* Collect the connections of the replicas we want to transfer
      * the RDB to, which are i WAIT_BGSAVE_START state. */
     server.rdb_pipe_conns = zmalloc(sizeof(connection *)*listLength(server.slaves));
@@ -2519,10 +2541,10 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) {
     openChildInfoPipe();
     if ((childpid = redisFork()) == 0) {
         /* Child */
-        int retval;
+        int retval, dummy;
         rio rdb;
 
-        rioInitWithFd(&rdb,server.rdb_pipe_write);
+        rioInitWithFd(&rdb,rdb_pipe_write);
 
         redisSetProcTitle("redis-rdb-to-slaves");
         redisSetCpuAffinity(server.bgsave_cpulist);
@@ -2536,10 +2558,17 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) {
         }
 
         rioFreeFd(&rdb);
-        close(server.rdb_pipe_write); /* wake up the reader, tell it we're done. */
+        /* wake up the reader, tell it we're done. */
+        close(rdb_pipe_write);
+        close(server.rdb_child_exit_pipe); /* close write end so that we can detect the close on the parent. */
+        /* hold exit until the parent tells us it's safe. we're not expecting
+         * to read anything, just get the error when the pipe is closed. */
+        dummy = read(safe_to_exit_pipe, pipefds, 1);
+        UNUSED(dummy);
         exitFromChild((retval == C_OK) ? 0 : 1);
     } else {
         /* Parent */
+        close(safe_to_exit_pipe);
         if (childpid == -1) {
             serverLog(LL_WARNING,"Can't save in background: fork: %s",
                 strerror(errno));
@@ -2554,7 +2583,7 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) {
                     slave->replstate = SLAVE_STATE_WAIT_BGSAVE_START;
                 }
             }
-            close(server.rdb_pipe_write);
+            close(rdb_pipe_write);
             close(server.rdb_pipe_read);
             zfree(server.rdb_pipe_conns);
             server.rdb_pipe_conns = NULL;
@@ -2567,7 +2596,7 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) {
             server.rdb_save_time_start = time(NULL);
             server.rdb_child_pid = childpid;
             server.rdb_child_type = RDB_CHILD_TYPE_SOCKET;
-            close(server.rdb_pipe_write); /* close write in parent so that it can detect the close on the child. */
+            close(rdb_pipe_write); /* close write in parent so that it can detect the close on the child. */
             if (aeCreateFileEvent(server.el, server.rdb_pipe_read, AE_READABLE, rdbPipeReadHandler,NULL) == AE_ERR) {
                 serverPanic("Unrecoverable error creating server.rdb_pipe_read file event.");
             }
diff --git a/src/replication.c b/src/replication.c
index d629a197f..32c41361c 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -912,7 +912,12 @@ void replconfCommand(client *c) {
              * the slave online when the first ACK is received (which
              * confirms slave is online and ready to get more data). This
              * allows for simpler and less CPU intensive EOF detection
-             * when streaming RDB files. */
+             * when streaming RDB files.
+             * There's a chance the ACK got to us before we detected that the
+             * bgsave is done (since that depends on cron ticks), so run a
+             * quick check first (instead of waiting for the next ACK. */
+            if (server.rdb_child_pid != -1 && c->replstate == SLAVE_STATE_WAIT_BGSAVE_END)
+                checkChildrenDone();
             if (c->repl_put_online_on_ack && c->replstate == SLAVE_STATE_ONLINE)
                 putSlaveOnline(c);
             /* Note: this command does not reply anything! */
@@ -1104,24 +1109,6 @@ void rdbPipeWriteHandler(struct connection *conn) {
     rdbPipeWriteHandlerConnRemoved(conn);
 }
 
-/* When the the pipe serving diskless rdb transfer is drained (write end was
- * closed), we can clean up all the temporary variables, and cleanup after the
- * fork child. */
-void RdbPipeCleanup() {
-    close(server.rdb_pipe_read);
-    zfree(server.rdb_pipe_conns);
-    server.rdb_pipe_conns = NULL;
-    server.rdb_pipe_numconns = 0;
-    server.rdb_pipe_numconns_writing = 0;
-    zfree(server.rdb_pipe_buff);
-    server.rdb_pipe_buff = NULL;
-    server.rdb_pipe_bufflen = 0;
-
-    /* Since we're avoiding to detect the child exited as long as the pipe is
-     * not drained, so now is the time to check. */
-    checkChildrenDone();
-}
-
 /* Called in diskless master, when there's data to read from the child's rdb pipe */
 void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, int mask) {
     UNUSED(mask);
@@ -1162,7 +1149,11 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData,
                 stillUp++;
             }
             serverLog(LL_WARNING,"Diskless rdb transfer, done reading from pipe, %d replicas still up.", stillUp);
-            RdbPipeCleanup();
+            /* Now that the replicas have finished reading, notify the child that it's safe to exit. 
+             * When the server detectes the child has exited, it can mark the replica as online, and
+             * start streaming the replication buffers. */
+            close(server.rdb_child_exit_pipe);
+            server.rdb_child_exit_pipe = -1;
             return;
         }
 
@@ -1203,7 +1194,6 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData,
         if (stillAlive == 0) {
             serverLog(LL_WARNING,"Diskless rdb transfer, last replica dropped, killing fork child.");
             killRDBChild();
-            RdbPipeCleanup();
         }
         /*  Remove the pipe read handler if at least one write handler was set. */
         if (server.rdb_pipe_numconns_writing || stillAlive == 0) {
diff --git a/src/server.c b/src/server.c
index 5e9c85585..aa3e6b21b 100644
--- a/src/server.c
+++ b/src/server.c
@@ -1785,14 +1785,6 @@ void checkChildrenDone(void) {
     int statloc;
     pid_t pid;
 
-    /* If we have a diskless rdb child (note that we support only one concurrent
-     * child), we want to avoid collecting it's exit status and acting on it
-     * as long as we didn't finish to drain the pipe, since then we're at risk
-     * of starting a new fork and a new pipe before we're done with the previous
-     * one. */
-    if (server.rdb_child_pid != -1 && server.rdb_pipe_conns)
-        return;
-
     if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
         int exitcode = WEXITSTATUS(statloc);
         int bysignal = 0;
diff --git a/src/server.h b/src/server.h
index 6dac914c6..05b7d1dd7 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1257,8 +1257,9 @@ struct redisServer {
     int rdb_child_type;             /* Type of save by active child. */
     int lastbgsave_status;          /* C_OK or C_ERR */
     int stop_writes_on_bgsave_err;  /* Don't allow writes if can't BGSAVE */
-    int rdb_pipe_write;             /* RDB pipes used to transfer the rdb */
-    int rdb_pipe_read;              /* data to the parent process in diskless repl. */
+    int rdb_pipe_read;              /* RDB pipe used to transfer the rdb data */
+                                    /* to the parent process in diskless repl. */
+    int rdb_child_exit_pipe;        /* Used by the diskless parent allow child exit. */
     connection **rdb_pipe_conns;    /* Connections which are currently the */
     int rdb_pipe_numconns;          /* target of diskless rdb fork child. */
     int rdb_pipe_numconns_writing;  /* Number of rdb conns with pending writes. */
diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl
index a2f9e75bd..5438b0eba 100644
--- a/tests/integration/replication.tcl
+++ b/tests/integration/replication.tcl
@@ -659,6 +659,48 @@ start_server {tags {"repl"}} {
     }
 }
 
+test "diskless replication child being killed is collected" {
+    # when diskless master is waiting for the replica to become writable
+    # it removes the read event from the rdb pipe so if the child gets killed
+    # the replica will hung. and the master may not collect the pid with wait3
+    start_server {tags {"repl"}} {
+        set master [srv 0 client]
+        set master_host [srv 0 host]
+        set master_port [srv 0 port]
+        set master_pid [srv 0 pid]
+        $master config set repl-diskless-sync yes
+        $master config set repl-diskless-sync-delay 0
+        # put enough data in the db that the rdb file will be bigger than the socket buffers
+        $master debug populate 20000 test 10000
+        $master config set rdbcompression no
+        start_server {} {
+            set replica [srv 0 client]
+            set loglines [count_log_lines 0]
+            $replica config set repl-diskless-load swapdb
+            $replica config set key-load-delay 1000000
+            $replica replicaof $master_host $master_port
+
+            # wait for the replicas to start reading the rdb
+            wait_for_log_messages 0 {"*Loading DB in memory*"} $loglines 800 10
+
+            # wait to be sure the eplica is hung and the master is blocked on write
+            after 500
+
+            # simulate the OOM killer or anyone else kills the child
+            set fork_child_pid [get_child_pid -1]
+            puts "fork child is $fork_child_pid"
+            exec kill -9 $fork_child_pid
+
+            # wait for the parent to notice the child have exited
+            wait_for_condition 50 100 {
+                [s -1 rdb_bgsave_in_progress] == 0
+            } else {
+                fail "rdb child didn't terminate"
+            }
+        }
+    }
+}
+
 test {replicaof right after disconnection} {
     # this is a rare race condition that was reproduced sporadically by the psync2 unit.
     # see details in #7205
diff --git a/tests/support/util.tcl b/tests/support/util.tcl
index 6d11e5520..ed4ff713a 100644
--- a/tests/support/util.tcl
+++ b/tests/support/util.tcl
@@ -505,3 +505,12 @@ proc populate {num prefix size} {
     }
     $rd close
 }
+
+proc get_child_pid {idx} {
+    set pid [srv $idx pid]
+    set fd [open "|ps --ppid $pid -o pid" "r"]
+    set child_pid [string trim [lindex [split [read $fd] \n] 1]]
+    close $fd
+
+    return $child_pid
+}
diff --git a/tests/unit/oom-score-adj.tcl b/tests/unit/oom-score-adj.tcl
index 993004602..8eb09a993 100644
--- a/tests/unit/oom-score-adj.tcl
+++ b/tests/unit/oom-score-adj.tcl
@@ -14,15 +14,6 @@ if {$system_name eq {linux}} {
             return $val
         }
 
-        proc get_child_pid {} {
-            set pid [srv 0 pid]
-            set fd [open "|ps --ppid $pid -o pid" "r"]
-            set child_pid [string trim [lindex [split [read $fd] \n] 1]]
-            close $fd
-
-            return $child_pid
-        }
-
         test {CONFIG SET oom-score-adj works as expected} {
             set base [get_oom_score_adj]
 
@@ -47,7 +38,7 @@ if {$system_name eq {linux}} {
             r config set rdb-key-save-delay 100000
             r bgsave
 
-            set child_pid [get_child_pid]
+            set child_pid [get_child_pid 0]
             assert {[get_oom_score_adj $child_pid] == [expr $base + 30]}
         }