From cbaf3c5bbafd43e009a2d6b38dd0e9fc450a3e12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=A8=E5=8D=9A=E4=B8=9C?= Date: Thu, 20 Aug 2020 13:59:02 +0800 Subject: [PATCH] Fix flock cluster config may cause failure to restart after kill -9 (#7674) After fork, the child process (redis-aof-rewrite) will get the fd opened by the parent process (redis); when redis is killed by kill -9 it will not exit gracefully (prepareForShutdown() is never called), so the redis-aof-rewrite child process may still be alive, the fd (lock) will still be held by it, and on restart redis will fail to acquire the lock, meaning it fails to start. This issue was causing failures in the cluster tests in GitHub Actions. Co-authored-by: Oran Agra --- src/cluster.c | 11 ++++++++++- src/server.c | 12 +++++++++++- src/server.h | 1 + tests/instances.tcl | 14 +++++++++----- 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 463503700..ba0ada04a 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -418,7 +418,15 @@ int clusterLockConfig(char *filename) { return C_ERR; } /* Lock acquired: leak the 'fd' by not closing it, so that we'll retain the - * lock to the file as long as the process exists. */ + * lock to the file as long as the process exists. + * + * After fork, the child process will get the fd opened by the parent process, + * so we need to save `fd` to `cluster_config_file_lock_fd`, so that in redisFork() + * it can be closed in the child process. + * If it is not closed, when the main process is killed by kill -9 but the child process + * (redis-aof-rewrite) is still alive, the fd (lock) will still be held by the + * child process, and the main process will fail to get the lock, meaning it fails to start. */ + server.cluster_config_file_lock_fd = fd; #endif /* __sun */ return C_OK; @@ -468,6 +476,7 @@ void clusterInit(void) { /* Lock the cluster config file to make sure every node uses * its own nodes.conf. 
*/ + server.cluster_config_file_lock_fd = -1; if (clusterLockConfig(server.cluster_configfile) == C_ERR) exit(1); diff --git a/src/server.c b/src/server.c index 7c381ce24..454c57862 100644 --- a/src/server.c +++ b/src/server.c @@ -4917,14 +4917,24 @@ void setupChildSignalHandlers(void) { return; } +/* After fork, the child process will inherit the resources + * of the parent process, e.g. fd(socket or flock) etc. + * should close the resources not used by the child process, so that if the + * parent restarts it can bind/lock despite the child possibly still running. */ +void closeClildUnusedResourceAfterFork() { + closeListeningSockets(0); + if (server.cluster_enabled && server.cluster_config_file_lock_fd != -1) + close(server.cluster_config_file_lock_fd); /* don't care if this fails */ +} + int redisFork() { int childpid; long long start = ustime(); if ((childpid = fork()) == 0) { /* Child */ setOOMScoreAdj(CONFIG_OOM_BGCHILD); - closeListeningSockets(0); setupChildSignalHandlers(); + closeClildUnusedResourceAfterFork(); } else { /* Parent */ server.stat_fork_time = ustime()-start; diff --git a/src/server.h b/src/server.h index 4940b0c5b..9badea9e8 100644 --- a/src/server.h +++ b/src/server.h @@ -1419,6 +1419,7 @@ struct redisServer { REDISMODULE_CLUSTER_FLAG_*. */ int cluster_allow_reads_when_down; /* Are reads allowed when the cluster is down? */ + int cluster_config_file_lock_fd; /* cluster config fd, will be flock */ /* Scripting */ lua_State *lua; /* The Lua interpreter. 
We use just one for all clients */ client *lua_client; /* The "fake client" to query Redis from Lua */ diff --git a/tests/instances.tcl b/tests/instances.tcl index e2aa4ab13..a43a4cc87 100644 --- a/tests/instances.tcl +++ b/tests/instances.tcl @@ -116,7 +116,9 @@ proc spawn_instance {type base_port count {conf {}}} { # Check availability finally if {[server_is_up 127.0.0.1 $port 100] == 0} { - abort_sentinel_test "Problems starting $type #$j: ping timeout" + set logfile [file join $dirname log.txt] + puts [exec tail $logfile] + abort_sentinel_test "Problems starting $type #$j: ping timeout, maybe server start failed, check $logfile" } # Push the instance into the right list @@ -493,12 +495,12 @@ proc kill_instance {type id} { # Wait for the port it was using to be available again, so that's not # an issue to start a new server ASAP with the same port. - set retry 10 + set retry 100 while {[incr retry -1]} { - set port_is_free [catch {set s [socket 127.0.01 $port]}] + set port_is_free [catch {set s [socket 127.0.0.1 $port]}] if {$port_is_free} break catch {close $s} - after 1000 + after 100 } if {$retry == 0} { error "Port $port does not return available after killing instance." @@ -525,7 +527,9 @@ proc restart_instance {type id} { # Check that the instance is running if {[server_is_up 127.0.0.1 $port 100] == 0} { - abort_sentinel_test "Problems starting $type #$id: ping timeout" + set logfile [file join $dirname log.txt] + puts [exec tail $logfile] + abort_sentinel_test "Problems starting $type #$id: ping timeout, maybe server start failed, check $logfile" } # Connect with it with a fresh link