Fix flock cluster config may cause failure to restart after kill -9 (#7674)
After fork, the child process(redis-aof-rewrite) will get the fd opened by the parent process(redis), when redis killed by kill -9, it will not graceful exit(call prepareForShutdown()), so redis-aof-rewrite thread may still alive, the fd(lock) will still be held by redis-aof-rewrite thread, and redis restart will fail to get lock, means fail to start. This issue was causing failures in the cluster tests in github actions. Co-authored-by: Oran Agra <oran@redislabs.com>
This commit is contained in:
parent
34c3be365a
commit
cbaf3c5bba
@ -418,7 +418,15 @@ int clusterLockConfig(char *filename) {
|
|||||||
return C_ERR;
|
return C_ERR;
|
||||||
}
|
}
|
||||||
/* Lock acquired: leak the 'fd' by not closing it, so that we'll retain the
|
/* Lock acquired: leak the 'fd' by not closing it, so that we'll retain the
|
||||||
* lock to the file as long as the process exists. */
|
* lock to the file as long as the process exists.
|
||||||
|
*
|
||||||
|
* After fork, the child process will get the fd opened by the parent process,
|
||||||
|
* we need save `fd` to `cluster_config_file_lock_fd`, so that in redisFork(),
|
||||||
|
* it will be closed in the child process.
|
||||||
|
* If it is not closed, when the main process is killed -9, but the child process
|
||||||
|
* (redis-aof-rewrite) is still alive, the fd(lock) will still be held by the
|
||||||
|
* child process, and the main process will fail to get lock, means fail to start. */
|
||||||
|
server.cluster_config_file_lock_fd = fd;
|
||||||
#endif /* __sun */
|
#endif /* __sun */
|
||||||
|
|
||||||
return C_OK;
|
return C_OK;
|
||||||
@ -468,6 +476,7 @@ void clusterInit(void) {
|
|||||||
|
|
||||||
/* Lock the cluster config file to make sure every node uses
|
/* Lock the cluster config file to make sure every node uses
|
||||||
* its own nodes.conf. */
|
* its own nodes.conf. */
|
||||||
|
server.cluster_config_file_lock_fd = -1;
|
||||||
if (clusterLockConfig(server.cluster_configfile) == C_ERR)
|
if (clusterLockConfig(server.cluster_configfile) == C_ERR)
|
||||||
exit(1);
|
exit(1);
|
||||||
|
|
||||||
|
12
src/server.c
12
src/server.c
@ -4917,14 +4917,24 @@ void setupChildSignalHandlers(void) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* After fork, the child process will inherit the resources
|
||||||
|
* of the parent process, e.g. fd(socket or flock) etc.
|
||||||
|
* should close the resources not used by the child process, so that if the
|
||||||
|
* parent restarts it can bind/lock despite the child possibly still running. */
|
||||||
|
void closeClildUnusedResourceAfterFork() {
|
||||||
|
closeListeningSockets(0);
|
||||||
|
if (server.cluster_enabled && server.cluster_config_file_lock_fd != -1)
|
||||||
|
close(server.cluster_config_file_lock_fd); /* don't care if this fails */
|
||||||
|
}
|
||||||
|
|
||||||
int redisFork() {
|
int redisFork() {
|
||||||
int childpid;
|
int childpid;
|
||||||
long long start = ustime();
|
long long start = ustime();
|
||||||
if ((childpid = fork()) == 0) {
|
if ((childpid = fork()) == 0) {
|
||||||
/* Child */
|
/* Child */
|
||||||
setOOMScoreAdj(CONFIG_OOM_BGCHILD);
|
setOOMScoreAdj(CONFIG_OOM_BGCHILD);
|
||||||
closeListeningSockets(0);
|
|
||||||
setupChildSignalHandlers();
|
setupChildSignalHandlers();
|
||||||
|
closeClildUnusedResourceAfterFork();
|
||||||
} else {
|
} else {
|
||||||
/* Parent */
|
/* Parent */
|
||||||
server.stat_fork_time = ustime()-start;
|
server.stat_fork_time = ustime()-start;
|
||||||
|
@ -1419,6 +1419,7 @@ struct redisServer {
|
|||||||
REDISMODULE_CLUSTER_FLAG_*. */
|
REDISMODULE_CLUSTER_FLAG_*. */
|
||||||
int cluster_allow_reads_when_down; /* Are reads allowed when the cluster
|
int cluster_allow_reads_when_down; /* Are reads allowed when the cluster
|
||||||
is down? */
|
is down? */
|
||||||
|
int cluster_config_file_lock_fd; /* cluster config fd, will be flock */
|
||||||
/* Scripting */
|
/* Scripting */
|
||||||
lua_State *lua; /* The Lua interpreter. We use just one for all clients */
|
lua_State *lua; /* The Lua interpreter. We use just one for all clients */
|
||||||
client *lua_client; /* The "fake client" to query Redis from Lua */
|
client *lua_client; /* The "fake client" to query Redis from Lua */
|
||||||
|
@ -116,7 +116,9 @@ proc spawn_instance {type base_port count {conf {}}} {
|
|||||||
|
|
||||||
# Check availability finally
|
# Check availability finally
|
||||||
if {[server_is_up 127.0.0.1 $port 100] == 0} {
|
if {[server_is_up 127.0.0.1 $port 100] == 0} {
|
||||||
abort_sentinel_test "Problems starting $type #$j: ping timeout"
|
set logfile [file join $dirname log.txt]
|
||||||
|
puts [exec tail $logfile]
|
||||||
|
abort_sentinel_test "Problems starting $type #$j: ping timeout, maybe server start failed, check $logfile"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Push the instance into the right list
|
# Push the instance into the right list
|
||||||
@ -493,12 +495,12 @@ proc kill_instance {type id} {
|
|||||||
|
|
||||||
# Wait for the port it was using to be available again, so that's not
|
# Wait for the port it was using to be available again, so that's not
|
||||||
# an issue to start a new server ASAP with the same port.
|
# an issue to start a new server ASAP with the same port.
|
||||||
set retry 10
|
set retry 100
|
||||||
while {[incr retry -1]} {
|
while {[incr retry -1]} {
|
||||||
set port_is_free [catch {set s [socket 127.0.01 $port]}]
|
set port_is_free [catch {set s [socket 127.0.0.1 $port]}]
|
||||||
if {$port_is_free} break
|
if {$port_is_free} break
|
||||||
catch {close $s}
|
catch {close $s}
|
||||||
after 1000
|
after 100
|
||||||
}
|
}
|
||||||
if {$retry == 0} {
|
if {$retry == 0} {
|
||||||
error "Port $port does not return available after killing instance."
|
error "Port $port does not return available after killing instance."
|
||||||
@ -525,7 +527,9 @@ proc restart_instance {type id} {
|
|||||||
|
|
||||||
# Check that the instance is running
|
# Check that the instance is running
|
||||||
if {[server_is_up 127.0.0.1 $port 100] == 0} {
|
if {[server_is_up 127.0.0.1 $port 100] == 0} {
|
||||||
abort_sentinel_test "Problems starting $type #$id: ping timeout"
|
set logfile [file join $dirname log.txt]
|
||||||
|
puts [exec tail $logfile]
|
||||||
|
abort_sentinel_test "Problems starting $type #$id: ping timeout, maybe server start failed, check $logfile"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Connect with it with a fresh link
|
# Connect with it with a fresh link
|
||||||
|
Loading…
x
Reference in New Issue
Block a user