From 624742d9b40d20afe8b694c2dc0b0349f8b7aaad Mon Sep 17 00:00:00 2001 From: antirez Date: Sat, 16 May 2020 17:15:35 +0200 Subject: [PATCH] Remove the client from CLOSE_ASAP list before caching the master. This was broken in 1a7cd2c: we identified a crash in the CI, what was happening before the fix should be like that: 1. The client gets in the async free list. 2. However freeClient() gets called again against the same client which is a master. 3. The client arrived in freeClient() with the CLOSE_ASAP flag set. 4. The master gets cached, but NOT removed from the CLOSE_ASAP linked list. 5. The master client that was cached was immediately removed since it was still in the list. 6. Redis accessed a freed cached master. This is how the crash looked like: === REDIS BUG REPORT START: Cut & paste starting from here === 1092:S 16 May 2020 11:44:09.731 # Redis 999.999.999 crashed by signal: 11 1092:S 16 May 2020 11:44:09.731 # Crashed running the instruction at: 0x447e18 1092:S 16 May 2020 11:44:09.731 # Accessing address: 0xffffffffffffffff 1092:S 16 May 2020 11:44:09.731 # Failed assertion: (:0) ------ STACK TRACE ------ EIP: src/redis-server 127.0.0.1:21300(readQueryFromClient+0x48)[0x447e18] And the 0xffff address access likely comes from accessing an SDS that is set to NULL (we go -1 offset to read the header). --- src/networking.c | 21 ++++++++++++--------- src/replication.c | 3 --- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/networking.c b/src/networking.c index 6de00be09..0a04c0489 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1134,6 +1134,16 @@ void freeClient(client *c) { /* Notify module system that this client auth status changed. */ moduleNotifyUserChanged(c); + /* If this client was scheduled for async freeing we need to remove it + * from the queue. Note that we need to do this here, because later + * we may call replicationCacheMaster() and the client should already + * be removed from the list of clients to free. */ + if (c->flags & CLIENT_CLOSE_ASAP) { + ln = listSearchKey(server.clients_to_close,c); + serverAssert(ln != NULL); + listDelNode(server.clients_to_close,ln); + } + /* If it is our master that's beging disconnected we should make sure * to cache the state to try a partial resynchronization later. * @@ -1142,6 +1152,7 @@ void freeClient(client *c) { if (server.master && c->flags & CLIENT_MASTER) { serverLog(LL_WARNING,"Connection with master lost."); if (!(c->flags & (CLIENT_PROTOCOL_ERROR|CLIENT_BLOCKED))) { + c->flags &= ~(CLIENT_CLOSE_ASAP|CLIENT_CLOSE_AFTER_REPLY); replicationCacheMaster(c); return; } @@ -1209,15 +1220,7 @@ void freeClient(client *c) { * we lost the connection with the master. */ if (c->flags & CLIENT_MASTER) replicationHandleMasterDisconnection(); - /* If this client was scheduled for async freeing we need to remove it - * from the queue. */ - if (c->flags & CLIENT_CLOSE_ASAP) { - ln = listSearchKey(server.clients_to_close,c); - serverAssert(ln != NULL); - listDelNode(server.clients_to_close,ln); - } - - /* Remove the contribution that this client gave to our + /* Remove the contribution that this client gave to our * incrementally computed memory usage. */ server.stat_clients_type_memory[c->client_cron_last_memory_type] -= c->client_cron_last_memory_usage; diff --git a/src/replication.c b/src/replication.c index f433d6413..b10635f25 100644 --- a/src/replication.c +++ b/src/replication.c @@ -2688,9 +2688,6 @@ void replicationCacheMaster(client *c) { /* Unlink the client from the server structures. */ unlinkClient(c); - /* Clear flags that can create issues once we reconnect the client. */ - c->flags &= ~(CLIENT_CLOSE_ASAP|CLIENT_CLOSE_AFTER_REPLY); - /* Reset the master client so that's ready to accept new commands: * we want to discard te non processed query buffers and non processed * offsets, including pending transactions, already populated arguments,