From a93234acccecbbbd2e19f188e091072dd068ca46 Mon Sep 17 00:00:00 2001 From: christianEQ Date: Tue, 16 Mar 2021 23:19:29 +0000 Subject: [PATCH 01/75] parameterized monkey options Former-commit-id: eda1055689587c25dca3e889dd3a74d3f9c9378f --- monkey/monkey.py | 68 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/monkey/monkey.py b/monkey/monkey.py index 653970728..67e426486 100644 --- a/monkey/monkey.py +++ b/monkey/monkey.py @@ -4,18 +4,17 @@ import sched, time import socket import asyncore import threading +import argparse import sys from pprint import pprint -# Parameters -numclients = 50 -#numkeys = 1000000 -numkeys = 100000 - # Globals ops=0 s = sched.scheduler(time.time, time.sleep) g_exit = False +numclients = 0 +numkeys = 0 +runtime = 0 def _buildResp(*args): result = "*" + str(len(args)) + "\r\n" @@ -127,6 +126,10 @@ class Client(asyncore.dispatcher): self.buf += _buildResp("lpush", key, val) self.callbacks.append(callback) + def blpop(self, *keys, timeout=0, callback=default_result_handler): + self.buf += _buildResp("blpop", *keys, str(timeout)) + self.callbacks.append(callback) + def delete(self, key, callback = default_result_handler): self.buf += _buildResp("del", key) self.callbacks.append(callback) @@ -149,13 +152,19 @@ class Client(asyncore.dispatcher): def getrandomkey(): return str(random.randrange(0, numkeys)) -def handle_lpush_response(c, resp): +def handle_lpush_response(c, resp, delay=0): global ops if resp != None: ops = ops + 1 assert(resp[0] == ord(':')) c.lpush("list_" + getrandomkey(), 'bardsklfjkldsjfdlsjflksdfjklsdjflksd kldsjflksd jlkdsjf lksdjklds jrfklsdjfklsdjfkl', handle_lpush_response) +def handle_blpop_response(c, resp): + global ops + if resp != None: + ops = ops + 1 + c.blpop("list_" + getrandomkey(), callback=handle_blpop_response) + def handle_set_response(c, resp): global ops if resp != None: @@ -178,19 +187,28 @@ def scan_callback(c, resp): def stats_thread(): global ops global g_exit - while not g_exit: + global runtime + i = 0 + while not g_exit and not (runtime and i > runtime): time.sleep(1) print("Ops per second: " + str(ops)) ops = 0 + i += 1 + g_exit = True -def main(): - global g_exit - clients = [] - +def init_blocking(): for i in range(numclients): - clients.append(Client('127.0.0.1', 6379)) + c = Client('127.0.0.1', 6379) + if i % 2: + handle_lpush_response(c, None, delay=1) + else: + handle_blpop_response(c, None) + +def init_lpush(): + for i in range(numclients): + c = Client('127.0.0.1', 6379) for i in range (10): - handle_lpush_response(clients[-1], None) + handle_lpush_response(c, None) #handle_set_response(clients[-1], None) scan_client = Client('127.0.0.1', 6379) @@ -199,11 +217,33 @@ def main(): del_client = Client('127.0.0.1', 6379) handle_del_response(del_client, None) +def main(test): + global g_exit + + try: + globals()[f"init_{test}"]() + except KeyError: + print(f"Test \"{test}\" not found. 
Exiting...") + exit() + threading.Thread(target=stats_thread).start() asyncore.loop() g_exit = True sys.exit(0) print("DONE") +parser = argparse.ArgumentParser(description="Test use cases for KeyDB.") +parser.add_argument('test', choices=[x[5:] for x in filter(lambda name: name.startswith("init_"), globals().keys())]) +parser.add_argument('-c', '--clients', type=int, default=50) +parser.add_argument('-k', '--keys', type=int, default=100000) +parser.add_argument('-t', '--runtime', type=int, default=0) + if __name__ == "__main__": - main() + try: + args = parser.parse_args() + except: + exit() + numclients = args.clients + numkeys = args.keys + runtime = args.runtime + main(args.test) From 6119273c5e7591742c7caea1c81e30d4d4083d5b Mon Sep 17 00:00:00 2001 From: christianEQ Date: Wed, 17 Mar 2021 01:40:38 +0000 Subject: [PATCH 02/75] unblocking test to monkey, works with 1 client Former-commit-id: 9fbe8cf6a8aeb141d4a502532a456e4256f4daf8 --- monkey/monkey.py | 85 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 19 deletions(-) diff --git a/monkey/monkey.py b/monkey/monkey.py index 67e426486..5308a8276 100644 --- a/monkey/monkey.py +++ b/monkey/monkey.py @@ -15,6 +15,7 @@ g_exit = False numclients = 0 numkeys = 0 runtime = 0 +clients = [] def _buildResp(*args): result = "*" + str(len(args)) + "\r\n" @@ -31,6 +32,8 @@ class Client(asyncore.dispatcher): self.buf = b'' self.inbuf = b'' self.callbacks = list() + self.client_id = 0 + self.get_client_id() def handle_connect(self): pass @@ -54,6 +57,9 @@ class Client(asyncore.dispatcher): endrange = self.inbuf[startpos+1:].find(ord('\r')) + 1 + startpos assert(endrange > 0) numargs = int(self.inbuf[startpos+1:endrange]) + if numargs == -1: # Nil array, used in some returns + startpos = endrange + 2 + return startpos, [] assert(numargs > 0) args = list() startpos = endrange + 2 # plus 1 gets us to the '\n' and the next gets us to the start char @@ -134,6 +140,10 @@ class Client(asyncore.dispatcher): self.buf += _buildResp("del", key) self.callbacks.append(callback) + def unblock(self, client_id, callback=default_result_handler): + self.buf += _buildResp("client", "unblock", str(client_id)) + self.callbacks.append(callback) + def scan(self, iter, match=None, count=None, callback = default_result_handler): args = ["scan", str(iter)] if match != None: @@ -145,10 +155,18 @@ class Client(asyncore.dispatcher): self.buf += _buildResp(*args) self.callbacks.append(callback) + def get_client_id(self): + self.buf += _buildResp("client", "id") + self.callbacks.append(self.store_client_id) + + def store_client_id(self, c, resp): + assert(resp[0] == ord(':')) + self.client_id = int(resp[1:]) + assert(self.client_id == c.client_id) + def get(self, key, callback = None): return - def getrandomkey(): return str(random.randrange(0, numkeys)) @@ -162,28 +180,43 @@ def handle_lpush_response(c, resp, delay=0): def handle_blpop_response(c, resp): global ops if resp != None: + print("unblocked thread") ops = ops + 1 - c.blpop("list_" + getrandomkey(), callback=handle_blpop_response) + else: + c.blpop("list_" + getrandomkey(), callback=handle_blpop_response) -def handle_set_response(c, resp): +def handle_set_response(c, resp=None): global ops if resp != None: ops = ops + 1 assert(resp[0] == ord('+')) c.set("str_" + getrandomkey(), 'bardsklfjkldsjfdlsjflksdfjklsdjflksd kldsjflksd jlkdsjf lksdjklds jrfklsdjfklsdjfkl', handle_set_response) -def handle_del_response(c, resp): +def handle_del_response(c, resp=None): global ops if resp != None: ops 
= ops + 1 c.delete("list_" + getrandomkey(), handle_del_response) -def scan_callback(c, resp): +def scan_callback(c, resp=None): global ops nextstart = int(resp[0]) c.scan(nextstart, count=500, callback=scan_callback) ops = ops+1 +def unblock_clients(c, resp=None): + global clients + global ops + if resp != None: + ops = ops + 1 + time.sleep(1) + client_ids = list(map(lambda x: x.client_id, clients)) + for id in client_ids: + if id: + print(f"unblocking client {id}") + time.sleep(1) + c.unblock(id, unblock_clients) + def stats_thread(): global ops global g_exit @@ -192,34 +225,47 @@ def stats_thread(): while not g_exit and not (runtime and i > runtime): time.sleep(1) print("Ops per second: " + str(ops)) + #print(f"Blocked threads: {len(list(filter(lambda x: x.blocked, clients)))}") ops = 0 i += 1 g_exit = True +def flush_db_sync(): + server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server.connect(('127.0.0.1', 6379)) + server.send(_buildResp("flushdb")) + resp = server.recv(8192) + assert(resp[:3] == "+OK".encode('utf-8')) + def init_blocking(): + global clients for i in range(numclients): - c = Client('127.0.0.1', 6379) - if i % 2: - handle_lpush_response(c, None, delay=1) - else: - handle_blpop_response(c, None) + clients.append(Client('127.0.0.1', 6379)) + handle_blpop_response(clients[-1], None) + + unblock_client = Client('127.0.0.1', 6379) + unblock_clients(unblock_client) def init_lpush(): + global clients for i in range(numclients): - c = Client('127.0.0.1', 6379) + clients.append(Client('127.0.0.1', 6379)) for i in range (10): - handle_lpush_response(c, None) + handle_lpush_response(c) #handle_set_response(clients[-1], None) scan_client = Client('127.0.0.1', 6379) scan_client.scan(0, count=500, callback=scan_callback) del_client = Client('127.0.0.1', 6379) - handle_del_response(del_client, None) + handle_del_response(del_client) -def main(test): +def main(test, flush): global g_exit + if flush: + flush_db_sync() + try: globals()[f"init_{test}"]() except KeyError: @@ -233,10 +279,11 @@ def main(test): print("DONE") parser = argparse.ArgumentParser(description="Test use cases for KeyDB.") -parser.add_argument('test', choices=[x[5:] for x in filter(lambda name: name.startswith("init_"), globals().keys())]) -parser.add_argument('-c', '--clients', type=int, default=50) -parser.add_argument('-k', '--keys', type=int, default=100000) -parser.add_argument('-t', '--runtime', type=int, default=0) +parser.add_argument('test', choices=[x[5:] for x in filter(lambda name: name.startswith("init_"), globals().keys())], help="which test to run") +parser.add_argument('-c', '--clients', type=int, default=50, help="number of running clients to use") +parser.add_argument('-k', '--keys', type=int, default=100000, help="number of keys to choose from for random tests") +parser.add_argument('-t', '--runtime', type=int, default=0, help="how long to run the test for (default: 0 for infinite)") +parser.add_argument('-f', '--flush', action="store_true", help="flush the db before running the test") if __name__ == "__main__": try: @@ -246,4 +293,4 @@ if __name__ == "__main__": numclients = args.clients numkeys = args.keys runtime = args.runtime - main(args.test) + main(args.test, args.flush) From a2d8079d49320fc73fccaaff20e6839c641c1d35 Mon Sep 17 00:00:00 2001 From: christianEQ Date: Wed, 17 Mar 2021 16:03:51 +0000 Subject: [PATCH 03/75] track each command separately in monkey Former-commit-id: e3accdaad985fd9f93490a5cc6704d9ecb909604 --- monkey/monkey.py | 44 
+++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/monkey/monkey.py b/monkey/monkey.py index 5308a8276..e1b8fbb0e 100644 --- a/monkey/monkey.py +++ b/monkey/monkey.py @@ -9,7 +9,7 @@ import sys from pprint import pprint # Globals -ops=0 +ops = {} s = sched.scheduler(time.time, time.sleep) g_exit = False numclients = 0 @@ -170,52 +170,42 @@ class Client(asyncore.dispatcher): def getrandomkey(): return str(random.randrange(0, numkeys)) -def handle_lpush_response(c, resp, delay=0): +def handle_lpush_response(c, resp=None): global ops if resp != None: - ops = ops + 1 + ops['lpush'] += 1 assert(resp[0] == ord(':')) c.lpush("list_" + getrandomkey(), 'bardsklfjkldsjfdlsjflksdfjklsdjflksd kldsjflksd jlkdsjf lksdjklds jrfklsdjfklsdjfkl', handle_lpush_response) -def handle_blpop_response(c, resp): +def handle_blpop_response(c, resp=None): global ops if resp != None: - print("unblocked thread") - ops = ops + 1 + ops['blpop'] += 1 else: c.blpop("list_" + getrandomkey(), callback=handle_blpop_response) def handle_set_response(c, resp=None): global ops if resp != None: - ops = ops + 1 + ops['set'] += 1 assert(resp[0] == ord('+')) c.set("str_" + getrandomkey(), 'bardsklfjkldsjfdlsjflksdfjklsdjflksd kldsjflksd jlkdsjf lksdjklds jrfklsdjfklsdjfkl', handle_set_response) def handle_del_response(c, resp=None): global ops if resp != None: - ops = ops + 1 + ops['del'] += 1 c.delete("list_" + getrandomkey(), handle_del_response) def scan_callback(c, resp=None): global ops nextstart = int(resp[0]) c.scan(nextstart, count=500, callback=scan_callback) - ops = ops+1 + ops['scan'] += 1 -def unblock_clients(c, resp=None): - global clients +def clear_ops(): global ops - if resp != None: - ops = ops + 1 - time.sleep(1) - client_ids = list(map(lambda x: x.client_id, clients)) - for id in client_ids: - if id: - print(f"unblocking client {id}") - time.sleep(1) - c.unblock(id, unblock_clients) + ops = {'lpush': 0, 'blpop': 0, 'del': 0, 'scan': 0, 'set': 0, 'get': 0} def stats_thread(): global ops @@ -226,7 +216,7 @@ def stats_thread(): time.sleep(1) print("Ops per second: " + str(ops)) #print(f"Blocked threads: {len(list(filter(lambda x: x.blocked, clients)))}") - ops = 0 + clear_ops() i += 1 g_exit = True @@ -239,12 +229,14 @@ def flush_db_sync(): def init_blocking(): global clients + if numkeys > 5 * numclients: + print("WARNING: High ratio of keys to clients. 
Most lpushes will not be popped and unblocking will take a long time!") for i in range(numclients): clients.append(Client('127.0.0.1', 6379)) - handle_blpop_response(clients[-1], None) - - unblock_client = Client('127.0.0.1', 6379) - unblock_clients(unblock_client) + if i % 2: + handle_blpop_response(clients[-1]) + else: + handle_lpush_response(clients[-1]) def init_lpush(): global clients @@ -262,6 +254,8 @@ def init_lpush(): def main(test, flush): global g_exit + + clear_ops() if flush: flush_db_sync() From ca97f6e4548bffce500d928a2086d183979977ff Mon Sep 17 00:00:00 2001 From: christianEQ Date: Wed, 17 Mar 2021 16:10:06 +0000 Subject: [PATCH 04/75] working blpop test in monkey Former-commit-id: 21af13c2a38741ad9e036fad42b23ea2454ac33c --- monkey/monkey.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/monkey/monkey.py b/monkey/monkey.py index e1b8fbb0e..07f157961 100644 --- a/monkey/monkey.py +++ b/monkey/monkey.py @@ -181,8 +181,7 @@ def handle_blpop_response(c, resp=None): global ops if resp != None: ops['blpop'] += 1 - else: - c.blpop("list_" + getrandomkey(), callback=handle_blpop_response) + c.blpop("list_" + getrandomkey(), callback=handle_blpop_response) def handle_set_response(c, resp=None): global ops @@ -214,7 +213,7 @@ def stats_thread(): i = 0 while not g_exit and not (runtime and i > runtime): time.sleep(1) - print("Ops per second: " + str(ops)) + print("Ops per second: " + str({k:v for (k,v) in ops.items() if v})) #print(f"Blocked threads: {len(list(filter(lambda x: x.blocked, clients)))}") clear_ops() i += 1 @@ -229,7 +228,7 @@ def flush_db_sync(): def init_blocking(): global clients - if numkeys > 5 * numclients: + if numkeys > 100 * numclients: print("WARNING: High ratio of keys to clients. Most lpushes will not be popped and unblocking will take a long time!") for i in range(numclients): clients.append(Client('127.0.0.1', 6379)) @@ -265,6 +264,10 @@ def main(test, flush): except KeyError: print(f"Test \"{test}\" not found. Exiting...") exit() + except ConnectionRefusedError: + print("Could not connect to server. 
Is it running?") + print("Exiting...") + exit() threading.Thread(target=stats_thread).start() asyncore.loop() From 34d019f4836f7aaa3c61c8e12595aa165defd4f6 Mon Sep 17 00:00:00 2001 From: christianEQ Date: Wed, 17 Mar 2021 19:50:28 +0000 Subject: [PATCH 05/75] fixed runtime arg for monkey Former-commit-id: b3e6e1f13b3d24c92d9f7e8441831232f47ff053 --- monkey/monkey.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/monkey/monkey.py b/monkey/monkey.py index 07f157961..3c23874a8 100644 --- a/monkey/monkey.py +++ b/monkey/monkey.py @@ -211,13 +211,14 @@ def stats_thread(): global g_exit global runtime i = 0 - while not g_exit and not (runtime and i > runtime): + while not g_exit and not (runtime and i >= runtime): time.sleep(1) print("Ops per second: " + str({k:v for (k,v) in ops.items() if v})) #print(f"Blocked threads: {len(list(filter(lambda x: x.blocked, clients)))}") clear_ops() i += 1 g_exit = True + asyncore.close_all() def flush_db_sync(): server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) @@ -253,7 +254,7 @@ def init_lpush(): def main(test, flush): global g_exit - + clear_ops() if flush: @@ -272,8 +273,7 @@ def main(test, flush): threading.Thread(target=stats_thread).start() asyncore.loop() g_exit = True - sys.exit(0) - print("DONE") + print("Done.") parser = argparse.ArgumentParser(description="Test use cases for KeyDB.") parser.add_argument('test', choices=[x[5:] for x in filter(lambda name: name.startswith("init_"), globals().keys())], help="which test to run") From 4944836478ecd2a58d526637a3febf2426e19054 Mon Sep 17 00:00:00 2001 From: christianEQ Date: Wed, 17 Mar 2021 20:17:52 +0000 Subject: [PATCH 06/75] removed unused stuff from monkey Former-commit-id: be55daa6f7bcbf137b37269d1ee7a5fcacc14ff0 --- monkey/monkey.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/monkey/monkey.py b/monkey/monkey.py index 3c23874a8..a31b35f9b 100644 --- a/monkey/monkey.py +++ b/monkey/monkey.py @@ -1,17 +1,13 @@ -import keydb import random -import sched, time +import time import socket import asyncore import threading import argparse -import sys from pprint import pprint # Globals ops = {} -s = sched.scheduler(time.time, time.sleep) -g_exit = False numclients = 0 numkeys = 0 runtime = 0 @@ -208,16 +204,13 @@ def clear_ops(): def stats_thread(): global ops - global g_exit global runtime i = 0 - while not g_exit and not (runtime and i >= runtime): + while i < runtime or not runtime: time.sleep(1) print("Ops per second: " + str({k:v for (k,v) in ops.items() if v})) - #print(f"Blocked threads: {len(list(filter(lambda x: x.blocked, clients)))}") clear_ops() i += 1 - g_exit = True asyncore.close_all() def flush_db_sync(): @@ -253,8 +246,6 @@ def init_lpush(): handle_del_response(del_client) def main(test, flush): - global g_exit - clear_ops() if flush: @@ -272,7 +263,6 @@ def main(test, flush): threading.Thread(target=stats_thread).start() asyncore.loop() - g_exit = True print("Done.") parser = argparse.ArgumentParser(description="Test use cases for KeyDB.") From 0436e62b7a586854c57a8c056f46202d401d45eb Mon Sep 17 00:00:00 2001 From: christianEQ Date: Wed, 17 Mar 2021 20:22:01 +0000 Subject: [PATCH 07/75] fix old reference to c variable (monkey) Former-commit-id: 0b9b6413c70d4ba71bbabcd0b22fb004d804958f --- monkey/monkey.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monkey/monkey.py b/monkey/monkey.py index a31b35f9b..37e5e4440 100644 --- a/monkey/monkey.py +++ b/monkey/monkey.py @@ -236,7 +236,7 @@ def 
init_lpush(): for i in range(numclients): clients.append(Client('127.0.0.1', 6379)) for i in range (10): - handle_lpush_response(c) + handle_lpush_response(clients[-1]) #handle_set_response(clients[-1], None) scan_client = Client('127.0.0.1', 6379) From 05fe41b33a3b1ebc25a5c17748a5c00a3e0a2984 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Mon, 26 Apr 2021 22:13:32 +0000 Subject: [PATCH 08/75] Primitive implementation of bypassing client buffer, stats are all messed up and print statements everywhere Former-commit-id: 8ae310fb0f7b53add826f76891da333b63860001 --- src/networking.cpp | 159 +++++++++++++++++++++++++++++++++++++++----- src/replication.cpp | 156 +++++++++++++++++++++++++++++++++++++++++++ src/server.h | 11 +++ 3 files changed, 308 insertions(+), 18 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 574a8bc6c..18ab382bd 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -224,6 +224,7 @@ void clientInstallWriteHandler(client *c) { (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) { + // serverLog(LL_NOTICE, "we installing boyz"); AssertCorrectThread(c); serverAssert(c->lock.fOwnLock()); /* Here instead of installing the write handler, we just flag the @@ -301,7 +302,7 @@ int prepareClientToWrite(client *c) { /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ - if (!fAsync && !clientHasPendingReplies(c)) clientInstallWriteHandler(c); + if (!fAsync && !clientHasPendingReplies(c) && c->repl_curr_idx == -1) clientInstallWriteHandler(c); if (fAsync && !(c->fPendingAsyncWrite)) clientInstallAsyncWriteHandler(c); /* Authorize the caller to queue in the output buffer of this client. */ @@ -1676,15 +1677,33 @@ int writeToClient(client *c, int handler_installed) { std::unique_locklock)> lock(c->lock); + /* if this is a write to a replica, it's coming straight from the replication backlog */ + long long repl_backlog_idx = g_pserver->repl_backlog_idx; + + bool wroteFromClientBuffer = false; /* True if you wrote from the client buffer in this function call */ + while(clientHasPendingReplies(c)) { + wroteFromClientBuffer = true; + if (c->flags & CLIENT_SLAVE && listLength(c->reply) % 10 == 0){ + + serverLog(LL_NOTICE, "-----------------------------------------"); + serverLog(LL_NOTICE, "replica w/ pending replies, with a reply list size of: %lu", listLength(c->reply)); + serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + serverLog(LL_NOTICE, "-----------------------------------------"); + + } if (c->bufpos > 0) { + // serverLog(LL_NOTICE, "Sending reply %d", x); + // serverLog(LL_NOTICE, "SUSSUS AMOGUS, %ld", c->bufpos); nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); if (nwritten <= 0) break; c->sentlen += nwritten; totwritten += nwritten; /* If the buffer was sent, set bufpos to zero to continue with - * the remainder of the reply. */ + * the remainder of the reply. 
*/ + // serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); if ((int)c->sentlen == c->bufpos) { c->bufpos = 0; c->sentlen = 0; @@ -1714,23 +1733,112 @@ int writeToClient(client *c, int handler_installed) { } } /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT - * bytes, in a single threaded server it's a good idea to serve - * other clients as well, even if a very large request comes from - * super fast link that is always able to accept data (in real world - * scenario think about 'KEYS *' against the loopback interface). - * - * However if we are over the maxmemory limit we ignore that and - * just deliver as much data as it is possible to deliver. - * - * Moreover, we also send as much as possible if the client is - * a replica or a monitor (otherwise, on high-speed traffic, the - * replication/output buffer will grow indefinitely) */ + * bytes, in a single threaded server it's a good idea to serve + * other clients as well, even if a very large request comes from + * super fast link that is always able to accept data (in real world + * scenario think about 'KEYS *' against the loopback interface). + * + * However if we are over the maxmemory limit we ignore that and + * just deliver as much data as it is possible to deliver. + * + * Moreover, we also send as much as possible if the client is + * a replica or a monitor (otherwise, on high-speed traffic, the + * replication/output buffer will grow indefinitely) */ if (totwritten > NET_MAX_WRITES_PER_EVENT && (g_pserver->maxmemory == 0 || - zmalloc_used_memory() < g_pserver->maxmemory) && + zmalloc_used_memory() < g_pserver->maxmemory) && !(c->flags & CLIENT_SLAVE)) break; } - + + /* If there are no more pending replies, then we have transmitted the RDB. + * This means further replication commands will be taken straight from the + * replication backlog from now on. */ + if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c)){ + if (!c->transmittedRDB) + serverLog(LL_NOTICE, "---------->>>>>>>> TRANSMISSION OF THE RDB HAS COMPLETED <<<<<<<<----------"); + c->transmittedRDB = true; + } + + /* For replicas, we don't store all the information in the client buffer + * Most of the time (aside from immediately after synchronizing), we read + * from the replication backlog directly */ + if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1 && c->transmittedRDB){ + /* copy global variables into local scope so if they change in between we don't care */ + long long repl_backlog_size = g_pserver->repl_backlog_size; + long long nwrittenPart2 = 0; + + ssize_t nrequested; /* The number of bytes requested to write */ + /* normal case with no wrap around */ + if (repl_backlog_idx >= c->repl_curr_idx){ + nrequested = repl_backlog_idx - c->repl_curr_idx; + nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_idx - c->repl_curr_idx); + /* wrap around case, v. rare */ + /* also v. 
buggy so there's that */ + } else { + serverLog(LL_NOTICE, "WRAP CASE"); + serverLog(LL_NOTICE, "-----------------------------------------"); + serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + serverLog(LL_NOTICE, "actually written: %ld", nwritten); + serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); + serverLog(LL_NOTICE, "nwritten: %ld", nwritten); + serverLog(LL_NOTICE, "-----------------------------------------"); + + nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; + nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); + /* only attempt wrapping if we write the correct number of bytes */ + if (nwritten == repl_backlog_size - c->repl_curr_idx){ + serverLog(LL_NOTICE, "SECOND STAGE"); + serverLog(LL_NOTICE, "-----------------------------------------"); + serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + serverLog(LL_NOTICE, "actually written: %ld", nwritten); + serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); + serverLog(LL_NOTICE, "-----------------------------------------"); + + long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); + if (nwrittenPart2 != -1) + nwritten += nwrittenPart2; + + serverLog(LL_NOTICE, "nwrittenPart2: %lld", nwrittenPart2); + serverLog(LL_NOTICE, "-----------------------------------------"); + } else { + serverLog(LL_NOTICE, "SUPER SHORT"); + } + + } + + /* only update the replica's current index if bytes were sent */ + + // if (nrequested != nwritten){ + serverLog(LL_NOTICE, "-----------------------------------------"); + serverLog(LL_NOTICE, "AFTER THE FACT"); + serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + serverLog(LL_NOTICE, "actually written: %ld", nwritten); + serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + serverLog(LL_NOTICE, "-----------------------------------------"); + // } + + + if (nwritten == nrequested){ + c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ + } + else if (nwritten > 0) + c->repl_curr_idx = (c->repl_curr_idx + nwritten) % repl_backlog_size; + + serverAssert(c->repl_curr_idx < repl_backlog_size); + + /* only increment bytes if an error didn't occur */ + if (nwritten > 0){ + totwritten += nwritten; + c->repl_curr_off += nwritten; + } + + /* If the second part of a write didn't go through, we still need to register that */ + if (nwrittenPart2 == -1) nwritten = -1; + } + g_pserver->stat_net_output_bytes += totwritten; if (nwritten == -1) { if (connGetState(c->conn) == CONN_STATE_CONNECTED) { @@ -1750,7 +1858,7 @@ int writeToClient(client *c, int handler_installed) { * We just rely on data / pings received for timeout detection. 
*/ if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } - if (!clientHasPendingReplies(c)) { + if (!clientHasPendingReplies(c) && c->repl_curr_idx == -1) { c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -1904,6 +2012,12 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* Don't write to clients that are going to be closed anyway. */ if (c->flags & CLIENT_CLOSE_ASAP) continue; + // if (c->flags & CLIENT_SLAVE){ + // if(clientHasPendingReplies(c)) + // serverLog(LL_NOTICE, "somehow the client buffer has these values: %s", c->buf); + // serverLog(LL_NOTICE, "LOL"); + // } + /* Try to write buffers to the client socket. */ if (writeToClient(c,0) == C_ERR) { @@ -1920,7 +2034,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. */ - if (clientHasPendingReplies(c)) { + if (clientHasPendingReplies(c) || c->repl_curr_idx != -1) { if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) freeClientAsync(c); } @@ -3268,6 +3382,13 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { } } +/* In the case of a replica client, it is possible (and very likely) + * that writes to said replica are using data from the replication backlog + * as opposed to it's own internal buffer, this number should keep track of that */ +unsigned long getClientReplicationBacklogSharedUsage(client *c) { + return (c->repl_curr_idx == -1 && c->flags & CLIENT_SLAVE) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; +} + /* This function returns the number of bytes that Redis is * using to store the reply still not read by the client. * @@ -3276,9 +3397,11 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { * enforcing the client output length limits. */ unsigned long getClientOutputBufferMemoryUsage(client *c) { unsigned long list_item_size = sizeof(listNode) + sizeof(clientReplyBlock); - return c->reply_bytes + (list_item_size*listLength(c->reply)) + (c->replyAsync ? c->replyAsync->size : 0); + return c->reply_bytes + (list_item_size*listLength(c->reply)) + (c->replyAsync ? c->replyAsync->size : 0) + getClientReplicationBacklogSharedUsage(c); } + + /* Get the class of a client, used in order to enforce limits to different * classes of clients. * diff --git a/src/replication.cpp b/src/replication.cpp index 2533bae52..ccb538a69 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -47,6 +47,9 @@ #include #include +#define BYPASS_BUFFER +// #define BYPASS_PSYNC + void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, connection *conn); void replicationSendAck(redisMaster *mi); @@ -59,6 +62,18 @@ static void propagateMasterStaleKeys(); * the instance is configured to have no persistence. */ int RDBGeneratedByReplication = 0; +void resizeReplicationBacklogForClients(long long newsize); + +void setReplIdx(client *c, long long idx, long long off){ + if (prepareClientToWrite(c) != C_OK) return; + // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); + // serverLog(LL_NOTICE, "What is this value? 
%lld", c->repl_curr_idx); + if (c->repl_curr_idx == -1){ + c->repl_curr_idx = idx; + c->repl_curr_off = off; + } +} + /* --------------------------- Utility functions ---------------------------- */ /* Return the pointer to a string representing the replica ip:listening_port @@ -213,6 +228,8 @@ void resizeReplicationBacklog(long long newsize) { newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; if (g_pserver->repl_backlog_size == newsize) return; + serverLog(LL_NOTICE, "WE HAD TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); + if (g_pserver->repl_backlog != NULL) { /* What we actually do is to flush the old buffer and realloc a new * empty one. It will refill with new data incrementally. @@ -253,6 +270,80 @@ void resizeReplicationBacklog(long long newsize) { g_pserver->repl_backlog_size = newsize; } + +/* The above but for when clients need extra replication backlog because ??? */ +void resizeReplicationBacklogForClients(long long newsize) { + if (newsize < CONFIG_REPL_BACKLOG_MIN_SIZE) + newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; + if (g_pserver->repl_backlog_size == newsize) return; + + serverLog(LL_NOTICE, "WE HAD TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); + /* get the critical client size, i.e. the size of the data unflushed to clients */ + long long earliest_off = LONG_LONG_MAX; + long long earliest_idx = -1; + listIter li; + listNode *ln; + listRewind(g_pserver->slaves, &li); + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + if (replica->repl_curr_off != -1 && replica->repl_curr_off < earliest_off){ + earliest_off = replica->repl_curr_off; + earliest_idx = replica->repl_curr_idx; + } + } + + if (g_pserver->repl_backlog != NULL) { + /* What we actually do is to flush the old buffer and realloc a new + * empty one. It will refill with new data incrementally. + * The reason is that copying a few gigabytes adds latency and even + * worse often we need to alloc additional space before freeing the + * old buffer. 
*/ + + if (earliest_idx >= 0) { + // We need to keep critical data so we can't shrink less than the hot data in the buffer + newsize = std::max(newsize, g_pserver->master_repl_offset - earliest_off); + char *backlog = (char*)zmalloc(newsize); + g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - earliest_off; + + if (g_pserver->repl_backlog_idx >= earliest_idx) { + auto cbActiveBacklog = g_pserver->repl_backlog_idx - earliest_idx; + memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbActiveBacklog); + serverLog(LL_NOTICE, "g_pserver->master_repl_offset: %lld, earliest_off: %lld, g_pserver->repl_backlog_idx: %lld, earliest_idx: %lld", + g_pserver->master_repl_offset, earliest_off, g_pserver->repl_backlog_idx, earliest_idx); + serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); + } else { + auto cbPhase1 = g_pserver->repl_backlog_size - earliest_idx; + memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbPhase1); + memcpy(backlog + cbPhase1, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); + auto cbActiveBacklog = cbPhase1 + g_pserver->repl_backlog_idx; + serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); + } + zfree(g_pserver->repl_backlog); + g_pserver->repl_backlog = backlog; + g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; + listRewind(g_pserver->slaves, &li); + /* Go through the clients and update their replication indicies */ + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + if (replica->repl_curr_idx != -1){ + replica->repl_curr_idx -= earliest_idx; + if (replica->repl_curr_idx < 0) + replica->repl_curr_idx += g_pserver->repl_backlog_size; + } + } + g_pserver->repl_batch_idxStart = 0; + } else { + zfree(g_pserver->repl_backlog); + g_pserver->repl_backlog = (char*)zmalloc(newsize); + g_pserver->repl_backlog_histlen = 0; + g_pserver->repl_backlog_idx = 0; + /* Next byte we have is... the next since the buffer is empty. */ + g_pserver->repl_backlog_off = g_pserver->master_repl_offset+1; + } + } + g_pserver->repl_backlog_size = newsize; +} + void freeReplicationBacklog(void) { serverAssert(GlobalLocksAcquired()); listIter li; @@ -683,6 +774,10 @@ long long addReplyReplicationBacklog(client *c, long long offset) { * split the reply in two parts if we are cross-boundary. */ len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); + serverLog(LL_NOTICE, "Coming through from the addReplicationBacklog"); +#ifdef BYPASS_PSYNC + setReplIdx(c, j, offset); +#else while(len) { long long thislen = ((g_pserver->repl_backlog_size - j) < len) ? @@ -693,6 +788,8 @@ long long addReplyReplicationBacklog(client *c, long long offset) { len -= thislen; j = 0; } +#endif + serverLog(LL_NOTICE, "rdb transmitted? %d, pending replies? %d", c->transmittedRDB, clientHasPendingReplies(c)); return g_pserver->repl_backlog_histlen - skip; } @@ -731,6 +828,8 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) { * a SELECT statement in the replication stream. */ g_pserver->replicaseldb = -1; + serverLog(LL_NOTICE, "We are setting up here lad"); + /* Don't send this reply to slaves that approached us with * the old SYNC command. 
*/ if (!(replica->flags & CLIENT_PRE_PSYNC)) { @@ -989,6 +1088,7 @@ void syncCommand(client *c) { if (!strcasecmp((const char*)ptrFromObj(c->argv[0]),"psync")) { if (masterTryPartialResynchronization(c) == C_OK) { g_pserver->stat_sync_partial_ok++; + // c->repl_curr_idx = g_pserver->repl_backlog_idx; return; /* No full resync needed, return. */ } else { char *master_replid = (char*)ptrFromObj(c->argv[1]); @@ -1016,6 +1116,7 @@ void syncCommand(client *c) { connDisableTcpNoDelay(c->conn); /* Non critical if it fails. */ c->repldbfd = -1; c->flags |= CLIENT_SLAVE; + // c->repl_curr_idx = g_pserver->repl_backlog_idx; listAddNodeTail(g_pserver->slaves,c); /* Create the replication backlog if needed. */ @@ -1035,6 +1136,7 @@ void syncCommand(client *c) { if (g_pserver->FRdbSaveInProgress() && g_pserver->rdb_child_type == RDB_CHILD_TYPE_DISK) { + serverLog(LL_NOTICE, "case 1"); /* Ok a background save is in progress. Let's check if it is a good * one for replication, i.e. if there is another replica that is * registering differences since the server forked to save. */ @@ -1066,6 +1168,7 @@ void syncCommand(client *c) { } else if (g_pserver->FRdbSaveInProgress() && g_pserver->rdb_child_type == RDB_CHILD_TYPE_SOCKET) { + serverLog(LL_NOTICE, "case 2"); /* There is an RDB child process but it is writing directly to * children sockets. We need to wait for the next BGSAVE * in order to synchronize. */ @@ -1073,6 +1176,7 @@ void syncCommand(client *c) { /* CASE 3: There is no BGSAVE is progress. */ } else { + serverLog(LL_NOTICE, "case 3"); if (g_pserver->repl_diskless_sync && (c->slave_capa & SLAVE_CAPA_EOF)) { /* Diskless replication RDB child is created inside * replicationCron() since we want to delay its start a @@ -1278,6 +1382,7 @@ void replconfCommand(client *c) { * 3) Update the count of "good replicas". */ void putSlaveOnline(client *replica) { replica->replstate = SLAVE_STATE_ONLINE; + replica->repl_put_online_on_ack = 0; replica->repl_ack_time = g_pserver->unixtime; /* Prevent false timeout. */ if (connSetWriteHandler(replica->conn, sendReplyToClient, true) == C_ERR) { @@ -1415,11 +1520,13 @@ void sendBulkToSlave(connection *conn) { replica->repldboff += nwritten; g_pserver->stat_net_output_bytes += nwritten; + // replica->repl_curr_idx = g_pserver->repl_backlog_idx; if (replica->repldboff == replica->repldbsize) { close(replica->repldbfd); replica->repldbfd = -1; connSetWriteHandler(replica->conn,NULL); putSlaveOnline(replica); + serverLog(LL_NOTICE, "ABOUT TO DIE HERE"); } } @@ -4450,6 +4557,21 @@ void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long } void _clientAsyncReplyBufferReserve(client *c, size_t len); + +/* Has the end of the replication backlog overflowed past the beginning? */ +bool replOverflowHasOccured(){ + if (g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart){ + long long repl_idx_difference = g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart ? 
+ g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart : + (g_pserver->repl_backlog_size + g_pserver->repl_backlog_idx) - g_pserver->repl_batch_idxStart; + + return g_pserver->master_repl_offset - g_pserver->repl_batch_offStart > repl_idx_difference; + } + return false; +} + +thread_local int transmittedCount = 0; + void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); @@ -4463,11 +4585,31 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); + serverAssert(!replOverflowHasOccured()); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); + +#if 0 + // check for potential overflow first while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); + // serverLog(LL_NOTICE, "replica state: %d", replica->replstate); + + if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; + if (replica->flags & CLIENT_CLOSE_ASAP) continue; + if (replica->repl_curr_idx == -1) continue; + + std::unique_lock ul(replica->lock, std::defer_lock); + if (FCorrectThread(replica)) + ul.lock(); + else + fAsyncWrite = true; +#endif + + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + // serverLog(LL_NOTICE, "replica state: %d", replica->replstate); if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; @@ -4478,6 +4620,19 @@ void flushReplBacklogToClients() else fAsyncWrite = true; + +#ifdef BYPASS_BUFFER + /* If we are online and the RDB has been sent, there is no need to feed the client buffer + * We will send our replies directly from the replication backlog instead */ + if (replica->replstate == SLAVE_STATE_ONLINE && replica->transmittedRDB){ + setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); + continue; + } +#else + if (replica->replstate == SLAVE_STATE_ONLINE){ + // serverLog(LL_NOTICE, "would be calling this garbage function w/ offset: %lld", g_pserver->repl_batch_idxStart); + } +#endif if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { long long cbCopy = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; serverAssert((g_pserver->master_repl_offset - g_pserver->repl_batch_offStart) == cbCopy); @@ -4491,6 +4646,7 @@ void flushReplBacklogToClients() _clientAsyncReplyBufferReserve(replica, cbPhase1 + g_pserver->repl_backlog_idx); addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1); addReplyProto(replica, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); + serverAssert((cbPhase1 + g_pserver->repl_backlog_idx) == (g_pserver->master_repl_offset - g_pserver->repl_batch_offStart)); } } diff --git a/src/server.h b/src/server.h index 878154bc5..cfd6c34a0 100644 --- a/src/server.h +++ b/src/server.h @@ -1516,6 +1516,8 @@ struct client { long long psync_initial_offset; /* FULLRESYNC reply offset other slaves copying this replica output buffer should use. */ + long long repl_curr_idx = -1; /* Replication index sent, if this is a replica */ + long long repl_curr_off = -1; char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). 
*/ int slave_listening_port; /* As configured with: REPLCONF listening-port */ char slave_ip[NET_IP_STR_LEN]; /* Optionally given by REPLCONF ip-address */ @@ -1575,6 +1577,9 @@ struct client { robj **argv; size_t argv_len_sumActive = 0; + bool transmittedRDB = false; /* Have we finished transmitting the RDB to this replica? */ + /* If so, we can read from the replication backlog instead of the client buffer */ + // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); size_t argv_len_sum() const; @@ -3470,6 +3475,8 @@ void mixDigest(unsigned char *digest, const void *ptr, size_t len); void xorDigest(unsigned char *digest, const void *ptr, size_t len); int populateCommandTableParseFlags(struct redisCommand *c, const char *strflags); + + int moduleGILAcquiredByModule(void); extern int g_fInCrash; static inline int GlobalLocksAcquired(void) // Used in asserts to verify all global locks are correctly acquired for a server-thread to operate @@ -3526,6 +3533,8 @@ void tlsInit(void); void tlsInitThread(); int tlsConfigure(redisTLSContextConfig *ctx_config); +int prepareClientToWrite(client *c); + class ShutdownException {}; @@ -3538,3 +3547,5 @@ class ShutdownException int iAmMaster(void); #endif + + From d8367a92b2bea452d39561c201085a377c2021a6 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 29 Apr 2021 17:01:06 +0000 Subject: [PATCH 09/75] Updated resize logic Former-commit-id: e6d892ef21b7fc6f51433f32b01198038e555419 --- src/networking.cpp | 104 +++++++++++++++---------------------- src/replication.cpp | 123 ++++++++++++++++++++++++++++++++++++++------ src/server.cpp | 3 +- 3 files changed, 151 insertions(+), 79 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 18ab382bd..cac58ff07 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1680,22 +1680,8 @@ int writeToClient(client *c, int handler_installed) { /* if this is a write to a replica, it's coming straight from the replication backlog */ long long repl_backlog_idx = g_pserver->repl_backlog_idx; - bool wroteFromClientBuffer = false; /* True if you wrote from the client buffer in this function call */ - while(clientHasPendingReplies(c)) { - wroteFromClientBuffer = true; - if (c->flags & CLIENT_SLAVE && listLength(c->reply) % 10 == 0){ - - serverLog(LL_NOTICE, "-----------------------------------------"); - serverLog(LL_NOTICE, "replica w/ pending replies, with a reply list size of: %lu", listLength(c->reply)); - serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - serverLog(LL_NOTICE, "-----------------------------------------"); - - } if (c->bufpos > 0) { - // serverLog(LL_NOTICE, "Sending reply %d", x); - // serverLog(LL_NOTICE, "SUSSUS AMOGUS, %ld", c->bufpos); nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); if (nwritten <= 0) break; c->sentlen += nwritten; @@ -1753,9 +1739,7 @@ int writeToClient(client *c, int handler_installed) { /* If there are no more pending replies, then we have transmitted the RDB. * This means further replication commands will be taken straight from the * replication backlog from now on. 
*/ - if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c)){ - if (!c->transmittedRDB) - serverLog(LL_NOTICE, "---------->>>>>>>> TRANSMISSION OF THE RDB HAS COMPLETED <<<<<<<<----------"); + if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ c->transmittedRDB = true; } @@ -1775,49 +1759,27 @@ int writeToClient(client *c, int handler_installed) { /* wrap around case, v. rare */ /* also v. buggy so there's that */ } else { - serverLog(LL_NOTICE, "WRAP CASE"); - serverLog(LL_NOTICE, "-----------------------------------------"); - serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - serverLog(LL_NOTICE, "actually written: %ld", nwritten); - serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); - serverLog(LL_NOTICE, "nwritten: %ld", nwritten); - serverLog(LL_NOTICE, "-----------------------------------------"); - nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); /* only attempt wrapping if we write the correct number of bytes */ if (nwritten == repl_backlog_size - c->repl_curr_idx){ - serverLog(LL_NOTICE, "SECOND STAGE"); - serverLog(LL_NOTICE, "-----------------------------------------"); - serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - serverLog(LL_NOTICE, "actually written: %ld", nwritten); - serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); - serverLog(LL_NOTICE, "-----------------------------------------"); - long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); if (nwrittenPart2 != -1) nwritten += nwrittenPart2; - serverLog(LL_NOTICE, "nwrittenPart2: %lld", nwrittenPart2); - serverLog(LL_NOTICE, "-----------------------------------------"); - } else { - serverLog(LL_NOTICE, "SUPER SHORT"); - } - + } } /* only update the replica's current index if bytes were sent */ // if (nrequested != nwritten){ - serverLog(LL_NOTICE, "-----------------------------------------"); - serverLog(LL_NOTICE, "AFTER THE FACT"); - serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - serverLog(LL_NOTICE, "actually written: %ld", nwritten); - serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - serverLog(LL_NOTICE, "-----------------------------------------"); + // serverLog(LL_NOTICE, "-----------------------------------------"); + // serverLog(LL_NOTICE, "AFTER THE FACT"); + // serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + // serverLog(LL_NOTICE, "actually written: %ld", nwritten); + // serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + // serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + // 
serverLog(LL_NOTICE, "-----------------------------------------"); // } @@ -1902,25 +1864,36 @@ void ProcessPendingAsyncWrites() serverAssert(c->fPendingAsyncWrite); if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_CLOSE_AFTER_REPLY)) { - zfree(c->replyAsync); - c->replyAsync = nullptr; + if (c->replyAsync != nullptr){ + zfree(c->replyAsync); + c->replyAsync = nullptr; + } c->fPendingAsyncWrite = FALSE; continue; } - int size = c->replyAsync->used; + /* since writes from master to replica can come directly from the replication backlog, + * writes may have been signalled without having been copied to the replyAsync buffer, + * thus causing the buffer to be NULL */ + if (c->replyAsync != nullptr){ + int size = c->replyAsync->used; - if (listLength(c->reply) == 0 && size <= (PROTO_REPLY_CHUNK_BYTES - c->bufpos)) { - memcpy(c->buf + c->bufpos, c->replyAsync->buf(), size); - c->bufpos += size; - } else { - c->reply_bytes += c->replyAsync->size; - listAddNodeTail(c->reply, c->replyAsync); + if (listLength(c->reply) == 0 && size <= (PROTO_REPLY_CHUNK_BYTES - c->bufpos)) { + memcpy(c->buf + c->bufpos, c->replyAsync->buf(), size); + c->bufpos += size; + } else { + c->reply_bytes += c->replyAsync->size; + listAddNodeTail(c->reply, c->replyAsync); + c->replyAsync = nullptr; + } + + zfree(c->replyAsync); c->replyAsync = nullptr; + } else { + /* Only replicas should have empty async reply buffers */ + serverAssert(c->flags & CLIENT_SLAVE); } - zfree(c->replyAsync); - c->replyAsync = nullptr; c->fPendingAsyncWrite = FALSE; // Now install the write event handler @@ -1935,17 +1908,17 @@ void ProcessPendingAsyncWrites() { ae_flags |= AE_BARRIER; } - + if (!((c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack)))) continue; - + asyncCloseClientOnOutputBufferLimitReached(c); if (c->flags & CLIENT_CLOSE_ASAP) continue; // we will never write this so don't post an op - + std::atomic_thread_fence(std::memory_order_seq_cst); - + if (FCorrectThread(c)) { prepareClientToWrite(c); // queue an event @@ -3386,7 +3359,12 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { * that writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long getClientReplicationBacklogSharedUsage(client *c) { - return (c->repl_curr_idx == -1 && c->flags & CLIENT_SLAVE) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; + if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1){ + // serverLog(LL_NOTICE, "repl_backlog_size %lld, repl_backlog_idx %lld, master_repl_offset %lld, repl_curr_idx %lld, repl_curr_off %lld", + // g_pserver->repl_backlog_size, g_pserver->repl_backlog_idx, g_pserver->master_repl_offset, c->repl_curr_idx, c->repl_curr_off); + } + + return (!(c->flags & CLIENT_SLAVE) || c->repl_curr_idx == -1) ? 
0 : g_pserver->master_repl_offset - c->repl_curr_off; } /* This function returns the number of bytes that Redis is diff --git a/src/replication.cpp b/src/replication.cpp index ccb538a69..ef33fbfd9 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -47,8 +47,8 @@ #include #include -#define BYPASS_BUFFER -// #define BYPASS_PSYNC +// #define BYPASS_BUFFER +// #define RESIZE_BACKLOG void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, connection *conn); @@ -57,6 +57,30 @@ void putSlaveOnline(client *replica); int cancelReplicationHandshake(redisMaster *mi); static void propagateMasterStaleKeys(); +/* gets the lowest offset amongst all of the replicas */ +long long getLowestOffsetAmongReplicas(){ + serverAssert(GlobalLocksAcquired()); + long long min_offset = LONG_LONG_MAX; + listIter li; + listNode *ln; + listRewind(g_pserver->slaves, &li); + // check for potential overflow first + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + + if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; + if (replica->flags & CLIENT_CLOSE_ASAP) continue; + if (replica->repl_curr_idx == -1) continue; + + std::unique_lock ul(replica->lock, std::defer_lock); + if (FCorrectThread(replica)) + ul.lock(); + + min_offset = std::min(min_offset, replica->repl_curr_off); + } + /* return -1 if no other minimum was found */ + return min_offset == LONG_LONG_MAX ? -1 : min_offset; +} /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case * the instance is configured to have no persistence. */ @@ -67,11 +91,13 @@ void resizeReplicationBacklogForClients(long long newsize); void setReplIdx(client *c, long long idx, long long off){ if (prepareClientToWrite(c) != C_OK) return; // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); - // serverLog(LL_NOTICE, "What is this value? %lld", c->repl_curr_idx); + // serverLog(LL_NOTICE, "Repl Index started at: %lld", c->repl_curr_idx); if (c->repl_curr_idx == -1){ c->repl_curr_idx = idx; c->repl_curr_off = off; } + // serverLog(LL_NOTICE, "Repl Index has become: %lld", c->repl_curr_idx); + } /* --------------------------- Utility functions ---------------------------- */ @@ -277,7 +303,7 @@ void resizeReplicationBacklogForClients(long long newsize) { newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; if (g_pserver->repl_backlog_size == newsize) return; - serverLog(LL_NOTICE, "WE HAD TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); + serverLog(LL_NOTICE, "WE HAVE TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); /* get the critical client size, i.e. 
the size of the data unflushed to clients */ long long earliest_off = LONG_LONG_MAX; long long earliest_idx = -1; @@ -290,6 +316,20 @@ void resizeReplicationBacklogForClients(long long newsize) { earliest_off = replica->repl_curr_off; earliest_idx = replica->repl_curr_idx; } + serverLog(LL_NOTICE, "repl_curr_idx: %lld, earlistidx: %lld", replica->repl_curr_idx, earliest_idx); + } + serverLog(LL_NOTICE, "We are starting with: master_repl_offset: %lld, repl_batch_offStart: %lld, earliest_off: %lld, " + "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, earliest_idx: %lld, repl_backlog_size: %lld", + g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, earliest_off, + g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, earliest_idx, g_pserver->repl_backlog_size + ); + + long long new_off = 0, new_idx = 0; + + /* if no earliest offset is found amongst the clients, they are all up to date with the flushed index */ + if (earliest_off == LONG_LONG_MAX && earliest_idx == -1){ + earliest_idx = g_pserver->repl_batch_idxStart; + earliest_off = g_pserver->repl_batch_offStart; } if (g_pserver->repl_backlog != NULL) { @@ -330,8 +370,11 @@ void resizeReplicationBacklogForClients(long long newsize) { if (replica->repl_curr_idx < 0) replica->repl_curr_idx += g_pserver->repl_backlog_size; } + new_idx = replica->repl_curr_idx; } - g_pserver->repl_batch_idxStart = 0; + g_pserver->repl_batch_idxStart -= earliest_idx; + if (g_pserver->repl_batch_idxStart < 0) + g_pserver->repl_batch_idxStart += g_pserver->repl_backlog_size; } else { zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = (char*)zmalloc(newsize); @@ -342,6 +385,12 @@ void resizeReplicationBacklogForClients(long long newsize) { } } g_pserver->repl_backlog_size = newsize; + + serverLog(LL_NOTICE, "We are ending with: master_repl_offset: %lld, repl_batch_offStart: %lld, new_off: %lld, " + "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, new_idx: %lld, repl_backlog_size: %lld", + g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, new_off, + g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, new_idx, g_pserver->repl_backlog_size + ); } void freeReplicationBacklog(void) { @@ -367,20 +416,41 @@ void feedReplicationBacklog(const void *ptr, size_t len) { const unsigned char *p = (const unsigned char*)ptr; if (g_pserver->repl_batch_idxStart >= 0) { - long long minimumsize = g_pserver->master_repl_offset + len - g_pserver->repl_batch_offStart+1; + /* we are lower bounded by the lower client offset or the offStart if all the clients are up to date */ + long long lower_bound = getLowestOffsetAmongReplicas(); + if (lower_bound == -1) + lower_bound = g_pserver->repl_batch_offStart; + long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { flushReplBacklogToClients(); - minimumsize = g_pserver->master_repl_offset + len - g_pserver->repl_batch_offStart+1; + minimumsize = g_pserver->master_repl_offset + len - lower_bound +1; if (minimumsize > g_pserver->repl_backlog_size) { // This is an emergency overflow, we better resize to fit long long newsize = std::max(g_pserver->repl_backlog_size*2, minimumsize); serverLog(LL_WARNING, "Replication backlog is too small, resizing to: %lld", newsize); - resizeReplicationBacklog(newsize); + resizeReplicationBacklogForClients(newsize); } } +#ifdef RESIZE_BACKLOG + long long lowest_replica_offset = getLowestOffsetAmongReplicas(); + minimumsize = g_pserver->master_repl_offset + len - lowest_replica_offset; + if 
(lowest_replica_offset != -1 && minimumsize > g_pserver->repl_backlog_size){ + serverLog(LL_WARNING, "THE REPLICATION BACKLOG SIZE IS TOO SMALL, THIS IS A PROBLEM"); + long long oldsize = g_pserver->repl_backlog_size; + resizeReplicationBacklogForClients(std::max(g_pserver->repl_backlog_size * 2, minimumsize)); + serverLog(LL_WARNING, "changed size from %lld to %lld", oldsize, g_pserver->repl_backlog_size); + flushReplBacklogToClients(); + } +#endif } + // serverLog(LL_NOTICE, "Pt2 start with: master_repl_offset: %lld, repl_batch_offStart: %lld, " + // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", + // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, + // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size + // ); + g_pserver->master_repl_offset += len; /* This is a circular buffer, so write as much data we can at every @@ -395,12 +465,23 @@ void feedReplicationBacklog(const void *ptr, size_t len) { len -= thislen; p += thislen; g_pserver->repl_backlog_histlen += thislen; + // serverLog(LL_NOTICE, "Pt2 intermediate with: master_repl_offset: %lld, repl_batch_offStart: %lld, " + // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", + // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, + // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size + // ); } if (g_pserver->repl_backlog_histlen > g_pserver->repl_backlog_size) g_pserver->repl_backlog_histlen = g_pserver->repl_backlog_size; /* Set the offset of the first byte we have in the backlog. */ g_pserver->repl_backlog_off = g_pserver->master_repl_offset - g_pserver->repl_backlog_histlen + 1; + + // serverLog(LL_NOTICE, "Pt2 end with: master_repl_offset: %lld, repl_batch_offStart: %lld, " + // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", + // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, + // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size + // ); } /* Wrapper for feedReplicationBacklog() that takes Redis string objects @@ -774,7 +855,6 @@ long long addReplyReplicationBacklog(client *c, long long offset) { * split the reply in two parts if we are cross-boundary. */ len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); - serverLog(LL_NOTICE, "Coming through from the addReplicationBacklog"); #ifdef BYPASS_PSYNC setReplIdx(c, j, offset); #else @@ -789,7 +869,6 @@ long long addReplyReplicationBacklog(client *c, long long offset) { j = 0; } #endif - serverLog(LL_NOTICE, "rdb transmitted? %d, pending replies? %d", c->transmittedRDB, clientHasPendingReplies(c)); return g_pserver->repl_backlog_histlen - skip; } @@ -1520,13 +1599,11 @@ void sendBulkToSlave(connection *conn) { replica->repldboff += nwritten; g_pserver->stat_net_output_bytes += nwritten; - // replica->repl_curr_idx = g_pserver->repl_backlog_idx; if (replica->repldboff == replica->repldbsize) { close(replica->repldbfd); replica->repldbfd = -1; connSetWriteHandler(replica->conn,NULL); putSlaveOnline(replica); - serverLog(LL_NOTICE, "ABOUT TO DIE HERE"); } } @@ -4560,6 +4637,7 @@ void _clientAsyncReplyBufferReserve(client *c, size_t len); /* Has the end of the replication backlog overflowed past the beginning? 
*/ bool replOverflowHasOccured(){ + if (g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart){ long long repl_idx_difference = g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart ? g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart : @@ -4575,8 +4653,13 @@ thread_local int transmittedCount = 0; void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); - if (g_pserver->repl_batch_offStart < 0) + if (g_pserver->repl_batch_offStart < 0){ + if (getLowestOffsetAmongReplicas() == -1){ + serverLog(LL_NOTICE, "this is a case i probably have to handle"); + } return; + } + if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { bool fAsyncWrite = false; @@ -4585,7 +4668,7 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); - serverAssert(!replOverflowHasOccured()); + // serverAssert(!replOverflowHasOccured()); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); @@ -4605,11 +4688,21 @@ void flushReplBacklogToClients() ul.lock(); else fAsyncWrite = true; + + if (g_pserver->master_repl_offset - replica->repl_curr_off > g_pserver->repl_backlog_size){ + serverLog(LL_WARNING, "THE REPLICATION BACKLOG SIZE IS TOO SMALL, THIS IS A PROBLEM"); + long long oldsize = g_pserver->repl_backlog_size; + resizeReplicationBacklogForClients(std::max(g_pserver->repl_backlog_size * 2, g_pserver->master_repl_offset - replica->repl_curr_off)); + serverLog(LL_WARNING, "changing size from %lld to %lld", oldsize, g_pserver->repl_backlog_size); + } + + } + + listRewind(g_pserver->slaves, &li); #endif while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); - // serverLog(LL_NOTICE, "replica state: %d", replica->replstate); if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; diff --git a/src/server.cpp b/src/server.cpp index 3d547f748..9664a4a6b 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1796,6 +1796,7 @@ int clientsCronTrackClientsMemUsage(client *c) { mem += zmalloc_size(c); mem += c->argv_len_sum(); if (c->argv) mem += zmalloc_size(c->argv); + // serverLog(LL_NOTICE, "Mem here is : %lu", mem); /* Now that we have the memory used by the client, remove the old * value from the old category, and add it back. */ g_pserver->stat_clients_type_memory[c->client_cron_last_memory_type] -= @@ -1854,7 +1855,7 @@ void clientsCron(int iel) { while(listLength(g_pserver->clients) && iterations--) { client *c; listNode *head; - + // serverLog(LL_NOTICE, "we are at iteration: %d", iterations); /* Rotate the list, take the current head, process. * This way if the client must be removed from the list it's the * first element and we don't incur into O(N) computation. 
*/ From 7ef58a333f9331e8fd144163626ed7a6ccaa1a59 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 29 Apr 2021 18:51:30 +0000 Subject: [PATCH 10/75] Performance optimizations Former-commit-id: 7fd83d467784d293f7da78b74f9b9763ce387238 --- src/replication.cpp | 71 ++------------------------------------------- 1 file changed, 3 insertions(+), 68 deletions(-) diff --git a/src/replication.cpp b/src/replication.cpp index ef33fbfd9..1bae2773a 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -47,8 +47,7 @@ #include #include -// #define BYPASS_BUFFER -// #define RESIZE_BACKLOG +#define BYPASS_BUFFER void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, connection *conn); @@ -89,10 +88,10 @@ int RDBGeneratedByReplication = 0; void resizeReplicationBacklogForClients(long long newsize); void setReplIdx(client *c, long long idx, long long off){ - if (prepareClientToWrite(c) != C_OK) return; // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); // serverLog(LL_NOTICE, "Repl Index started at: %lld", c->repl_curr_idx); if (c->repl_curr_idx == -1){ + if (prepareClientToWrite(c) != C_OK) return; c->repl_curr_idx = idx; c->repl_curr_off = off; } @@ -432,17 +431,6 @@ void feedReplicationBacklog(const void *ptr, size_t len) { resizeReplicationBacklogForClients(newsize); } } -#ifdef RESIZE_BACKLOG - long long lowest_replica_offset = getLowestOffsetAmongReplicas(); - minimumsize = g_pserver->master_repl_offset + len - lowest_replica_offset; - if (lowest_replica_offset != -1 && minimumsize > g_pserver->repl_backlog_size){ - serverLog(LL_WARNING, "THE REPLICATION BACKLOG SIZE IS TOO SMALL, THIS IS A PROBLEM"); - long long oldsize = g_pserver->repl_backlog_size; - resizeReplicationBacklogForClients(std::max(g_pserver->repl_backlog_size * 2, minimumsize)); - serverLog(LL_WARNING, "changed size from %lld to %lld", oldsize, g_pserver->repl_backlog_size); - flushReplBacklogToClients(); - } -#endif } // serverLog(LL_NOTICE, "Pt2 start with: master_repl_offset: %lld, repl_batch_offStart: %lld, " @@ -4635,30 +4623,11 @@ void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long void _clientAsyncReplyBufferReserve(client *c, size_t len); -/* Has the end of the replication backlog overflowed past the beginning? */ -bool replOverflowHasOccured(){ - - if (g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart){ - long long repl_idx_difference = g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart ? 
- g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart : - (g_pserver->repl_backlog_size + g_pserver->repl_backlog_idx) - g_pserver->repl_batch_idxStart; - - return g_pserver->master_repl_offset - g_pserver->repl_batch_offStart > repl_idx_difference; - } - return false; -} - -thread_local int transmittedCount = 0; - void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); - if (g_pserver->repl_batch_offStart < 0){ - if (getLowestOffsetAmongReplicas() == -1){ - serverLog(LL_NOTICE, "this is a case i probably have to handle"); - } + if (g_pserver->repl_batch_offStart < 0) return; - } if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { @@ -4668,39 +4637,9 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); - // serverAssert(!replOverflowHasOccured()); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); - -#if 0 - // check for potential overflow first - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - // serverLog(LL_NOTICE, "replica state: %d", replica->replstate); - - if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; - if (replica->flags & CLIENT_CLOSE_ASAP) continue; - if (replica->repl_curr_idx == -1) continue; - - std::unique_lock ul(replica->lock, std::defer_lock); - if (FCorrectThread(replica)) - ul.lock(); - else - fAsyncWrite = true; - - if (g_pserver->master_repl_offset - replica->repl_curr_off > g_pserver->repl_backlog_size){ - serverLog(LL_WARNING, "THE REPLICATION BACKLOG SIZE IS TOO SMALL, THIS IS A PROBLEM"); - long long oldsize = g_pserver->repl_backlog_size; - resizeReplicationBacklogForClients(std::max(g_pserver->repl_backlog_size * 2, g_pserver->master_repl_offset - replica->repl_curr_off)); - serverLog(LL_WARNING, "changing size from %lld to %lld", oldsize, g_pserver->repl_backlog_size); - } - - } - - listRewind(g_pserver->slaves, &li); -#endif - while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); @@ -4721,10 +4660,6 @@ void flushReplBacklogToClients() setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); continue; } -#else - if (replica->replstate == SLAVE_STATE_ONLINE){ - // serverLog(LL_NOTICE, "would be calling this garbage function w/ offset: %lld", g_pserver->repl_batch_idxStart); - } #endif if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { long long cbCopy = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; From f6305ed15bca84719504890f85dd0f1297e05365 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Mon, 3 May 2021 16:29:11 +0000 Subject: [PATCH 11/75] Now tracks memory and resizes 'accurately', need to fix cluster Former-commit-id: 5f0e01cc199427ab6dfd7f8f28321f6a1f34fd1c --- src/config.cpp | 1 + src/evict.cpp | 10 +++++++++- src/networking.cpp | 20 +++++++++++++------- src/replication.cpp | 16 ++++++++++++++++ 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index 9d7f14007..b546ef607 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -2347,6 +2347,7 @@ static int updateReplBacklogSize(long long val, long long prev, const char **err UNUSED(err); g_pserver->repl_backlog_size = prev; resizeReplicationBacklog(val); + g_pserver->repl_backlog_config_size = g_pserver->repl_backlog_size; return 1; } diff --git a/src/evict.cpp b/src/evict.cpp index 31cadeae5..36837e17d 
100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -392,9 +392,16 @@ size_t freeMemoryGetNotCountedMemory(void) { while((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); std::unique_lock(replica->lock); - overhead += getClientOutputBufferMemoryUsage(replica); + /* we don't wish to multiple count the replication backlog shared usage */ + overhead += (getClientOutputBufferMemoryUsage(replica) - getClientReplicationBacklogSharedUsage(replica)); } } + + /* also don't count the replication backlog memory + * that's where the replication clients get their memory from */ + overhead += (g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size); + + if (g_pserver->aof_state != AOF_OFF) { overhead += sdsalloc(g_pserver->aof_buf)+aofRewriteBufferSize(); } @@ -516,6 +523,7 @@ int freeMemoryIfNeeded(bool fQuickCycle, bool fPreSnapshot) { if (g_pserver->maxmemory_policy == MAXMEMORY_NO_EVICTION) goto cant_free; /* We need to free memory, but policy forbids. */ + serverLog(LL_NOTICE, "evicting i guess lol, the overhead was %ld, the repl_backlog_size, %lld", freeMemoryGetNotCountedMemory(), g_pserver->repl_backlog_size); while (mem_freed < mem_tofree) { int j, k, i; static unsigned int next_db = 0; diff --git a/src/networking.cpp b/src/networking.cpp index cac58ff07..c51a02a1d 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -224,7 +224,6 @@ void clientInstallWriteHandler(client *c) { (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) { - // serverLog(LL_NOTICE, "we installing boyz"); AssertCorrectThread(c); serverAssert(c->lock.fOwnLock()); /* Here instead of installing the write handler, we just flag the @@ -1801,6 +1800,9 @@ int writeToClient(client *c, int handler_installed) { if (nwrittenPart2 == -1) nwritten = -1; } + if (c->flags & CLIENT_SLAVE && handler_installed) + serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); + g_pserver->stat_net_output_bytes += totwritten; if (nwritten == -1) { if (connGetState(c->conn) == CONN_STATE_CONNECTED) { @@ -1821,6 +1823,11 @@ int writeToClient(client *c, int handler_installed) { if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } if (!clientHasPendingReplies(c) && c->repl_curr_idx == -1) { + if(c->flags & CLIENT_SLAVE && handler_installed){ + serverLog(LL_NOTICE, "Uninstalling handler"); + serverLog(LL_NOTICE, "handler repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + } c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -1836,6 +1843,7 @@ int writeToClient(client *c, int handler_installed) { /* Write event handler. Just send data to the client. 
*/ void sendReplyToClient(connection *conn) { client *c = (client*)connGetPrivateData(conn); + serverLog(LL_NOTICE, "called the sendreplytoclient"); if (writeToClient(c,1) == C_ERR) { AeLocker ae; @@ -1970,6 +1978,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { auto vec = std::move(g_pserver->rgthreadvar[iel].clients_pending_write); processed += (int)vec.size(); + // serverLog(LL_NOTICE, "entered handleClientsWithPendingWrites"); for (client *c : vec) { serverAssertDebug(FCorrectThread(c)); @@ -2008,8 +2017,10 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. */ if (clientHasPendingReplies(c) || c->repl_curr_idx != -1) { - if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) + serverLog(LL_NOTICE, "Setting a write handler for later"); + if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) { freeClientAsync(c); + } } } @@ -3359,11 +3370,6 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { * that writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long getClientReplicationBacklogSharedUsage(client *c) { - if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1){ - // serverLog(LL_NOTICE, "repl_backlog_size %lld, repl_backlog_idx %lld, master_repl_offset %lld, repl_curr_idx %lld, repl_curr_off %lld", - // g_pserver->repl_backlog_size, g_pserver->repl_backlog_idx, g_pserver->master_repl_offset, c->repl_curr_idx, c->repl_curr_off); - } - return (!(c->flags & CLIENT_SLAVE) || c->repl_curr_idx == -1) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; } diff --git a/src/replication.cpp b/src/replication.cpp index 1bae2773a..60f25052a 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -4684,5 +4684,21 @@ void flushReplBacklogToClients() // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; + } else if (getLowestOffsetAmongReplicas() != -1){ + listIter li; + listNode *ln; + listRewind(g_pserver->slaves, &li); + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + + std::unique_lock ul(replica->lock, std::defer_lock); + if (FCorrectThread(replica)) + ul.lock(); + + /* try to force prepare client to write i guess? 
*/ + if (replica->repl_curr_idx != -1){ + if (prepareClientToWrite(replica) != C_OK) continue; + } + } } } From 33a7b52899a10432e1f9085027ed9e30c07dda32 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Mon, 3 May 2021 16:49:09 +0000 Subject: [PATCH 12/75] Forgot to add server.h in last commit Former-commit-id: 34fa6119c9a3f1533cc3e6e5d118dc6424a70891 --- src/server.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/server.h b/src/server.h index cfd6c34a0..6c5265fbd 100644 --- a/src/server.h +++ b/src/server.h @@ -2411,6 +2411,9 @@ struct redisServer { uint16_t rglockSamples[s_lockContentionSamples]; unsigned ilockRingHead = 0; + long long repl_backlog_config_size = 1024*1024; /* This is a hack to ignore the resizing of the replication backlog + when using it as a defacto for the client buffer */ + bool FRdbSaveInProgress() const { return rdbThreadVars.fRdbThreadActive; } }; @@ -2657,6 +2660,7 @@ sds getAllClientsInfoString(int type); void rewriteClientCommandVector(client *c, int argc, ...); void rewriteClientCommandArgument(client *c, int i, robj *newval); void replaceClientCommandVector(client *c, int argc, robj **argv); +unsigned long getClientReplicationBacklogSharedUsage(client *c); unsigned long getClientOutputBufferMemoryUsage(client *c); int freeClientsInAsyncFreeQueue(int iel); void asyncCloseClientOnOutputBufferLimitReached(client *c); From 3e970746491a5a9e1a70804bb49e5527b4aaf17c Mon Sep 17 00:00:00 2001 From: Madelyn Olson <34459052+madolson@users.noreply.github.com> Date: Mon, 19 Apr 2021 22:16:27 -0700 Subject: [PATCH 13/75] Fix memory leak when doing lazyfreeing client tracking table (#8822) Interior rax pointers were not being freed (cherry picked from commit c73b4ddfd96d00ed0d0fde17953ce63d78bc3777) --- src/lazyfree.c | 5 ++--- src/server.h | 1 + tests/unit/tracking.tcl | 11 +++++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/lazyfree.c b/src/lazyfree.c index f18b2027f..a2cf2c3ed 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -39,12 +39,11 @@ void lazyfreeFreeSlotsMap(void *args[]) { atomicIncr(lazyfreed_objects,len); } -/* Release the rax mapping Redis Cluster keys to slots in the - * lazyfree thread. */ +/* Release the key tracking table. */ void lazyFreeTrackingTable(void *args[]) { rax *rt = args[0]; size_t len = rt->numele; - raxFree(rt); + freeTrackingRadixTree(rt); atomicDecr(lazyfree_objects,len); atomicIncr(lazyfreed_objects,len); } diff --git a/src/server.h b/src/server.h index d35eaa425..a2a722c6d 100644 --- a/src/server.h +++ b/src/server.h @@ -1911,6 +1911,7 @@ void disableTracking(client *c); void trackingRememberKeys(client *c); void trackingInvalidateKey(client *c, robj *keyobj); void trackingInvalidateKeysOnFlush(int async); +void freeTrackingRadixTree(rax *rt); void freeTrackingRadixTreeAsync(rax *rt); void trackingLimitUsedSlots(void); uint64_t trackingGetTotalItems(void); diff --git a/tests/unit/tracking.tcl b/tests/unit/tracking.tcl index 40f1a2a66..4c75b6f48 100644 --- a/tests/unit/tracking.tcl +++ b/tests/unit/tracking.tcl @@ -395,6 +395,17 @@ start_server {tags {"tracking network"}} { assert {[lindex msg 2] eq {} } } + test {Test ASYNC flushall} { + clean_all + r CLIENT TRACKING on REDIRECT $redir_id + r GET key1 + r GET key2 + assert_equal [s 0 tracking_total_keys] 2 + $rd_sg FLUSHALL ASYNC + assert_equal [s 0 tracking_total_keys] 0 + assert_equal [lindex [$rd_redirection read] 2] {} + } + # Keys are defined to be evicted 100 at a time by default. 
# If after eviction the number of keys still surpasses the limit # defined in tracking-table-max-keys, we increases eviction From 5ce259333e8fe944cfc06dc7d1b00fa6cd7a7afc Mon Sep 17 00:00:00 2001 From: Huang Zhw Date: Tue, 20 Apr 2021 15:59:44 +0800 Subject: [PATCH 14/75] Fix migrateCommand may migrate wrong value. (#8815) This scene is hard to happen. When first attempt some keys expired, only kv position is updated not ov. Then socket err happens, second attempt is taken. This time kv items may be mismatching with ov items. (cherry picked from commit 080d4579db40d965f8392af5b1da7a99d1a817d5) --- src/cluster.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index 0f0ab737e..ba21024be 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -5465,9 +5465,10 @@ try_again: if (ttl < 1) ttl = 1; } - /* Relocate valid (non expired) keys into the array in successive + /* Relocate valid (non expired) keys and values into the array in successive * positions to remove holes created by the keys that were present * in the first lookup but are now expired after the second lookup. */ + ov[non_expired] = ov[j]; kv[non_expired++] = kv[j]; serverAssertWithInfo(c,NULL, From d7221e0135cbf9c8cb1c29e671a6050a25302b81 Mon Sep 17 00:00:00 2001 From: bugwz Date: Wed, 21 Apr 2021 02:51:24 +0800 Subject: [PATCH 15/75] Print the number of abnormal line in AOF (#8823) When redis-check-aof finds an error, it prints the line number for faster troubleshooting. (cherry picked from commit 761d7d27711edfbf737def41ff28f5b325fb16c8) --- src/redis-check-aof.c | 6 ++++-- tests/integration/aof.tcl | 12 ++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/redis-check-aof.c b/src/redis-check-aof.c index eedb09db5..1507e0a06 100644 --- a/src/redis-check-aof.c +++ b/src/redis-check-aof.c @@ -39,12 +39,14 @@ static char error[1044]; static off_t epos; +static long long line = 1; int consumeNewline(char *buf) { if (strncmp(buf,"\r\n",2) != 0) { ERROR("Expected \\r\\n, got: %02x%02x",buf[0],buf[1]); return 0; } + line += 1; return 1; } @@ -201,8 +203,8 @@ int redis_check_aof_main(int argc, char **argv) { off_t pos = process(fp); off_t diff = size-pos; - printf("AOF analyzed: size=%lld, ok_up_to=%lld, diff=%lld\n", - (long long) size, (long long) pos, (long long) diff); + printf("AOF analyzed: size=%lld, ok_up_to=%lld, ok_up_to_line=%lld, diff=%lld\n", + (long long) size, (long long) pos, line, (long long) diff); if (diff > 0) { if (fix) { char buf[2]; diff --git a/tests/integration/aof.tcl b/tests/integration/aof.tcl index e64e2022a..abe2dc10c 100644 --- a/tests/integration/aof.tcl +++ b/tests/integration/aof.tcl @@ -158,6 +158,18 @@ tags {"aof"} { assert_match "*not valid*" $result } + test "Short read: Utility should show the abnormal line num in AOF" { + create_aof { + append_to_aof [formatCommand set foo hello] + append_to_aof "!!!" + } + + catch { + exec src/redis-check-aof $aof_path + } result + assert_match "*ok_up_to_line=8*" $result + } + test "Short read: Utility should be able to fix the AOF" { set result [exec src/redis-check-aof --fix $aof_path << "y\n"] assert_match "*Successfully truncated AOF*" $result From 3b9ceeb2a4e35f6cef6d731acb5c2e57f1588a6a Mon Sep 17 00:00:00 2001 From: Wang Yuan Date: Thu, 22 Apr 2021 13:32:43 +0800 Subject: [PATCH 16/75] Expire key firstly and then notify keyspace event (#8830) Modules event subscribers may get wrong things in notifyKeyspaceEvent callback, such as wrong number of keys, or be able to lookup this key. 
This commit changes the order to be like the one in evict.c. Cleanup: Since we know the key exists (it expires now), db*Delete is sure to return 1, so there's no need to check it's output (misleading). (cherry picked from commit 63acfe4b00b9d3e34a53559f965c0bc44c03db61) --- src/db.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/db.c b/src/db.c index ec68c228c..40377ec3f 100644 --- a/src/db.c +++ b/src/db.c @@ -1541,14 +1541,17 @@ int expireIfNeeded(redisDb *db, robj *key) { if (checkClientPauseTimeoutAndReturnIfPaused()) return 1; /* Delete the key */ + if (server.lazyfree_lazy_expire) { + dbAsyncDelete(db,key); + } else { + dbSyncDelete(db,key); + } server.stat_expiredkeys++; propagateExpire(db,key,server.lazyfree_lazy_expire); notifyKeyspaceEvent(NOTIFY_EXPIRED, "expired",key,db->id); - int retval = server.lazyfree_lazy_expire ? dbAsyncDelete(db,key) : - dbSyncDelete(db,key); - if (retval) signalModifiedKey(NULL,db,key); - return retval; + signalModifiedKey(NULL,db,key); + return 1; } /* ----------------------------------------------------------------------------- From 7ac26c3497789da4365a40d1f920a328c99abc8d Mon Sep 17 00:00:00 2001 From: zyxwvu Shi Date: Thu, 22 Apr 2021 13:59:10 +0800 Subject: [PATCH 17/75] Use monotonic clock to check for Lua script timeout. (#8812) This prevents a case where NTP moves the system clock forward resulting in a false detection of a busy script. Signed-off-by: zyxwvu Shi (cherry picked from commit f61c37cec900ba391541f20f7655aad44a26bafc) --- src/db.c | 2 +- src/scripting.c | 11 +++++++---- src/server.h | 3 ++- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/db.c b/src/db.c index 40377ec3f..840e95e21 100644 --- a/src/db.c +++ b/src/db.c @@ -1480,7 +1480,7 @@ int keyIsExpired(redisDb *db, robj *key) { * script execution, making propagation to slaves / AOF consistent. * See issue #1525 on Github for more information. */ if (server.lua_caller) { - now = server.lua_time_start; + now = server.lua_time_snapshot; } /* If we are in the middle of a command execution, we still want to use * a reference time that does not change: in that case we just use the diff --git a/src/scripting.c b/src/scripting.c index 299e60810..dbbd50eaf 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -31,6 +31,7 @@ #include "sha1.h" #include "rand.h" #include "cluster.h" +#include "monotonic.h" #include #include @@ -1427,7 +1428,7 @@ sds luaCreateFunction(client *c, lua_State *lua, robj *body) { /* This is the Lua script "count" hook that we use to detect scripts timeout. */ void luaMaskCountHook(lua_State *lua, lua_Debug *ar) { - long long elapsed = mstime() - server.lua_time_start; + long long elapsed = elapsedMs(server.lua_time_start); UNUSED(ar); UNUSED(lua); @@ -1578,7 +1579,8 @@ void evalGenericCommand(client *c, int evalsha) { server.in_eval = 1; server.lua_caller = c; server.lua_cur_script = funcname + 2; - server.lua_time_start = mstime(); + server.lua_time_start = getMonotonicUs(); + server.lua_time_snapshot = mstime(); server.lua_kill = 0; if (server.lua_time_limit > 0 && ldb.active == 0) { lua_sethook(lua,luaMaskCountHook,LUA_MASKCOUNT,100000); @@ -2729,7 +2731,7 @@ void luaLdbLineHook(lua_State *lua, lua_Debug *ar) { /* Check if a timeout occurred. */ if (ar->event == LUA_HOOKCOUNT && ldb.step == 0 && bp == 0) { - mstime_t elapsed = mstime() - server.lua_time_start; + mstime_t elapsed = elapsedMs(server.lua_time_start); mstime_t timelimit = server.lua_time_limit ? 
server.lua_time_limit : 5000; if (elapsed >= timelimit) { @@ -2759,6 +2761,7 @@ void luaLdbLineHook(lua_State *lua, lua_Debug *ar) { lua_pushstring(lua, "timeout during Lua debugging with client closing connection"); lua_error(lua); } - server.lua_time_start = mstime(); + server.lua_time_start = getMonotonicUs(); + server.lua_time_snapshot = mstime(); } } diff --git a/src/server.h b/src/server.h index a2a722c6d..9b7e16a0d 100644 --- a/src/server.h +++ b/src/server.h @@ -1571,7 +1571,8 @@ struct redisServer { dict *lua_scripts; /* A dictionary of SHA1 -> Lua scripts */ unsigned long long lua_scripts_mem; /* Cached scripts' memory + oh */ mstime_t lua_time_limit; /* Script timeout in milliseconds */ - mstime_t lua_time_start; /* Start time of script, milliseconds time */ + monotime lua_time_start; /* monotonic timer to detect timed-out script */ + mstime_t lua_time_snapshot; /* Snapshot of mstime when script is started */ int lua_write_dirty; /* True if a write command was called during the execution of the current script. */ int lua_random_dirty; /* True if a random command was called during the From 9158a510e1f06e8460447e0210deed9dc7dd03f3 Mon Sep 17 00:00:00 2001 From: Istemi Ekin Akkus <5419814+iakkus@users.noreply.github.com> Date: Sun, 25 Apr 2021 09:05:12 +0200 Subject: [PATCH 18/75] Modules: Fix RM_GetClusterNodeInfo() to correctly populate the master_id (#8846) (cherry picked from commit af035c1e1d3bcaf662051cff4dc49f6051321c9c) --- src/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/module.c b/src/module.c index 05bf3a275..727cdc43f 100644 --- a/src/module.c +++ b/src/module.c @@ -6168,7 +6168,7 @@ int RM_GetClusterNodeInfo(RedisModuleCtx *ctx, const char *id, char *ip, char *m /* If the information is not available, the function will set the * field to zero bytes, so that when the field can't be populated the * function kinda remains predictable. */ - if (node->flags & CLUSTER_NODE_MASTER && node->slaveof) + if (node->flags & CLUSTER_NODE_SLAVE && node->slaveof) memcpy(master_id,node->slaveof->name,REDISMODULE_NODE_ID_LEN); else memset(master_id,0,REDISMODULE_NODE_ID_LEN); From ce288cb5598e6735e2fc84bda9d89b41111249c0 Mon Sep 17 00:00:00 2001 From: Yang Bodong Date: Sun, 25 Apr 2021 19:00:35 +0800 Subject: [PATCH 19/75] When the password is wrong, redis-benchmark should exit (#8855) (cherry picked from commit 8423b77f14c0d3a58e580c65a70b4f980f5cdcf6) --- src/redis-benchmark.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c index 351335862..068c094e1 100644 --- a/src/redis-benchmark.c +++ b/src/redis-benchmark.c @@ -1782,8 +1782,10 @@ int main(int argc, const char **argv) { } else { config.redis_config = getRedisConfig(config.hostip, config.hostport, config.hostsocket); - if (config.redis_config == NULL) + if (config.redis_config == NULL) { fprintf(stderr, "WARN: could not fetch server CONFIG\n"); + exit(1); + } } if (config.num_threads > 0) { pthread_mutex_init(&(config.liveclients_mutex), NULL); From 9d71ac0a6ea2586a3a7e452cd4449b62993c4309 Mon Sep 17 00:00:00 2001 From: Yossi Gottlieb Date: Mon, 26 Apr 2021 18:43:57 +0300 Subject: [PATCH 20/75] Remove redundant -latomic on arm64. 
(#8867) (cherry picked from commit ebfbb091096b6f36cf82e9f6e6583b10fd5b5acb) --- src/Makefile | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/Makefile b/src/Makefile index 28d50da02..62e37cb48 100644 --- a/src/Makefile +++ b/src/Makefile @@ -93,14 +93,10 @@ FINAL_LDFLAGS=$(LDFLAGS) $(REDIS_LDFLAGS) $(DEBUG) FINAL_LIBS=-lm DEBUG=-g -ggdb -# Linux ARM needs -latomic at linking time -ifneq (,$(filter aarch64 armv,$(uname_M))) - FINAL_LIBS+=-latomic -else +# Linux ARM32 needs -latomic at linking time ifneq (,$(findstring armv,$(uname_M))) FINAL_LIBS+=-latomic endif -endif ifeq ($(uname_S),SunOS) # SunOS From 91c450e80a336f8966da713d2ba137b6e9c3acfd Mon Sep 17 00:00:00 2001 From: Oran Agra Date: Tue, 27 Apr 2021 08:15:10 +0300 Subject: [PATCH 21/75] Prevent replicas from sending commands that interact with keyspace (#8868) This solves an issue reported in #8712 in which a replica would bypass the client write pause check and cause an assertion due to executing a write command during failover. The fact is that we don't expect replicas to execute any command other than maybe REPLCONF and PING, etc. but matching against the ADMIN command flag is insufficient, so instead i just block keyspace access for now. (cherry picked from commit 46f4ebbe842620f0976a36741a72482620aa4b48) --- src/server.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/server.c b/src/server.c index 993260619..1fe150b3b 100644 --- a/src/server.c +++ b/src/server.c @@ -3985,6 +3985,8 @@ int processCommand(client *c) { return C_OK; } + int is_read_command = (c->cmd->flags & CMD_READONLY) || + (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_READONLY)); int is_write_command = (c->cmd->flags & CMD_WRITE) || (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_WRITE)); int is_denyoom_command = (c->cmd->flags & CMD_DENYOOM) || @@ -4194,7 +4196,7 @@ int processCommand(client *c) { c->cmd->proc != discardCommand && c->cmd->proc != watchCommand && c->cmd->proc != unwatchCommand && - c->cmd->proc != resetCommand && + c->cmd->proc != resetCommand && !(c->cmd->proc == shutdownCommand && c->argc == 2 && tolower(((char*)c->argv[1]->ptr)[0]) == 'n') && @@ -4206,6 +4208,14 @@ int processCommand(client *c) { return C_OK; } + /* Prevent a replica from sending commands that access the keyspace. + * The main objective here is to prevent abuse of client pause check + * from which replicas are exempt. */ + if ((c->flags & CLIENT_SLAVE) && (is_may_replicate_command || is_write_command || is_read_command)) { + rejectCommandFormat(c, "Replica can't interract with the keyspace"); + return C_OK; + } + /* If the server is paused, block the client until * the pause has ended. Replicas are never paused. 
*/ if (!(c->flags & CLIENT_SLAVE) && From 2149c5a03041992104b6a8a5847b6d630319bb06 Mon Sep 17 00:00:00 2001 From: yoav-steinberg Date: Tue, 27 Apr 2021 16:22:22 +0300 Subject: [PATCH 22/75] Bump freebsd-vm version to fix CI failures (#8876) Specifically we had issues with NTP sync failure which was resolved here: https://github.com/vmactions/freebsd-vm/commit/457af7345642e154a79d219971a2d4a7c7fe2118 (cherry picked from commit 2e88b0639689a3019e27f55dfa40578847443eeb) --- .github/workflows/daily.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index ee9ac1bbf..9e4630e29 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -253,7 +253,7 @@ jobs: steps: - uses: actions/checkout@v2 - name: test - uses: vmactions/freebsd-vm@v0.1.2 + uses: vmactions/freebsd-vm@v0.1.4 with: usesh: true sync: rsync From 9868ad801d7f3aa608d7331fd51f4cc3ef0f4100 Mon Sep 17 00:00:00 2001 From: Huang Zhw Date: Tue, 27 Apr 2021 23:02:23 +0800 Subject: [PATCH 23/75] Fix potential CONFIG SET bind test failure. (#8875) Use an invalid IP address to trigger CONFIG SET bind failure, instead of DNS which is not guaranteed to always fail. (cherry picked from commit 2b22fffc787e91df789dabf23ddcf19ecf34cf6f) --- tests/unit/networking.tcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/networking.tcl b/tests/unit/networking.tcl index 19feee8c3..38b49d45e 100644 --- a/tests/unit/networking.tcl +++ b/tests/unit/networking.tcl @@ -25,7 +25,7 @@ test {CONFIG SET port number} { test {CONFIG SET bind address} { start_server {} { # non-valid address - catch {r CONFIG SET bind "some.wrong.bind.address"} e + catch {r CONFIG SET bind "999.999.999.999"} e assert_match {*Failed to bind to specified addresses*} $e # make sure server still bound to the previous address @@ -33,4 +33,4 @@ test {CONFIG SET bind address} { $rd PING $rd close } -} \ No newline at end of file +} From ee542fbbd9065ae1c527bbc4bf77c3eba3a77cc1 Mon Sep 17 00:00:00 2001 From: filipe oliveira Date: Wed, 28 Apr 2021 07:51:07 +0100 Subject: [PATCH 24/75] redis-benchmark: Error/Warning handling updates. (#8869) - Immediately exit on errors that are not related to topology updates. - Deprecates the `-e` option ( retro compatible ) and warns that we now exit immediately on errors that are not related to topology updates. - Fixed wrongfully failing on config fetch error (warning only). This only affects RE. Bottom line: - MOVED and ASK errors will not show any warning (unlike the throttled error with `-e` before). - CLUSTERDOWN still prints an error unconditionally and sleeps for 1 second. - other errors are fatal. 
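For clarity, the classification applied to error replies can be sketched as a
small standalone C example. classify_error() is a hypothetical helper written
only for this illustration; in the actual patch the logic lives inline in
readHandler() (see the diff below).

    #include <string.h>
    #include <stdio.h>

    /* 0 = topology update (refresh the slot map silently: MOVED/ASK),
     * 1 = cluster down (print the error, sleep 1s, then refresh),
     * 2 = anything else (fatal: exit immediately). */
    static int classify_error(const char *err) {
        if (!strncmp(err, "MOVED", 5) || !strncmp(err, "ASK", 3)) return 0;
        if (!strncmp(err, "CLUSTERDOWN", 11)) return 1;
        return 2;
    }

    int main(void) {
        printf("%d %d %d\n",
               classify_error("MOVED 3999 127.0.0.1:6381"),
               classify_error("CLUSTERDOWN The cluster is down"),
               classify_error("ERR unknown command"));
        return 0;   /* prints: 0 1 2 */
    }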
(cherry picked from commit ef6f902372d4646b1894ec5dbd5f857dea5688d6) --- src/redis-benchmark.c | 82 +++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c index 068c094e1..51dba9511 100644 --- a/src/redis-benchmark.c +++ b/src/redis-benchmark.c @@ -99,7 +99,6 @@ static struct config { int randomkeys_keyspacelen; int keepalive; int pipeline; - int showerrors; long long start; long long totlatency; const char *title; @@ -307,7 +306,9 @@ static redisContext *getRedisContext(const char *ip, int port, fprintf(stderr, "Node %s:%d replied with error:\n%s\n", ip, port, reply->str); else fprintf(stderr, "Node %s replied with error:\n%s\n", hostsocket, reply->str); - goto cleanup; + freeReplyObject(reply); + redisFree(ctx); + exit(1); } freeReplyObject(reply); return ctx; @@ -366,9 +367,15 @@ fail: fprintf(stderr, "ERROR: failed to fetch CONFIG from "); if (hostsocket == NULL) fprintf(stderr, "%s:%d\n", ip, port); else fprintf(stderr, "%s\n", hostsocket); + int abort_test = 0; + if (!strncmp(reply->str,"NOAUTH",5) || + !strncmp(reply->str,"WRONGPASS",9) || + !strncmp(reply->str,"NOPERM",5)) + abort_test = 1; freeReplyObject(reply); redisFree(c); freeRedisConfig(cfg); + if (abort_test) exit(1); return NULL; } static void freeRedisConfig(redisConfig *cfg) { @@ -513,44 +520,39 @@ static void readHandler(aeEventLoop *el, int fd, void *privdata, int mask) { exit(1); } redisReply *r = reply; - int is_err = (r->type == REDIS_REPLY_ERROR); - - if (is_err && config.showerrors) { - /* TODO: static lasterr_time not thread-safe */ - static time_t lasterr_time = 0; - time_t now = time(NULL); - if (lasterr_time != now) { - lasterr_time = now; - if (c->cluster_node) { - printf("Error from server %s:%d: %s\n", + if (r->type == REDIS_REPLY_ERROR) { + /* Try to update slots configuration if reply error is + * MOVED/ASK/CLUSTERDOWN and the key(s) used by the command + * contain(s) the slot hash tag. + * If the error is not topology-update related then we + * immediately exit to avoid false results. */ + if (c->cluster_node && c->staglen) { + int fetch_slots = 0, do_wait = 0; + if (!strncmp(r->str,"MOVED",5) || !strncmp(r->str,"ASK",3)) + fetch_slots = 1; + else if (!strncmp(r->str,"CLUSTERDOWN",11)) { + /* Usually the cluster is able to recover itself after + * a CLUSTERDOWN error, so try to sleep one second + * before requesting the new configuration. */ + fetch_slots = 1; + do_wait = 1; + printf("Error from server %s:%d: %s.\n", c->cluster_node->ip, c->cluster_node->port, r->str); + } + if (do_wait) sleep(1); + if (fetch_slots && !fetchClusterSlotsConfiguration(c)) + exit(1); + } else { + if (c->cluster_node) { + printf("Error from server %s:%d: %s\n", + c->cluster_node->ip, + c->cluster_node->port, + r->str); } else printf("Error from server: %s\n", r->str); - } - } - - /* Try to update slots configuration if reply error is - * MOVED/ASK/CLUSTERDOWN and the key(s) used by the command - * contain(s) the slot hash tag. */ - if (is_err && c->cluster_node && c->staglen) { - int fetch_slots = 0, do_wait = 0; - if (!strncmp(r->str,"MOVED",5) || !strncmp(r->str,"ASK",3)) - fetch_slots = 1; - else if (!strncmp(r->str,"CLUSTERDOWN",11)) { - /* Usually the cluster is able to recover itself after - * a CLUSTERDOWN error, so try to sleep one second - * before requesting the new configuration. 
*/ - fetch_slots = 1; - do_wait = 1; - printf("Error from server %s:%d: %s\n", - c->cluster_node->ip, - c->cluster_node->port, - r->str); - } - if (do_wait) sleep(1); - if (fetch_slots && !fetchClusterSlotsConfiguration(c)) exit(1); + } } freeReplyObject(reply); @@ -1293,8 +1295,7 @@ static int fetchClusterSlotsConfiguration(client c) { atomicGetIncr(config.is_fetching_slots, is_fetching_slots, 1); if (is_fetching_slots) return -1; //TODO: use other codes || errno ? atomicSet(config.is_fetching_slots, 1); - if (config.showerrors) - printf("Cluster slots configuration changed, fetching new one...\n"); + printf("WARNING: Cluster slots configuration changed, fetching new one...\n"); const char *errmsg = "Failed to update cluster slots configuration"; static dictType dtype = { dictSdsHash, /* hash function */ @@ -1470,7 +1471,8 @@ int parseOptions(int argc, const char **argv) { } else if (!strcmp(argv[i],"-I")) { config.idlemode = 1; } else if (!strcmp(argv[i],"-e")) { - config.showerrors = 1; + printf("WARNING: -e option has been deprecated. " + "We now immediatly exit on error to avoid false results.\n"); } else if (!strcmp(argv[i],"-t")) { if (lastarg) goto invalid; /* We get the list of tests to run as a string in the form @@ -1573,8 +1575,6 @@ usage: " is executed. Default tests use this to hit random keys in the\n" " specified range.\n" " -P Pipeline requests. Default 1 (no pipeline).\n" -" -e If server replies with errors, show them on stdout.\n" -" (no more than 1 error per second is displayed)\n" " -q Quiet. Just show query/sec values\n" " --precision Number of decimal places to display in latency output (default 0)\n" " --csv Output in CSV format\n" @@ -1699,7 +1699,6 @@ int main(int argc, const char **argv) { config.keepalive = 1; config.datasize = 3; config.pipeline = 1; - config.showerrors = 0; config.randomkeys = 0; config.randomkeys_keyspacelen = 0; config.quiet = 0; @@ -1784,7 +1783,6 @@ int main(int argc, const char **argv) { getRedisConfig(config.hostip, config.hostport, config.hostsocket); if (config.redis_config == NULL) { fprintf(stderr, "WARN: could not fetch server CONFIG\n"); - exit(1); } } if (config.num_threads > 0) { From e919deac128d0648407bc163d639527e1ca7a031 Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 28 Apr 2021 18:19:55 +0800 Subject: [PATCH 25/75] redis-cli: Do not use hostsocket when we got redirected in cluster mode (#8870) When redis-cli was used with both -c (cluster) and -s (unix socket), it would have kept trying to use that unix socket, even if it got redirected by the cluster (resulting in an infinite loop). (cherry picked from commit 416f2773395ffcd72d8d8408e1558f49d59a0077) --- src/redis-cli.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/redis-cli.c b/src/redis-cli.c index 7e1fe3934..ff34f2b6a 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -844,7 +844,9 @@ static int cliConnect(int flags) { cliRefreshPrompt(); } - if (config.hostsocket == NULL) { + /* Do not use hostsocket when we got redirected in cluster mode */ + if (config.hostsocket == NULL || + (config.cluster_mode && config.cluster_reissue_command)) { context = redisConnect(config.hostip,config.hostport); } else { context = redisConnectUnix(config.hostsocket); From 0d10460235fa1d13ef4574e3af7a2288ca1ec08e Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 28 Apr 2021 21:03:24 +0800 Subject: [PATCH 26/75] redis-benchmark: Add zfree(data) and fix lrange size / text mismatch (#8872) missing zfree(data) in redis-benchmark. 
And also correct the wrong size in lrange. the text mentioned 500, but size was 450, changed to 500 (cherry picked from commit 1eff8564c78011f7257e485796990a0d4d607a5b) --- src/redis-benchmark.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c index 51dba9511..fa024d44f 100644 --- a/src/redis-benchmark.c +++ b/src/redis-benchmark.c @@ -1946,8 +1946,8 @@ int main(int argc, const char **argv) { } if (test_is_selected("lrange") || test_is_selected("lrange_500")) { - len = redisFormatCommand(&cmd,"LRANGE mylist%s 0 449",tag); - benchmark("LRANGE_500 (first 450 elements)",cmd,len); + len = redisFormatCommand(&cmd,"LRANGE mylist%s 0 499",tag); + benchmark("LRANGE_500 (first 500 elements)",cmd,len); free(cmd); } @@ -1974,6 +1974,7 @@ int main(int argc, const char **argv) { if (!config.csv) printf("\n"); } while(config.loop); + zfree(data); if (config.redis_config != NULL) freeRedisConfig(config.redis_config); return 0; From 02bd008bd1a8ba62811b794d71741a6b9af7baeb Mon Sep 17 00:00:00 2001 From: Huang Zhw Date: Thu, 29 Apr 2021 17:08:52 +0800 Subject: [PATCH 27/75] Improve redis-cli help. When help command, we only match command (#8879) prefix args not all args. So when we help commands with subcommands, all subcommands will be output. (cherry picked from commit 0b1b9edb2843730b03f78b6073cdd30873dbba95) --- src/redis-cli.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/redis-cli.c b/src/redis-cli.c index ff34f2b6a..9f88a9c88 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -663,7 +663,7 @@ static void cliOutputHelp(int argc, char **argv) { help = entry->org; if (group == -1) { /* Compare all arguments */ - if (argc == entry->argc) { + if (argc <= entry->argc) { for (j = 0; j < argc; j++) { if (strcasecmp(argv[j],entry->argv[j]) != 0) break; } From 23e126a9933c0cfe8cb3cdced7fb7efd67acb45f Mon Sep 17 00:00:00 2001 From: sundb Date: Sun, 2 May 2021 15:32:57 +0800 Subject: [PATCH 28/75] Fix memory leak in moduleDefragGlobals (#8853) (cherry picked from commit 5100ef9f8246dec6590f35f6b9f0b88c2dea0cfb) --- src/module.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/module.c b/src/module.c index 727cdc43f..c76241bfb 100644 --- a/src/module.c +++ b/src/module.c @@ -9205,6 +9205,7 @@ long moduleDefragGlobals(void) { module->defrag_cb(&defrag_ctx); defragged += defrag_ctx.defragged; } + dictReleaseIterator(di); return defragged; } From 832b7682480f5057cab4b314a5f73b6073f963a7 Mon Sep 17 00:00:00 2001 From: Oran Agra Date: Mon, 3 May 2021 08:27:22 +0300 Subject: [PATCH 29/75] Fix integer overflow in intset (CVE-2021-29478) An integer overflow bug in Redis 6.2 could be exploited to corrupt the heap and potentially result with remote code execution. The vulnerability involves changing the default set-max-intset-entries configuration value, creating a large set key that consists of integer values and using the COPY command to duplicate it. The integer overflow bug exists in all versions of Redis starting with 2.6, where it could result with a corrupted RDB or DUMP payload, but not exploited through COPY (which did not exist before 6.2). (cherry picked from commit 29900d4e6bccdf3691bedf0ea9a5d84863fa3592) --- src/intset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intset.c b/src/intset.c index 1a64ecae8..9ba13898d 100644 --- a/src/intset.c +++ b/src/intset.c @@ -281,7 +281,7 @@ uint32_t intsetLen(const intset *is) { /* Return intset blob size in bytes. 
*/ size_t intsetBlobLen(intset *is) { - return sizeof(intset)+intrev32ifbe(is->length)*intrev32ifbe(is->encoding); + return sizeof(intset)+(size_t)intrev32ifbe(is->length)*intrev32ifbe(is->encoding); } /* Validate the integrity of the data structure. From fe0345931c93f5a46f35f0d92c28f5347f2062f2 Mon Sep 17 00:00:00 2001 From: Oran Agra Date: Mon, 3 May 2021 08:32:31 +0300 Subject: [PATCH 30/75] Fix integer overflow in STRALGO LCS (CVE-2021-29477) An integer overflow bug in Redis version 6.0 or newer could be exploited using the STRALGO LCS command to corrupt the heap and potentially result with remote code execution. (cherry picked from commit f0c5f920d0f88bd8aa376a2c05af4902789d1ef9) --- src/t_string.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/t_string.c b/src/t_string.c index 0967e30e1..490d5983a 100644 --- a/src/t_string.c +++ b/src/t_string.c @@ -805,7 +805,7 @@ void stralgoLCS(client *c) { /* Setup an uint32_t array to store at LCS[i,j] the length of the * LCS A0..i-1, B0..j-1. Note that we have a linear array here, so * we index it as LCS[j+(blen+1)*j] */ - uint32_t *lcs = zmalloc((alen+1)*(blen+1)*sizeof(uint32_t)); + uint32_t *lcs = zmalloc((size_t)(alen+1)*(blen+1)*sizeof(uint32_t)); #define LCS(A,B) lcs[(B)+((A)*(blen+1))] /* Start building the LCS table. */ From f72cad07e1d9b5710833c89d737623b7ccfc83d2 Mon Sep 17 00:00:00 2001 From: Oran Agra Date: Mon, 3 May 2021 08:33:21 +0300 Subject: [PATCH 31/75] Resolve nonsense static analysis warnings (cherry picked from commit fd7d51c353607f350c865155444bce9236f3d682) --- .../jemalloc/internal/jemalloc_internal_inlines_c.h | 2 +- src/lolwut.c | 4 ++-- src/memtest.c | 2 +- src/object.c | 2 +- src/redis-check-rdb.c | 2 +- src/redis-cli.c | 6 +++--- src/sentinel.c | 12 ++++++------ 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h index 2685802b8..b19a94207 100644 --- a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h +++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h @@ -235,7 +235,7 @@ iget_defrag_hint(tsdn_t *tsdn, void* ptr) { int free_in_slab = extent_nfree_get(slab); if (free_in_slab) { const bin_info_t *bin_info = &bin_infos[binind]; - int curslabs = bin->stats.curslabs; + unsigned long curslabs = bin->stats.curslabs; size_t curregs = bin->stats.curregs; if (bin->slabcur) { /* remove slabcur from the overall utilization */ diff --git a/src/lolwut.c b/src/lolwut.c index eebd5da6a..931f311cd 100644 --- a/src/lolwut.c +++ b/src/lolwut.c @@ -94,8 +94,8 @@ lwCanvas *lwCreateCanvas(int width, int height, int bgcolor) { lwCanvas *canvas = zmalloc(sizeof(*canvas)); canvas->width = width; canvas->height = height; - canvas->pixels = zmalloc(width*height); - memset(canvas->pixels,bgcolor,width*height); + canvas->pixels = zmalloc((size_t)width*height); + memset(canvas->pixels,bgcolor,(size_t)width*height); return canvas; } diff --git a/src/memtest.c b/src/memtest.c index cb4d35e83..bc0ac3a66 100644 --- a/src/memtest.c +++ b/src/memtest.c @@ -71,7 +71,7 @@ void memtest_progress_start(char *title, int pass) { printf("\x1b[H\x1b[2K"); /* Cursor home, clear current line. */ printf("%s [%d]\n", title, pass); /* Print title. 
*/ progress_printed = 0; - progress_full = ws.ws_col*(ws.ws_row-3); + progress_full = (size_t)ws.ws_col*(ws.ws_row-3); fflush(stdout); } diff --git a/src/object.c b/src/object.c index b75e547b9..c7b25ffd4 100644 --- a/src/object.c +++ b/src/object.c @@ -836,7 +836,7 @@ size_t objectComputeSize(robj *o, size_t sample_size) { if (samples) asize += (double)elesize/samples*dictSize(d); } else if (o->encoding == OBJ_ENCODING_INTSET) { intset *is = o->ptr; - asize = sizeof(*o)+sizeof(*is)+is->encoding*is->length; + asize = sizeof(*o)+sizeof(*is)+(size_t)is->encoding*is->length; } else { serverPanic("Unknown set encoding"); } diff --git a/src/redis-check-rdb.c b/src/redis-check-rdb.c index 4f451969a..6ddfda7ff 100644 --- a/src/redis-check-rdb.c +++ b/src/redis-check-rdb.c @@ -250,7 +250,7 @@ int redis_check_rdb(char *rdbfilename, FILE *fp) { rdbstate.doing = RDB_CHECK_DOING_READ_LEN; if ((dbid = rdbLoadLen(&rdb,NULL)) == RDB_LENERR) goto eoferr; - rdbCheckInfo("Selecting DB ID %d", dbid); + rdbCheckInfo("Selecting DB ID %llu", (unsigned long long)dbid); continue; /* Read type again. */ } else if (type == RDB_OPCODE_RESIZEDB) { /* RESIZEDB: Hint about the size of the keys in the currently diff --git a/src/redis-cli.c b/src/redis-cli.c index 9f88a9c88..81be58b17 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -5483,7 +5483,7 @@ static void clusterManagerNodeArrayReset(clusterManagerNodeArray *array) { static void clusterManagerNodeArrayShift(clusterManagerNodeArray *array, clusterManagerNode **nodeptr) { - assert(array->nodes < (array->nodes + array->len)); + assert(array->len > 0); /* If the first node to be shifted is not NULL, decrement count. */ if (*array->nodes != NULL) array->count--; /* Store the first node to be shifted into 'nodeptr'. */ @@ -5496,7 +5496,7 @@ static void clusterManagerNodeArrayShift(clusterManagerNodeArray *array, static void clusterManagerNodeArrayAdd(clusterManagerNodeArray *array, clusterManagerNode *node) { - assert(array->nodes < (array->nodes + array->len)); + assert(array->len > 0); assert(node != NULL); assert(array->count < array->len); array->nodes[array->count++] = node; @@ -6873,7 +6873,7 @@ void showLatencyDistSamples(struct distsamples *samples, long long tot) { printf("\033[38;5;0m"); /* Set foreground color to black. 
*/ for (j = 0; ; j++) { int coloridx = - ceil((float) samples[j].count / tot * (spectrum_palette_size-1)); + ceil((double) samples[j].count / tot * (spectrum_palette_size-1)); int color = spectrum_palette[coloridx]; printf("\033[48;5;%dm%c", (int)color, samples[j].character); samples[j].count = 0; diff --git a/src/sentinel.c b/src/sentinel.c index a56cd8b15..2d81d98ac 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -4119,16 +4119,16 @@ void sentinelSetCommand(client *c) { int numargs = j-old_j+1; switch(numargs) { case 2: - sentinelEvent(LL_WARNING,"+set",ri,"%@ %s %s",c->argv[old_j]->ptr, - c->argv[old_j+1]->ptr); + sentinelEvent(LL_WARNING,"+set",ri,"%@ %s %s",(char*)c->argv[old_j]->ptr, + (char*)c->argv[old_j+1]->ptr); break; case 3: - sentinelEvent(LL_WARNING,"+set",ri,"%@ %s %s %s",c->argv[old_j]->ptr, - c->argv[old_j+1]->ptr, - c->argv[old_j+2]->ptr); + sentinelEvent(LL_WARNING,"+set",ri,"%@ %s %s %s",(char*)c->argv[old_j]->ptr, + (char*)c->argv[old_j+1]->ptr, + (char*)c->argv[old_j+2]->ptr); break; default: - sentinelEvent(LL_WARNING,"+set",ri,"%@ %s",c->argv[old_j]->ptr); + sentinelEvent(LL_WARNING,"+set",ri,"%@ %s",(char*)c->argv[old_j]->ptr); break; } } From 439c356fe600b0ce29728ddca8961eb4c09414a9 Mon Sep 17 00:00:00 2001 From: Oran Agra Date: Mon, 3 May 2021 12:08:20 +0300 Subject: [PATCH 32/75] Redis 6.2.3 --- 00-RELEASENOTES | 34 ++++++++++++++++++++++++++++++++++ src/version.h | 4 ++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/00-RELEASENOTES b/00-RELEASENOTES index 8a1405e41..4f6cb9978 100644 --- a/00-RELEASENOTES +++ b/00-RELEASENOTES @@ -11,6 +11,40 @@ CRITICAL: There is a critical bug affecting MOST USERS. Upgrade ASAP. SECURITY: There are security fixes in the release. -------------------------------------------------------------------------------- +================================================================================ +Redis 6.2.3 Released Mon May 3 19:00:00 IST 2021 +================================================================================ + +Upgrade urgency: SECURITY, Contains fixes to security issues that affect +authenticated client connections. LOW otherwise. + +Integer overflow in STRALGO LCS command (CVE-2021-29477): +An integer overflow bug in Redis version 6.0 or newer could be exploited using +the STRALGO LCS command to corrupt the heap and potentially result in remote +code execution. The integer overflow bug exists in all versions of Redis +starting with 6.0. + +Integer overflow in COPY command for large intsets (CVE-2021-29478): +An integer overflow bug in Redis 6.2 could be exploited to corrupt the heap and +potentially result with remote code execution. The vulnerability involves +changing the default set-max-intset-entries configuration value, creating a +large set key that consists of integer values and using the COPY command to +duplicate it. The integer overflow bug exists in all versions of Redis starting +with 2.6, where it could result with a corrupted RDB or DUMP payload, but not +exploited through COPY (which did not exist before 6.2). 
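The root cause behind both CVEs is the same C arithmetic pitfall, and both use
the same one-line fix already shown in the intset.c and t_string.c patches
earlier in this series: widen one operand to size_t before multiplying. A
minimal standalone illustration follows (the numbers are made up, and it
assumes a typical build with 32-bit unsigned int and 64-bit size_t):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        uint32_t length = 600000000, encoding = 8;   /* illustrative values only */

        /* The product is computed in 32-bit arithmetic and wraps modulo 2^32
         * before it is widened... */
        size_t wrapped = (size_t)(length * encoding);
        /* ...whereas casting one operand first keeps the whole product in size_t. */
        size_t widened = (size_t)length * encoding;

        printf("wrapped=%zu widened=%zu\n", wrapped, widened);
        /* wrapped=505032704  widened=4800000000 */
        return 0;
    }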
+ +Bug fixes that are only applicable to previous releases of Redis 6.2: +* Fix memory leak in moduleDefragGlobals (#8853) +* Fix memory leak when doing lazy freeing client tracking table (#8822) +* Block abusive replicas from sending command that could assert and crash redis (#8868) + +Other bug fixes: +* Use a monotonic clock to check for Lua script timeout (#8812) +* redis-cli: Do not use unix socket when we got redirected in cluster mode (#8870) + +Modules: +* Fix RM_GetClusterNodeInfo() to correctly populate master id (#8846) + ================================================================================ Redis 6.2.2 Released Mon April 19 19:00:00 IST 2021 ================================================================================ diff --git a/src/version.h b/src/version.h index 3c5dc02c5..b87f2b9c3 100644 --- a/src/version.h +++ b/src/version.h @@ -1,2 +1,2 @@ -#define REDIS_VERSION "6.2.2" -#define REDIS_VERSION_NUM 0x00060202 +#define REDIS_VERSION "6.2.3" +#define REDIS_VERSION_NUM 0x00060203 From eb35d7e9ec36d73e0aa8fa2bdb0eb7bb808e4627 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 5 May 2021 16:37:02 +0000 Subject: [PATCH 33/75] Updated maxmemory tests to account for overhead in new replication backlog behaviour Former-commit-id: 4cd197959693dfe4d1497c3f703cf6aaa27d34ad --- tests/unit/maxmemory.tcl | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl index 414733d1e..23879c38a 100644 --- a/tests/unit/maxmemory.tcl +++ b/tests/unit/maxmemory.tcl @@ -33,7 +33,8 @@ start_server {tags {"maxmemory"}} { # Get the current memory limit and calculate a new limit. # We just add 100k to the current memory size so that it is # fast for us to reach that limit. - set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used+100*1024}] r config set maxmemory $limit r config set maxmemory-policy $policy @@ -42,7 +43,7 @@ start_server {tags {"maxmemory"}} { while 1 { r setex [randomKey] 10000 x incr numkeys - if {[s used_memory]+4096 > $limit} { + if {[expr {[s used_memory] - $overhead + 4096}] > $limit} { assert {$numkeys > 10} break } @@ -52,7 +53,8 @@ start_server {tags {"maxmemory"}} { for {set j 0} {$j < $numkeys} {incr j} { r setex [randomKey] 10000 x } - assert {[s used_memory] < ($limit+4096)} + set used_amt [expr [s used_memory] - $overhead] + assert {$used_amt < ($limit+4096)} } } @@ -65,7 +67,8 @@ start_server {tags {"maxmemory"}} { # Get the current memory limit and calculate a new limit. # We just add 100k to the current memory size so that it is # fast for us to reach that limit. - set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used+100*1024}] r config set maxmemory $limit r config set maxmemory-policy $policy @@ -74,7 +77,7 @@ start_server {tags {"maxmemory"}} { while 1 { r set [randomKey] x incr numkeys - if {[s used_memory]+4096 > $limit} { + if {[expr [s used_memory] - $overhead]+4096 > $limit} { assert {$numkeys > 10} break } @@ -91,7 +94,7 @@ start_server {tags {"maxmemory"}} { } } if {[string match allkeys-* $policy]} { - assert {[s used_memory] < ($limit+4096)} + assert {[expr [s used_memory] - $overhead] < ($limit+4096)} } else { assert {$err == 1} } @@ -107,7 +110,8 @@ start_server {tags {"maxmemory"}} { # Get the current memory limit and calculate a new limit. 
# We just add 100k to the current memory size so that it is # fast for us to reach that limit. - set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used+100*1024}] r config set maxmemory $limit r config set maxmemory-policy $policy @@ -121,7 +125,7 @@ start_server {tags {"maxmemory"}} { } else { r set "key:$numkeys" x } - if {[s used_memory]+4096 > $limit} { + if {[expr [s used_memory] - $overhead]+4096 > $limit} { assert {$numkeys > 10} break } @@ -135,7 +139,7 @@ start_server {tags {"maxmemory"}} { catch {r setex "foo:$j" 10000 x} } # We should still be under the limit. - assert {[s used_memory] < ($limit+4096)} + assert {[expr [s used_memory] - $overhead] < ($limit+4096)} # However all our non volatile keys should be here. for {set j 0} {$j < $numkeys} {incr j 2} { assert {[r exists "key:$j"]} @@ -284,7 +288,8 @@ start_server {tags {"maxmemory"} overrides {server-threads 1}} { # we need to make sure to evict keynames of a total size of more than # 16kb since the (PROTO_REPLY_CHUNK_BYTES), only after that the # invalidation messages have a chance to trigger further eviction. - set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used - 40000}] r config set maxmemory $limit From f6a714db2658a4b8baa924e62dd0ab2c6a7adb9f Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 5 May 2021 17:04:08 +0000 Subject: [PATCH 34/75] Updated overhead calculation to only use repl_backlog_size Former-commit-id: 6f93c7eb44d84bb143b4ad4fff3c6a5436ebaaf7 --- src/evict.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/evict.cpp b/src/evict.cpp index 36837e17d..e7f0a10ef 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -399,8 +399,8 @@ size_t freeMemoryGetNotCountedMemory(void) { /* also don't count the replication backlog memory * that's where the replication clients get their memory from */ - overhead += (g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size); - + // overhead += (g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size); + overhead += g_pserver->repl_backlog_size; if (g_pserver->aof_state != AOF_OFF) { overhead += sdsalloc(g_pserver->aof_buf)+aofRewriteBufferSize(); From 7ff2fb716a4a92bf78a06a888fec82f246889c74 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 5 May 2021 22:26:36 +0000 Subject: [PATCH 35/75] Fixed data race? 
Seems to be passing multithreaded test cases now Former-commit-id: cb13edd1200c1230fa7e313d69c69e06129951d3 --- src/networking.cpp | 2 +- src/replication.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index c51a02a1d..6f4aa6268 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1782,7 +1782,7 @@ int writeToClient(client *c, int handler_installed) { // } - if (nwritten == nrequested){ + if (nwritten == nrequested && g_pserver->repl_backlog_idx == c->repl_curr_idx){ c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ } else if (nwritten > 0) diff --git a/src/replication.cpp b/src/replication.cpp index 60f25052a..d3df6d12a 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -90,7 +90,7 @@ void resizeReplicationBacklogForClients(long long newsize); void setReplIdx(client *c, long long idx, long long off){ // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); // serverLog(LL_NOTICE, "Repl Index started at: %lld", c->repl_curr_idx); - if (c->repl_curr_idx == -1){ + if (c->repl_curr_idx == -1 && off >= c->repl_curr_off){ if (prepareClientToWrite(c) != C_OK) return; c->repl_curr_idx = idx; c->repl_curr_off = off; From a9552f9635216699d7f64aed8c969a9f93d4e500 Mon Sep 17 00:00:00 2001 From: christianEQ Date: Thu, 20 May 2021 19:09:02 +0000 Subject: [PATCH 36/75] license status OK if not checking license Former-commit-id: 6bfdc9d41dc638989d50f005af8d66e4ed47ce77 --- src/server.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/server.cpp b/src/server.cpp index bc268b4fd..cbb7654e1 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -5474,7 +5474,11 @@ sds genRedisInfoString(const char *section) { "variant:enterprise\r\n" "license_status:%s\r\n" "mvcc_depth:%d\r\n", + #ifdef NO_LICENSE_CHECK + "OK", + #else cserver.license_key ? 
"OK" : "Trial", + #endif mvcc_depth ); } From 4fd76c47911f506909e54a138bf8f72b0fea8687 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Fri, 21 May 2021 17:05:55 +0000 Subject: [PATCH 37/75] Fixed single threaded for real this time, need to add synchronization for multi threaded Former-commit-id: 4d858dac1a503f4d518477212ba585069af22574 --- src/networking.cpp | 8 +++++--- src/replication.cpp | 5 +++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 6f4aa6268..c39d8ce42 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1676,8 +1676,7 @@ int writeToClient(client *c, int handler_installed) { std::unique_locklock)> lock(c->lock); - /* if this is a write to a replica, it's coming straight from the replication backlog */ - long long repl_backlog_idx = g_pserver->repl_backlog_idx; + while(clientHasPendingReplies(c)) { if (c->bufpos > 0) { @@ -1742,6 +1741,9 @@ int writeToClient(client *c, int handler_installed) { c->transmittedRDB = true; } + /* if this is a write to a replica, it's coming straight from the replication backlog */ + long long repl_backlog_idx = g_pserver->repl_backlog_idx; + /* For replicas, we don't store all the information in the client buffer * Most of the time (aside from immediately after synchronizing), we read * from the replication backlog directly */ @@ -1782,7 +1784,7 @@ int writeToClient(client *c, int handler_installed) { // } - if (nwritten == nrequested && g_pserver->repl_backlog_idx == c->repl_curr_idx){ + if (nwritten == nrequested && g_pserver->repl_backlog_idx == repl_backlog_idx){ c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ } else if (nwritten > 0) diff --git a/src/replication.cpp b/src/replication.cpp index d3df6d12a..1d4e01289 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -3059,6 +3059,11 @@ void syncWithMaster(connection *conn) { if (psync_result == PSYNC_CONTINUE) { serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Master accepted a Partial Resynchronization."); + /* Reset the bulklen information in case it is lingering from the last connection + * The partial sync will start from the beginning of a command so these should be reset */ + mi->master->reqtype = 0; + mi->master->multibulklen = 0; + mi->master->bulklen = -1; if (cserver.supervised_mode == SUPERVISED_SYSTEMD) { redisCommunicateSystemd("STATUS=MASTER <-> REPLICA sync: Partial Resynchronization accepted. 
Ready to accept connections.\n"); redisCommunicateSystemd("READY=1\n"); From 5043fb40917beaeae185622cb38debb460edf186 Mon Sep 17 00:00:00 2001 From: malavan Date: Wed, 26 May 2021 17:01:51 +0000 Subject: [PATCH 38/75] fix leak caused by dict entry not being added to GCList Former-commit-id: d8c1b3b6ec64f63fdff04d53102e4563c2c6764a --- src/dict.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dict.cpp b/src/dict.cpp index 88ad116bb..b37b35a6a 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -673,7 +673,7 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { if (!nofree) { if (table == 0 && d->asyncdata != nullptr && (ssize_t)idx < d->rehashidx) { he->next = d->asyncdata->deGCList; - d->asyncdata->deGCList = he->next; + d->asyncdata->deGCList = he; } else { dictFreeKey(d, he); dictFreeVal(d, he); From 6080ee8f2f33fd21de8dfa9c103ba569759bc127 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 26 May 2021 20:10:33 +0000 Subject: [PATCH 39/75] Added transmitted RDB lock Former-commit-id: 4b32167afc85742d85ff9b47b2c2e0b6b02e140a --- src/networking.cpp | 13 +++++++++++-- src/replication.cpp | 15 ++++++++++----- src/server.h | 2 ++ 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index c39d8ce42..176693501 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -319,6 +319,7 @@ void _clientAsyncReplyBufferReserve(client *c, size_t len) { clientReplyBlock *replyNew = (clientReplyBlock*)zmalloc(sizeof(clientReplyBlock) + newsize); replyNew->size = zmalloc_usable(replyNew) - sizeof(clientReplyBlock); replyNew->used = 0; + std::unique_lock tRDBLock (c->transmittedRDBLock); c->replyAsync = replyNew; } @@ -332,6 +333,7 @@ int _addReplyToBuffer(client *c, const char *s, size_t len) { if (fAsync) { serverAssert(GlobalLocksAcquired()); + std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync == nullptr || (c->replyAsync->size - c->replyAsync->used) < len) { if (c->replyAsync == nullptr) { @@ -1737,9 +1739,14 @@ int writeToClient(client *c, int handler_installed) { /* If there are no more pending replies, then we have transmitted the RDB. * This means further replication commands will be taken straight from the * replication backlog from now on. 
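The dict.cpp change above fixes a classic intrusive-list push bug: writing "head = node->next" right after "node->next = head" leaves the list unchanged and the entry is never garbage collected. A self-contained sketch of the corrected push (GCNode stands in for the real dictEntry and async GC list types):

    struct GCNode {
        GCNode *next;
        /* ... payload owned until the async GC frees it ... */
    };

    /* Prepend node to the garbage-collection list so it is freed later. */
    void gcListPush(GCNode *&head, GCNode *node) {
        node->next = head;   /* link the node in front of the current head */
        head = node;         /* the node itself becomes the new head
                              * (not node->next, which would drop it) */
    }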
*/ + + std::unique_lock tRDBLock (c->transmittedRDBLock); + if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ c->transmittedRDB = true; } + bool transmittedRDB = c->transmittedRDB; + tRDBLock.unlock(); /* if this is a write to a replica, it's coming straight from the replication backlog */ long long repl_backlog_idx = g_pserver->repl_backlog_idx; @@ -1747,7 +1754,7 @@ int writeToClient(client *c, int handler_installed) { /* For replicas, we don't store all the information in the client buffer * Most of the time (aside from immediately after synchronizing), we read * from the replication backlog directly */ - if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1 && c->transmittedRDB){ + if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1 && transmittedRDB){ /* copy global variables into local scope so if they change in between we don't care */ long long repl_backlog_size = g_pserver->repl_backlog_size; long long nwrittenPart2 = 0; @@ -1874,6 +1881,7 @@ void ProcessPendingAsyncWrites() serverAssert(c->fPendingAsyncWrite); if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_CLOSE_AFTER_REPLY)) { + std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync != nullptr){ zfree(c->replyAsync); c->replyAsync = nullptr; @@ -1885,6 +1893,7 @@ void ProcessPendingAsyncWrites() /* since writes from master to replica can come directly from the replication backlog, * writes may have been signalled without having been copied to the replyAsync buffer, * thus causing the buffer to be NULL */ + std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync != nullptr){ int size = c->replyAsync->used; @@ -1905,7 +1914,7 @@ void ProcessPendingAsyncWrites() } c->fPendingAsyncWrite = FALSE; - + tRDBLock.unlock(); // Now install the write event handler int ae_flags = AE_WRITABLE|AE_WRITE_THREADSAFE; /* For the fsync=always policy, we want that a given FD is never diff --git a/src/replication.cpp b/src/replication.cpp index 1d4e01289..ad79f4887 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -441,6 +441,8 @@ void feedReplicationBacklog(const void *ptr, size_t len) { g_pserver->master_repl_offset += len; + + /* This is a circular buffer, so write as much data we can at every * iteration and rewind the "idx" index if we reach the limit. 
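The comment above describes the ring-buffer write loop that follows; a self-contained sketch of that loop, where buf, size and idx stand in for repl_backlog, repl_backlog_size and repl_backlog_idx:

    #include <cstddef>
    #include <cstring>

    /* Append len bytes at the write cursor, wrapping to the start of the
     * ring when the end of the buffer is reached. */
    void circularWrite(char *buf, long long size, long long &idx,
                       const char *p, size_t len) {
        while (len) {
            size_t thislen = (size_t)(size - idx);  /* room before the wrap point */
            if (thislen > len) thislen = len;
            memcpy(buf + idx, p, thislen);
            idx += (long long)thislen;
            if (idx == size) idx = 0;               /* rewind the cursor */
            p += thislen;
            len -= thislen;
        }
    }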
*/ while(len) { @@ -4659,11 +4661,14 @@ void flushReplBacklogToClients() #ifdef BYPASS_BUFFER - /* If we are online and the RDB has been sent, there is no need to feed the client buffer - * We will send our replies directly from the replication backlog instead */ - if (replica->replstate == SLAVE_STATE_ONLINE && replica->transmittedRDB){ - setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); - continue; + { + /* If we are online and the RDB has been sent, there is no need to feed the client buffer + * We will send our replies directly from the replication backlog instead */ + std::unique_lock tRDBLock (replica->transmittedRDBLock); + if (replica->replstate == SLAVE_STATE_ONLINE && replica->transmittedRDB){ + setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); + continue; + } } #endif if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { diff --git a/src/server.h b/src/server.h index 6c5265fbd..14005e7d5 100644 --- a/src/server.h +++ b/src/server.h @@ -1582,6 +1582,7 @@ struct client { // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); + fastlock transmittedRDBLock {"transmittedRDB"}; size_t argv_len_sum() const; }; @@ -2228,6 +2229,7 @@ struct redisServer { that is the next byte will'll write to.*/ long long repl_backlog_off; /* Replication "master offset" of first byte in the replication backlog buffer.*/ + fastlock repl_backlog_lock {"replication backlog"}; time_t repl_backlog_time_limit; /* Time without slaves after the backlog gets released. */ time_t repl_no_slaves_since; /* We have no slaves since that time. From bf120245faa6867db1b464f319ee3944c017ad28 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 27 May 2021 18:57:23 +0000 Subject: [PATCH 40/75] Added more synchronization and fixed some data races Former-commit-id: 183e015dac6f85df1c94d0761e89bc23d9f53319 --- src/multi.cpp | 2 + src/networking.cpp | 141 +++++++++++++++++++++++--------------------- src/replication.cpp | 57 ++++++++---------- src/server.cpp | 1 + src/server.h | 3 + 5 files changed, 105 insertions(+), 99 deletions(-) diff --git a/src/multi.cpp b/src/multi.cpp index 9df72383d..9fd5206fb 100644 --- a/src/multi.cpp +++ b/src/multi.cpp @@ -237,6 +237,8 @@ void execCommand(client *c) { * backlog with the final EXEC. */ if (g_pserver->repl_backlog && was_master && !is_master) { const char *execcmd = "*1\r\n$4\r\nEXEC\r\n"; + updateLowestOffsetAmongReplicas(); + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); feedReplicationBacklog(execcmd,strlen(execcmd)); } } diff --git a/src/networking.cpp b/src/networking.cpp index 176693501..caefd6d1e 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -125,6 +125,7 @@ client *createClient(connection *conn, int iel) { client_id = g_pserver->next_client_id.fetch_add(1); c->iel = iel; c->id = client_id; + sprintf(c->lock.szName, "client %lu", client_id); c->resp = 2; c->conn = conn; c->name = NULL; @@ -1677,8 +1678,7 @@ int writeToClient(client *c, int handler_installed) { serverAssertDebug(FCorrectThread(c)); std::unique_locklock)> lock(c->lock); - - + // serverLog(LL_NOTICE, "acq client"); while(clientHasPendingReplies(c)) { if (c->bufpos > 0) { @@ -1736,82 +1736,87 @@ int writeToClient(client *c, int handler_installed) { !(c->flags & CLIENT_SLAVE)) break; } - /* If there are no more pending replies, then we have transmitted the RDB. 
- * This means further replication commands will be taken straight from the - * replication backlog from now on. */ + /* We can only directly read from the replication backlog if the client + is a replica, so only attempt to do so if that's the case. */ + if (c->flags & CLIENT_SLAVE) { + /* If there are no more pending replies, then we have transmitted the RDB. + * This means further replication commands will be taken straight from the + * replication backlog from now on. */ + std::unique_lock tRDBLock (c->transmittedRDBLock); - std::unique_lock tRDBLock (c->transmittedRDBLock); + if (c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ + c->transmittedRDB = true; + } + bool transmittedRDB = c->transmittedRDB; + tRDBLock.unlock(); - if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ - c->transmittedRDB = true; - } - bool transmittedRDB = c->transmittedRDB; - tRDBLock.unlock(); + /* For replicas, we don't store all the information in the client buffer + * Most of the time (aside from immediately after synchronizing), we read + * from the replication backlog directly */ + if (c->repl_curr_idx != -1 && transmittedRDB){ + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); - /* if this is a write to a replica, it's coming straight from the replication backlog */ - long long repl_backlog_idx = g_pserver->repl_backlog_idx; + /* copy global variables into local scope so if they change in between we don't care */ + long long repl_backlog_idx = g_pserver->repl_backlog_idx; + long long repl_backlog_size = g_pserver->repl_backlog_size; + long long nwrittenPart2 = 0; - /* For replicas, we don't store all the information in the client buffer - * Most of the time (aside from immediately after synchronizing), we read - * from the replication backlog directly */ - if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1 && transmittedRDB){ - /* copy global variables into local scope so if they change in between we don't care */ - long long repl_backlog_size = g_pserver->repl_backlog_size; - long long nwrittenPart2 = 0; + ssize_t nrequested; /* The number of bytes requested to write */ + /* normal case with no wrap around */ + if (repl_backlog_idx >= c->repl_curr_idx){ + nrequested = repl_backlog_idx - c->repl_curr_idx; + nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_idx - c->repl_curr_idx); + /* wrap around case, v. rare */ + /* also v. buggy so there's that */ + } else { + nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; + nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); + /* only attempt wrapping if we write the correct number of bytes */ + if (nwritten == repl_backlog_size - c->repl_curr_idx){ + long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); + if (nwrittenPart2 != -1) + nwritten += nwrittenPart2; - ssize_t nrequested; /* The number of bytes requested to write */ - /* normal case with no wrap around */ - if (repl_backlog_idx >= c->repl_curr_idx){ - nrequested = repl_backlog_idx - c->repl_curr_idx; - nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_idx - c->repl_curr_idx); - /* wrap around case, v. rare */ - /* also v. 
buggy so there's that */ - } else { - nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; - nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); - /* only attempt wrapping if we write the correct number of bytes */ - if (nwritten == repl_backlog_size - c->repl_curr_idx){ - long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); - if (nwrittenPart2 != -1) - nwritten += nwrittenPart2; + } + } - } + /* only update the replica's current index if bytes were sent */ + + // if (nrequested != nwritten){ + // serverLog(LL_NOTICE, "-----------------------------------------"); + // serverLog(LL_NOTICE, "AFTER THE FACT"); + // serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + // serverLog(LL_NOTICE, "actually written: %ld", nwritten); + // serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + // serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + // serverLog(LL_NOTICE, "-----------------------------------------"); + // } + + + if (nwritten == nrequested && g_pserver->repl_backlog_idx == repl_backlog_idx){ + c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ + } + else if (nwritten > 0) + c->repl_curr_idx = (c->repl_curr_idx + nwritten) % repl_backlog_size; + + serverAssert(c->repl_curr_idx < repl_backlog_size); + + /* only increment bytes if an error didn't occur */ + if (nwritten > 0){ + totwritten += nwritten; + c->repl_curr_off += nwritten; + } + + /* If the second part of a write didn't go through, we still need to register that */ + if (nwrittenPart2 == -1) nwritten = -1; } - /* only update the replica's current index if bytes were sent */ + if (c->flags & CLIENT_SLAVE && handler_installed) + serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); - // if (nrequested != nwritten){ - // serverLog(LL_NOTICE, "-----------------------------------------"); - // serverLog(LL_NOTICE, "AFTER THE FACT"); - // serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - // serverLog(LL_NOTICE, "actually written: %ld", nwritten); - // serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - // serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - // serverLog(LL_NOTICE, "-----------------------------------------"); - // } - - - if (nwritten == nrequested && g_pserver->repl_backlog_idx == repl_backlog_idx){ - c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ - } - else if (nwritten > 0) - c->repl_curr_idx = (c->repl_curr_idx + nwritten) % repl_backlog_size; - - serverAssert(c->repl_curr_idx < repl_backlog_size); - - /* only increment bytes if an error didn't occur */ - if (nwritten > 0){ - totwritten += nwritten; - c->repl_curr_off += nwritten; - } - - /* If the second part of a write didn't go through, we still need to register that */ - if (nwrittenPart2 == -1) nwritten = -1; } - if (c->flags & CLIENT_SLAVE && handler_installed) - serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); - + // serverLog(LL_NOTICE, "rel client"); g_pserver->stat_net_output_bytes += totwritten; if (nwritten == -1) { if 
(connGetState(c->conn) == CONN_STATE_CONNECTED) { @@ -1834,7 +1839,7 @@ int writeToClient(client *c, int handler_installed) { if (!clientHasPendingReplies(c) && c->repl_curr_idx == -1) { if(c->flags & CLIENT_SLAVE && handler_installed){ serverLog(LL_NOTICE, "Uninstalling handler"); - serverLog(LL_NOTICE, "handler repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "handler repl_curr_idx: %lld, repl_backlog_size: %lld", c->repl_curr_idx, g_pserver->repl_backlog_size); serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); } c->sentlen = 0; diff --git a/src/replication.cpp b/src/replication.cpp index ad79f4887..d1181bdf4 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -56,9 +56,11 @@ void putSlaveOnline(client *replica); int cancelReplicationHandshake(redisMaster *mi); static void propagateMasterStaleKeys(); -/* gets the lowest offset amongst all of the replicas */ -long long getLowestOffsetAmongReplicas(){ +/* gets the lowest offset amongst all of the replicas and stores it globally*/ +void updateLowestOffsetAmongReplicas(){ serverAssert(GlobalLocksAcquired()); + serverAssert(!g_pserver->repl_backlog_lock.fOwnLock()); + // serverLog(LL_NOTICE, "off- have repl"); long long min_offset = LONG_LONG_MAX; listIter li; listNode *ln; @@ -69,16 +71,15 @@ long long getLowestOffsetAmongReplicas(){ if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; - if (replica->repl_curr_idx == -1) continue; - std::unique_lock ul(replica->lock, std::defer_lock); - if (FCorrectThread(replica)) - ul.lock(); + std::unique_lock ul(replica->lock); + // serverLog(LL_NOTICE, "off- acq client"); - min_offset = std::min(min_offset, replica->repl_curr_off); + min_offset = std::min(min_offset, replica->repl_curr_off); + // serverLog(LL_NOTICE, "off- rel client"); } /* return -1 if no other minimum was found */ - return min_offset == LONG_LONG_MAX ? -1 : min_offset; + g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? -1 : min_offset, std::memory_order_seq_cst); } /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case @@ -412,11 +413,12 @@ void freeReplicationBacklog(void) { * the backlog without incrementing the offset. */ void feedReplicationBacklog(const void *ptr, size_t len) { serverAssert(GlobalLocksAcquired()); + serverAssert(g_pserver->repl_backlog_lock.fOwnLock()); const unsigned char *p = (const unsigned char*)ptr; if (g_pserver->repl_batch_idxStart >= 0) { /* we are lower bounded by the lower client offset or the offStart if all the clients are up to date */ - long long lower_bound = getLowestOffsetAmongReplicas(); + long long lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); if (lower_bound == -1) lower_bound = g_pserver->repl_batch_offStart; long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; @@ -441,10 +443,9 @@ void feedReplicationBacklog(const void *ptr, size_t len) { g_pserver->master_repl_offset += len; - - /* This is a circular buffer, so write as much data we can at every * iteration and rewind the "idx" index if we reach the limit. 
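updateLowestOffsetAmongReplicas above publishes the minimum un-acknowledged replica offset through an atomic so feedReplicationBacklog can size the backlog without re-walking the client list. A self-contained sketch of that bookkeeping (the vector of offsets is a stand-in for iterating g_pserver->slaves under each client lock; names follow the patch):

    #include <algorithm>
    #include <atomic>
    #include <climits>
    #include <vector>

    std::atomic<long long> repl_lowest_off {-1};   /* -1: no replica tracked yet */

    void updateLowestOffset(const std::vector<long long> &replicaOffsets) {
        long long min_off = LLONG_MAX;
        for (long long off : replicaOffsets)
            if (off != -1)                          /* skip replicas with no cursor */
                min_off = std::min(min_off, off);
        repl_lowest_off.store(min_off == LLONG_MAX ? -1 : min_off,
                              std::memory_order_seq_cst);
    }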
*/ + while(len) { size_t thislen = g_pserver->repl_backlog_size - g_pserver->repl_backlog_idx; if (thislen > len) thislen = len; @@ -598,6 +599,8 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) serverAssert(!(listLength(slaves) != 0 && g_pserver->repl_backlog == NULL)); bool fSendRaw = !g_pserver->fActiveReplica; + updateLowestOffsetAmongReplicas(); + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); /* Send SELECT command to every replica if needed. */ if (g_pserver->replicaseldb != dictid) { @@ -619,7 +622,9 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) /* Add the SELECT command into the backlog. */ /* We don't do this for advanced replication because this will be done later when it adds the whole RREPLAY command */ - if (g_pserver->repl_backlog && fSendRaw) feedReplicationBacklogWithObject(selectcmd); + if (g_pserver->repl_backlog && fSendRaw) { + feedReplicationBacklogWithObject(selectcmd); + } if (dictid < 0 || dictid >= PROTO_SHARED_SELECT_CMDS) decrRefCount(selectcmd); @@ -632,7 +637,6 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) if (fSendRaw) { char aux[LONG_STR_SIZE+3]; - /* Add the multi bulk reply length. */ aux[0] = '*'; int multilen = ll2string(aux+1,sizeof(aux)-1,argc); @@ -759,7 +763,11 @@ void replicationFeedSlavesFromMasterStream(char *buf, size_t buflen) { printf("\n"); } - if (g_pserver->repl_backlog) feedReplicationBacklog(buf,buflen); + if (g_pserver->repl_backlog){ + updateLowestOffsetAmongReplicas(); + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + feedReplicationBacklog(buf,buflen); + } } void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, int argc) { @@ -4662,6 +4670,9 @@ void flushReplBacklogToClients() #ifdef BYPASS_BUFFER { + std::unique_lock asyncUl(replica->lock, std::defer_lock); + if (!FCorrectThread(replica)) + asyncUl.lock(); /* If we are online and the RDB has been sent, there is no need to feed the client buffer * We will send our replies directly from the replication backlog instead */ std::unique_lock tRDBLock (replica->transmittedRDBLock); @@ -4694,21 +4705,5 @@ void flushReplBacklogToClients() // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; - } else if (getLowestOffsetAmongReplicas() != -1){ - listIter li; - listNode *ln; - listRewind(g_pserver->slaves, &li); - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - - std::unique_lock ul(replica->lock, std::defer_lock); - if (FCorrectThread(replica)) - ul.lock(); - - /* try to force prepare client to write i guess? 
*/ - if (replica->repl_curr_idx != -1){ - if (prepareClientToWrite(replica) != C_OK) continue; - } - } - } + } } diff --git a/src/server.cpp b/src/server.cpp index 9664a4a6b..439e1aeff 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2924,6 +2924,7 @@ void initServerConfig(void) { g_pserver->enable_multimaster = CONFIG_DEFAULT_ENABLE_MULTIMASTER; g_pserver->repl_syncio_timeout = CONFIG_REPL_SYNCIO_TIMEOUT; g_pserver->master_repl_offset = 0; + g_pserver->repl_lowest_off.store(-1, std::memory_order_seq_cst); /* Replication partial resync backlog */ g_pserver->repl_backlog = NULL; diff --git a/src/server.h b/src/server.h index 14005e7d5..da1fce52e 100644 --- a/src/server.h +++ b/src/server.h @@ -2241,6 +2241,8 @@ struct redisServer { int repl_diskless_load; /* Slave parse RDB directly from the socket. * see REPL_DISKLESS_LOAD_* enum */ int repl_diskless_sync_delay; /* Delay to start a diskless repl BGSAVE. */ + std::atomic repl_lowest_off; /* The lowest offset amongst all clients + Updated before calls to feed the replication backlog */ /* Replication (replica) */ list *masters; int enable_multimaster; @@ -2838,6 +2840,7 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, void rdbPipeWriteHandlerConnRemoved(struct connection *conn); void replicationNotifyLoadedKey(redisDb *db, robj_roptr key, robj_roptr val, long long expire); void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long long expire); +void updateLowestOffsetAmongReplicas(void); /* Generic persistence functions */ void startLoadingFile(FILE* fp, const char * filename, int rdbflags); From 982067b16a3162d2c26dcd11a4d5db662dacaf47 Mon Sep 17 00:00:00 2001 From: christianEQ Date: Wed, 2 Jun 2021 15:02:27 +0000 Subject: [PATCH 41/75] fixed code style for ifdef Former-commit-id: 93c41fa31c91098af98d2bc0362eb29685cd6678 --- src/server.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/server.cpp b/src/server.cpp index cbb7654e1..1424ffbd1 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -5474,11 +5474,11 @@ sds genRedisInfoString(const char *section) { "variant:enterprise\r\n" "license_status:%s\r\n" "mvcc_depth:%d\r\n", - #ifdef NO_LICENSE_CHECK +#ifdef NO_LICENSE_CHECK "OK", - #else +#else cserver.license_key ? 
"OK" : "Trial", - #endif +#endif mvcc_depth ); } From 2a6848a65a513926d3da6608d334351ed6878089 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 2 Jun 2021 23:41:36 +0000 Subject: [PATCH 42/75] Sync works single threaded properly, passes all but one testcase (which hangs) Former-commit-id: 9a6ca3a5d906b9d87fe70652d218decbb2775ac1 --- src/Makefile | 2 +- src/networking.cpp | 165 ++++++++++++++++++++++++++------------------ src/replication.cpp | 106 +++++++++++----------------- src/server.h | 9 +-- 4 files changed, 145 insertions(+), 137 deletions(-) diff --git a/src/Makefile b/src/Makefile index 966ce4400..a0ee5fe2a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -15,7 +15,7 @@ release_hdr := $(shell sh -c './mkreleasehdr.sh') uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not') -OPTIMIZATION?=-O2 -flto +OPTIMIZATION?=-O2 DEPENDENCY_TARGETS=hiredis linenoise lua rocksdb NODEPS:=clean distclean diff --git a/src/networking.cpp b/src/networking.cpp index caefd6d1e..80120d0ca 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -146,6 +146,7 @@ client *createClient(connection *conn, int iel) { c->flags = 0; c->fPendingAsyncWrite = FALSE; c->fPendingAsyncWriteHandler = FALSE; + c->fPendingReplicaWrite = FALSE; c->ctime = c->lastinteraction = g_pserver->unixtime; /* If the default user does not require authentication, the user is * directly authenticated. */ @@ -221,6 +222,10 @@ void clientInstallWriteHandler(client *c) { /* Schedule the client to write the output buffers to the socket only * if not already done and, for slaves, if the replica can actually receive * writes at this stage. */ + + if (c->flags & CLIENT_SLAVE) + serverLog(LL_NOTICE, "installing write handler"); + if (!(c->flags & CLIENT_PENDING_WRITE) && (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) @@ -272,6 +277,9 @@ void clientInstallAsyncWriteHandler(client *c) { int prepareClientToWrite(client *c) { bool fAsync = !FCorrectThread(c); // Not async if we're on the right thread + if (c->flags & CLIENT_SLAVE) + serverLog(LL_NOTICE, "got into prepareClientToWrite"); + if (!fAsync) { serverAssert(c->conn == nullptr || c->lock.fOwnLock()); } else { @@ -302,7 +310,7 @@ int prepareClientToWrite(client *c) { /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ - if (!fAsync && !clientHasPendingReplies(c) && c->repl_curr_idx == -1) clientInstallWriteHandler(c); + if (!fAsync && !clientHasPendingReplies(c) && !c->fPendingReplicaWrite) clientInstallWriteHandler(c); if (fAsync && !(c->fPendingAsyncWrite)) clientInstallAsyncWriteHandler(c); /* Authorize the caller to queue in the output buffer of this client. 
*/ @@ -320,7 +328,6 @@ void _clientAsyncReplyBufferReserve(client *c, size_t len) { clientReplyBlock *replyNew = (clientReplyBlock*)zmalloc(sizeof(clientReplyBlock) + newsize); replyNew->size = zmalloc_usable(replyNew) - sizeof(clientReplyBlock); replyNew->used = 0; - std::unique_lock tRDBLock (c->transmittedRDBLock); c->replyAsync = replyNew; } @@ -334,7 +341,6 @@ int _addReplyToBuffer(client *c, const char *s, size_t len) { if (fAsync) { serverAssert(GlobalLocksAcquired()); - std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync == nullptr || (c->replyAsync->size - c->replyAsync->used) < len) { if (c->replyAsync == nullptr) { @@ -1661,6 +1667,16 @@ client *lookupClientByID(uint64_t id) { return (c == raxNotFound) ? NULL : c; } +/* Compute the corresponding index from a replication backlog offset + * by taking the distance between the input offset and the replication backlog offset + * and applying that to the replication backlog index, wrapping around if the index + * becomes negative. + * TODO: Rewrite comment for new logic */ +long long getReplIndexFromOffset(long long offset){ + long long index = (offset - g_pserver->repl_backlog_start) % g_pserver->repl_backlog_size; + return index; +} + /* Write data in output buffers to client. Return C_OK if the client * is still valid after the call, C_ERR if it was freed because of some * error. If handler_installed is set, it will attempt to clear the @@ -1680,7 +1696,11 @@ int writeToClient(client *c, int handler_installed) { std::unique_locklock)> lock(c->lock); // serverLog(LL_NOTICE, "acq client"); + if (c->flags & CLIENT_SLAVE) + serverLog(LL_NOTICE, "writeToClient has happened"); + while(clientHasPendingReplies(c)) { + serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR); if (c->bufpos > 0) { nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); if (nwritten <= 0) break; @@ -1739,80 +1759,67 @@ int writeToClient(client *c, int handler_installed) { /* We can only directly read from the replication backlog if the client is a replica, so only attempt to do so if that's the case. */ if (c->flags & CLIENT_SLAVE) { - /* If there are no more pending replies, then we have transmitted the RDB. - * This means further replication commands will be taken straight from the - * replication backlog from now on. */ - std::unique_lock tRDBLock (c->transmittedRDBLock); - - if (c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ - c->transmittedRDB = true; - } - bool transmittedRDB = c->transmittedRDB; - tRDBLock.unlock(); - /* For replicas, we don't store all the information in the client buffer - * Most of the time (aside from immediately after synchronizing), we read - * from the replication backlog directly */ - if (c->repl_curr_idx != -1 && transmittedRDB){ - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + * We always read from the replication backlog directly */ + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); - /* copy global variables into local scope so if they change in between we don't care */ - long long repl_backlog_idx = g_pserver->repl_backlog_idx; - long long repl_backlog_size = g_pserver->repl_backlog_size; + /* Right now, we're bringing in the offStart into the scope + * If repl_batch_offStart is equal to -1, that means the mechanism is disabled + * which implies there is no data to flush and that the global offset is accurate */ + long long offStart = g_pserver->repl_batch_offStart == -1 ? 
g_pserver->master_repl_offset : g_pserver->repl_batch_offStart; + long long idxStart = getReplIndexFromOffset(offStart); + if (g_pserver->repl_batch_offStart != -1) + serverAssert(idxStart == g_pserver->repl_batch_idxStart); + else + serverAssert(idxStart == g_pserver->repl_backlog_idx); + + if (c->repl_curr_off != -1 && c->repl_curr_off != offStart){ + serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", + c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); + + long long curr_idx = getReplIndexFromOffset(c->repl_curr_off); long long nwrittenPart2 = 0; - - ssize_t nrequested; /* The number of bytes requested to write */ /* normal case with no wrap around */ - if (repl_backlog_idx >= c->repl_curr_idx){ - nrequested = repl_backlog_idx - c->repl_curr_idx; - nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_idx - c->repl_curr_idx); + if (idxStart >= curr_idx){ + nwritten = connWrite(c->conn, g_pserver->repl_backlog + curr_idx, idxStart - curr_idx); /* wrap around case, v. rare */ /* also v. buggy so there's that */ } else { - nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; - nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); + serverLog(LL_NOTICE, "ROAD OF RESISTANCE"); + nwritten = connWrite(c->conn, g_pserver->repl_backlog + curr_idx, g_pserver->repl_backlog_size - curr_idx); /* only attempt wrapping if we write the correct number of bytes */ - if (nwritten == repl_backlog_size - c->repl_curr_idx){ - long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); + if (nwritten == g_pserver->repl_backlog_size - curr_idx){ + long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, idxStart); if (nwrittenPart2 != -1) nwritten += nwrittenPart2; - } } - /* only update the replica's current index if bytes were sent */ - - // if (nrequested != nwritten){ - // serverLog(LL_NOTICE, "-----------------------------------------"); - // serverLog(LL_NOTICE, "AFTER THE FACT"); - // serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - // serverLog(LL_NOTICE, "actually written: %ld", nwritten); - // serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - // serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - // serverLog(LL_NOTICE, "-----------------------------------------"); - // } - - - if (nwritten == nrequested && g_pserver->repl_backlog_idx == repl_backlog_idx){ - c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ - } - else if (nwritten > 0) - c->repl_curr_idx = (c->repl_curr_idx + nwritten) % repl_backlog_size; - - serverAssert(c->repl_curr_idx < repl_backlog_size); - /* only increment bytes if an error didn't occur */ if (nwritten > 0){ totwritten += nwritten; c->repl_curr_off += nwritten; + if (1){ + serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", + c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); + } + serverAssert(c->repl_curr_off <= offStart); + /* If the client offset matches the global offset, we wrote all we needed to, + * in which case, there is no pending write */ + if (c->repl_curr_off == offStart){ + 
serverLog(LL_NOTICE, "good, %lld", offStart); + c->fPendingReplicaWrite = false; + } else { + serverLog(LL_NOTICE, "mismatch between repl_curr_off (%lld) and offStart (%lld)", c->repl_curr_off, offStart); + } } /* If the second part of a write didn't go through, we still need to register that */ if (nwrittenPart2 == -1) nwritten = -1; } - if (c->flags & CLIENT_SLAVE && handler_installed) - serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); + // if (c->flags & CLIENT_SLAVE && handler_installed) + // serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); } @@ -1836,12 +1843,12 @@ int writeToClient(client *c, int handler_installed) { * We just rely on data / pings received for timeout detection. */ if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } - if (!clientHasPendingReplies(c) && c->repl_curr_idx == -1) { - if(c->flags & CLIENT_SLAVE && handler_installed){ - serverLog(LL_NOTICE, "Uninstalling handler"); - serverLog(LL_NOTICE, "handler repl_curr_idx: %lld, repl_backlog_size: %lld", c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - } + if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) { + // if(c->flags & CLIENT_SLAVE && handler_installed){ + // serverLog(LL_NOTICE, "Uninstalling handler"); + // serverLog(LL_NOTICE, "handler repl_curr_idx: %lld, repl_backlog_size: %lld", c->repl_curr_idx, g_pserver->repl_backlog_size); + // serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + // } c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -1857,7 +1864,7 @@ int writeToClient(client *c, int handler_installed) { /* Write event handler. Just send data to the client. */ void sendReplyToClient(connection *conn) { client *c = (client*)connGetPrivateData(conn); - serverLog(LL_NOTICE, "called the sendreplytoclient"); + // serverLog(LL_NOTICE, "called the sendreplytoclient"); if (writeToClient(c,1) == C_ERR) { AeLocker ae; @@ -1886,7 +1893,6 @@ void ProcessPendingAsyncWrites() serverAssert(c->fPendingAsyncWrite); if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_CLOSE_AFTER_REPLY)) { - std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync != nullptr){ zfree(c->replyAsync); c->replyAsync = nullptr; @@ -1898,7 +1904,6 @@ void ProcessPendingAsyncWrites() /* since writes from master to replica can come directly from the replication backlog, * writes may have been signalled without having been copied to the replyAsync buffer, * thus causing the buffer to be NULL */ - std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync != nullptr){ int size = c->replyAsync->used; @@ -1919,7 +1924,6 @@ void ProcessPendingAsyncWrites() } c->fPendingAsyncWrite = FALSE; - tRDBLock.unlock(); // Now install the write event handler int ae_flags = AE_WRITABLE|AE_WRITE_THREADSAFE; /* For the fsync=always policy, we want that a given FD is never @@ -2032,8 +2036,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. 
*/ - if (clientHasPendingReplies(c) || c->repl_curr_idx != -1) { - serverLog(LL_NOTICE, "Setting a write handler for later"); + if (clientHasPendingReplies(c) || c->fPendingReplicaWrite) { if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) { freeClientAsync(c); } @@ -2214,6 +2217,34 @@ static void setProtocolError(const char *errstr, client *c) { c->flags |= (CLIENT_CLOSE_AFTER_REPLY|CLIENT_PROTOCOL_ERROR); } +static void printQueryBuffer(client *c) { + if (cserver.verbosity <= LL_VERBOSE || c->flags & CLIENT_MASTER) { + sds client = catClientInfoString(sdsempty(),c); + + /* Sample some protocol to given an idea about what was inside. */ + char buf[PROTO_DUMP_LEN*2]; + if (sdslen(c->querybuf)-c->qb_pos < PROTO_DUMP_LEN) { + snprintf(buf,sizeof(buf),"%s", c->querybuf+c->qb_pos); + } else { + snprintf(buf,sizeof(buf),"%.*s (... more %zu bytes ...) %.*s", PROTO_DUMP_LEN/2, c->querybuf+c->qb_pos, sdslen(c->querybuf)-c->qb_pos-PROTO_DUMP_LEN, PROTO_DUMP_LEN/2, c->querybuf+sdslen(c->querybuf)-PROTO_DUMP_LEN/2); + } + + /* Remove non printable chars. */ + char *p = buf; + while (*p != '\0') { + if (!isprint(*p)) *p = '.'; + p++; + } + + /* Log all the client and protocol info. */ + int loglevel = (c->flags & CLIENT_MASTER) ? LL_WARNING : + LL_VERBOSE; + serverLog(loglevel, + "Query buffer from client %lu: %s. %s", c->id, client, buf); + sdsfree(client); + } +} + /* Process the query buffer for client 'c', setting up the client argument * vector for command execution. Returns C_OK if after running the function * the client has a well-formed ready to be processed command, otherwise @@ -2468,6 +2499,8 @@ void parseClientCommandBuffer(client *c) { } size_t cqueriesStart = c->vecqueuedcmd.size(); + // if (c->flags & CLIENT_MASTER) + // printQueryBuffer(c); if (c->reqtype == PROTO_REQ_INLINE) { if (processInlineBuffer(c) != C_OK) break; } else if (c->reqtype == PROTO_REQ_MULTIBULK) { diff --git a/src/replication.cpp b/src/replication.cpp index d1181bdf4..97638e833 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -88,18 +88,6 @@ int RDBGeneratedByReplication = 0; void resizeReplicationBacklogForClients(long long newsize); -void setReplIdx(client *c, long long idx, long long off){ - // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); - // serverLog(LL_NOTICE, "Repl Index started at: %lld", c->repl_curr_idx); - if (c->repl_curr_idx == -1 && off >= c->repl_curr_off){ - if (prepareClientToWrite(c) != C_OK) return; - c->repl_curr_idx = idx; - c->repl_curr_off = off; - } - // serverLog(LL_NOTICE, "Repl Index has become: %lld", c->repl_curr_idx); - -} - /* --------------------------- Utility functions ---------------------------- */ /* Return the pointer to a string representing the replica ip:listening_port @@ -232,6 +220,7 @@ void createReplicationBacklog(void) { g_pserver->repl_backlog = (char*)zmalloc(g_pserver->repl_backlog_size, MALLOC_LOCAL); g_pserver->repl_backlog_histlen = 0; g_pserver->repl_backlog_idx = 0; + g_pserver->repl_backlog_start = g_pserver->master_repl_offset; /* We don't have any data inside our buffer, but virtually the first * byte we have is the next byte that will be generated for the @@ -284,6 +273,7 @@ void resizeReplicationBacklog(long long newsize) { g_pserver->repl_backlog = backlog; g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; g_pserver->repl_batch_idxStart = 0; + g_pserver->repl_backlog_start = g_pserver->master_repl_offset; } else { 
zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = (char*)zmalloc(newsize); @@ -296,6 +286,7 @@ void resizeReplicationBacklog(long long newsize) { g_pserver->repl_backlog_size = newsize; } +long long getReplIndexFromOffset(long long offset); /* The above but for when clients need extra replication backlog because ??? */ void resizeReplicationBacklogForClients(long long newsize) { @@ -305,32 +296,8 @@ void resizeReplicationBacklogForClients(long long newsize) { serverLog(LL_NOTICE, "WE HAVE TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); /* get the critical client size, i.e. the size of the data unflushed to clients */ - long long earliest_off = LONG_LONG_MAX; - long long earliest_idx = -1; - listIter li; - listNode *ln; - listRewind(g_pserver->slaves, &li); - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - if (replica->repl_curr_off != -1 && replica->repl_curr_off < earliest_off){ - earliest_off = replica->repl_curr_off; - earliest_idx = replica->repl_curr_idx; - } - serverLog(LL_NOTICE, "repl_curr_idx: %lld, earlistidx: %lld", replica->repl_curr_idx, earliest_idx); - } - serverLog(LL_NOTICE, "We are starting with: master_repl_offset: %lld, repl_batch_offStart: %lld, earliest_off: %lld, " - "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, earliest_idx: %lld, repl_backlog_size: %lld", - g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, earliest_off, - g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, earliest_idx, g_pserver->repl_backlog_size - ); + long long earliest_off = g_pserver->repl_lowest_off.load(); - long long new_off = 0, new_idx = 0; - - /* if no earliest offset is found amongst the clients, they are all up to date with the flushed index */ - if (earliest_off == LONG_LONG_MAX && earliest_idx == -1){ - earliest_idx = g_pserver->repl_batch_idxStart; - earliest_off = g_pserver->repl_batch_offStart; - } if (g_pserver->repl_backlog != NULL) { /* What we actually do is to flush the old buffer and realloc a new @@ -339,17 +306,18 @@ void resizeReplicationBacklogForClients(long long newsize) { * worse often we need to alloc additional space before freeing the * old buffer. 
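resizeReplicationBacklogForClients above must carry the still-unsent bytes into the new allocation. A standalone sketch of that copy, assuming the hot region runs from earliest_idx up to the write cursor idx and that newsize is at least as large as that region (resizeRing is a name invented for illustration):

    #include <cstdlib>
    #include <cstring>

    /* Copy the live tail of the old ring into a freshly allocated buffer.
     * The tail may wrap, in which case it is copied in two phases. */
    char *resizeRing(char *oldbuf, long long oldsize,
                     long long idx, long long earliest_idx, long long newsize) {
        char *nb = (char *)malloc(newsize);
        if (idx >= earliest_idx) {
            memcpy(nb, oldbuf + earliest_idx, idx - earliest_idx);
        } else {
            long long phase1 = oldsize - earliest_idx;  /* up to the end of the ring */
            memcpy(nb, oldbuf + earliest_idx, phase1);
            memcpy(nb + phase1, oldbuf, idx);           /* wrapped remainder */
        }
        free(oldbuf);
        return nb;   /* caller resets its cursor to the end of the copied data */
    }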
*/ - if (earliest_idx >= 0) { + if (earliest_off != -1) { // We need to keep critical data so we can't shrink less than the hot data in the buffer newsize = std::max(newsize, g_pserver->master_repl_offset - earliest_off); char *backlog = (char*)zmalloc(newsize); g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - earliest_off; + long long earliest_idx = getReplIndexFromOffset(earliest_off); if (g_pserver->repl_backlog_idx >= earliest_idx) { auto cbActiveBacklog = g_pserver->repl_backlog_idx - earliest_idx; memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbActiveBacklog); - serverLog(LL_NOTICE, "g_pserver->master_repl_offset: %lld, earliest_off: %lld, g_pserver->repl_backlog_idx: %lld, earliest_idx: %lld", - g_pserver->master_repl_offset, earliest_off, g_pserver->repl_backlog_idx, earliest_idx); + serverLog(LL_NOTICE, "g_pserver->master_repl_offset: %lld, earliest_off: %lld, g_pserver->repl_backlog_idx: %lld, earliest_idx: %lld, repl_backlog_start: %lld", + g_pserver->master_repl_offset, earliest_off, g_pserver->repl_backlog_idx, earliest_idx, g_pserver->repl_backlog_start); serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); } else { auto cbPhase1 = g_pserver->repl_backlog_size - earliest_idx; @@ -361,20 +329,10 @@ void resizeReplicationBacklogForClients(long long newsize) { zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = backlog; g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; - listRewind(g_pserver->slaves, &li); - /* Go through the clients and update their replication indicies */ - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - if (replica->repl_curr_idx != -1){ - replica->repl_curr_idx -= earliest_idx; - if (replica->repl_curr_idx < 0) - replica->repl_curr_idx += g_pserver->repl_backlog_size; - } - new_idx = replica->repl_curr_idx; - } g_pserver->repl_batch_idxStart -= earliest_idx; if (g_pserver->repl_batch_idxStart < 0) g_pserver->repl_batch_idxStart += g_pserver->repl_backlog_size; + g_pserver->repl_backlog_start = earliest_off; } else { zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = (char*)zmalloc(newsize); @@ -382,14 +340,15 @@ void resizeReplicationBacklogForClients(long long newsize) { g_pserver->repl_backlog_idx = 0; /* Next byte we have is... the next since the buffer is empty. 
*/ g_pserver->repl_backlog_off = g_pserver->master_repl_offset+1; + g_pserver->repl_backlog_start = g_pserver->master_repl_offset; } } g_pserver->repl_backlog_size = newsize; serverLog(LL_NOTICE, "We are ending with: master_repl_offset: %lld, repl_batch_offStart: %lld, new_off: %lld, " "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, new_idx: %lld, repl_backlog_size: %lld", - g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, new_off, - g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, new_idx, g_pserver->repl_backlog_size + g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, 0LL, + g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, 0LL, g_pserver->repl_backlog_size ); } @@ -456,11 +415,6 @@ void feedReplicationBacklog(const void *ptr, size_t len) { len -= thislen; p += thislen; g_pserver->repl_backlog_histlen += thislen; - // serverLog(LL_NOTICE, "Pt2 intermediate with: master_repl_offset: %lld, repl_batch_offStart: %lld, " - // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", - // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, - // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size - // ); } if (g_pserver->repl_backlog_histlen > g_pserver->repl_backlog_size) g_pserver->repl_backlog_histlen = g_pserver->repl_backlog_size; @@ -722,7 +676,7 @@ void replicationFeedSlaves(list *replicas, int dictid, robj **argv, int argc) { void showLatestBacklog(void) { if (g_pserver->repl_backlog == NULL) return; - long long dumplen = 256; + long long dumplen = 1024; if (g_pserver->repl_backlog_histlen < dumplen) dumplen = g_pserver->repl_backlog_histlen; @@ -813,7 +767,7 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, } decrRefCount(cmdobj); } - +#define BYPASS_PSYNC /* Feed the replica 'c' with the replication backlog starting from the * specified 'offset' up to the end of the backlog. */ long long addReplyReplicationBacklog(client *c, long long offset) { @@ -854,7 +808,8 @@ long long addReplyReplicationBacklog(client *c, long long offset) { len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); #ifdef BYPASS_PSYNC - setReplIdx(c, j, offset); + c->repl_curr_off = offset - 1; + serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", c->id, getClientPeerId(c), c->repl_curr_off); #else while(len) { long long thislen = @@ -900,6 +855,11 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) { replica->psync_initial_offset = offset; replica->replstate = SLAVE_STATE_WAIT_BGSAVE_END; + + replica->repl_curr_off = offset; + + serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); + /* We are going to accumulate the incremental changes for this * replica as well. Set replicaseldb to -1 in order to force to re-emit * a SELECT statement in the replication stream. */ @@ -2006,7 +1966,6 @@ void replicationCreateMasterClient(redisMaster *mi, connection *conn, int dbid) mi->master->reploff_skipped = 0; mi->master->read_reploff = mi->master->reploff; mi->master->puser = NULL; /* This client can do everything. 
*/ - memcpy(mi->master->uuid, mi->master_uuid, UUID_BINARY_LEN); memset(mi->master_uuid, 0, UUID_BINARY_LEN); // make sure people don't use this temp storage buffer @@ -4652,12 +4611,17 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); + serverLog(LL_NOTICE, "the master repl offset is %lld", g_pserver->master_repl_offset); + showLatestBacklog(); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); + // serverLog(LL_NOTICE, "client %lu is in the party", replica->id); + + // serverLog(LL_NOTICE, "is there a write pending for %lu, %d", replica->id, replica->fPendingReplicaWrite); if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; @@ -4675,11 +4639,21 @@ void flushReplBacklogToClients() asyncUl.lock(); /* If we are online and the RDB has been sent, there is no need to feed the client buffer * We will send our replies directly from the replication backlog instead */ - std::unique_lock tRDBLock (replica->transmittedRDBLock); - if (replica->replstate == SLAVE_STATE_ONLINE && replica->transmittedRDB){ - setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); - continue; + if (replica->repl_curr_off == -1){ + replica->repl_curr_off = g_pserver->repl_batch_offStart; + + serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); + } + + /* Only if the there isn't already a pending write do we prepare the client to write */ + if (!replica->fPendingReplicaWrite){ + serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); + prepareClientToWrite(replica); + replica->fPendingReplicaWrite = true; + } + + continue; } #endif if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { diff --git a/src/server.h b/src/server.h index da1fce52e..9fdf5e0ef 100644 --- a/src/server.h +++ b/src/server.h @@ -1516,8 +1516,11 @@ struct client { long long psync_initial_offset; /* FULLRESYNC reply offset other slaves copying this replica output buffer should use. */ + long long repl_curr_idx = -1; /* Replication index sent, if this is a replica */ long long repl_curr_off = -1; + int fPendingReplicaWrite; + char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ int slave_listening_port; /* As configured with: REPLCONF listening-port */ char slave_ip[NET_IP_STR_LEN]; /* Optionally given by REPLCONF ip-address */ @@ -1577,12 +1580,8 @@ struct client { robj **argv; size_t argv_len_sumActive = 0; - bool transmittedRDB = false; /* Have we finished transmitting the RDB to this replica? 
*/ - /* If so, we can read from the replication backlog instead of the client buffer */ - // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); - fastlock transmittedRDBLock {"transmittedRDB"}; size_t argv_len_sum() const; }; @@ -2229,6 +2228,8 @@ struct redisServer { that is the next byte will'll write to.*/ long long repl_backlog_off; /* Replication "master offset" of first byte in the replication backlog buffer.*/ + long long repl_backlog_start; /* Used to compute indicies from offsets + basically, index = (offset - start) % size */ fastlock repl_backlog_lock {"replication backlog"}; time_t repl_backlog_time_limit; /* Time without slaves after the backlog gets released. */ From 2e9c7aed031f5822ddbe955803b6a09c6c1a9aca Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 3 Jun 2021 20:44:32 +0000 Subject: [PATCH 43/75] Single threaded tests work now Former-commit-id: 0e760d7c71231c7f52102909a31fc8db1b3e2860 --- src/networking.cpp | 2 +- src/replication.cpp | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/networking.cpp b/src/networking.cpp index 80120d0ca..e8ede3338 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -3419,7 +3419,7 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { * that writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long getClientReplicationBacklogSharedUsage(client *c) { - return (!(c->flags & CLIENT_SLAVE) || c->repl_curr_idx == -1) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; + return (!(c->flags & CLIENT_SLAVE) || !c->fPendingReplicaWrite ) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; } /* This function returns the number of bytes that Redis is diff --git a/src/replication.cpp b/src/replication.cpp index 97638e833..a7a2aa79e 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -810,6 +810,10 @@ long long addReplyReplicationBacklog(client *c, long long offset) { #ifdef BYPASS_PSYNC c->repl_curr_off = offset - 1; serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", c->id, getClientPeerId(c), c->repl_curr_off); + + /* Force the partial sync to be queued */ + prepareClientToWrite(c); + c->fPendingReplicaWrite = true; #else while(len) { long long thislen = From 667d2763c0df3ca48b52949a365d3237dbcc0c52 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 3 Jun 2021 21:47:33 +0000 Subject: [PATCH 44/75] Removed unused variables Former-commit-id: 48663bc480f7279a94c68aeebdd9721ca64f7038 --- src/config.cpp | 1 - src/evict.cpp | 1 - src/replication.cpp | 2 -- src/server.h | 6 +----- 4 files changed, 1 insertion(+), 9 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index b546ef607..9d7f14007 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -2347,7 +2347,6 @@ static int updateReplBacklogSize(long long val, long long prev, const char **err UNUSED(err); g_pserver->repl_backlog_size = prev; resizeReplicationBacklog(val); - g_pserver->repl_backlog_config_size = g_pserver->repl_backlog_size; return 1; } diff --git a/src/evict.cpp b/src/evict.cpp index e7f0a10ef..7ec223f6d 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -399,7 +399,6 @@ size_t freeMemoryGetNotCountedMemory(void) { /* also don't count the replication backlog memory * that's where the replication clients get their memory from */ - // overhead += (g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size); 
overhead += g_pserver->repl_backlog_size; if (g_pserver->aof_state != AOF_OFF) { diff --git a/src/replication.cpp b/src/replication.cpp index a7a2aa79e..3a48963ab 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -1129,7 +1129,6 @@ void syncCommand(client *c) { if (!strcasecmp((const char*)ptrFromObj(c->argv[0]),"psync")) { if (masterTryPartialResynchronization(c) == C_OK) { g_pserver->stat_sync_partial_ok++; - // c->repl_curr_idx = g_pserver->repl_backlog_idx; return; /* No full resync needed, return. */ } else { char *master_replid = (char*)ptrFromObj(c->argv[1]); @@ -1157,7 +1156,6 @@ void syncCommand(client *c) { connDisableTcpNoDelay(c->conn); /* Non critical if it fails. */ c->repldbfd = -1; c->flags |= CLIENT_SLAVE; - // c->repl_curr_idx = g_pserver->repl_backlog_idx; listAddNodeTail(g_pserver->slaves,c); /* Create the replication backlog if needed. */ diff --git a/src/server.h b/src/server.h index 9fdf5e0ef..2aba985ed 100644 --- a/src/server.h +++ b/src/server.h @@ -1517,8 +1517,7 @@ struct client { copying this replica output buffer should use. */ - long long repl_curr_idx = -1; /* Replication index sent, if this is a replica */ - long long repl_curr_off = -1; + long long repl_curr_off = -1; /* Replication offset of the client, only if it's a replica*/ int fPendingReplicaWrite; char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ @@ -2416,9 +2415,6 @@ struct redisServer { uint16_t rglockSamples[s_lockContentionSamples]; unsigned ilockRingHead = 0; - long long repl_backlog_config_size = 1024*1024; /* This is a hack to ignore the resizing of the replication backlog - when using it as a defacto for the client buffer */ - bool FRdbSaveInProgress() const { return rdbThreadVars.fRdbThreadActive; } }; From da0b7a3900ba50b37a2e3ac0cac1196aa19d734d Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Fri, 4 Jun 2021 20:09:47 +0000 Subject: [PATCH 45/75] Seems to pass multithreaded test cases, thank the lord Former-commit-id: 6cbf70cfff5735f3d4ef2e980945b4b1a1f85971 --- src/networking.cpp | 19 +++++++++---------- src/replication.cpp | 15 +++++++++------ src/server.h | 1 + 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index e8ede3338..cead76998 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -277,8 +277,9 @@ void clientInstallAsyncWriteHandler(client *c) { int prepareClientToWrite(client *c) { bool fAsync = !FCorrectThread(c); // Not async if we're on the right thread - if (c->flags & CLIENT_SLAVE) + if (c->flags & CLIENT_SLAVE){ serverLog(LL_NOTICE, "got into prepareClientToWrite"); + } if (!fAsync) { serverAssert(c->conn == nullptr || c->lock.fOwnLock()); @@ -1758,7 +1759,7 @@ int writeToClient(client *c, int handler_installed) { /* We can only directly read from the replication backlog if the client is a replica, so only attempt to do so if that's the case. 
*/ - if (c->flags & CLIENT_SLAVE) { + if (c->flags & CLIENT_SLAVE && !(c->flags & CLIENT_MONITOR)) { /* For replicas, we don't store all the information in the client buffer * We always read from the replication backlog directly */ std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); @@ -1766,14 +1767,12 @@ int writeToClient(client *c, int handler_installed) { /* Right now, we're bringing in the offStart into the scope * If repl_batch_offStart is equal to -1, that means the mechanism is disabled * which implies there is no data to flush and that the global offset is accurate */ - long long offStart = g_pserver->repl_batch_offStart == -1 ? g_pserver->master_repl_offset : g_pserver->repl_batch_offStart; + // long long offStart = g_pserver->repl_batch_offStart == -1 ? g_pserver->master_repl_offset : g_pserver->repl_batch_offStart; + long long offStart = c->repl_end_off; long long idxStart = getReplIndexFromOffset(offStart); - if (g_pserver->repl_batch_offStart != -1) - serverAssert(idxStart == g_pserver->repl_batch_idxStart); - else - serverAssert(idxStart == g_pserver->repl_backlog_idx); - - if (c->repl_curr_off != -1 && c->repl_curr_off != offStart){ + + serverAssert(c->repl_curr_off != -1); + if (c->repl_curr_off != offStart){ serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); @@ -1846,7 +1845,7 @@ int writeToClient(client *c, int handler_installed) { if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) { // if(c->flags & CLIENT_SLAVE && handler_installed){ // serverLog(LL_NOTICE, "Uninstalling handler"); - // serverLog(LL_NOTICE, "handler repl_curr_idx: %lld, repl_backlog_size: %lld", c->repl_curr_idx, g_pserver->repl_backlog_size); + // serverLog(LL_NOTICE, "repl_backlog_size: %lld", g_pserver->repl_backlog_size); // serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); // } c->sentlen = 0; diff --git a/src/replication.cpp b/src/replication.cpp index 3a48963ab..96bf161f9 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -382,7 +382,9 @@ void feedReplicationBacklog(const void *ptr, size_t len) { lower_bound = g_pserver->repl_batch_offStart; long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { + g_pserver->repl_backlog_lock.unlock(); flushReplBacklogToClients(); + g_pserver->repl_backlog_lock.lock(); minimumsize = g_pserver->master_repl_offset + len - lower_bound +1; if (minimumsize > g_pserver->repl_backlog_size) { @@ -809,6 +811,7 @@ long long addReplyReplicationBacklog(client *c, long long offset) { serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); #ifdef BYPASS_PSYNC c->repl_curr_off = offset - 1; + c->repl_end_off = g_pserver->master_repl_offset; serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", c->id, getClientPeerId(c), c->repl_curr_off); /* Force the partial sync to be queued */ @@ -861,6 +864,7 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) { replica->replstate = SLAVE_STATE_WAIT_BGSAVE_END; replica->repl_curr_off = offset; + replica->repl_end_off = g_pserver->master_repl_offset; serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); @@ -4634,19 +4638,18 @@ void flushReplBacklogToClients() 
fAsyncWrite = true; + /* If we are online and the RDB has been sent, there is no need to feed the client buffer + * We will send our replies directly from the replication backlog instead */ #ifdef BYPASS_BUFFER { std::unique_lock asyncUl(replica->lock, std::defer_lock); if (!FCorrectThread(replica)) asyncUl.lock(); - /* If we are online and the RDB has been sent, there is no need to feed the client buffer - * We will send our replies directly from the replication backlog instead */ - if (replica->repl_curr_off == -1){ - replica->repl_curr_off = g_pserver->repl_batch_offStart; - serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); + /* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */ + serverAssert(replica->repl_curr_off != -1); - } + replica->repl_end_off = g_pserver->master_repl_offset; /* Only if the there isn't already a pending write do we prepare the client to write */ if (!replica->fPendingReplicaWrite){ diff --git a/src/server.h b/src/server.h index 2aba985ed..64a2ca515 100644 --- a/src/server.h +++ b/src/server.h @@ -1518,6 +1518,7 @@ struct client { should use. */ long long repl_curr_off = -1; /* Replication offset of the client, only if it's a replica*/ + long long repl_end_off = -1; /* Replication offset to write to */ int fPendingReplicaWrite; char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ From 9db8556e91d46c5e2fb7f96ea5fb3880d56274aa Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Tue, 8 Jun 2021 23:10:53 +0000 Subject: [PATCH 46/75] Cleaned up code a bit, need to rewrite some comments to reflect new behaviour Former-commit-id: 850ec766cd71614ce9e61c12414545cd212d3878 --- src/evict.cpp | 1 - src/networking.cpp | 108 ++++---------------------- src/replication.cpp | 179 +++++++------------------------------------- src/server.cpp | 2 - src/server.h | 1 - 5 files changed, 43 insertions(+), 248 deletions(-) diff --git a/src/evict.cpp b/src/evict.cpp index 7ec223f6d..54153dc27 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -522,7 +522,6 @@ int freeMemoryIfNeeded(bool fQuickCycle, bool fPreSnapshot) { if (g_pserver->maxmemory_policy == MAXMEMORY_NO_EVICTION) goto cant_free; /* We need to free memory, but policy forbids. */ - serverLog(LL_NOTICE, "evicting i guess lol, the overhead was %ld, the repl_backlog_size, %lld", freeMemoryGetNotCountedMemory(), g_pserver->repl_backlog_size); while (mem_freed < mem_tofree) { int j, k, i; static unsigned int next_db = 0; diff --git a/src/networking.cpp b/src/networking.cpp index cead76998..aba1f1705 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -223,9 +223,6 @@ void clientInstallWriteHandler(client *c) { * if not already done and, for slaves, if the replica can actually receive * writes at this stage. 
*/ - if (c->flags & CLIENT_SLAVE) - serverLog(LL_NOTICE, "installing write handler"); - if (!(c->flags & CLIENT_PENDING_WRITE) && (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) @@ -277,10 +274,6 @@ void clientInstallAsyncWriteHandler(client *c) { int prepareClientToWrite(client *c) { bool fAsync = !FCorrectThread(c); // Not async if we're on the right thread - if (c->flags & CLIENT_SLAVE){ - serverLog(LL_NOTICE, "got into prepareClientToWrite"); - } - if (!fAsync) { serverAssert(c->conn == nullptr || c->lock.fOwnLock()); } else { @@ -1695,10 +1688,6 @@ int writeToClient(client *c, int handler_installed) { serverAssertDebug(FCorrectThread(c)); std::unique_locklock)> lock(c->lock); - // serverLog(LL_NOTICE, "acq client"); - - if (c->flags & CLIENT_SLAVE) - serverLog(LL_NOTICE, "writeToClient has happened"); while(clientHasPendingReplies(c)) { serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR); @@ -1710,7 +1699,6 @@ int writeToClient(client *c, int handler_installed) { /* If the buffer was sent, set bufpos to zero to continue with * the remainder of the reply. */ - // serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); if ((int)c->sentlen == c->bufpos) { c->bufpos = 0; c->sentlen = 0; @@ -1764,33 +1752,24 @@ int writeToClient(client *c, int handler_installed) { * We always read from the replication backlog directly */ std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); - /* Right now, we're bringing in the offStart into the scope - * If repl_batch_offStart is equal to -1, that means the mechanism is disabled - * which implies there is no data to flush and that the global offset is accurate */ - // long long offStart = g_pserver->repl_batch_offStart == -1 ? g_pserver->master_repl_offset : g_pserver->repl_batch_offStart; - long long offStart = c->repl_end_off; - long long idxStart = getReplIndexFromOffset(offStart); + long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); serverAssert(c->repl_curr_off != -1); - if (c->repl_curr_off != offStart){ - serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", - c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); - - long long curr_idx = getReplIndexFromOffset(c->repl_curr_off); - long long nwrittenPart2 = 0; + if (c->repl_curr_off != c->repl_end_off){ + long long repl_curr_idx = getReplIndexFromOffset(c->repl_curr_off); + long long nwritten2ndStage = 0; /* How much was written from the start of the replication backlog + * in the event of a wrap around write */ /* normal case with no wrap around */ - if (idxStart >= curr_idx){ - nwritten = connWrite(c->conn, g_pserver->repl_backlog + curr_idx, idxStart - curr_idx); - /* wrap around case, v. rare */ - /* also v. 
buggy so there's that */ + if (repl_end_idx >= repl_curr_idx){ + nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, repl_end_idx - repl_curr_idx); + /* wrap around case */ } else { - serverLog(LL_NOTICE, "ROAD OF RESISTANCE"); - nwritten = connWrite(c->conn, g_pserver->repl_backlog + curr_idx, g_pserver->repl_backlog_size - curr_idx); + nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, g_pserver->repl_backlog_size - repl_curr_idx); /* only attempt wrapping if we write the correct number of bytes */ - if (nwritten == g_pserver->repl_backlog_size - curr_idx){ - long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, idxStart); - if (nwrittenPart2 != -1) - nwritten += nwrittenPart2; + if (nwritten == g_pserver->repl_backlog_size - repl_curr_idx){ + nwritten2ndStage = connWrite(c->conn, g_pserver->repl_backlog, repl_end_idx); + if (nwritten2ndStage != -1) + nwritten += nwritten2ndStage; } } @@ -1798,31 +1777,19 @@ int writeToClient(client *c, int handler_installed) { if (nwritten > 0){ totwritten += nwritten; c->repl_curr_off += nwritten; - if (1){ - serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", - c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); - } - serverAssert(c->repl_curr_off <= offStart); + serverAssert(c->repl_curr_off <= c->repl_end_off); /* If the client offset matches the global offset, we wrote all we needed to, * in which case, there is no pending write */ - if (c->repl_curr_off == offStart){ - serverLog(LL_NOTICE, "good, %lld", offStart); + if (c->repl_curr_off == c->repl_end_off){ c->fPendingReplicaWrite = false; - } else { - serverLog(LL_NOTICE, "mismatch between repl_curr_off (%lld) and offStart (%lld)", c->repl_curr_off, offStart); } } /* If the second part of a write didn't go through, we still need to register that */ - if (nwrittenPart2 == -1) nwritten = -1; + if (nwritten2ndStage == -1) nwritten = -1; } - - // if (c->flags & CLIENT_SLAVE && handler_installed) - // serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); - } - // serverLog(LL_NOTICE, "rel client"); g_pserver->stat_net_output_bytes += totwritten; if (nwritten == -1) { if (connGetState(c->conn) == CONN_STATE_CONNECTED) { @@ -1843,11 +1810,6 @@ int writeToClient(client *c, int handler_installed) { if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) { - // if(c->flags & CLIENT_SLAVE && handler_installed){ - // serverLog(LL_NOTICE, "Uninstalling handler"); - // serverLog(LL_NOTICE, "repl_backlog_size: %lld", g_pserver->repl_backlog_size); - // serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - // } c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -1863,7 +1825,6 @@ int writeToClient(client *c, int handler_installed) { /* Write event handler. Just send data to the client. 
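The replica write path reworked above sends bytes straight out of the circular replication backlog, between the client's current offset and its end offset, splitting the write into two segments when the window wraps past the end of the buffer. A minimal standalone sketch of that windowed copy, with illustrative names rather than the server's actual symbols and a string append standing in for connWrite():

#include <cassert>
#include <string>
#include <vector>

// Read the bytes between [curr_off, end_off) out of a circular backlog.
// backlog_start is the replication offset of backlog[0], so an offset maps
// to a buffer index as (off - backlog_start) % backlog.size().
static std::string readBacklogWindow(const std::vector<char> &backlog,
                                     long long backlog_start,
                                     long long curr_off, long long end_off) {
    const long long size = (long long)backlog.size();
    // Sketch assumption: the window never spans the whole buffer.
    assert(end_off >= curr_off && end_off - curr_off < size);
    long long curr_idx = (curr_off - backlog_start) % size;
    long long end_idx  = (end_off  - backlog_start) % size;

    std::string out;
    if (end_idx >= curr_idx) {
        // Normal case: the window is contiguous in the buffer.
        out.append(backlog.data() + curr_idx, end_idx - curr_idx);
    } else {
        // Wrap-around case: tail of the buffer first, then the head.
        out.append(backlog.data() + curr_idx, size - curr_idx);
        out.append(backlog.data(), end_idx);
    }
    return out;
}

int main() {
    std::vector<char> backlog(8, '.');
    backlog[6] = 'A'; backlog[7] = 'B'; backlog[0] = 'C'; backlog[1] = 'D';
    // Offsets 1006..1009 wrap from the tail of the buffer back to its head.
    return readBacklogWindow(backlog, 1000, 1006, 1010) == "ABCD" ? 0 : 1;
}

The real code additionally has to cope with short writes from connWrite(), which is why it only attempts the second segment after the first one went out in full.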
*/ void sendReplyToClient(connection *conn) { client *c = (client*)connGetPrivateData(conn); - // serverLog(LL_NOTICE, "called the sendreplytoclient"); if (writeToClient(c,1) == C_ERR) { AeLocker ae; @@ -1997,7 +1958,6 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { auto vec = std::move(g_pserver->rgthreadvar[iel].clients_pending_write); processed += (int)vec.size(); - // serverLog(LL_NOTICE, "entered handleClientsWithPendingWrites"); for (client *c : vec) { serverAssertDebug(FCorrectThread(c)); @@ -2013,12 +1973,6 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* Don't write to clients that are going to be closed anyway. */ if (c->flags & CLIENT_CLOSE_ASAP) continue; - // if (c->flags & CLIENT_SLAVE){ - // if(clientHasPendingReplies(c)) - // serverLog(LL_NOTICE, "somehow the client buffer has these values: %s", c->buf); - // serverLog(LL_NOTICE, "LOL"); - // } - /* Try to write buffers to the client socket. */ if (writeToClient(c,0) == C_ERR) { @@ -2216,34 +2170,6 @@ static void setProtocolError(const char *errstr, client *c) { c->flags |= (CLIENT_CLOSE_AFTER_REPLY|CLIENT_PROTOCOL_ERROR); } -static void printQueryBuffer(client *c) { - if (cserver.verbosity <= LL_VERBOSE || c->flags & CLIENT_MASTER) { - sds client = catClientInfoString(sdsempty(),c); - - /* Sample some protocol to given an idea about what was inside. */ - char buf[PROTO_DUMP_LEN*2]; - if (sdslen(c->querybuf)-c->qb_pos < PROTO_DUMP_LEN) { - snprintf(buf,sizeof(buf),"%s", c->querybuf+c->qb_pos); - } else { - snprintf(buf,sizeof(buf),"%.*s (... more %zu bytes ...) %.*s", PROTO_DUMP_LEN/2, c->querybuf+c->qb_pos, sdslen(c->querybuf)-c->qb_pos-PROTO_DUMP_LEN, PROTO_DUMP_LEN/2, c->querybuf+sdslen(c->querybuf)-PROTO_DUMP_LEN/2); - } - - /* Remove non printable chars. */ - char *p = buf; - while (*p != '\0') { - if (!isprint(*p)) *p = '.'; - p++; - } - - /* Log all the client and protocol info. */ - int loglevel = (c->flags & CLIENT_MASTER) ? LL_WARNING : - LL_VERBOSE; - serverLog(loglevel, - "Query buffer from client %lu: %s. %s", c->id, client, buf); - sdsfree(client); - } -} - /* Process the query buffer for client 'c', setting up the client argument * vector for command execution. Returns C_OK if after running the function * the client has a well-formed ready to be processed command, otherwise @@ -2498,8 +2424,6 @@ void parseClientCommandBuffer(client *c) { } size_t cqueriesStart = c->vecqueuedcmd.size(); - // if (c->flags & CLIENT_MASTER) - // printQueryBuffer(c); if (c->reqtype == PROTO_REQ_INLINE) { if (processInlineBuffer(c) != C_OK) break; } else if (c->reqtype == PROTO_REQ_MULTIBULK) { diff --git a/src/replication.cpp b/src/replication.cpp index 96bf161f9..ebdb8af78 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -60,7 +60,6 @@ static void propagateMasterStaleKeys(); void updateLowestOffsetAmongReplicas(){ serverAssert(GlobalLocksAcquired()); serverAssert(!g_pserver->repl_backlog_lock.fOwnLock()); - // serverLog(LL_NOTICE, "off- have repl"); long long min_offset = LONG_LONG_MAX; listIter li; listNode *ln; @@ -73,14 +72,13 @@ void updateLowestOffsetAmongReplicas(){ if (replica->flags & CLIENT_CLOSE_ASAP) continue; std::unique_lock ul(replica->lock); - // serverLog(LL_NOTICE, "off- acq client"); - min_offset = std::min(min_offset, replica->repl_curr_off); - // serverLog(LL_NOTICE, "off- rel client"); + min_offset = std::min(min_offset, replica->repl_curr_off); } /* return -1 if no other minimum was found */ g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? 
-1 : min_offset, std::memory_order_seq_cst); } + /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case * the instance is configured to have no persistence. */ @@ -232,6 +230,8 @@ void createReplicationBacklog(void) { g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; } +long long getReplIndexFromOffset(long long offset); + /* This function is called when the user modifies the replication backlog * size at runtime. It is up to the function to both update the * g_pserver->repl_backlog_size and to resize the buffer and setup it so that @@ -243,8 +243,6 @@ void resizeReplicationBacklog(long long newsize) { newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; if (g_pserver->repl_backlog_size == newsize) return; - serverLog(LL_NOTICE, "WE HAD TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); - if (g_pserver->repl_backlog != NULL) { /* What we actually do is to flush the old buffer and realloc a new * empty one. It will refill with new data incrementally. @@ -252,59 +250,8 @@ void resizeReplicationBacklog(long long newsize) { * worse often we need to alloc additional space before freeing the * old buffer. */ - if (g_pserver->repl_batch_idxStart >= 0) { - // We need to keep critical data so we can't shrink less than the hot data in the buffer - newsize = std::max(newsize, g_pserver->master_repl_offset - g_pserver->repl_batch_offStart); - char *backlog = (char*)zmalloc(newsize); - g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - g_pserver->repl_batch_offStart; - - if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { - auto cbActiveBacklog = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; - memcpy(backlog, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbActiveBacklog); - serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); - } else { - auto cbPhase1 = g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart; - memcpy(backlog, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1); - memcpy(backlog + cbPhase1, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); - auto cbActiveBacklog = cbPhase1 + g_pserver->repl_backlog_idx; - serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); - } - zfree(g_pserver->repl_backlog); - g_pserver->repl_backlog = backlog; - g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; - g_pserver->repl_batch_idxStart = 0; - g_pserver->repl_backlog_start = g_pserver->master_repl_offset; - } else { - zfree(g_pserver->repl_backlog); - g_pserver->repl_backlog = (char*)zmalloc(newsize); - g_pserver->repl_backlog_histlen = 0; - g_pserver->repl_backlog_idx = 0; - /* Next byte we have is... the next since the buffer is empty. */ - g_pserver->repl_backlog_off = g_pserver->master_repl_offset+1; - } - } - g_pserver->repl_backlog_size = newsize; -} - -long long getReplIndexFromOffset(long long offset); - -/* The above but for when clients need extra replication backlog because ??? */ -void resizeReplicationBacklogForClients(long long newsize) { - if (newsize < CONFIG_REPL_BACKLOG_MIN_SIZE) - newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; - if (g_pserver->repl_backlog_size == newsize) return; - - serverLog(LL_NOTICE, "WE HAVE TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); - /* get the critical client size, i.e. 
the size of the data unflushed to clients */ - long long earliest_off = g_pserver->repl_lowest_off.load(); - - - if (g_pserver->repl_backlog != NULL) { - /* What we actually do is to flush the old buffer and realloc a new - * empty one. It will refill with new data incrementally. - * The reason is that copying a few gigabytes adds latency and even - * worse often we need to alloc additional space before freeing the - * old buffer. */ + /* get the critical client size, i.e. the size of the data unflushed to clients */ + long long earliest_off = g_pserver->repl_lowest_off.load(); if (earliest_off != -1) { // We need to keep critical data so we can't shrink less than the hot data in the buffer @@ -316,8 +263,6 @@ void resizeReplicationBacklogForClients(long long newsize) { if (g_pserver->repl_backlog_idx >= earliest_idx) { auto cbActiveBacklog = g_pserver->repl_backlog_idx - earliest_idx; memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbActiveBacklog); - serverLog(LL_NOTICE, "g_pserver->master_repl_offset: %lld, earliest_off: %lld, g_pserver->repl_backlog_idx: %lld, earliest_idx: %lld, repl_backlog_start: %lld", - g_pserver->master_repl_offset, earliest_off, g_pserver->repl_backlog_idx, earliest_idx, g_pserver->repl_backlog_start); serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); } else { auto cbPhase1 = g_pserver->repl_backlog_size - earliest_idx; @@ -344,14 +289,9 @@ void resizeReplicationBacklogForClients(long long newsize) { } } g_pserver->repl_backlog_size = newsize; - - serverLog(LL_NOTICE, "We are ending with: master_repl_offset: %lld, repl_batch_offStart: %lld, new_off: %lld, " - "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, new_idx: %lld, repl_backlog_size: %lld", - g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, 0LL, - g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, 0LL, g_pserver->repl_backlog_size - ); } + void freeReplicationBacklog(void) { serverAssert(GlobalLocksAcquired()); listIter li; @@ -391,17 +331,11 @@ void feedReplicationBacklog(const void *ptr, size_t len) { // This is an emergency overflow, we better resize to fit long long newsize = std::max(g_pserver->repl_backlog_size*2, minimumsize); serverLog(LL_WARNING, "Replication backlog is too small, resizing to: %lld", newsize); - resizeReplicationBacklogForClients(newsize); + resizeReplicationBacklog(newsize); } } } - // serverLog(LL_NOTICE, "Pt2 start with: master_repl_offset: %lld, repl_batch_offStart: %lld, " - // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", - // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, - // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size - // ); - g_pserver->master_repl_offset += len; /* This is a circular buffer, so write as much data we can at every @@ -423,12 +357,6 @@ void feedReplicationBacklog(const void *ptr, size_t len) { /* Set the offset of the first byte we have in the backlog. 
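The producer side completed just below is the standard circular-buffer append: copy as much as fits before the end of the buffer, wrap the index, clamp the history length to the capacity, and recompute the offset of the oldest byte still held. A condensed, self-contained sketch of that bookkeeping, using illustrative names rather than the real server globals:

#include <cstdio>
#include <cstring>
#include <vector>

struct Backlog {                  // stand-in for the g_pserver->repl_backlog_* fields
    std::vector<char> buf;
    long long idx = 0;            // next write position inside buf
    long long histlen = 0;        // how many valid bytes the buffer holds
    long long master_off = 0;     // replication offset of the last byte produced
    long long off = 1;            // replication offset of the first byte held
};

static void feed(Backlog &b, const void *ptr, size_t len) {
    const char *p = (const char*)ptr;
    b.master_off += len;
    while (len) {
        size_t thislen = b.buf.size() - b.idx;   // room before the wrap point
        if (thislen > len) thislen = len;
        memcpy(b.buf.data() + b.idx, p, thislen);
        b.idx += thislen;
        if (b.idx == (long long)b.buf.size()) b.idx = 0;
        len -= thislen;
        p += thislen;
        b.histlen += thislen;
    }
    if (b.histlen > (long long)b.buf.size()) b.histlen = b.buf.size();
    // First byte we still have = last byte written - (histlen - 1).
    b.off = b.master_off - b.histlen + 1;
}

int main() {
    Backlog b; b.buf.resize(8);
    feed(b, "0123456789", 10);    // wraps once
    std::printf("idx=%lld histlen=%lld off=%lld\n", b.idx, b.histlen, b.off);
}

Feeding ten bytes into an eight-byte buffer wraps once and leaves the oldest retained byte at offset 3, i.e. master_off - histlen + 1.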
*/ g_pserver->repl_backlog_off = g_pserver->master_repl_offset - g_pserver->repl_backlog_histlen + 1; - - // serverLog(LL_NOTICE, "Pt2 end with: master_repl_offset: %lld, repl_batch_offStart: %lld, " - // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", - // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, - // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size - // ); } /* Wrapper for feedReplicationBacklog() that takes Redis string objects @@ -578,9 +506,7 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) /* Add the SELECT command into the backlog. */ /* We don't do this for advanced replication because this will be done later when it adds the whole RREPLAY command */ - if (g_pserver->repl_backlog && fSendRaw) { - feedReplicationBacklogWithObject(selectcmd); - } + if (g_pserver->repl_backlog && fSendRaw) feedReplicationBacklogWithObject(selectcmd); if (dictid < 0 || dictid >= PROTO_SHARED_SELECT_CMDS) decrRefCount(selectcmd); @@ -678,7 +604,7 @@ void replicationFeedSlaves(list *replicas, int dictid, robj **argv, int argc) { void showLatestBacklog(void) { if (g_pserver->repl_backlog == NULL) return; - long long dumplen = 1024; + long long dumplen = 256; if (g_pserver->repl_backlog_histlen < dumplen) dumplen = g_pserver->repl_backlog_histlen; @@ -769,7 +695,9 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, } decrRefCount(cmdobj); } -#define BYPASS_PSYNC + +int prepareClientToWrite(client *c); + /* Feed the replica 'c' with the replication backlog starting from the * specified 'offset' up to the end of the backlog. */ long long addReplyReplicationBacklog(client *c, long long offset) { @@ -809,26 +737,14 @@ long long addReplyReplicationBacklog(client *c, long long offset) { * split the reply in two parts if we are cross-boundary. */ len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); -#ifdef BYPASS_PSYNC + c->repl_curr_off = offset - 1; c->repl_end_off = g_pserver->master_repl_offset; - serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", c->id, getClientPeerId(c), c->repl_curr_off); /* Force the partial sync to be queued */ prepareClientToWrite(c); - c->fPendingReplicaWrite = true; -#else - while(len) { - long long thislen = - ((g_pserver->repl_backlog_size - j) < len) ? - (g_pserver->repl_backlog_size - j) : len; + c->fPendingReplicaWrite = true; - serverLog(LL_DEBUG, "[PSYNC] addReply() length: %lld", thislen); - addReplySds(c,sdsnewlen(g_pserver->repl_backlog + j, thislen)); - len -= thislen; - j = 0; - } -#endif return g_pserver->repl_backlog_histlen - skip; } @@ -866,15 +782,11 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) { replica->repl_curr_off = offset; replica->repl_end_off = g_pserver->master_repl_offset; - serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); - /* We are going to accumulate the incremental changes for this * replica as well. Set replicaseldb to -1 in order to force to re-emit * a SELECT statement in the replication stream. */ g_pserver->replicaseldb = -1; - serverLog(LL_NOTICE, "We are setting up here lad"); - /* Don't send this reply to slaves that approached us with * the old SYNC command. 
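Taken together, these hunks reduce a replica's outgoing replication state to three fields: the offset it has been sent up to (repl_curr_off), the offset it is currently allowed to read up to (repl_end_off), and a flag recording that a backlog write is still owed (fPendingReplicaWrite). A compact sketch of the transitions the patch maintains; the struct and method names are illustrative, not the server's actual types:

#include <cassert>

struct ReplicaSendState {
    long long repl_curr_off = -1;   // last offset already written to the socket
    long long repl_end_off  = -1;   // offset this replica may read up to
    bool fPendingReplicaWrite = false;

    // New backlog data was produced (flushReplBacklogToClients in the patch).
    void extendTo(long long master_repl_offset) {
        assert(repl_curr_off != -1);            // set during (partial) resync
        repl_end_off = master_repl_offset;
        if (!fPendingReplicaWrite && repl_curr_off != repl_end_off)
            fPendingReplicaWrite = true;        // prepareClientToWrite() in the patch
    }

    // writeToClient drained `n` bytes of the window from the backlog.
    void advance(long long n) {
        repl_curr_off += n;
        assert(repl_curr_off <= repl_end_off);
        if (repl_curr_off == repl_end_off) fPendingReplicaWrite = false;
    }
};

int main() {
    ReplicaSendState s;
    s.repl_curr_off = 100;          // as set on full/partial resync
    s.extendTo(164);
    s.advance(64);                  // fully drained: no pending write remains
    return s.fPendingReplicaWrite ? 1 : 0;
}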
*/ if (!(replica->flags & CLIENT_PRE_PSYNC)) { @@ -1179,7 +1091,6 @@ void syncCommand(client *c) { if (g_pserver->FRdbSaveInProgress() && g_pserver->rdb_child_type == RDB_CHILD_TYPE_DISK) { - serverLog(LL_NOTICE, "case 1"); /* Ok a background save is in progress. Let's check if it is a good * one for replication, i.e. if there is another replica that is * registering differences since the server forked to save. */ @@ -1211,7 +1122,6 @@ void syncCommand(client *c) { } else if (g_pserver->FRdbSaveInProgress() && g_pserver->rdb_child_type == RDB_CHILD_TYPE_SOCKET) { - serverLog(LL_NOTICE, "case 2"); /* There is an RDB child process but it is writing directly to * children sockets. We need to wait for the next BGSAVE * in order to synchronize. */ @@ -1219,7 +1129,6 @@ void syncCommand(client *c) { /* CASE 3: There is no BGSAVE is progress. */ } else { - serverLog(LL_NOTICE, "case 3"); if (g_pserver->repl_diskless_sync && (c->slave_capa & SLAVE_CAPA_EOF)) { /* Diskless replication RDB child is created inside * replicationCron() since we want to delay its start a @@ -4606,9 +4515,10 @@ void _clientAsyncReplyBufferReserve(client *c, size_t len); void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); + /* If we have the repl backlog lock, we will deadlock */ + serverAssert(!g_pserver->repl_backlog_lock.fOwnLock()); if (g_pserver->repl_batch_offStart < 0) return; - if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { bool fAsyncWrite = false; @@ -4617,66 +4527,31 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); - serverLog(LL_NOTICE, "the master repl offset is %lld", g_pserver->master_repl_offset); - showLatestBacklog(); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); - // serverLog(LL_NOTICE, "client %lu is in the party", replica->id); - - // serverLog(LL_NOTICE, "is there a write pending for %lu, %d", replica->id, replica->fPendingReplicaWrite); if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; - std::unique_lock ul(replica->lock, std::defer_lock); - if (FCorrectThread(replica)) - ul.lock(); - else + std::unique_lock ul(replica->lock); + if (!FCorrectThread(replica)) fAsyncWrite = true; - - /* If we are online and the RDB has been sent, there is no need to feed the client buffer - * We will send our replies directly from the replication backlog instead */ -#ifdef BYPASS_BUFFER - { - std::unique_lock asyncUl(replica->lock, std::defer_lock); - if (!FCorrectThread(replica)) - asyncUl.lock(); + /* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */ + serverAssert(replica->repl_curr_off != -1); - /* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */ - serverAssert(replica->repl_curr_off != -1); + replica->repl_end_off = g_pserver->master_repl_offset; - replica->repl_end_off = g_pserver->master_repl_offset; - - /* Only if the there isn't already a pending write do we prepare the client to write */ - if (!replica->fPendingReplicaWrite){ - serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); - prepareClientToWrite(replica); - replica->fPendingReplicaWrite = true; - } - - continue; + /* Only if the there isn't already a pending write do we prepare the 
client to write */ + if (!replica->fPendingReplicaWrite){ + serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); + prepareClientToWrite(replica); + replica->fPendingReplicaWrite = true; } -#endif - if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { - long long cbCopy = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; - serverAssert((g_pserver->master_repl_offset - g_pserver->repl_batch_offStart) == cbCopy); - serverAssert((g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart) >= (cbCopy)); - serverAssert((g_pserver->repl_batch_idxStart + cbCopy) <= g_pserver->repl_backlog_size); - - addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbCopy); - } else { - auto cbPhase1 = g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart; - if (fAsyncWrite) - _clientAsyncReplyBufferReserve(replica, cbPhase1 + g_pserver->repl_backlog_idx); - addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1); - addReplyProto(replica, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); - serverAssert((cbPhase1 + g_pserver->repl_backlog_idx) == (g_pserver->master_repl_offset - g_pserver->repl_batch_offStart)); - } } if (fAsyncWrite) ProcessPendingAsyncWrites(); diff --git a/src/server.cpp b/src/server.cpp index 439e1aeff..362569bfa 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1796,7 +1796,6 @@ int clientsCronTrackClientsMemUsage(client *c) { mem += zmalloc_size(c); mem += c->argv_len_sum(); if (c->argv) mem += zmalloc_size(c->argv); - // serverLog(LL_NOTICE, "Mem here is : %lu", mem); /* Now that we have the memory used by the client, remove the old * value from the old category, and add it back. */ g_pserver->stat_clients_type_memory[c->client_cron_last_memory_type] -= @@ -1855,7 +1854,6 @@ void clientsCron(int iel) { while(listLength(g_pserver->clients) && iterations--) { client *c; listNode *head; - // serverLog(LL_NOTICE, "we are at iteration: %d", iterations); /* Rotate the list, take the current head, process. * This way if the client must be removed from the list it's the * first element and we don't incur into O(N) computation. 
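The clientsCron() context shown above relies on a classic pattern: rotate the tail of the client list to the head each step and always process the head, so removing the current client never costs a list scan. A small illustration of the same pattern over std::list (the server itself uses its own adlist type, not the STL):

#include <cstdio>
#include <iterator>
#include <list>

// Process up to `iterations` clients: rotate the tail to the head, handle the
// new head, and drop it in O(1) if it has to go away.
static void cronPass(std::list<int> &clients, int iterations) {
    while (!clients.empty() && iterations--) {
        clients.splice(clients.begin(), clients, std::prev(clients.end()));
        int c = clients.front();
        std::printf("processing client %d\n", c);
        if (c % 2 == 0) clients.pop_front();   // removal never needs a scan
    }
}

int main() {
    std::list<int> clients = {1, 2, 3, 4, 5};
    cronPass(clients, 3);
}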
*/ diff --git a/src/server.h b/src/server.h index 64a2ca515..0fcd8f5ef 100644 --- a/src/server.h +++ b/src/server.h @@ -3540,7 +3540,6 @@ void tlsInit(void); void tlsInitThread(); int tlsConfigure(redisTLSContextConfig *ctx_config); -int prepareClientToWrite(client *c); class ShutdownException From 5998dc233afa724060fe2a8855d226ab98112e90 Mon Sep 17 00:00:00 2001 From: malavan Date: Wed, 9 Jun 2021 21:49:15 +0000 Subject: [PATCH 47/75] add global locks to FreeMemoryLazyFree Former-commit-id: d850ce20219a3e29a6a816ebfa0d714963d6a88b --- src/evict.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/evict.cpp b/src/evict.cpp index b673e165d..8438064a4 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -470,11 +470,13 @@ public: FreeMemoryLazyFree(FreeMemoryLazyFree&&) = default; ~FreeMemoryLazyFree() { + aeAcquireLock(); for (auto &pair : vecdictvecde) { for (auto de : pair.second) { dictFreeUnlinkedEntry(pair.first, de); } } + aeReleaseLock(); --s_clazyFreesInProgress; } From 562c0ae3de29d2419d5291b7253117a746194490 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 14 Jun 2021 03:34:56 +0000 Subject: [PATCH 48/75] memefficiency tests need to run single threaded as jemalloc has seperate pools for threads Former-commit-id: 02152c7bd8bc0462edd809122873ceb8644dc69a --- tests/unit/memefficiency.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index 4ee6fdbdb..db18e7128 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -395,7 +395,7 @@ start_server {tags {"defrag"} overrides {appendonly yes auto-aof-rewrite-percent # if the current slab is lower in utilization the defragger would have ended up in stagnation, # keept running and not move any allocation. 
# this test is more consistent on a fresh server with no history - start_server {tags {"defrag"} overrides {save ""}} { + start_server {tags {"defrag"} overrides {save "" server-threads 1}} { r flushdb r config resetstat r config set hz 100 From dcffe221ce144f753daf619b2de78b56d08bddcd Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 14 Jun 2021 03:50:47 +0000 Subject: [PATCH 49/75] Remove lock Former-commit-id: bb8efe2ed942fd67d091b16bb27f67ccefcbbf19 --- src/StorageCache.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/StorageCache.h b/src/StorageCache.h index 2536cf2ec..ed868e74b 100644 --- a/src/StorageCache.h +++ b/src/StorageCache.h @@ -45,11 +45,11 @@ public: bool enumerate(IStorage::callback fn) const { return m_spstorage->enumerate(fn); } void beginWriteBatch(); - void endWriteBatch() { m_spstorage->endWriteBatch(); m_lock.unlock(); } + void endWriteBatch() { m_spstorage->endWriteBatch(); } void batch_lock() { return m_spstorage->batch_lock(); } void batch_unlock() { return m_spstorage->batch_unlock(); } size_t count() const; const StorageCache *clone(); -}; \ No newline at end of file +}; From 3ca4e0b4f9dc05f382d38bd191b0ab32be7ac4ab Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 14 Jun 2021 04:06:34 +0000 Subject: [PATCH 50/75] Free objects immediately before adding to the GC list, this cuts down on mem consumption Former-commit-id: 49d718ae9c1c8a850df5ffa2c550df3381ad7174 --- src/dict.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/dict.cpp b/src/dict.cpp index 57872b5c4..d16fd77f7 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -535,7 +535,8 @@ dictAsyncRehashCtl::~dictAsyncRehashCtl() { while (deGCList != nullptr) { auto next = deGCList->next; dictFreeKey(dict, deGCList); - dictFreeVal(dict, deGCList); + if (deGCList->v.val != nullptr) + dictFreeVal(dict, deGCList); zfree(deGCList); deGCList = next; } @@ -694,6 +695,8 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { d->ht[table].table[idx] = he->next; if (!nofree) { if (table == 0 && d->asyncdata != nullptr && (ssize_t)idx < d->rehashidx) { + dictFreeVal(d, he); + he->v.val = nullptr; he->next = d->asyncdata->deGCList; d->asyncdata->deGCList = he; } else { @@ -752,6 +755,8 @@ void dictFreeUnlinkedEntry(dict *d, dictEntry *he) { if (he == NULL) return; if (d->asyncdata) { + dictFreeVal(d, he); + he->v.val = nullptr; he->next = d->asyncdata->deGCList; d->asyncdata->deGCList = he; } else { @@ -775,6 +780,8 @@ int _dictClear(dict *d, dictht *ht, void(callback)(void *)) { while(he) { nextHe = he->next; if (d->asyncdata && (ssize_t)i < d->rehashidx) { + dictFreeVal(d, he); + he->v.val = nullptr; he->next = d->asyncdata->deGCList; d->asyncdata->deGCList = he; } else { From bdc29a935e886c92a05208cb47800c99af7b83fe Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 14 Jun 2021 06:32:58 +0000 Subject: [PATCH 51/75] Fix deadlock in storage cache Former-commit-id: e74711e8131cd29a1e0294fbb28e1737ee98afce --- src/StorageCache.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/StorageCache.cpp b/src/StorageCache.cpp index e33c97ff7..98c797c71 100644 --- a/src/StorageCache.cpp +++ b/src/StorageCache.cpp @@ -130,9 +130,10 @@ void StorageCache::retrieve(sds key, IStorage::callbackSingle fn) const size_t StorageCache::count() const { - std::unique_lock ul(m_lock); + std::unique_lock ul(m_lock, std::defer_lock); + bool fLocked = ul.try_lock(); size_t count = m_spstorage->count(); - if (m_pdict != nullptr) { + if 
(m_pdict != nullptr && fLocked) { serverAssert(bulkInsertsInProgress.load(std::memory_order_seq_cst) || count == (dictSize(m_pdict) + m_collisionCount)); } return count; @@ -140,6 +141,5 @@ size_t StorageCache::count() const void StorageCache::beginWriteBatch() { serverAssert(GlobalLocksAcquired()); // Otherwise we deadlock - m_lock.lock(); m_spstorage->beginWriteBatch(); } \ No newline at end of file From dd8e8b098c954b4ea19c92ca631fef81168c7065 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 14 Jun 2021 16:32:47 +0000 Subject: [PATCH 52/75] active defrag tests need to run single threaded because jemalloc has seperate mempools per thread and the numbers won't match otherwise Former-commit-id: 3a1d3090f2ec5a442e3a7c192987cdfa24094145 --- tests/unit/memefficiency.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index d5c2feb4f..2a2db72cd 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -395,7 +395,7 @@ start_server {tags {"defrag"} overrides {appendonly yes auto-aof-rewrite-percent # if the current slab is lower in utilization the defragger would have ended up in stagnation, # keept running and not move any allocation. # this test is more consistent on a fresh server with no history - start_server {tags {"defrag"} overrides {save ""}} { + start_server {tags {"defrag"} overrides {save "" server-threads 1}} { r flushdb r config resetstat r config set hz 100 From 80dddab0c4e587332b497cbe8157f39dcc417eb4 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 14 Jun 2021 19:30:49 +0000 Subject: [PATCH 53/75] Relaxed locking, should run faster now Former-commit-id: 5cec4d026dc1766b9ecbade6ec4b9d0e75a94e0f --- src/multi.cpp | 1 - src/networking.cpp | 6 ++++++ src/replication.cpp | 18 ++++++++++-------- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/multi.cpp b/src/multi.cpp index f74748e90..589dba589 100644 --- a/src/multi.cpp +++ b/src/multi.cpp @@ -268,7 +268,6 @@ void execCommand(client *c) { if (g_pserver->repl_backlog && was_master && !is_master) { const char *execcmd = "*1\r\n$4\r\nEXEC\r\n"; updateLowestOffsetAmongReplicas(); - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); feedReplicationBacklog(execcmd,strlen(execcmd)); } afterPropagateExec(); diff --git a/src/networking.cpp b/src/networking.cpp index d8d91751d..07312a9ee 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1856,6 +1856,8 @@ int writeToClient(client *c, int handler_installed) { * We always read from the replication backlog directly */ std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + // serverLog(LL_NOTICE, "written to handler"); + long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); serverAssert(c->repl_curr_off != -1); @@ -1884,8 +1886,12 @@ int writeToClient(client *c, int handler_installed) { serverAssert(c->repl_curr_off <= c->repl_end_off); /* If the client offset matches the global offset, we wrote all we needed to, * in which case, there is no pending write */ + if (c->repl_curr_off == c->repl_end_off){ + // serverLog(LL_NOTICE, "Successfully wrote up until %lld", c->repl_end_off); c->fPendingReplicaWrite = false; + } else { + // serverLog(LL_NOTICE, "Wrote to %lld out of %lld", c->repl_curr_off, c->repl_end_off); } } diff --git a/src/replication.cpp b/src/replication.cpp index d10bac99a..a5f9c3acf 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -241,6 +241,8 @@ void resizeReplicationBacklog(long long newsize) { 
newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; if (g_pserver->repl_backlog_size == newsize) return; + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + if (g_pserver->repl_backlog != NULL) { /* What we actually do is to flush the old buffer and realloc a new * empty one. It will refill with new data incrementally. @@ -310,9 +312,9 @@ void freeReplicationBacklog(void) { * the backlog without incrementing the offset. */ void feedReplicationBacklog(const void *ptr, size_t len) { serverAssert(GlobalLocksAcquired()); - serverAssert(g_pserver->repl_backlog_lock.fOwnLock()); const unsigned char *p = (const unsigned char*)ptr; + if (g_pserver->repl_batch_idxStart >= 0) { /* we are lower bounded by the lower client offset or the offStart if all the clients are up to date */ long long lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); @@ -320,10 +322,11 @@ void feedReplicationBacklog(const void *ptr, size_t len) { lower_bound = g_pserver->repl_batch_offStart; long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { - g_pserver->repl_backlog_lock.unlock(); flushReplBacklogToClients(); - g_pserver->repl_backlog_lock.lock(); - minimumsize = g_pserver->master_repl_offset + len - lower_bound +1; + minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; + + serverLog(LL_NOTICE, "minimumsize: %lld, g_pserver->master_repl_offset: %lld, len: %lu, lower_bound: %lld", + minimumsize, g_pserver->master_repl_offset, len, lower_bound); if (minimumsize > g_pserver->repl_backlog_size) { // This is an emergency overflow, we better resize to fit @@ -492,7 +495,6 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) bool fSendRaw = !g_pserver->fActiveReplica; updateLowestOffsetAmongReplicas(); - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); /* Send SELECT command to every replica if needed. 
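The sizing logic above keeps one invariant: the backlog must be able to hold everything from the slowest replica's offset (or, failing that, the start of the unflushed batch) through the bytes about to be appended; if it cannot, the batch is flushed and, as a last resort, the buffer is grown to at least double its size. A small sketch of that decision with placeholder state and names; the flush itself is only indicated by a comment:

#include <algorithm>
#include <cstdio>

struct ReplState {                     // illustrative stand-in for server globals
    long long master_repl_offset = 0;  // offset of the last byte produced
    long long lowest_replica_off = -1; // slowest replica, -1 if none tracked
    long long batch_off_start = 0;     // start of the unflushed batch
    long long backlog_size = 1 << 20;  // current backlog capacity
};

// Capacity needed so that [lower_bound, master_repl_offset + len] still fits
// in the backlog after appending len bytes.
static long long requiredSize(const ReplState &s, size_t len) {
    long long lower_bound = s.lowest_replica_off;
    if (lower_bound == -1) lower_bound = s.batch_off_start;
    return s.master_repl_offset + (long long)len - lower_bound + 1;
}

static void ensureCapacity(ReplState &s, size_t len) {
    if (requiredSize(s, len) <= s.backlog_size) return;
    // flushReplBacklogToClients() would run here and may advance the bound.
    long long need = requiredSize(s, len);
    if (need > s.backlog_size) {
        long long newsize = std::max(s.backlog_size * 2, need);
        std::printf("emergency resize to %lld\n", newsize);
        s.backlog_size = newsize;      // resizeReplicationBacklog(newsize) in the patch
    }
}

int main() {
    ReplState s;
    s.master_repl_offset = (1 << 20) - 16;
    ensureCapacity(s, 4096);           // would overflow: triggers the resize path
}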
*/ if (g_pserver->replicaseldb != dictid) { @@ -655,7 +657,6 @@ void replicationFeedSlavesFromMasterStream(char *buf, size_t buflen) { if (g_pserver->repl_backlog){ updateLowestOffsetAmongReplicas(); - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); feedReplicationBacklog(buf,buflen); } } @@ -750,7 +751,7 @@ long long addReplyReplicationBacklog(client *c, long long offset) { serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); c->repl_curr_off = offset - 1; - serverLog(LL_NOTICE, "Client %s, replica offset %lld in psync", replicationGetSlaveName(c), c->repl_curr_off); + // serverLog(LL_NOTICE, "Client %s, replica offset %lld in psync", replicationGetSlaveName(c), c->repl_curr_off); c->repl_end_off = g_pserver->master_repl_offset; /* Force the partial sync to be queued */ @@ -4988,7 +4989,7 @@ void flushReplBacklogToClients() if (!canFeedReplicaReplBuffer(replica)) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; - serverLog(LL_NOTICE, "Client %s, replica offset %lld", replicationGetSlaveName(replica), replica->repl_curr_off); + // serverLog(LL_NOTICE, "Client %s, replica offset %lld", replicationGetSlaveName(replica), replica->repl_curr_off); std::unique_lock ul(replica->lock); if (!FCorrectThread(replica)) @@ -5013,6 +5014,7 @@ void flushReplBacklogToClients() // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; + updateLowestOffsetAmongReplicas(); } } From 420b07960c358ff02cbd54a1f4795a0972f8f036 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 14 Jun 2021 22:06:36 +0000 Subject: [PATCH 54/75] Prevent test code crash due to no log data Former-commit-id: 0a56a73bd98d4e692ae77683fdb9dd644ecfc2eb --- tests/integration/psync2.tcl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/psync2.tcl b/tests/integration/psync2.tcl index 8459d2378..eccf6df2d 100644 --- a/tests/integration/psync2.tcl +++ b/tests/integration/psync2.tcl @@ -39,6 +39,7 @@ proc show_cluster_status {} { # all the lists are empty. # # regexp {^[0-9]+:[A-Z] [0-9]+ [A-z]+ [0-9]+ ([0-9:.]+) .*} $l - logdate + catch { while 1 { # Find the log with smallest time. set empty 0 @@ -67,6 +68,7 @@ proc show_cluster_status {} { puts "\[$best port $R_port($best)\] [lindex $log($best) 0]" set log($best) [lrange $log($best) 1 end] } + } } } From 6a65b8bbaa318429c69dadd852f62fb6364414fd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 15 Jun 2021 23:13:49 +0000 Subject: [PATCH 55/75] Optimized use of repl_lowest_off to reduce lock contention Former-commit-id: 30a957e5399fe94675f0b6d2d34c24112d5a9734 --- src/multi.cpp | 1 - src/replication.cpp | 34 ++++++++-------------------------- 2 files changed, 8 insertions(+), 27 deletions(-) diff --git a/src/multi.cpp b/src/multi.cpp index 589dba589..1b91a05a0 100644 --- a/src/multi.cpp +++ b/src/multi.cpp @@ -267,7 +267,6 @@ void execCommand(client *c) { * backlog with the final EXEC. 
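The EXEC literal these multi.cpp hunks feed into the backlog, "*1\r\n$4\r\nEXEC\r\n", is simply the RESP encoding of a one-element command array. A tiny generic encoder that produces the same wire format for any command; this helper is illustrative and does not exist in the codebase:

#include <cstdio>
#include <initializer_list>
#include <string>

// RESP: "*<argc>\r\n" followed, per argument, by "$<len>\r\n<arg>\r\n".
static std::string respCommand(std::initializer_list<std::string> args) {
    std::string out = "*" + std::to_string(args.size()) + "\r\n";
    for (const auto &a : args)
        out += "$" + std::to_string(a.size()) + "\r\n" + a + "\r\n";
    return out;
}

int main() {
    // Produces "*1\r\n$4\r\nEXEC\r\n", the literal propagated above.
    std::printf("%s", respCommand({"EXEC"}).c_str());
}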
*/ if (g_pserver->repl_backlog && was_master && !is_master) { const char *execcmd = "*1\r\n$4\r\nEXEC\r\n"; - updateLowestOffsetAmongReplicas(); feedReplicationBacklog(execcmd,strlen(execcmd)); } afterPropagateExec(); diff --git a/src/replication.cpp b/src/replication.cpp index a5f9c3acf..cb0b562b1 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -56,29 +56,6 @@ void putSlaveOnline(client *replica); int cancelReplicationHandshake(redisMaster *mi, int reconnect); static void propagateMasterStaleKeys(); -/* gets the lowest offset amongst all of the replicas and stores it globally*/ -void updateLowestOffsetAmongReplicas(){ - serverAssert(GlobalLocksAcquired()); - serverAssert(!g_pserver->repl_backlog_lock.fOwnLock()); - long long min_offset = LONG_LONG_MAX; - listIter li; - listNode *ln; - listRewind(g_pserver->slaves, &li); - // check for potential overflow first - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - - if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; - if (replica->flags & CLIENT_CLOSE_ASAP) continue; - - std::unique_lock ul(replica->lock); - - min_offset = std::min(min_offset, replica->repl_curr_off); - } - /* return -1 if no other minimum was found */ - g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? -1 : min_offset, std::memory_order_seq_cst); -} - /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case * the instance is configured to have no persistence. */ @@ -323,6 +300,10 @@ void feedReplicationBacklog(const void *ptr, size_t len) { long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { flushReplBacklogToClients(); + lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); + if (lower_bound == -1) + lower_bound = g_pserver->repl_batch_offStart; + minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; serverLog(LL_NOTICE, "minimumsize: %lld, g_pserver->master_repl_offset: %lld, len: %lu, lower_bound: %lld", @@ -494,7 +475,6 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) serverAssert(!(listLength(slaves) != 0 && g_pserver->repl_backlog == NULL)); bool fSendRaw = !g_pserver->fActiveReplica; - updateLowestOffsetAmongReplicas(); /* Send SELECT command to every replica if needed. 
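The lower bound re-read above comes from g_pserver->repl_lowest_off, an atomic low-water mark over all replica offsets that flushReplBacklogToClients() now refreshes as a side effect of its per-replica loop instead of via a separate walk of the replica list. A minimal sketch of that producer side, with a plain vector standing in for the server's replica list and illustrative field names:

#include <algorithm>
#include <atomic>
#include <climits>
#include <cstdio>
#include <vector>

struct Replica {
    long long repl_curr_off;   // what this replica has been sent so far
    long long repl_end_off;    // what it may read from the backlog
    bool pending_write;
};

std::atomic<long long> repl_lowest_off{-1};  // stands in for g_pserver->repl_lowest_off

// Extend every replica's readable window to the new end of the stream and
// record the minimum offset seen; -1 means there was no replica to track.
static void flushBacklogToReplicas(std::vector<Replica> &replicas,
                                   long long master_repl_offset) {
    long long min_offset = LLONG_MAX;
    for (auto &r : replicas) {
        min_offset = std::min(min_offset, r.repl_curr_off);
        r.repl_end_off = master_repl_offset;
        if (!r.pending_write) r.pending_write = true;  // prepareClientToWrite() in the patch
    }
    repl_lowest_off.store(min_offset == LLONG_MAX ? -1 : min_offset,
                          std::memory_order_seq_cst);
}

int main() {
    std::vector<Replica> reps = {{100, 100, false}, {40, 100, true}, {70, 100, false}};
    flushBacklogToReplicas(reps, 120);
    std::printf("lowest replica offset: %lld\n", repl_lowest_off.load());
}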
*/ if (g_pserver->replicaseldb != dictid) { @@ -656,7 +636,6 @@ void replicationFeedSlavesFromMasterStream(char *buf, size_t buflen) { } if (g_pserver->repl_backlog){ - updateLowestOffsetAmongReplicas(); feedReplicationBacklog(buf,buflen); } } @@ -4975,6 +4954,7 @@ void flushReplBacklogToClients() if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { bool fAsyncWrite = false; + long long min_offset = LONG_LONG_MAX; // Ensure no overflow serverAssert(g_pserver->repl_batch_offStart < g_pserver->master_repl_offset); serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); @@ -4998,6 +4978,8 @@ void flushReplBacklogToClients() /* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */ serverAssert(replica->repl_curr_off != -1); + min_offset = std::min(min_offset, replica->repl_curr_off); + replica->repl_end_off = g_pserver->master_repl_offset; /* Only if the there isn't already a pending write do we prepare the client to write */ @@ -5014,7 +4996,7 @@ void flushReplBacklogToClients() // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; - updateLowestOffsetAmongReplicas(); + g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? -1 : min_offset, std::memory_order_seq_cst); } } From 29f4c661799107ed6db8168ecb297b1e0b64f575 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 16 Jun 2021 19:41:55 +0000 Subject: [PATCH 56/75] More code cleanup Former-commit-id: 8e9962b9b7b9093399451bf93d30e5b5d26e3d33 --- src/evict.cpp | 2 ++ src/networking.cpp | 52 +++++++++++++++------------------------------ src/replication.cpp | 51 ++++++++++++++++++-------------------------- src/server.h | 14 ++++++------ 4 files changed, 47 insertions(+), 72 deletions(-) diff --git a/src/evict.cpp b/src/evict.cpp index ba426f0ee..d336bc8b8 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -354,6 +354,8 @@ unsigned long LFUDecrAndReturn(robj_roptr o) { return counter; } +unsigned long getClientReplicationBacklogSharedUsage(client *c); + /* We don't want to count AOF buffers and slaves output buffers as * used memory: the eviction should use mostly data size. This function * returns the sum of AOF and slaves buffer. */ diff --git a/src/networking.cpp b/src/networking.cpp index 07312a9ee..767fe9c2b 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1765,15 +1765,7 @@ client *lookupClientByID(uint64_t id) { return (c == raxNotFound) ? NULL : c; } -/* Compute the corresponding index from a replication backlog offset - * by taking the distance between the input offset and the replication backlog offset - * and applying that to the replication backlog index, wrapping around if the index - * becomes negative. - * TODO: Rewrite comment for new logic */ -long long getReplIndexFromOffset(long long offset){ - long long index = (offset - g_pserver->repl_backlog_start) % g_pserver->repl_backlog_size; - return index; -} +long long getReplIndexFromOffset(long long offset); /* Write data in output buffers to client. 
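getReplIndexFromOffset() reduces an absolute replication offset to a position inside the backlog buffer relative to where the backlog started, i.e. index = (offset - repl_backlog_start) % repl_backlog_size. A worked standalone example of that mapping, including the wrap past the end of the buffer; the constants are made up for illustration:

#include <cassert>
#include <cstdio>

// index = (offset - backlog_start) % backlog_size; only meaningful while the
// offset is still covered by the backlog.
static long long replIndexFromOffset(long long offset,
                                     long long backlog_start,
                                     long long backlog_size) {
    assert(offset >= backlog_start);
    return (offset - backlog_start) % backlog_size;
}

int main() {
    const long long start = 1000, size = 64;
    std::printf("%lld\n", replIndexFromOffset(1000, start, size)); // 0
    std::printf("%lld\n", replIndexFromOffset(1063, start, size)); // 63
    std::printf("%lld\n", replIndexFromOffset(1064, start, size)); // wraps back to 0
}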
Return C_OK if the client * is still valid after the call, C_ERR if it was freed because of some @@ -1832,35 +1824,31 @@ int writeToClient(client *c, int handler_installed) { } } /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT - * bytes, in a single threaded server it's a good idea to serve - * other clients as well, even if a very large request comes from - * super fast link that is always able to accept data (in real world - * scenario think about 'KEYS *' against the loopback interface). - * - * However if we are over the maxmemory limit we ignore that and - * just deliver as much data as it is possible to deliver. - * - * Moreover, we also send as much as possible if the client is - * a replica or a monitor (otherwise, on high-speed traffic, the - * replication/output buffer will grow indefinitely) */ + * bytes, in a single threaded server it's a good idea to serve + * other clients as well, even if a very large request comes from + * super fast link that is always able to accept data (in real world + * scenario think about 'KEYS *' against the loopback interface). + * + * However if we are over the maxmemory limit we ignore that and + * just deliver as much data as it is possible to deliver. + * + * Moreover, we also send as much as possible if the client is + * a replica or a monitor (otherwise, on high-speed traffic, the + * replication/output buffer will grow indefinitely) */ if (totwritten > NET_MAX_WRITES_PER_EVENT && (g_pserver->maxmemory == 0 || - zmalloc_used_memory() < g_pserver->maxmemory) && + zmalloc_used_memory() < g_pserver->maxmemory) && !(c->flags & CLIENT_SLAVE)) break; } /* We can only directly read from the replication backlog if the client is a replica, so only attempt to do so if that's the case. */ if (c->flags & CLIENT_SLAVE && !(c->flags & CLIENT_MONITOR)) { - /* For replicas, we don't store all the information in the client buffer - * We always read from the replication backlog directly */ + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); - - // serverLog(LL_NOTICE, "written to handler"); - long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); - serverAssert(c->repl_curr_off != -1); + if (c->repl_curr_off != c->repl_end_off){ long long repl_curr_idx = getReplIndexFromOffset(c->repl_curr_off); long long nwritten2ndStage = 0; /* How much was written from the start of the replication backlog @@ -1884,14 +1872,9 @@ int writeToClient(client *c, int handler_installed) { totwritten += nwritten; c->repl_curr_off += nwritten; serverAssert(c->repl_curr_off <= c->repl_end_off); - /* If the client offset matches the global offset, we wrote all we needed to, - * in which case, there is no pending write */ - + /* If the client's current offset matches the last offset it can read from, there is no pending write */ if (c->repl_curr_off == c->repl_end_off){ - // serverLog(LL_NOTICE, "Successfully wrote up until %lld", c->repl_end_off); c->fPendingReplicaWrite = false; - } else { - // serverLog(LL_NOTICE, "Wrote to %lld out of %lld", c->repl_curr_off, c->repl_end_off); } } @@ -3719,8 +3702,7 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { } } -/* In the case of a replica client, it is possible (and very likely) - * that writes to said replica are using data from the replication backlog +/* In the case of a replica client, writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long 
getClientReplicationBacklogSharedUsage(client *c) { return (!(c->flags & CLIENT_SLAVE) || !c->fPendingReplicaWrite ) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; diff --git a/src/replication.cpp b/src/replication.cpp index cb0b562b1..b9465680e 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -47,8 +47,6 @@ #include #include -#define BYPASS_BUFFER - void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, connection *conn); void replicationSendAck(redisMaster *mi); @@ -61,8 +59,6 @@ static void propagateMasterStaleKeys(); * the instance is configured to have no persistence. */ int RDBGeneratedByReplication = 0; -void resizeReplicationBacklogForClients(long long newsize); - /* --------------------------- Utility functions ---------------------------- */ /* Return the pointer to a string representing the replica ip:listening_port @@ -205,7 +201,14 @@ void createReplicationBacklog(void) { g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; } -long long getReplIndexFromOffset(long long offset); +/* Compute the corresponding index from a replication backlog offset + * Since this computation needs the size of the replication backlog, + * you need to have the repl_backlog_lock in order to call it */ +long long getReplIndexFromOffset(long long offset){ + serverAssert(g_pserver->repl_backlog_lock.fOwnLock()); + long long index = (offset - g_pserver->repl_backlog_start) % g_pserver->repl_backlog_size; + return index; +} /* This function is called when the user modifies the replication backlog * size at runtime. It is up to the function to both update the @@ -293,7 +296,7 @@ void feedReplicationBacklog(const void *ptr, size_t len) { if (g_pserver->repl_batch_idxStart >= 0) { - /* we are lower bounded by the lower client offset or the offStart if all the clients are up to date */ + /* We are lower bounded by the lowest replica offset, or the batch offset start if not applicable */ long long lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); if (lower_bound == -1) lower_bound = g_pserver->repl_batch_offStart; @@ -306,9 +309,6 @@ void feedReplicationBacklog(const void *ptr, size_t len) { minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; - serverLog(LL_NOTICE, "minimumsize: %lld, g_pserver->master_repl_offset: %lld, len: %lu, lower_bound: %lld", - minimumsize, g_pserver->master_repl_offset, len, lower_bound); - if (minimumsize > g_pserver->repl_backlog_size) { // This is an emergency overflow, we better resize to fit long long newsize = std::max(g_pserver->repl_backlog_size*2, minimumsize); @@ -635,9 +635,7 @@ void replicationFeedSlavesFromMasterStream(char *buf, size_t buflen) { printf("\n"); } - if (g_pserver->repl_backlog){ - feedReplicationBacklog(buf,buflen); - } + if (g_pserver->repl_backlog) feedReplicationBacklog(buf,buflen); } void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, int argc) { @@ -689,13 +687,12 @@ int prepareClientToWrite(client *c); /* Feed the replica 'c' with the replication backlog starting from the * specified 'offset' up to the end of the backlog. 
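 * Note that no backlog bytes are copied into the reply buffer anymore: the
 * function only records where the replica should start reading (repl_curr_off)
 * and where it may read up to (repl_end_off), then queues the client so that
 * writeToClient() can stream that range directly from the backlog.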
*/ long long addReplyReplicationBacklog(client *c, long long offset) { - long long j, skip, len; + long long skip, len; serverLog(LL_DEBUG, "[PSYNC] Replica request offset: %lld", offset); if (g_pserver->repl_backlog_histlen == 0) { serverLog(LL_DEBUG, "[PSYNC] Backlog history len is zero"); - serverLog(LL_NOTICE, "REOAD TO RESIST"); c->repl_curr_off = g_pserver->master_repl_offset; c->repl_end_off = g_pserver->master_repl_offset; return 0; @@ -714,30 +711,20 @@ long long addReplyReplicationBacklog(client *c, long long offset) { skip = offset - g_pserver->repl_backlog_off; serverLog(LL_DEBUG, "[PSYNC] Skipping: %lld", skip); - /* Point j to the oldest byte, that is actually our - * g_pserver->repl_backlog_off byte. */ - j = (g_pserver->repl_backlog_idx + - (g_pserver->repl_backlog_size-g_pserver->repl_backlog_histlen)) % - g_pserver->repl_backlog_size; - serverLog(LL_DEBUG, "[PSYNC] Index of first byte: %lld", j); - - /* Discard the amount of data to seek to the specified 'offset'. */ - j = (j + skip) % g_pserver->repl_backlog_size; - - /* Feed replica with data. Since it is a circular buffer we have to - * split the reply in two parts if we are cross-boundary. */ len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); + /* Set the start and end offsets for the replica so that a future + * writeToClient will send the backlog from the given offset to + * the current end of the backlog to said replica */ c->repl_curr_off = offset - 1; - // serverLog(LL_NOTICE, "Client %s, replica offset %lld in psync", replicationGetSlaveName(c), c->repl_curr_off); c->repl_end_off = g_pserver->master_repl_offset; /* Force the partial sync to be queued */ prepareClientToWrite(c); c->fPendingReplicaWrite = true; - return g_pserver->repl_backlog_histlen - skip; + return len; } /* Return the offset to provide as reply to the PSYNC command received @@ -4963,14 +4950,18 @@ void flushReplBacklogToClients() listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); + /* We don't actually write any data in this function since we send data + * directly from the replication backlog to replicas in writeToClient. + * + * What we do however, is set the end offset of each replica here. This way, + * future calls to writeToClient will know up to where in the replication + * backlog is valid for writing. */ while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); if (!canFeedReplicaReplBuffer(replica)) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; - // serverLog(LL_NOTICE, "Client %s, replica offset %lld", replicationGetSlaveName(replica), replica->repl_curr_off); - std::unique_lock ul(replica->lock); if (!FCorrectThread(replica)) fAsyncWrite = true; diff --git a/src/server.h b/src/server.h index cb3973969..0d6f766ce 100644 --- a/src/server.h +++ b/src/server.h @@ -1590,9 +1590,11 @@ struct client { copying this replica output buffer should use. */ - long long repl_curr_off = -1; /* Replication offset of the client, only if it's a replica*/ - long long repl_end_off = -1; /* Replication offset to write to */ - int fPendingReplicaWrite; + long long repl_curr_off = -1;/* Replication offset of the replica, also where in the backlog we need to start from + * when sending data to this replica. */ + long long repl_end_off = -1; /* Replication offset to write to, stored in the replica, as opposed to using the global offset + * to prevent needing the global lock */ + int fPendingReplicaWrite; /* Is there a write queued for this replica? 
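                                  * Set when a backlog range is queued for the replica and
                                  * cleared by writeToClient() once repl_curr_off has caught
                                  * up to repl_end_off.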
*/ char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ int slave_listening_port; /* As configured with: REPLCONF listening-port */ @@ -2375,8 +2377,8 @@ struct redisServer { int repl_diskless_load; /* Slave parse RDB directly from the socket. * see REPL_DISKLESS_LOAD_* enum */ int repl_diskless_sync_delay; /* Delay to start a diskless repl BGSAVE. */ - std::atomic repl_lowest_off; /* The lowest offset amongst all clients - Updated before calls to feed the replication backlog */ + std::atomic repl_lowest_off; /* The lowest offset amongst all replicas + -1 if there are no replicas */ /* Replication (replica) */ list *masters; int enable_multimaster; @@ -2825,7 +2827,6 @@ sds getAllClientsInfoString(int type); void rewriteClientCommandVector(client *c, int argc, ...); void rewriteClientCommandArgument(client *c, int i, robj *newval); void replaceClientCommandVector(client *c, int argc, robj **argv); -unsigned long getClientReplicationBacklogSharedUsage(client *c); unsigned long getClientOutputBufferMemoryUsage(client *c); int freeClientsInAsyncFreeQueue(int iel); void asyncCloseClientOnOutputBufferLimitReached(client *c); @@ -3017,7 +3018,6 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, void rdbPipeWriteHandlerConnRemoved(struct connection *conn); void replicationNotifyLoadedKey(redisDb *db, robj_roptr key, robj_roptr val, long long expire); void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long long expire); -void updateLowestOffsetAmongReplicas(void); void clearFailoverState(void); void updateFailoverStatus(void); void abortFailover(redisMaster *mi, const char *err); From 815ebe1e6b0b7ad13db30dc342e8d6cb92330651 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 25 Jun 2021 01:54:38 +0000 Subject: [PATCH 57/75] Remove fPendingReplicaWrite flag which can instead be calculated on demand Former-commit-id: ae26afd13f955eb230b5c2cab20ec90db9b714ad --- src/networking.cpp | 128 +++++++++++++++++++++----------------------- src/replication.cpp | 8 +-- src/server.h | 5 +- 3 files changed, 67 insertions(+), 74 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 767fe9c2b..690b03a51 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -158,7 +158,6 @@ client *createClient(connection *conn, int iel) { c->flags = 0; c->fPendingAsyncWrite = FALSE; c->fPendingAsyncWriteHandler = FALSE; - c->fPendingReplicaWrite = FALSE; c->ctime = c->lastinteraction = g_pserver->unixtime; /* If the default user does not require authentication, the user is * directly authenticated. */ @@ -318,7 +317,7 @@ int prepareClientToWrite(client *c) { /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ - if (!fAsync && !clientHasPendingReplies(c) && !c->fPendingReplicaWrite) clientInstallWriteHandler(c); + if (!fAsync && (c->flags & CLIENT_SLAVE || !clientHasPendingReplies(c))) clientInstallWriteHandler(c); if (fAsync && !(c->fPendingAsyncWrite)) clientInstallAsyncWriteHandler(c); /* Authorize the caller to queue in the output buffer of this client. */ @@ -1132,7 +1131,7 @@ void copyClientOutputBuffer(client *dst, client *src) { /* Return true if the specified client has pending reply buffers to write to * the socket. 
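 * For a replica this also counts a replication backlog range that still has to
 * be streamed out to it (FPendingReplicaWrite()).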
*/ int clientHasPendingReplies(client *c) { - return (c->bufpos || listLength(c->reply)); + return (c->bufpos || listLength(c->reply) || c->FPendingReplicaWrite()); } static std::atomic rgacceptsInFlight[MAX_EVENT_LOOPS]; @@ -1785,66 +1784,9 @@ int writeToClient(client *c, int handler_installed) { std::unique_locklock)> lock(c->lock); - while(clientHasPendingReplies(c)) { - serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR); - if (c->bufpos > 0) { - nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); - if (nwritten <= 0) break; - c->sentlen += nwritten; - totwritten += nwritten; - - /* If the buffer was sent, set bufpos to zero to continue with - * the remainder of the reply. */ - if ((int)c->sentlen == c->bufpos) { - c->bufpos = 0; - c->sentlen = 0; - } - } else { - o = (clientReplyBlock*)listNodeValue(listFirst(c->reply)); - if (o->used == 0) { - c->reply_bytes -= o->size; - listDelNode(c->reply,listFirst(c->reply)); - continue; - } - - nwritten = connWrite(c->conn, o->buf() + c->sentlen, o->used - c->sentlen); - if (nwritten <= 0) break; - c->sentlen += nwritten; - totwritten += nwritten; - - /* If we fully sent the object on head go to the next one */ - if (c->sentlen == o->used) { - c->reply_bytes -= o->size; - listDelNode(c->reply,listFirst(c->reply)); - c->sentlen = 0; - /* If there are no longer objects in the list, we expect - * the count of reply bytes to be exactly zero. */ - if (listLength(c->reply) == 0) - serverAssert(c->reply_bytes == 0); - } - } - /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT - * bytes, in a single threaded server it's a good idea to serve - * other clients as well, even if a very large request comes from - * super fast link that is always able to accept data (in real world - * scenario think about 'KEYS *' against the loopback interface). - * - * However if we are over the maxmemory limit we ignore that and - * just deliver as much data as it is possible to deliver. - * - * Moreover, we also send as much as possible if the client is - * a replica or a monitor (otherwise, on high-speed traffic, the - * replication/output buffer will grow indefinitely) */ - if (totwritten > NET_MAX_WRITES_PER_EVENT && - (g_pserver->maxmemory == 0 || - zmalloc_used_memory() < g_pserver->maxmemory) && - !(c->flags & CLIENT_SLAVE)) break; - } - /* We can only directly read from the replication backlog if the client is a replica, so only attempt to do so if that's the case. 
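       In sketch form, the branch below maps both offsets into the circular
       buffer and issues at most two connWrite() calls (names here are only
       illustrative shorthand for the repl_curr_idx/repl_end_idx variables):

           start = getReplIndexFromOffset(c->repl_curr_off);
           end   = getReplIndexFromOffset(c->repl_end_off);
           if (end >= start)
               write backlog[start, end)                          // contiguous
           else
               write backlog[start, size), then backlog[0, end)   // wrapped

       and then advances repl_curr_off by however many bytes were actually written.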
*/ if (c->flags & CLIENT_SLAVE && !(c->flags & CLIENT_MONITOR)) { - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); serverAssert(c->repl_curr_off != -1); @@ -1872,15 +1814,67 @@ int writeToClient(client *c, int handler_installed) { totwritten += nwritten; c->repl_curr_off += nwritten; serverAssert(c->repl_curr_off <= c->repl_end_off); - /* If the client's current offset matches the last offset it can read from, there is no pending write */ - if (c->repl_curr_off == c->repl_end_off){ - c->fPendingReplicaWrite = false; - } } /* If the second part of a write didn't go through, we still need to register that */ if (nwritten2ndStage == -1) nwritten = -1; } + } else { + while(clientHasPendingReplies(c)) { + serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR); + if (c->bufpos > 0) { + nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); + if (nwritten <= 0) break; + c->sentlen += nwritten; + totwritten += nwritten; + + /* If the buffer was sent, set bufpos to zero to continue with + * the remainder of the reply. */ + if ((int)c->sentlen == c->bufpos) { + c->bufpos = 0; + c->sentlen = 0; + } + } else { + o = (clientReplyBlock*)listNodeValue(listFirst(c->reply)); + if (o->used == 0) { + c->reply_bytes -= o->size; + listDelNode(c->reply,listFirst(c->reply)); + continue; + } + + nwritten = connWrite(c->conn, o->buf() + c->sentlen, o->used - c->sentlen); + if (nwritten <= 0) break; + c->sentlen += nwritten; + totwritten += nwritten; + + /* If we fully sent the object on head go to the next one */ + if (c->sentlen == o->used) { + c->reply_bytes -= o->size; + listDelNode(c->reply,listFirst(c->reply)); + c->sentlen = 0; + /* If there are no longer objects in the list, we expect + * the count of reply bytes to be exactly zero. */ + if (listLength(c->reply) == 0) + serverAssert(c->reply_bytes == 0); + } + } + /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT + * bytes, in a single threaded server it's a good idea to serve + * other clients as well, even if a very large request comes from + * super fast link that is always able to accept data (in real world + * scenario think about 'KEYS *' against the loopback interface). + * + * However if we are over the maxmemory limit we ignore that and + * just deliver as much data as it is possible to deliver. + * + * Moreover, we also send as much as possible if the client is + * a replica or a monitor (otherwise, on high-speed traffic, the + * replication/output buffer will grow indefinitely) */ + if (totwritten > NET_MAX_WRITES_PER_EVENT && + (g_pserver->maxmemory == 0 || + zmalloc_used_memory() < g_pserver->maxmemory) && + !(c->flags & CLIENT_SLAVE)) break; + } } g_pserver->stat_net_output_bytes += totwritten; @@ -1900,7 +1894,7 @@ int writeToClient(client *c, int handler_installed) { * We just rely on data / pings received for timeout detection. */ if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } - if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) { + if (!clientHasPendingReplies(c)) { c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -2080,7 +2074,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. 
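 * (clientHasPendingReplies() now reports a replica's pending backlog range as
 * well, so the explicit fPendingReplicaWrite check is dropped below.)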
*/ - if (clientHasPendingReplies(c) || c->fPendingReplicaWrite) { + if (clientHasPendingReplies(c)) { if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) { freeClientAsync(c); } @@ -3705,7 +3699,7 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { /* In the case of a replica client, writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long getClientReplicationBacklogSharedUsage(client *c) { - return (!(c->flags & CLIENT_SLAVE) || !c->fPendingReplicaWrite ) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; + return (!(c->flags & CLIENT_SLAVE) || !c->FPendingReplicaWrite() ) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; } /* This function returns the number of bytes that Redis is diff --git a/src/replication.cpp b/src/replication.cpp index b9465680e..94b35e314 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -722,7 +722,6 @@ long long addReplyReplicationBacklog(client *c, long long offset) { /* Force the partial sync to be queued */ prepareClientToWrite(c); - c->fPendingReplicaWrite = true; return len; } @@ -4974,11 +4973,8 @@ void flushReplBacklogToClients() replica->repl_end_off = g_pserver->master_repl_offset; /* Only if the there isn't already a pending write do we prepare the client to write */ - if (!replica->fPendingReplicaWrite){ - serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); - prepareClientToWrite(replica); - replica->fPendingReplicaWrite = true; - } + serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); + prepareClientToWrite(replica); } if (fAsyncWrite) diff --git a/src/server.h b/src/server.h index 0d6f766ce..07608632e 100644 --- a/src/server.h +++ b/src/server.h @@ -1594,7 +1594,6 @@ struct client { * when sending data to this replica. */ long long repl_end_off = -1; /* Replication offset to write to, stored in the replica, as opposed to using the global offset * to prevent needing the global lock */ - int fPendingReplicaWrite; /* Is there a write queued for this replica? */ char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ int slave_listening_port; /* As configured with: REPLCONF listening-port */ @@ -1657,6 +1656,10 @@ struct client { robj **argv; size_t argv_len_sumActive = 0; + bool FPendingReplicaWrite() const { + return repl_curr_off != repl_end_off; + } + // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); size_t argv_len_sum() const; From e6a82692b7be9d62a619f9968e0f9ae5f90ca71e Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 25 Jun 2021 02:31:17 +0000 Subject: [PATCH 58/75] Avoid holding the lockPendingWrite for too long and deadlocking due to lock inversion Former-commit-id: a4b49fbec60e2333a4407d24383ae204d5d2b413 --- src/networking.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/networking.cpp b/src/networking.cpp index 690b03a51..5ced371d1 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -2018,7 +2018,6 @@ void ProcessPendingAsyncWrites() * need to use a syscall in order to install the writable event handler, * get it called, and so forth. 
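 * The per-thread lockPendingWrite is now held only while the pending-write list
 * is moved out of the shared vector; it is released before any client lock is
 * taken, so the lock order can no longer invert and deadlock.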
*/ int handleClientsWithPendingWrites(int iel, int aof_state) { - std::unique_lock lockf(g_pserver->rgthreadvar[iel].lockPendingWrite); int processed = 0; serverAssert(iel == (serverTL - g_pserver->rgthreadvar)); @@ -2041,7 +2040,9 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { ae_flags |= AE_BARRIER; } + std::unique_lock lockf(g_pserver->rgthreadvar[iel].lockPendingWrite); auto vec = std::move(g_pserver->rgthreadvar[iel].clients_pending_write); + lockf.unlock(); processed += (int)vec.size(); for (client *c : vec) { From 5949e253cab606c0bd7616e00c42e7ebcfca872a Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 25 Jun 2021 02:46:32 +0000 Subject: [PATCH 59/75] remove unnecessary newline Former-commit-id: 532af9cd0286ac6ece6f401c42aea18e36d16f7c --- src/replication.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/replication.cpp b/src/replication.cpp index 94b35e314..e9a503167 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -4975,7 +4975,6 @@ void flushReplBacklogToClients() /* Only if the there isn't already a pending write do we prepare the client to write */ serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); prepareClientToWrite(replica); - } if (fAsyncWrite) ProcessPendingAsyncWrites(); From 0e953fb91612392c0a7ac86c3427d230ebd00b85 Mon Sep 17 00:00:00 2001 From: vivek Date: Fri, 25 Jun 2021 03:10:56 +0000 Subject: [PATCH 60/75] Primitive implementation of bypassing client buffer, stats are all messed up and print statements everywhere Former-commit-id: 59b2ae8ff451f8a5ac2f3baf3c7b509f6872895e --- src/evict.cpp | 10 ++- src/networking.cpp | 115 +++++++++++++++++++++++------ src/replication.cpp | 155 +++++++++++++++++++++++++-------------- src/server.cpp | 2 +- src/server.h | 17 +++++ tests/unit/maxmemory.tcl | 25 ++++--- 6 files changed, 231 insertions(+), 93 deletions(-) diff --git a/src/evict.cpp b/src/evict.cpp index 009713d73..d336bc8b8 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -354,6 +354,8 @@ unsigned long LFUDecrAndReturn(robj_roptr o) { return counter; } +unsigned long getClientReplicationBacklogSharedUsage(client *c); + /* We don't want to count AOF buffers and slaves output buffers as * used memory: the eviction should use mostly data size. This function * returns the sum of AOF and slaves buffer. 
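 * Replica output can now live in the shared replication backlog, so the shared
 * portion is subtracted per replica below and the backlog size itself is
 * counted once on its own.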
*/ @@ -370,9 +372,15 @@ size_t freeMemoryGetNotCountedMemory(void) { while((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); std::unique_lock(replica->lock); - overhead += getClientOutputBufferMemoryUsage(replica); + /* we don't wish to multiple count the replication backlog shared usage */ + overhead += (getClientOutputBufferMemoryUsage(replica) - getClientReplicationBacklogSharedUsage(replica)); } } + + /* also don't count the replication backlog memory + * that's where the replication clients get their memory from */ + overhead += g_pserver->repl_backlog_size; + if (g_pserver->aof_state != AOF_OFF) { overhead += sdsalloc(g_pserver->aof_buf)+aofRewriteBufferSize(); } diff --git a/src/networking.cpp b/src/networking.cpp index 1c0644ec0..767fe9c2b 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -136,6 +136,7 @@ client *createClient(connection *conn, int iel) { client_id = g_pserver->next_client_id.fetch_add(1); c->iel = iel; c->id = client_id; + sprintf(c->lock.szName, "client %lu", client_id); c->resp = 2; c->conn = conn; c->name = NULL; @@ -157,6 +158,7 @@ client *createClient(connection *conn, int iel) { c->flags = 0; c->fPendingAsyncWrite = FALSE; c->fPendingAsyncWriteHandler = FALSE; + c->fPendingReplicaWrite = FALSE; c->ctime = c->lastinteraction = g_pserver->unixtime; /* If the default user does not require authentication, the user is * directly authenticated. */ @@ -234,6 +236,7 @@ void clientInstallWriteHandler(client *c) { /* Schedule the client to write the output buffers to the socket only * if not already done and, for slaves, if the replica can actually receive * writes at this stage. */ + if (!(c->flags & CLIENT_PENDING_WRITE) && (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) @@ -315,7 +318,7 @@ int prepareClientToWrite(client *c) { /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ - if (!fAsync && !clientHasPendingReplies(c)) clientInstallWriteHandler(c); + if (!fAsync && !clientHasPendingReplies(c) && !c->fPendingReplicaWrite) clientInstallWriteHandler(c); if (fAsync && !(c->fPendingAsyncWrite)) clientInstallAsyncWriteHandler(c); /* Authorize the caller to queue in the output buffer of this client. */ @@ -1762,6 +1765,8 @@ client *lookupClientByID(uint64_t id) { return (c == raxNotFound) ? NULL : c; } +long long getReplIndexFromOffset(long long offset); + /* Write data in output buffers to client. Return C_OK if the client * is still valid after the call, C_ERR if it was freed because of some * error. If handler_installed is set, it will attempt to clear the @@ -1779,8 +1784,9 @@ int writeToClient(client *c, int handler_installed) { serverAssertDebug(FCorrectThread(c)); std::unique_locklock)> lock(c->lock); - + while(clientHasPendingReplies(c)) { + serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR); if (c->bufpos > 0) { nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); if (nwritten <= 0) break; @@ -1788,7 +1794,7 @@ int writeToClient(client *c, int handler_installed) { totwritten += nwritten; /* If the buffer was sent, set bufpos to zero to continue with - * the remainder of the reply. */ + * the remainder of the reply. 
*/ if ((int)c->sentlen == c->bufpos) { c->bufpos = 0; c->sentlen = 0; @@ -1834,7 +1840,49 @@ int writeToClient(client *c, int handler_installed) { zmalloc_used_memory() < g_pserver->maxmemory) && !(c->flags & CLIENT_SLAVE)) break; } - + + /* We can only directly read from the replication backlog if the client + is a replica, so only attempt to do so if that's the case. */ + if (c->flags & CLIENT_SLAVE && !(c->flags & CLIENT_MONITOR)) { + + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); + serverAssert(c->repl_curr_off != -1); + + if (c->repl_curr_off != c->repl_end_off){ + long long repl_curr_idx = getReplIndexFromOffset(c->repl_curr_off); + long long nwritten2ndStage = 0; /* How much was written from the start of the replication backlog + * in the event of a wrap around write */ + /* normal case with no wrap around */ + if (repl_end_idx >= repl_curr_idx){ + nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, repl_end_idx - repl_curr_idx); + /* wrap around case */ + } else { + nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, g_pserver->repl_backlog_size - repl_curr_idx); + /* only attempt wrapping if we write the correct number of bytes */ + if (nwritten == g_pserver->repl_backlog_size - repl_curr_idx){ + nwritten2ndStage = connWrite(c->conn, g_pserver->repl_backlog, repl_end_idx); + if (nwritten2ndStage != -1) + nwritten += nwritten2ndStage; + } + } + + /* only increment bytes if an error didn't occur */ + if (nwritten > 0){ + totwritten += nwritten; + c->repl_curr_off += nwritten; + serverAssert(c->repl_curr_off <= c->repl_end_off); + /* If the client's current offset matches the last offset it can read from, there is no pending write */ + if (c->repl_curr_off == c->repl_end_off){ + c->fPendingReplicaWrite = false; + } + } + + /* If the second part of a write didn't go through, we still need to register that */ + if (nwritten2ndStage == -1) nwritten = -1; + } + } + g_pserver->stat_net_output_bytes += totwritten; if (nwritten == -1) { if (connGetState(c->conn) != CONN_STATE_CONNECTED) { @@ -1852,7 +1900,7 @@ int writeToClient(client *c, int handler_installed) { * We just rely on data / pings received for timeout detection. 
*/ if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } - if (!clientHasPendingReplies(c)) { + if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) { c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -1896,27 +1944,37 @@ void ProcessPendingAsyncWrites() serverAssert(c->fPendingAsyncWrite); if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_CLOSE_AFTER_REPLY)) { - zfree(c->replyAsync); - c->replyAsync = nullptr; + if (c->replyAsync != nullptr){ + zfree(c->replyAsync); + c->replyAsync = nullptr; + } c->fPendingAsyncWrite = FALSE; continue; } - int size = c->replyAsync->used; + /* since writes from master to replica can come directly from the replication backlog, + * writes may have been signalled without having been copied to the replyAsync buffer, + * thus causing the buffer to be NULL */ + if (c->replyAsync != nullptr){ + int size = c->replyAsync->used; - if (listLength(c->reply) == 0 && size <= (PROTO_REPLY_CHUNK_BYTES - c->bufpos)) { - memcpy(c->buf + c->bufpos, c->replyAsync->buf(), size); - c->bufpos += size; - } else { - c->reply_bytes += c->replyAsync->size; - listAddNodeTail(c->reply, c->replyAsync); + if (listLength(c->reply) == 0 && size <= (PROTO_REPLY_CHUNK_BYTES - c->bufpos)) { + memcpy(c->buf + c->bufpos, c->replyAsync->buf(), size); + c->bufpos += size; + } else { + c->reply_bytes += c->replyAsync->size; + listAddNodeTail(c->reply, c->replyAsync); + c->replyAsync = nullptr; + } + + zfree(c->replyAsync); c->replyAsync = nullptr; + } else { + /* Only replicas should have empty async reply buffers */ + serverAssert(c->flags & CLIENT_SLAVE); } - zfree(c->replyAsync); - c->replyAsync = nullptr; c->fPendingAsyncWrite = FALSE; - // Now install the write event handler int ae_flags = AE_WRITABLE|AE_WRITE_THREADSAFE; /* For the fsync=always policy, we want that a given FD is never @@ -1929,17 +1987,17 @@ void ProcessPendingAsyncWrites() { ae_flags |= AE_BARRIER; } - + if (!((c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack)))) continue; - + asyncCloseClientOnOutputBufferLimitReached(c); if (c->flags & CLIENT_CLOSE_ASAP) continue; // we will never write this so don't post an op - + std::atomic_thread_fence(std::memory_order_seq_cst); - + if (FCorrectThread(c)) { prepareClientToWrite(c); // queue an event @@ -2022,9 +2080,10 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. */ - if (clientHasPendingReplies(c)) { - if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) + if (clientHasPendingReplies(c) || c->fPendingReplicaWrite) { + if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) { freeClientAsync(c); + } } } @@ -3643,6 +3702,12 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { } } +/* In the case of a replica client, writes to said replica are using data from the replication backlog + * as opposed to it's own internal buffer, this number should keep track of that */ +unsigned long getClientReplicationBacklogSharedUsage(client *c) { + return (!(c->flags & CLIENT_SLAVE) || !c->fPendingReplicaWrite ) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; +} + /* This function returns the number of bytes that Redis is * using to store the reply still not read by the client. 
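 * For a replica this now also includes the slice of the replication backlog it
 * still has to read (getClientReplicationBacklogSharedUsage()).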
* @@ -3651,9 +3716,11 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { * enforcing the client output length limits. */ unsigned long getClientOutputBufferMemoryUsage(client *c) { unsigned long list_item_size = sizeof(listNode) + sizeof(clientReplyBlock); - return c->reply_bytes + (list_item_size*listLength(c->reply)) + (c->replyAsync ? c->replyAsync->size : 0); + return c->reply_bytes + (list_item_size*listLength(c->reply)) + (c->replyAsync ? c->replyAsync->size : 0) + getClientReplicationBacklogSharedUsage(c); } + + /* Get the class of a client, used in order to enforce limits to different * classes of clients. * diff --git a/src/replication.cpp b/src/replication.cpp index 748c50c7d..b9465680e 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -189,6 +189,7 @@ void createReplicationBacklog(void) { g_pserver->repl_backlog = (char*)zmalloc(g_pserver->repl_backlog_size, MALLOC_LOCAL); g_pserver->repl_backlog_histlen = 0; g_pserver->repl_backlog_idx = 0; + g_pserver->repl_backlog_start = g_pserver->master_repl_offset; /* We don't have any data inside our buffer, but virtually the first * byte we have is the next byte that will be generated for the @@ -200,6 +201,15 @@ void createReplicationBacklog(void) { g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; } +/* Compute the corresponding index from a replication backlog offset + * Since this computation needs the size of the replication backlog, + * you need to have the repl_backlog_lock in order to call it */ +long long getReplIndexFromOffset(long long offset){ + serverAssert(g_pserver->repl_backlog_lock.fOwnLock()); + long long index = (offset - g_pserver->repl_backlog_start) % g_pserver->repl_backlog_size; + return index; +} + /* This function is called when the user modifies the replication backlog * size at runtime. It is up to the function to both update the * g_pserver->repl_backlog_size and to resize the buffer and setup it so that @@ -211,6 +221,8 @@ void resizeReplicationBacklog(long long newsize) { newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; if (g_pserver->repl_backlog_size == newsize) return; + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + if (g_pserver->repl_backlog != NULL) { /* What we actually do is to flush the old buffer and realloc a new * empty one. It will refill with new data incrementally. @@ -218,19 +230,23 @@ void resizeReplicationBacklog(long long newsize) { * worse often we need to alloc additional space before freeing the * old buffer. */ - if (g_pserver->repl_batch_idxStart >= 0) { - // We need to keep critical data so we can't shrink less than the hot data in the buffer - newsize = std::max(newsize, g_pserver->master_repl_offset - g_pserver->repl_batch_offStart); - char *backlog = (char*)zmalloc(newsize); - g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - g_pserver->repl_batch_offStart; + /* get the critical client size, i.e. 
the size of the data unflushed to clients */ + long long earliest_off = g_pserver->repl_lowest_off.load(); - if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { - auto cbActiveBacklog = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; - memcpy(backlog, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbActiveBacklog); + if (earliest_off != -1) { + // We need to keep critical data so we can't shrink less than the hot data in the buffer + newsize = std::max(newsize, g_pserver->master_repl_offset - earliest_off); + char *backlog = (char*)zmalloc(newsize); + g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - earliest_off; + long long earliest_idx = getReplIndexFromOffset(earliest_off); + + if (g_pserver->repl_backlog_idx >= earliest_idx) { + auto cbActiveBacklog = g_pserver->repl_backlog_idx - earliest_idx; + memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbActiveBacklog); serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); } else { - auto cbPhase1 = g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart; - memcpy(backlog, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1); + auto cbPhase1 = g_pserver->repl_backlog_size - earliest_idx; + memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbPhase1); memcpy(backlog + cbPhase1, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); auto cbActiveBacklog = cbPhase1 + g_pserver->repl_backlog_idx; serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); @@ -238,7 +254,10 @@ void resizeReplicationBacklog(long long newsize) { zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = backlog; g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; - g_pserver->repl_batch_idxStart = 0; + g_pserver->repl_batch_idxStart -= earliest_idx; + if (g_pserver->repl_batch_idxStart < 0) + g_pserver->repl_batch_idxStart += g_pserver->repl_backlog_size; + g_pserver->repl_backlog_start = earliest_off; } else { zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = (char*)zmalloc(newsize); @@ -246,11 +265,13 @@ void resizeReplicationBacklog(long long newsize) { g_pserver->repl_backlog_idx = 0; /* Next byte we have is... the next since the buffer is empty. 
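         * repl_backlog_start is reset alongside it so that getReplIndexFromOffset()
         * keeps mapping offsets to in-range indices.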
*/ g_pserver->repl_backlog_off = g_pserver->master_repl_offset+1; + g_pserver->repl_backlog_start = g_pserver->master_repl_offset; } } g_pserver->repl_backlog_size = newsize; } + void freeReplicationBacklog(void) { serverAssert(GlobalLocksAcquired()); listIter li; @@ -273,11 +294,20 @@ void feedReplicationBacklog(const void *ptr, size_t len) { serverAssert(GlobalLocksAcquired()); const unsigned char *p = (const unsigned char*)ptr; + if (g_pserver->repl_batch_idxStart >= 0) { - long long minimumsize = g_pserver->master_repl_offset + len - g_pserver->repl_batch_offStart+1; + /* We are lower bounded by the lowest replica offset, or the batch offset start if not applicable */ + long long lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); + if (lower_bound == -1) + lower_bound = g_pserver->repl_batch_offStart; + long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { flushReplBacklogToClients(); - minimumsize = g_pserver->master_repl_offset + len - g_pserver->repl_batch_offStart+1; + lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); + if (lower_bound == -1) + lower_bound = g_pserver->repl_batch_offStart; + + minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { // This is an emergency overflow, we better resize to fit @@ -292,6 +322,7 @@ void feedReplicationBacklog(const void *ptr, size_t len) { /* This is a circular buffer, so write as much data we can at every * iteration and rewind the "idx" index if we reach the limit. */ + while(len) { size_t thislen = g_pserver->repl_backlog_size - g_pserver->repl_backlog_idx; if (thislen > len) thislen = len; @@ -478,7 +509,6 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) if (fSendRaw) { char aux[LONG_STR_SIZE+3]; - /* Add the multi bulk reply length. */ aux[0] = '*'; int multilen = ll2string(aux+1,sizeof(aux)-1,argc); @@ -652,15 +682,19 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, decrRefCount(cmdobj); } +int prepareClientToWrite(client *c); + /* Feed the replica 'c' with the replication backlog starting from the * specified 'offset' up to the end of the backlog. */ long long addReplyReplicationBacklog(client *c, long long offset) { - long long j, skip, len; + long long skip, len; serverLog(LL_DEBUG, "[PSYNC] Replica request offset: %lld", offset); if (g_pserver->repl_backlog_histlen == 0) { serverLog(LL_DEBUG, "[PSYNC] Backlog history len is zero"); + c->repl_curr_off = g_pserver->master_repl_offset; + c->repl_end_off = g_pserver->master_repl_offset; return 0; } @@ -677,31 +711,20 @@ long long addReplyReplicationBacklog(client *c, long long offset) { skip = offset - g_pserver->repl_backlog_off; serverLog(LL_DEBUG, "[PSYNC] Skipping: %lld", skip); - /* Point j to the oldest byte, that is actually our - * g_pserver->repl_backlog_off byte. */ - j = (g_pserver->repl_backlog_idx + - (g_pserver->repl_backlog_size-g_pserver->repl_backlog_histlen)) % - g_pserver->repl_backlog_size; - serverLog(LL_DEBUG, "[PSYNC] Index of first byte: %lld", j); - - /* Discard the amount of data to seek to the specified 'offset'. */ - j = (j + skip) % g_pserver->repl_backlog_size; - - /* Feed replica with data. Since it is a circular buffer we have to - * split the reply in two parts if we are cross-boundary. 
*/ len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); - while(len) { - long long thislen = - ((g_pserver->repl_backlog_size - j) < len) ? - (g_pserver->repl_backlog_size - j) : len; - serverLog(LL_DEBUG, "[PSYNC] addReply() length: %lld", thislen); - addReplySds(c,sdsnewlen(g_pserver->repl_backlog + j, thislen)); - len -= thislen; - j = 0; - } - return g_pserver->repl_backlog_histlen - skip; + /* Set the start and end offsets for the replica so that a future + * writeToClient will send the backlog from the given offset to + * the current end of the backlog to said replica */ + c->repl_curr_off = offset - 1; + c->repl_end_off = g_pserver->master_repl_offset; + + /* Force the partial sync to be queued */ + prepareClientToWrite(c); + c->fPendingReplicaWrite = true; + + return len; } /* Return the offset to provide as reply to the PSYNC command received @@ -734,6 +757,10 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) { replica->psync_initial_offset = offset; replica->replstate = SLAVE_STATE_WAIT_BGSAVE_END; + + replica->repl_curr_off = offset; + replica->repl_end_off = g_pserver->master_repl_offset; + /* We are going to accumulate the incremental changes for this * replica as well. Set replicaseldb to -1 in order to force to re-emit * a SELECT statement in the replication stream. */ @@ -1356,6 +1383,7 @@ void replconfCommand(client *c) { * 4) Update the count of "good replicas". */ void putSlaveOnline(client *replica) { replica->replstate = SLAVE_STATE_ONLINE; + replica->repl_put_online_on_ack = 0; replica->repl_ack_time = g_pserver->unixtime; /* Prevent false timeout. */ @@ -3058,6 +3086,11 @@ void syncWithMaster(connection *conn) { if (psync_result == PSYNC_CONTINUE) { serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Master accepted a Partial Resynchronization."); + /* Reset the bulklen information in case it is lingering from the last connection + * The partial sync will start from the beginning of a command so these should be reset */ + mi->master->reqtype = 0; + mi->master->multibulklen = 0; + mi->master->bulklen = -1; if (cserver.supervised_mode == SUPERVISED_SYSTEMD) { redisCommunicateSystemd("STATUS=MASTER <-> REPLICA sync: Partial Resynchronization accepted. Ready to accept connections in read-write mode.\n"); } @@ -4897,14 +4930,18 @@ void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long } void _clientAsyncReplyBufferReserve(client *c, size_t len); + void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); + /* If we have the repl backlog lock, we will deadlock */ + serverAssert(!g_pserver->repl_backlog_lock.fOwnLock()); if (g_pserver->repl_batch_offStart < 0) return; if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { bool fAsyncWrite = false; + long long min_offset = LONG_LONG_MAX; // Ensure no overflow serverAssert(g_pserver->repl_batch_offStart < g_pserver->master_repl_offset); serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); @@ -4913,33 +4950,36 @@ void flushReplBacklogToClients() listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); + /* We don't actually write any data in this function since we send data + * directly from the replication backlog to replicas in writeToClient. + * + * What we do however, is set the end offset of each replica here. This way, + * future calls to writeToClient will know up to where in the replication + * backlog is valid for writing. 
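     * While walking the list we also track the smallest repl_curr_off and publish
     * it as repl_lowest_off, which is the lower bound feedReplicationBacklog()
     * must not overwrite.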
*/ while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); if (!canFeedReplicaReplBuffer(replica)) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; - std::unique_lock ul(replica->lock, std::defer_lock); - if (FCorrectThread(replica)) - ul.lock(); - else + std::unique_lock ul(replica->lock); + if (!FCorrectThread(replica)) fAsyncWrite = true; - - if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { - long long cbCopy = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; - serverAssert((g_pserver->master_repl_offset - g_pserver->repl_batch_offStart) == cbCopy); - serverAssert((g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart) >= (cbCopy)); - serverAssert((g_pserver->repl_batch_idxStart + cbCopy) <= g_pserver->repl_backlog_size); - - addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbCopy); - } else { - auto cbPhase1 = g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart; - if (fAsyncWrite) - _clientAsyncReplyBufferReserve(replica, cbPhase1 + g_pserver->repl_backlog_idx); - addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1); - addReplyProto(replica, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); - serverAssert((cbPhase1 + g_pserver->repl_backlog_idx) == (g_pserver->master_repl_offset - g_pserver->repl_batch_offStart)); + + /* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */ + serverAssert(replica->repl_curr_off != -1); + + min_offset = std::min(min_offset, replica->repl_curr_off); + + replica->repl_end_off = g_pserver->master_repl_offset; + + /* Only if the there isn't already a pending write do we prepare the client to write */ + if (!replica->fPendingReplicaWrite){ + serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); + prepareClientToWrite(replica); + replica->fPendingReplicaWrite = true; } + } if (fAsyncWrite) ProcessPendingAsyncWrites(); @@ -4947,7 +4987,8 @@ void flushReplBacklogToClients() // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; - } + g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? -1 : min_offset, std::memory_order_seq_cst); + } } diff --git a/src/server.cpp b/src/server.cpp index 0d540c98b..b51634364 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2021,7 +2021,6 @@ void clientsCron(int iel) { while(listLength(g_pserver->clients) && iterations--) { client *c; listNode *head; - /* Rotate the list, take the current head, process. * This way if the client must be removed from the list it's the * first element and we don't incur into O(N) computation. */ @@ -3245,6 +3244,7 @@ void initServerConfig(void) { g_pserver->enable_multimaster = CONFIG_DEFAULT_ENABLE_MULTIMASTER; g_pserver->repl_syncio_timeout = CONFIG_REPL_SYNCIO_TIMEOUT; g_pserver->master_repl_offset = 0; + g_pserver->repl_lowest_off.store(-1, std::memory_order_seq_cst); /* Replication partial resync backlog */ g_pserver->repl_backlog = NULL; diff --git a/src/server.h b/src/server.h index 129a3d716..0d6f766ce 100644 --- a/src/server.h +++ b/src/server.h @@ -1589,6 +1589,13 @@ struct client { long long psync_initial_offset; /* FULLRESYNC reply offset other slaves copying this replica output buffer should use. 
*/ + + long long repl_curr_off = -1;/* Replication offset of the replica, also where in the backlog we need to start from + * when sending data to this replica. */ + long long repl_end_off = -1; /* Replication offset to write to, stored in the replica, as opposed to using the global offset + * to prevent needing the global lock */ + int fPendingReplicaWrite; /* Is there a write queued for this replica? */ + char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ int slave_listening_port; /* As configured with: REPLCONF listening-port */ char *slave_addr; /* Optionally given by REPLCONF ip-address */ @@ -2356,6 +2363,9 @@ struct redisServer { that is the next byte will'll write to.*/ long long repl_backlog_off; /* Replication "master offset" of first byte in the replication backlog buffer.*/ + long long repl_backlog_start; /* Used to compute indicies from offsets + basically, index = (offset - start) % size */ + fastlock repl_backlog_lock {"replication backlog"}; time_t repl_backlog_time_limit; /* Time without slaves after the backlog gets released. */ time_t repl_no_slaves_since; /* We have no slaves since that time. @@ -2367,6 +2377,8 @@ struct redisServer { int repl_diskless_load; /* Slave parse RDB directly from the socket. * see REPL_DISKLESS_LOAD_* enum */ int repl_diskless_sync_delay; /* Delay to start a diskless repl BGSAVE. */ + std::atomic repl_lowest_off; /* The lowest offset amongst all replicas + -1 if there are no replicas */ /* Replication (replica) */ list *masters; int enable_multimaster; @@ -3712,6 +3724,8 @@ void mixDigest(unsigned char *digest, const void *ptr, size_t len); void xorDigest(unsigned char *digest, const void *ptr, size_t len); int populateCommandTableParseFlags(struct redisCommand *c, const char *strflags); + + int moduleGILAcquiredByModule(void); extern int g_fInCrash; static inline int GlobalLocksAcquired(void) // Used in asserts to verify all global locks are correctly acquired for a server-thread to operate @@ -3779,6 +3793,7 @@ void tlsCleanup(void); int tlsConfigure(redisTLSContextConfig *ctx_config); + class ShutdownException {}; @@ -3790,3 +3805,5 @@ class ShutdownException int iAmMaster(void); #endif + + diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl index e57c7e1e5..d1db6cc57 100644 --- a/tests/unit/maxmemory.tcl +++ b/tests/unit/maxmemory.tcl @@ -33,7 +33,8 @@ start_server {tags {"maxmemory"}} { # Get the current memory limit and calculate a new limit. # We just add 100k to the current memory size so that it is # fast for us to reach that limit. - set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used+100*1024}] r config set maxmemory $limit r config set maxmemory-policy $policy @@ -42,7 +43,7 @@ start_server {tags {"maxmemory"}} { while 1 { r setex [randomKey] 10000 x incr numkeys - if {[s used_memory]+4096 > $limit} { + if {[expr {[s used_memory] - $overhead + 4096}] > $limit} { assert {$numkeys > 10} break } @@ -52,7 +53,8 @@ start_server {tags {"maxmemory"}} { for {set j 0} {$j < $numkeys} {incr j} { r setex [randomKey] 10000 x } - assert {[s used_memory] < ($limit+4096)} + set used_amt [expr [s used_memory] - $overhead] + assert {$used_amt < ($limit+4096)} } } @@ -65,7 +67,8 @@ start_server {tags {"maxmemory"}} { # Get the current memory limit and calculate a new limit. # We just add 100k to the current memory size so that it is # fast for us to reach that limit. 
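        # Replica buffers and the replication backlog are reported under
        # mem_not_counted_for_evict, so that overhead is subtracted before
        # sizing the limit.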
- set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used+100*1024}] r config set maxmemory $limit r config set maxmemory-policy $policy @@ -74,7 +77,7 @@ start_server {tags {"maxmemory"}} { while 1 { r set [randomKey] x incr numkeys - if {[s used_memory]+4096 > $limit} { + if {[expr [s used_memory] - $overhead]+4096 > $limit} { assert {$numkeys > 10} break } @@ -91,7 +94,7 @@ start_server {tags {"maxmemory"}} { } } if {[string match allkeys-* $policy]} { - assert {[s used_memory] < ($limit+4096)} + assert {[expr [s used_memory] - $overhead] < ($limit+4096)} } else { assert {$err == 1} } @@ -107,7 +110,8 @@ start_server {tags {"maxmemory"}} { # Get the current memory limit and calculate a new limit. # We just add 100k to the current memory size so that it is # fast for us to reach that limit. - set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used+100*1024}] r config set maxmemory $limit r config set maxmemory-policy $policy @@ -121,7 +125,7 @@ start_server {tags {"maxmemory"}} { } else { r set "key:$numkeys" x } - if {[s used_memory]+4096 > $limit} { + if {[expr [s used_memory] - $overhead]+4096 > $limit} { assert {$numkeys > 10} break } @@ -135,7 +139,7 @@ start_server {tags {"maxmemory"}} { catch {r setex "foo:$j" 10000 x} } # We should still be under the limit. - assert {[s used_memory] < ($limit+4096)} + assert {[expr [s used_memory] - $overhead] < ($limit+4096)} # However all our non volatile keys should be here. for {set j 0} {$j < $numkeys} {incr j 2} { assert {[r exists "key:$j"]} @@ -305,7 +309,8 @@ start_server {tags {"maxmemory"} overrides {server-threads 1}} { # we need to make sure to evict keynames of a total size of more than # 16kb since the (PROTO_REPLY_CHUNK_BYTES), only after that the # invalidation messages have a chance to trigger further eviction. 
- set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used - 40000}] r config set maxmemory $limit From 8db07641adfd3e316c060843d870304834497cc2 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 25 Jun 2021 06:10:13 +0000 Subject: [PATCH 61/75] Reenable LTO Former-commit-id: e7c1e1c9d8021f48c4081a9dfb84dba9da2521fc --- src/Makefile | 12 +++++++++--- src/motd.cpp | 6 ++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/Makefile b/src/Makefile index 92bb346f4..e2ae02720 100644 --- a/src/Makefile +++ b/src/Makefile @@ -15,7 +15,7 @@ release_hdr := $(shell sh -c './mkreleasehdr.sh') uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not') -OPTIMIZATION?=-O2 +OPTIMIZATION?=-O2 -flto DEPENDENCY_TARGETS=hiredis linenoise lua hdr_histogram rocksdb NODEPS:=clean distclean @@ -349,9 +349,9 @@ endif REDIS_SERVER_NAME=keydb-server$(PROG_SUFFIX) REDIS_SENTINEL_NAME=keydb-sentinel$(PROG_SUFFIX) -REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o t_nhash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o acl.o storage.o rdb-s3.o fastlock.o new.o tracking.o cron.o connection.o tls.o sha256.o motd.o timeout.o setcpuaffinity.o AsyncWorkQueue.o snapshot.o storage/rocksdb.o storage/rocksdbfactory.o storage/teststorageprovider.o keydbutils.o StorageCache.o monotonic.o cli_common.o mt19937-64.o $(ASM_OBJ) +REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o t_nhash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o acl.o storage.o rdb-s3.o fastlock.o new.o tracking.o cron.o connection.o tls.o sha256.o motd_server.o timeout.o setcpuaffinity.o AsyncWorkQueue.o snapshot.o storage/rocksdb.o storage/rocksdbfactory.o storage/teststorageprovider.o keydbutils.o StorageCache.o monotonic.o cli_common.o mt19937-64.o $(ASM_OBJ) REDIS_CLI_NAME=keydb-cli$(PROG_SUFFIX) -REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o redis-cli-cpphelper.o zmalloc.o release.o anet.o ae.o crcspeed.o crc64.o siphash.o crc16.o storage-lite.o fastlock.o motd.o monotonic.o cli_common.o mt19937-64.o $(ASM_OBJ) +REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o redis-cli-cpphelper.o zmalloc.o release.o anet.o ae.o crcspeed.o crc64.o siphash.o crc16.o storage-lite.o fastlock.o motd_client.o monotonic.o cli_common.o mt19937-64.o $(ASM_OBJ) 
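# motd.cpp is now built twice: motd_server.o for the server objects above and
# motd_client.o (compiled with -DCLIENT and -fno-lto) for the CLI; see the
# explicit rules added further down.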
REDIS_BENCHMARK_NAME=keydb-benchmark$(PROG_SUFFIX) REDIS_BENCHMARK_OBJ=ae.o anet.o redis-benchmark.o adlist.o dict.o zmalloc.o release.o crcspeed.o crc64.o siphash.o redis-benchmark.o storage-lite.o fastlock.o new.o monotonic.o cli_common.o mt19937-64.o $(ASM_OBJ) REDIS_CHECK_RDB_NAME=keydb-check-rdb$(PROG_SUFFIX) @@ -435,6 +435,12 @@ DEP = $(REDIS_SERVER_OBJ:%.o=%.d) $(REDIS_CLI_OBJ:%.o=%.d) $(REDIS_BENCHMARK_OBJ # Because the jemalloc.h header is generated as a part of the jemalloc build, # building it should complete before building any other object. Instead of # depending on a single artifact, build all dependencies first. +motd_client.o: motd.cpp .make-prerequisites + $(REDIS_CXX) -MMD -o motd_client.o -c $< -DCLIENT -fno-lto + +motd_server.o: motd.cpp .make-prerequisites + $(REDIS_CXX) -MMD -o motd_server.o -c $< -DSERVER + %.o: %.c .make-prerequisites $(REDIS_CC) -MMD -o $@ -c $< diff --git a/src/motd.cpp b/src/motd.cpp index 370a11e68..795281734 100644 --- a/src/motd.cpp +++ b/src/motd.cpp @@ -1,7 +1,11 @@ +#ifdef CLIENT extern "C" { #include #include } +#else +#include "sds.h" +#endif #include #include #include @@ -15,6 +19,7 @@ extern "C" { #ifdef MOTD #include +#ifdef CLIENT extern "C" { __attribute__ ((weak)) hisds hi_sdscatlen(hisds s, const void *t, size_t len) { return sdscatlen(s, t, len); @@ -23,6 +28,7 @@ __attribute__ ((weak)) hisds hi_sdscat(hisds s, const char *t) { return sdscat(s, t); } } +#endif static const char *szMotdCachePath() { From 165e73353aa25e20578115d89114325f79be7857 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 25 Jun 2021 06:11:01 +0000 Subject: [PATCH 62/75] OPTIMIZATION: Only notify the condition variable when needed Former-commit-id: 11f07b49c613f54cef682da1e3c8fc54918809b0 --- src/server.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/server.cpp b/src/server.cpp index b51634364..e7a1e1aaf 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -6996,9 +6996,10 @@ void OnTerminate() void wakeTimeThread() { updateCachedTime(); std::lock_guard lock(time_thread_mutex); + if (sleeping_threads >= cserver.cthreads) + time_thread_cv.notify_one(); sleeping_threads--; serverAssert(sleeping_threads >= 0); - time_thread_cv.notify_one(); } void *timeThreadMain(void*) { From 0d8da2c35002905751d07ac7705a5605ee7c5291 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 25 Jun 2021 06:11:14 +0000 Subject: [PATCH 63/75] Avoid taking locks when we don't need to Former-commit-id: 0d8d3ee9e217cd1f1366a117e6e212f610a028e1 --- src/db.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/db.cpp b/src/db.cpp index 71cb27a03..ecda7e55b 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -2726,6 +2726,8 @@ void redisDbPersistentData::ensure(const char *sdsKey, dictEntry **pde) serverAssert(sdsKey != nullptr); serverAssert(FImplies(*pde != nullptr, dictGetVal(*pde) != nullptr)); // early versions set a NULL object, this is no longer valid serverAssert(m_refCount == 0); + if (m_pdbSnapshot == nullptr && g_pserver->m_pstorageFactory == nullptr) + return; std::unique_lock ul(g_expireLock); // First see if the key can be obtained from a snapshot From f518193862fc226b3ea93b4d25cbfb4d362f5255 Mon Sep 17 00:00:00 2001 From: Malavan Sotheeswaran Date: Wed, 14 Jul 2021 23:41:24 +0000 Subject: [PATCH 64/75] test CI do not merge Former-commit-id: ccdf18b1bef07ba076e5f86d74fe1e1f6ae50a8c --- .gitlab-ci.yml | 147 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 .gitlab-ci.yml diff --git a/.gitlab-ci.yml 
b/.gitlab-ci.yml new file mode 100644 index 000000000..cc67d754a --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,147 @@ +build: + rules: + - if: '$COVERAGE' + when: never + - if: '$ENDURANCE' + when: never + - when: always + tags: + - docker + stage: build + script: + - git submodule init && git submodule update + - make distclean + - make -j + +make-test: + rules: + - if: '$COVERAGE' + when: never + - if: '$ENDURANCE' + when: never + - when: always + tags: + - docker + stage: test + script: + - git submodule init && git submodule update + - make distclean + - make -j + - make test -j + +node-redis-test: + rules: + - if: '$COVERAGE' + when: never + - if: '$ENDURANCE' + when: never + - when: always + tags: + - docker + - ipv6 + stage: test + script: + - git submodule init && git submodule update + - make distclean + - make -j + - make install + - git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.eqalpha.com/keydb-dev/node-redis.git + - cd node-redis + - npm install + - npm run test + +jedis-test: + rules: + - if: '$COVERAGE' + when: never + - if: '$ENDURANCE' + when: never + - when: always + tags: + - docker + - ipv4 + stage: test + script: + - git submodule init && git submodule update + - make distclean + - make -j + - make install + - git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.eqalpha.com/keydb-dev/jedis.git + - cd jedis + - make test + +redis-rs-test: + rules: + - if: '$COVERAGE' + when: never + - if: '$ENDURANCE' + when: never + - when: always + tags: + - docker + stage: test + script: + - git submodule init && git submodule update + - make distclean + - make -j + - make install + - git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.eqalpha.com/keydb-dev/redis-rs.git + - cd redis-rs + - make test + +endurance-test: + rules: + - if: '$ENDURANCE' + tags: + - docker + stage: test + script: + - git submodule init && git submodule update + - make distclean + - make -j + - ./runtest --loop --stop + +coverage-test: + rules: + - if: '$COVERAGE' + tags: + - docker + stage: test + script: + - git submodule init && git submodule update + - make distclean + - make gcov -j + - make install + - ./runtest || true + - pkill keydb-server || true + - pkill stunnel || true + - ./runtest-cluster || true + - pkill keydb-server || true + - pkill stunnel || true + - ./runtest-sentinel || true + - pkill keydb-server || true + - pkill stunnel || true + - ./runtest-moduleapi || true + - pkill keydb-server || true + - pkill stunnel || true + - git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.eqalpha.com/keydb-dev/redis-rs.git + - cd redis-rs + - make test || true + - pkill keydb-server || true + - pkill stunnel || true + - cd .. + - git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.eqalpha.com/keydb-dev/jedis.git + - cd jedis + - make test || true + - pkill keydb-server || true + - pkill stunnel || true + - cd .. + - git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.eqalpha.com/keydb-dev/node-redis.git + - cd node-redis + - npm install + - npm run test || true + - pkill keydb-server || true + - pkill stunnel || true + - cd .. + - geninfo -o KeyDB.info --no-external . 
+ - genhtml --legend -o lcov-html KeyDB.info \ No newline at end of file From 0b220bddc7ab4f08cd412616ab85b96f3f633966 Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 18 Jul 2021 20:28:42 +0000 Subject: [PATCH 65/75] Do not update batch variables when not in a batch Former-commit-id: ad1e0286cf9b2d9de33c65e8e798a05ead3f7d5a --- src/replication.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/replication.cpp b/src/replication.cpp index b9465680e..7eabeef6b 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -254,9 +254,11 @@ void resizeReplicationBacklog(long long newsize) { zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = backlog; g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; - g_pserver->repl_batch_idxStart -= earliest_idx; - if (g_pserver->repl_batch_idxStart < 0) - g_pserver->repl_batch_idxStart += g_pserver->repl_backlog_size; + if (g_pserver->repl_batch_idxStart >= 0) { + g_pserver->repl_batch_idxStart -= earliest_idx; + if (g_pserver->repl_batch_idxStart < 0) + g_pserver->repl_batch_idxStart += g_pserver->repl_backlog_size; + } g_pserver->repl_backlog_start = earliest_off; } else { zfree(g_pserver->repl_backlog); From 5f72ce931752c9aa674caa214c092116285246d3 Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 18 Jul 2021 20:45:32 +0000 Subject: [PATCH 66/75] Return the ring buffer to its original size if we temporarily resized it Former-commit-id: a12ce4a0d105bf7d6ccff95f7dc0044c4676b0a7 --- src/config.cpp | 1 + src/replication.cpp | 16 ++++++++++++++++ src/server.cpp | 2 ++ src/server.h | 3 +++ 4 files changed, 22 insertions(+) diff --git a/src/config.cpp b/src/config.cpp index a10cdbe12..6fc957485 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -2470,6 +2470,7 @@ static int updateReplBacklogSize(long long val, long long prev, const char **err * being able to tell when the size changes, so restore prev before calling it. */ UNUSED(err); g_pserver->repl_backlog_size = prev; + g_pserver->repl_backlog_config_size = val; resizeReplicationBacklog(val); return 1; } diff --git a/src/replication.cpp b/src/replication.cpp index 7eabeef6b..c3b902db4 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -4321,6 +4321,8 @@ void replicationCron(void) { replicationStartPendingFork(); + trimReplicationBacklog(); + /* Remove the RDB file used for replication if Redis is not running * with any persistence. 
*/ removeRDBUsedToSyncReplicas(); @@ -5066,3 +5068,17 @@ void updateFailoverStatus(void) { g_pserver->target_replica_port); } } + +// If we automatically grew the backlog we need to trim it back to +// the config setting when possible +void trimReplicationBacklog() { + serverAssert(GlobalLocksAcquired()); + serverAssert(g_pserver->repl_batch_offStart < 0); // we shouldn't be in a batch + if (g_pserver->repl_backlog_size <= g_pserver->repl_backlog_config_size) + return; // We're already a good size + if (g_pserver->repl_lowest_off > 0 && (g_pserver->master_repl_offset - g_pserver->repl_lowest_off + 1) > g_pserver->repl_backlog_config_size) + return; // There is untransmitted data we can't truncate + + serverLog(LL_NOTICE, "Reclaiming %lld replication backlog bytes", g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size); + resizeReplicationBacklog(g_pserver->repl_backlog_config_size); +} \ No newline at end of file diff --git a/src/server.cpp b/src/server.cpp index 3d3b07172..183440f3b 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -7084,6 +7084,8 @@ static void validateConfiguration() serverLog(LL_WARNING, "\tKeyDB will now exit. Please update your configuration file."); exit(EXIT_FAILURE); } + + g_pserver->repl_backlog_config_size = g_pserver->repl_backlog_size; // this is normally set in the update logic, but not on initial config } int iAmMaster(void) { diff --git a/src/server.h b/src/server.h index 32040fb72..5178e86ba 100644 --- a/src/server.h +++ b/src/server.h @@ -2358,6 +2358,7 @@ struct redisServer { int repl_ping_slave_period; /* Master pings the replica every N seconds */ char *repl_backlog; /* Replication backlog for partial syncs */ long long repl_backlog_size; /* Backlog circular buffer size */ + long long repl_backlog_config_size; /* The repl backlog may grow but we want to know what the user set it to */ long long repl_backlog_histlen; /* Backlog actual data length */ long long repl_backlog_idx; /* Backlog circular buffer current offset, that is the next byte will'll write to.*/ @@ -3024,6 +3025,8 @@ void clearFailoverState(void); void updateFailoverStatus(void); void abortFailover(redisMaster *mi, const char *err); const char *getFailoverStateString(); +int canFeedReplicaReplBuffer(client *replica); +void trimReplicationBacklog(); /* Generic persistence functions */ void startLoadingFile(FILE* fp, const char * filename, int rdbflags); From 4000334b1f7589645b2f816cbc948b662fe1a302 Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 18 Jul 2021 20:48:08 +0000 Subject: [PATCH 67/75] Do not resize replica buffer past the max client limit Former-commit-id: ba116500ca4fd53e4e40f04fc33981e60bb21ab7 --- src/replication.cpp | 51 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/src/replication.cpp b/src/replication.cpp index c3b902db4..12142f9a5 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -303,19 +303,56 @@ void feedReplicationBacklog(const void *ptr, size_t len) { if (lower_bound == -1) lower_bound = g_pserver->repl_batch_offStart; long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; + if (minimumsize > g_pserver->repl_backlog_size) { - flushReplBacklogToClients(); - lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); - if (lower_bound == -1) - lower_bound = g_pserver->repl_batch_offStart; + listIter li; + listNode *ln; + listRewind(g_pserver->slaves, &li); + long long maxClientBuffer = (long 
long)cserver.client_obuf_limits[CLIENT_TYPE_SLAVE].hard_limit_bytes; + if (maxClientBuffer <= 0) + maxClientBuffer = LLONG_MAX; // infinite essentially + long long min_offset = LLONG_MAX; + int listening_replicas = 0; + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + if (!canFeedReplicaReplBuffer(replica)) continue; + if (replica->flags & CLIENT_CLOSE_ASAP) continue; - minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; + std::unique_lock ul(replica->lock); - if (minimumsize > g_pserver->repl_backlog_size) { + // Would this client overflow? If so close it + long long neededBuffer = g_pserver->master_repl_offset + len - replica->repl_curr_off + 1; + if (neededBuffer > maxClientBuffer) { + + sds clientInfo = catClientInfoString(sdsempty(),replica); + freeClientAsync(replica); + serverLog(LL_WARNING,"Client %s scheduled to be closed ASAP due to exceeding output buffer hard limit.", clientInfo); + sdsfree(clientInfo); + continue; + } + min_offset = std::min(min_offset, replica->repl_curr_off); + ++listening_replicas; + } + + if (min_offset == LLONG_MAX) { + min_offset = g_pserver->repl_batch_offStart; + g_pserver->repl_lowest_off = -1; + } else { + g_pserver->repl_lowest_off = min_offset; + } + + minimumsize = g_pserver->master_repl_offset + len - min_offset + 1; + serverAssert(listening_replicas == 0 || minimumsize <= maxClientBuffer); + + if (minimumsize > g_pserver->repl_backlog_size && listening_replicas) { // This is an emergency overflow, we better resize to fit long long newsize = std::max(g_pserver->repl_backlog_size*2, minimumsize); - serverLog(LL_WARNING, "Replication backlog is too small, resizing to: %lld", newsize); + serverLog(LL_WARNING, "Replication backlog is too small, resizing to: %lld bytes", newsize); resizeReplicationBacklog(newsize); + } else if (!listening_replicas) { + // We need to update a few variables or later asserts will notice we dropped data + g_pserver->repl_batch_offStart = g_pserver->master_repl_offset + len; + g_pserver->repl_lowest_off = -1; } } } From 614860ce3c97e46466fd635e75ed260e13e4d5ac Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 19 Jul 2021 15:10:48 +0000 Subject: [PATCH 68/75] StorageCache dtor leaks Former-commit-id: 0262c4dc76a320141b8a4454df2f6baab4f74ab3 --- src/StorageCache.cpp | 6 ++++++ src/StorageCache.h | 2 ++ 2 files changed, 8 insertions(+) diff --git a/src/StorageCache.cpp b/src/StorageCache.cpp index 29908c7f5..6e1d4af9a 100644 --- a/src/StorageCache.cpp +++ b/src/StorageCache.cpp @@ -25,6 +25,12 @@ StorageCache::StorageCache(IStorage *storage, bool fCache) m_pdict = dictCreate(&dbStorageCacheType, nullptr); } +StorageCache::~StorageCache() +{ + if (m_pdict != nullptr) + dictRelease(m_pdict); +} + void StorageCache::clear() { std::unique_lock ul(m_lock); diff --git a/src/StorageCache.h b/src/StorageCache.h index ed868e74b..9f92f75c0 100644 --- a/src/StorageCache.h +++ b/src/StorageCache.h @@ -29,6 +29,8 @@ class StorageCache } public: + ~StorageCache(); + static StorageCache *create(IStorageFactory *pfactory, int db, IStorageFactory::key_load_iterator fn, void *privdata) { StorageCache *cache = new StorageCache(nullptr, pfactory->FSlow() /*fCache*/); load_iter_data data = {cache, fn, privdata}; From d3793efb337af7d5d4478b040804a013620b84af Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 19 Jul 2021 15:11:33 +0000 Subject: [PATCH 69/75] Info command should show how many keys are cached in RAM vs storage provider Former-commit-id: 08597bee69bc16ca7c3d5ff31020472774c6eec9 --- 
src/db.cpp | 6 +++--- src/server.cpp | 7 ++++--- src/server.h | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index 947f03427..98b70574d 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -2963,13 +2963,13 @@ dict_iter redisDbPersistentData::random() return dict_iter(m_pdict, de); } -size_t redisDbPersistentData::size() const +size_t redisDbPersistentData::size(bool fCachedOnly) const { - if (m_spstorage != nullptr && !m_fAllChanged) + if (m_spstorage != nullptr && !m_fAllChanged && !fCachedOnly) return m_spstorage->count() + m_cnewKeysPending; return dictSize(m_pdict) - + (m_pdbSnapshot ? (m_pdbSnapshot->size() - dictSize(m_pdictTombstone)) : 0); + + (m_pdbSnapshot ? (m_pdbSnapshot->size(fCachedOnly) - dictSize(m_pdictTombstone)) : 0); } bool redisDbPersistentData::removeCachedValue(const char *key) diff --git a/src/server.cpp b/src/server.cpp index 183440f3b..ed153d599 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -6096,10 +6096,11 @@ sds genRedisInfoString(const char *section) { if (sections++) info = sdscat(info,"\r\n"); info = sdscatprintf(info, "# Keyspace\r\n"); for (j = 0; j < cserver.dbnum; j++) { - long long keys, vkeys; + long long keys, vkeys, cachedKeys; keys = g_pserver->db[j]->size(); vkeys = g_pserver->db[j]->expireSize(); + cachedKeys = g_pserver->db[j]->size(true /* fCachedOnly */); // Adjust TTL by the current time mstime_t mstime; @@ -6111,8 +6112,8 @@ sds genRedisInfoString(const char *section) { if (keys || vkeys) { info = sdscatprintf(info, - "db%d:keys=%lld,expires=%lld,avg_ttl=%lld\r\n", - j, keys, vkeys, static_cast(g_pserver->db[j]->avg_ttl)); + "db%d:keys=%lld,expires=%lld,avg_ttl=%lld,cached_keys=%lld\r\n", + j, keys, vkeys, static_cast(g_pserver->db[j]->avg_ttl), cachedKeys); } } } diff --git a/src/server.h b/src/server.h index 5178e86ba..edbcb4c8a 100644 --- a/src/server.h +++ b/src/server.h @@ -1095,7 +1095,7 @@ public: redisDbPersistentData(redisDbPersistentData &&) = default; size_t slots() const { return dictSlots(m_pdict); } - size_t size() const; + size_t size(bool fCachedOnly = false) const; void expand(uint64_t slots) { dictExpand(m_pdict, slots); } void trackkey(robj_roptr o, bool fUpdate) From 88f5bf1d90f47d58028712af37bc046173ab9f70 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 19 Jul 2021 16:50:48 +0000 Subject: [PATCH 70/75] We need to periodically flush the GC or we'll end up blocking with a huge backlog at the end of load Former-commit-id: 29c0bf79ad1a810e808790de2f7db24f3cc603e8 --- src/rdb.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/rdb.cpp b/src/rdb.cpp index 4975918a8..957c3fa88 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2884,6 +2884,8 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { do this every 16 keys to limit the perf impact */ if (g_pserver->m_pstorageFactory && (ckeysLoaded % 128) == 0) { + g_pserver->garbageCollector.endEpoch(serverTL->gcEpoch); + serverTL->gcEpoch = g_pserver->garbageCollector.startEpoch(); bool fHighMemory = (getMaxmemoryState(NULL,NULL,NULL,NULL) != C_OK); if (fHighMemory || (ckeysLoaded % (1024)) == 0) { From 345ec75a36ef5e9d3ddcc7c23d3fcaed1e7af535 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 19 Jul 2021 18:01:39 +0000 Subject: [PATCH 71/75] We need to free in order since the first big async free is likely the largest, so don't set the hipri bit Former-commit-id: 76a9cefa94e0f446e12a690909cbda15d03ca211 --- src/db.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/db.cpp b/src/db.cpp index 
98b70574d..56556ff7f 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3026,7 +3026,7 @@ void redisDbPersistentData::removeAllCachedValues() dictExpand(m_pdict, dictSize(dT)/2, false); // Make room for about half so we don't excessively rehash g_pserver->asyncworkqueue->AddWorkFunction([dT]{ dictRelease(dT); - }, true); + }, false); } else { dictEmpty(m_pdict, nullptr); } From bacaa204cfc6db6f9e69249b6f5063cc25d9e368 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 19 Jul 2021 18:17:54 +0000 Subject: [PATCH 72/75] Disable async rehash during load as it interferes with eviction Former-commit-id: 54b4f39e9d634bf53b04cd94433b051b14323bc6 --- src/db.cpp | 2 +- src/server.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index 56556ff7f..adcf4456f 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3020,7 +3020,7 @@ void redisDbPersistentData::removeAllCachedValues() trackChanges(false); } - if (m_pdict->pauserehash == 0) { + if (m_pdict->pauserehash == 0 && m_pdict->refcount == 1) { dict *dT = m_pdict; m_pdict = dictCreate(&dbDictType, this); dictExpand(m_pdict, dictSize(dT)/2, false); // Make room for about half so we don't excessively rehash diff --git a/src/server.cpp b/src/server.cpp index ed153d599..d32bec710 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2124,7 +2124,7 @@ void databasesCron(bool fMainThread) { ::dict *dict = g_pserver->db[rehash_db]->dictUnsafeKeyOnly(); /* Are we async rehashing? And if so is it time to re-calibrate? */ /* The recalibration limit is a prime number to ensure balancing across threads */ - if (rehashes_per_ms > 0 && async_rehashes < 131 && !cserver.active_defrag_enabled && cserver.cthreads > 1) { + if (rehashes_per_ms > 0 && async_rehashes < 131 && !cserver.active_defrag_enabled && cserver.cthreads > 1 && dictSize(dict) > 2048 && dictIsRehashing(dict) && !g_pserver->loading) { serverTL->rehashCtl = dictRehashAsyncStart(dict, rehashes_per_ms); ++async_rehashes; } From 9e8a28c0ed3582cfee0a6ddb7357498bed119759 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 23 Jul 2021 16:02:29 +0000 Subject: [PATCH 73/75] We cannot create time events on threads that don't have an event loop Former-commit-id: 3812586a41bb7f974b5d9820c8a68ff34ee8aa9a --- src/evict.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evict.cpp b/src/evict.cpp index da2a05a55..802784633 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -834,7 +834,7 @@ int performEvictions(bool fPreSnapshot) { * memory, don't want to spend too much time here. 
*/ if (elapsedUs(evictionTimer) > eviction_time_limit_us) { // We still need to free memory - start eviction timer proc - if (!isEvictionProcRunning) { + if (!isEvictionProcRunning && serverTL->el != nullptr) { isEvictionProcRunning = 1; aeCreateTimeEvent(serverTL->el, 0, evictionTimeProc, NULL, NULL); From a8685235c30c683a0c455118156dc0a6cfbc0f1e Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 23 Jul 2021 19:31:22 +0000 Subject: [PATCH 74/75] Initialize el so we can detect if it is null Former-commit-id: ec0f833ea17c668971893aa8f198d22da2e1d289 --- src/server.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server.h b/src/server.h index 2c0a69b8b..5aea41b23 100644 --- a/src/server.h +++ b/src/server.h @@ -1999,7 +1999,7 @@ public: // Per-thread variabels that may be accessed without a lock struct redisServerThreadVars { - aeEventLoop *el; + aeEventLoop *el = nullptr; socketFds ipfd; /* TCP socket file descriptors */ socketFds tlsfd; /* TLS socket file descriptors */ int in_eval; /* Are we inside EVAL? */ From aef0bd877fc23c60f32bcdb177ddfc40737f6003 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 26 Jul 2021 22:30:31 +0000 Subject: [PATCH 75/75] Fix issue collab #32 Former-commit-id: 0d192cf00ebe9fc0d898404b86e1173476edaefb --- src/evict.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evict.cpp b/src/evict.cpp index 802784633..84bf21c36 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -832,7 +832,7 @@ int performEvictions(bool fPreSnapshot) { /* After some time, exit the loop early - even if memory limit * hasn't been reached. If we suddenly need to free a lot of * memory, don't want to spend too much time here. */ - if (elapsedUs(evictionTimer) > eviction_time_limit_us) { + if (g_pserver->m_pstorageFactory == nullptr && elapsedUs(evictionTimer) > eviction_time_limit_us) { // We still need to free memory - start eviction timer proc if (!isEvictionProcRunning && serverTL->el != nullptr) { isEvictionProcRunning = 1;
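
The maxmemory test hunks earlier in this range stop comparing raw used_memory against the configured limit and instead subtract the mem_not_counted_for_evict INFO field first. That matches how the server itself judges whether it is over maxmemory: memory sitting in buffers that eviction cannot reclaim (broadly, replica output buffers and AOF buffers) is excluded from the check. Below is a minimal standalone sketch of that accounting; MemorySample, evictableBytes and overMaxmemory are invented names for illustration, not KeyDB's API.

    // Illustrative accounting only: mirrors the arithmetic the updated tests use.
    #include <cstdint>

    struct MemorySample {
        int64_t used_memory;            // INFO used_memory
        int64_t not_counted_for_evict;  // INFO mem_not_counted_for_evict
    };

    // Bytes that actually count against the maxmemory limit.
    inline int64_t evictableBytes(const MemorySample &m) {
        return m.used_memory - m.not_counted_for_evict;
    }

    inline bool overMaxmemory(const MemorySample &m, int64_t maxmemory) {
        return maxmemory > 0 && evictableBytes(m) > maxmemory;
    }

This is the same adjustment that turns the tests' loop exit condition into [expr [s used_memory] - $overhead]+4096 > $limit instead of the old [s used_memory]+4096 > $limit.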
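
PATCH 61 ("Reenable LTO") adds -flto to OPTIMIZATION and, to make that workable, builds motd.cpp twice: motd_client.o with -DCLIENT -fno-lto (the flavor that carries the weak hi_sdscatlen/hi_sdscat shims) and motd_server.o with -DSERVER (the flavor that includes the project's own sds.h). The patch does not say why the client object opts out of LTO; protecting the weak-symbol shims is a plausible reason but that is an assumption, not something stated in the patch. The sketch below shows only the general one-source-file, two-objects pattern; the file name and functions are placeholders.

    // flavor.cpp -- build twice, e.g.:
    //   g++ -c flavor.cpp -DCLIENT -fno-lto -o flavor_client.o
    //   g++ -c flavor.cpp -DSERVER          -o flavor_server.o
    #include <cstdio>

    #if defined(CLIENT)
    static const char *flavor() { return "client build"; }
    #elif defined(SERVER)
    static const char *flavor() { return "server build"; }
    #else
    #error "define exactly one of CLIENT or SERVER"
    #endif

    int main() {
        std::printf("built as: %s\n", flavor());
        return 0;
    }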
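
PATCH 62 moves the notify_one() in wakeTimeThread() under a check that the sleeping-thread count is still at the full thread count, because that is the only situation in which the time thread can actually be parked on the condition variable; notifying unconditionally pays for a wakeup syscall on every call. The sketch below is a self-contained version of that pattern with invented names (TickerGate and its methods), assuming the same invariant that the ticker only waits once every worker is asleep.

    #include <condition_variable>
    #include <mutex>

    struct TickerGate {
        std::mutex mtx;
        std::condition_variable cv;
        int sleeping = 0;
        const int nthreads;

        explicit TickerGate(int n) : nthreads(n) {}

        // Ticker thread: park only while every worker is asleep.
        void waitWhileIdle() {
            std::unique_lock<std::mutex> lk(mtx);
            cv.wait(lk, [&] { return sleeping < nthreads; });
        }

        // A worker going to sleep.
        void workerSleeps() {
            std::lock_guard<std::mutex> lk(mtx);
            ++sleeping;
        }

        // A worker waking up: the ticker can only be blocked while the count
        // is still at nthreads, so skip the notify in every other case.
        void workerWakes() {
            std::lock_guard<std::mutex> lk(mtx);
            if (sleeping >= nthreads)
                cv.notify_one();
            --sleeping;
        }
    };

As in the patch, the decrement happens after the conditional notify, so the woken ticker re-checks its predicate only once the count has already dropped below the maximum.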
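
PATCH 63 adds an early return to redisDbPersistentData::ensure() for the case where there is neither a snapshot nor a storage provider, so the function no longer acquires g_expireLock when it has nothing to do. The general shape, checking cheap preconditions before paying for a lock, looks like the sketch below; DbLike and its members are placeholders, with std::mutex standing in for the real lock type.

    #include <mutex>

    struct DbLike {
        const void *snapshot = nullptr;         // stand-in for m_pdbSnapshot
        const void *storage_factory = nullptr;  // stand-in for m_pstorageFactory
        std::mutex expire_lock;                 // stand-in for g_expireLock

        void ensure() {
            // Fast path: nothing could need hydrating, so skip the lock entirely.
            if (snapshot == nullptr && storage_factory == nullptr)
                return;
            std::unique_lock<std::mutex> ul(expire_lock);
            // ... slow path: consult the snapshot or the storage provider ...
        }
    };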
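
PATCHes 66 and 67 give the replication backlog a two-tier size: repl_backlog_config_size records what the user configured, the buffer may grow past that in an emergency (bounded by the replica output-buffer hard limit, with replicas that would exceed it disconnected), and trimReplicationBacklog() shrinks it back once no replica still needs the extra bytes. The sketch below captures just the two sizing decisions, using an invented BacklogState struct instead of the redisServer fields.

    #include <algorithm>
    #include <cstdint>

    struct BacklogState {
        int64_t configured_size;    // repl-backlog-size from the config
        int64_t current_size;       // may exceed it after an emergency grow
        int64_t master_offset;      // highest replication offset written so far
        int64_t lowest_needed_off;  // smallest offset a replica still needs, or -1
    };

    // Emergency grow: at least double, and never smaller than what is needed
    // to keep every listening replica's unsent data resident.
    inline int64_t grownSize(const BacklogState &s, int64_t needed) {
        return std::max(s.current_size * 2, needed);
    }

    // Trim back to the configured size only when doing so cannot drop data a
    // replica still needs (the two early returns in trimReplicationBacklog()).
    inline bool canTrim(const BacklogState &s) {
        if (s.current_size <= s.configured_size)
            return false;  // nothing to reclaim
        if (s.lowest_needed_off > 0 &&
            s.master_offset - s.lowest_needed_off + 1 > s.configured_size)
            return false;  // untransmitted data would not fit after the trim
        return true;
    }

In the patches the lowest needed offset is -1 when no replica is listening, which is why only a positive offset blocks the trim here.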
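
PATCH 68 plugs a leak by giving StorageCache a destructor that releases m_pdict. The same guarantee can also be expressed by owning the handle through a smart pointer with a custom deleter; the sketch below stubs out the dict API so it stands alone, and is a pattern illustration rather than a drop-in replacement for the real class.

    #include <memory>

    // Stubbed stand-ins for the C dict API, kept local so the sketch is
    // self-contained.
    struct dict { int placeholder; };
    static dict *dictCreate() { return new dict{}; }
    static void dictRelease(dict *d) { delete d; }

    // Owning the handle through unique_ptr ties the release to object lifetime,
    // the same guarantee the new ~StorageCache() provides by hand.
    class CacheSketch {
        std::unique_ptr<dict, void (*)(dict *)> pdict{dictCreate(), dictRelease};
    };

Either form works; the patch keeps the existing raw-pointer member and simply pairs the dictCreate() in the constructor with a dictRelease() in the destructor.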