From 1b3f0c047a70bbc4d1dc3d6a2aa72badd416baac Mon Sep 17 00:00:00 2001 From: Itamar Haber Date: Tue, 16 Apr 2019 17:15:23 +0300 Subject: [PATCH 01/76] Adds RedisModule_ReplyWithCString Signed-off-by: Itamar Haber --- src/module.c | 11 +++++++++++ src/redismodule.h | 2 ++ 2 files changed, 13 insertions(+) diff --git a/src/module.c b/src/module.c index c29521670..ed4613af6 100644 --- a/src/module.c +++ b/src/module.c @@ -1242,6 +1242,17 @@ int RM_ReplyWithStringBuffer(RedisModuleCtx *ctx, const char *buf, size_t len) { return REDISMODULE_OK; } +/* Reply with a bulk string, taking in input a C buffer pointer that is + * assumed to be null-terminated. + * + * The function always returns REDISMODULE_OK. */ +int RM_ReplyWithCString(RedisModuleCtx *ctx, const char *buf) { + client *c = moduleGetReplyClient(ctx); + if (c == NULL) return REDISMODULE_OK; + addReplyBulkCBuffer(c,(char*)buf,strlen(buf)); + return REDISMODULE_OK; +} + /* Reply with a bulk string, taking in input a RedisModuleString object. * * The function always returns REDISMODULE_OK. 
*/ diff --git a/src/redismodule.h b/src/redismodule.h index 259a5f1db..5c7643dee 100644 --- a/src/redismodule.h +++ b/src/redismodule.h @@ -226,6 +226,7 @@ int REDISMODULE_API_FUNC(RedisModule_ReplyWithSimpleString)(RedisModuleCtx *ctx, int REDISMODULE_API_FUNC(RedisModule_ReplyWithArray)(RedisModuleCtx *ctx, long len); void REDISMODULE_API_FUNC(RedisModule_ReplySetArrayLength)(RedisModuleCtx *ctx, long len); int REDISMODULE_API_FUNC(RedisModule_ReplyWithStringBuffer)(RedisModuleCtx *ctx, const char *buf, size_t len); +int REDISMODULE_API_FUNC(RedisModule_ReplyWithCString)(RedisModuleCtx *ctx, const char *buf); int REDISMODULE_API_FUNC(RedisModule_ReplyWithString)(RedisModuleCtx *ctx, RedisModuleString *str); int REDISMODULE_API_FUNC(RedisModule_ReplyWithNull)(RedisModuleCtx *ctx); int REDISMODULE_API_FUNC(RedisModule_ReplyWithDouble)(RedisModuleCtx *ctx, double d); @@ -376,6 +377,7 @@ static int RedisModule_Init(RedisModuleCtx *ctx, const char *name, int ver, int REDISMODULE_GET_API(ReplyWithArray); REDISMODULE_GET_API(ReplySetArrayLength); REDISMODULE_GET_API(ReplyWithStringBuffer); + REDISMODULE_GET_API(ReplyWithCString); REDISMODULE_GET_API(ReplyWithString); REDISMODULE_GET_API(ReplyWithNull); REDISMODULE_GET_API(ReplyWithCallReply); From 346355edc1f57492fe431bd567487c28c92cbefb Mon Sep 17 00:00:00 2001 From: Itamar Haber Date: Tue, 16 Apr 2019 17:38:33 +0300 Subject: [PATCH 02/76] Uses addReplyBulkCString Signed-off-by: Itamar Haber --- src/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/module.c b/src/module.c index ed4613af6..d46f484c4 100644 --- a/src/module.c +++ b/src/module.c @@ -1249,7 +1249,7 @@ int RM_ReplyWithStringBuffer(RedisModuleCtx *ctx, const char *buf, size_t len) { int RM_ReplyWithCString(RedisModuleCtx *ctx, const char *buf) { client *c = moduleGetReplyClient(ctx); if (c == NULL) return REDISMODULE_OK; - addReplyBulkCBuffer(c,(char*)buf,strlen(buf)); + addReplyBulkCString(c,(char*)buf); return REDISMODULE_OK; } 
From 1e294031345f12cefbc7434ccb13dfe1e3e42de5 Mon Sep 17 00:00:00 2001 From: chendianqiang Date: Wed, 17 Apr 2019 21:20:10 +0800 Subject: [PATCH 03/76] stop ping when client pause --- src/replication.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/replication.c b/src/replication.c index 9175bb420..237103d90 100644 --- a/src/replication.c +++ b/src/replication.c @@ -2579,7 +2579,7 @@ void replicationCron(void) { /* First, send PING according to ping_slave_period. */ if ((replication_cron_loops % server.repl_ping_slave_period) == 0 && - listLength(server.slaves)) + listLength(server.slaves) && !clientsArePaused()) { ping_argv[0] = createStringObject("PING",4); replicationFeedSlaves(server.slaves, server.slaveseldb, From 5606036fb49819312013efbbd9440da65a68c503 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 13 May 2019 17:27:06 +0200 Subject: [PATCH 04/76] Fix test false positive introduced by threaded I/O. Now clients that are ready to be terminated asynchronously are processed more often in beforeSleep() instead of being processed in serverCron(). This means that the test will not be able to catch the moment the client was terminated, also note that the 'omem' figure now changes in big steps, because of the new client output buffers layout. So we have to change the test range in order to accommodate for that. Yet the test is useful enough to be worth taking, even if its precision is reduced by this commit. Probably if we get more problems, a thing that makes sense is just to check that the limit is < 200k. That's more than enough actually.
--- tests/unit/obuf-limits.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/obuf-limits.tcl b/tests/unit/obuf-limits.tcl index 5d625cf45..c45bf8e86 100644 --- a/tests/unit/obuf-limits.tcl +++ b/tests/unit/obuf-limits.tcl @@ -15,7 +15,7 @@ start_server {tags {"obuf-limits"}} { if {![regexp {omem=([0-9]+)} $c - omem]} break if {$omem > 200000} break } - assert {$omem >= 90000 && $omem < 200000} + assert {$omem >= 70000 && $omem < 200000} $rd1 close } From d0d1cbbf5c527a64e972957d58fa77aa2d478253 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 13 May 2019 17:30:02 +0200 Subject: [PATCH 05/76] Make comment in getClientOutputBufferMemoryUsage() describing the present. --- src/networking.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/networking.c b/src/networking.c index 6fec97605..4bc22120a 100644 --- a/src/networking.c +++ b/src/networking.c @@ -2295,15 +2295,8 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { } } -/* This function returns the number of bytes that Redis is virtually +/* This function returns the number of bytes that Redis is * using to store the reply still not read by the client. - * It is "virtual" since the reply output list may contain objects that - * are shared and are not really using additional memory. - * - * The function returns the total sum of the length of all the objects - * stored in the output list, plus the memory used to allocate every - * list node. The static reply buffer is not taken into account since it - * is allocated anyway. * * Note: this function is very fast so can be called as many time as * the caller wishes. The main usage of this function currently is From 78978eb5e1dee96b94c1d158e42796d8c8a4c305 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 14 May 2019 16:54:59 +0200 Subject: [PATCH 06/76] Test: fix slowlog test false positive. 
In fast systems "SLOWLOG RESET" is fast enough not to be logged even when the time limit is "1" sometimes. Leading to false positives such as: [err]: SLOWLOG - can be disabled in tests/unit/slowlog.tcl Expected '1' to be equal to '0' --- tests/unit/slowlog.tcl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/unit/slowlog.tcl b/tests/unit/slowlog.tcl index dbd7a1547..22f088103 100644 --- a/tests/unit/slowlog.tcl +++ b/tests/unit/slowlog.tcl @@ -80,9 +80,11 @@ start_server {tags {"slowlog"} overrides {slowlog-log-slower-than 1000000}} { } test {SLOWLOG - can be disabled} { + r config set slowlog-max-len 1 r config set slowlog-log-slower-than 1 r slowlog reset - assert_equal [r slowlog len] 1 + r debug sleep 0.2 + assert_equal [r slowlog len] 1 r config set slowlog-log-slower-than -1 r slowlog reset r debug sleep 0.2 From 9eea57cc311e62a49358e09af7baebce9da9053f Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 15 May 2019 12:16:43 +0200 Subject: [PATCH 07/76] Narrow the effects of PR #6029 to the exact state. CLIENT PAUSE may be used, in other contexts, for a long time making all the slaves time out. Better for now to be more specific about what should disable sending PINGs. An alternative to that would be to virtually refresh the slave interactions when clients are paused, however for now I went for this more conservative solution. --- src/replication.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/replication.c b/src/replication.c index bfe50c929..63a67a06a 100644 --- a/src/replication.c +++ b/src/replication.c @@ -30,6 +30,7 @@ #include "server.h" +#include "cluster.h" #include #include @@ -2601,12 +2602,23 @@ void replicationCron(void) { /* First, send PING according to ping_slave_period.
*/ if ((replication_cron_loops % server.repl_ping_slave_period) == 0 && - listLength(server.slaves) && !clientsArePaused()) + listLength(server.slaves)) { - ping_argv[0] = createStringObject("PING",4); - replicationFeedSlaves(server.slaves, server.slaveseldb, - ping_argv, 1); - decrRefCount(ping_argv[0]); + /* Note that we don't send the PING if the clients are paused during + * a Redis Cluster manual failover: the PING we send will otherwise + * alter the replication offsets of master and slave, and will no longer + * match the one stored into 'mf_master_offset' state. */ + int manual_failover_in_progress = + server.cluster_enabled && + server.cluster->mf_end && + clientsArePaused(); + + if (!manual_failover_in_progress) { + ping_argv[0] = createStringObject("PING",4); + replicationFeedSlaves(server.slaves, server.slaveseldb, + ping_argv, 1); + decrRefCount(ping_argv[0]); + } } /* Second, send a newline to all the slaves in pre-synchronization From 6e4635e8fd7e121439235c1271581000728843f5 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 15 May 2019 12:46:01 +0200 Subject: [PATCH 08/76] Update CONTRIBUTING with present info. --- CONTRIBUTING | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/CONTRIBUTING b/CONTRIBUTING index 7dee24c74..5fb038e49 100644 --- a/CONTRIBUTING +++ b/CONTRIBUTING @@ -14,9 +14,7 @@ each source file that you contribute. PLEASE DO NOT POST GENERAL QUESTIONS that are not about bugs or suspected bugs in the Github issues system. We'll be very happy to help you and provide - all the support at the Reddit sub: - - http://reddit.com/r/redis + all the support in the mainling list. There is also an active community of Redis users at Stack Overflow: @@ -24,7 +22,12 @@ each source file that you contribute. # How to provide a patch for a new feature -1. If it is a major feature or a semantical change, please post it as a new submission in r/redis on Reddit at http://reddit.com/r/redis. 
Try to be passionate about why the feature is needed, make users upvote your proposal to gain traction and so forth. Read feedbacks about the community. But in this first step **please don't write code yet**. +1. If it is a major feature or a semantical change, please don't start coding +straight away: if your feature is not a conceptual fit you'll lose a lot of +time writing the code without any reason. Start by posting in the mailing list +and creating an issue at Github with the description of, excatly, what you want +to accomplish and why. Use cases are important for features to be accepted. +Here you'll see if there is consensus about your idea. 2. If in step 1 you get an acknowledgment from the project leaders, use the following procedure to submit a patch: @@ -35,6 +38,13 @@ each source file that you contribute. d. Initiate a pull request on github ( https://help.github.com/articles/creating-a-pull-request/ ) e. Done :) -For minor fixes just open a pull request on Github. +3. Keep in mind that we are very overloaded, so issues and PRs sometimes wait +for a *very* long time. However this is not lack of interest, as the project +gets more and more users, we find ourselves in a constant need to prioritize +certain issues/PRs over others. If you think your issue/PR is very important +try to popularize it, have other users commenting and sharing their point of +view and so forth. This helps. + +4. For minor fixes just open a pull request on Github. Thanks! From 525fc336ef5172f1eddbf18cbbfc1a32b054c9e2 Mon Sep 17 00:00:00 2001 From: Christian Zeller Date: Wed, 15 May 2019 16:10:48 +0200 Subject: [PATCH 09/76] Typo fixes in CONTRIBUTING --- CONTRIBUTING | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING b/CONTRIBUTING index 5fb038e49..000edbeaf 100644 --- a/CONTRIBUTING +++ b/CONTRIBUTING @@ -14,7 +14,7 @@ each source file that you contribute. 
PLEASE DO NOT POST GENERAL QUESTIONS that are not about bugs or suspected bugs in the Github issues system. We'll be very happy to help you and provide - all the support in the mainling list. + all the support in the mailing list. There is also an active community of Redis users at Stack Overflow: @@ -25,7 +25,7 @@ each source file that you contribute. 1. If it is a major feature or a semantical change, please don't start coding straight away: if your feature is not a conceptual fit you'll lose a lot of time writing the code without any reason. Start by posting in the mailing list -and creating an issue at Github with the description of, excatly, what you want +and creating an issue at Github with the description of, exactly, what you want to accomplish and why. Use cases are important for features to be accepted. Here you'll see if there is consensus about your idea. From 4bbaf621a12f6ec22ec7dad6a2282b7908660497 Mon Sep 17 00:00:00 2001 From: Angus Pearson Date: Wed, 22 May 2019 16:39:04 +0100 Subject: [PATCH 10/76] Implement `SCAN cursor [TYPE type]` modifier suggested in issue #6107. Add tests to check basic functionality of this optional keyword, and also tested with a module (redisgraph). Checked quickly with valgrind, no issues. Copies the type name canonicalisation code from `typeCommand`, perhaps this would be better factored out to prevent the two diverging and both needing to be edited to add new `OBJ_*` types, but this is a little fiddly with C strings. The [redis-doc](https://github.com/antirez/redis-doc/blob/master/commands.json) repo will need to be updated with this new arg if accepted. A quirk to be aware of here is that the GEO commands are backed by zsets not their own type, so they're not distinguishable from other zsets. Additionally, for sparse types this has the same behaviour as `MATCH` in that it may return many empty results before giving something, even for large `COUNT`s.
--- src/db.c | 32 +++++++++++++++++++++++++++++++- tests/unit/scan.tcl | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/src/db.c b/src/db.c index b537a29a4..6623f7f2f 100644 --- a/src/db.c +++ b/src/db.c @@ -613,7 +613,7 @@ int parseScanCursorOrReply(client *c, robj *o, unsigned long *cursor) { } /* This command implements SCAN, HSCAN and SSCAN commands. - * If object 'o' is passed, then it must be a Hash or Set object, otherwise + * If object 'o' is passed, then it must be a Hash, Set or Zset object, otherwise * if 'o' is NULL the command will operate on the dictionary associated with * the current database. * @@ -629,6 +629,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long cursor) { listNode *node, *nextnode; long count = 10; sds pat = NULL; + sds typename = NULL; int patlen = 0, use_pattern = 0; dict *ht; @@ -665,6 +666,10 @@ void scanGenericCommand(client *c, robj *o, unsigned long cursor) { use_pattern = !(pat[0] == '*' && patlen == 1); i += 2; + } else if (!strcasecmp(c->argv[i]->ptr, "type") && o == NULL && j >= 2) { + /* SCAN for a particular type only applies to the db dict */ + typename = c->argv[i+1]->ptr; + i+= 2; } else { addReply(c,shared.syntaxerr); goto cleanup; @@ -759,6 +764,31 @@ void scanGenericCommand(client *c, robj *o, unsigned long cursor) { } } + /* Filter an element if it isn't the type we want. 
*/ + if (!filter && o == NULL && typename){ + robj* typecheck; + char *type; + typecheck = lookupKeyReadWithFlags(c->db, kobj, LOOKUP_NOTOUCH); + if (typecheck == NULL) { + type = "none"; + } else { + switch(typecheck->type) { + case OBJ_STRING: type = "string"; break; + case OBJ_LIST: type = "list"; break; + case OBJ_SET: type = "set"; break; + case OBJ_ZSET: type = "zset"; break; + case OBJ_HASH: type = "hash"; break; + case OBJ_STREAM: type = "stream"; break; + case OBJ_MODULE: { + moduleValue *mv = typecheck->ptr; + type = mv->type->name; + }; break; + default: type = "unknown"; break; + } + } + if (strcasecmp((char*) typename, type)) filter = 1; + } + /* Filter element if it is an expired key. */ if (!filter && o == NULL && expireIfNeeded(c->db, kobj)) filter = 1; diff --git a/tests/unit/scan.tcl b/tests/unit/scan.tcl index c0f4349d2..9f9ff4df2 100644 --- a/tests/unit/scan.tcl +++ b/tests/unit/scan.tcl @@ -53,6 +53,51 @@ start_server {tags {"scan"}} { assert_equal 100 [llength $keys] } + test "SCAN TYPE" { + r flushdb + # populate only creates strings + r debug populate 1000 + + # Check non-strings are excluded + set cur 0 + set keys {} + while 1 { + set res [r scan $cur type "list"] + set cur [lindex $res 0] + set k [lindex $res 1] + lappend keys {*}$k + if {$cur == 0} break + } + + assert_equal 0 [llength $keys] + + # Check strings are included + set cur 0 + set keys {} + while 1 { + set res [r scan $cur type "string"] + set cur [lindex $res 0] + set k [lindex $res 1] + lappend keys {*}$k + if {$cur == 0} break + } + + assert_equal 1000 [llength $keys] + + # Check all three args work together + set cur 0 + set keys {} + while 1 { + set res [r scan $cur type "string" match "key:*" count 10] + set cur [lindex $res 0] + set k [lindex $res 1] + lappend keys {*}$k + if {$cur == 0} break + } + + assert_equal 1000 [llength $keys] + } + foreach enc {intset hashtable} { test "SSCAN with encoding $enc" { # Create the Set From ec45f5b39501e44adb8f20764c472d210a21ac54 
Mon Sep 17 00:00:00 2001 From: artix Date: Wed, 5 Jun 2019 16:34:55 +0200 Subject: [PATCH 11/76] Redis Benchmark: prevent CONFIG failure from exiting program --- src/redis-benchmark.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c index 2785167a8..1d16fa4ee 100644 --- a/src/redis-benchmark.c +++ b/src/redis-benchmark.c @@ -1540,7 +1540,10 @@ int main(int argc, const char **argv) { if (node->name) printf("%s ", node->name); printf("%s:%d\n", node->ip, node->port); node->redis_config = getRedisConfig(node->ip, node->port, NULL); - if (node->redis_config == NULL) exit(1); + if (node->redis_config == NULL) { + fprintf(stderr, "WARN: could not fetch node CONFIG %s:%d\n", + node->ip, node->port); + } } printf("\n"); /* Automatically set thread number to node count if not specified @@ -1550,7 +1553,8 @@ int main(int argc, const char **argv) { } else { config.redis_config = getRedisConfig(config.hostip, config.hostport, config.hostsocket); - if (config.redis_config == NULL) exit(1); + if (config.redis_config == NULL) + fprintf(stderr, "WARN: could not fetch server CONFIG\n"); } if (config.num_threads > 0) { From 67a4bcac1bbab76e4e872299b32cdbfd995c0eac Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Fri, 7 Jun 2019 13:20:22 -0700 Subject: [PATCH 12/76] Fixed some spelling issues in ACL codepath including user facing error --- src/acl.c | 22 +++++++++++----------- src/server.c | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/acl.c b/src/acl.c index 0205e51ad..a2ee65dd0 100644 --- a/src/acl.c +++ b/src/acl.c @@ -295,7 +295,7 @@ int ACLGetCommandBitCoordinates(uint64_t id, uint64_t *word, uint64_t *bit) { * Note that this function does not check the ALLCOMMANDS flag of the user * but just the lowlevel bitmask. 
* - * If the bit overflows the user internal represetation, zero is returned + * If the bit overflows the user internal representation, zero is returned * in order to disallow the execution of the command in such edge case. */ int ACLGetUserCommandBit(user *u, unsigned long id) { uint64_t word, bit; @@ -311,7 +311,7 @@ int ACLUserCanExecuteFutureCommands(user *u) { } /* Set the specified command bit for the specified user to 'value' (0 or 1). - * If the bit overflows the user internal represetation, no operation + * If the bit overflows the user internal representation, no operation * is performed. As a side effect of calling this function with a value of * zero, the user flag ALLCOMMANDS is cleared since it is no longer possible * to skip the command bit explicit test. */ @@ -350,7 +350,7 @@ int ACLSetUserCommandBitsForCategory(user *u, const char *category, int value) { /* Return the number of commands allowed (on) and denied (off) for the user 'u' * in the subset of commands flagged with the specified category name. - * If the categoty name is not valid, C_ERR is returend, otherwise C_OK is + * If the category name is not valid, C_ERR is returned, otherwise C_OK is * returned and on and off are populated by reference. */ int ACLCountCategoryBitsForUser(user *u, unsigned long *on, unsigned long *off, const char *category) @@ -626,7 +626,7 @@ void ACLAddAllowedSubcommand(user *u, unsigned long id, const char *sub) { * It is possible to specify multiple patterns. * allkeys Alias for ~* * resetkeys Flush the list of allowed keys patterns. - * > Add this passowrd to the list of valid password for the user. + * > Add this password to the list of valid password for the user. * For example >mypass will add "mypass" to the list. * This directive clears the "nopass" flag (see later). * < Remove this password from the list of valid passwords. 
@@ -949,9 +949,9 @@ user *ACLGetUserByName(const char *name, size_t namelen) { return myuser; } -/* Check if the command ready to be excuted in the client 'c', and already - * referenced by c->cmd, can be executed by this client according to the - * ACls associated to the client user c->user. +/* Check if the command is ready to be executed in the client 'c', already + * referenced by c->cmd, and can be executed by this client according to the + * ACLs associated to the client user c->user. * * If the user can execute the command ACL_OK is returned, otherwise * ACL_DENIED_CMD or ACL_DENIED_KEY is returned: the first in case the @@ -1122,7 +1122,7 @@ int ACLLoadConfiguredUsers(void) { } /* This function loads the ACL from the specified filename: every line - * is validated and shold be either empty or in the format used to specify + * is validated and should be either empty or in the format used to specify * users in the redis.conf configuration or in the ACL file, that is: * * user ... rules ... @@ -1172,7 +1172,7 @@ sds ACLLoadFromFile(const char *filename) { * to the real user mentioned in the ACL line. */ user *fakeuser = ACLCreateUnlinkedUser(); - /* We do all the loading in a fresh insteance of the Users radix tree, + /* We do all the loading in a fresh instance of the Users radix tree, * so if there are errors loading the ACL file we can rollback to the * old version. */ rax *old_users = Users; @@ -1248,7 +1248,7 @@ sds ACLLoadFromFile(const char *filename) { } /* Note that the same rules already applied to the fake user, so - * we just assert that everything goess well: it should. */ + * we just assert that everything goes well: it should. 
*/ for (j = 2; j < argc; j++) serverAssert(ACLSetUser(u,argv[j],sdslen(argv[j])) == C_OK); @@ -1611,7 +1611,7 @@ void addReplyCommandCategories(client *c, struct redisCommand *cmd) { setDeferredSetLen(c, flaglen, flagcount); } -/* AUTH +/* AUTH * AUTH (Redis >= 6.0 form) * * When the user is omitted it means that we are trying to authenticate diff --git a/src/server.c b/src/server.c index 2643d7266..e4df04692 100644 --- a/src/server.c +++ b/src/server.c @@ -3325,7 +3325,7 @@ int processCommand(client *c) { if (acl_retval == ACL_DENIED_CMD) addReplyErrorFormat(c, "-NOPERM this user has no permissions to run " - "the '%s' command or its subcommnad", c->cmd->name); + "the '%s' command or its subcommand", c->cmd->name); else addReplyErrorFormat(c, "-NOPERM this user has no permissions to access " From 5ca48db2e2621984b5312715b0ff66e9a0bdfa92 Mon Sep 17 00:00:00 2001 From: Angus Pearson Date: Mon, 10 Jun 2019 17:41:44 +0100 Subject: [PATCH 13/76] Add char* typeNameCanonicalize(robj*) to remove duplicate code between SCAN and TYPE commands, and to keep OBJ_* enum to string canonicalization in one place. --- src/db.c | 37 +++++++++++-------------------------- src/server.h | 6 ++++++ 2 files changed, 17 insertions(+), 26 deletions(-) diff --git a/src/db.c b/src/db.c index 6623f7f2f..6557ddc3c 100644 --- a/src/db.c +++ b/src/db.c @@ -766,26 +766,8 @@ void scanGenericCommand(client *c, robj *o, unsigned long cursor) { /* Filter an element if it isn't the type we want. 
*/ if (!filter && o == NULL && typename){ - robj* typecheck; - char *type; - typecheck = lookupKeyReadWithFlags(c->db, kobj, LOOKUP_NOTOUCH); - if (typecheck == NULL) { - type = "none"; - } else { - switch(typecheck->type) { - case OBJ_STRING: type = "string"; break; - case OBJ_LIST: type = "list"; break; - case OBJ_SET: type = "set"; break; - case OBJ_ZSET: type = "zset"; break; - case OBJ_HASH: type = "hash"; break; - case OBJ_STREAM: type = "stream"; break; - case OBJ_MODULE: { - moduleValue *mv = typecheck->ptr; - type = mv->type->name; - }; break; - default: type = "unknown"; break; - } - } + robj* typecheck = lookupKeyReadWithFlags(c->db, kobj, LOOKUP_NOTOUCH); + char* type = typeNameCanonicalize(typecheck); if (strcasecmp((char*) typename, type)) filter = 1; } @@ -845,11 +827,8 @@ void lastsaveCommand(client *c) { addReplyLongLong(c,server.lastsave); } -void typeCommand(client *c) { - robj *o; - char *type; - - o = lookupKeyReadWithFlags(c->db,c->argv[1],LOOKUP_NOTOUCH); +char* typeNameCanonicalize(robj *o) { + char* type; if (o == NULL) { type = "none"; } else { @@ -867,7 +846,13 @@ void typeCommand(client *c) { default: type = "unknown"; break; } } - addReplyStatus(c,type); + return type; +} + +void typeCommand(client *c) { + robj *o; + o = lookupKeyReadWithFlags(c->db,c->argv[1],LOOKUP_NOTOUCH); + addReplyStatus(c, typeNameCanonicalize(o)); } void shutdownCommand(client *c) { diff --git a/src/server.h b/src/server.h index 0813f8bd1..06d0611fd 100644 --- a/src/server.h +++ b/src/server.h @@ -646,6 +646,12 @@ typedef struct redisObject { void *ptr; } robj; +/* The 'cannonical' name for a type as enumerated above is given by the + * below function. Native types are checked against the OBJ_STRING, + * OBJ_LIST, OBJ_* defines, and Module types have their registered name + * returned.*/ +char* typeNameCanonicalize(robj*); + /* Macro used to initialize a Redis object allocated on the stack. 
* Note that this macro is taken near the structure definition to make sure * we'll update it when the structure is changed, to avoid bugs like From 7077d14afeaba6f65306248a460575a10180ed6c Mon Sep 17 00:00:00 2001 From: swilly22 Date: Wed, 12 Jun 2019 15:37:19 +0300 Subject: [PATCH 14/76] Extend REDISMODULE_CTX_FLAGS to indicate if redis is currently loading from either RDB or AOF --- src/module.c | 3 +++ src/redismodule.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/src/module.c b/src/module.c index 7dee7e776..1cdd94d1d 100644 --- a/src/module.c +++ b/src/module.c @@ -1455,6 +1455,9 @@ int RM_GetContextFlags(RedisModuleCtx *ctx) { if (server.cluster_enabled) flags |= REDISMODULE_CTX_FLAGS_CLUSTER; + if (server.loading) + flags |= REDISMODULE_CTX_FLAGS_LOADING; + /* Maxmemory and eviction policy */ if (server.maxmemory > 0) { flags |= REDISMODULE_CTX_FLAGS_MAXMEMORY; diff --git a/src/redismodule.h b/src/redismodule.h index 259a5f1db..16b8c1937 100644 --- a/src/redismodule.h +++ b/src/redismodule.h @@ -87,6 +87,8 @@ #define REDISMODULE_CTX_FLAGS_OOM_WARNING (1<<11) /* The command was sent over the replication link. */ #define REDISMODULE_CTX_FLAGS_REPLICATED (1<<12) +/* Redis is currently loading either from AOF or RDB. */ +#define REDISMODULE_CTX_FLAGS_LOADING (1<<13) #define REDISMODULE_NOTIFY_GENERIC (1<<2) /* g */ From 49edda41b96b2f4586a1a5020180a0a0941bafbc Mon Sep 17 00:00:00 2001 From: Angus Pearson Date: Thu, 13 Jun 2019 17:49:33 +0100 Subject: [PATCH 15/76] Spelling cannonical -> canonical --- src/server.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server.h b/src/server.h index 06d0611fd..dc02edb5c 100644 --- a/src/server.h +++ b/src/server.h @@ -646,7 +646,7 @@ typedef struct redisObject { void *ptr; } robj; -/* The 'cannonical' name for a type as enumerated above is given by the +/* The 'canonical' name for a type as enumerated above is given by the * below function. 
Native types are checked against the OBJ_STRING, * OBJ_LIST, OBJ_* defines, and Module types have their registered name * returned.*/ From bf7eb02d3e8841f860119d0e566bed1d82ac5fb6 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Fri, 31 May 2019 12:05:18 -0700 Subject: [PATCH 16/76] Refactored yesno configs so there was less duplication --- src/config.c | 354 +++++++++++++-------------------------------------- 1 file changed, 89 insertions(+), 265 deletions(-) diff --git a/src/config.c b/src/config.c index 7f0e9af89..2e6e9a6b7 100644 --- a/src/config.c +++ b/src/config.c @@ -98,6 +98,48 @@ clientBufferLimitsConfig clientBufferLimitsDefaults[CLIENT_TYPE_OBUF_COUNT] = { {1024*1024*32, 1024*1024*8, 60} /* pubsub */ }; +/* Configuration values that require no special handling to set, get, load or + * rewrite. */ +typedef struct configYesNo { + const char *name; /* The user visible name of this config */ + const char *alias; /* An alias that can also be used for this config */ + int *config; /* The pointer to the server config this value is stored in */ + const int modifiable; /* Can this value be updated by CONFIG SET? 
*/ + const int default_value; /* The default value of the config on rewrite */ +} configYesNo; + +configYesNo configs_yesno[] = { + /* Non-Modifiable */ + {"rdbchecksum",NULL,&server.rdb_checksum,0,CONFIG_DEFAULT_RDB_CHECKSUM}, + {"daemonize",NULL,&server.daemonize,0,0}, + {"io-threads-do-reads",NULL,&server.io_threads_do_reads, 0, CONFIG_DEFAULT_IO_THREADS_DO_READS}, + {"always-show-logo",NULL,&server.always_show_logo,0,CONFIG_DEFAULT_ALWAYS_SHOW_LOGO}, + /* Modifiable */ + {"protected-mode",NULL,&server.protected_mode,1,CONFIG_DEFAULT_PROTECTED_MODE}, + {"rdbcompression",NULL,&server.rdb_compression,1,CONFIG_DEFAULT_RDB_COMPRESSION}, + {"activerehashing",NULL,&server.activerehashing,1,CONFIG_DEFAULT_ACTIVE_REHASHING}, + {"stop-writes-on-bgsave-error",NULL,&server.stop_writes_on_bgsave_err,1,CONFIG_DEFAULT_STOP_WRITES_ON_BGSAVE_ERROR}, + {"dynamic-hz",NULL,&server.dynamic_hz,1,CONFIG_DEFAULT_DYNAMIC_HZ}, + {"lazyfree-lazy-eviction",NULL,&server.lazyfree_lazy_eviction,1,CONFIG_DEFAULT_LAZYFREE_LAZY_EVICTION}, + {"lazyfree-lazy-expire",NULL,&server.lazyfree_lazy_expire,1,CONFIG_DEFAULT_LAZYFREE_LAZY_EXPIRE}, + {"lazyfree-lazy-server-del",NULL,&server.lazyfree_lazy_server_del,1,CONFIG_DEFAULT_LAZYFREE_LAZY_SERVER_DEL}, + {"repl-disable-tcp-nodelay",NULL,&server.repl_disable_tcp_nodelay,1,CONFIG_DEFAULT_REPL_DISABLE_TCP_NODELAY}, + {"repl-diskless-sync",NULL,&server.repl_diskless_sync,1,CONFIG_DEFAULT_REPL_DISKLESS_SYNC}, + {"gopher-enabled",NULL,&server.gopher_enabled,1,CONFIG_DEFAULT_GOPHER_ENABLED}, + {"aof-rewrite-incremental-fsync",NULL,&server.aof_rewrite_incremental_fsync,1,CONFIG_DEFAULT_AOF_REWRITE_INCREMENTAL_FSYNC}, + {"no-appendfsync-on-rewrite",NULL,&server.aof_no_fsync_on_rewrite,1,CONFIG_DEFAULT_AOF_NO_FSYNC_ON_REWRITE}, + {"cluster-require-full-coverage",NULL,&server.cluster_require_full_coverage,CLUSTER_DEFAULT_REQUIRE_FULL_COVERAGE}, + {"rdb-save-incremental-fsync",NULL,&server.rdb_save_incremental_fsync,1,CONFIG_DEFAULT_RDB_SAVE_INCREMENTAL_FSYNC}, 
+ {"aof-load-truncated",NULL,&server.aof_load_truncated,1,CONFIG_DEFAULT_AOF_LOAD_TRUNCATED}, + {"aof-use-rdb-preamble",NULL,&server.aof_use_rdb_preamble,1,CONFIG_DEFAULT_AOF_USE_RDB_PREAMBLE}, + {"cluster-replica-no-failover","cluster-slave-no-failover",&server.cluster_slave_no_failover,1,CLUSTER_DEFAULT_SLAVE_NO_FAILOVER}, + {"replica-lazy-flush","slave-lazy-flush",&server.repl_slave_lazy_flush,1,CONFIG_DEFAULT_SLAVE_LAZY_FLUSH}, + {"replica-serve-stale-data","slave-serve-stale-data",&server.repl_serve_stale_data,1,CONFIG_DEFAULT_SLAVE_SERVE_STALE_DATA}, + {"replica-read-only","slave-read-only",&server.repl_slave_ro,1,CONFIG_DEFAULT_SLAVE_READ_ONLY}, + {"replica-ignore-maxmemory","slave-ignore-maxmemory",&server.repl_slave_ignore_maxmemory,1,CONFIG_DEFAULT_SLAVE_IGNORE_MAXMEMORY}, + {NULL, NULL, 0, 0} +}; + /*----------------------------------------------------------------------------- * Enum access functions *----------------------------------------------------------------------------*/ @@ -201,6 +243,26 @@ void loadServerConfigFromString(char *config) { } sdstolower(argv[0]); + /* Iterate the configs that are standard */ + int match = 0; + for (configYesNo *config = configs_yesno; config->name != NULL; config++) { + if ((!strcasecmp(argv[0],config->name) || + (config->alias && !strcasecmp(argv[0],config->alias))) && + (argc == 2)) + { + if ((*(config->config) = yesnotoi(argv[1])) == -1) { + err = "argument must be 'yes' or 'no'"; goto loaderr; + } + match = 1; + break; + } + } + + if (match) { + sdsfreesplitres(argv,argc); + continue; + } + /* Execute config directives */ if (!strcasecmp(argv[0],"timeout") && argc == 2) { server.maxidletime = atoi(argv[1]); @@ -212,14 +274,6 @@ void loadServerConfigFromString(char *config) { if (server.tcpkeepalive < 0) { err = "Invalid tcp-keepalive value"; goto loaderr; } - } else if (!strcasecmp(argv[0],"protected-mode") && argc == 2) { - if ((server.protected_mode = yesnotoi(argv[1])) == -1) { - err = "argument must be 
'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"gopher-enabled") && argc == 2) { - if ((server.gopher_enabled = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } } else if (!strcasecmp(argv[0],"port") && argc == 2) { server.port = atoi(argv[1]); if (server.port < 0 || server.port > 65535) { @@ -290,10 +344,6 @@ void loadServerConfigFromString(char *config) { } else if (!strcasecmp(argv[0],"aclfile") && argc == 2) { zfree(server.acl_filename); server.acl_filename = zstrdup(argv[1]); - } else if (!strcasecmp(argv[0],"always-show-logo") && argc == 2) { - if ((server.always_show_logo = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } } else if (!strcasecmp(argv[0],"syslog-enabled") && argc == 2) { if ((server.syslog_enabled = yesnotoi(argv[1])) == -1) { err = "argument must be 'yes' or 'no'"; goto loaderr; @@ -318,10 +368,6 @@ void loadServerConfigFromString(char *config) { if (server.io_threads_num < 1 || server.io_threads_num > 512) { err = "Invalid number of I/O threads"; goto loaderr; } - } else if (!strcasecmp(argv[0],"io-threads-do-reads") && argc == 2) { - if ((server.io_threads_do_reads = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } } else if (!strcasecmp(argv[0],"include") && argc == 2) { loadServerConfig(argv[1],NULL); } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) { @@ -381,14 +427,6 @@ void loadServerConfigFromString(char *config) { err = "repl-timeout must be 1 or greater"; goto loaderr; } - } else if (!strcasecmp(argv[0],"repl-disable-tcp-nodelay") && argc==2) { - if ((server.repl_disable_tcp_nodelay = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"repl-diskless-sync") && argc==2) { - if ((server.repl_diskless_sync = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } } else if 
(!strcasecmp(argv[0],"repl-diskless-sync-delay") && argc==2) { server.repl_diskless_sync_delay = atoi(argv[1]); if (server.repl_diskless_sync_delay < 0) { @@ -414,57 +452,6 @@ void loadServerConfigFromString(char *config) { } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) { zfree(server.masterauth); server.masterauth = argv[1][0] ? zstrdup(argv[1]) : NULL; - } else if ((!strcasecmp(argv[0],"slave-serve-stale-data") || - !strcasecmp(argv[0],"replica-serve-stale-data")) - && argc == 2) - { - if ((server.repl_serve_stale_data = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if ((!strcasecmp(argv[0],"slave-read-only") || - !strcasecmp(argv[0],"replica-read-only")) - && argc == 2) - { - if ((server.repl_slave_ro = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if ((!strcasecmp(argv[0],"slave-ignore-maxmemory") || - !strcasecmp(argv[0],"replica-ignore-maxmemory")) - && argc == 2) - { - if ((server.repl_slave_ignore_maxmemory = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) { - if ((server.rdb_compression = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"rdbchecksum") && argc == 2) { - if ((server.rdb_checksum = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) { - if ((server.activerehashing = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"lazyfree-lazy-eviction") && argc == 2) { - if ((server.lazyfree_lazy_eviction = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"lazyfree-lazy-expire") && argc == 2) { - if ((server.lazyfree_lazy_expire 
= yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"lazyfree-lazy-server-del") && argc == 2){ - if ((server.lazyfree_lazy_server_del = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if ((!strcasecmp(argv[0],"slave-lazy-flush") || - !strcasecmp(argv[0],"replica-lazy-flush")) && argc == 2) - { - if ((server.repl_slave_lazy_flush = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } } else if (!strcasecmp(argv[0],"activedefrag") && argc == 2) { if ((server.active_defrag_enabled = yesnotoi(argv[1])) == -1) { err = "argument must be 'yes' or 'no'"; goto loaderr; @@ -474,14 +461,6 @@ void loadServerConfigFromString(char *config) { err = "active defrag can't be enabled without proper jemalloc support"; goto loaderr; #endif } - } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) { - if ((server.daemonize = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"dynamic-hz") && argc == 2) { - if ((server.dynamic_hz = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } } else if (!strcasecmp(argv[0],"hz") && argc == 2) { server.config_hz = atoi(argv[1]); if (server.config_hz < CONFIG_MIN_HZ) server.config_hz = CONFIG_MIN_HZ; @@ -500,11 +479,6 @@ void loadServerConfigFromString(char *config) { } zfree(server.aof_filename); server.aof_filename = zstrdup(argv[1]); - } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite") - && argc == 2) { - if ((server.aof_no_fsync_on_rewrite= yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) { server.aof_fsync = configEnumGetValue(aof_fsync_enum,argv[1]); if (server.aof_fsync == INT_MIN) { @@ -523,28 +497,6 @@ void loadServerConfigFromString(char *config) { argc == 2) { 
server.aof_rewrite_min_size = memtoll(argv[1],NULL); - } else if (!strcasecmp(argv[0],"aof-rewrite-incremental-fsync") && - argc == 2) - { - if ((server.aof_rewrite_incremental_fsync = - yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"rdb-save-incremental-fsync") && - argc == 2) - { - if ((server.rdb_save_incremental_fsync = - yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"aof-load-truncated") && argc == 2) { - if ((server.aof_load_truncated = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"aof-use-rdb-preamble") && argc == 2) { - if ((server.aof_use_rdb_preamble = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) { if (strlen(argv[1]) > CONFIG_AUTHPASS_MAX_LEN) { err = "Password is longer than CONFIG_AUTHPASS_MAX_LEN"; @@ -678,13 +630,6 @@ void loadServerConfigFromString(char *config) { { err = "Invalid port"; goto loaderr; } - } else if (!strcasecmp(argv[0],"cluster-require-full-coverage") && - argc == 2) - { - if ((server.cluster_require_full_coverage = yesnotoi(argv[1])) == -1) - { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } } else if (!strcasecmp(argv[0],"cluster-node-timeout") && argc == 2) { server.cluster_node_timeout = strtoll(argv[1],NULL,10); if (server.cluster_node_timeout <= 0) { @@ -707,15 +652,6 @@ void loadServerConfigFromString(char *config) { err = "cluster replica validity factor must be zero or positive"; goto loaderr; } - } else if ((!strcasecmp(argv[0],"cluster-slave-no-failover") || - !strcasecmp(argv[0],"cluster-replica-no-failover")) && - argc == 2) - { - server.cluster_slave_no_failover = yesnotoi(argv[1]); - if (server.cluster_slave_no_failover == -1) { - err = "argument must be 'yes' or 'no'"; - goto 
loaderr; - } } else if (!strcasecmp(argv[0],"lua-time-limit") && argc == 2) { server.lua_time_limit = strtoll(argv[1],NULL,10); } else if (!strcasecmp(argv[0],"lua-replicate-commands") && argc == 2) { @@ -756,11 +692,6 @@ void loadServerConfigFromString(char *config) { server.client_obuf_limits[class].hard_limit_bytes = hard; server.client_obuf_limits[class].soft_limit_bytes = soft; server.client_obuf_limits[class].soft_limit_seconds = soft_seconds; - } else if (!strcasecmp(argv[0],"stop-writes-on-bgsave-error") && - argc == 2) { - if ((server.stop_writes_on_bgsave_err = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } } else if ((!strcasecmp(argv[0],"slave-priority") || !strcasecmp(argv[0],"replica-priority")) && argc == 2) { @@ -941,6 +872,19 @@ void configSetCommand(client *c) { serverAssertWithInfo(c,c->argv[3],sdsEncodedObject(c->argv[3])); o = c->argv[3]; + /* Iterate the configs that are standard */ + for (configYesNo *config = configs_yesno; config->name != NULL; config++) { + if(config->modifiable && (!strcasecmp(c->argv[2]->ptr,config->name) || + (config->alias && !strcasecmp(c->argv[2]->ptr,config->alias)))) + { + int yn = yesnotoi(o->ptr); + if (yn == -1) goto badfmt; + *(config->config) = yn; + addReply(c,shared.ok); + return; + } + } + if (0) { /* this starts the config_set macros else-if chain. */ /* Special fields that can't be handled with general macros. */ @@ -1105,40 +1049,6 @@ void configSetCommand(client *c) { /* Boolean fields. * config_set_bool_field(name,var). 
*/ - } config_set_bool_field( - "rdbcompression", server.rdb_compression) { - } config_set_bool_field( - "repl-disable-tcp-nodelay",server.repl_disable_tcp_nodelay) { - } config_set_bool_field( - "repl-diskless-sync",server.repl_diskless_sync) { - } config_set_bool_field( - "cluster-require-full-coverage",server.cluster_require_full_coverage) { - } config_set_bool_field( - "cluster-slave-no-failover",server.cluster_slave_no_failover) { - } config_set_bool_field( - "cluster-replica-no-failover",server.cluster_slave_no_failover) { - } config_set_bool_field( - "aof-rewrite-incremental-fsync",server.aof_rewrite_incremental_fsync) { - } config_set_bool_field( - "rdb-save-incremental-fsync",server.rdb_save_incremental_fsync) { - } config_set_bool_field( - "aof-load-truncated",server.aof_load_truncated) { - } config_set_bool_field( - "aof-use-rdb-preamble",server.aof_use_rdb_preamble) { - } config_set_bool_field( - "slave-serve-stale-data",server.repl_serve_stale_data) { - } config_set_bool_field( - "replica-serve-stale-data",server.repl_serve_stale_data) { - } config_set_bool_field( - "slave-read-only",server.repl_slave_ro) { - } config_set_bool_field( - "replica-read-only",server.repl_slave_ro) { - } config_set_bool_field( - "slave-ignore-maxmemory",server.repl_slave_ignore_maxmemory) { - } config_set_bool_field( - "replica-ignore-maxmemory",server.repl_slave_ignore_maxmemory) { - } config_set_bool_field( - "activerehashing",server.activerehashing) { } config_set_bool_field( "activedefrag",server.active_defrag_enabled) { #ifndef HAVE_DEFRAG @@ -1152,27 +1062,6 @@ void configSetCommand(client *c) { return; } #endif - } config_set_bool_field( - "protected-mode",server.protected_mode) { - } config_set_bool_field( - "gopher-enabled",server.gopher_enabled) { - } config_set_bool_field( - "stop-writes-on-bgsave-error",server.stop_writes_on_bgsave_err) { - } config_set_bool_field( - "lazyfree-lazy-eviction",server.lazyfree_lazy_eviction) { - } config_set_bool_field( - 
"lazyfree-lazy-expire",server.lazyfree_lazy_expire) { - } config_set_bool_field( - "lazyfree-lazy-server-del",server.lazyfree_lazy_server_del) { - } config_set_bool_field( - "slave-lazy-flush",server.repl_slave_lazy_flush) { - } config_set_bool_field( - "replica-lazy-flush",server.repl_slave_lazy_flush) { - } config_set_bool_field( - "no-appendfsync-on-rewrite",server.aof_no_fsync_on_rewrite) { - } config_set_bool_field( - "dynamic-hz",server.dynamic_hz) { - /* Numerical fields. * config_set_numerical_field(name,var,min,max) */ } config_set_numerical_field( @@ -1460,60 +1349,15 @@ void configGetCommand(client *c) { config_get_numerical_field("tcp-keepalive",server.tcpkeepalive); /* Bool (yes/no) values */ - config_get_bool_field("cluster-require-full-coverage", - server.cluster_require_full_coverage); - config_get_bool_field("cluster-slave-no-failover", - server.cluster_slave_no_failover); - config_get_bool_field("cluster-replica-no-failover", - server.cluster_slave_no_failover); - config_get_bool_field("no-appendfsync-on-rewrite", - server.aof_no_fsync_on_rewrite); - config_get_bool_field("slave-serve-stale-data", - server.repl_serve_stale_data); - config_get_bool_field("replica-serve-stale-data", - server.repl_serve_stale_data); - config_get_bool_field("slave-read-only", - server.repl_slave_ro); - config_get_bool_field("replica-read-only", - server.repl_slave_ro); - config_get_bool_field("slave-ignore-maxmemory", - server.repl_slave_ignore_maxmemory); - config_get_bool_field("replica-ignore-maxmemory", - server.repl_slave_ignore_maxmemory); - config_get_bool_field("stop-writes-on-bgsave-error", - server.stop_writes_on_bgsave_err); - config_get_bool_field("daemonize", server.daemonize); - config_get_bool_field("rdbcompression", server.rdb_compression); - config_get_bool_field("rdbchecksum", server.rdb_checksum); - config_get_bool_field("activerehashing", server.activerehashing); + /* Iterate the configs that are standard */ + for (configYesNo *config = 
configs_yesno; config->name != NULL; config++) { + config_get_bool_field(config->name, *(config->config)); + if (config->alias) { + config_get_bool_field(config->alias, *(config->config)); + } + } + config_get_bool_field("activedefrag", server.active_defrag_enabled); - config_get_bool_field("protected-mode", server.protected_mode); - config_get_bool_field("gopher-enabled", server.gopher_enabled); - config_get_bool_field("io-threads-do-reads", server.io_threads_do_reads); - config_get_bool_field("repl-disable-tcp-nodelay", - server.repl_disable_tcp_nodelay); - config_get_bool_field("repl-diskless-sync", - server.repl_diskless_sync); - config_get_bool_field("aof-rewrite-incremental-fsync", - server.aof_rewrite_incremental_fsync); - config_get_bool_field("rdb-save-incremental-fsync", - server.rdb_save_incremental_fsync); - config_get_bool_field("aof-load-truncated", - server.aof_load_truncated); - config_get_bool_field("aof-use-rdb-preamble", - server.aof_use_rdb_preamble); - config_get_bool_field("lazyfree-lazy-eviction", - server.lazyfree_lazy_eviction); - config_get_bool_field("lazyfree-lazy-expire", - server.lazyfree_lazy_expire); - config_get_bool_field("lazyfree-lazy-server-del", - server.lazyfree_lazy_server_del); - config_get_bool_field("slave-lazy-flush", - server.repl_slave_lazy_flush); - config_get_bool_field("replica-lazy-flush", - server.repl_slave_lazy_flush); - config_get_bool_field("dynamic-hz", - server.dynamic_hz); /* Enum values */ config_get_enum_field("maxmemory-policy", @@ -1858,7 +1702,7 @@ void rewriteConfigBytesOption(struct rewriteConfigState *state, char *option, lo } /* Rewrite a yes/no option. */ -void rewriteConfigYesNoOption(struct rewriteConfigState *state, char *option, int value, int defvalue) { +void rewriteConfigYesNoOption(struct rewriteConfigState *state, const char *option, int value, int defvalue) { int force = value != defvalue; sds line = sdscatprintf(sdsempty(),"%s %s",option, value ? 
"yes" : "no"); @@ -2228,7 +2072,11 @@ int rewriteConfig(char *path) { /* Step 2: rewrite every single option, replacing or appending it inside * the rewrite state. */ - rewriteConfigYesNoOption(state,"daemonize",server.daemonize,0); + /* Iterate the configs that are standard */ + for (configYesNo *config = configs_yesno; config->name != NULL; config++) { + rewriteConfigYesNoOption(state,config->name,*(config->config),config->default_value); + } + rewriteConfigStringOption(state,"pidfile",server.pidfile,CONFIG_DEFAULT_PID_FILE); rewriteConfigNumericalOption(state,"port",server.port,CONFIG_DEFAULT_SERVER_PORT); rewriteConfigNumericalOption(state,"cluster-announce-port",server.cluster_announce_port,CONFIG_DEFAULT_CLUSTER_ANNOUNCE_PORT); @@ -2250,9 +2098,6 @@ int rewriteConfig(char *path) { rewriteConfigUserOption(state); rewriteConfigNumericalOption(state,"databases",server.dbnum,CONFIG_DEFAULT_DBNUM); rewriteConfigNumericalOption(state,"io-threads",server.dbnum,CONFIG_DEFAULT_IO_THREADS_NUM); - rewriteConfigYesNoOption(state,"stop-writes-on-bgsave-error",server.stop_writes_on_bgsave_err,CONFIG_DEFAULT_STOP_WRITES_ON_BGSAVE_ERROR); - rewriteConfigYesNoOption(state,"rdbcompression",server.rdb_compression,CONFIG_DEFAULT_RDB_COMPRESSION); - rewriteConfigYesNoOption(state,"rdbchecksum",server.rdb_checksum,CONFIG_DEFAULT_RDB_CHECKSUM); rewriteConfigStringOption(state,"dbfilename",server.rdb_filename,CONFIG_DEFAULT_RDB_FILENAME); rewriteConfigDirOption(state); rewriteConfigSlaveofOption(state,"replicaof"); @@ -2260,15 +2105,10 @@ int rewriteConfig(char *path) { rewriteConfigStringOption(state,"masteruser",server.masteruser,NULL); rewriteConfigStringOption(state,"masterauth",server.masterauth,NULL); rewriteConfigStringOption(state,"cluster-announce-ip",server.cluster_announce_ip,NULL); - rewriteConfigYesNoOption(state,"replica-serve-stale-data",server.repl_serve_stale_data,CONFIG_DEFAULT_SLAVE_SERVE_STALE_DATA); - 
rewriteConfigYesNoOption(state,"replica-read-only",server.repl_slave_ro,CONFIG_DEFAULT_SLAVE_READ_ONLY); - rewriteConfigYesNoOption(state,"replica-ignore-maxmemory",server.repl_slave_ignore_maxmemory,CONFIG_DEFAULT_SLAVE_IGNORE_MAXMEMORY); rewriteConfigNumericalOption(state,"repl-ping-replica-period",server.repl_ping_slave_period,CONFIG_DEFAULT_REPL_PING_SLAVE_PERIOD); rewriteConfigNumericalOption(state,"repl-timeout",server.repl_timeout,CONFIG_DEFAULT_REPL_TIMEOUT); rewriteConfigBytesOption(state,"repl-backlog-size",server.repl_backlog_size,CONFIG_DEFAULT_REPL_BACKLOG_SIZE); rewriteConfigBytesOption(state,"repl-backlog-ttl",server.repl_backlog_time_limit,CONFIG_DEFAULT_REPL_BACKLOG_TIME_LIMIT); - rewriteConfigYesNoOption(state,"repl-disable-tcp-nodelay",server.repl_disable_tcp_nodelay,CONFIG_DEFAULT_REPL_DISABLE_TCP_NODELAY); - rewriteConfigYesNoOption(state,"repl-diskless-sync",server.repl_diskless_sync,CONFIG_DEFAULT_REPL_DISKLESS_SYNC); rewriteConfigNumericalOption(state,"repl-diskless-sync-delay",server.repl_diskless_sync_delay,CONFIG_DEFAULT_REPL_DISKLESS_SYNC_DELAY); rewriteConfigNumericalOption(state,"replica-priority",server.slave_priority,CONFIG_DEFAULT_SLAVE_PRIORITY); rewriteConfigNumericalOption(state,"min-replicas-to-write",server.repl_min_slaves_to_write,CONFIG_DEFAULT_MIN_SLAVES_TO_WRITE); @@ -2291,14 +2131,11 @@ int rewriteConfig(char *path) { rewriteConfigYesNoOption(state,"appendonly",server.aof_state != AOF_OFF,0); rewriteConfigStringOption(state,"appendfilename",server.aof_filename,CONFIG_DEFAULT_AOF_FILENAME); rewriteConfigEnumOption(state,"appendfsync",server.aof_fsync,aof_fsync_enum,CONFIG_DEFAULT_AOF_FSYNC); - rewriteConfigYesNoOption(state,"no-appendfsync-on-rewrite",server.aof_no_fsync_on_rewrite,CONFIG_DEFAULT_AOF_NO_FSYNC_ON_REWRITE); rewriteConfigNumericalOption(state,"auto-aof-rewrite-percentage",server.aof_rewrite_perc,AOF_REWRITE_PERC); 
rewriteConfigBytesOption(state,"auto-aof-rewrite-min-size",server.aof_rewrite_min_size,AOF_REWRITE_MIN_SIZE); rewriteConfigNumericalOption(state,"lua-time-limit",server.lua_time_limit,LUA_SCRIPT_TIME_LIMIT); rewriteConfigYesNoOption(state,"cluster-enabled",server.cluster_enabled,0); rewriteConfigStringOption(state,"cluster-config-file",server.cluster_configfile,CONFIG_DEFAULT_CLUSTER_CONFIG_FILE); - rewriteConfigYesNoOption(state,"cluster-require-full-coverage",server.cluster_require_full_coverage,CLUSTER_DEFAULT_REQUIRE_FULL_COVERAGE); - rewriteConfigYesNoOption(state,"cluster-replica-no-failover",server.cluster_slave_no_failover,CLUSTER_DEFAULT_SLAVE_NO_FAILOVER); rewriteConfigNumericalOption(state,"cluster-node-timeout",server.cluster_node_timeout,CLUSTER_DEFAULT_NODE_TIMEOUT); rewriteConfigNumericalOption(state,"cluster-migration-barrier",server.cluster_migration_barrier,CLUSTER_DEFAULT_MIGRATION_BARRIER); rewriteConfigNumericalOption(state,"cluster-replica-validity-factor",server.cluster_slave_validity_factor,CLUSTER_DEFAULT_SLAVE_VALIDITY); @@ -2316,23 +2153,10 @@ int rewriteConfig(char *path) { rewriteConfigNumericalOption(state,"zset-max-ziplist-entries",server.zset_max_ziplist_entries,OBJ_ZSET_MAX_ZIPLIST_ENTRIES); rewriteConfigNumericalOption(state,"zset-max-ziplist-value",server.zset_max_ziplist_value,OBJ_ZSET_MAX_ZIPLIST_VALUE); rewriteConfigNumericalOption(state,"hll-sparse-max-bytes",server.hll_sparse_max_bytes,CONFIG_DEFAULT_HLL_SPARSE_MAX_BYTES); - rewriteConfigYesNoOption(state,"activerehashing",server.activerehashing,CONFIG_DEFAULT_ACTIVE_REHASHING); rewriteConfigYesNoOption(state,"activedefrag",server.active_defrag_enabled,CONFIG_DEFAULT_ACTIVE_DEFRAG); - rewriteConfigYesNoOption(state,"protected-mode",server.protected_mode,CONFIG_DEFAULT_PROTECTED_MODE); - rewriteConfigYesNoOption(state,"gopher-enabled",server.gopher_enabled,CONFIG_DEFAULT_GOPHER_ENABLED); - 
rewriteConfigYesNoOption(state,"io-threads-do-reads",server.io_threads_do_reads,CONFIG_DEFAULT_IO_THREADS_DO_READS); rewriteConfigClientoutputbufferlimitOption(state); rewriteConfigNumericalOption(state,"hz",server.config_hz,CONFIG_DEFAULT_HZ); - rewriteConfigYesNoOption(state,"aof-rewrite-incremental-fsync",server.aof_rewrite_incremental_fsync,CONFIG_DEFAULT_AOF_REWRITE_INCREMENTAL_FSYNC); - rewriteConfigYesNoOption(state,"rdb-save-incremental-fsync",server.rdb_save_incremental_fsync,CONFIG_DEFAULT_RDB_SAVE_INCREMENTAL_FSYNC); - rewriteConfigYesNoOption(state,"aof-load-truncated",server.aof_load_truncated,CONFIG_DEFAULT_AOF_LOAD_TRUNCATED); - rewriteConfigYesNoOption(state,"aof-use-rdb-preamble",server.aof_use_rdb_preamble,CONFIG_DEFAULT_AOF_USE_RDB_PREAMBLE); rewriteConfigEnumOption(state,"supervised",server.supervised_mode,supervised_mode_enum,SUPERVISED_NONE); - rewriteConfigYesNoOption(state,"lazyfree-lazy-eviction",server.lazyfree_lazy_eviction,CONFIG_DEFAULT_LAZYFREE_LAZY_EVICTION); - rewriteConfigYesNoOption(state,"lazyfree-lazy-expire",server.lazyfree_lazy_expire,CONFIG_DEFAULT_LAZYFREE_LAZY_EXPIRE); - rewriteConfigYesNoOption(state,"lazyfree-lazy-server-del",server.lazyfree_lazy_server_del,CONFIG_DEFAULT_LAZYFREE_LAZY_SERVER_DEL); - rewriteConfigYesNoOption(state,"replica-lazy-flush",server.repl_slave_lazy_flush,CONFIG_DEFAULT_SLAVE_LAZY_FLUSH); - rewriteConfigYesNoOption(state,"dynamic-hz",server.dynamic_hz,CONFIG_DEFAULT_DYNAMIC_HZ); /* Rewrite Sentinel config if in Sentinel mode. */ if (server.sentinel_mode) rewriteConfigSentinelOption(state); From a152f483cddb2d8c5a28c6abc3805ac66140b1e4 Mon Sep 17 00:00:00 2001 From: antirez Date: Sat, 29 Jun 2019 09:09:38 -0400 Subject: [PATCH 17/76] Client side caching: add new file and description. 
--- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index f35685eff..e608309f8 100644 --- a/src/Makefile +++ b/src/Makefile @@ -164,7 +164,7 @@ endif REDIS_SERVER_NAME=redis-server REDIS_SENTINEL_NAME=redis-sentinel -REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o acl.o gopher.o +REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o acl.o gopher.o tracking.o REDIS_CLI_NAME=redis-cli REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o anet.o ae.o crc64.o siphash.o crc16.o REDIS_BENCHMARK_NAME=redis-benchmark From 7f9de752de8fb5a12b571570386656362c215b60 Mon Sep 17 00:00:00 2001 From: antirez Date: Sat, 29 Jun 2019 20:08:41 -0400 Subject: [PATCH
18/76] Client side caching: fields and flags for tracking mode. --- src/networking.c | 5 +++++ src/server.h | 21 ++++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/networking.c b/src/networking.c index 4bc22120a..44979770c 100644 --- a/src/networking.c +++ b/src/networking.c @@ -158,6 +158,7 @@ client *createClient(int fd) { c->pubsub_patterns = listCreate(); c->peerid = NULL; c->client_list_node = NULL; + c->client_tracking_redirection = 0; listSetFreeMethod(c->pubsub_patterns,decrRefCountVoid); listSetMatchMethod(c->pubsub_patterns,listMatchObjects); if (fd != -1) linkClient(c); @@ -966,6 +967,9 @@ void unlinkClient(client *c) { listDelNode(server.unblocked_clients,ln); c->flags &= ~CLIENT_UNBLOCKED; } + + /* Clear the tracking status. */ + if (c->flags & CLIENT_TRACKING) disableTracking(c); } void freeClient(client *c) { @@ -1849,6 +1853,7 @@ sds catClientInfoString(sds s, client *client) { if (client->flags & CLIENT_PUBSUB) *p++ = 'P'; if (client->flags & CLIENT_MULTI) *p++ = 'x'; if (client->flags & CLIENT_BLOCKED) *p++ = 'b'; + if (client->flags & CLIENT_TRACKING) *p++ = 't'; if (client->flags & CLIENT_DIRTY_CAS) *p++ = 'd'; if (client->flags & CLIENT_CLOSE_AFTER_REPLY) *p++ = 'c'; if (client->flags & CLIENT_UNBLOCKED) *p++ = 'u'; diff --git a/src/server.h b/src/server.h index 0813f8bd1..a6c6a4dae 100644 --- a/src/server.h +++ b/src/server.h @@ -254,8 +254,8 @@ typedef long long mstime_t; /* millisecond time type. 
*/ #define AOF_WAIT_REWRITE 2 /* AOF waits rewrite to start appending */ /* Client flags */ -#define CLIENT_SLAVE (1<<0) /* This client is a slave server */ -#define CLIENT_MASTER (1<<1) /* This client is a master server */ +#define CLIENT_SLAVE (1<<0) /* This client is a replica */ +#define CLIENT_MASTER (1<<1) /* This client is a master */ #define CLIENT_MONITOR (1<<2) /* This client is a slave monitor, see MONITOR */ #define CLIENT_MULTI (1<<3) /* This client is in a MULTI context */ #define CLIENT_BLOCKED (1<<4) /* The client is waiting in a blocking operation */ @@ -289,7 +289,12 @@ typedef long long mstime_t; /* millisecond time type. */ #define CLIENT_PENDING_READ (1<<29) /* The client has pending reads and was put in the list of clients we can read from. */ -#define CLIENT_PENDING_COMMAND (1<<30) /* */ +#define CLIENT_PENDING_COMMAND (1<<30) /* Used in threaded I/O to signal after + we return single threaded that the + client has already pending commands + to be executed. */ +#define CLIENT_TRACKING (1<<31) /* Client enabled keys tracking in order to + perform client side caching. */ /* Client block type (btype field in client structure) * if CLIENT_BLOCKED flag is set. */ @@ -845,6 +850,11 @@ typedef struct client { sds peerid; /* Cached peer ID. */ listNode *client_list_node; /* list node in client list */ + /* If this client is in tracking mode and this field is non zero, + * invalidation messages for keys fetched by this client will be sent to + * the specified client ID. */ + uint64_t client_tracking_redirection; + /* Response buffer */ int bufpos; char buf[PROTO_REPLY_CHUNK_BYTES]; @@ -1286,6 +1296,8 @@ struct redisServer { unsigned int blocked_clients_by_type[BLOCKED_NUM]; list *unblocked_clients; /* list of clients to unblock before next loop */ list *ready_keys; /* List of readyList structures for BLPOP & co */ + /* Client side caching.
*/ + unsigned int tracking_clients; /* # of clients with tracking enabled.*/ /* Sort parameters - qsort_r() is only available under BSD so we * have to take this state global, in order to pass it to sortCompare() */ int sort_desc; @@ -1602,6 +1614,9 @@ void addReplyErrorFormat(client *c, const char *fmt, ...); void addReplyStatusFormat(client *c, const char *fmt, ...); #endif +/* Client side caching (tracking mode) */ +void disableTracking(client *c); + /* List data type */ void listTypeTryConversion(robj *subject, robj *value); void listTypePush(robj *subject, robj *value, int where); From b54789d47252075b0761158c2e120f043eb46a15 Mon Sep 17 00:00:00 2001 From: antirez Date: Sun, 30 Jun 2019 06:19:04 -0400 Subject: [PATCH 19/76] Client side caching: enable tracking mode. --- src/server.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/server.h b/src/server.h index a6c6a4dae..8c97f83f6 100644 --- a/src/server.h +++ b/src/server.h @@ -1615,6 +1615,7 @@ void addReplyStatusFormat(client *c, const char *fmt, ...); #endif /* Client side caching (tracking mode) */ +void enableTracking(client *c, uint64_t redirect_to); void disableTracking(client *c); /* List data type */ From b826cf89f1e2d9fd082f92b9bf71f886bee17168 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 3 Jul 2019 11:58:20 +0200 Subject: [PATCH 20/76] Client side caching: CLIENT TRACKING subcommand. 
--- src/networking.c | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/src/networking.c b/src/networking.c index 44979770c..185060267 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1966,6 +1966,7 @@ void clientCommand(client *c) { "reply (on|off|skip) -- Control the replies sent to the current connection.", "setname -- Assign the name to the current connection.", "unblock [TIMEOUT|ERROR] -- Unblock the specified blocked client.", +"tracking (on|off) [REDIRECT <id>] -- Enable client keys tracking for client side caching.", NULL }; addReplyHelp(c, help); @@ -2122,20 +2123,56 @@ NULL addReply(c,shared.czero); } } else if (!strcasecmp(c->argv[1]->ptr,"setname") && c->argc == 3) { + /* CLIENT SETNAME */ if (clientSetNameOrReply(c,c->argv[2]) == C_OK) addReply(c,shared.ok); } else if (!strcasecmp(c->argv[1]->ptr,"getname") && c->argc == 2) { + /* CLIENT GETNAME */ if (c->name) addReplyBulk(c,c->name); else addReplyNull(c); } else if (!strcasecmp(c->argv[1]->ptr,"pause") && c->argc == 3) { + /* CLIENT PAUSE */ long long duration; - if (getTimeoutFromObjectOrReply(c,c->argv[2],&duration,UNIT_MILLISECONDS) - != C_OK) return; + if (getTimeoutFromObjectOrReply(c,c->argv[2],&duration, + UNIT_MILLISECONDS) != C_OK) return; pauseClients(duration); addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"tracking") && + (c->argc == 3 || c->argc == 5)) + { + /* CLIENT TRACKING (on|off) [REDIRECT <id>] */ + long long redir = 0; + + /* Parse the redirection option: we'll require the client with + * the specified ID to exist right now, even if it is possible + * it will get disconnected later.
*/ + if (c->argc == 5) { + if (strcasecmp(c->argv[3]->ptr,"redirect") != 0) { + addReply(c,shared.syntaxerr); + return; + } else { + if (getLongLongFromObjectOrReply(c,c->argv[4],&redir,NULL) != + C_OK) return; + if (lookupClientByID(redir) == NULL) { + addReplyError(c,"The client ID you want redirect to " + "does not exist"); + return; + } + } + } + + if (!strcasecmp(c->argv[2]->ptr,"on")) { + enableTracking(c,redir); + } else if (!strcasecmp(c->argv[2]->ptr,"off")) { + disableTracking(c); + } else { + addReply(c,shared.syntaxerr); + return; + } + addReply(c,shared.ok); } else { addReplyErrorFormat(c, "Unknown subcommand or wrong number of arguments for '%s'. Try CLIENT HELP", (char*)c->argv[1]->ptr); } From a4e44726d02ea63a9a2baf185ff9133b6b2bc867 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 3 Jul 2019 12:42:16 +0200 Subject: [PATCH 21/76] Client side caching: hook inside call() for tracking. --- src/server.c | 11 +++++++++++ src/server.h | 1 + 2 files changed, 12 insertions(+) diff --git a/src/server.c b/src/server.c index 4b87b6ac2..bb891594b 100644 --- a/src/server.c +++ b/src/server.c @@ -3194,6 +3194,7 @@ void call(client *c, int flags) { latencyAddSampleIfNeeded(latency_event,duration/1000); slowlogPushEntryIfNeeded(c,c->argv,c->argc,duration); } + if (flags & CMD_CALL_STATS) { /* use the real command that was executed (cmd and lastamc) may be * different, in case of MULTI-EXEC or re-written commands such as @@ -3261,6 +3262,16 @@ void call(client *c, int flags) { redisOpArrayFree(&server.also_propagate); } server.also_propagate = prev_also_propagate; + + /* If the client has keys tracking enabled for client side caching, + * make sure to remember the keys it fetched via this command. */ + if (c->cmd->flags & CMD_READONLY) { + client *caller = (c->flags & CLIENT_LUA && server.lua_caller) ? 
+ server.lua_caller : c; + if (caller->flags & CLIENT_TRACKING) + trackingRememberKeys(caller); + } + server.stat_numcommands++; } diff --git a/src/server.h b/src/server.h index 8c97f83f6..022e48304 100644 --- a/src/server.h +++ b/src/server.h @@ -1617,6 +1617,7 @@ void addReplyStatusFormat(client *c, const char *fmt, ...); /* Client side caching (tracking mode) */ void enableTracking(client *c, uint64_t redirect_to); void disableTracking(client *c); +void trackingRememberKeys(client *c); /* List data type */ void listTypeTryConversion(robj *subject, robj *value); From 4b5027845e2b3566579a1b5e34b536df2f1c23ee Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 3 Jul 2019 19:16:20 +0200 Subject: [PATCH 22/76] Client side caching: implement trackingInvalidateKey(). --- src/db.c | 1 + src/debug.c | 2 +- src/expire.c | 1 + src/networking.c | 1 + src/server.h | 5 +- src/tracking.c | 162 +++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 170 insertions(+), 2 deletions(-) create mode 100644 src/tracking.c diff --git a/src/db.c b/src/db.c index b537a29a4..4977873e9 100644 --- a/src/db.c +++ b/src/db.c @@ -399,6 +399,7 @@ int selectDb(client *c, int id) { void signalModifiedKey(redisDb *db, robj *key) { touchWatchedKey(db,key); + if (server.tracking_clients) trackingInvalidateKey(key); } void signalFlushedDb(int dbid) { diff --git a/src/debug.c b/src/debug.c index 0c6b5630c..1f1157d4a 100644 --- a/src/debug.c +++ b/src/debug.c @@ -702,7 +702,7 @@ void _serverAssertPrintClientInfo(const client *c) { bugReportStart(); serverLog(LL_WARNING,"=== ASSERTION FAILED CLIENT CONTEXT ==="); - serverLog(LL_WARNING,"client->flags = %d", c->flags); + serverLog(LL_WARNING,"client->flags = %llu", (unsigned long long)c->flags); serverLog(LL_WARNING,"client->fd = %d", c->fd); serverLog(LL_WARNING,"client->argc = %d", c->argc); for (j=0; j < c->argc; j++) { diff --git a/src/expire.c b/src/expire.c index 0b92ee3fe..b23117a3c 100644 --- a/src/expire.c +++ b/src/expire.c @@ -64,6 
+64,7 @@ int activeExpireCycleTryExpire(redisDb *db, dictEntry *de, long long now) { dbSyncDelete(db,keyobj); notifyKeyspaceEvent(NOTIFY_EXPIRED, "expired",keyobj,db->id); + if (server.tracking_clients) trackingInvalidateKey(keyobj); decrRefCount(keyobj); server.stat_expiredkeys++; return 1; diff --git a/src/networking.c b/src/networking.c index 185060267..716b35859 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1854,6 +1854,7 @@ sds catClientInfoString(sds s, client *client) { if (client->flags & CLIENT_MULTI) *p++ = 'x'; if (client->flags & CLIENT_BLOCKED) *p++ = 'b'; if (client->flags & CLIENT_TRACKING) *p++ = 't'; + if (client->flags & CLIENT_TRACKING_BROKEN_REDIR) *p++ = 'R'; if (client->flags & CLIENT_DIRTY_CAS) *p++ = 'd'; if (client->flags & CLIENT_CLOSE_AFTER_REPLY) *p++ = 'c'; if (client->flags & CLIENT_UNBLOCKED) *p++ = 'u'; diff --git a/src/server.h b/src/server.h index 022e48304..cd6652257 100644 --- a/src/server.h +++ b/src/server.h @@ -295,6 +295,7 @@ typedef long long mstime_t; /* millisecond time type. */ to be executed. */ #define CLIENT_TRACKING (1<<31) /* Client enabled keys tracking in order to perform client side caching. */ +#define CLIENT_TRACKING_BROKEN_REDIR (1ULL<<32) /* Target client is invalid. */ /* Client block type (btype field in client structure) * if CLIENT_BLOCKED flag is set. */ @@ -821,7 +822,7 @@ typedef struct client { time_t ctime; /* Client creation time. */ time_t lastinteraction; /* Time of the last interaction, used for timeout */ time_t obuf_soft_limit_reached_time; - int flags; /* Client flags: CLIENT_* macros. */ + uint64_t flags; /* Client flags: CLIENT_* macros. */ int authenticated; /* Needed when the default user requires auth. */ int replstate; /* Replication state if this is a slave. */ int repl_put_online_on_ack; /* Install slave write handler on ACK. 
*/ @@ -1603,6 +1604,7 @@ void linkClient(client *c); void protectClient(client *c); void unprotectClient(client *c); void initThreadedIO(void); +client *lookupClientByID(uint64_t id); #ifdef __GNUC__ void addReplyErrorFormat(client *c, const char *fmt, ...) @@ -1618,6 +1620,7 @@ void addReplyStatusFormat(client *c, const char *fmt, ...); void enableTracking(client *c, uint64_t redirect_to); void disableTracking(client *c); void trackingRememberKeys(client *c); +void trackingInvalidateKey(robj *keyobj); /* List data type */ void listTypeTryConversion(robj *subject, robj *value); diff --git a/src/tracking.c b/src/tracking.c new file mode 100644 index 000000000..66615ed91 --- /dev/null +++ b/src/tracking.c @@ -0,0 +1,162 @@ +/* tracking.c - Client side caching: keys tracking and invalidation + * + * Copyright (c) 2019, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "server.h" + +/* The tracking table is constituted by 2^24 radix trees (each tree, and the + * table itself, are allocated in a lazy way only when needed) tracking + * clients that may have certain keys in their local, client side, cache. + * + * Keys are grouped into 2^24 slots, in a way similar to Redis Cluster hash + * slots, however here the function we use is crc64, taking the least + * significant 24 bits of the output. + * + * When a client enables tracking with "CLIENT TRACKING on", each key served to + * the client is hashed to one of such slots, and Redis will remember what + * client may have keys about such slot. Later, when a key in a given slot is + * modified, all the clients that may have local copies of keys in that slot + * will receive an invalidation message. There is no distinction of database + * number: a single table is used. + * + * Clients will normally take frequently requested objects in memory, removing + * them when invalidation messages are received. A strategy clients may use is + * to just cache objects in a dictionary, associating to each cached object + * some incremental epoch, or just a timestamp. 
When invalidation messages are + * received clients may store, in a different table, the timestamp (or epoch) + * of the invalidation of such given slot: later when accessing objects, the + * eviction of stale objects may be performed in a lazy way by checking if the + * cached object timestamp is older than the invalidation timestamp for such + * objects. + * + * The output of the 24 bit hash function is very large (more than 16 million + * possible slots), so clients that may want to use less resources may only + * use the most significant bits instead of the full 24 bits. */ +#define TRACKING_TABLE_SIZE (1<<24) +rax **TrackingTable = NULL; + +/* Remove the tracking state from the client 'c'. Note that there is not much + * to do for us here, if not to decrement the counter of the clients in + * tracking mode, because we just store the ID of the client in the tracking + * table, so we'll remove the ID reference in a lazy way. Otherwise when a + * client with many entries in the table is removed, it would cost a lot of + * time to do the cleanup. */ +void disableTracking(client *c) { + if (c->flags & CLIENT_TRACKING) { + server.tracking_clients--; + c->flags &= ~(CLIENT_TRACKING|CLIENT_TRACKING_BROKEN_REDIR); + } +} + +/* Enable the tracking state for the client 'c', and as a side effect allocates + * the tracking table if needed. If the 'redirect_to' argument is non zero, the + * invalidation messages for this client will be sent to the client ID + * specified by the 'redirect_to' argument. Note that if such client will + * eventually get freed, we'll send a message to the original client to + * inform it of the condition. Multiple clients can redirect the invalidation + * messages to the same client ID. 
*/ +void enableTracking(client *c, uint64_t redirect_to) { + if (c->flags & CLIENT_TRACKING) return; + c->flags |= CLIENT_TRACKING; + c->flags &= ~CLIENT_TRACKING_BROKEN_REDIR; + c->client_tracking_redirection = redirect_to; + server.tracking_clients++; + if (TrackingTable == NULL) + TrackingTable = zcalloc(sizeof(rax*) * TRACKING_TABLE_SIZE); +} + +/* This function is called after the excution of a readonly command in the + * case the client 'c' has keys tracking enabled. It will populate the + * tracking ivalidation table according to the keys the user fetched, so that + * Redis will know what are the clients that should receive an invalidation + * message with certain groups of keys are modified. */ +void trackingRememberKeys(client *c) { + int numkeys; + int *keys = getKeysFromCommand(c->cmd,c->argv,c->argc,&numkeys); + if (keys == NULL) return; + + for(int j = 0; j < numkeys; j++) { + int idx = keys[j]; + sds sdskey = c->argv[idx]->ptr; + uint64_t hash = crc64(0, + (unsigned char*)sdskey,sdslen(sdskey))&(TRACKING_TABLE_SIZE-1); + if (TrackingTable[hash] == NULL) + TrackingTable[hash] = raxNew(); + raxTryInsert(TrackingTable[hash], + (unsigned char*)&c->id,sizeof(c->id),NULL,NULL); + } + getKeysFreeResult(keys); +} + +/* This function is called from signalModifiedKey() or other places in Redis + * when a key changes value. In the context of keys tracking, our task here is + * to send a notification to every client that may have keys about such . 
*/ +void trackingInvalidateKey(robj *keyobj) { + sds sdskey = keyobj->ptr; + uint64_t hash = crc64(0, + (unsigned char*)sdskey,sdslen(sdskey))&(TRACKING_TABLE_SIZE-1); + if (TrackingTable == NULL || TrackingTable[hash] == NULL) return; + + raxIterator ri; + raxStart(&ri,TrackingTable[hash]); + raxSeek(&ri,"^",NULL,0); + while(raxNext(&ri)) { + uint64_t id; + memcpy(&id,ri.key,ri.key_len); + client *c = lookupClientByID(id); + if (c->client_tracking_redirection) { + client *redir = lookupClientByID(c->client_tracking_redirection); + if (!redir) { + /* We need to signal to the original connection that we + * are unable to send invalidation messages to the redirected + * connection, because the client no longer exist. */ + if (c->resp > 2) { + addReplyPushLen(c,3); + addReplyBulkCBuffer(c,"tracking-redir-broken",21); + addReplyLongLong(c,c->client_tracking_redirection); + } + continue; + } + c = redir; + } + + /* Only send such info for clients in RESP version 3 or more. */ + if (c->resp > 2) { + addReplyPushLen(c,3); + addReplyBulkCBuffer(c,"invalidate",10); + addReplyBulk(c,keyobj); + } + } + raxStop(&ri); + + /* Free the tracking table: we'll create the radix tree and populate it + * again if more keys will be modified in this hash slot. */ + raxFree(TrackingTable[hash]); + TrackingTable[hash] = NULL; +} From 6aeb2627ded872211c1ccec4e90bc27078aa6b89 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 4 Jul 2019 14:06:44 +0200 Subject: [PATCH 23/76] Client side caching: fix invalidate message len and content. --- src/tracking.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tracking.c b/src/tracking.c index 66615ed91..aade137c4 100644 --- a/src/tracking.c +++ b/src/tracking.c @@ -148,9 +148,9 @@ void trackingInvalidateKey(robj *keyobj) { /* Only send such info for clients in RESP version 3 or more. 
*/ if (c->resp > 2) { - addReplyPushLen(c,3); + addReplyPushLen(c,2); addReplyBulkCBuffer(c,"invalidate",10); - addReplyBulk(c,keyobj); + addReplyLongLong(c,hash); } } raxStop(&ri); From f3a8b5001d3dd97eea84f7ed5aaabcec14802f16 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 5 Jul 2019 12:24:28 +0200 Subject: [PATCH 24/76] Client side caching: RESP2 support. --- src/server.h | 1 + src/tracking.c | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/server.h b/src/server.h index cd6652257..cb70b93ad 100644 --- a/src/server.h +++ b/src/server.h @@ -1946,6 +1946,7 @@ int pubsubUnsubscribeAllPatterns(client *c, int notify); void freePubsubPattern(void *p); int listMatchPubsubPattern(void *a, void *b); int pubsubPublishMessage(robj *channel, robj *message); +void addReplyPubsubMessage(client *c, robj *channel, robj *msg); /* Keyspace events notification */ void notifyKeyspaceEvent(int type, char *event, robj *key, int dbid); diff --git a/src/tracking.c b/src/tracking.c index aade137c4..9d9585c95 100644 --- a/src/tracking.c +++ b/src/tracking.c @@ -60,6 +60,7 @@ * use the most significant bits instead of the full 24 bits. */ #define TRACKING_TABLE_SIZE (1<<24) rax **TrackingTable = NULL; +robj *TrackingChannelName; /* Remove the tracking state from the client 'c'. 
Note that there is not much * to do for us here, if not to decrement the counter of the clients in @@ -87,8 +88,10 @@ void enableTracking(client *c, uint64_t redirect_to) { c->flags &= ~CLIENT_TRACKING_BROKEN_REDIR; c->client_tracking_redirection = redirect_to; server.tracking_clients++; - if (TrackingTable == NULL) + if (TrackingTable == NULL) { TrackingTable = zcalloc(sizeof(rax*) * TRACKING_TABLE_SIZE); + TrackingChannelName = createStringObject("__redis__:invalidate",20); + } } /* This function is called after the excution of a readonly command in the @@ -130,6 +133,7 @@ void trackingInvalidateKey(robj *keyobj) { uint64_t id; memcpy(&id,ri.key,ri.key_len); client *c = lookupClientByID(id); + int using_redirection = 0; if (c->client_tracking_redirection) { client *redir = lookupClientByID(c->client_tracking_redirection); if (!redir) { @@ -144,13 +148,21 @@ void trackingInvalidateKey(robj *keyobj) { continue; } c = redir; + using_redirection = 1; } - /* Only send such info for clients in RESP version 3 or more. */ + /* Only send such info for clients in RESP version 3 or more. However + * if redirection is active, and the connection we redirect to is + * in Pub/Sub mode, we can support the feature with RESP 2 as well, + * by sending Pub/Sub messages in the __redis__:invalidate channel. 
*/ if (c->resp > 2) { addReplyPushLen(c,2); addReplyBulkCBuffer(c,"invalidate",10); addReplyLongLong(c,hash); + } else if (using_redirection && c->flags & CLIENT_PUBSUB) { + robj *msg = createStringObjectFromLongLong(hash); + addReplyPubsubMessage(c,TrackingChannelName,msg); + decrRefCount(msg); } } raxStop(&ri); From 664d7d6e84b590564690c453677ac10362ff2ee7 Mon Sep 17 00:00:00 2001 From: Guy Korland Date: Sun, 7 Jul 2019 18:28:15 +0300 Subject: [PATCH 25/76] fix build tracking.c should be tracking.o thanks to @rafie --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index e608309f8..b6cc69e2f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -164,7 +164,7 @@ endif REDIS_SERVER_NAME=redis-server REDIS_SENTINEL_NAME=redis-sentinel -REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o acl.o gopher.o tracking.c +REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o 
redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o acl.o gopher.o tracking.o REDIS_CLI_NAME=redis-cli REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o anet.o ae.o crc64.o siphash.o crc16.o REDIS_BENCHMARK_NAME=redis-benchmark From 00c9b4f15dea45e3470655e80f71465070bf728d Mon Sep 17 00:00:00 2001 From: Angus Pearson Date: Mon, 8 Jul 2019 11:04:37 +0100 Subject: [PATCH 26/76] Change typeNameCanonicalize -> getObjectTypeName, and other style changes --- src/db.c | 6 +++--- src/server.h | 9 ++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/db.c b/src/db.c index 6557ddc3c..bb53081f6 100644 --- a/src/db.c +++ b/src/db.c @@ -767,7 +767,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long cursor) { /* Filter an element if it isn't the type we want. */ if (!filter && o == NULL && typename){ robj* typecheck = lookupKeyReadWithFlags(c->db, kobj, LOOKUP_NOTOUCH); - char* type = typeNameCanonicalize(typecheck); + char* type = getObjectTypeName(typecheck); if (strcasecmp((char*) typename, type)) filter = 1; } @@ -827,7 +827,7 @@ void lastsaveCommand(client *c) { addReplyLongLong(c,server.lastsave); } -char* typeNameCanonicalize(robj *o) { +char* getObjectTypeName(robj *o) { char* type; if (o == NULL) { type = "none"; @@ -852,7 +852,7 @@ char* typeNameCanonicalize(robj *o) { void typeCommand(client *c) { robj *o; o = lookupKeyReadWithFlags(c->db,c->argv[1],LOOKUP_NOTOUCH); - addReplyStatus(c, typeNameCanonicalize(o)); + addReplyStatus(c, getObjectTypeName(o)); } void shutdownCommand(client *c) { diff --git a/src/server.h b/src/server.h index dc02edb5c..19ef1ac59 100644 --- a/src/server.h +++ b/src/server.h @@ -646,11 +646,10 @@ typedef struct redisObject { void *ptr; } robj; -/* The 'canonical' name for a type as enumerated above is given by the - * below function. 
Native types are checked against the OBJ_STRING, - * OBJ_LIST, OBJ_* defines, and Module types have their registered name - * returned.*/ -char* typeNameCanonicalize(robj*); +/* The a string name for an object's type as listed above + * Native types are checked against the OBJ_STRING, OBJ_LIST, OBJ_* defines, + * and Module types have their registered name returned. */ +char *getObjectTypeName(robj*); /* Macro used to initialize a Redis object allocated on the stack. * Note that this macro is taken near the structure definition to make sure From 271db4274290155aece1b3755fd5ee462f24e44f Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 13 May 2019 17:27:06 +0200 Subject: [PATCH 27/76] Fix test false positive introduced by threaded I/O. Now clients that are ready to be terminated asynchronously are processed more often in beforeSleep() instead of being processed in serverCron(). This means that the test will not be able to catch the moment the client was terminated, also note that the 'omem' figure now changes in big steps, because of the new client output buffers layout. So we have to change the test range in order to accomodate for that. Yet the test is useful enough to be worth taking, even if its precision is reduced by this commit. Probably if we get more problems, a thing that makes sense is just to check that the limit is < 200k. That's more than enough actually. 
Former-commit-id: 8aaa8b0b116dc86473b6a94bf2ff330dd4163ca1 --- tests/unit/obuf-limits.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/obuf-limits.tcl b/tests/unit/obuf-limits.tcl index b205eb31b..c45bf8e86 100644 --- a/tests/unit/obuf-limits.tcl +++ b/tests/unit/obuf-limits.tcl @@ -15,7 +15,7 @@ start_server {tags {"obuf-limits"}} { if {![regexp {omem=([0-9]+)} $c - omem]} break if {$omem > 200000} break } - assert {$omem >= 80000 && $omem < 200000} + assert {$omem >= 70000 && $omem < 200000} $rd1 close } From b2d03d42211ad6922c0e5d486cb266e52d85e51d Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 13 May 2019 17:30:02 +0200 Subject: [PATCH 28/76] Make comment in getClientOutputBufferMemoryUsage() describing the present. Former-commit-id: 35acae360a4c3c67370b03b4835c51b08194ca28 --- src/networking.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 174c21ff5..92e2e42a5 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -2696,15 +2696,8 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { } } -/* This function returns the number of bytes that Redis is virtually +/* This function returns the number of bytes that Redis is * using to store the reply still not read by the client. - * It is "virtual" since the reply output list may contain objects that - * are shared and are not really using additional memory. - * - * The function returns the total sum of the length of all the objects - * stored in the output list, plus the memory used to allocate every - * list node. The static reply buffer is not taken into account since it - * is allocated anyway. * * Note: this function is very fast so can be called as many time as * the caller wishes. 
The main usage of this function currently is From 12325871b474ae6000f9d77081e18bfb80399bee Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 14 May 2019 16:54:59 +0200 Subject: [PATCH 29/76] Test: fix slowlog test false positive. In fast systems "SLOWLOG RESET" is fast enough not to be logged even when the time limit is "1" sometimes. Leading to false positives such as: [err]: SLOWLOG - can be disabled in tests/unit/slowlog.tcl Expected '1' to be equal to '0' Former-commit-id: 8198a697fd4455c88712099f20632e554fb564d4 --- tests/unit/slowlog.tcl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/unit/slowlog.tcl b/tests/unit/slowlog.tcl index dbd7a1547..22f088103 100644 --- a/tests/unit/slowlog.tcl +++ b/tests/unit/slowlog.tcl @@ -80,9 +80,11 @@ start_server {tags {"slowlog"} overrides {slowlog-log-slower-than 1000000}} { } test {SLOWLOG - can be disabled} { + r config set slowlog-max-len 1 r config set slowlog-log-slower-than 1 r slowlog reset - assert_equal [r slowlog len] 1 + r debug sleep 0.2 + assert_equal [r slowlog len] 1 r config set slowlog-log-slower-than -1 r slowlog reset r debug sleep 0.2 From a25866ad3510b77a71077496905bc9c1870e5a62 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Fri, 7 Jun 2019 13:20:22 -0700 Subject: [PATCH 30/76] Fixed some spelling issues in ACL codepath including user facing error Former-commit-id: 50ad880ad55e7761fe2598e09be43947e88740fe --- src/acl.cpp | 22 +++++++++++----------- src/server.cpp | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/acl.cpp b/src/acl.cpp index 259d9fb61..7493300d5 100644 --- a/src/acl.cpp +++ b/src/acl.cpp @@ -295,7 +295,7 @@ int ACLGetCommandBitCoordinates(uint64_t id, uint64_t *word, uint64_t *bit) { * Note that this function does not check the ALLCOMMANDS flag of the user * but just the lowlevel bitmask. 
* - * If the bit overflows the user internal represetation, zero is returned + * If the bit overflows the user internal representation, zero is returned * in order to disallow the execution of the command in such edge case. */ int ACLGetUserCommandBit(user *u, unsigned long id) { uint64_t word, bit; @@ -311,7 +311,7 @@ int ACLUserCanExecuteFutureCommands(user *u) { } /* Set the specified command bit for the specified user to 'value' (0 or 1). - * If the bit overflows the user internal represetation, no operation + * If the bit overflows the user internal representation, no operation * is performed. As a side effect of calling this function with a value of * zero, the user flag ALLCOMMANDS is cleared since it is no longer possible * to skip the command bit explicit test. */ @@ -350,7 +350,7 @@ int ACLSetUserCommandBitsForCategory(user *u, const char *category, int value) { /* Return the number of commands allowed (on) and denied (off) for the user 'u' * in the subset of commands flagged with the specified category name. - * If the categoty name is not valid, C_ERR is returend, otherwise C_OK is + * If the category name is not valid, C_ERR is returned, otherwise C_OK is * returned and on and off are populated by reference. */ int ACLCountCategoryBitsForUser(user *u, unsigned long *on, unsigned long *off, const char *category) @@ -626,7 +626,7 @@ void ACLAddAllowedSubcommand(user *u, unsigned long id, const char *sub) { * It is possible to specify multiple patterns. * allkeys Alias for ~* * resetkeys Flush the list of allowed keys patterns. - * > Add this passowrd to the list of valid password for the user. + * > Add this password to the list of valid password for the user. * For example >mypass will add "mypass" to the list. * This directive clears the "nopass" flag (see later). * < Remove this password from the list of valid passwords. 
@@ -949,9 +949,9 @@ user *ACLGetUserByName(const char *name, size_t namelen) { return (user*)myuser; } -/* Check if the command ready to be excuted in the client 'c', and already - * referenced by c->cmd, can be executed by this client according to the - * ACls associated to the client user c->user. +/* Check if the command is ready to be executed in the client 'c', already + * referenced by c->cmd, and can be executed by this client according to the + * ACLs associated to the client user c->user. * * If the user can execute the command ACL_OK is returned, otherwise * ACL_DENIED_CMD or ACL_DENIED_KEY is returned: the first in case the @@ -1122,7 +1122,7 @@ int ACLLoadConfiguredUsers(void) { } /* This function loads the ACL from the specified filename: every line - * is validated and shold be either empty or in the format used to specify + * is validated and should be either empty or in the format used to specify * users in the redis.conf configuration or in the ACL file, that is: * * user ... rules ... @@ -1172,7 +1172,7 @@ sds ACLLoadFromFile(const char *filename) { * to the real user mentioned in the ACL line. */ user *fakeuser = ACLCreateUnlinkedUser(); - /* We do all the loading in a fresh insteance of the Users radix tree, + /* We do all the loading in a fresh instance of the Users radix tree, * so if there are errors loading the ACL file we can rollback to the * old version. */ rax *old_users = Users; @@ -1248,7 +1248,7 @@ sds ACLLoadFromFile(const char *filename) { } /* Note that the same rules already applied to the fake user, so - * we just assert that everything goess well: it should. */ + * we just assert that everything goes well: it should. 
*/ for (j = 2; j < argc; j++) serverAssert(ACLSetUser(u,argv[j],sdslen(argv[j])) == C_OK); @@ -1611,7 +1611,7 @@ void addReplyCommandCategories(client *c, struct redisCommand *cmd) { setDeferredSetLen(c, flaglen, flagcount); } -/* AUTH +/* AUTH * AUTH (Redis >= 6.0 form) * * When the user is omitted it means that we are trying to authenticate diff --git a/src/server.cpp b/src/server.cpp index 4ee6922d2..c40efc7ac 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -3523,7 +3523,7 @@ int processCommand(client *c, int callFlags) { if (acl_retval == ACL_DENIED_CMD) addReplyErrorFormat(c, "-NOPERM this user has no permissions to run " - "the '%s' command or its subcommnad", c->cmd->name); + "the '%s' command or its subcommand", c->cmd->name); else addReplyErrorFormat(c, "-NOPERM this user has no permissions to access " From 12c8823c936a64f8e718aaf64c6748304714a566 Mon Sep 17 00:00:00 2001 From: Angus Pearson Date: Mon, 10 Jun 2019 17:41:44 +0100 Subject: [PATCH 31/76] Add char* typeNameCanonicalize(robj*) to remove duplicate code between SCAN and TYPE commands, and to keep OBJ_* enum to string canonicalization in one place. Former-commit-id: 3cdc6e8d846e88cf4e250b2643662bde2a9317c5 --- src/db.cpp | 37 +++++++++++-------------------------- src/server.h | 8 +++++++- 2 files changed, 18 insertions(+), 27 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index f605c253b..cc0db9411 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -818,26 +818,8 @@ void scanGenericCommand(client *c, robj_roptr o, unsigned long cursor) { /* Filter an element if it isn't the type we want. 
*/ if (!filter && o == NULL && typename){ - robj* typecheck; - char *type; - typecheck = lookupKeyReadWithFlags(c->db, kobj, LOOKUP_NOTOUCH); - if (typecheck == NULL) { - type = "none"; - } else { - switch(typecheck->type) { - case OBJ_STRING: type = "string"; break; - case OBJ_LIST: type = "list"; break; - case OBJ_SET: type = "set"; break; - case OBJ_ZSET: type = "zset"; break; - case OBJ_HASH: type = "hash"; break; - case OBJ_STREAM: type = "stream"; break; - case OBJ_MODULE: { - moduleValue *mv = typecheck->ptr; - type = mv->type->name; - }; break; - default: type = "unknown"; break; - } - } + robj* typecheck = lookupKeyReadWithFlags(c->db, kobj, LOOKUP_NOTOUCH); + char* type = typeNameCanonicalize(typecheck); if (strcasecmp((char*) typename, type)) filter = 1; } @@ -897,11 +879,9 @@ void lastsaveCommand(client *c) { addReplyLongLong(c,g_pserver->lastsave); } -void typeCommand(client *c) { - const char *type; - - robj_roptr o = lookupKeyReadWithFlags(c->db,c->argv[1],LOOKUP_NOTOUCH); - if (o == nullptr) { +const char* typeNameCanonicalize(robj *o) { + const char* type; + if (o == NULL) { type = "none"; } else { switch(o->type) { @@ -918,7 +898,12 @@ void typeCommand(client *c) { default: type = "unknown"; break; } } - addReplyStatus(c,type); + return type; +} + +void typeCommand(client *c) { + robj_roptr o = lookupKeyReadWithFlags(c->db,c->argv[1],LOOKUP_NOTOUCH); + addReplyStatus(c, typeNameCanonicalize(o)); } void shutdownCommand(client *c) { diff --git a/src/server.h b/src/server.h index d057765f2..4eec2323e 100644 --- a/src/server.h +++ b/src/server.h @@ -721,7 +721,6 @@ typedef struct redisObject { void *m_ptr; } robj; - __attribute__((always_inline)) inline const void *ptrFromObj(robj_roptr &o) { if (o->encoding == OBJ_ENCODING_EMBSTR) @@ -746,6 +745,13 @@ __attribute__((always_inline)) inline char *szFromObj(const robj *o) return (char*)ptrFromObj(o); } + +/* The 'cannonical' name for a type as enumerated above is given by the + * below function. 
Native types are checked against the OBJ_STRING, + * OBJ_LIST, OBJ_* defines, and Module types have their registered name + * returned.*/ +char* typeNameCanonicalize(robj*); + /* Macro used to initialize a Redis object allocated on the stack. * Note that this macro is taken near the structure definition to make sure * we'll update it when the structure is changed, to avoid bugs like From 6e461c3cc3685dbe535da2a22fdd098582a805ba Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 18 Jul 2019 23:35:51 -0400 Subject: [PATCH 32/76] Fix compile errors from merges Former-commit-id: 27a927fe0011536c6539d7c2a79ccfdaf78cee22 --- src/db.cpp | 18 +++++++++--------- src/server.h | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index cc0db9411..044a2bcd5 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -681,7 +681,7 @@ void scanGenericCommand(client *c, robj_roptr o, unsigned long cursor) { listNode *node, *nextnode; long count = 10; sds pat = NULL; - sds typename = NULL; + sds type = NULL; int patlen = 0, use_pattern = 0; dict *ht; @@ -718,9 +718,9 @@ void scanGenericCommand(client *c, robj_roptr o, unsigned long cursor) { use_pattern = !(pat[0] == '*' && patlen == 1); i += 2; - } else if (!strcasecmp(c->argv[i]->ptr, "type") && o == NULL && j >= 2) { + } else if (!strcasecmp(szFromObj(c->argv[i]), "type") && o == nullptr && j >= 2) { /* SCAN for a particular type only applies to the db dict */ - typename = c->argv[i+1]->ptr; + type = szFromObj(c->argv[i+1]); i+= 2; } else { addReply(c,shared.syntaxerr); @@ -817,10 +817,10 @@ void scanGenericCommand(client *c, robj_roptr o, unsigned long cursor) { } /* Filter an element if it isn't the type we want. 
*/ - if (!filter && o == NULL && typename){ - robj* typecheck = lookupKeyReadWithFlags(c->db, kobj, LOOKUP_NOTOUCH); - char* type = typeNameCanonicalize(typecheck); - if (strcasecmp((char*) typename, type)) filter = 1; + if (!filter && o == nullptr && type){ + robj_roptr typecheck = lookupKeyReadWithFlags(c->db, kobj, LOOKUP_NOTOUCH); + const char* type = typeNameCanonicalize(typecheck); + if (strcasecmp((char*) type, type)) filter = 1; } /* Filter element if it is an expired key. */ @@ -879,9 +879,9 @@ void lastsaveCommand(client *c) { addReplyLongLong(c,g_pserver->lastsave); } -const char* typeNameCanonicalize(robj *o) { +const char* typeNameCanonicalize(robj_roptr o) { const char* type; - if (o == NULL) { + if (o == nullptr) { type = "none"; } else { switch(o->type) { diff --git a/src/server.h b/src/server.h index 4eec2323e..f359e9852 100644 --- a/src/server.h +++ b/src/server.h @@ -750,7 +750,7 @@ __attribute__((always_inline)) inline char *szFromObj(const robj *o) * below function. Native types are checked against the OBJ_STRING, * OBJ_LIST, OBJ_* defines, and Module types have their registered name * returned.*/ -char* typeNameCanonicalize(robj*); +const char* typeNameCanonicalize(robj_roptr o); /* Macro used to initialize a Redis object allocated on the stack. * Note that this macro is taken near the structure definition to make sure From 1c8c4a5db223ca47b816390ddbdc58162cc84eb5 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 19 Jul 2019 00:43:23 -0400 Subject: [PATCH 33/76] Fix bad merge in SCAN KEYS command Former-commit-id: c21af6b351328ffbdb1d1e2a7eed44f8f929f8b2 --- src/db.cpp | 4 ++-- tests/unit/scan.tcl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index a71bef5df..114b84297 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -820,8 +820,8 @@ void scanGenericCommand(client *c, robj_roptr o, unsigned long cursor) { /* Filter an element if it isn't the type we want. 
*/ if (!filter && o == nullptr && type){ robj_roptr typecheck = lookupKeyReadWithFlags(c->db, kobj, LOOKUP_NOTOUCH); - const char* type = getObjectTypeName(typecheck); - if (strcasecmp((char*) type, type)) filter = 1; + const char* typeT = getObjectTypeName(typecheck); + if (strcasecmp((char*) type, typeT)) filter = 1; } /* Filter element if it is an expired key. */ diff --git a/tests/unit/scan.tcl b/tests/unit/scan.tcl index 9f9ff4df2..25549c4ac 100644 --- a/tests/unit/scan.tcl +++ b/tests/unit/scan.tcl @@ -69,7 +69,7 @@ start_server {tags {"scan"}} { if {$cur == 0} break } - assert_equal 0 [llength $keys] + assert_equal 0 [llength $keys] "non-strings excluded" # Check strings are included set cur 0 @@ -82,7 +82,7 @@ start_server {tags {"scan"}} { if {$cur == 0} break } - assert_equal 1000 [llength $keys] + assert_equal 1000 [llength $keys] "strings included" # Check all three args work together set cur 0 From 867070eb9995f456e042d80c3c88925b87c0baac Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 18 Jul 2019 18:51:45 +0200 Subject: [PATCH 34/76] RDB: handle encoding errors with rdbExitReportCorruptRDB(). Without such change, the diskless replicas, when loading RDB files from the socket will not abort when a broken RDB file gets loaded. This is potentially unsafe, because right now Redis is not able to guarantee that encoding errors are safe from the POV of memory corruptions (for instance the LZF library may not be safe against untrusted data?) so better to abort when the RDB file we are going to load is corrupted. Instead I/O errors are still returned to the caller without aborting, so that in case of short read the diskless replica can try again. 
Former-commit-id: 47feb2719ca7fd04e7e108ec1af0f777e536bf8a --- src/rdb.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/rdb.cpp b/src/rdb.cpp index d446109fa..73957220d 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -278,8 +278,8 @@ void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags, size_t *lenptr) { v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24); val = (int32_t)v; } else { - val = 0; /* anti-warning */ rdbExitReportCorruptRDB("Unknown RDB integer encoding type %d",enctype); + return nullptr; /* Never reached. */ } if (plain || sds) { char buf[LONG_STR_SIZE], *p; @@ -497,6 +497,7 @@ void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr) { return rdbLoadLzfStringObject(rdb,flags,lenptr); default: rdbExitReportCorruptRDB("Unknown RDB string encoding type %d",len); + return nullptr; /* Never reached. */ } } From ebc50797a071f981aa94201189cf78a7bff6f6af Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 18 Jul 2019 18:59:38 +0200 Subject: [PATCH 35/76] RDB: make sure to abort on LZF encoding error. Former-commit-id: 27fe1658a2019bcd5d880e844bac21ccef8303f2 --- src/rdb.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/rdb.cpp b/src/rdb.cpp index 73957220d..d4d91ff1f 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -382,8 +382,7 @@ void *rdbLoadLzfStringObject(rio *rdb, int flags, size_t *lenptr) { /* Load the compressed representation and uncompress it to target. 
*/ if (rioRead(rdb,c,clen) == 0) goto err; if (lzf_decompress(c,clen,val,len) == 0) { - if (rdbCheckMode) rdbCheckSetError("Invalid LZF compressed string"); - goto err; + rdbExitReportCorruptRDB("Invalid LZF compressed string"); } zfree(c); From 5404d6f6bb3be409a484835529de615e4eba128a Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 19 Jul 2019 01:31:10 -0400 Subject: [PATCH 36/76] Modules must have execute permissions to load Former-commit-id: a4efcd35af52227a22daf7f882e8e14db3f8bf57 --- src/module.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/module.cpp b/src/module.cpp index 7d825df27..d71301cd6 100644 --- a/src/module.cpp +++ b/src/module.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #define REDISMODULE_CORE 1 #include "redismodule.h" @@ -5226,6 +5227,15 @@ int moduleLoad(const char *path, void **module_argv, int module_argc) { int (*onload)(void *, void **, int); void *handle; RedisModuleCtx ctx = REDISMODULE_CTX_INIT; + + struct stat st; + if (stat(path, &st) == 0) + { // this check is best effort + if (!(st.st_mode & S_IEXEC)) { + serverLog(LL_WARNING, "Module %s failed to load: It does not have execute permissions.", path); + return C_ERR; + } + } handle = dlopen(path,RTLD_NOW|RTLD_LOCAL); if (handle == NULL) { From 8d526d93547909693b58e952545e188afb813fa0 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 19 Jul 2019 01:42:05 -0400 Subject: [PATCH 37/76] Any +x bit is acceptable Former-commit-id: fc58516cca72fc9db97bc4c388f9fa692d115df4 --- src/module.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/module.cpp b/src/module.cpp index d71301cd6..b86b29d13 100644 --- a/src/module.cpp +++ b/src/module.cpp @@ -5231,7 +5231,7 @@ int moduleLoad(const char *path, void **module_argv, int module_argc) { struct stat st; if (stat(path, &st) == 0) { // this check is best effort - if (!(st.st_mode & S_IEXEC)) { + if (!(st.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) { serverLog(LL_WARNING, "Module %s 
failed to load: It does not have execute permissions.", path); return C_ERR; } From f094402c337e9e4fa8b517a33ce4f4ee4199bfbf Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 19 Jul 2019 15:26:17 -0400 Subject: [PATCH 38/76] Any +x bit is acceptable Former-commit-id: 156e596f9c7a922bc3361652b74b78bbeab0f2dc --- src/module.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/module.cpp b/src/module.cpp index b86b29d13..ee31cf7a5 100644 --- a/src/module.cpp +++ b/src/module.cpp @@ -5231,7 +5231,7 @@ int moduleLoad(const char *path, void **module_argv, int module_argc) { struct stat st; if (stat(path, &st) == 0) { // this check is best effort - if (!(st.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) { + if (!(st.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) { serverLog(LL_WARNING, "Module %s failed to load: It does not have execute permissions.", path); return C_ERR; } From a060bc79427f544d4748222c45beece1c0370f39 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 5 Jul 2019 23:49:09 -0400 Subject: [PATCH 39/76] New expire datastructure and algorithm. 
Allows us to expire in sublinear time Former-commit-id: 3880d2616c882e19169180dc10268564347b0279 --- .vscode/settings.json | 3 +- src/bio.cpp | 4 +- src/db.cpp | 156 +++++++++++++++++++------- src/debug.cpp | 11 +- src/defrag.cpp | 21 ++-- src/evict.cpp | 255 +++++++++++++++++++++++------------------- src/expire.cpp | 141 +++++++++-------------- src/lazyfree.cpp | 27 +++-- src/module.cpp | 4 +- src/object.cpp | 47 +++++--- src/rdb.cpp | 58 ++++++---- src/scripting.cpp | 2 +- src/server.cpp | 14 +-- src/server.h | 57 +++++++++- src/slowlog.cpp | 2 +- src/t_string.cpp | 2 +- 16 files changed, 479 insertions(+), 325 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 56bf76d11..e4d7c4c9a 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -51,6 +51,7 @@ "tuple": "cpp", "type_traits": "cpp", "typeinfo": "cpp", - "utility": "cpp" + "utility": "cpp", + "set": "cpp" } } diff --git a/src/bio.cpp b/src/bio.cpp index 62f6615a6..844464e77 100644 --- a/src/bio.cpp +++ b/src/bio.cpp @@ -85,7 +85,7 @@ struct bio_job { void *bioProcessBackgroundJobs(void *arg); void lazyfreeFreeObjectFromBioThread(robj *o); -void lazyfreeFreeDatabaseFromBioThread(dict *ht1, dict *ht2); +void lazyfreeFreeDatabaseFromBioThread(dict *ht1, semiorderedset *set); void lazyfreeFreeSlotsMapFromBioThread(rax *rt); /* Make sure we have enough stack to perform all the things we do in the @@ -196,7 +196,7 @@ void *bioProcessBackgroundJobs(void *arg) { if (job->arg1) lazyfreeFreeObjectFromBioThread((robj*)job->arg1); else if (job->arg2 && job->arg3) - lazyfreeFreeDatabaseFromBioThread((dict*)job->arg2,(dict*)job->arg3); + lazyfreeFreeDatabaseFromBioThread((dict*)job->arg2,(semiorderedset*)job->arg3); else if (job->arg3) lazyfreeFreeSlotsMapFromBioThread((rax*)job->arg3); } else { diff --git a/src/db.cpp b/src/db.cpp index 114b84297..d1a687712 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -39,6 +39,8 @@ 
*----------------------------------------------------------------------------*/ int keyIsExpired(redisDb *db, robj *key); +int expireIfNeeded(redisDb *db, robj *key, robj *o); +void dbOverwriteCore(redisDb *db, dictEntry *de, robj *key, robj *val, bool fUpdateMvcc, bool fRemoveExpire); /* Update LFU when an object is accessed. * Firstly, decrement the counter if the decrement time is reached. @@ -49,6 +51,20 @@ void updateLFU(robj *val) { val->lru = (LFUGetTimeInMinutes()<<8) | counter; } +void updateExpire(redisDb *db, sds key, robj *valOld, robj *valNew) +{ + serverAssert(valOld->FExpires()); + serverAssert(!valNew->FExpires()); + + auto itr = db->setexpire->find(key); + serverAssert(itr != db->setexpire->end()); + + valNew->SetFExpires(true); + valOld->SetFExpires(false); + return; +} + + /* Low level key lookup API, not actually called directly from commands * implementations that should instead rely on lookupKeyRead(), * lookupKeyWrite() and lookupKeyReadWithFlags(). */ @@ -160,8 +176,10 @@ robj_roptr lookupKeyRead(redisDb *db, robj *key) { * Returns the linked value object if the key exists or NULL if the key * does not exist in the specified DB. 
*/ robj *lookupKeyWrite(redisDb *db, robj *key) { - expireIfNeeded(db,key); - return lookupKey(db,key,LOOKUP_UPDATEMVCC); + robj *o = lookupKey(db,key,LOOKUP_UPDATEMVCC); + if (expireIfNeeded(db,key)) + o = NULL; + return o; } robj_roptr lookupKeyReadOrReply(client *c, robj *key, robj *reply) { @@ -177,6 +195,7 @@ robj *lookupKeyWriteOrReply(client *c, robj *key, robj *reply) { } int dbAddCore(redisDb *db, robj *key, robj *val) { + serverAssert(!val->FExpires()); sds copy = sdsdup(szFromObj(key)); int retval = dictAdd(db->pdict, copy, val); val->mvcc_tstamp = key->mvcc_tstamp = getMvccTstamp(); @@ -206,10 +225,18 @@ void dbAdd(redisDb *db, robj *key, robj *val) serverAssertWithInfo(NULL,key,retval == DICT_OK); } -void dbOverwriteCore(redisDb *db, dictEntry *de, robj *val, bool fUpdateMvcc) +void dbOverwriteCore(redisDb *db, dictEntry *de, robj *key, robj *val, bool fUpdateMvcc, bool fRemoveExpire) { dictEntry auxentry = *de; robj *old = (robj*)dictGetVal(de); + + if (old->FExpires()) { + if (fRemoveExpire) + removeExpire(db, key); + else + updateExpire(db, (sds)dictGetKey(de), old, val); + } + if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LFU) { val->lru = old->lru; } @@ -235,7 +262,7 @@ void dbOverwrite(redisDb *db, robj *key, robj *val) { dictEntry *de = dictFind(db->pdict,ptrFromObj(key)); serverAssertWithInfo(NULL,key,de != NULL); - dbOverwriteCore(db, de, val, true); + dbOverwriteCore(db, de, key, val, true, false); } /* Insert a key, handling duplicate keys according to fReplace */ @@ -250,7 +277,7 @@ int dbMerge(redisDb *db, robj *key, robj *val, int fReplace) robj *old = (robj*)dictGetVal(de); if (old->mvcc_tstamp <= val->mvcc_tstamp) { - dbOverwriteCore(db, de, val, false); + dbOverwriteCore(db, de, key, val, false, true); return true; } @@ -271,13 +298,13 @@ int dbMerge(redisDb *db, robj *key, robj *val, int fReplace) * * All the new keys in the database should be created via this interface. 
*/ void setKey(redisDb *db, robj *key, robj *val) { - if (lookupKeyWrite(db,key) == NULL) { + dictEntry *de = dictFind(db->pdict, ptrFromObj(key)); + if (de == NULL) { dbAdd(db,key,val); } else { - dbOverwrite(db,key,val); + dbOverwriteCore(db,de,key,val,true,true); } incrRefCount(val); - removeExpire(db,key); signalModifiedKey(db,key); } @@ -292,7 +319,7 @@ int dbExists(redisDb *db, robj *key) { robj *dbRandomKey(redisDb *db) { dictEntry *de; int maxtries = 100; - int allvolatile = dictSize(db->pdict) == dictSize(db->expires); + int allvolatile = dictSize(db->pdict) == db->setexpire->size(); while(1) { sds key; @@ -303,23 +330,30 @@ robj *dbRandomKey(redisDb *db) { key = (sds)dictGetKey(de); keyobj = createStringObject(key,sdslen(key)); - if (dictFind(db->expires,key)) { + + if (((robj*)dictGetVal(de))->FExpires()) + { if (allvolatile && listLength(g_pserver->masters) && --maxtries == 0) { /* If the DB is composed only of keys with an expire set, - * it could happen that all the keys are already logically - * expired in the slave, so the function cannot stop because - * expireIfNeeded() is false, nor it can stop because - * dictGetRandomKey() returns NULL (there are keys to return). - * To prevent the infinite loop we do some tries, but if there - * are the conditions for an infinite loop, eventually we - * return a key name that may be already expired. */ + * it could happen that all the keys are already logically + * expired in the slave, so the function cannot stop because + * expireIfNeeded() is false, nor it can stop because + * dictGetRandomKey() returns NULL (there are keys to return). + * To prevent the infinite loop we do some tries, but if there + * are the conditions for an infinite loop, eventually we + * return a key name that may be already expired. */ return keyobj; } - if (expireIfNeeded(db,keyobj)) { + } + + if (((robj*)dictGetVal(de))->FExpires()) + { + if (expireIfNeeded(db,keyobj)) { decrRefCount(keyobj); continue; /* search for another key. 
This expired. */ - } + } } + return keyobj; } } @@ -328,7 +362,10 @@ robj *dbRandomKey(redisDb *db) { int dbSyncDelete(redisDb *db, robj *key) { /* Deleting an entry from the expires dict will not free the sds of * the key, because it is shared with the main dictionary. */ - if (dictSize(db->expires) > 0) dictDelete(db->expires,ptrFromObj(key)); + + dictEntry *de = dictFind(db->pdict, szFromObj(key)); + if (de != nullptr && ((robj*)dictGetVal(de))->FExpires()) + removeExpireCore(db, key, de); if (dictDelete(db->pdict,ptrFromObj(key)) == DICT_OK) { if (g_pserver->cluster_enabled) slotToKeyDel(key); return 1; @@ -373,7 +410,7 @@ int dbDelete(redisDb *db, robj *key) { */ robj *dbUnshareStringValue(redisDb *db, robj *key, robj *o) { serverAssert(o->type == OBJ_STRING); - if (o->refcount != 1 || o->encoding != OBJ_ENCODING_RAW) { + if (o->getrefcount(std::memory_order_relaxed) != 1 || o->encoding != OBJ_ENCODING_RAW) { robj *decoded = getDecodedObject(o); o = createRawStringObject(szFromObj(decoded), sdslen(szFromObj(decoded))); decrRefCount(decoded); @@ -419,7 +456,8 @@ long long emptyDb(int dbnum, int flags, void(callback)(void*)) { emptyDbAsync(&g_pserver->db[j]); } else { dictEmpty(g_pserver->db[j].pdict,callback); - dictEmpty(g_pserver->db[j].expires,callback); + delete g_pserver->db[j].setexpire; + g_pserver->db[j].setexpire = new (MALLOC_LOCAL) semiorderedset(); } } if (g_pserver->cluster_enabled) { @@ -964,9 +1002,10 @@ void renameGenericCommand(client *c, int nx) { * with the same name. 
*/ dbDelete(c->db,c->argv[2]); } - dbAdd(c->db,c->argv[2],o); - if (expire != -1) setExpire(c,c->db,c->argv[2],expire); dbDelete(c->db,c->argv[1]); + dbAdd(c->db,c->argv[2],o); + if (expire != -1) + setExpire(c,c->db,c->argv[2],expire); signalModifiedKey(c->db,c->argv[1]); signalModifiedKey(c->db,c->argv[2]); notifyKeyspaceEvent(NOTIFY_GENERIC,"rename_from", @@ -1024,6 +1063,12 @@ void moveCommand(client *c) { return; } expire = getExpire(c->db,c->argv[1]); + if (o->FExpires()) + removeExpire(c->db,c->argv[1]); + serverAssert(!o->FExpires()); + incrRefCount(o); + dbDelete(src,c->argv[1]); + g_pserver->dirty++; /* Return zero if the key already exists in the target DB */ if (lookupKeyWrite(dst,c->argv[1]) != NULL) { @@ -1032,11 +1077,7 @@ void moveCommand(client *c) { } dbAdd(dst,c->argv[1],o); if (expire != -1) setExpire(c,dst,c->argv[1],expire); - incrRefCount(o); - /* OK! key moved, free the entry in the source DB */ - dbDelete(src,c->argv[1]); - g_pserver->dirty++; addReply(c,shared.cone); } @@ -1077,11 +1118,11 @@ int dbSwapDatabases(int id1, int id2) { * ready_keys and watched_keys, since we want clients to * remain in the same DB they were. */ db1->pdict = db2->pdict; - db1->expires = db2->expires; + db1->setexpire = db2->setexpire; db1->avg_ttl = db2->avg_ttl; db2->pdict = aux.pdict; - db2->expires = aux.expires; + db2->setexpire = aux.setexpire; db2->avg_ttl = aux.avg_ttl; /* Now we need to handle clients blocked on lists: as an effect @@ -1130,12 +1171,25 @@ void swapdbCommand(client *c) { /*----------------------------------------------------------------------------- * Expires API *----------------------------------------------------------------------------*/ - int removeExpire(redisDb *db, robj *key) { + dictEntry *de = dictFind(db->pdict,ptrFromObj(key)); + return removeExpireCore(db, key, de); +} +int removeExpireCore(redisDb *db, robj *key, dictEntry *de) { /* An expire may only be removed if there is a corresponding entry in the * main dict. 
Otherwise, the key will never be freed. */ - serverAssertWithInfo(NULL,key,dictFind(db->pdict,ptrFromObj(key)) != NULL); - return dictDelete(db->expires,ptrFromObj(key)) == DICT_OK; + serverAssertWithInfo(NULL,key,de != NULL); + + robj *val = (robj*)dictGetVal(de); + if (!val->FExpires()) + return 0; + + auto itr = db->setexpire->find((sds)dictGetKey(de)); + serverAssert(itr != db->setexpire->end()); + serverAssert(itr->key() == (sds)dictGetKey(de)); + db->setexpire->erase(itr); + val->SetFExpires(false); + return 1; } /* Set an expire to the specified key. If the expire is set in the context @@ -1143,14 +1197,27 @@ int removeExpire(redisDb *db, robj *key) { * to NULL. The 'when' parameter is the absolute unix time in milliseconds * after which the key will no longer be considered valid. */ void setExpire(client *c, redisDb *db, robj *key, long long when) { - dictEntry *kde, *de; + dictEntry *kde; + serverAssert(GlobalLocksAcquired()); /* Reuse the sds from the main dict in the expire dict */ kde = dictFind(db->pdict,ptrFromObj(key)); serverAssertWithInfo(NULL,key,kde != NULL); - de = dictAddOrFind(db->expires,dictGetKey(kde)); - dictSetSignedIntegerVal(de,when); + + if (((robj*)dictGetVal(kde))->getrefcount(std::memory_order_relaxed) == OBJ_SHARED_REFCOUNT) + { + // shared objects cannot have the expire bit set, create a real object + dictSetVal(db->pdict, kde, dupStringObject((robj*)dictGetVal(kde))); + } + + if (((robj*)dictGetVal(kde))->FExpires()) + removeExpire(db, key); // should we optimize for when this is called with an already set expiry? + + expireEntry e((sds)dictGetKey(kde), when); + ((robj*)dictGetVal(kde))->SetFExpires(true); + + db->setexpire->insert(e); int writable_slave = listLength(g_pserver->masters) && g_pserver->repl_slave_ro == 0; if (c && writable_slave && !(c->flags & CLIENT_MASTER)) @@ -1163,13 +1230,18 @@ long long getExpire(redisDb *db, robj_roptr key) { dictEntry *de; /* No expire? 
return ASAP */ - if (dictSize(db->expires) == 0 || - (de = dictFind(db->expires,ptrFromObj(key))) == NULL) return -1; + if (db->setexpire->size() == 0) + return -1; - /* The entry was found in the expire dict, this means it should also - * be present in the main dict (safety check). */ - serverAssertWithInfo(NULL,key,dictFind(db->pdict,ptrFromObj(key)) != NULL); - return dictGetSignedIntegerVal(de); + de = dictFind(db->pdict, ptrFromObj(key)); + if (de == NULL) + return -1; + robj *obj = (robj*)dictGetVal(de); + if (!obj->FExpires()) + return -1; + + auto itr = db->setexpire->find((sds)dictGetKey(de)); + return itr->when(); } /* Propagate expires into slaves and the AOF file. diff --git a/src/debug.cpp b/src/debug.cpp index 3485df967..c02eba225 100644 --- a/src/debug.cpp +++ b/src/debug.cpp @@ -436,7 +436,7 @@ NULL "Value at:%p refcount:%d " "encoding:%s serializedlength:%zu " "lru:%d lru_seconds_idle:%llu%s", - (void*)val, static_cast(val->refcount), + (void*)val, static_cast(val->getrefcount(std::memory_order_relaxed)), strenc, rdbSavedObjectLen(val), val->lru, estimateObjectIdleTime(val)/1000, extra); } else if (!strcasecmp(szFromObj(c->argv[1]),"sdslen") && c->argc == 3) { @@ -639,8 +639,9 @@ NULL stats = sdscat(stats,buf); stats = sdscatprintf(stats,"[Expires HT]\n"); - dictGetStats(buf,sizeof(buf),g_pserver->db[dbid].expires); - stats = sdscat(stats,buf); + // TODO! 
+ //dictGetStats(buf,sizeof(buf),server.db[dbid].expires); + //stats = sdscat(stats,buf); addReplyBulkSds(c,stats); } else if (!strcasecmp(szFromObj(c->argv[1]),"htstats-key") && c->argc == 3) { @@ -721,14 +722,14 @@ void _serverAssertPrintClientInfo(const client *c) { arg = buf; } serverLog(LL_WARNING,"client->argv[%d] = \"%s\" (refcount: %d)", - j, arg, static_cast(c->argv[j]->refcount)); + j, arg, static_cast(c->argv[j]->getrefcount(std::memory_order_relaxed))); } } void serverLogObjectDebugInfo(robj_roptr o) { serverLog(LL_WARNING,"Object type: %d", o->type); serverLog(LL_WARNING,"Object encoding: %d", o->encoding); - serverLog(LL_WARNING,"Object refcount: %d", static_cast(o->refcount)); + serverLog(LL_WARNING,"Object refcount: %d", static_cast(o->getrefcount(std::memory_order_relaxed))); if (o->type == OBJ_STRING && sdsEncodedObject(o)) { serverLog(LL_WARNING,"Object raw string len: %zu", sdslen(szFromObj(o))); if (sdslen(szFromObj(o)) < 4096) { diff --git a/src/defrag.cpp b/src/defrag.cpp index 2e9abd290..b11564c4b 100644 --- a/src/defrag.cpp +++ b/src/defrag.cpp @@ -48,6 +48,7 @@ extern "C" int je_get_defrag_hint(void* ptr, int *bin_util, int *run_util); /* forward declarations*/ void defragDictBucketCallback(void *privdata, dictEntry **bucketref); dictEntry* replaceSateliteDictKeyPtrAndOrDefragDictEntry(dict *d, sds oldkey, sds newkey, uint64_t hash, long *defragged); +void replaceSateliteOSetKeyPtr(semiorderedset &set, sds oldkey, sds newkey); /* Defrag helper for generic allocations. * @@ -102,7 +103,7 @@ sds activeDefragSds(sds sdsptr) { * and should NOT be accessed. */ robj *activeDefragStringOb(robj* ob, long *defragged) { robj *ret = NULL; - if (ob->refcount!=1) + if (ob->getrefcount(std::memory_order_relaxed)!=1) return NULL; /* try to defrag robj (only if not an EMBSTR type (handled below). 
*/ @@ -406,6 +407,16 @@ dictEntry* replaceSateliteDictKeyPtrAndOrDefragDictEntry(dict *d, sds oldkey, sd return NULL; } +void replaceSateliteOSetKeyPtr(semiorderedset &set, sds oldkey, sds newkey) { + auto itr = set.find(oldkey); + if (itr != set.end()) + { + expireEntry eNew(newkey, itr->when()); + set.erase(itr); + set.insert(eNew); + } +} + long activeDefragQuickListNodes(quicklist *ql) { quicklistNode *node = ql->head, *newnode; long defragged = 0; @@ -769,12 +780,8 @@ long defragKey(redisDb *db, dictEntry *de) { newsds = activeDefragSds(keysds); if (newsds) defragged++, de->key = newsds; - if (dictSize(db->expires)) { - /* Dirty code: - * I can't search in db->expires for that key after i already released - * the pointer it holds it won't be able to do the string compare */ - uint64_t hash = dictGetHash(db->pdict, de->key); - replaceSateliteDictKeyPtrAndOrDefragDictEntry(db->expires, keysds, newsds, hash, &defragged); + if (!db->setexpire->empty()) { + replaceSateliteOSetKeyPtr(*db->setexpire, keysds, newsds); } /* Try to defrag robj and / or string value. */ diff --git a/src/evict.cpp b/src/evict.cpp index 4be6bf761..4acdb5ad0 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -150,6 +150,84 @@ void evictionPoolAlloc(void) { EvictionPoolLRU = ep; } +void processEvictionCandidate(int dbid, sds key, robj *o, const expireEntry *e, struct evictionPoolEntry *pool) +{ + unsigned long long idle; + + /* Calculate the idle time according to the policy. This is called + * idle just because the code initially handled LRU, but is in fact + * just a score where an higher score means better candidate. */ + if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LRU) { + idle = (o != nullptr) ? estimateObjectIdleTime(o) : 0; + } else if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LFU) { + /* When we use an LRU policy, we sort the keys by idle time + * so that we expire keys starting from greater idle time. 
+ * However when the policy is an LFU one, we have a frequency + * estimation, and we want to evict keys with lower frequency + * first. So inside the pool we put objects using the inverted + * frequency subtracting the actual frequency to the maximum + * frequency of 255. */ + idle = 255-LFUDecrAndReturn(o); + } else if (g_pserver->maxmemory_policy == MAXMEMORY_VOLATILE_TTL) { + /* In this case the sooner the expire the better. */ + idle = ULLONG_MAX - e->when(); + } else { + serverPanic("Unknown eviction policy in evictionPoolPopulate()"); + } + + /* Insert the element inside the pool. + * First, find the first empty bucket or the first populated + * bucket that has an idle time smaller than our idle time. */ + int k = 0; + while (k < EVPOOL_SIZE && + pool[k].key && + pool[k].idle < idle) k++; + if (k == 0 && pool[EVPOOL_SIZE-1].key != NULL) { + /* Can't insert if the element is < the worst element we have + * and there are no empty buckets. */ + return; + } else if (k < EVPOOL_SIZE && pool[k].key == NULL) { + /* Inserting into empty position. No setup needed before insert. */ + } else { + /* Inserting in the middle. Now k points to the first element + * greater than the element to insert. */ + if (pool[EVPOOL_SIZE-1].key == NULL) { + /* Free space on the right? Insert at k shifting + * all the elements from k to end to the right. */ + + /* Save SDS before overwriting. */ + sds cached = pool[EVPOOL_SIZE-1].cached; + memmove(pool+k+1,pool+k, + sizeof(pool[0])*(EVPOOL_SIZE-k-1)); + pool[k].cached = cached; + } else { + /* No free space on right? Insert at k-1 */ + k--; + /* Shift all elements on the left of k (included) to the + * left, so we discard the element with smaller idle time. */ + sds cached = pool[0].cached; /* Save SDS before overwriting. 
*/ + if (pool[0].key != pool[0].cached) sdsfree(pool[0].key); + memmove(pool,pool+1,sizeof(pool[0])*k); + pool[k].cached = cached; + } + } + + /* Try to reuse the cached SDS string allocated in the pool entry, + * because allocating and deallocating this object is costly + * (according to the profiler, not my fantasy. Remember: + * premature optimizbla bla bla bla. */ + int klen = sdslen(key); + if (klen > EVPOOL_CACHED_SDS_SIZE) { + pool[k].key = sdsdup(key); + } else { + memcpy(pool[k].cached,key,klen+1); + sdssetlen(pool[k].cached,klen); + pool[k].key = pool[k].cached; + } + pool[k].idle = idle; + pool[k].dbid = dbid; +} + /* This is an helper function for freeMemoryIfNeeded(), it is used in order * to populate the evictionPool with a few entries every time we want to * expire a key. Keys with idle time smaller than one of the current @@ -159,100 +237,36 @@ void evictionPoolAlloc(void) { * idle time are on the left, and keys with the higher idle time on the * right. */ -void evictionPoolPopulate(int dbid, dict *sampledict, dict *keydict, struct evictionPoolEntry *pool) { - int j, k, count; - dictEntry **samples = (dictEntry**)alloca(g_pserver->maxmemory_samples * sizeof(dictEntry*)); +struct visitFunctor +{ + int dbid; + dict *dbdict; + struct evictionPoolEntry *pool; + int count; - count = dictGetSomeKeys(sampledict,samples,g_pserver->maxmemory_samples); - for (j = 0; j < count; j++) { - unsigned long long idle; - sds key; - robj *o = nullptr; - dictEntry *de; - - de = samples[j]; - key = (sds)dictGetKey(de); - - /* If the dictionary we are sampling from is not the main - * dictionary (but the expires one) we need to lookup the key - * again in the key dictionary to obtain the value object. 
*/ - if (g_pserver->maxmemory_policy != MAXMEMORY_VOLATILE_TTL) { - if (sampledict != keydict) de = dictFind(keydict, key); - o = (robj*)dictGetVal(de); + bool operator()(const expireEntry &e) + { + dictEntry *de = dictFind(dbdict, e.key()); + processEvictionCandidate(dbid, (sds)dictGetKey(de), (robj*)dictGetVal(de), &e, pool); + ++count; + return count < g_pserver->maxmemory_samples; + } +}; +void evictionPoolPopulate(int dbid, dict *dbdict, semiorderedset *setexpire, struct evictionPoolEntry *pool) +{ + if (setexpire != nullptr) + { + visitFunctor visitor { dbid, dbdict, pool, 0 }; + setexpire->random_visit(visitor); + } + else + { + dictEntry **samples = (dictEntry**)alloca(g_pserver->maxmemory_samples * sizeof(dictEntry*)); + int count = dictGetSomeKeys(dbdict,samples,g_pserver->maxmemory_samples); + for (int j = 0; j < count; j++) { + robj *o = (robj*)dictGetVal(samples[j]); + processEvictionCandidate(dbid, (sds)dictGetKey(samples[j]), o, nullptr, pool); } - - /* Calculate the idle time according to the policy. This is called - * idle just because the code initially handled LRU, but is in fact - * just a score where an higher score means better candidate. */ - if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LRU) { - idle = (o != nullptr) ? estimateObjectIdleTime(o) : 0; - } else if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LFU) { - /* When we use an LRU policy, we sort the keys by idle time - * so that we expire keys starting from greater idle time. - * However when the policy is an LFU one, we have a frequency - * estimation, and we want to evict keys with lower frequency - * first. So inside the pool we put objects using the inverted - * frequency subtracting the actual frequency to the maximum - * frequency of 255. */ - idle = 255-LFUDecrAndReturn(o); - } else if (g_pserver->maxmemory_policy == MAXMEMORY_VOLATILE_TTL) { - /* In this case the sooner the expire the better. 
*/ - idle = ULLONG_MAX - (long)dictGetVal(de); - } else { - serverPanic("Unknown eviction policy in evictionPoolPopulate()"); - } - - /* Insert the element inside the pool. - * First, find the first empty bucket or the first populated - * bucket that has an idle time smaller than our idle time. */ - k = 0; - while (k < EVPOOL_SIZE && - pool[k].key && - pool[k].idle < idle) k++; - if (k == 0 && pool[EVPOOL_SIZE-1].key != NULL) { - /* Can't insert if the element is < the worst element we have - * and there are no empty buckets. */ - continue; - } else if (k < EVPOOL_SIZE && pool[k].key == NULL) { - /* Inserting into empty position. No setup needed before insert. */ - } else { - /* Inserting in the middle. Now k points to the first element - * greater than the element to insert. */ - if (pool[EVPOOL_SIZE-1].key == NULL) { - /* Free space on the right? Insert at k shifting - * all the elements from k to end to the right. */ - - /* Save SDS before overwriting. */ - sds cached = pool[EVPOOL_SIZE-1].cached; - memmove(pool+k+1,pool+k, - sizeof(pool[0])*(EVPOOL_SIZE-k-1)); - pool[k].cached = cached; - } else { - /* No free space on right? Insert at k-1 */ - k--; - /* Shift all elements on the left of k (included) to the - * left, so we discard the element with smaller idle time. */ - sds cached = pool[0].cached; /* Save SDS before overwriting. */ - if (pool[0].key != pool[0].cached) sdsfree(pool[0].key); - memmove(pool,pool+1,sizeof(pool[0])*k); - pool[k].cached = cached; - } - } - - /* Try to reuse the cached SDS string allocated in the pool entry, - * because allocating and deallocating this object is costly - * (according to the profiler, not my fantasy. Remember: - * premature optimizbla bla bla bla. 
*/ - int klen = sdslen(key); - if (klen > EVPOOL_CACHED_SDS_SIZE) { - pool[k].key = sdsdup(key); - } else { - memcpy(pool[k].cached,key,klen+1); - sdssetlen(pool[k].cached,klen); - pool[k].key = pool[k].cached; - } - pool[k].idle = idle; - pool[k].dbid = dbid; } } @@ -474,8 +488,6 @@ int freeMemoryIfNeeded(void) { sds bestkey = NULL; int bestdbid; redisDb *db; - dict *dict; - dictEntry *de; if (g_pserver->maxmemory_policy & (MAXMEMORY_FLAG_LRU|MAXMEMORY_FLAG_LFU) || g_pserver->maxmemory_policy == MAXMEMORY_VOLATILE_TTL) @@ -490,10 +502,18 @@ int freeMemoryIfNeeded(void) { * every DB. */ for (i = 0; i < cserver.dbnum; i++) { db = g_pserver->db+i; - dict = (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) ? - db->pdict : db->expires; - if ((keys = dictSize(dict)) != 0) { - evictionPoolPopulate(i, dict, db->pdict, pool); + if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) + { + if ((keys = dictSize(db->pdict)) != 0) { + evictionPoolPopulate(i, db->pdict, nullptr, pool); + total_keys += keys; + } + } + else + { + keys = db->setexpire->size(); + if (keys != 0) + evictionPoolPopulate(i, db->pdict, db->setexpire, pool); total_keys += keys; } } @@ -503,14 +523,11 @@ int freeMemoryIfNeeded(void) { for (k = EVPOOL_SIZE-1; k >= 0; k--) { if (pool[k].key == NULL) continue; bestdbid = pool[k].dbid; + sds key = nullptr; - if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) { - de = dictFind(g_pserver->db[pool[k].dbid].pdict, - pool[k].key); - } else { - de = dictFind(g_pserver->db[pool[k].dbid].expires, - pool[k].key); - } + dictEntry *de = dictFind(g_pserver->db[pool[k].dbid].pdict,pool[k].key); + if (de != nullptr && (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS || ((robj*)dictGetVal(de))->FExpires())) + key = (sds)dictGetKey(de); /* Remove the entry from the pool. */ if (pool[k].key != pool[k].cached) @@ -520,8 +537,8 @@ int freeMemoryIfNeeded(void) { /* If the key exists, is our pick. Otherwise it is * a ghost and we need to try the next element. 
*/ - if (de) { - bestkey = (sds)dictGetKey(de); + if (key) { + bestkey = key; break; } else { /* Ghost... Iterate again. */ @@ -540,13 +557,23 @@ int freeMemoryIfNeeded(void) { for (i = 0; i < cserver.dbnum; i++) { j = (++next_db) % cserver.dbnum; db = g_pserver->db+j; - dict = (g_pserver->maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM) ? - db->pdict : db->expires; - if (dictSize(dict) != 0) { - de = dictGetRandomKey(dict); - bestkey = (sds)dictGetKey(de); - bestdbid = j; - break; + if (g_pserver->maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM) + { + if (dictSize(db->pdict) != 0) { + dictEntry *de = dictGetRandomKey(db->pdict); + bestkey = (sds)dictGetKey(de); + bestdbid = j; + break; + } + } + else + { + if (!db->setexpire->empty()) + { + bestkey = (sds)db->setexpire->random_value().key(); + bestdbid = j; + break; + } } } } diff --git a/src/expire.cpp b/src/expire.cpp index 5a0abbb06..55ea83411 100644 --- a/src/expire.cpp +++ b/src/expire.cpp @@ -51,26 +51,19 @@ * * The parameter 'now' is the current time in milliseconds as is passed * to the function to avoid too many gettimeofday() syscalls. 
*/ -int activeExpireCycleTryExpire(redisDb *db, dictEntry *de, long long now) { - long long t = dictGetSignedIntegerVal(de); - if (now > t) { - sds key = (sds)dictGetKey(de); - robj *keyobj = createStringObject(key,sdslen(key)); +void activeExpireCycleExpire(redisDb *db, const char *key) { + robj *keyobj = createStringObject(key,sdslen(key)); - propagateExpire(db,keyobj,g_pserver->lazyfree_lazy_expire); - if (g_pserver->lazyfree_lazy_expire) - dbAsyncDelete(db,keyobj); - else - dbSyncDelete(db,keyobj); - notifyKeyspaceEvent(NOTIFY_EXPIRED, - "expired",keyobj,db->id); - if (g_pserver->tracking_clients) trackingInvalidateKey(keyobj); - decrRefCount(keyobj); - g_pserver->stat_expiredkeys++; - return 1; - } else { - return 0; - } + propagateExpire(db,keyobj,g_pserver->lazyfree_lazy_expire); + if (g_pserver->lazyfree_lazy_expire) + dbAsyncDelete(db,keyobj); + else + dbSyncDelete(db,keyobj); + notifyKeyspaceEvent(NOTIFY_EXPIRED, + "expired",keyobj,db->id); + if (g_pserver->tracking_clients) trackingInvalidateKey(keyobj); + decrRefCount(keyobj); + g_pserver->stat_expiredkeys++; } /* Try to expire a few timed out keys. The algorithm used is adaptive and @@ -148,7 +141,6 @@ void activeExpireCycle(int type) { long total_expired = 0; for (j = 0; j < dbs_per_call && timelimit_exit == 0; j++) { - int expired; redisDb *db = g_pserver->db+(current_db % cserver.dbnum); /* Increment the DB now so we are sure if we run out of time @@ -156,78 +148,44 @@ void activeExpireCycle(int type) { * distribute the time evenly across DBs. */ current_db++; - /* Continue to expire if at the end of the cycle more than 25% - * of the keys were expired. */ - do { - unsigned long num, slots; - long long now, ttl_sum; - int ttl_samples; - iteration++; + long long now; + iteration++; + now = mstime(); - /* If there is nothing to expire try next DB ASAP. */ - if ((num = dictSize(db->expires)) == 0) { - db->avg_ttl = 0; - break; + /* If there is nothing to expire try next DB ASAP. 
*/ + if (db->setexpire->empty()) + { + // TODO: Compute db->avg_ttl somewhere... but probably not here + db->avg_ttl = 0; + continue; + } + + size_t expired = 0; + size_t tried = 0; + db->expireitr = db->setexpire->enumerate(db->expireitr, now, [&](const expireEntry &e) __attribute__((always_inline)) { + if (e.when() < now) + { + activeExpireCycleExpire(db, e.key()); + ++expired; } - slots = dictSlots(db->expires); - now = mstime(); + ++tried; - /* When there are less than 1% filled slots getting random - * keys is expensive, so stop here waiting for better times... - * The dictionary will be resized asap. */ - if (num && slots > DICT_HT_INITIAL_SIZE && - (num*100/slots < 1)) break; - - /* The main collection cycle. Sample random keys among keys - * with an expire set, checking for expired ones. */ - expired = 0; - ttl_sum = 0; - ttl_samples = 0; - - if (num > ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP) - num = ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP; - - while (num--) { - dictEntry *de; - long long ttl; - - if ((de = dictGetRandomKey(db->expires)) == NULL) break; - ttl = dictGetSignedIntegerVal(de)-now; - if (activeExpireCycleTryExpire(db,de,now)) expired++; - if (ttl > 0) { - /* We want the average TTL of keys yet not expired. */ - ttl_sum += ttl; - ttl_samples++; - } - total_sampled++; - } - total_expired += expired; - - /* Update the average TTL stats for this database. */ - if (ttl_samples) { - long long avg_ttl = ttl_sum/ttl_samples; - - /* Do a simple running average with a few samples. - * We just use the current estimate with a weight of 2% - * and the previous estimate with a weight of 98%. */ - if (db->avg_ttl == 0) db->avg_ttl = avg_ttl; - db->avg_ttl = (db->avg_ttl/50)*49 + (avg_ttl/50); - } - - /* We can't block forever here even if there are many keys to - * expire. So after a given amount of milliseconds return to the - * caller waiting for the other active expire cycle. */ - if ((iteration & 0xf) == 0) { /* check once every 16 iterations. 
*/ + if ((tried % ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP) == 0) + { + /* We can't block forever here even if there are many keys to + * expire. So after a given amount of milliseconds return to the + * caller waiting for the other active expire cycle. */ elapsed = ustime()-start; if (elapsed > timelimit) { timelimit_exit = 1; g_pserver->stat_expired_time_cap_reached_count++; - break; + return false; } } - /* We don't repeat the cycle if there are less than 25% of keys - * found expired in the current DB. */ - } while (expired > ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP/4); + return true; + }); + + total_expired += expired; } elapsed = ustime()-start; @@ -301,20 +259,27 @@ void expireSlaveKeys(void) { while(dbids && dbid < cserver.dbnum) { if ((dbids & 1) != 0) { redisDb *db = g_pserver->db+dbid; - dictEntry *expire = dictFind(db->expires,keyname); + + // the expire is hashed based on the key pointer, so we need the point in the main db + dictEntry *deMain = dictFind(db->pdict, keyname); + auto itr = db->setexpire->end(); + if (deMain != nullptr) + itr = db->setexpire->find((sds)dictGetKey(deMain)); int expired = 0; - if (expire && - activeExpireCycleTryExpire(g_pserver->db+dbid,expire,start)) + if (itr != db->setexpire->end()) { - expired = 1; + if (itr->when() < start) { + activeExpireCycleExpire(g_pserver->db+dbid,itr->key()); + expired = 1; + } } /* If the key was not expired in this DB, we need to set the * corresponding bit in the new bitmap we set as value. * At the end of the loop if the bitmap is zero, it means we * no longer need to keep track of this key. */ - if (expire && !expired) { + if (itr != db->setexpire->end() && !expired) { noexpire++; new_dbids |= (uint64_t)1 << dbid; } diff --git a/src/lazyfree.cpp b/src/lazyfree.cpp index 6d56ec86d..0dbfd57d1 100644 --- a/src/lazyfree.cpp +++ b/src/lazyfree.cpp @@ -52,16 +52,19 @@ size_t lazyfreeGetFreeEffort(robj *obj) { * will be reclaimed in a different bio.c thread. 
*/ #define LAZYFREE_THRESHOLD 64 int dbAsyncDelete(redisDb *db, robj *key) { - /* Deleting an entry from the expires dict will not free the sds of - * the key, because it is shared with the main dictionary. */ - if (dictSize(db->expires) > 0) dictDelete(db->expires,ptrFromObj(key)); - /* If the value is composed of a few allocations, to free in a lazy way * is actually just slower... So under a certain limit we just free * the object synchronously. */ dictEntry *de = dictUnlink(db->pdict,ptrFromObj(key)); if (de) { robj *val = (robj*)dictGetVal(de); + if (val->FExpires()) + { + /* Deleting an entry from the expires dict will not free the sds of + * the key, because it is shared with the main dictionary. */ + removeExpireCore(db,key,de); + } + size_t free_effort = lazyfreeGetFreeEffort(val); /* If releasing the object is too much work, do it in the background @@ -72,7 +75,7 @@ int dbAsyncDelete(redisDb *db, robj *key) { * objects, and then call dbDelete(). In this case we'll fall * through and reach the dictFreeUnlinkedEntry() call, that will be * equivalent to just calling decrRefCount(). */ - if (free_effort > LAZYFREE_THRESHOLD && val->refcount == 1) { + if (free_effort > LAZYFREE_THRESHOLD && val->getrefcount(std::memory_order_relaxed) == 1) { atomicIncr(lazyfree_objects,1); bioCreateBackgroundJob(BIO_LAZY_FREE,val,NULL,NULL); dictSetVal(db->pdict,de,NULL); @@ -93,7 +96,7 @@ int dbAsyncDelete(redisDb *db, robj *key) { /* Free an object, if the object is huge enough, free it in async way. */ void freeObjAsync(robj *o) { size_t free_effort = lazyfreeGetFreeEffort(o); - if (free_effort > LAZYFREE_THRESHOLD && o->refcount == 1) { + if (free_effort > LAZYFREE_THRESHOLD && o->getrefcount(std::memory_order_relaxed) == 1) { atomicIncr(lazyfree_objects,1); bioCreateBackgroundJob(BIO_LAZY_FREE,o,NULL,NULL); } else { @@ -105,11 +108,13 @@ void freeObjAsync(robj *o) { * create a new empty set of hash tables and scheduling the old ones for * lazy freeing. 
*/ void emptyDbAsync(redisDb *db) { - dict *oldht1 = db->pdict, *oldht2 = db->expires; + dict *oldht1 = db->pdict; + auto *set = db->setexpire; + db->setexpire = new (MALLOC_LOCAL) semiorderedset(); + db->expireitr = db->setexpire->end(); db->pdict = dictCreate(&dbDictType,NULL); - db->expires = dictCreate(&keyptrDictType,NULL); atomicIncr(lazyfree_objects,dictSize(oldht1)); - bioCreateBackgroundJob(BIO_LAZY_FREE,NULL,oldht1,oldht2); + bioCreateBackgroundJob(BIO_LAZY_FREE,NULL,oldht1,set); } /* Empty the slots-keys map of Redis CLuster by creating a new empty one @@ -136,10 +141,10 @@ void lazyfreeFreeObjectFromBioThread(robj *o) { * when the database was logically deleted. 'sl' is a skiplist used by * Redis Cluster in order to take the hash slots -> keys mapping. This * may be NULL if Redis Cluster is disabled. */ -void lazyfreeFreeDatabaseFromBioThread(dict *ht1, dict *ht2) { +void lazyfreeFreeDatabaseFromBioThread(dict *ht1, semiorderedset *set) { size_t numkeys = dictSize(ht1); dictRelease(ht1); - dictRelease(ht2); + delete set; atomicDecr(lazyfree_objects,numkeys); } diff --git a/src/module.cpp b/src/module.cpp index ee31cf7a5..7863ca4cf 100644 --- a/src/module.cpp +++ b/src/module.cpp @@ -566,7 +566,7 @@ void RedisModuleCommandDispatcher(client *c) { for (int i = 0; i < c->argc; i++) { /* Only do the work if the module took ownership of the object: * in that case the refcount is no longer 1. */ - if (c->argv[i]->refcount > 1) + if (c->argv[i]->getrefcount(std::memory_order_relaxed) > 1) trimStringObjectIfNeeded(c->argv[i]); } } @@ -1037,7 +1037,7 @@ int RM_StringCompare(RedisModuleString *a, RedisModuleString *b) { /* Return the (possibly modified in encoding) input 'str' object if * the string is unshared, otherwise NULL is returned. 
*/ RedisModuleString *moduleAssertUnsharedString(RedisModuleString *str) { - if (str->refcount != 1) { + if (str->getrefcount(std::memory_order_relaxed) != 1) { serverLog(LL_WARNING, "Module attempted to use an in-place string modify operation " "with a string referenced multiple times. Please check the code " diff --git a/src/object.cpp b/src/object.cpp index 6e65ec52b..900a9058c 100644 --- a/src/object.cpp +++ b/src/object.cpp @@ -39,11 +39,11 @@ /* ===================== Creation and parsing of objects ==================== */ robj *createObject(int type, void *ptr) { - robj *o = (robj*)zmalloc(sizeof(*o), MALLOC_SHARED); + robj *o = (robj*)zcalloc(sizeof(*o), MALLOC_SHARED); o->type = type; o->encoding = OBJ_ENCODING_RAW; o->m_ptr = ptr; - o->refcount.store(1, std::memory_order_relaxed); + o->setrefcount(1); o->mvcc_tstamp = OBJ_MVCC_INVALID; /* Set the LRU to the current lruclock (minutes resolution), or @@ -68,8 +68,9 @@ robj *createObject(int type, void *ptr) { * */ robj *makeObjectShared(robj *o) { - serverAssert(o->refcount == 1); - o->refcount.store(OBJ_SHARED_REFCOUNT, std::memory_order_relaxed); + serverAssert(o->getrefcount(std::memory_order_relaxed) == 1); + serverAssert(!o->FExpires()); + o->setrefcount(OBJ_SHARED_REFCOUNT); return o; } @@ -86,12 +87,12 @@ robj *createEmbeddedStringObject(const char *ptr, size_t len) { size_t allocsize = sizeof(struct sdshdr8)+len+1; if (allocsize < sizeof(void*)) allocsize = sizeof(void*); - robj *o = (robj*)zmalloc(sizeof(robj)+allocsize-sizeof(o->m_ptr), MALLOC_SHARED); + robj *o = (robj*)zcalloc(sizeof(robj)+allocsize-sizeof(o->m_ptr), MALLOC_SHARED); struct sdshdr8 *sh = (sdshdr8*)(&o->m_ptr); o->type = OBJ_STRING; o->encoding = OBJ_ENCODING_EMBSTR; - o->refcount.store(1, std::memory_order_relaxed); + o->setrefcount(1); o->mvcc_tstamp = OBJ_MVCC_INVALID; if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LFU) { @@ -352,11 +353,14 @@ void freeStreamObject(robj_roptr o) { } void incrRefCount(robj_roptr o) { - if 
(o->refcount != OBJ_SHARED_REFCOUNT) o->refcount.fetch_add(1, std::memory_order_acquire); + if (o->getrefcount(std::memory_order_relaxed) != OBJ_SHARED_REFCOUNT) o->addref(); } void decrRefCount(robj_roptr o) { - if (o->refcount.load(std::memory_order_acquire) == 1) { + if (o->getrefcount(std::memory_order_relaxed) == OBJ_SHARED_REFCOUNT) + return; + unsigned prev = o->release(); + if (prev == 1) { switch(o->type) { case OBJ_STRING: freeStringObject(o); break; case OBJ_LIST: freeListObject(o); break; @@ -369,8 +373,7 @@ void decrRefCount(robj_roptr o) { } zfree(o.unsafe_robjcast()); } else { - if (o->refcount <= 0) serverPanic("decrRefCount against refcount <= 0"); - if (o->refcount != OBJ_SHARED_REFCOUNT) o->refcount.fetch_sub(1, std::memory_order_acquire); + if (prev <= 0) serverPanic("decrRefCount against refcount <= 0"); } } @@ -394,7 +397,7 @@ void decrRefCountVoid(const void *o) { * decrRefCount(obj); */ robj *resetRefCount(robj *obj) { - obj->refcount = 0; + obj->setrefcount(0); return obj; } @@ -452,7 +455,7 @@ robj *tryObjectEncoding(robj *o) { /* It's not safe to encode shared objects: shared objects can be shared * everywhere in the "object space" of Redis and may end in places where * they are not handled. We handle them only as values in the keyspace. */ - if (o->refcount > 1) return o; + if (o->getrefcount(std::memory_order_relaxed) > 1) return o; /* Check if we can represent this string as a long integer. 
* Note that we are sure that a string larger than 20 chars is not @@ -1064,8 +1067,7 @@ struct redisMemOverhead *getMemoryOverheadData(void) { mh->db[mh->num_dbs].overhead_ht_main = mem; mem_total+=mem; - mem = dictSize(db->expires) * sizeof(dictEntry) + - dictSlots(db->expires) * sizeof(dictEntry*); + mem = db->setexpire->bytes_used(); mh->db[mh->num_dbs].overhead_ht_expires = mem; mem_total+=mem; @@ -1275,7 +1277,7 @@ NULL } else if (!strcasecmp(szFromObj(c->argv[1]),"refcount") && c->argc == 3) { if ((o = objectCommandLookupOrReply(c,c->argv[2],shared.null[c->resp])) == NULL) return; - addReplyLongLong(c,o->refcount); + addReplyLongLong(c,o->getrefcount(std::memory_order_relaxed)); } else if (!strcasecmp(szFromObj(c->argv[1]),"encoding") && c->argc == 3) { if ((o = objectCommandLookupOrReply(c,c->argv[2],shared.null[c->resp])) == NULL) return; @@ -1474,3 +1476,18 @@ NULL addReplyErrorFormat(c, "Unknown subcommand or wrong number of arguments for '%s'. Try MEMORY HELP", (char*)ptrFromObj(c->argv[1])); } } + +void redisObject::SetFExpires(bool fExpire) +{ + serverAssert(this->refcount != OBJ_SHARED_REFCOUNT); + if (fExpire) + this->refcount.fetch_or(1U << 31, std::memory_order_relaxed); + else + this->refcount.fetch_and(~(1U << 31), std::memory_order_relaxed); +} + +void redisObject::setrefcount(unsigned ref) +{ + serverAssert(!FExpires()); + refcount.store(ref, std::memory_order_relaxed); +} \ No newline at end of file diff --git a/src/rdb.cpp b/src/rdb.cpp index d4d91ff1f..5443ca064 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -1096,6 +1096,29 @@ int rdbSaveInfoAuxFields(rio *rdb, int flags, rdbSaveInfo *rsi) { return 1; } +int saveKey(rio *rdb, redisDb *db, int flags, size_t *processed, const char *keystr, robj *o) +{ + robj key; + long long expire; + + initStaticStringObject(key,(char*)keystr); + expire = getExpire(db, &key); + + if (rdbSaveKeyValuePair(rdb,&key,o,expire) == -1) + return 0; + + /* When this RDB is produced as part of an AOF rewrite, move + * 
accumulated diff from parent to child while rewriting in + * order to have a smaller final write. */ + if (flags & RDB_SAVE_AOF_PREAMBLE && + rdb->processed_bytes > *processed+AOF_READ_DIFF_INTERVAL_BYTES) + { + *processed = rdb->processed_bytes; + aofReadDiffFromParent(); + } + return 1; +} + /* Produces a dump of the database in RDB format sending it to the specified * Redis I/O channel. On success C_OK is returned, otherwise C_ERR * is returned and part of the output, or all the output, can be @@ -1134,31 +1157,24 @@ int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) { * these sizes are just hints to resize the hash tables. */ uint64_t db_size, expires_size; db_size = dictSize(db->pdict); - expires_size = dictSize(db->expires); + expires_size = db->setexpire->size(); if (rdbSaveType(rdb,RDB_OPCODE_RESIZEDB) == -1) goto werr; if (rdbSaveLen(rdb,db_size) == -1) goto werr; if (rdbSaveLen(rdb,expires_size) == -1) goto werr; - + /* Iterate this DB writing every entry */ + size_t ckeysExpired = 0; while((de = dictNext(di)) != NULL) { sds keystr = (sds)dictGetKey(de); - robj key, *o = (robj*)dictGetVal(de); - long long expire; + robj *o = (robj*)dictGetVal(de); - initStaticStringObject(key,keystr); - expire = getExpire(db,&key); - if (rdbSaveKeyValuePair(rdb,&key,o,expire) == -1) goto werr; - - /* When this RDB is produced as part of an AOF rewrite, move - * accumulated diff from parent to child while rewriting in - * order to have a smaller final write. */ - if (flags & RDB_SAVE_AOF_PREAMBLE && - rdb->processed_bytes > processed+AOF_READ_DIFF_INTERVAL_BYTES) - { - processed = rdb->processed_bytes; - aofReadDiffFromParent(); - } + if (o->FExpires()) + ++ckeysExpired; + + if (!saveKey(rdb, db, flags, &processed, keystr, o)) + goto werr; } + serverAssert(ckeysExpired == db->setexpire->size()); dictReleaseIterator(di); di = NULL; /* So that we don't release it again on error. 
*/ } @@ -1822,6 +1838,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, robj *key, uint64_t mvcc_tstamp) { } o->mvcc_tstamp = mvcc_tstamp; + serverAssert(!o->FExpires()); return o; } @@ -1909,7 +1926,7 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { now = mstime(); lru_clock = LRU_CLOCK(); - + while(1) { robj *key, *val; @@ -1965,7 +1982,6 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { if ((expires_size = rdbLoadLen(rdb,NULL)) == RDB_LENERR) goto eoferr; dictExpand(db->pdict,db_size); - dictExpand(db->expires,expires_size); continue; /* Read next opcode. */ } else if (type == RDB_OPCODE_AUX) { /* AUX: generic string-string fields. Use to add state to RDB @@ -2079,7 +2095,8 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { if (fInserted) { /* Set the expire time if needed */ - if (expiretime != -1) setExpire(NULL,db,key,expiretime); + if (expiretime != -1) + setExpire(NULL,db,key,expiretime); /* Set usage information (for eviction). */ objectSetLRUOrLFU(val,lfu_freq,lru_idle,lru_clock); @@ -2101,6 +2118,7 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { lfu_freq = -1; lru_idle = -1; } + /* Verify the checksum if RDB version is >= 5 */ if (rdbver >= 5) { uint64_t cksum, expected = rdb->cksum; diff --git a/src/scripting.cpp b/src/scripting.cpp index 1548044e2..5ba336374 100644 --- a/src/scripting.cpp +++ b/src/scripting.cpp @@ -665,7 +665,7 @@ cleanup: * The object must be small, SDS-encoded, and with refcount = 1 * (we must be the only owner) for us to cache it. 
*/ if (j < LUA_CMD_OBJCACHE_SIZE && - o->refcount == 1 && + o->getrefcount(std::memory_order_relaxed) == 1 && (o->encoding == OBJ_ENCODING_RAW || o->encoding == OBJ_ENCODING_EMBSTR) && sdslen((sds)ptrFromObj(o)) <= LUA_CMD_OBJCACHE_MAX_LEN) diff --git a/src/server.cpp b/src/server.cpp index ebdca3234..008459034 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1428,8 +1428,6 @@ int htNeedsResize(dict *dict) { void tryResizeHashTables(int dbid) { if (htNeedsResize(g_pserver->db[dbid].pdict)) dictResize(g_pserver->db[dbid].pdict); - if (htNeedsResize(g_pserver->db[dbid].expires)) - dictResize(g_pserver->db[dbid].expires); } /* Our hash table implementation performs rehashing incrementally while @@ -1445,11 +1443,6 @@ int incrementallyRehash(int dbid) { dictRehashMilliseconds(g_pserver->db[dbid].pdict,1); return 1; /* already used our millisecond for this loop... */ } - /* Expires */ - if (dictIsRehashing(g_pserver->db[dbid].expires)) { - dictRehashMilliseconds(g_pserver->db[dbid].expires,1); - return 1; /* already used our millisecond for this loop... */ - } return 0; } @@ -1889,7 +1882,7 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { size = dictSlots(g_pserver->db[j].pdict); used = dictSize(g_pserver->db[j].pdict); - vkeys = dictSize(g_pserver->db[j].expires); + vkeys = g_pserver->db[j].setexpire->size(); if (used || vkeys) { serverLog(LL_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size); /* dictPrintStats(g_pserver->dict); */ @@ -2926,7 +2919,8 @@ void initServer(void) { /* Create the Redis databases, and initialize other internal state. 
*/ for (int j = 0; j < cserver.dbnum; j++) { g_pserver->db[j].pdict = dictCreate(&dbDictType,NULL); - g_pserver->db[j].expires = dictCreate(&keyptrDictType,NULL); + g_pserver->db[j].setexpire = new(MALLOC_LOCAL) semiorderedset; + g_pserver->db[j].expireitr = g_pserver->db[j].setexpire->end(); g_pserver->db[j].blocking_keys = dictCreate(&keylistDictType,NULL); g_pserver->db[j].ready_keys = dictCreate(&objectKeyPointerValueDictType,NULL); g_pserver->db[j].watched_keys = dictCreate(&keylistDictType,NULL); @@ -4571,7 +4565,7 @@ sds genRedisInfoString(const char *section) { long long keys, vkeys; keys = dictSize(g_pserver->db[j].pdict); - vkeys = dictSize(g_pserver->db[j].expires); + vkeys = g_pserver->db[j].setexpire->size(); if (keys || vkeys) { info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld,avg_ttl=%lld\r\n", diff --git a/src/server.h b/src/server.h index 5c66aaba6..74960bbab 100644 --- a/src/server.h +++ b/src/server.h @@ -81,6 +81,7 @@ typedef long long mstime_t; /* millisecond time type. */ N-elements flat arrays */ #include "rax.h" /* Radix tree */ #include "uuid.h" +#include "semiorderedset.h" /* Following includes allow test functions to be called from Redis main() */ #include "zipmap.h" @@ -243,7 +244,7 @@ public: #define CONFIG_DEFAULT_ACTIVE_REPLICA 0 #define CONFIG_DEFAULT_ENABLE_MULTIMASTER 0 -#define ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP 20 /* Loopkups per loop. */ +#define ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP 64 /* Loopkups per loop. 
*/ #define ACTIVE_EXPIRE_CYCLE_FAST_DURATION 1000 /* Microseconds */ #define ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC 25 /* CPU max % for keys collection */ #define ACTIVE_EXPIRE_CYCLE_SLOW 0 @@ -717,7 +718,7 @@ typedef struct RedisModuleDigest { #define LRU_CLOCK_MAX ((1<lru */ #define LRU_CLOCK_RESOLUTION 1000 /* LRU clock resolution in ms */ -#define OBJ_SHARED_REFCOUNT INT_MAX +#define OBJ_SHARED_REFCOUNT (0x7FFFFFFF) #define OBJ_MVCC_INVALID (0xFFFFFFFFFFFFFFFFULL) typedef struct redisObject { @@ -726,10 +727,21 @@ typedef struct redisObject { unsigned lru:LRU_BITS; /* LRU time (relative to global lru_clock) or * LFU data (least significant 8 bits frequency * and most significant 16 bits access time). */ - mutable std::atomic refcount; +private: + mutable std::atomic refcount; +public: uint64_t mvcc_tstamp; void *m_ptr; + + inline bool FExpires() const { return refcount.load(std::memory_order_relaxed) >> 31; } + void SetFExpires(bool fExpires); + + void setrefcount(unsigned ref); + unsigned getrefcount(std::memory_order order) const { return (refcount.load(order) & ~(1U << 31)); } + void addref() const { refcount.fetch_add(1, std::memory_order_acq_rel); } + unsigned release() const { return refcount.fetch_sub(1, std::memory_order_acq_rel) & ~(1U << 31); } } robj; +static_assert(sizeof(redisObject) == 24, "object size is critical, don't increase"); __attribute__((always_inline)) inline const void *ptrFromObj(robj_roptr &o) { @@ -755,6 +767,38 @@ __attribute__((always_inline)) inline char *szFromObj(const robj *o) return (char*)ptrFromObj(o); } +class expireEntry { + sds m_key; + long long m_when; + +public: + expireEntry(sds key, long long when) + { + m_key = key; + m_when = when; + } + + bool operator!=(const expireEntry &e) const noexcept + { + return m_when != e.m_when || m_key != e.m_key; + } + bool operator==(const expireEntry &e) const noexcept + { + return m_when == e.m_when && m_key == e.m_key; + } + bool operator==(const char *key) const noexcept { return 
m_key == key; } + + bool operator<(const expireEntry &e) const noexcept { return m_when < e.m_when; } + bool operator<(const char *key) const noexcept { return m_key < key; } + bool operator<(long long when) const noexcept { return m_when < when; } + + const char *key() const noexcept { return m_key; } + long long when() const noexcept { return m_when; } + + + explicit operator const char*() const noexcept { return m_key; } + explicit operator long long() const noexcept { return m_when; } +}; /* The a string name for an object's type as listed above * Native types are checked against the OBJ_STRING, OBJ_LIST, OBJ_* defines, @@ -766,7 +810,7 @@ const char *getObjectTypeName(robj_roptr o); * we'll update it when the structure is changed, to avoid bugs like * bug #85 introduced exactly in this way. */ #define initStaticStringObject(_var,_ptr) do { \ - _var.refcount = 1; \ + _var.setrefcount(1); \ _var.type = OBJ_STRING; \ _var.encoding = OBJ_ENCODING_RAW; \ _var.m_ptr = _ptr; \ @@ -793,7 +837,9 @@ typedef struct clientReplyBlock { * database. The database number is the 'id' field in the structure. 
*/ typedef struct redisDb { dict *pdict; /* The keyspace for this DB */ - dict *expires; /* Timeout of keys with a timeout set */ + semiorderedset *setexpire; + semiorderedset::setiter expireitr; + dict *blocking_keys; /* Keys with clients waiting for data (BLPOP)*/ dict *ready_keys; /* Blocked keys that received a PUSH */ dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */ @@ -2174,6 +2220,7 @@ int rewriteConfig(char *path); /* db.c -- Keyspace access API */ int removeExpire(redisDb *db, robj *key); +int removeExpireCore(redisDb *db, robj *key, dictEntry *de); void propagateExpire(redisDb *db, robj *key, int lazy); int expireIfNeeded(redisDb *db, robj *key); long long getExpire(redisDb *db, robj_roptr key); diff --git a/src/slowlog.cpp b/src/slowlog.cpp index 4f338b341..08a2e62e9 100644 --- a/src/slowlog.cpp +++ b/src/slowlog.cpp @@ -72,7 +72,7 @@ slowlogEntry *slowlogCreateEntry(client *c, robj **argv, int argc, long long dur (unsigned long) sdslen(szFromObj(argv[j])) - SLOWLOG_ENTRY_MAX_STRING); se->argv[j] = createObject(OBJ_STRING,s); - } else if (argv[j]->refcount == OBJ_SHARED_REFCOUNT) { + } else if (argv[j]->getrefcount(std::memory_order_relaxed) == OBJ_SHARED_REFCOUNT) { se->argv[j] = argv[j]; } else { /* Here we need to dupliacate the string objects composing the diff --git a/src/t_string.cpp b/src/t_string.cpp index 4cb30eac6..a254f4f53 100644 --- a/src/t_string.cpp +++ b/src/t_string.cpp @@ -353,7 +353,7 @@ void incrDecrCommand(client *c, long long incr) { } value += incr; - if (o && o->refcount == 1 && o->encoding == OBJ_ENCODING_INT && + if (o && o->getrefcount(std::memory_order_relaxed) == 1 && o->encoding == OBJ_ENCODING_INT && (value < 0 || value >= OBJ_SHARED_INTEGERS) && value >= LONG_MIN && value <= LONG_MAX) { From 23c1e8919075744bda053b1442897bd60d2acf02 Mon Sep 17 00:00:00 2001 From: John Sully Date: Sat, 6 Jul 2019 00:23:38 -0400 Subject: [PATCH 40/76] Fix a few potential assert crashes Former-commit-id: 
5f3920e491a9632d3b84d9af7800c154f2be0809 --- src/db.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/db.cpp b/src/db.cpp index d1a687712..68e1fae11 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -458,6 +458,7 @@ long long emptyDb(int dbnum, int flags, void(callback)(void*)) { dictEmpty(g_pserver->db[j].pdict,callback); delete g_pserver->db[j].setexpire; g_pserver->db[j].setexpire = new (MALLOC_LOCAL) semiorderedset(); + g_pserver->db[j].expireitr = g_pserver->db[j].setexpire->end(); } } if (g_pserver->cluster_enabled) { @@ -1119,10 +1120,12 @@ int dbSwapDatabases(int id1, int id2) { * remain in the same DB they were. */ db1->pdict = db2->pdict; db1->setexpire = db2->setexpire; + db1->expireitr = db2->expireitr->end(); db1->avg_ttl = db2->avg_ttl; db2->pdict = aux.pdict; db2->setexpire = aux.setexpire; + db2->expireitr = aux.expireitr->end(); db2->avg_ttl = aux.avg_ttl; /* Now we need to handle clients blocked on lists: as an effect From fc54e0970fb34fdcfb7e26e082bccf3500070cfb Mon Sep 17 00:00:00 2001 From: John Sully Date: Sat, 6 Jul 2019 00:55:30 -0400 Subject: [PATCH 41/76] never make last minute changes before commiting Former-commit-id: 7e5d3f4f160c1c6f91c42b19f95ad17fcb7f1590 --- src/db.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index 68e1fae11..45bc16f02 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -1120,12 +1120,12 @@ int dbSwapDatabases(int id1, int id2) { * remain in the same DB they were. 
*/ db1->pdict = db2->pdict; db1->setexpire = db2->setexpire; - db1->expireitr = db2->expireitr->end(); + db1->expireitr = db2->expireitr; db1->avg_ttl = db2->avg_ttl; db2->pdict = aux.pdict; db2->setexpire = aux.setexpire; - db2->expireitr = aux.expireitr->end(); + db2->expireitr = aux.expireitr; db2->avg_ttl = aux.avg_ttl; /* Now we need to handle clients blocked on lists: as an effect From 3ffdccad8650466c59e437ad59709bfa950137a7 Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 18 Jul 2019 21:43:30 -0400 Subject: [PATCH 42/76] Add back file erroniously deleted in rebase Former-commit-id: 42bda8eaba71c99c776100b225606c9aced1d2ba --- src/semiorderedset.h | 302 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 302 insertions(+) create mode 100644 src/semiorderedset.h diff --git a/src/semiorderedset.h b/src/semiorderedset.h new file mode 100644 index 000000000..12741de5b --- /dev/null +++ b/src/semiorderedset.h @@ -0,0 +1,302 @@ +#pragma once +#include +#include "compactvector.h" + +/**************************************** + * semiorderedset.h: + * + * The ordered set is a hash set that maintains semi-ordering, that is you can iterate in sub-linear time over the set comparing a value. + * It has a few other useful properties vs the traditional set: + * 1. The key need not be the underlying type, the only requirement is the value type is castable to the key + * 2. The key need not have total ordering. The set will iterate until it finds an exact match with operator== on the value + * This provides additional flexibility on insert allowing us to optimize this case. 
+ * + */ + +template +class semiorderedset +{ + friend struct setiter; + std::vector> m_data; + size_t celem = 0; + static const size_t bits_min = 8; + size_t bits = bits_min; + size_t idxRehash = (1ULL << bits_min); + bool fPauseRehash = false; + + constexpr size_t targetElementsPerBucket() + { + // Aim for roughly 2 cache lines per bucket (determined by imperical testing) + // lower values are faster but use more memory + return std::max((64/sizeof(T))*2, (size_t)2); + } + +public: + semiorderedset() + { + m_data.resize((1ULL << bits)); + } + + struct setiter + { + semiorderedset *set; + size_t idxPrimary = 0; + size_t idxSecondary = 0; + + setiter(semiorderedset *set) + { + this->set = set; + } + + bool operator==(const setiter &other) const + { + return (idxPrimary == other.idxPrimary) && (idxSecondary == other.idxSecondary); + } + + bool operator!=(const setiter &other) const { return !operator==(other); } + + inline T &operator*() { return set->m_data[idxPrimary][idxSecondary]; } + inline const T &operator*() const { return set->m_data[idxPrimary][idxSecondary]; } + + inline T *operator->() { return &set->m_data[idxPrimary][idxSecondary]; } + inline const T *operator->() const { return &set->m_data[idxPrimary][idxSecondary]; } + }; + + setiter find(const T_KEY &key) + { + RehashStep(); + setiter itr(this); + itr.idxPrimary = idxFromObj(key); + + for (int hashset = 0; hashset < 2; ++hashset) // rehashing may only be 1 resize behind, so we check up to two slots + { + auto &vecBucket = m_data[itr.idxPrimary]; + + auto itrFind = std::find(vecBucket.begin(), vecBucket.end(), key); + if (itrFind != vecBucket.end()) + { + itr.idxSecondary = itrFind - vecBucket.begin(); + return itr; + } + + // See if we have to check the older slot + size_t mask = (hashmask() >> 1); + if (itr.idxPrimary == (itr.idxPrimary & mask)) + break; // same bucket we just checked + itr.idxPrimary &= mask; + if (FRehashedRow(itr.idxPrimary)) + break; + } + + return end(); + } + + setiter end() 
+ { + setiter itr(this); + itr.idxPrimary = m_data.size(); + return itr; + } + + void insert(T &e, bool fRehash = false) + { + if (!fRehash) + RehashStep(); + + auto idx = idxFromObj(static_cast(e)); + if (!fRehash) + ++celem; + + typename compactvector::iterator itrInsert; + if (!m_data[idx].empty() && !(e < m_data[idx].back())) + itrInsert = m_data[idx].end(); + else + itrInsert = std::upper_bound(m_data[idx].begin(), m_data[idx].end(), e); + itrInsert = m_data[idx].insert(itrInsert, e); + + if (celem > ((1ULL << bits)*targetElementsPerBucket())) + grow(); + } + + // enumeration starting from the 'itrStart'th key. Note that the iter is a hint, and need no be valid anymore + template + setiter enumerate(const setiter &itrStart, const T_MAX &max, T_VISITOR fn) + { + setiter itr(itrStart); + + if (itrStart.set == this) // really if this case isn't true its probably a bug + itr = itrStart; // but why crash the program when we can easily fix this? + + fPauseRehash = true; + if (itr.idxPrimary >= m_data.size()) + itr.idxPrimary = 0; + + for (size_t ibucket = 0; ibucket < m_data.size(); ++ibucket) + { + if (!enumerate_bucket(itr, max, fn)) + break; + itr.idxSecondary = 0; + + ++itr.idxPrimary; + if (itr.idxPrimary >= m_data.size()) + itr.idxPrimary = 0; + } + fPauseRehash = false; + return itr; + } + + // This will "randomly" visit nodes biased towards lower values first + template + size_t random_visit(T_VISITOR &fn) + { + bool fSawAny = true; + size_t visited = 0; + size_t basePrimary = rand() % m_data.size(); + for (size_t idxSecondary = 0; fSawAny; ++idxSecondary) + { + fSawAny = false; + for (size_t idxPrimaryCount = 0; idxPrimaryCount < m_data.size(); ++idxPrimaryCount) + { + size_t idxPrimary = (basePrimary + idxPrimaryCount) % m_data.size(); + if (idxSecondary < m_data[idxPrimary].size()) + { + ++visited; + fSawAny = true; + if (!fn(m_data[idxPrimary][idxSecondary])) + return visited; + } + } + } + return visited; + } + + const T& random_value() const + { + 
assert(!empty()); + for (;;) + { + size_t idxPrimary = rand() % m_data.size(); + if (m_data[idxPrimary].empty()) + continue; + + return m_data[idxPrimary][rand() % m_data[idxPrimary].size()]; + } + } + + void erase(const setiter &itr) + { + auto &vecRow = m_data[itr.idxPrimary]; + vecRow.erase(vecRow.begin() + itr.idxSecondary); + --celem; + RehashStep(); + } + + void clear() + { + m_data = decltype(m_data)(); + bits = bits_min; + m_data.resize(1ULL << bits); + idxRehash = m_data.size(); + } + + bool empty() const noexcept { return celem == 0; } + size_t size() const noexcept { return celem; } + + size_t bytes_used() const + { + size_t cb = sizeof(this) + (m_data.capacity()-m_data.size())*sizeof(T); + for (auto &vec : m_data) + { + cb += vec.bytes_used(); + } + return cb; + } + +private: + inline size_t hashmask() const { return (1ULL << bits) - 1; } + + template::value>* = nullptr> + size_t idxFromObj(TT_KEY key) + { + static_assert(!std::is_pointer::value, "SFINAE isn't working"); + std::hash hash; + return hash(key) & hashmask(); + } + + + template::value>* = nullptr> + size_t idxFromObj(TT_KEY key) + { + std::hash hash; + size_t v = hash(key); + // it's legal for std::hash to literally give us back the same pointer + // in which case the lower bits will have zero entropy. Of course its also + // legal for std::hash to do what we're doing here in which case we're reducing + // lower order entropy... 
so rotate+XOR is the safest choice + v ^= (v>>3) | (v << ((sizeof(v)*8)-3)); + return v & hashmask(); + } + + bool FRehashedRow(size_t idx) const + { + return (idx >= (m_data.size()/2)) || (idx < idxRehash); + } + + void RehashStep() + { + if (fPauseRehash) + return; + + int steps = 0; + for (; idxRehash < (m_data.size()/2); ++idxRehash) + { + compactvector vecT; + std::swap(m_data[idxRehash], vecT); + + for (auto &v : vecT) + insert(v, true); + + if (++steps > 1024) + break; + } + } + + void grow() + { + assert(idxRehash >= (m_data.size()/2)); // we should have finished rehashing by the time we need to grow again + + ++bits; + m_data.resize(1ULL << bits); + idxRehash = 0; + RehashStep(); + } + + template + inline bool enumerate_bucket(setiter &itr, const T_MAX &max, T_VISITOR &fn) + { + auto &vec = m_data[itr.idxPrimary]; + for (; itr.idxSecondary < vec.size(); ++itr.idxSecondary) + { + // Assert we're ordered by T_MAX + assert((itr.idxSecondary+1) >= vec.size() + || static_cast(vec[itr.idxSecondary]) <= static_cast(vec[itr.idxSecondary+1])); + + if (max < static_cast(*itr)) + return true; + + size_t sizeBefore = vec.size(); + if (!fn(*itr)) + { + itr.idxSecondary++; // we still visited this node + return false; + } + if (vec.size() != sizeBefore) + { + assert(vec.size() == (sizeBefore-1)); // they may only remove the element passed to them + --itr.idxSecondary; // they deleted the element + } + } + vec.shrink_to_fit(); + return true; + } +}; From e8709ee6b0543ffdd00644ab45e07c77fd2194fc Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 18 Jul 2019 21:53:00 -0400 Subject: [PATCH 43/76] Add back missing file lost in rebase Former-commit-id: b5512d77a1299cf6ff960229cd47776b82eaba4b --- src/compactvector.h | 153 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 src/compactvector.h diff --git a/src/compactvector.h b/src/compactvector.h new file mode 100644 index 000000000..65a40f114 --- /dev/null +++ 
b/src/compactvector.h @@ -0,0 +1,153 @@ +#pragma once + +#include +#include + +/************************************************* + * compactvector - similar to std::vector but optimized for minimal memory + * + * Notable differences: + * - Limited to 2^32 elements + * - Grows linearly not exponentially + * + *************************************************/ + +template +class compactvector +{ + static_assert(MEMMOVE_SAFE || std::is_trivially_copyable::value, "compactvector requires trivially copyable types"); + T *m_data = nullptr; + unsigned m_celem = 0; + unsigned m_max = 0; + +public: + typedef T* iterator; + + compactvector() noexcept = default; + ~compactvector() noexcept + { + clear(); // call dtors + zfree(m_data); + } + + compactvector(compactvector &) noexcept = delete; + + compactvector(compactvector &&src) noexcept + { + m_data = src.m_data; + m_celem = src.m_celem; + m_max = src.m_max; + src.m_data = nullptr; + src.m_celem = 0; + src.m_max = 0; + } + + compactvector &operator=(const compactvector&) noexcept = delete; + compactvector &operator=(compactvector &&src) noexcept + { + zfree(m_data); + m_data = src.m_data; + m_celem = src.m_celem; + m_max = src.m_max; + src.m_data = nullptr; + src.m_celem = 0; + src.m_max = 0; + return *this; + } + + inline T* begin() { return m_data; } + inline const T* begin() const { return m_data; } + + inline T* end() { return m_data + m_celem; } + inline const T* end() const { return m_data + m_celem; } + + T* insert(T* where, T &val) + { + assert(where >= m_data); + size_t idx = where - m_data; + if (m_celem >= m_max) + { + if (m_max < 2) + m_max = 2; + else + m_max = m_max + 4; + + m_data = (T*)zrealloc(m_data, sizeof(T) * m_max, MALLOC_LOCAL); + m_max = zmalloc_usable(m_data) / sizeof(T); + } + assert(idx < m_max); + where = m_data + idx; + memmove(m_data + idx + 1, m_data + idx, (m_celem - idx)*sizeof(T)); + new(m_data + idx) T(std::move(val)); + ++m_celem; + return where; + } + + T &operator[](size_t idx) + { + 
assert(idx < m_celem); + return m_data[idx]; + } + const T &operator[](size_t idx) const + { + assert(idx < m_celem); + return m_data[idx]; + } + + T& back() { assert(m_celem > 0); return m_data[m_celem-1]; } + const T& back() const { assert(m_celem > 0); return m_data[m_celem-1]; } + + void erase(T* where) + { + assert(where >= m_data); + size_t idx = where - m_data; + assert(idx < m_celem); + where->~T(); + memmove(where, where+1, ((m_celem - idx - 1)*sizeof(T))); + --m_celem; + + if (m_celem == 0) + { + zfree(m_data); + m_data = nullptr; + m_max = 0; + } + } + + void shrink_to_fit() + { + if (m_max == m_celem) + return; + m_data = (T*)zrealloc(m_data, sizeof(T) * m_celem, MALLOC_LOCAL); + m_max = m_celem; // NOTE: We do not get the usable size here, because this could cause us to continually realloc + } + + size_t bytes_used() const + { + return sizeof(this) + (m_max * sizeof(T)); + } + + void clear() + { + for (size_t idx = 0; idx < m_celem; ++idx) + m_data[idx].~T(); + zfree(m_data); + m_data = nullptr; + m_celem = 0; + m_max = 0; + } + + bool empty() const noexcept + { + return m_celem == 0; + } + + size_t size() const noexcept + { + return m_celem; + } + + T* data() noexcept { return m_data; } + const T* data() const noexcept { return m_data; } +}; +static_assert(sizeof(compactvector) <= 16, "not compact"); From 8b3d250d8fc87dd942bf6ebfe07639b2b6659896 Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 23 Jul 2019 17:49:56 -0400 Subject: [PATCH 44/76] Implement hash table stats and fixup the hash function based on the results Former-commit-id: 5a193872f8e002c97b7dc2c4bc3bab8e0478765f --- src/debug.cpp | 7 ++-- src/semiorderedset.h | 82 +++++++++++++++++++++++++++++++++----------- 2 files changed, 65 insertions(+), 24 deletions(-) diff --git a/src/debug.cpp b/src/debug.cpp index c02eba225..4d2f4bbca 100644 --- a/src/debug.cpp +++ b/src/debug.cpp @@ -638,10 +638,9 @@ NULL dictGetStats(buf,sizeof(buf),g_pserver->db[dbid].pdict); stats = sdscat(stats,buf); - 
stats = sdscatprintf(stats,"[Expires HT]\n"); - // TODO! - //dictGetStats(buf,sizeof(buf),server.db[dbid].expires); - //stats = sdscat(stats,buf); + stats = sdscatprintf(stats,"[Expires set]\n"); + g_pserver->db[dbid].setexpire->getstats(buf, sizeof(buf)); + stats = sdscat(stats, buf); addReplyBulkSds(c,stats); } else if (!strcasecmp(szFromObj(c->argv[1]),"htstats-key") && c->argc == 3) { diff --git a/src/semiorderedset.h b/src/semiorderedset.h index 12741de5b..7713d5533 100644 --- a/src/semiorderedset.h +++ b/src/semiorderedset.h @@ -13,6 +13,8 @@ * */ +extern uint64_t dictGenHashFunction(const void *key, int len); + template class semiorderedset { @@ -26,9 +28,9 @@ class semiorderedset constexpr size_t targetElementsPerBucket() { - // Aim for roughly 2 cache lines per bucket (determined by imperical testing) + // Aim for roughly 4 cache lines per bucket (determined by imperical testing) // lower values are faster but use more memory - return std::max((64/sizeof(T))*2, (size_t)2); + return std::max((64/sizeof(T))*4, (size_t)2); } public: @@ -212,28 +214,68 @@ public: return cb; } + #define DICT_STATS_VECTLEN 50 + size_t getstats(char *buf, size_t bufsize) const + { + unsigned long i, slots = 0, chainlen, maxchainlen = 0; + unsigned long totchainlen = 0; + unsigned long clvector[DICT_STATS_VECTLEN] = {0}; + size_t l = 0; + + if (empty()) { + return snprintf(buf,bufsize, + "No stats available for empty dictionaries\n"); + } + + /* Compute stats. */ + for (auto &vec : m_data) { + if (vec.empty()) { + clvector[0]++; + continue; + } + slots++; + /* For each hash entry on this slot... */ + chainlen = vec.size(); + + clvector[(chainlen < DICT_STATS_VECTLEN) ? chainlen : (DICT_STATS_VECTLEN-1)]++; + if (chainlen > maxchainlen) maxchainlen = chainlen; + totchainlen += chainlen; + } + + size_t used = m_data.size()-clvector[0]; + /* Generate human readable stats. 
*/ + l += snprintf(buf+l,bufsize-l, + "semiordered set stats:\n" + " table size: %ld\n" + " number of slots: %ld\n" + " used slots: %ld\n" + " max chain length: %ld\n" + " avg chain length (counted): %.02f\n" + " avg chain length (computed): %.02f\n" + " Chain length distribution:\n", + size(), used, slots, maxchainlen, + (float)totchainlen/slots, (float)size()/m_data.size()); + + for (i = 0; i < DICT_STATS_VECTLEN; i++) { + if (clvector[i] == 0) continue; + if (l >= bufsize) break; + l += snprintf(buf+l,bufsize-l, + " %s%ld: %ld (%.02f%%)\n", + (i == DICT_STATS_VECTLEN-1)?">= ":"", + i, clvector[i], ((float)clvector[i]/m_data.size())*100); + } + + /* Unlike snprintf(), teturn the number of characters actually written. */ + if (bufsize) buf[bufsize-1] = '\0'; + return strlen(buf); + } + private: inline size_t hashmask() const { return (1ULL << bits) - 1; } - template::value>* = nullptr> - size_t idxFromObj(TT_KEY key) + size_t idxFromObj(const T_KEY &key) { - static_assert(!std::is_pointer::value, "SFINAE isn't working"); - std::hash hash; - return hash(key) & hashmask(); - } - - - template::value>* = nullptr> - size_t idxFromObj(TT_KEY key) - { - std::hash hash; - size_t v = hash(key); - // it's legal for std::hash to literally give us back the same pointer - // in which case the lower bits will have zero entropy. Of course its also - // legal for std::hash to do what we're doing here in which case we're reducing - // lower order entropy... 
so rotate+XOR is the safest choice - v ^= (v>>3) | (v << ((sizeof(v)*8)-3)); + size_t v = (size_t)dictGenHashFunction(&key, sizeof(key)); return v & hashmask(); } From e04cff2cfe7aac5e7ca8514e2587e3da0cafb98e Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 23 Jul 2019 18:25:38 -0400 Subject: [PATCH 45/76] Support TTL stats with the new expire datastructure Former-commit-id: 271df3dad4f55f20177a8a9a065778f4943835f1 --- src/db.cpp | 15 +++++++++++++++ src/expire.cpp | 2 +- src/server.cpp | 10 +++++++++- src/server.h | 3 ++- 4 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index 45bc16f02..dd75c28e9 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -1122,11 +1122,13 @@ int dbSwapDatabases(int id1, int id2) { db1->setexpire = db2->setexpire; db1->expireitr = db2->expireitr; db1->avg_ttl = db2->avg_ttl; + db1->last_expire_set = db2->last_expire_set; db2->pdict = aux.pdict; db2->setexpire = aux.setexpire; db2->expireitr = aux.expireitr; db2->avg_ttl = aux.avg_ttl; + db2->last_expire_set = aux.last_expire_set; /* Now we need to handle clients blocked on lists: as an effect * of swapping the two DBs, a client that was waiting for list @@ -1220,6 +1222,19 @@ void setExpire(client *c, redisDb *db, robj *key, long long when) { expireEntry e((sds)dictGetKey(kde), when); ((robj*)dictGetVal(kde))->SetFExpires(true); + /* Update TTL stats (exponential moving average) */ + /* Note: We never have to update this on expiry since we reduce it by the current elapsed time here */ + long long now = g_pserver->mstime; + db->avg_ttl -= (now - db->last_expire_set); // reduce the TTL by the time that has elapsed + if (db->setexpire->empty()) + db->avg_ttl = 0; + else + db->avg_ttl -= db->avg_ttl / db->setexpire->size(); // slide one entry out the window + if (db->avg_ttl < 0) + db->avg_ttl = 0; // TTLs are never negative + db->avg_ttl += (double)(when-now) / (db->setexpire->size()+1); // add the new entry + db->last_expire_set = now; + 
db->setexpire->insert(e); int writable_slave = listLength(g_pserver->masters) && g_pserver->repl_slave_ro == 0; diff --git a/src/expire.cpp b/src/expire.cpp index 55ea83411..38a65cf44 100644 --- a/src/expire.cpp +++ b/src/expire.cpp @@ -155,8 +155,8 @@ void activeExpireCycle(int type) { /* If there is nothing to expire try next DB ASAP. */ if (db->setexpire->empty()) { - // TODO: Compute db->avg_ttl somewhere... but probably not here db->avg_ttl = 0; + db->last_expire_set = now; continue; } diff --git a/src/server.cpp b/src/server.cpp index 008459034..e6e86f6ea 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2926,6 +2926,7 @@ void initServer(void) { g_pserver->db[j].watched_keys = dictCreate(&keylistDictType,NULL); g_pserver->db[j].id = j; g_pserver->db[j].avg_ttl = 0; + g_pserver->db[j].last_expire_set = 0; g_pserver->db[j].defrag_later = listCreate(); } @@ -4566,10 +4567,17 @@ sds genRedisInfoString(const char *section) { keys = dictSize(g_pserver->db[j].pdict); vkeys = g_pserver->db[j].setexpire->size(); + + // Adjust TTL by the current time + g_pserver->db[j].avg_ttl -= (g_pserver->mstime - g_pserver->db[j].last_expire_set); + if (g_pserver->db[j].avg_ttl < 0) + g_pserver->db[j].avg_ttl = 0; + g_pserver->db[j].last_expire_set = g_pserver->mstime; + if (keys || vkeys) { info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld,avg_ttl=%lld\r\n", - j, keys, vkeys, g_pserver->db[j].avg_ttl); + j, keys, vkeys, static_cast(g_pserver->db[j].avg_ttl)); } } } diff --git a/src/server.h b/src/server.h index 74960bbab..8bc30a0f5 100644 --- a/src/server.h +++ b/src/server.h @@ -844,7 +844,8 @@ typedef struct redisDb { dict *ready_keys; /* Blocked keys that received a PUSH */ dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */ int id; /* Database ID */ - long long avg_ttl; /* Average TTL, just for stats */ + long long last_expire_set; /* when the last expire was set */ + double avg_ttl; /* Average TTL, just for stats */ list *defrag_later; /* List of key names to 
attempt to defrag one by one, gradually. */ } redisDb; From 82a3e942bf974d35a696b13e44d6d5bb736b581c Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 5 Jul 2019 23:49:09 -0400 Subject: [PATCH 46/76] New expire datastructure and algorithm. Allows us to expire in sublinear time Former-commit-id: ea3bd614b8b88b8de0b114f917fbd0de93557c72 --- src/compactvector.h | 20 ++++++++++++++++++++ src/object.cpp | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/compactvector.h b/src/compactvector.h index 65a40f114..8f9e8e74e 100644 --- a/src/compactvector.h +++ b/src/compactvector.h @@ -12,10 +12,17 @@ * *************************************************/ +<<<<<<< HEAD template class compactvector { static_assert(MEMMOVE_SAFE || std::is_trivially_copyable::value, "compactvector requires trivially copyable types"); +======= +template +class compactvector +{ + static_assert(std::is_trivially_copyable::value, "compactvector requires trivially copyable types"); +>>>>>>> New expire datastructure and algorithm. Allows us to expire in sublinear time T *m_data = nullptr; unsigned m_celem = 0; unsigned m_max = 0; @@ -26,7 +33,10 @@ public: compactvector() noexcept = default; ~compactvector() noexcept { +<<<<<<< HEAD clear(); // call dtors +======= +>>>>>>> New expire datastructure and algorithm. Allows us to expire in sublinear time zfree(m_data); } @@ -78,7 +88,11 @@ public: assert(idx < m_max); where = m_data + idx; memmove(m_data + idx + 1, m_data + idx, (m_celem - idx)*sizeof(T)); +<<<<<<< HEAD new(m_data + idx) T(std::move(val)); +======= + m_data[idx] = val; +>>>>>>> New expire datastructure and algorithm. Allows us to expire in sublinear time ++m_celem; return where; } @@ -102,7 +116,10 @@ public: assert(where >= m_data); size_t idx = where - m_data; assert(idx < m_celem); +<<<<<<< HEAD where->~T(); +======= +>>>>>>> New expire datastructure and algorithm. 
Allows us to expire in sublinear time memmove(where, where+1, ((m_celem - idx - 1)*sizeof(T))); --m_celem; @@ -129,8 +146,11 @@ public: void clear() { +<<<<<<< HEAD for (size_t idx = 0; idx < m_celem; ++idx) m_data[idx].~T(); +======= +>>>>>>> New expire datastructure and algorithm. Allows us to expire in sublinear time zfree(m_data); m_data = nullptr; m_celem = 0; diff --git a/src/object.cpp b/src/object.cpp index 900a9058c..ce6265ad1 100644 --- a/src/object.cpp +++ b/src/object.cpp @@ -1490,4 +1490,4 @@ void redisObject::setrefcount(unsigned ref) { serverAssert(!FExpires()); refcount.store(ref, std::memory_order_relaxed); -} \ No newline at end of file +} From 94645b33ddd6bf8273632a952a716a8c80efd3c3 Mon Sep 17 00:00:00 2001 From: John Sully Date: Sat, 13 Jul 2019 20:11:49 -0400 Subject: [PATCH 47/76] Initial prototype of EXPIREMEMBER command Former-commit-id: 0b3d74ea67d616a6869cbd66198c8dd7ffa72eb7 --- src/bio.cpp | 4 +- src/cluster.cpp | 2 +- src/compactvector.h | 21 ++++-- src/db.cpp | 28 +++++--- src/defrag.cpp | 7 +- src/evict.cpp | 2 +- src/expire.cpp | 127 ++++++++++++++++++++++++++++----- src/help.h | 3 + src/lazyfree.cpp | 4 +- src/module.cpp | 2 +- src/rdb.cpp | 2 +- src/semiorderedset.h | 8 +-- src/server.cpp | 6 +- src/server.h | 164 ++++++++++++++++++++++++++++++++++++------- src/t_string.cpp | 2 +- 15 files changed, 307 insertions(+), 75 deletions(-) diff --git a/src/bio.cpp b/src/bio.cpp index 844464e77..97fa7cf18 100644 --- a/src/bio.cpp +++ b/src/bio.cpp @@ -85,7 +85,7 @@ struct bio_job { void *bioProcessBackgroundJobs(void *arg); void lazyfreeFreeObjectFromBioThread(robj *o); -void lazyfreeFreeDatabaseFromBioThread(dict *ht1, semiorderedset *set); +void lazyfreeFreeDatabaseFromBioThread(dict *ht1, expireset *set); void lazyfreeFreeSlotsMapFromBioThread(rax *rt); /* Make sure we have enough stack to perform all the things we do in the @@ -196,7 +196,7 @@ void *bioProcessBackgroundJobs(void *arg) { if (job->arg1) 
lazyfreeFreeObjectFromBioThread((robj*)job->arg1); else if (job->arg2 && job->arg3) - lazyfreeFreeDatabaseFromBioThread((dict*)job->arg2,(semiorderedset*)job->arg3); + lazyfreeFreeDatabaseFromBioThread((dict*)job->arg2,(expireset*)job->arg3); else if (job->arg3) lazyfreeFreeSlotsMapFromBioThread((rax*)job->arg3); } else { diff --git a/src/cluster.cpp b/src/cluster.cpp index 79cb0972d..8978f184b 100644 --- a/src/cluster.cpp +++ b/src/cluster.cpp @@ -4949,7 +4949,7 @@ void restoreCommand(client *c) { dbAdd(c->db,c->argv[1],obj); if (ttl) { if (!absttl) ttl+=mstime(); - setExpire(c,c->db,c->argv[1],ttl); + setExpire(c,c->db,c->argv[1],nullptr,ttl); } objectSetLRUOrLFU(obj,lfu_freq,lru_idle,lru_clock); signalModifiedKey(c->db,c->argv[1]); diff --git a/src/compactvector.h b/src/compactvector.h index 8f9e8e74e..ee10a135b 100644 --- a/src/compactvector.h +++ b/src/compactvector.h @@ -12,6 +12,7 @@ * *************************************************/ +<<<<<<< HEAD <<<<<<< HEAD template class compactvector @@ -23,6 +24,12 @@ class compactvector { static_assert(std::is_trivially_copyable::value, "compactvector requires trivially copyable types"); >>>>>>> New expire datastructure and algorithm. Allows us to expire in sublinear time +======= +template +class compactvector +{ + static_assert(MEMMOVE_SAFE || std::is_trivially_copyable::value, "compactvector requires trivially copyable types"); +>>>>>>> Initial prototype of EXPIREMEMBER command T *m_data = nullptr; unsigned m_celem = 0; unsigned m_max = 0; @@ -33,10 +40,14 @@ public: compactvector() noexcept = default; ~compactvector() noexcept { +<<<<<<< HEAD <<<<<<< HEAD clear(); // call dtors ======= >>>>>>> New expire datastructure and algorithm. 
Allows us to expire in sublinear time +======= + clear(); // call dtors +>>>>>>> Initial prototype of EXPIREMEMBER command zfree(m_data); } @@ -88,11 +99,15 @@ public: assert(idx < m_max); where = m_data + idx; memmove(m_data + idx + 1, m_data + idx, (m_celem - idx)*sizeof(T)); +<<<<<<< HEAD <<<<<<< HEAD new(m_data + idx) T(std::move(val)); ======= m_data[idx] = val; >>>>>>> New expire datastructure and algorithm. Allows us to expire in sublinear time +======= + new(m_data + idx) T(std::move(val)); +>>>>>>> Initial prototype of EXPIREMEMBER command ++m_celem; return where; } @@ -116,10 +131,7 @@ public: assert(where >= m_data); size_t idx = where - m_data; assert(idx < m_celem); -<<<<<<< HEAD where->~T(); -======= ->>>>>>> New expire datastructure and algorithm. Allows us to expire in sublinear time memmove(where, where+1, ((m_celem - idx - 1)*sizeof(T))); --m_celem; @@ -146,11 +158,8 @@ public: void clear() { -<<<<<<< HEAD for (size_t idx = 0; idx < m_celem; ++idx) m_data[idx].~T(); -======= ->>>>>>> New expire datastructure and algorithm. 
Allows us to expire in sublinear time zfree(m_data); m_data = nullptr; m_celem = 0; diff --git a/src/db.cpp b/src/db.cpp index dd75c28e9..9a47b415d 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -457,7 +457,7 @@ long long emptyDb(int dbnum, int flags, void(callback)(void*)) { } else { dictEmpty(g_pserver->db[j].pdict,callback); delete g_pserver->db[j].setexpire; - g_pserver->db[j].setexpire = new (MALLOC_LOCAL) semiorderedset(); + g_pserver->db[j].setexpire = new (MALLOC_LOCAL) expireset(); g_pserver->db[j].expireitr = g_pserver->db[j].setexpire->end(); } } @@ -1006,7 +1006,7 @@ void renameGenericCommand(client *c, int nx) { dbDelete(c->db,c->argv[1]); dbAdd(c->db,c->argv[2],o); if (expire != -1) - setExpire(c,c->db,c->argv[2],expire); + setExpire(c,c->db,c->argv[2],nullptr,expire); signalModifiedKey(c->db,c->argv[1]); signalModifiedKey(c->db,c->argv[2]); notifyKeyspaceEvent(NOTIFY_GENERIC,"rename_from", @@ -1077,7 +1077,7 @@ void moveCommand(client *c) { return; } dbAdd(dst,c->argv[1],o); - if (expire != -1) setExpire(c,dst,c->argv[1],expire); + if (expire != -1) setExpire(c,dst,c->argv[1],nullptr,expire); addReply(c,shared.cone); } @@ -1201,7 +1201,7 @@ int removeExpireCore(redisDb *db, robj *key, dictEntry *de) { * of an user calling a command 'c' is the client, otherwise 'c' is set * to NULL. The 'when' parameter is the absolute unix time in milliseconds * after which the key will no longer be considered valid. */ -void setExpire(client *c, redisDb *db, robj *key, long long when) { +void setExpire(client *c, redisDb *db, robj *key, robj *subkey, long long when) { dictEntry *kde; serverAssert(GlobalLocksAcquired()); @@ -1216,12 +1216,6 @@ void setExpire(client *c, redisDb *db, robj *key, long long when) { dictSetVal(db->pdict, kde, dupStringObject((robj*)dictGetVal(kde))); } - if (((robj*)dictGetVal(kde))->FExpires()) - removeExpire(db, key); // should we optimize for when this is called with an already set expiry? 
- - expireEntry e((sds)dictGetKey(kde), when); - ((robj*)dictGetVal(kde))->SetFExpires(true); - /* Update TTL stats (exponential moving average) */ /* Note: We never have to update this on expiry since we reduce it by the current elapsed time here */ long long now = g_pserver->mstime; @@ -1235,7 +1229,19 @@ void setExpire(client *c, redisDb *db, robj *key, long long when) { db->avg_ttl += (double)(when-now) / (db->setexpire->size()+1); // add the new entry db->last_expire_set = now; - db->setexpire->insert(e); + /* Update the expire set */ + const char *szSubKey = (subkey != nullptr) ? szFromObj(subkey) : nullptr; + if (((robj*)dictGetVal(kde))->FExpires()) { + auto itr = db->setexpire->find((sds)dictGetKey(kde)); + serverAssert(itr != db->setexpire->end()); + itr->update(szSubKey, when); + } + else + { + expireEntry e((sds)dictGetKey(kde), szSubKey, when); + ((robj*)dictGetVal(kde))->SetFExpires(true); + db->setexpire->insert(e); + } int writable_slave = listLength(g_pserver->masters) && g_pserver->repl_slave_ro == 0; if (c && writable_slave && !(c->flags & CLIENT_MASTER)) diff --git a/src/defrag.cpp b/src/defrag.cpp index b11564c4b..a6acb8e72 100644 --- a/src/defrag.cpp +++ b/src/defrag.cpp @@ -48,7 +48,7 @@ extern "C" int je_get_defrag_hint(void* ptr, int *bin_util, int *run_util); /* forward declarations*/ void defragDictBucketCallback(void *privdata, dictEntry **bucketref); dictEntry* replaceSateliteDictKeyPtrAndOrDefragDictEntry(dict *d, sds oldkey, sds newkey, uint64_t hash, long *defragged); -void replaceSateliteOSetKeyPtr(semiorderedset &set, sds oldkey, sds newkey); +void replaceSateliteOSetKeyPtr(expireset &set, sds oldkey, sds newkey); /* Defrag helper for generic allocations. 
* @@ -407,11 +407,12 @@ dictEntry* replaceSateliteDictKeyPtrAndOrDefragDictEntry(dict *d, sds oldkey, sd return NULL; } -void replaceSateliteOSetKeyPtr(semiorderedset &set, sds oldkey, sds newkey) { +void replaceSateliteOSetKeyPtr(expireset &set, sds oldkey, sds newkey) { auto itr = set.find(oldkey); + serverAssert(false); if (itr != set.end()) { - expireEntry eNew(newkey, itr->when()); + expireEntry eNew(newkey, nullptr, itr->when()); set.erase(itr); set.insert(eNew); } diff --git a/src/evict.cpp b/src/evict.cpp index 4acdb5ad0..8cf24dd5e 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -252,7 +252,7 @@ struct visitFunctor return count < g_pserver->maxmemory_samples; } }; -void evictionPoolPopulate(int dbid, dict *dbdict, semiorderedset *setexpire, struct evictionPoolEntry *pool) +void evictionPoolPopulate(int dbid, dict *dbdict, expireset *setexpire, struct evictionPoolEntry *pool) { if (setexpire != nullptr) { diff --git a/src/expire.cpp b/src/expire.cpp index 38a65cf44..b6833db04 100644 --- a/src/expire.cpp +++ b/src/expire.cpp @@ -32,6 +32,21 @@ #include "server.h" +void activeExpireCycleExpireFullKey(redisDb *db, const char *key) { + robj *keyobj = createStringObject(key,sdslen(key)); + + propagateExpire(db,keyobj,g_pserver->lazyfree_lazy_expire); + if (g_pserver->lazyfree_lazy_expire) + dbAsyncDelete(db,keyobj); + else + dbSyncDelete(db,keyobj); + notifyKeyspaceEvent(NOTIFY_EXPIRED, + "expired",keyobj,db->id); + if (g_pserver->tracking_clients) trackingInvalidateKey(keyobj); + decrRefCount(keyobj); + g_pserver->stat_expiredkeys++; +} + /*----------------------------------------------------------------------------- * Incremental collection of expired keys. * @@ -51,19 +66,99 @@ * * The parameter 'now' is the current time in milliseconds as is passed * to the function to avoid too many gettimeofday() syscalls. 
*/ -void activeExpireCycleExpire(redisDb *db, const char *key) { - robj *keyobj = createStringObject(key,sdslen(key)); +void activeExpireCycleExpire(redisDb *db, expireEntry &e, long long now) { + if (!e.FFat()) + activeExpireCycleExpireFullKey(db, e.key()); - propagateExpire(db,keyobj,g_pserver->lazyfree_lazy_expire); - if (g_pserver->lazyfree_lazy_expire) - dbAsyncDelete(db,keyobj); - else - dbSyncDelete(db,keyobj); - notifyKeyspaceEvent(NOTIFY_EXPIRED, - "expired",keyobj,db->id); - if (g_pserver->tracking_clients) trackingInvalidateKey(keyobj); - decrRefCount(keyobj); - g_pserver->stat_expiredkeys++; + expireEntryFat *pfat = e.pfatentry(); + dictEntry *de = dictFind(db->pdict, e.key()); + robj *val = (robj*)dictGetVal(de); + int deleted = 0; + while (!pfat->FEmpty()) + { + if (pfat->nextExpireEntry().when > now) + break; + + // Is it the full key expiration? + if (pfat->nextExpireEntry().spsubkey == nullptr) + { + activeExpireCycleExpireFullKey(db, e.key()); + return; + } + + switch (val->type) + { + case OBJ_SET: + if (setTypeRemove(val,pfat->nextExpireEntry().spsubkey.get())) { + deleted++; + if (setTypeSize(val) == 0) { + activeExpireCycleExpireFullKey(db, e.key()); + return; + } + } + break; + case OBJ_LIST: + case OBJ_ZSET: + case OBJ_HASH: + default: + serverAssert(false); + } + pfat->popfrontExpireEntry(); + } + + if (deleted) + { + robj objT; + switch (val->type) + { + case OBJ_SET: + initStaticStringObject(objT, (char*)e.key()); + signalModifiedKey(db,&objT); + notifyKeyspaceEvent(NOTIFY_SET,"srem",&objT,db->id); + break; + } + } + + if (pfat->FEmpty()) + { + robj *keyobj = createStringObject(e.key(),sdslen(e.key())); + removeExpire(db, keyobj); + decrRefCount(keyobj); + } +} + +void expireMemberCommand(client *c) +{ + long long when; + if (getLongLongFromObjectOrReply(c, c->argv[3], &when, NULL) != C_OK) + return; + + when *= 1000; + when += mstime(); + + /* No key, return zero. 
*/ + dictEntry *de = dictFind(c->db->pdict, szFromObj(c->argv[1])); + if (de == NULL) { + addReply(c,shared.czero); + return; + } + + robj *val = (robj*)dictGetVal(de); + + switch (val->type) + { + case OBJ_SET: + // these types are safe + break; + + default: + addReplyError(c, "object type is unsupported"); + return; + } + + setExpire(c, c->db, c->argv[1], c->argv[2], when); + + addReply(c, shared.ok); } /* Try to expire a few timed out keys. The algorithm used is adaptive and @@ -162,10 +257,10 @@ void activeExpireCycle(int type) { size_t expired = 0; size_t tried = 0; - db->expireitr = db->setexpire->enumerate(db->expireitr, now, [&](const expireEntry &e) __attribute__((always_inline)) { + db->expireitr = db->setexpire->enumerate(db->expireitr, now, [&](expireEntry &e) __attribute__((always_inline)) { if (e.when() < now) { - activeExpireCycleExpire(db, e.key()); + activeExpireCycleExpire(db, e, now); ++expired; } ++tried; @@ -270,7 +365,7 @@ void expireSlaveKeys(void) { if (itr != db->setexpire->end()) { if (itr->when() < start) { - activeExpireCycleExpire(g_pserver->db+dbid,itr->key()); + activeExpireCycleExpire(g_pserver->db+dbid,*itr,start); expired = 1; } } @@ -406,7 +501,7 @@ void expireGenericCommand(client *c, long long basetime, int unit) { addReply(c, shared.cone); return; } else { - setExpire(c,c->db,key,when); + setExpire(c,c->db,key,nullptr,when); addReply(c,shared.cone); signalModifiedKey(c->db,key); notifyKeyspaceEvent(NOTIFY_GENERIC,"expire",key,c->db->id); diff --git a/src/help.h b/src/help.h index 184d76724..01b856b9d 100644 --- a/src/help.h +++ b/src/help.h @@ -343,6 +343,9 @@ struct commandHelp { "Set the expiration for a key as a UNIX timestamp", 0, "1.2.0" }, + { "EXPIREMEMBER", + "key subkey seconds", + "set a subkey's time to live in seconds"}, { "FLUSHALL", "[ASYNC]", "Remove all keys from all databases", diff --git a/src/lazyfree.cpp b/src/lazyfree.cpp index 0dbfd57d1..91577cb85 100644 --- a/src/lazyfree.cpp +++ b/src/lazyfree.cpp @@ 
-110,7 +110,7 @@ void freeObjAsync(robj *o) { void emptyDbAsync(redisDb *db) { dict *oldht1 = db->pdict; auto *set = db->setexpire; - db->setexpire = new (MALLOC_LOCAL) semiorderedset(); + db->setexpire = new (MALLOC_LOCAL) expireset(); db->expireitr = db->setexpire->end(); db->pdict = dictCreate(&dbDictType,NULL); atomicIncr(lazyfree_objects,dictSize(oldht1)); @@ -141,7 +141,7 @@ void lazyfreeFreeObjectFromBioThread(robj *o) { * when the database was logically deleted. 'sl' is a skiplist used by * Redis Cluster in order to take the hash slots -> keys mapping. This * may be NULL if Redis Cluster is disabled. */ -void lazyfreeFreeDatabaseFromBioThread(dict *ht1, semiorderedset *set) { +void lazyfreeFreeDatabaseFromBioThread(dict *ht1, expireset *set) { size_t numkeys = dictSize(ht1); dictRelease(ht1); delete set; diff --git a/src/module.cpp b/src/module.cpp index 7863ca4cf..3ef824a7f 100644 --- a/src/module.cpp +++ b/src/module.cpp @@ -1664,7 +1664,7 @@ int RM_SetExpire(RedisModuleKey *key, mstime_t expire) { return REDISMODULE_ERR; if (expire != REDISMODULE_NO_EXPIRE) { expire += mstime(); - setExpire(key->ctx->client,key->db,key->key,expire); + setExpire(key->ctx->client,key->db,key->key,nullptr,expire); } else { removeExpire(key->db,key->key); } diff --git a/src/rdb.cpp b/src/rdb.cpp index 5443ca064..c1b15e2ca 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2096,7 +2096,7 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { { /* Set the expire time if needed */ if (expiretime != -1) - setExpire(NULL,db,key,expiretime); + setExpire(NULL,db,key,nullptr,expiretime); /* Set usage information (for eviction). 
*/ objectSetLRUOrLFU(val,lfu_freq,lru_idle,lru_clock); diff --git a/src/semiorderedset.h b/src/semiorderedset.h index 7713d5533..450910c49 100644 --- a/src/semiorderedset.h +++ b/src/semiorderedset.h @@ -15,11 +15,11 @@ extern uint64_t dictGenHashFunction(const void *key, int len); -template +template class semiorderedset { friend struct setiter; - std::vector> m_data; + std::vector> m_data; size_t celem = 0; static const size_t bits_min = 8; size_t bits = bits_min; @@ -109,7 +109,7 @@ public: if (!fRehash) ++celem; - typename compactvector::iterator itrInsert; + typename compactvector::iterator itrInsert; if (!m_data[idx].empty() && !(e < m_data[idx].back())) itrInsert = m_data[idx].end(); else @@ -292,7 +292,7 @@ private: int steps = 0; for (; idxRehash < (m_data.size()/2); ++idxRehash) { - compactvector vecT; + compactvector vecT; std::swap(m_data[idxRehash], vecT); for (auto &v : vecT) diff --git a/src/server.cpp b/src/server.cpp index e6e86f6ea..2d9627c0f 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -618,6 +618,10 @@ struct redisCommand redisCommandTable[] = { "write fast @keyspace", 0,NULL,1,1,1,0,0,0}, + {"expiremember", expireMemberCommand, 4, + "write fast @keyspace", + 0,NULL,1,1,1,0,0,0}, + {"pexpire",pexpireCommand,3, "write fast @keyspace", 0,NULL,1,1,1,0,0,0}, @@ -2919,7 +2923,7 @@ void initServer(void) { /* Create the Redis databases, and initialize other internal state. 
*/ for (int j = 0; j < cserver.dbnum; j++) { g_pserver->db[j].pdict = dictCreate(&dbDictType,NULL); - g_pserver->db[j].setexpire = new(MALLOC_LOCAL) semiorderedset; + g_pserver->db[j].setexpire = new(MALLOC_LOCAL) expireset(); g_pserver->db[j].expireitr = g_pserver->db[j].setexpire->end(); g_pserver->db[j].blocking_keys = dictCreate(&keylistDictType,NULL); g_pserver->db[j].ready_keys = dictCreate(&objectKeyPointerValueDictType,NULL); diff --git a/src/server.h b/src/server.h index 8bc30a0f5..99104cf48 100644 --- a/src/server.h +++ b/src/server.h @@ -53,6 +53,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { #include @@ -767,38 +768,150 @@ __attribute__((always_inline)) inline char *szFromObj(const robj *o) return (char*)ptrFromObj(o); } -class expireEntry { - sds m_key; - long long m_when; +class expireEntryFat +{ +public: + struct subexpireEntry + { + long long when; + std::unique_ptr spsubkey; + + subexpireEntry(long long when, const char *subkey) + : when(when), spsubkey(subkey, sdsfree) + {} + + bool operator<(long long when) const noexcept { return this->when < when; } + bool operator<(const subexpireEntry &se) { return this->when < se.when; } + }; + +private: + sds m_keyPrimary; + std::vector m_vecexpireEntries; // Note a NULL for the sds portion means the expire is for the primary key public: - expireEntry(sds key, long long when) + expireEntryFat(sds keyPrimary) + : m_keyPrimary(keyPrimary) + {} + long long when() const noexcept { return m_vecexpireEntries.front().when; } + const char *key() const noexcept { return m_keyPrimary; } + + bool operator<(long long when) const noexcept { return this->when() < when; } + + void expireSubKey(const char *szSubkey, long long when) { - m_key = key; - m_when = when; + auto itrInsert = std::lower_bound(m_vecexpireEntries.begin(), m_vecexpireEntries.end(), when); + m_vecexpireEntries.emplace(itrInsert, when, sdsdup(szSubkey)); } - bool operator!=(const expireEntry &e) const noexcept - { - return 
m_when != e.m_when || m_key != e.m_key; - } - bool operator==(const expireEntry &e) const noexcept - { - return m_when == e.m_when && m_key == e.m_key; - } - bool operator==(const char *key) const noexcept { return m_key == key; } + bool FEmpty() const noexcept { return m_vecexpireEntries.empty(); } + const subexpireEntry &nextExpireEntry() const noexcept { return m_vecexpireEntries.front(); } + void popfrontExpireEntry() { m_vecexpireEntries.erase(m_vecexpireEntries.begin()); } - bool operator<(const expireEntry &e) const noexcept { return m_when < e.m_when; } - bool operator<(const char *key) const noexcept { return m_key < key; } - bool operator<(long long when) const noexcept { return m_when < when; } +}; - const char *key() const noexcept { return m_key; } - long long when() const noexcept { return m_when; } +class expireEntry { + union + { + sds m_key; + expireEntryFat *m_pfatentry; + } u; + long long m_when; // LLONG_MIN means this is a fat entry and we should use the pointer + +public: + expireEntry(sds key, const char *subkey, long long when) + { + if (subkey != nullptr) + { + m_when = LLONG_MIN; + u.m_pfatentry = new (MALLOC_LOCAL) expireEntryFat(key); + u.m_pfatentry->expireSubKey(subkey, when); + } + else + { + u.m_key = key; + m_when = when; + } + } + + expireEntry(expireEntryFat *pfatentry) + { + u.m_pfatentry = pfatentry; + m_when = LLONG_MIN; + } + + expireEntry(expireEntry &&e) + { + u.m_key = e.u.m_key; + m_when = e.m_when; + e.u.m_key = nullptr; + e.m_when = 0; + } + + ~expireEntry() + { + if (FFat()) + delete u.m_pfatentry; + } + + inline bool FFat() const noexcept { return m_when == LLONG_MIN; } + expireEntryFat *pfatentry() { assert(FFat()); return u.m_pfatentry; } + + + bool operator==(const char *key) const noexcept + { + return this->key() == key; + } + + bool operator<(const expireEntry &e) const noexcept + { + return when() < e.when(); + } + bool operator<(long long when) const noexcept + { + return this->when() < when; + } + + const char 
*key() const noexcept + { + if (FFat()) + return u.m_pfatentry->key(); + return u.m_key; + } + long long when() const noexcept + { + if (FFat()) + return u.m_pfatentry->when(); + return m_when; + } + + void update(const char *subkey, long long when) + { + if (!FFat()) + { + if (subkey == nullptr) + { + m_when = when; + return; + } + else + { + // we have to upgrade to a fat entry + long long whenT = m_when; + sds keyPrimary = u.m_key; + m_when = LLONG_MIN; + u.m_pfatentry = new (MALLOC_LOCAL) expireEntryFat(keyPrimary); + u.m_pfatentry->expireSubKey(nullptr, whenT); + // at this point we're fat so fall through + } + } + u.m_pfatentry->expireSubKey(subkey, when); + } - explicit operator const char*() const noexcept { return m_key; } - explicit operator long long() const noexcept { return m_when; } + explicit operator const char*() const noexcept { return key(); } + explicit operator long long() const noexcept { return when(); } }; +typedef semiorderedset expireset; /* The a string name for an object's type as listed above * Native types are checked against the OBJ_STRING, OBJ_LIST, OBJ_* defines, @@ -837,8 +950,8 @@ typedef struct clientReplyBlock { * database. The database number is the 'id' field in the structure. 
*/ typedef struct redisDb { dict *pdict; /* The keyspace for this DB */ - semiorderedset *setexpire; - semiorderedset::setiter expireitr; + expireset *setexpire; + expireset::setiter expireitr; dict *blocking_keys; /* Keys with clients waiting for data (BLPOP)*/ dict *ready_keys; /* Blocked keys that received a PUSH */ @@ -2225,7 +2338,7 @@ int removeExpireCore(redisDb *db, robj *key, dictEntry *de); void propagateExpire(redisDb *db, robj *key, int lazy); int expireIfNeeded(redisDb *db, robj *key); long long getExpire(redisDb *db, robj_roptr key); -void setExpire(client *c, redisDb *db, robj *key, long long when); +void setExpire(client *c, redisDb *db, robj *key, robj *subkey, long long when); robj_roptr lookupKeyRead(redisDb *db, robj *key); robj *lookupKeyWrite(redisDb *db, robj *key); robj_roptr lookupKeyReadOrReply(client *c, robj *key, robj *reply); @@ -2420,6 +2533,7 @@ void mgetCommand(client *c); void monitorCommand(client *c); void expireCommand(client *c); void expireatCommand(client *c); +void expireMemberCommand(client *c); void pexpireCommand(client *c); void pexpireatCommand(client *c); void getsetCommand(client *c); diff --git a/src/t_string.cpp b/src/t_string.cpp index a254f4f53..8b79097c0 100644 --- a/src/t_string.cpp +++ b/src/t_string.cpp @@ -85,7 +85,7 @@ void setGenericCommand(client *c, int flags, robj *key, robj *val, robj *expire, } setKey(c->db,key,val); g_pserver->dirty++; - if (expire) setExpire(c,c->db,key,mstime()+milliseconds); + if (expire) setExpire(c,c->db,key,nullptr,mstime()+milliseconds); notifyKeyspaceEvent(NOTIFY_STRING,"set",key,c->db->id); if (expire) notifyKeyspaceEvent(NOTIFY_GENERIC, "expire",key,c->db->id); From 95371d60fe6ecc471a12caba5aadef0f77292378 Mon Sep 17 00:00:00 2001 From: John Sully Date: Sat, 13 Jul 2019 20:35:44 -0400 Subject: [PATCH 48/76] Fix crash with traditional expiration Former-commit-id: 0ba5b2c3d66d3a1a520f223ad2c288c22601bd5a --- src/db.cpp | 5 ++++- src/expire.cpp | 3 +++ 2 files changed, 7 
insertions(+), 1 deletion(-) diff --git a/src/db.cpp b/src/db.cpp index 9a47b415d..40e44a7c1 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -1234,7 +1234,10 @@ void setExpire(client *c, redisDb *db, robj *key, robj *subkey, long long when) if (((robj*)dictGetVal(kde))->FExpires()) { auto itr = db->setexpire->find((sds)dictGetKey(kde)); serverAssert(itr != db->setexpire->end()); - itr->update(szSubKey, when); + expireEntry eNew(std::move(*itr)); + eNew.update(szSubKey, when); + db->setexpire->erase(itr); + db->setexpire->insert(eNew); } else { diff --git a/src/expire.cpp b/src/expire.cpp index b6833db04..c10047d2c 100644 --- a/src/expire.cpp +++ b/src/expire.cpp @@ -68,7 +68,10 @@ void activeExpireCycleExpireFullKey(redisDb *db, const char *key) { * to the function to avoid too many gettimeofday() syscalls. */ void activeExpireCycleExpire(redisDb *db, expireEntry &e, long long now) { if (!e.FFat()) + { activeExpireCycleExpireFullKey(db, e.key()); + return; + } expireEntryFat *pfat = e.pfatentry(); dictEntry *de = dictFind(db->pdict, e.key()); From e06c38f1d304273ba3a09ce1dc54e79f1394ba59 Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 14 Jul 2019 00:23:31 -0400 Subject: [PATCH 49/76] Plumb support for sub expires to all expire related code Former-commit-id: 184abac6942a9a6aa8783741b50b23210afddcc5 --- src/aof.cpp | 25 ++++++++++---- src/cluster.cpp | 5 ++- src/db.cpp | 87 ++++++++++++++++++++++++++++++++++++++++--------- src/debug.cpp | 6 +++- src/defrag.cpp | 4 +-- src/expire.cpp | 7 ++-- src/module.cpp | 6 +++- src/rdb.cpp | 28 +++++++++++----- src/server.h | 71 ++++++++++++++++++++++++++++++++++++++-- 9 files changed, 198 insertions(+), 41 deletions(-) diff --git a/src/aof.cpp b/src/aof.cpp index c7160489b..5c6385c84 100644 --- a/src/aof.cpp +++ b/src/aof.cpp @@ -1321,13 +1321,12 @@ int rewriteAppendOnlyFileRio(rio *aof) { while((de = dictNext(di)) != NULL) { sds keystr; robj key, *o; - long long expiretime; keystr = (sds)dictGetKey(de); o = 
(robj*)dictGetVal(de); initStaticStringObject(key,keystr); - expiretime = getExpire(db,&key); + expireEntry *pexpire = getExpire(db,&key); /* Save the key and associated value */ if (o->type == OBJ_STRING) { @@ -1353,11 +1352,23 @@ int rewriteAppendOnlyFileRio(rio *aof) { serverPanic("Unknown object type"); } /* Save the expire time */ - if (expiretime != -1) { - char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n"; - if (rioWrite(aof,cmd,sizeof(cmd)-1) == 0) goto werr; - if (rioWriteBulkObject(aof,&key) == 0) goto werr; - if (rioWriteBulkLongLong(aof,expiretime) == 0) goto werr; + if (pexpire != nullptr) { + for (auto &subExpire : *pexpire) { + if (subExpire.subkey() == nullptr) + { + char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n"; + if (rioWrite(aof,cmd,sizeof(cmd)-1) == 0) goto werr; + if (rioWriteBulkObject(aof,&key) == 0) goto werr; + } + else + { + char cmd[]="*4\r\n$12\r\nEXPIREMEMBER\r\n"; + if (rioWrite(aof,cmd,sizeof(cmd)-1) == 0) goto werr; + if (rioWriteBulkObject(aof,&key) == 0) goto werr; + if (rioWrite(aof,subExpire.subkey(),sdslen(subExpire.subkey())) == 0) goto werr; + } + if (rioWriteBulkLongLong(aof,subExpire.when()) == 0) goto werr; // common + } } /* Read some diff from the parent process from time to time. */ if (aof->processed_bytes > processed+AOF_READ_DIFF_INTERVAL_BYTES) { diff --git a/src/cluster.cpp b/src/cluster.cpp index 8978f184b..619ce3b3a 100644 --- a/src/cluster.cpp +++ b/src/cluster.cpp @@ -5194,7 +5194,10 @@ try_again: /* Create RESTORE payload and generate the protocol to call the command. 
*/ for (j = 0; j < num_keys; j++) { long long ttl = 0; - long long expireat = getExpire(c->db,kv[j]); + expireEntry *pexpire = getExpire(c->db,kv[j]); + long long expireat = -1; + if (pexpire != nullptr) + pexpire->FGetPrimaryExpire(&expireat); if (expireat != -1) { ttl = expireat-mstime(); diff --git a/src/db.cpp b/src/db.cpp index 40e44a7c1..a3dec518a 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -976,7 +976,6 @@ void shutdownCommand(client *c) { void renameGenericCommand(client *c, int nx) { robj *o; - long long expire; int samekey = 0; /* When source and dest key is the same, no operation is performed, @@ -992,7 +991,15 @@ void renameGenericCommand(client *c, int nx) { } incrRefCount(o); - expire = getExpire(c->db,c->argv[1]); + + std::unique_ptr spexpire; + + { // scope pexpireOld since it will be invalid soon + expireEntry *pexpireOld = getExpire(c->db,c->argv[1]); + if (pexpireOld != nullptr) + spexpire = std::make_unique(std::move(*pexpireOld)); + } + if (lookupKeyWrite(c->db,c->argv[2]) != NULL) { if (nx) { decrRefCount(o); @@ -1005,8 +1012,8 @@ void renameGenericCommand(client *c, int nx) { } dbDelete(c->db,c->argv[1]); dbAdd(c->db,c->argv[2],o); - if (expire != -1) - setExpire(c,c->db,c->argv[2],nullptr,expire); + if (spexpire != nullptr) + setExpire(c,c->db,c->argv[2],std::move(*spexpire)); signalModifiedKey(c->db,c->argv[1]); signalModifiedKey(c->db,c->argv[2]); notifyKeyspaceEvent(NOTIFY_GENERIC,"rename_from", @@ -1029,7 +1036,7 @@ void moveCommand(client *c) { robj *o; redisDb *src, *dst; int srcid; - long long dbid, expire; + long long dbid; if (g_pserver->cluster_enabled) { addReplyError(c,"MOVE is not allowed in cluster mode"); @@ -1063,7 +1070,13 @@ void moveCommand(client *c) { addReply(c,shared.czero); return; } - expire = getExpire(c->db,c->argv[1]); + + std::unique_ptr spexpire; + { // scope pexpireOld + expireEntry *pexpireOld = getExpire(c->db,c->argv[1]); + if (pexpireOld != nullptr) + spexpire = std::make_unique(std::move(*pexpireOld)); + } 
if (o->FExpires()) removeExpire(c->db,c->argv[1]); serverAssert(!o->FExpires()); @@ -1077,7 +1090,7 @@ void moveCommand(client *c) { return; } dbAdd(dst,c->argv[1],o); - if (expire != -1) setExpire(c,dst,c->argv[1],nullptr,expire); + if (spexpire != nullptr) setExpire(c,dst,c->argv[1],std::move(*spexpire)); addReply(c,shared.cone); } @@ -1251,24 +1264,53 @@ void setExpire(client *c, redisDb *db, robj *key, robj *subkey, long long when) rememberSlaveKeyWithExpire(db,key); } -/* Return the expire time of the specified key, or -1 if no expire +void setExpire(client *c, redisDb *db, robj *key, expireEntry &&e) +{ + dictEntry *kde; + + serverAssert(GlobalLocksAcquired()); + + /* Reuse the sds from the main dict in the expire dict */ + kde = dictFind(db->pdict,ptrFromObj(key)); + serverAssertWithInfo(NULL,key,kde != NULL); + + if (((robj*)dictGetVal(kde))->refcount == OBJ_SHARED_REFCOUNT) + { + // shared objects cannot have the expire bit set, create a real object + dictSetVal(db->pdict, kde, dupStringObject((robj*)dictGetVal(kde))); + } + + if (((robj*)dictGetVal(kde))->FExpires()) + removeExpire(db, key); + + e.setKeyUnsafe((sds)dictGetKey(kde)); + db->setexpire->insert(e); + ((robj*)dictGetVal(kde))->SetFExpires(true); + + + int writable_slave = listLength(g_pserver->masters) && g_pserver->repl_slave_ro == 0; + if (c && writable_slave && !(c->flags & CLIENT_MASTER)) + rememberSlaveKeyWithExpire(db,key); +} + +/* Return the expire time of the specified key, or null if no expire * is associated with this key (i.e. the key is non volatile) */ -long long getExpire(redisDb *db, robj_roptr key) { +expireEntry *getExpire(redisDb *db, robj_roptr key) { dictEntry *de; /* No expire? 
return ASAP */ if (db->setexpire->size() == 0) - return -1; + return nullptr; de = dictFind(db->pdict, ptrFromObj(key)); if (de == NULL) - return -1; + return nullptr; robj *obj = (robj*)dictGetVal(de); if (!obj->FExpires()) - return -1; + return nullptr; auto itr = db->setexpire->find((sds)dictGetKey(de)); - return itr->when(); + return itr.operator->(); } /* Propagate expires into slaves and the AOF file. @@ -1296,15 +1338,28 @@ void propagateExpire(redisDb *db, robj *key, int lazy) { decrRefCount(argv[1]); } -/* Check if the key is expired. */ +/* Check if the key is expired. Note, this does not check subexpires */ int keyIsExpired(redisDb *db, robj *key) { - mstime_t when = getExpire(db,key); + expireEntry *pexpire = getExpire(db,key); - if (when < 0) return 0; /* No expire for this key */ + if (pexpire == nullptr) return 0; /* No expire for this key */ /* Don't expire anything while loading. It will be done later. */ if (g_pserver->loading) return 0; + long long when = -1; + for (auto &exp : *pexpire) + { + if (exp.subkey() == nullptr) + { + when = exp.when(); + break; + } + } + + if (when == -1) + return 0; + /* If we are in the context of a Lua script, we pretend that time is * blocked to when the Lua script started. 
This way a key can expire * only the first time it is accessed and not in the middle of the diff --git a/src/debug.cpp b/src/debug.cpp index 4d2f4bbca..41c73b55c 100644 --- a/src/debug.cpp +++ b/src/debug.cpp @@ -124,9 +124,13 @@ void mixStringObjectDigest(unsigned char *digest, robj_roptr o) { void xorObjectDigest(redisDb *db, robj_roptr keyobj, unsigned char *digest, robj_roptr o) { uint32_t aux = htonl(o->type); mixDigest(digest,&aux,sizeof(aux)); - long long expiretime = getExpire(db,keyobj); + expireEntry *pexpire = getExpire(db,keyobj); + long long expiretime = -1; char buf[128]; + if (pexpire != nullptr) + pexpire->FGetPrimaryExpire(&expiretime); + /* Save the key and associated value */ if (o->type == OBJ_STRING) { mixStringObjectDigest(digest,o); diff --git a/src/defrag.cpp b/src/defrag.cpp index a6acb8e72..c49cd2665 100644 --- a/src/defrag.cpp +++ b/src/defrag.cpp @@ -409,10 +409,10 @@ dictEntry* replaceSateliteDictKeyPtrAndOrDefragDictEntry(dict *d, sds oldkey, sd void replaceSateliteOSetKeyPtr(expireset &set, sds oldkey, sds newkey) { auto itr = set.find(oldkey); - serverAssert(false); if (itr != set.end()) { - expireEntry eNew(newkey, nullptr, itr->when()); + expireEntry eNew(std::move(*itr)); + eNew.setKeyUnsafe(newkey); set.erase(itr); set.insert(eNew); } diff --git a/src/expire.cpp b/src/expire.cpp index c10047d2c..5d257428d 100644 --- a/src/expire.cpp +++ b/src/expire.cpp @@ -535,7 +535,7 @@ void pexpireatCommand(client *c) { /* Implements TTL and PTTL */ void ttlGenericCommand(client *c, int output_ms) { - long long expire, ttl = -1; + long long expire = -1, ttl = -1; /* If the key does not exist at all, return -2 */ if (lookupKeyReadWithFlags(c->db,c->argv[1],LOOKUP_NOTOUCH) == nullptr) { @@ -544,7 +544,10 @@ void ttlGenericCommand(client *c, int output_ms) { } /* The key exists. Return -1 if it has no expire, or the actual * TTL value otherwise. 
*/ - expire = getExpire(c->db,c->argv[1]); + expireEntry *pexpire = getExpire(c->db,c->argv[1]); + if (pexpire != nullptr) + pexpire->FGetPrimaryExpire(&expire); + if (expire != -1) { ttl = expire-mstime(); if (ttl < 0) ttl = 0; diff --git a/src/module.cpp b/src/module.cpp index 3ef824a7f..052c8744a 100644 --- a/src/module.cpp +++ b/src/module.cpp @@ -1644,7 +1644,11 @@ int RM_UnlinkKey(RedisModuleKey *key) { * If no TTL is associated with the key or if the key is empty, * REDISMODULE_NO_EXPIRE is returned. */ mstime_t RM_GetExpire(RedisModuleKey *key) { - mstime_t expire = getExpire(key->db,key->key); + expireEntry *pexpire = getExpire(key->db,key->key); + mstime_t expire = -1; + if (pexpire != nullptr) + pexpire->FGetPrimaryExpire(&expire); + if (expire == -1 || key->value == NULL) return -1; expire -= mstime(); return expire >= 0 ? expire : 0; diff --git a/src/rdb.cpp b/src/rdb.cpp index c1b15e2ca..bed797305 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -1031,12 +1031,13 @@ size_t rdbSavedObjectLen(robj *o) { * On error -1 is returned. * On success if the key was actually saved 1 is returned, otherwise 0 * is returned (the key was already expired). 
*/ -int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, long long expiretime) { +int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, expireEntry *pexpire) { int savelru = g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LRU; int savelfu = g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LFU; /* Save the expire time */ - if (expiretime != -1) { + long long expiretime = -1; + if (pexpire != nullptr && pexpire->FGetPrimaryExpire(&expiretime)) { if (rdbSaveType(rdb,RDB_OPCODE_EXPIRETIME_MS) == -1) return -1; if (rdbSaveMillisecondTime(rdb,expiretime) == -1) return -1; } @@ -1061,9 +1062,21 @@ int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, long long expiretime) { if (rdbWriteRaw(rdb,buf,1) == -1) return -1; } - char szMvcc[32]; - snprintf(szMvcc, 32, "%" PRIu64, val->mvcc_tstamp); - if (rdbSaveAuxFieldStrStr(rdb,"mvcc-tstamp", szMvcc) == -1) return -1; + char szT[32]; + snprintf(szT, 32, "%" PRIu64, val->mvcc_tstamp); + if (rdbSaveAuxFieldStrStr(rdb,"mvcc-tstamp", szT) == -1) return -1; + + if (pexpire != nullptr) + { + for (auto itr : *pexpire) + { + if (itr.subkey() == nullptr) + continue; // already saved + snprintf(szT, 32, "%lld", itr.when()); + rdbSaveAuxFieldStrStr(rdb,"keydb-subexpire-key",itr.subkey()); + rdbSaveAuxFieldStrStr(rdb,"keydb-subexpire-when",szT); + } + } /* Save type, key, value */ if (rdbSaveObjectType(rdb,val) == -1) return -1; @@ -1099,12 +1112,11 @@ int rdbSaveInfoAuxFields(rio *rdb, int flags, rdbSaveInfo *rsi) { int saveKey(rio *rdb, redisDb *db, int flags, size_t *processed, const char *keystr, robj *o) { robj key; - long long expire; initStaticStringObject(key,(char*)keystr); - expire = getExpire(db, &key); + expireEntry *pexpire = getExpire(db, &key); - if (rdbSaveKeyValuePair(rdb,&key,o,expire) == -1) + if (rdbSaveKeyValuePair(rdb,&key,o,pexpire) == -1) return 0; /* When this RDB is produced as part of an AOF rewrite, move diff --git a/src/server.h b/src/server.h index 99104cf48..f350410d0 100644 --- a/src/server.h +++ 
b/src/server.h @@ -770,6 +770,7 @@ __attribute__((always_inline)) inline char *szFromObj(const robj *o) class expireEntryFat { + friend class expireEntry; public: struct subexpireEntry { @@ -806,7 +807,8 @@ public: bool FEmpty() const noexcept { return m_vecexpireEntries.empty(); } const subexpireEntry &nextExpireEntry() const noexcept { return m_vecexpireEntries.front(); } void popfrontExpireEntry() { m_vecexpireEntries.erase(m_vecexpireEntries.begin()); } - + const subexpireEntry &operator[](size_t idx) { return m_vecexpireEntries[idx]; } + size_t size() const noexcept { return m_vecexpireEntries.size(); } }; class expireEntry { @@ -818,6 +820,39 @@ class expireEntry { long long m_when; // LLONG_MIN means this is a fat entry and we should use the pointer public: + class iter + { + expireEntry *m_pentry = nullptr; + size_t m_idx = 0; + + public: + iter(expireEntry *pentry, size_t idx) + : m_pentry(pentry), m_idx(idx) + {} + + iter &operator++() { ++m_idx; return *this; } + + const char *subkey() const + { + if (m_pentry->FFat()) + return (*m_pentry->pfatentry())[m_idx].spsubkey.get(); + return nullptr; + } + long long when() const + { + if (m_pentry->FFat()) + return (*m_pentry->pfatentry())[m_idx].when; + return m_pentry->when(); + } + + bool operator!=(const iter &other) + { + return m_idx != other.m_idx; + } + + const iter &operator*() const { return *this; } + }; + expireEntry(sds key, const char *subkey, long long when) { if (subkey != nullptr) @@ -843,7 +878,7 @@ public: { u.m_key = e.u.m_key; m_when = e.m_when; - e.u.m_key = nullptr; + e.u.m_key = (char*)key(); // we do this so it can still be found in the set e.m_when = 0; } @@ -853,6 +888,14 @@ public: delete u.m_pfatentry; } + void setKeyUnsafe(sds key) + { + if (FFat()) + u.m_pfatentry->m_keyPrimary = key; + else + u.m_key = key; + } + inline bool FFat() const noexcept { return m_when == LLONG_MIN; } expireEntryFat *pfatentry() { assert(FFat()); return u.m_pfatentry; } @@ -907,6 +950,27 @@ public: 
u.m_pfatentry->expireSubKey(subkey, when); } + iter begin() { return iter(this, 0); } + iter end() + { + if (FFat()) + return iter(this, u.m_pfatentry->size()); + return iter(this, 1); + } + + bool FGetPrimaryExpire(long long *pwhen) + { + *pwhen = -1; + for (auto itr : *this) + { + if (itr.subkey() == nullptr) + { + *pwhen = itr.when(); + return true; + } + } + return false; + } explicit operator const char*() const noexcept { return key(); } explicit operator long long() const noexcept { return when(); } @@ -2337,8 +2401,9 @@ int removeExpire(redisDb *db, robj *key); int removeExpireCore(redisDb *db, robj *key, dictEntry *de); void propagateExpire(redisDb *db, robj *key, int lazy); int expireIfNeeded(redisDb *db, robj *key); -long long getExpire(redisDb *db, robj_roptr key); +expireEntry *getExpire(redisDb *db, robj_roptr key); void setExpire(client *c, redisDb *db, robj *key, robj *subkey, long long when); +void setExpire(client *c, redisDb *db, robj *key, expireEntry &&entry); robj_roptr lookupKeyRead(redisDb *db, robj *key); robj *lookupKeyWrite(redisDb *db, robj *key); robj_roptr lookupKeyReadOrReply(client *c, robj *key, robj *reply); From 4ec97fdb460e3825e33b000b533a5cdf23351ac2 Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 18 Jul 2019 21:52:19 -0400 Subject: [PATCH 50/76] Fix merge conflict Former-commit-id: 0b43b51a2e3a6af11532146fbb7929f3ecf3b036 --- src/db.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/db.cpp b/src/db.cpp index a3dec518a..f6607b40e 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -1274,7 +1274,7 @@ void setExpire(client *c, redisDb *db, robj *key, expireEntry &&e) kde = dictFind(db->pdict,ptrFromObj(key)); serverAssertWithInfo(NULL,key,kde != NULL); - if (((robj*)dictGetVal(kde))->refcount == OBJ_SHARED_REFCOUNT) + if (((robj*)dictGetVal(kde))->getrefcount(std::memory_order_relaxed) == OBJ_SHARED_REFCOUNT) { // shared objects cannot have the expire bit set, create a real object dictSetVal(db->pdict, kde, 
dupStringObject((robj*)dictGetVal(kde))); From 9f42bb5d910c97ff6862b3a14dc55be46e076095 Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 18 Jul 2019 21:57:17 -0400 Subject: [PATCH 51/76] Fix buggy rebase Former-commit-id: 6037d1f326116e5aae56be9a73a8f9ca68a45bbe --- src/compactvector.h | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/src/compactvector.h b/src/compactvector.h index ee10a135b..65a40f114 100644 --- a/src/compactvector.h +++ b/src/compactvector.h @@ -12,24 +12,10 @@ * *************************************************/ -<<<<<<< HEAD -<<<<<<< HEAD template class compactvector { static_assert(MEMMOVE_SAFE || std::is_trivially_copyable::value, "compactvector requires trivially copyable types"); -======= -template -class compactvector -{ - static_assert(std::is_trivially_copyable::value, "compactvector requires trivially copyable types"); ->>>>>>> New expire datastructure and algorithm. Allows us to expire in sublinear time -======= -template -class compactvector -{ - static_assert(MEMMOVE_SAFE || std::is_trivially_copyable::value, "compactvector requires trivially copyable types"); ->>>>>>> Initial prototype of EXPIREMEMBER command T *m_data = nullptr; unsigned m_celem = 0; unsigned m_max = 0; @@ -40,14 +26,7 @@ public: compactvector() noexcept = default; ~compactvector() noexcept { -<<<<<<< HEAD -<<<<<<< HEAD clear(); // call dtors -======= ->>>>>>> New expire datastructure and algorithm. Allows us to expire in sublinear time -======= - clear(); // call dtors ->>>>>>> Initial prototype of EXPIREMEMBER command zfree(m_data); } @@ -99,15 +78,7 @@ public: assert(idx < m_max); where = m_data + idx; memmove(m_data + idx + 1, m_data + idx, (m_celem - idx)*sizeof(T)); -<<<<<<< HEAD -<<<<<<< HEAD new(m_data + idx) T(std::move(val)); -======= - m_data[idx] = val; ->>>>>>> New expire datastructure and algorithm. 
Allows us to expire in sublinear time -======= - new(m_data + idx) T(std::move(val)); ->>>>>>> Initial prototype of EXPIREMEMBER command ++m_celem; return where; } From 9ba5270bdac9ec87ca42b46d12c0e18c9f98909b Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 18 Jul 2019 22:31:20 -0400 Subject: [PATCH 52/76] Subexpire entries should load/save Former-commit-id: a55d98043655473ecdd53db2927381635eefc0b8 --- src/rdb.cpp | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/src/rdb.cpp b/src/rdb.cpp index bed797305..97ade6d1f 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -1066,6 +1066,13 @@ int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, expireEntry *pexpire) { snprintf(szT, 32, "%" PRIu64, val->mvcc_tstamp); if (rdbSaveAuxFieldStrStr(rdb,"mvcc-tstamp", szT) == -1) return -1; + /* Save type, key, value */ + if (rdbSaveObjectType(rdb,val) == -1) return -1; + if (rdbSaveStringObject(rdb,key) == -1) return -1; + if (rdbSaveObject(rdb,val,key) == -1) return -1; + + /* Save expire entry after as it will apply to the previously loaded key */ + /* This is because we update the expire datastructure directly without buffering */ if (pexpire != nullptr) { for (auto itr : *pexpire) @@ -1078,10 +1085,6 @@ int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, expireEntry *pexpire) { } } - /* Save type, key, value */ - if (rdbSaveObjectType(rdb,val) == -1) return -1; - if (rdbSaveStringObject(rdb,key) == -1) return -1; - if (rdbSaveObject(rdb,val,key) == -1) return -1; return 1; } @@ -1919,6 +1922,8 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { long long lru_idle = -1, lfu_freq = -1, expiretime = -1, now = mstime(); long long lru_clock = 0; uint64_t mvcc_tstamp = OBJ_MVCC_INVALID; + robj *subexpireKey = nullptr; + robj *key = nullptr; rdb->update_cksum = rdbLoadProgressCallback; rdb->max_processing_chunk = g_pserver->loading_process_events_interval_bytes; @@ -1940,7 +1945,7 @@ int 
rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { lru_clock = LRU_CLOCK(); while(1) { - robj *key, *val; + robj *val; /* Read type. */ if ((type = rdbLoadType(rdb)) == -1) goto eoferr; @@ -2048,6 +2053,18 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { } else if (!strcasecmp(szFromObj(auxkey),"mvcc-tstamp")) { static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "Ensure long long is 64-bits"); mvcc_tstamp = strtoull(szFromObj(auxval), nullptr, 10); + } else if (!strcasecmp(szFromObj(auxkey), "keydb-subexpire-key")) { + subexpireKey = auxval; + incrRefCount(subexpireKey); + } else if (!strcasecmp(szFromObj(auxkey), "keydb-subexpire-when")) { + if (key == nullptr || subexpireKey == nullptr) { + serverLog(LL_WARNING, "Corrupt subexpire entry in RDB skipping."); + } + else { + setExpire(NULL, db, key, subexpireKey, strtoll(szFromObj(auxval), nullptr, 10)); + decrRefCount(subexpireKey); + subexpireKey = nullptr; + } } else { /* We ignore fields we don't understand, as by AUX field * contract. */ @@ -2089,6 +2106,9 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { } /* Read key */ + if (key != nullptr) + decrRefCount(key); + if ((key = rdbLoadStringObject(rdb)) == NULL) goto eoferr; /* Read value */ if ((val = rdbLoadObject(type,rdb,key, mvcc_tstamp)) == NULL) goto eoferr; @@ -2102,7 +2122,7 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { decrRefCount(val); } else { /* Add the new object in the hash table */ - int fInserted = dbMerge(db, key, val, rsi->fForceSetKey); + int fInserted = dbMerge(db, key, val, rsi->fForceSetKey); // Note: dbMerge will incrRef if (fInserted) { @@ -2112,14 +2132,9 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { /* Set usage information (for eviction). */ objectSetLRUOrLFU(val,lfu_freq,lru_idle,lru_clock); - - /* Decrement the key refcount since dbMerge() will take its - * own reference. 
*/ - decrRefCount(key); } else { - decrRefCount(key); decrRefCount(val); } } @@ -2130,6 +2145,16 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { lfu_freq = -1; lru_idle = -1; } + + if (key != nullptr) + decrRefCount(key); + + if (subexpireKey != nullptr) + { + serverLog(LL_WARNING, "Corrupt subexpire entry in RDB."); + decrRefCount(subexpireKey); + subexpireKey = nullptr; + } /* Verify the checksum if RDB version is >= 5 */ if (rdbver >= 5) { From a0d7eb5a196a30716b1c29a017f668c0d39c3cea Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 23 Jul 2019 23:39:47 -0400 Subject: [PATCH 53/76] We can't set FExpire() on shared objects, issue #66 Former-commit-id: 2794cfced4fdb18c0860e966dde0b46b9584c4dc --- src/db.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index f6607b40e..cc67acd05 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -231,17 +231,24 @@ void dbOverwriteCore(redisDb *db, dictEntry *de, robj *key, robj *val, bool fUpd robj *old = (robj*)dictGetVal(de); if (old->FExpires()) { - if (fRemoveExpire) + if (fRemoveExpire) { removeExpire(db, key); - else + } + else { + if (val->getrefcount(std::memory_order_relaxed) == OBJ_SHARED_REFCOUNT) + val = dupStringObject(val); updateExpire(db, (sds)dictGetKey(de), old, val); + } } if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LFU) { val->lru = old->lru; } - if (fUpdateMvcc) + if (fUpdateMvcc) { + if (val->getrefcount(std::memory_order_relaxed) == OBJ_SHARED_REFCOUNT) + val = dupStringObject(val); val->mvcc_tstamp = getMvccTstamp(); + } dictSetVal(db->pdict, de, val); From 4c49370efe9bdafd2d07f0de330f616b3fcfd135 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 24 Jul 2019 22:31:02 -0400 Subject: [PATCH 54/76] Issue #64 RREPLAY isn't binary safe. Add fix and test. 
Former-commit-id: afe66288fe9df6d8247d459e57858430f1ec7a25 --- src/replication.cpp | 2 +- tests/integration/replication-active.tcl | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/replication.cpp b/src/replication.cpp index 4f66bab99..e19daa267 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -3277,7 +3277,7 @@ void replicaReplayCommand(client *c) cFake->lock.lock(); cFake->authenticated = c->authenticated; cFake->puser = c->puser; - cFake->querybuf = sdscat(cFake->querybuf,(sds)ptrFromObj(c->argv[2])); + cFake->querybuf = sdscatsds(cFake->querybuf,(sds)ptrFromObj(c->argv[2])); selectDb(cFake, c->db->id); processInputBuffer(cFake, (CMD_CALL_FULL & (~CMD_CALL_PROPAGATE))); cFake->lock.unlock(); diff --git a/tests/integration/replication-active.tcl b/tests/integration/replication-active.tcl index dfb89f603..99e0dc006 100644 --- a/tests/integration/replication-active.tcl +++ b/tests/integration/replication-active.tcl @@ -49,6 +49,15 @@ start_server {tags {"active-repl"} overrides {active-replica yes}} { } } + test {Active replicas propogate binary} { + $master set binkey "\u0000foo" + wait_for_condition 50 500 { + [string match *foo* [$slave get binkey]] + } else { + fail "replication failed to propogate binary data" + } + } + test {Active replicas WAIT} { # Test that wait succeeds since replicas should be syncronized $master set testkey foo From e3b97286cae8d105907f4fa7981876f2eabf1977 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 24 Jul 2019 22:49:30 -0400 Subject: [PATCH 55/76] RREPLAY failures should be logged Former-commit-id: 08b6a04055e950e53f043391ec9f9a09f654b1ee --- src/replication.cpp | 7 ++++++- src/server.cpp | 1 + src/server.h | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/replication.cpp b/src/replication.cpp index e19daa267..736b2ccec 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -3279,9 +3279,14 @@ void replicaReplayCommand(client *c) cFake->puser = c->puser; 
cFake->querybuf = sdscatsds(cFake->querybuf,(sds)ptrFromObj(c->argv[2])); selectDb(cFake, c->db->id); + auto ccmdPrev = serverTL->commandsExecuted; processInputBuffer(cFake, (CMD_CALL_FULL & (~CMD_CALL_PROPAGATE))); + bool fExec = ccmdPrev != serverTL->commandsExecuted; cFake->lock.unlock(); - addReply(c, shared.ok); + if (fExec) + addReply(c, shared.ok); + else + addReplyError(c, "command did not execute"); freeClient(cFake); serverTL->current_client = current_clientSave; diff --git a/src/server.cpp b/src/server.cpp index 2d9627c0f..9866117aa 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -3347,6 +3347,7 @@ void call(client *c, int flags) { dirty = g_pserver->dirty; start = ustime(); c->cmd->proc(c); + serverTL->commandsExecuted++; duration = ustime()-start; dirty = g_pserver->dirty-dirty; if (dirty < 0) dirty = 0; diff --git a/src/server.h b/src/server.h index f350410d0..52c9877e6 100644 --- a/src/server.h +++ b/src/server.h @@ -1403,6 +1403,7 @@ struct redisServerThreadVars { client *lua_client = nullptr; /* The "fake client" to query Redis from Lua */ struct fastlock lockPendingWrite; char neterr[ANET_ERR_LEN]; /* Error buffer for anet.c */ + long unsigned commandsExecuted = 0; }; struct redisMaster { From 945a7b69f11e53341766442809d6663038a9989e Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 29 Jul 2019 15:08:41 -0400 Subject: [PATCH 56/76] Fix crash in RediSearch Former-commit-id: cbe5c04a0f3b0b1886f6c88c0a2401e0e6501b3b --- src/module.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/module.cpp b/src/module.cpp index 052c8744a..b2c19c735 100644 --- a/src/module.cpp +++ b/src/module.cpp @@ -3949,6 +3949,8 @@ void RM_FreeThreadSafeContext(RedisModuleCtx *ctx) { void RM_ThreadSafeContextLock(RedisModuleCtx *ctx) { UNUSED(ctx); moduleAcquireGIL(FALSE /*fServerThread*/); + if (serverTL == nullptr) + serverTL = &g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN]; // arbitrary module threads get the main thread context } /* Release the server lock 
after a thread safe API call was executed. */ From f8884fed3aa780153af10d11e1a94e74a22796bf Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 29 Jul 2019 17:09:51 -0400 Subject: [PATCH 57/76] Fix crash setting expire on a key with an existing subkey expiration Former-commit-id: 4858fd893c8e638b0efdcd3ab2c6dc188a6dc6bd --- src/server.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/server.h b/src/server.h index 52c9877e6..9c5623fe9 100644 --- a/src/server.h +++ b/src/server.h @@ -801,7 +801,8 @@ public: void expireSubKey(const char *szSubkey, long long when) { auto itrInsert = std::lower_bound(m_vecexpireEntries.begin(), m_vecexpireEntries.end(), when); - m_vecexpireEntries.emplace(itrInsert, when, sdsdup(szSubkey)); + const char *subkey = (szSubkey) ? sdsdup(szSubkey) : nullptr; + m_vecexpireEntries.emplace(itrInsert, when, subkey); } bool FEmpty() const noexcept { return m_vecexpireEntries.empty(); } From 89b8ba8d9dcf807e03f8b976cb5ba4ba39e31d62 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 29 Jul 2019 18:08:45 -0400 Subject: [PATCH 58/76] Fix HLL corruption Former-commit-id: 47ef9e5cbd11dad107a68a94dfb51d192e86c84e --- src/hyperloglog.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/hyperloglog.cpp b/src/hyperloglog.cpp index 344fd219f..0b1239965 100644 --- a/src/hyperloglog.cpp +++ b/src/hyperloglog.cpp @@ -710,6 +710,7 @@ int hllSparseSet(robj *o, long index, uint8_t count) { first += span; } if (span == 0) return -1; /* Invalid format. */ + if (p >= end) return -1; /* Invalid format. */ next = HLL_SPARSE_IS_XZERO(p) ? 
p+2 : p+1; if (next >= end) next = NULL; From 5525de6148e4045b2624015bd4da70441698004c Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 30 Jul 2019 16:54:25 -0400 Subject: [PATCH 59/76] Acquire the lock for modules that don't acquire it before calling like they are supposed to Former-commit-id: 6016dd0f8b68ceeeb161a39a0d531a3cc52d78d8 --- src/module.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/module.cpp b/src/module.cpp index b2c19c735..1d00d2b1d 100644 --- a/src/module.cpp +++ b/src/module.cpp @@ -3939,7 +3939,9 @@ RedisModuleCtx *RM_GetThreadSafeContext(RedisModuleBlockedClient *bc) { /* Release a thread safe context. */ void RM_FreeThreadSafeContext(RedisModuleCtx *ctx) { + moduleAcquireGIL(false /*fServerThread*/); moduleFreeContext(ctx); + moduleReleaseGIL(false /*fServerThread*/); zfree(ctx); } From ef6d9c6e1fa3227c34cdd850462b92f7905066c9 Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 30 Jul 2019 17:11:15 -0400 Subject: [PATCH 60/76] Fix test failures: Non active replicas are allowed to use shared int values Former-commit-id: 8536854266d5795c5c2bc11b8b344f20759b55af --- src/db.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index cc67acd05..ca42453e8 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -269,7 +269,7 @@ void dbOverwrite(redisDb *db, robj *key, robj *val) { dictEntry *de = dictFind(db->pdict,ptrFromObj(key)); serverAssertWithInfo(NULL,key,de != NULL); - dbOverwriteCore(db, de, key, val, true, false); + dbOverwriteCore(db, de, key, val, !!g_pserver->fActiveReplica, false); } /* Insert a key, handling duplicate keys according to fReplace */ @@ -309,7 +309,7 @@ void setKey(redisDb *db, robj *key, robj *val) { if (de == NULL) { dbAdd(db,key,val); } else { - dbOverwriteCore(db,de,key,val,true,true); + dbOverwriteCore(db,de,key,val,!!g_pserver->fActiveReplica,true); } incrRefCount(val); signalModifiedKey(db,key); From b3de6fcf2670981201551178edd1e2c12e57f2c9 Mon Sep 17 00:00:00 2001 
From: John Sully Date: Fri, 2 Aug 2019 21:31:05 -0400 Subject: [PATCH 61/76] Optimize the new expire code Former-commit-id: 75c6b4c64a9c9f39654c16b1f5ff5a003d24afbc --- src/expire.cpp | 4 +++- src/semiorderedset.h | 13 +++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/expire.cpp b/src/expire.cpp index 5d257428d..ba0b99284 100644 --- a/src/expire.cpp +++ b/src/expire.cpp @@ -260,6 +260,7 @@ void activeExpireCycle(int type) { size_t expired = 0; size_t tried = 0; + long long check = ACTIVE_EXPIRE_CYCLE_FAST_DURATION; // assume a check is roughly 1us. It isn't but good enough db->expireitr = db->setexpire->enumerate(db->expireitr, now, [&](expireEntry &e) __attribute__((always_inline)) { if (e.when() < now) { @@ -279,9 +280,10 @@ void activeExpireCycle(int type) { g_pserver->stat_expired_time_cap_reached_count++; return false; } + check = ACTIVE_EXPIRE_CYCLE_FAST_DURATION; } return true; - }); + }, &check); total_expired += expired; } diff --git a/src/semiorderedset.h b/src/semiorderedset.h index 450910c49..00a1f1d91 100644 --- a/src/semiorderedset.h +++ b/src/semiorderedset.h @@ -30,7 +30,7 @@ class semiorderedset { // Aim for roughly 4 cache lines per bucket (determined by imperical testing) // lower values are faster but use more memory - return std::max((64/sizeof(T))*4, (size_t)2); + return std::max((64/sizeof(T))*8, (size_t)2); } public: @@ -122,7 +122,7 @@ public: // enumeration starting from the 'itrStart'th key. 
Note that the iter is a hint, and need no be valid anymore template - setiter enumerate(const setiter &itrStart, const T_MAX &max, T_VISITOR fn) + setiter enumerate(const setiter &itrStart, const T_MAX &max, T_VISITOR fn, long long *pccheck) { setiter itr(itrStart); @@ -135,7 +135,7 @@ public: for (size_t ibucket = 0; ibucket < m_data.size(); ++ibucket) { - if (!enumerate_bucket(itr, max, fn)) + if (!enumerate_bucket(itr, max, fn, pccheck)) break; itr.idxSecondary = 0; @@ -314,7 +314,7 @@ private: } template - inline bool enumerate_bucket(setiter &itr, const T_MAX &max, T_VISITOR &fn) + inline bool enumerate_bucket(setiter &itr, const T_MAX &max, T_VISITOR &fn, long long *pcheckLimit) { auto &vec = m_data[itr.idxPrimary]; for (; itr.idxSecondary < vec.size(); ++itr.idxSecondary) @@ -323,8 +323,9 @@ private: assert((itr.idxSecondary+1) >= vec.size() || static_cast(vec[itr.idxSecondary]) <= static_cast(vec[itr.idxSecondary+1])); + (*pcheckLimit)--; if (max < static_cast(*itr)) - return true; + return *pcheckLimit > 0; size_t sizeBefore = vec.size(); if (!fn(*itr)) @@ -339,6 +340,6 @@ private: } } vec.shrink_to_fit(); - return true; + return *pcheckLimit > 0; } }; From 4c3ecad1784a0805ccc3bc214ca5fb407b3419c0 Mon Sep 17 00:00:00 2001 From: benschermel <43507366+benschermel@users.noreply.github.com> Date: Tue, 6 Aug 2019 19:25:13 -0400 Subject: [PATCH 62/76] update readme (#71) * update readme * KeyDB must acknowledge its Redis origin and that while we have different opinions we are greatful to Antirez and Redis for the work they've done in this space. 
* Update README.md Former-commit-id: 4e9bfcf84bb6a7ed47d2eaf478fc933abfb48a2d --- README.md | 76 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 1f23f2c04..18db3c875 100644 --- a/README.md +++ b/README.md @@ -2,40 +2,47 @@ [![Build Status](https://travis-ci.org/JohnSully/KeyDB.svg?branch=unstable)](https://travis-ci.org/JohnSully/KeyDB) [![Join the chat at https://gitter.im/KeyDB/community](https://badges.gitter.im/KeyDB/community.svg)](https://gitter.im/KeyDB/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![StackShare](http://img.shields.io/badge/tech-stack-0690fa.svg?style=flat)](https://stackshare.io/eq-alpha-technology-inc/eq-alpha-technology-inc) +##### Need Help? Check out our extensive [documentation](https://docs.keydb.dev). + What is KeyDB? -------------- -KeyDB is a high performance fork of Redis focusing on multithreading, memory efficiency, and high throughput. In addition to multithreading KeyDB also has features only available in Redis Enterprise such as [Active Replication](https://github.com/JohnSully/KeyDB/wiki/Active-Replication), [FLASH storage](https://github.com/JohnSully/KeyDB/wiki/FLASH-Storage) support, and some not available at all such as direct backup to AWS S3. +KeyDB is a high performance fork of Redis with a focus on multithreading, memory efficiency, and high throughput. In addition to multithreading, KeyDB also has features only available in Redis Enterprise such as [Active Replication](https://github.com/JohnSully/KeyDB/wiki/Active-Replication), [FLASH storage](https://github.com/JohnSully/KeyDB/wiki/FLASH-Storage) support, and some not available at all such as direct backup to AWS S3. -On the same hardware KeyDB can perform twice as many queries per second as Redis, with 60% lower latency. +KeyDB maintains full compatibility with the Redis protocol, modules, and scripts. 
This includes the atomicity gurantees for scripts and transactions. Because KeyDB keeps in sync with Redis development KeyDB is a superset of Redis functionality, making KeyDB a drop in replacement for existing Redis deployments. -KeyDB has full compatibility with the Redis protocol, modules, and scripts. This includes full support for transactions, and atomic execution of scripts. For more information see our architecture section below. +On the same hardware KeyDB can perform twice as many queries per second as Redis, with 60% lower latency. Active-Replication simplifies hot-spare failover allowing you to easily distribute writes over replicas and use simple TCP based load balancing/failover. KeyDB's higher performance allows you to do more on less hardware which reduces operation costs and complexity. + + + +Why fork Redis? +--------------- + +KeyDB has a different philosophy on how the codebase should evolve. We feel that ease of use, high performance, and a "batteries included" approach is the best way to create a good user experience. While we have great respect for the Redis maintainers it is our opinion that the Redis approach focusses too much on simplicity of the code base at the expense of complexity for the user. This results in the need for external components and workarounds to solve common problems - resulting in more complexity overall. + +Because of this difference of opinion features which are right for KeyDB may not be appropriate for Redis. A fork allows us to explore this new development path and implement features which may never be a part of Redis. KeyDB keeps in sync with upstream Redis changes, and where applicable we upstream bug fixes and changes. It is our hope that the two projects can continue to grow and learn from each other. 
+ +Additional Resources +-------------------- Try our docker container: https://hub.docker.com/r/eqalpha/keydb Talk on Gitter: https://gitter.im/KeyDB +Visit our Website: https://keydb.dev + +See options for channel partners and support contracts: https://keydb.dev/support.html + +Learn with KeyDB’s official documentation site: https://docs.keydb.dev + [Subscribe to the KeyDB mailing list](https://eqalpha.us20.list-manage.com/subscribe/post?u=978f486c2f95589b24591a9cc&id=4ab9220500) Management GUI: We recommend [FastoNoSQL](https://fastonosql.com/) which has official KeyDB support. -New: Active Replica Support ---------------------------- - -New! KeyDB now has support for Active Replicas. This feature greatly simplifies hot-spare failover and allows you to distribute writes over replicas instead of just a single master. For more information [see the wiki page](https://github.com/JohnSully/KeyDB/wiki/Active-Replication). - -Why fork Redis? ---------------- - -The Redis maintainers have continually reiterated that they do not plan to support multithreading. While we have great respect for the redis team, we feel the analysis justifying this decision is incorrect. In addition we wanted open source implementations of features currently only available in proprietary modules. We feel a fork is the best way to accelerate development in the areas of most interest to us. - -We plan to track the Redis repo closely and hope our projects can learn from each other. Benchmarking KeyDB ------------------ - - Please note keydb-benchmark and redis-benchmark are currently single threaded and too slow to properly benchmark KeyDB. We recommend using a redis cluster benchmark tool such as [memtier](https://github.com/RedisLabs/memtier_benchmark). Please ensure your machine has enough cores for both KeyDB and memteir if testing locally. KeyDB expects exclusive use of any cores assigned to it. 
For more details on how we benchmarked KeyDB along with performance numbers check out our blog post: [Redis Should Be Multithreaded](https://medium.com/@john_63123/redis-should-be-multi-threaded-e28319cab744?source=friends_link&sk=7ce8e9fe3ec8224a4d27ef075d085457) @@ -58,6 +65,10 @@ If you would like to use the [FLASH backed](https://github.com/JohnSully/KeyDB/w If you would like KeyDB to dump and load directly to AWS S3 this option specifies the bucket. Using this option with the traditional RDB options will result in KeyDB backing up twice to both locations. If both are specified KeyDB will first attempt to load from the local dump file and if that fails load from S3. This requires the AWS CLI tools to be installed and configured which are used under the hood to transfer the data. + active-replica yes + +If you are using active-active replication set `active-replica` option to “yes”. This will enable both instances to accept reads and writes while remaining synced. [Click here](https://docs.keydb.dev/docs/active-rep/) to see more on active-rep in our docs section. There are also [docker examples]( https://docs.keydb.dev/docs/docker-active-rep/) on docs. + All other configuration options behave as you'd expect. Your existing configuration files should continue to work unchanged. Building KeyDB @@ -67,16 +78,19 @@ KeyDB can be compiled and is tested for use on Linux. 
KeyDB currently relies on Install dependencies: - % sudo apt install build-essential nasm autotools-dev autoconf libjemalloc-dev tcl tcl-dev uuid-dev + % sudo apt install build-essential nasm autotools-dev autoconf libjemalloc-dev tcl tcl-dev uuid-dev libcurl4-openssl-dev Compiling is as simple as: % make -You can enable flash support with (Note: autoconf and autotools must be installed): +You can enable flash support with: % make MALLOC=memkind +***Note that the following dependencies may be needed: + % sudo apt-get install autoconf autotools-dev libnuma-dev libtool + Fixing build problems with dependencies or cached build options --------- @@ -179,7 +193,7 @@ then in another terminal try the following: (integer) 2 keydb> -You can find the list of all the available commands at http://redis.io/commands. +You can find the list of all the available commands at https://docs.keydb.dev/docs/commands/ Installing KeyDB ----------------- @@ -222,23 +236,18 @@ Future work: Docker Build ------------ - -Run the following commands for a full source download and build: - +Build the latest binaries from the github unstable branch within a docker container. Note this is built for Ubuntu 18.04. +Simply make a directory you would like to have the latest binaries dumped in, then run the following commmand with your updated path: ``` -git clone git@github.com:JohnSully/KeyDB.git -docker run -it --rm -v `pwd`/KeyDB:/build -w /build devopsdood/keydb-builder make +$ docker run -it --rm -v /path-to-dump-binaries:/keydb_bin eqalpha/keydb-build-bin ``` +You should receive the following files: keydb-benchmark, keydb-check-aof, keydb-check-rdb, keydb-cli, keydb-sentinel, keydb-server -Then you have fresh binaries built, you can also pass any other options to the make command above after the word make. E.g. - -```docker run -it --rm -v `pwd`/KeyDB:/build -w /build devopsdood/keydb-builder make MAllOC=memkind``` - -The above commands will build you binaries in the src directory. 
Standard `make install` without Docker command will work after if you wish to install - -If you'd prefer you can build the Dockerfile in the repo instead of pulling the above container for use: - -`docker build -t KeyDB .` +If you are looking to enable flash support with the build (make MALLOC=memkind) then use the following command: +``` +$ docker run -it --rm -v /path-to-dump-binaries:/keydb_bin eqalpha/keydb-build-bin:flash +``` +Please note that you will need libcurl4-openssl-dev in order to run keydb. With flash version you may need libnuma-dev and libtool installed in order to run the binaries. Keep this in mind especially when running in a container. For a copy of all our Dockerfiles, please see them on [docs]( https://docs.keydb.dev/docs/dockerfiles/). Code contributions ----------------- @@ -252,3 +261,4 @@ source distribution. Please see the CONTRIBUTING file in this source distribution for more information. + From 862a87a8ae25dbc018c2c67e3d32ffe18076cdc4 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 14 Aug 2019 01:03:24 -0400 Subject: [PATCH 63/76] Fix warnings on newer compilers Former-commit-id: 33e0b3e1c0ddc0e56a00100e202b89eeff9cff61 --- src/compactvector.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compactvector.h b/src/compactvector.h index 65a40f114..daa8ad9fc 100644 --- a/src/compactvector.h +++ b/src/compactvector.h @@ -77,7 +77,7 @@ public: } assert(idx < m_max); where = m_data + idx; - memmove(m_data + idx + 1, m_data + idx, (m_celem - idx)*sizeof(T)); + memmove(reinterpret_cast(m_data + idx + 1), reinterpret_cast(m_data + idx), (m_celem - idx)*sizeof(T)); new(m_data + idx) T(std::move(val)); ++m_celem; return where; @@ -103,7 +103,7 @@ public: size_t idx = where - m_data; assert(idx < m_celem); where->~T(); - memmove(where, where+1, ((m_celem - idx - 1)*sizeof(T))); + memmove(reinterpret_cast(where), reinterpret_cast(where+1), ((m_celem - idx - 1)*sizeof(T))); --m_celem; if (m_celem == 0) From 
a21a7a8ee9b3748e323009f350bc5460cfbe16f7 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 26 Aug 2019 20:18:52 -0400 Subject: [PATCH 64/76] Fix race condition in PUB/SUB and other async reply commands where the client can be freed before our handler is executed on the client thread. When this occurs the client pointer is dangling Former-commit-id: 46a78c69e718f0aeb5e62f33c59458b15b8d9bc6 --- .vscode/settings.json | 3 +- src/ae.cpp | 33 +++++++++++++++++- src/debug.cpp | 3 ++ src/networking.cpp | 79 +++++++++++++++++++++++++++++++++++-------- src/pubsub.cpp | 10 ++++++ src/server.h | 6 ++-- 6 files changed, 115 insertions(+), 19 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index e4d7c4c9a..42660486e 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -52,6 +52,7 @@ "type_traits": "cpp", "typeinfo": "cpp", "utility": "cpp", - "set": "cpp" + "set": "cpp", + "algorithm": "cpp" } } diff --git a/src/ae.cpp b/src/ae.cpp index 48d6107b7..f636078b1 100644 --- a/src/ae.cpp +++ b/src/ae.cpp @@ -191,6 +191,36 @@ void aeProcessCmd(aeEventLoop *eventLoop, int fd, void *, int ) } } +// Unlike write() this is an all or nothing thing. 
We will block if a partial write is hit +ssize_t safe_write(int fd, const void *pv, size_t cb) +{ + const char *pcb = (const char*)pv; + ssize_t written = 0; + do + { + ssize_t rval = write(fd, pcb, cb); + if (rval > 0) + { + pcb += rval; + cb -= rval; + written += rval; + } + else if (errno == EAGAIN) + { + if (written == 0) + break; + // if we've already written something then we're committed so keep trying + } + else + { + if (rval == 0) + return written; + return rval; + } + } while (cb); + return written; +} + int aeCreateRemoteFileEvent(aeEventLoop *eventLoop, int fd, int mask, aeFileProc *proc, void *clientData, int fSynchronous) { @@ -212,9 +242,10 @@ int aeCreateRemoteFileEvent(aeEventLoop *eventLoop, int fd, int mask, std::unique_lock ulock(cmd.pctl->mutexcv, std::defer_lock); if (fSynchronous) cmd.pctl->mutexcv.lock(); - auto size = write(eventLoop->fdCmdWrite, &cmd, sizeof(cmd)); + auto size = safe_write(eventLoop->fdCmdWrite, &cmd, sizeof(cmd)); if (size != sizeof(cmd)) { + AE_ASSERT(size == sizeof(cmd) || size <= 0); AE_ASSERT(errno == EAGAIN); ret = AE_ERR; } diff --git a/src/debug.cpp b/src/debug.cpp index 41c73b55c..3246f9d19 100644 --- a/src/debug.cpp +++ b/src/debug.cpp @@ -55,6 +55,8 @@ typedef ucontext_t sigcontext_t; #endif #endif +bool g_fInCrash = false; + /* ================================= Debugging ============================== */ /* Compute the sha1 of string at 's' with 'len' bytes long. 
@@ -1360,6 +1362,7 @@ void dumpX86Calls(void *addr, size_t len) { void sigsegvHandler(int sig, siginfo_t *info, void *secret) { ucontext_t *uc = (ucontext_t*) secret; + g_fInCrash = true; void *eip = getMcontextEip(uc); sds infostring, clients; struct sigaction act; diff --git a/src/networking.cpp b/src/networking.cpp index 9bd70c35e..15973fe1b 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -175,6 +175,7 @@ client *createClient(int fd, int iel) { c->buflenAsync = 0; c->bufposAsync = 0; c->client_tracking_redirection = 0; + c->casyncOpsPending = 0; memset(c->uuid, 0, UUID_BINARY_LEN); listSetFreeMethod(c->pubsub_patterns,decrRefCountVoid); @@ -1004,7 +1005,6 @@ static void acceptCommonHandler(int fd, int flags, char *ip, int iel) { serverLog(LL_WARNING, "Error registering fd event for the new client: %s (fd=%d)", strerror(errno),fd); - close(fd); /* May be already closed, just ignore errors */ return; } @@ -1270,7 +1270,7 @@ void unlinkClient(client *c) { if (c->flags & CLIENT_TRACKING) disableTracking(c); } -void freeClient(client *c) { +bool freeClient(client *c) { listNode *ln; serverAssert(c->fd == -1 || GlobalLocksAcquired()); AssertCorrectThread(c); @@ -1278,9 +1278,9 @@ void freeClient(client *c) { /* If a client is protected, yet we need to free it right now, make sure * to at least use asynchronous freeing. */ - if (c->flags & CLIENT_PROTECTED) { + if (c->flags & CLIENT_PROTECTED || c->casyncOpsPending) { freeClientAsync(c); - return; + return false; } /* If it is our master that's beging disconnected we should make sure @@ -1295,7 +1295,7 @@ void freeClient(client *c) { CLIENT_BLOCKED))) { replicationCacheMaster(MasterInfoFromClient(c), c); - return; + return false; } } @@ -1374,6 +1374,7 @@ void freeClient(client *c) { ulock.unlock(); fastlock_free(&c->lock); zfree(c); + return true; } /* Schedule a client to free it at a safe time in the serverCron() function. 
@@ -1386,28 +1387,37 @@ void freeClientAsync(client *c) { * may access the list while Redis uses I/O threads. All the other accesses * are in the context of the main thread while the other threads are * idle. */ - if (c->flags & CLIENT_CLOSE_ASAP || c->flags & CLIENT_LUA) return; + if (c->flags & CLIENT_CLOSE_ASAP || c->flags & CLIENT_LUA) return; // check without the lock first std::lock_guardlock)> clientlock(c->lock); AeLocker lock; lock.arm(c); + if (c->flags & CLIENT_CLOSE_ASAP || c->flags & CLIENT_LUA) return; // race condition after we acquire the lock c->flags |= CLIENT_CLOSE_ASAP; listAddNodeTail(g_pserver->clients_to_close,c); } void freeClientsInAsyncFreeQueue(int iel) { + serverAssert(GlobalLocksAcquired()); listIter li; listNode *ln; listRewind(g_pserver->clients_to_close,&li); - while((ln = listNext(&li))) { + // Store the clients in a temp vector since freeClient will modify this list + std::vector vecclientsFree; + while((ln = listNext(&li))) + { client *c = (client*)listNodeValue(ln); - if (c->iel != iel) - continue; // wrong thread + if (c->iel == iel) + { + vecclientsFree.push_back(c); + listDelNode(g_pserver->clients_to_close, ln); + } + } + for (client *c : vecclientsFree) + { c->flags &= ~CLIENT_CLOSE_ASAP; freeClient(c); - listDelNode(g_pserver->clients_to_close,ln); - listRewind(g_pserver->clients_to_close,&li); } } @@ -1555,6 +1565,15 @@ void ProcessPendingAsyncWrites() std::lock_guardlock)> lock(c->lock); serverAssert(c->fPendingAsyncWrite); + if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_CLOSE_AFTER_REPLY)) + { + c->bufposAsync = 0; + c->buflenAsync = 0; + zfree(c->bufAsync); + c->bufAsync = nullptr; + c->fPendingAsyncWrite = FALSE; + continue; + } // TODO: Append to end of reply block? 
@@ -1591,8 +1610,36 @@ void ProcessPendingAsyncWrites() continue; asyncCloseClientOnOutputBufferLimitReached(c); - if (aeCreateRemoteFileEvent(g_pserver->rgthreadvar[c->iel].el, c->fd, ae_flags, sendReplyToClient, c, FALSE) == AE_ERR) - continue; // We can retry later in the cron + if (c->flags & CLIENT_CLOSE_ASAP) + continue; // we will never write this so don't post an op + + std::atomic_thread_fence(std::memory_order_seq_cst); + + if (c->casyncOpsPending == 0) + { + if (FCorrectThread(c)) + { + prepareClientToWrite(c, false); // queue an event + } + else + { + // We need to start the write on the client's thread + if (aePostFunction(g_pserver->rgthreadvar[c->iel].el, [c]{ + // Install a write handler. Don't do the actual write here since we don't want + // to duplicate the throttling and safety mechanisms of the normal write code + std::lock_guardlock)> lock(c->lock); + serverAssert(c->casyncOpsPending > 0); + c->casyncOpsPending--; + aeCreateFileEvent(g_pserver->rgthreadvar[c->iel].el, c->fd, AE_WRITABLE|AE_WRITE_THREADSAFE, sendReplyToClient, c); + }, false) == AE_ERR + ) + { + // Posting the function failed + continue; // We can retry later in the cron + } + ++c->casyncOpsPending; // race is handled by the client lock in the lambda + } + } } } @@ -1632,13 +1679,15 @@ int handleClientsWithPendingWrites(int iel) { std::unique_locklock)> lock(c->lock); /* Try to write buffers to the client socket. 
*/ - if (writeToClient(c->fd,c,0) == C_ERR) { + if (writeToClient(c->fd,c,0) == C_ERR) + { if (c->flags & CLIENT_CLOSE_ASAP) { lock.release(); // still locked AeLocker ae; ae.arm(c); - freeClient(c); // writeToClient will only async close, but there's no need to wait + if (!freeClient(c)) // writeToClient will only async close, but there's no need to wait + c->lock.unlock(); // if we just got put on the async close list, then we need to remove the lock } continue; } diff --git a/src/pubsub.cpp b/src/pubsub.cpp index 6a9c2bdfc..46677487f 100644 --- a/src/pubsub.cpp +++ b/src/pubsub.cpp @@ -143,6 +143,8 @@ int clientSubscriptionsCount(client *c) { /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or * 0 if the client was already subscribed to that channel. */ int pubsubSubscribeChannel(client *c, robj *channel) { + serverAssert(GlobalLocksAcquired()); + serverAssert(c->lock.fOwnLock()); dictEntry *de; list *clients = NULL; int retval = 0; @@ -202,6 +204,7 @@ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify) { /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the client was already subscribed to that pattern. */ int pubsubSubscribePattern(client *c, robj *pattern) { + serverAssert(GlobalLocksAcquired()); int retval = 0; if (listSearchKey(c->pubsub_patterns,pattern) == NULL) { @@ -244,6 +247,7 @@ int pubsubUnsubscribePattern(client *c, robj *pattern, int notify) { /* Unsubscribe from all the channels. Return the number of channels the * client was subscribed to. */ int pubsubUnsubscribeAllChannels(client *c, int notify) { + serverAssert(GlobalLocksAcquired()); dictIterator *di = dictGetSafeIterator(c->pubsub_channels); dictEntry *de; int count = 0; @@ -262,6 +266,7 @@ int pubsubUnsubscribeAllChannels(client *c, int notify) { /* Unsubscribe from all the patterns. Return the number of patterns the * client was subscribed from. 
*/ int pubsubUnsubscribeAllPatterns(client *c, int notify) { + serverAssert(GlobalLocksAcquired()); listNode *ln; listIter li; int count = 0; @@ -278,6 +283,7 @@ int pubsubUnsubscribeAllPatterns(client *c, int notify) { /* Publish a message */ int pubsubPublishMessage(robj *channel, robj *message) { + serverAssert(GlobalLocksAcquired()); int receivers = 0; dictEntry *de; listNode *ln; @@ -293,6 +299,8 @@ int pubsubPublishMessage(robj *channel, robj *message) { listRewind(list,&li); while ((ln = listNext(&li)) != NULL) { client *c = reinterpret_cast(ln->value); + if (c->flags & CLIENT_CLOSE_ASAP) // avoid blocking if the write will be ignored + continue; fastlock_lock(&c->lock); addReplyPubsubMessage(c,channel,message); fastlock_unlock(&c->lock); @@ -311,6 +319,8 @@ int pubsubPublishMessage(robj *channel, robj *message) { (char*)ptrFromObj(channel), sdslen(szFromObj(channel)),0)) { + if (pat->pclient->flags & CLIENT_CLOSE_ASAP) + continue; fastlock_lock(&pat->pclient->lock); addReplyPubsubPatMessage(pat->pclient, pat->pattern,channel,message); diff --git a/src/server.h b/src/server.h index 9c5623fe9..62f803761 100644 --- a/src/server.h +++ b/src/server.h @@ -1165,6 +1165,7 @@ typedef struct client { time_t lastinteraction; /* Time of the last interaction, used for timeout */ time_t obuf_soft_limit_reached_time; std::atomic flags; /* Client flags: CLIENT_* macros. */ + int casyncOpsPending; int fPendingAsyncWrite; /* NOTE: Not a flag because it is written to outside of the client lock (locked by the global lock instead) */ int authenticated; /* Needed when the default user requires auth. */ int replstate; /* Replication state if this is a slave. 
*/ @@ -1941,7 +1942,7 @@ void redisSetProcTitle(const char *title); /* networking.c -- Networking and Client related operations */ client *createClient(int fd, int iel); void closeTimedoutClients(void); -void freeClient(client *c); +bool freeClient(client *c); void freeClientAsync(client *c); void resetClient(client *c); void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask); @@ -2761,9 +2762,10 @@ void xorDigest(unsigned char *digest, const void *ptr, size_t len); int populateCommandTableParseFlags(struct redisCommand *c, const char *strflags); int moduleGILAcquiredByModule(void); +extern bool g_fInCrash; static inline int GlobalLocksAcquired(void) // Used in asserts to verify all global locks are correctly acquired for a server-thread to operate { - return aeThreadOwnsLock() || moduleGILAcquiredByModule(); + return aeThreadOwnsLock() || moduleGILAcquiredByModule() || g_fInCrash; } inline int ielFromEventLoop(const aeEventLoop *eventLoop) From e45268930a061ac8d5dc83e9d4ee5a203f494673 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 6 Sep 2019 13:30:15 -0400 Subject: [PATCH 65/76] Fix issue where Active Replicas were committing data to the wrong database under load Former-commit-id: ac595ebe371480a924d542aa62733854eb7c527a --- src/replication.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/replication.cpp b/src/replication.cpp index 736b2ccec..8ea96da1c 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -3284,9 +3284,14 @@ void replicaReplayCommand(client *c) bool fExec = ccmdPrev != serverTL->commandsExecuted; cFake->lock.unlock(); if (fExec) + { addReply(c, shared.ok); + selectDb(c, cFake->db->id); + } else + { addReplyError(c, "command did not execute"); + } freeClient(cFake); serverTL->current_client = current_clientSave; From bd4ed482c467bc226365c9537a71758fd336f723 Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 12 Sep 2019 18:51:10 -0400 Subject: [PATCH 66/76] KEYS now only blocks one database
Former-commit-id: 18d42a5c353f76533a0ccc4ded24ed089cedacc8 --- src/aelocker.h | 5 +++++ src/db.cpp | 12 +++++++++++- src/networking.cpp | 2 +- src/server.cpp | 9 +++++++-- src/server.h | 5 +++++ 5 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/aelocker.h b/src/aelocker.h index d5c8186bf..eca15f491 100644 --- a/src/aelocker.h +++ b/src/aelocker.h @@ -61,6 +61,11 @@ public: return m_fArmed; } + void release() + { + m_fArmed = false; + } + ~AeLocker() { if (m_fArmed) diff --git a/src/db.cpp b/src/db.cpp index ca42453e8..b4ac46a2a 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -30,6 +30,7 @@ #include "server.h" #include "cluster.h" #include "atomicvar.h" +#include "aelocker.h" #include #include @@ -642,6 +643,8 @@ void keysCommand(client *c) { unsigned long numkeys = 0; void *replylen = addReplyDeferredLen(c); + aeReleaseLock(); + di = dictGetSafeIterator(c->db->pdict); allkeys = (pattern[0] == '*' && pattern[1] == '\0'); while((de = dictNext(di)) != NULL) { @@ -659,6 +662,12 @@ void keysCommand(client *c) { } dictReleaseIterator(di); setDeferredArrayLen(c,replylen,numkeys); + + fastlock_unlock(&c->db->lock); // we must release the DB lock before acquiring the AE lock to prevent deadlocks + AeLocker lock; + lock.arm(c); + fastlock_lock(&c->db->lock); // we still need the DB lock + lock.release(); } /* This callback is used by scanGenericCommand in order to collect elements @@ -1132,7 +1141,8 @@ int dbSwapDatabases(int id1, int id2) { if (id1 < 0 || id1 >= cserver.dbnum || id2 < 0 || id2 >= cserver.dbnum) return C_ERR; if (id1 == id2) return C_OK; - redisDb aux = g_pserver->db[id1]; + redisDb aux; + memcpy(&aux, &g_pserver->db[id1], sizeof(redisDb)); redisDb *db1 = &g_pserver->db[id1], *db2 = &g_pserver->db[id2]; /* Swap hash tables. 
Note that we don't swap blocking_keys, diff --git a/src/networking.cpp b/src/networking.cpp index 15973fe1b..0588745b2 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -247,7 +247,7 @@ void clientInstallAsyncWriteHandler(client *c) { * data should be appended to the output buffers. */ int prepareClientToWrite(client *c, bool fAsync) { fAsync = fAsync && !FCorrectThread(c); // Not async if we're on the right thread - serverAssert(!fAsync || GlobalLocksAcquired()); + serverAssert(FCorrectThread(c) || fAsync); serverAssert(c->fd <= 0 || c->lock.fOwnLock()); if (c->flags & CLIENT_FORCE_REPLY) return C_OK; // FORCE REPLY means we're doing something else with the buffer. diff --git a/src/server.cpp b/src/server.cpp index 9866117aa..74c18df24 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -59,6 +59,7 @@ #include #include #include +#include #include "aelocker.h" int g_fTestMode = false; @@ -2922,6 +2923,7 @@ void initServer(void) { /* Create the Redis databases, and initialize other internal state. 
*/ for (int j = 0; j < cserver.dbnum; j++) { + new (&g_pserver->db[j]) redisDb; g_pserver->db[j].pdict = dictCreate(&dbDictType,NULL); g_pserver->db[j].setexpire = new(MALLOC_LOCAL) expireset(); g_pserver->db[j].expireitr = g_pserver->db[j].setexpire->end(); @@ -3696,6 +3698,7 @@ int processCommand(client *c, int callFlags) { queueMultiCommand(c); addReply(c,shared.queued); } else { + std::unique_lockdb->lock)> ulock(c->db->lock); call(c,callFlags); c->woff = g_pserver->master_repl_offset; if (listLength(g_pserver->ready_keys)) @@ -4097,10 +4100,12 @@ sds genRedisInfoString(const char *section) { "connected_clients:%lu\r\n" "client_recent_max_input_buffer:%zu\r\n" "client_recent_max_output_buffer:%zu\r\n" - "blocked_clients:%d\r\n", + "blocked_clients:%d\r\n" + "current_client_thread:%d\r\n", listLength(g_pserver->clients)-listLength(g_pserver->slaves), maxin, maxout, - g_pserver->blocked_clients); + g_pserver->blocked_clients, + static_cast(serverTL - g_pserver->rgthreadvar)); for (int ithread = 0; ithread < cserver.cthreads; ++ithread) { info = sdscatprintf(info, diff --git a/src/server.h b/src/server.h index 62f803761..94e679dc2 100644 --- a/src/server.h +++ b/src/server.h @@ -1014,6 +1014,9 @@ typedef struct clientReplyBlock { * by integers from 0 (the default database) up to the max configured * database. The database number is the 'id' field in the structure. */ typedef struct redisDb { + redisDb() + : expireitr(nullptr) + {}; dict *pdict; /* The keyspace for this DB */ expireset *setexpire; expireset::setiter expireitr; @@ -1025,6 +1028,8 @@ typedef struct redisDb { long long last_expire_set; /* when the last expire was set */ double avg_ttl; /* Average TTL, just for stats */ list *defrag_later; /* List of key names to attempt to defrag one by one, gradually. 
*/ + + fastlock lock; } redisDb; /* Client MULTI/EXEC state */ From c75e700a86607066161c0fe9d41e3762e26208f3 Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 19 Sep 2019 15:39:52 -0400 Subject: [PATCH 67/76] Fix issue where AOF events are posted to the wrong event loop and not properly cleaned up Former-commit-id: 55324aa56a249ccbc73ffe92cac1740bfcc82ebd --- src/aof.cpp | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/aof.cpp b/src/aof.cpp index 5c6385c84..637b2ce34 100644 --- a/src/aof.cpp +++ b/src/aof.cpp @@ -97,6 +97,7 @@ void aofChildWriteDiffData(aeEventLoop *el, int fd, void *privdata, int mask) { aofrwblock *block; ssize_t nwritten; serverAssert(GlobalLocksAcquired()); + serverAssert(el == g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el); // MUST run on main thread UNUSED(el); UNUSED(fd); @@ -164,10 +165,7 @@ void aofRewriteBufferAppend(unsigned char *s, unsigned long len) { /* Install a file event to send data to the rewrite child if there is * not one already. */ - if (aeGetFileEvents(serverTL->el,g_pserver->aof_pipe_write_data_to_child) == 0) { - aeCreateFileEvent(serverTL->el, g_pserver->aof_pipe_write_data_to_child, - AE_WRITABLE, aofChildWriteDiffData, NULL); - } + aeCreateRemoteFileEvent(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, g_pserver->aof_pipe_write_data_to_child, AE_WRITABLE, aofChildWriteDiffData, NULL, FALSE); } /* Write the buffer (possibly composed of multiple blocks) into the specified @@ -1519,7 +1517,7 @@ void aofChildPipeReadable(aeEventLoop *el, int fd, void *privdata, int mask) { } /* Remove the handler since this can be called only one time during a * rewrite. */ - aeDeleteFileEventAsync(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,g_pserver->aof_pipe_read_ack_from_child,AE_READABLE); + aeDeleteFileEvent(el,g_pserver->aof_pipe_read_ack_from_child,AE_READABLE); } /* Create the pipes used for parent - child process IPC during rewrite. 
@@ -1557,12 +1555,20 @@ error: } void aofClosePipes(void) { - aeDeleteFileEventAsync(g_pserver->el_alf_pip_read_ack_from_child,g_pserver->aof_pipe_read_ack_from_child,AE_READABLE); - aeDeleteFileEventAsync(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el,g_pserver->aof_pipe_write_data_to_child,AE_WRITABLE); - close(g_pserver->aof_pipe_write_data_to_child); + int fdAofAckPipe = g_pserver->aof_pipe_read_ack_from_child; + aePostFunction(g_pserver->el_alf_pip_read_ack_from_child, [fdAofAckPipe]{ + aeDeleteFileEventAsync(serverTL->el,fdAofAckPipe,AE_READABLE); + close (fdAofAckPipe); + }); + + int fdAofWritePipe = g_pserver->aof_pipe_write_data_to_child; + aePostFunction(g_pserver->rgthreadvar[IDX_EVENT_LOOP_MAIN].el, [fdAofWritePipe]{ + aeDeleteFileEventAsync(serverTL->el,fdAofWritePipe,AE_WRITABLE); + close(fdAofWritePipe); + }); + close(g_pserver->aof_pipe_read_data_from_parent); close(g_pserver->aof_pipe_write_ack_to_parent); - close(g_pserver->aof_pipe_read_ack_from_child); close(g_pserver->aof_pipe_write_ack_to_child); close(g_pserver->aof_pipe_read_ack_from_parent); } From 4f19c5de9fb6a4fa3cbafb3b2de83fa1d02e2edf Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 26 Sep 2019 20:35:51 -0400 Subject: [PATCH 68/76] Fix multi master bugs: 1. we fail to create the temp file. 2. 
We use a master RDB as our backup even though we merged databases (and therefore it is not representative) Former-commit-id: e776474f68a2824bb7d4082c41991a9a9f3a9c9d --- src/rdb.cpp | 3 +-- src/rdb.h | 1 + src/replication.cpp | 59 ++++++++++++++++++++++++++++++++++++--------- src/server.h | 1 + 4 files changed, 50 insertions(+), 14 deletions(-) diff --git a/src/rdb.cpp b/src/rdb.cpp index 97ade6d1f..b983167a4 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2179,7 +2179,6 @@ eoferr: /* unexpected end of file is handled here with a fatal exit */ return C_ERR; /* Just to avoid warning */ } -int rdbLoadFile(char *filename, rdbSaveInfo *rsi); int rdbLoad(rdbSaveInfo *rsi) { int err = C_ERR; @@ -2199,7 +2198,7 @@ int rdbLoad(rdbSaveInfo *rsi) * * If you pass an 'rsi' structure initialied with RDB_SAVE_OPTION_INIT, the * loading code will fiil the information fields in the structure. */ -int rdbLoadFile(char *filename, rdbSaveInfo *rsi) { +int rdbLoadFile(const char *filename, rdbSaveInfo *rsi) { FILE *fp; rio rdb; int retval; diff --git a/src/rdb.h b/src/rdb.h index 0ee2cad92..edf43d422 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -136,6 +136,7 @@ int rdbLoadLenByRef(rio *rdb, int *isencoded, uint64_t *lenptr); int rdbSaveObjectType(rio *rdb, robj_roptr o); int rdbLoadObjectType(rio *rdb); int rdbLoad(rdbSaveInfo *rsi); +int rdbLoadFile(const char *filename, rdbSaveInfo *rsi); int rdbSaveBackground(rdbSaveInfo *rsi); int rdbSaveToSlavesSockets(rdbSaveInfo *rsi); void rdbRemoveTempFile(pid_t childpid); diff --git a/src/replication.cpp b/src/replication.cpp index 8ea96da1c..e817e6000 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -41,6 +41,7 @@ #include #include #include +#include void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, int newfd); @@ -1219,6 +1220,24 @@ void changeReplicationId(void) { g_pserver->replid[CONFIG_RUN_ID_SIZE] = '\0'; } + +int hexchToInt(char ch) +{ + if (ch >= '0' && ch <= '9') + 
return ch - '0'; + if (ch >= 'a' && ch <= 'f') + return (ch - 'a') + 10; + return (ch - 'A') + 10; +} +void mergeReplicationId(const char *id) +{ + for (int i = 0; i < CONFIG_RUN_ID_SIZE; ++i) + { + const char *charset = "0123456789abcdef"; + g_pserver->replid[i] = charset[hexchToInt(g_pserver->replid[i]) ^ hexchToInt(id[i])]; + } +} + /* Clear (invalidate) the secondary replication ID. This happens, for * example, after a full resynchronization, when we start a new replication * history. */ @@ -1492,12 +1511,19 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { killRDBChild(); } - if (rename(mi->repl_transfer_tmpfile,g_pserver->rdb_filename) == -1) { - serverLog(LL_WARNING,"Failed trying to rename the temp DB into %s in MASTER <-> REPLICA synchronization: %s", - g_pserver->rdb_filename, strerror(errno)); - cancelReplicationHandshake(mi); - return; + const char *rdb_filename = mi->repl_transfer_tmpfile; + + if (!fUpdate) + { + if (rename(mi->repl_transfer_tmpfile,g_pserver->rdb_filename) == -1) { + serverLog(LL_WARNING,"Failed trying to rename the temp DB into %s in MASTER <-> REPLICA synchronization: %s", + g_pserver->rdb_filename, strerror(errno)); + cancelReplicationHandshake(mi); + return; + } + rdb_filename = g_pserver->rdb_filename; } + serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: %s", fUpdate ? "Keeping old data" : "Flushing old data"); /* We need to stop any AOFRW fork before flusing and parsing * RDB, otherwise we'll create a copy-on-write disaster. 
*/ @@ -1518,7 +1544,7 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { aeDeleteFileEvent(el,mi->repl_transfer_s,AE_READABLE); serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Loading DB in memory"); rdbSaveInfo rsi = RDB_SAVE_INFO_INIT; - if (rdbLoad(&rsi) != C_OK) { + if (rdbLoadFile(rdb_filename, &rsi) != C_OK) { serverLog(LL_WARNING,"Failed trying to load the MASTER synchronization DB from disk"); cancelReplicationHandshake(mi); /* Re-enable the AOF if we disabled it earlier, in order to restore @@ -1532,11 +1558,18 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { replicationCreateMasterClient(mi, mi->repl_transfer_s,rsi.repl_stream_db); mi->repl_state = REPL_STATE_CONNECTED; mi->repl_down_since = 0; - /* After a full resynchroniziation we use the replication ID and - * offset of the master. The secondary ID / offset are cleared since - * we are starting a new history. */ - memcpy(g_pserver->replid,mi->master->replid,sizeof(g_pserver->replid)); - g_pserver->master_repl_offset = mi->master->reploff; + if (fUpdate) + { + mergeReplicationId(mi->master->replid); + } + else + { + /* After a full resynchroniziation we use the replication ID and + * offset of the master. The secondary ID / offset are cleared since + * we are starting a new history. */ + memcpy(g_pserver->replid,mi->master->replid,sizeof(g_pserver->replid)); + g_pserver->master_repl_offset = mi->master->reploff; + } clearReplicationId2(); /* Let's create the replication backlog if needed. 
Slaves need to * accumulate the backlog regardless of the fact they have sub-slaves @@ -2123,8 +2156,10 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) { /* Prepare a suitable temp file for bulk transfer */ while(maxtries--) { + auto dt = std::chrono::system_clock::now().time_since_epoch(); + auto dtMillisecond = std::chrono::duration_cast(dt); snprintf(tmpfile,256, - "temp-%d.%ld.rdb",(int)g_pserver->unixtime,(long int)getpid()); + "temp-%d.%ld.rdb",(int)dtMillisecond.count(),(long int)getpid()); dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644); if (dfd != -1) break; sleep(1); diff --git a/src/server.h b/src/server.h index 94e679dc2..dbcff38f5 100644 --- a/src/server.h +++ b/src/server.h @@ -2171,6 +2171,7 @@ long long getPsyncInitialOffset(void); int replicationSetupSlaveForFullResync(client *slave, long long offset); void changeReplicationId(void); void clearReplicationId2(void); +void mergeReplicationId(const char *); void chopReplicationBacklog(void); void replicationCacheMasterUsingMyself(struct redisMaster *mi); void feedReplicationBacklog(const void *ptr, size_t len); From cdfcc42b6d5e12ecda46be4ea4b274407601cc79 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 27 Sep 2019 13:17:29 -0400 Subject: [PATCH 69/76] Fix leaking stale RDBs during multimaster sync Former-commit-id: e1c96209510b374e644e5d7e7b6a009ed0f27c32 --- src/replication.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/replication.cpp b/src/replication.cpp index e817e6000..e440c5440 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -1553,6 +1553,8 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { return; } /* Final setup of the connected slave <- master link */ + if (fUpdate) + unlink(mi->repl_transfer_tmpfile); // if we're not updating this became the backup RDB zfree(mi->repl_transfer_tmpfile); close(mi->repl_transfer_fd); replicationCreateMasterClient(mi, mi->repl_transfer_s,rsi.repl_stream_db); From 
4cac0ca35a39d5f29be278c973b915a68f38f20b Mon Sep 17 00:00:00 2001 From: John Sully Date: Sat, 28 Sep 2019 00:10:46 -0400 Subject: [PATCH 70/76] Implement left and right shift BITOP operators Former-commit-id: ba365298ed37a76f0a8630e0ec6c86393293aebe --- src/bitops.cpp | 235 +++++++++++++++++++++++++++--------------- src/server.h | 2 +- tests/unit/bitops.tcl | 58 +++++++++++ 3 files changed, 213 insertions(+), 82 deletions(-) diff --git a/src/bitops.cpp b/src/bitops.cpp index 02034f377..bcb8840b3 100644 --- a/src/bitops.cpp +++ b/src/bitops.cpp @@ -396,6 +396,8 @@ void printBits(unsigned char *p, unsigned long count) { #define BITOP_OR 1 #define BITOP_XOR 2 #define BITOP_NOT 3 +#define BITOP_LSHIFT 4 +#define BITOP_RSHIFT 5 #define BITFIELDOP_GET 0 #define BITFIELDOP_SET 1 @@ -592,7 +594,8 @@ void bitopCommand(client *c) { char *opname = szFromObj(c->argv[1]); robj *targetkey = c->argv[2]; robj_roptr o; - unsigned long op, j, numkeys; + int op; + unsigned long j, numkeys; robj_roptr *objects; /* Array of source objects. */ unsigned char **src; /* Array of source strings pointers. 
*/ unsigned long *len, maxlen = 0; /* Array of length of src strings, @@ -609,6 +612,10 @@ void bitopCommand(client *c) { op = BITOP_XOR; else if((opname[0] == 'n' || opname[0] == 'N') && !strcasecmp(opname,"not")) op = BITOP_NOT; + else if (!strcasecmp(opname, "lshift")) + op = BITOP_LSHIFT; + else if (!strcasecmp(opname, "rshift")) + op = BITOP_RSHIFT; else { addReply(c,shared.syntaxerr); return; @@ -620,8 +627,25 @@ void bitopCommand(client *c) { return; } + bool fShiftOp = (op == BITOP_LSHIFT) || (op == BITOP_RSHIFT); + long long shift = 0; + + /* Sanity check: SHIFTS only accept a single arg and an integer */ + if (fShiftOp) { + if (c->argc != 5) { + addReplyError(c,"BITOP SHIFT must be called with a single source key and an integer shift."); + return; + } + if (getLongLongFromObject(c->argv[4], &shift) != C_OK) { + addReplyError(c, "BITOP SHIFT's last parameter must be an integer"); + return; + } + if (op == BITOP_RSHIFT) + shift = -shift; + } + /* Lookup keys, and store pointers to the string objects into an array. */ - numkeys = c->argc - 3; + numkeys = c->argc - (fShiftOp ? 4 : 3); src = (unsigned char**)zmalloc(sizeof(unsigned char*) * numkeys, MALLOC_LOCAL); len = (unsigned long*)zmalloc(sizeof(long) * numkeys, MALLOC_LOCAL); objects = (robj_roptr*)zmalloc(sizeof(robj_roptr) * numkeys, MALLOC_LOCAL); @@ -654,94 +678,143 @@ void bitopCommand(client *c) { if (j == 0 || len[j] < minlen) minlen = len[j]; } - /* Compute the bit operation, if at least one string is not empty. */ - if (maxlen) { - res = (unsigned char*) sdsnewlen(NULL,maxlen); - unsigned char output, byte; - unsigned long i; + if (fShiftOp) + { + long newlen = (long)maxlen + shift/CHAR_BIT; + if (shift > 0 && (shift % CHAR_BIT) != 0) + newlen++; - /* Fast path: as far as we have data for all the input bitmaps we - * can take a fast path that performs much better than the - * vanilla algorithm. 
On ARM we skip the fast path since it will - * result in GCC compiling the code using multiple-words load/store - * operations that are not supported even in ARM >= v6. */ - j = 0; - #ifndef USE_ALIGNED_ACCESS - if (minlen >= sizeof(unsigned long)*4 && numkeys <= 16) { - unsigned long *lp[16]; - unsigned long *lres = (unsigned long*) res; + if (newlen < 0) + newlen = 0; + + if (newlen) + { + res = (unsigned char*) sdsnewlen(NULL,newlen); + if (shift >= 0) + { // left shift + long byteoffset = shift/CHAR_BIT; + memset(res, 0, byteoffset); + long srcLen = newlen - byteoffset - ((shift % CHAR_BIT) ? 1 : 0); - /* Note: sds pointer is always aligned to 8 byte boundary. */ - memcpy(lp,src,sizeof(unsigned long*)*numkeys); - memcpy(res,src[0],minlen); - - /* Different branches per different operations for speed (sorry). */ - if (op == BITOP_AND) { - while(minlen >= sizeof(unsigned long)*4) { - for (i = 1; i < numkeys; i++) { - lres[0] &= lp[i][0]; - lres[1] &= lp[i][1]; - lres[2] &= lp[i][2]; - lres[3] &= lp[i][3]; - lp[i]+=4; - } - lres+=4; - j += sizeof(unsigned long)*4; - minlen -= sizeof(unsigned long)*4; + // now the bitshift+copy + unsigned bitshift = shift % CHAR_BIT; + unsigned char carry = 0; + for (long iSrc = 0; iSrc < srcLen; ++iSrc) + { + res[byteoffset+iSrc] = (src[0][iSrc] << bitshift) | carry; + carry = src[0][iSrc] >> (CHAR_BIT - bitshift); } - } else if (op == BITOP_OR) { - while(minlen >= sizeof(unsigned long)*4) { - for (i = 1; i < numkeys; i++) { - lres[0] |= lp[i][0]; - lres[1] |= lp[i][1]; - lres[2] |= lp[i][2]; - lres[3] |= lp[i][3]; - lp[i]+=4; - } - lres+=4; - j += sizeof(unsigned long)*4; - minlen -= sizeof(unsigned long)*4; - } - } else if (op == BITOP_XOR) { - while(minlen >= sizeof(unsigned long)*4) { - for (i = 1; i < numkeys; i++) { - lres[0] ^= lp[i][0]; - lres[1] ^= lp[i][1]; - lres[2] ^= lp[i][2]; - lres[3] ^= lp[i][3]; - lp[i]+=4; - } - lres+=4; - j += sizeof(unsigned long)*4; - minlen -= sizeof(unsigned long)*4; - } - } else if (op == 
BITOP_NOT) { - while(minlen >= sizeof(unsigned long)*4) { - lres[0] = ~lres[0]; - lres[1] = ~lres[1]; - lres[2] = ~lres[2]; - lres[3] = ~lres[3]; - lres+=4; - j += sizeof(unsigned long)*4; - minlen -= sizeof(unsigned long)*4; + if (bitshift) + res[newlen-1] = carry; + } + else + { // right shift + long byteoffset = -shift/CHAR_BIT; + unsigned bitshift = -shift % CHAR_BIT; + if (bitshift) + ++byteoffset; + res[0] = (src[0][byteoffset] << (CHAR_BIT-bitshift)); + if (byteoffset > 0) + res[0] |= (src[0][byteoffset-1] >> bitshift); + for (long idx = 1; idx < newlen; ++idx) + { + res[idx] = (src[0][byteoffset+idx] << (CHAR_BIT-bitshift)) | (src[0][byteoffset+idx-1] >> bitshift); } } } - #endif + maxlen = newlen; // this is to ensure we DEL below if newlen was 0 + } + else + { + /* Compute the bit operation, if at least one string is not empty. */ + if (maxlen) { + res = (unsigned char*) sdsnewlen(NULL,maxlen); + unsigned char output, byte; + unsigned long i; - /* j is set to the next byte to process by the previous loop. */ - for (; j < maxlen; j++) { - output = (len[0] <= j) ? 0 : src[0][j]; - if (op == BITOP_NOT) output = ~output; - for (i = 1; i < numkeys; i++) { - byte = (len[i] <= j) ? 0 : src[i][j]; - switch(op) { - case BITOP_AND: output &= byte; break; - case BITOP_OR: output |= byte; break; - case BITOP_XOR: output ^= byte; break; + /* Fast path: as far as we have data for all the input bitmaps we + * can take a fast path that performs much better than the + * vanilla algorithm. On ARM we skip the fast path since it will + * result in GCC compiling the code using multiple-words load/store + * operations that are not supported even in ARM >= v6. */ + j = 0; + #ifndef USE_ALIGNED_ACCESS + if (minlen >= sizeof(unsigned long)*4 && numkeys <= 16) { + unsigned long *lp[16]; + unsigned long *lres = (unsigned long*) res; + + /* Note: sds pointer is always aligned to 8 byte boundary. 
*/ + memcpy(lp,src,sizeof(unsigned long*)*numkeys); + memcpy(res,src[0],minlen); + + /* Different branches per different operations for speed (sorry). */ + if (op == BITOP_AND) { + while(minlen >= sizeof(unsigned long)*4) { + for (i = 1; i < numkeys; i++) { + lres[0] &= lp[i][0]; + lres[1] &= lp[i][1]; + lres[2] &= lp[i][2]; + lres[3] &= lp[i][3]; + lp[i]+=4; + } + lres+=4; + j += sizeof(unsigned long)*4; + minlen -= sizeof(unsigned long)*4; + } + } else if (op == BITOP_OR) { + while(minlen >= sizeof(unsigned long)*4) { + for (i = 1; i < numkeys; i++) { + lres[0] |= lp[i][0]; + lres[1] |= lp[i][1]; + lres[2] |= lp[i][2]; + lres[3] |= lp[i][3]; + lp[i]+=4; + } + lres+=4; + j += sizeof(unsigned long)*4; + minlen -= sizeof(unsigned long)*4; + } + } else if (op == BITOP_XOR) { + while(minlen >= sizeof(unsigned long)*4) { + for (i = 1; i < numkeys; i++) { + lres[0] ^= lp[i][0]; + lres[1] ^= lp[i][1]; + lres[2] ^= lp[i][2]; + lres[3] ^= lp[i][3]; + lp[i]+=4; + } + lres+=4; + j += sizeof(unsigned long)*4; + minlen -= sizeof(unsigned long)*4; + } + } else if (op == BITOP_NOT) { + while(minlen >= sizeof(unsigned long)*4) { + lres[0] = ~lres[0]; + lres[1] = ~lres[1]; + lres[2] = ~lres[2]; + lres[3] = ~lres[3]; + lres+=4; + j += sizeof(unsigned long)*4; + minlen -= sizeof(unsigned long)*4; + } } } - res[j] = output; + #endif + + /* j is set to the next byte to process by the previous loop. */ + for (; j < maxlen; j++) { + output = (len[0] <= j) ? 0 : src[0][j]; + if (op == BITOP_NOT) output = ~output; + for (i = 1; i < numkeys; i++) { + byte = (len[i] <= j) ? 
0 : src[i][j]; + switch(op) { + case BITOP_AND: output &= byte; break; + case BITOP_OR: output |= byte; break; + case BITOP_XOR: output ^= byte; break; + } + } + res[j] = output; + } } } for (j = 0; j < numkeys; j++) { diff --git a/src/server.h b/src/server.h index dbcff38f5..6a2bda9fa 100644 --- a/src/server.h +++ b/src/server.h @@ -729,7 +729,7 @@ typedef struct redisObject { * LFU data (least significant 8 bits frequency * and most significant 16 bits access time). */ private: - mutable std::atomic refcount; + mutable std::atomic refcount {0}; public: uint64_t mvcc_tstamp; void *m_ptr; diff --git a/tests/unit/bitops.tcl b/tests/unit/bitops.tcl index 926f38295..f8a5cbe18 100644 --- a/tests/unit/bitops.tcl +++ b/tests/unit/bitops.tcl @@ -214,6 +214,64 @@ start_server {tags {"bitops"}} { r bitop or x a b } {32} + test {BITOP lshift size} { + r set a " " + r bitop lshift x a 1 + } {2} + + test {BITOP rshift size} { + r set a " " + r bitop rshift x a 1 + } {1} + + test {BITOP rshift 0 byte} { + r set a " " + r bitop rshift x a 8 + } {0} + + test {BITOP rshift underflow} { + r set a " " + r bitop rshift x a 65 + } {0} + + test {BITOP lshift string} { + r set a "abcdefg" + r bitop lshift x a 8 + r get x + } "\x00abcdefg" + + test {BITOP lshift char} { + r set a "\xAA" + r bitop lshift x a 4 + r get x + } "\xA0\x0A" + + test {BITOP rshift char} { + r set a "\xAA" + r bitop rshift x a 3 + r get x + } "\x15" + + test {BITOP lshift carry} { + r set a "\xFF" + r bitop lshift x a 1 + r get x + } "\xFE\x01" + + test {BITOP rshift carry} { + r set a "\x00\xFF" + r bitop rshift x a 1 + r get x + } "\x80\x7F" + + test {BITOP rshift reciprocal} { + r flushdb + r set a "abcdefg" + r bitop lshift b a 14 + r bitop rshift res b 14 + r get res + } "abcdefg\x00" + test {BITPOS bit=0 with empty key returns 0} { r del str r bitpos str 0 From 4db6193052ad49d2a603541c85bbbfa44eb5c41b Mon Sep 17 00:00:00 2001 From: John Sully Date: Sat, 28 Sep 2019 14:59:44 -0400 Subject: [PATCH 71/76] 
RREPLAY command now takes a DB argument Former-commit-id: 6e1e5bd08b59f8ad4653621a6c01fcf3a76f0692 --- src/debug.cpp | 6 +++-- src/replication.cpp | 22 ++++++++++++++++- tests/integration/replication-active.tcl | 13 ++++++++++ tests/test_helper.tcl | 1 + tests/unit/rreplay.tcl | 30 ++++++++++++++++++++++++ 5 files changed, 69 insertions(+), 3 deletions(-) create mode 100644 tests/unit/rreplay.tcl diff --git a/src/debug.cpp b/src/debug.cpp index 3246f9d19..3a4520776 100644 --- a/src/debug.cpp +++ b/src/debug.cpp @@ -682,10 +682,12 @@ NULL changeReplicationId(); clearReplicationId2(); addReply(c,shared.ok); - } else if (!strcasecmp(szFromObj(c->argv[1]),"stringmatch-test") && c->argc == 2) - { + } else if (!strcasecmp(szFromObj(c->argv[1]),"stringmatch-test") && c->argc == 2) { stringmatchlen_fuzz_test(); addReplyStatus(c,"Apparently Redis did not crash: test passed"); + } else if (!strcasecmp(szFromObj(c->argv[1]), "force-master") && c->argc == 2) { + c->flags |= CLIENT_MASTER | CLIENT_MASTER_FORCE_REPLY; + addReply(c, shared.ok); } else { addReplySubcommandSyntaxError(c); return; diff --git a/src/replication.cpp b/src/replication.cpp index e440c5440..5cd527259 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -323,9 +323,13 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { char uuid[40] = {'\0'}; uuid_unparse(cserver.uuid, uuid); char proto[1024]; - int cchProto = snprintf(proto, sizeof(proto), "*3\r\n$7\r\nRREPLAY\r\n$%d\r\n%s\r\n$%lld\r\n", (int)strlen(uuid), uuid, cchbuf); + int cchProto = snprintf(proto, sizeof(proto), "*4\r\n$7\r\nRREPLAY\r\n$%d\r\n%s\r\n$%lld\r\n", (int)strlen(uuid), uuid, cchbuf); cchProto = std::min((int)sizeof(proto), cchProto); long long master_repl_offset_start = g_pserver->master_repl_offset; + + serverAssert(dictid >= 0); + char szDbNum[128]; + int cchDbNum = snprintf(szDbNum, sizeof(szDbNum), "$%d\r\n%d\r\n", (dictid/10)+1, dictid); /* Write the command to the replication backlog if any. 
*/ if (g_pserver->repl_backlog) @@ -368,6 +372,7 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { } const char *crlf = "\r\n"; feedReplicationBacklog(crlf, 2); + feedReplicationBacklog(szDbNum, cchDbNum); } } @@ -396,7 +401,10 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { addReplyProtoAsync(slave, reply->buf(), reply->used); } if (!fSendRaw) + { addReplyAsync(slave,shared.crlf); + addReplyProtoAsync(slave, szDbNum, cchDbNum); + } } freeClient(fake); @@ -3266,6 +3274,7 @@ void replicaReplayCommand(client *c) // the replay command contains two arguments: // 1: The UUID of the source // 2: The raw command buffer to be replayed + // 3: (OPTIONAL) the database ID the command should apply to if (!(c->flags & CLIENT_MASTER)) { @@ -3298,6 +3307,17 @@ void replicaReplayCommand(client *c) return; } + if (c->argc >= 4) + { + long long db; + if (getLongLongFromObject(c->argv[3], &db) != C_OK || db >= cserver.dbnum || selectDb(c, (int)db) != C_OK) + { + addReplyError(c, "Invalid database ID"); + s_pstate->Cancel(); + return; + } + } + if (FSameUuidNoNil(uuid, cserver.uuid)) { addReply(c, shared.ok); diff --git a/tests/integration/replication-active.tcl b/tests/integration/replication-active.tcl index 99e0dc006..2ba761766 100644 --- a/tests/integration/replication-active.tcl +++ b/tests/integration/replication-active.tcl @@ -93,5 +93,18 @@ start_server {tags {"active-repl"} overrides {active-replica yes}} { assert_equal {0} [$master del testkey1] assert_equal {0} [$slave del testkey1] } + + test {Active replica different databases} { + $master select 3 + $master set testkey abcd + $master select 2 + $master del testkey + $slave select 3 + wait_for_condition 50 1000 { + [string match abcd [$slave get testkey]] + } else { + fail "Replication failed to propogate DB 3" + } + } } } diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 6abbddbbe..a06afca3e 100644 --- a/tests/test_helper.tcl +++ 
b/tests/test_helper.tcl @@ -35,6 +35,7 @@ set ::all_tests { unit/quit unit/aofrw unit/acl + unit/rreplay integration/block-repl integration/replication integration/replication-2 diff --git a/tests/unit/rreplay.tcl b/tests/unit/rreplay.tcl new file mode 100644 index 000000000..2029f521d --- /dev/null +++ b/tests/unit/rreplay.tcl @@ -0,0 +1,30 @@ +start_server {tags {"rreplay"}} { + + test {RREPLAY use current db} { + r debug force-master + r select 4 + r set dbnum invalid + r rreplay "f4d5b2b5-4f07-4ee5-a4f2-5dc98507dfce" "*3\r\n\$3\r\nSET\r\n\$5\r\ndbnum\r\n\$4\r\nfour\r\n" + r get dbnum + } {four} + reconnect + + test {RREPLAY db different} { + r debug force-master + r select 4 + r set testkey four + r rreplay "f4d5b2b5-4f07-4ee5-a4f2-5dc98507dfce" "*3\r\n\$3\r\nSET\r\n\$7\r\ntestkey\r\n\$4\r\nbebe\r\n" 2 + r select 4 + assert { [r get testkey] == "four" } + r select 2 + r get testkey + } {bebe} + + reconnect + + test {RREPLAY not master} { + assert_error "*master*" {r rreplay "f4d5b2b5-4f07-4ee5-a4f2-5dc98507dfce" "*3\r\n\$3\r\nSET\r\n\$7\r\ntestkey\r\n\$4\r\nbebe\r\n" 2} + } + + r flushdb +} From b01cf739999682182fef0dca9a05bdc5131d3fad Mon Sep 17 00:00:00 2001 From: John Sully Date: Sat, 28 Sep 2019 17:04:08 -0400 Subject: [PATCH 72/76] RDB fuzz test cases Former-commit-id: 6df4d294220ed914c592f3bd195ae921d6a84cde --- fuzz/rdb/dict.txt | 13 +++++++++++++ fuzz/rdb/testcases/empty.rdb | Bin 0 -> 92 bytes fuzz/rdb/testcases/expire.rdb | Bin 0 -> 152 bytes fuzz/rdb/testcases/expireonload.rdb | Bin 0 -> 152 bytes fuzz/rdb/testcases/integer.rdb | Bin 0 -> 144 bytes fuzz/rdb/testcases/integer_shared.rdb | Bin 0 -> 141 bytes fuzz/rdb/testcases/multidb.rdb | Bin 0 -> 143 bytes fuzz/rdb/testcases/replica.rdb | Bin 0 -> 175 bytes fuzz/rdb/testcases/set.rdb | Bin 0 -> 151 bytes fuzz/rdb/testcases/string.rdb | Bin 0 -> 143 bytes src/server.cpp | 4 ++++ 11 files changed, 17 insertions(+) create mode 100644 fuzz/rdb/dict.txt create mode 100644 fuzz/rdb/testcases/empty.rdb 
create mode 100644 fuzz/rdb/testcases/expire.rdb create mode 100644 fuzz/rdb/testcases/expireonload.rdb create mode 100644 fuzz/rdb/testcases/integer.rdb create mode 100644 fuzz/rdb/testcases/integer_shared.rdb create mode 100644 fuzz/rdb/testcases/multidb.rdb create mode 100644 fuzz/rdb/testcases/replica.rdb create mode 100644 fuzz/rdb/testcases/set.rdb create mode 100644 fuzz/rdb/testcases/string.rdb diff --git a/fuzz/rdb/dict.txt b/fuzz/rdb/dict.txt new file mode 100644 index 000000000..d854f3319 --- /dev/null +++ b/fuzz/rdb/dict.txt @@ -0,0 +1,13 @@ +="repl-stream-db" +="repl-id" +="repl-offset" +="lua" +="redis-ver" +="ctime" +="used-mem" +="aof-preamble" +="redis-bits" +="mvcc-tstamp" +="keydb-subexpire-key" +="keydb-subexpire-when" + diff --git a/fuzz/rdb/testcases/empty.rdb b/fuzz/rdb/testcases/empty.rdb new file mode 100644 index 0000000000000000000000000000000000000000..c490bd4c497d1b6b9badf4c2bf963717d786255f GIT binary patch literal 92 zcmWG?b@2=~Ffg$E#aWb^l3A=n?bZfBwzi#xY0Ia#-) zxFj*RK-kdC#MIK%#K6GN)XdDp*x1OzfPuXvwYVfZwURk4KmUJJ&6?etj4G!A0L=n7 Aq5uE@ literal 0 HcmV?d00001 diff --git a/fuzz/rdb/testcases/expireonload.rdb b/fuzz/rdb/testcases/expireonload.rdb new file mode 100644 index 0000000000000000000000000000000000000000..4e98f5f81b5240ee8d3a08c61df506ee900d3e84 GIT binary patch literal 152 zcmWG?b@2=~Ffg$E#aWb^l3A=n?bZfWME-oX=-3@Vr*c-z+RGCT#}tydC0zWN2w&VPR}yYG7f(z+RGCT#}tyd4Tc%e@k`Fn?bZfn?bZf 0 && cserver.cthreads <= MAX_EVENT_LOOPS); pthread_t rgthread[MAX_EVENT_LOOPS]; for (int iel = 0; iel < cserver.cthreads; ++iel) From 3a8d13760ad2b474d4d5e5414e23894cbb7c9a7d Mon Sep 17 00:00:00 2001 From: John Sully Date: Sat, 28 Sep 2019 17:38:26 -0400 Subject: [PATCH 73/76] Fix double free in RDB load Former-commit-id: 070c4818715b56645855abb72af47c846fc63027 --- src/rdb.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/rdb.cpp b/src/rdb.cpp index b983167a4..ee61b3f19 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2107,7 +2107,10 @@ int 
rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { /* Read key */ if (key != nullptr) + { decrRefCount(key); + key = nullptr; + } if ((key = rdbLoadStringObject(rdb)) == NULL) goto eoferr; /* Read value */ @@ -2119,7 +2122,9 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { * snapshot taken by the master may not be reflected on the slave. */ if (listLength(g_pserver->masters) == 0 && !loading_aof && expiretime != -1 && expiretime < now) { decrRefCount(key); + key = nullptr; decrRefCount(val); + val = nullptr; } else { /* Add the new object in the hash table */ int fInserted = dbMerge(db, key, val, rsi->fForceSetKey); // Note: dbMerge will incrRef @@ -2136,6 +2141,7 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { else { decrRefCount(val); + val = nullptr; } } From 311b3502ae0d2fea1f84da89fea8c334c362c36a Mon Sep 17 00:00:00 2001 From: John Sully Date: Sat, 28 Sep 2019 17:38:46 -0400 Subject: [PATCH 74/76] finish up the RDB fuzz Former-commit-id: 0942188b0a263d4b7de013458e5a65dc2d3b206a --- src/server.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/server.cpp b/src/server.cpp index 96e1cc0f2..fda6d5db7 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -5206,6 +5206,20 @@ int main(int argc, char **argv) { #endif moduleLoadFromQueue(); ACLLoadUsersAtStartup(); + + // special case of FUZZING load from stdin then quit + if (strstr(argv[0],"keydb-fuzz-rdb") != NULL) + { + serverAssert(GlobalLocksAcquired()); + rio rdb; + rdbSaveInfo rsi = RDB_SAVE_INFO_INIT; + startLoading(stdin); + rioInitWithFile(&rdb,stdin); + rdbLoadRio(&rdb,&rsi,0); + stopLoading(); + return EXIT_SUCCESS; + } + loadDataFromDisk(); if (g_pserver->cluster_enabled) { if (verifyClusterConfigWithData() == C_ERR) { @@ -5242,11 +5256,7 @@ int main(int argc, char **argv) { aeReleaseLock(); //Finally we can dump the lock moduleReleaseGIL(true); - - // If we're just fuzzing then we've already loaded the RDB so just quit 
successfully - if (strstr(argv[0],"keydb-fuzz-rdb") != NULL) - return EXIT_SUCCESS; - + serverAssert(cserver.cthreads > 0 && cserver.cthreads <= MAX_EVENT_LOOPS); pthread_t rgthread[MAX_EVENT_LOOPS]; for (int iel = 0; iel < cserver.cthreads; ++iel) From c18bba8535a7045845457f924c7e0c381806b2e8 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 30 Sep 2019 00:00:29 -0400 Subject: [PATCH 75/76] Fuzz mode is now an argument not based on the exe name Former-commit-id: 551a3cb1aa1c5799ec351dd2bef97adf0517b954 --- src/config.cpp | 2 ++ src/server.cpp | 13 +++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index 36485c7be..0519e44ff 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -800,6 +800,8 @@ void loadServerConfigFromString(char *config) { serverLog(LL_WARNING, "Warning version is overriden to: %s\n", KEYDB_SET_VERSION); } else if (!strcasecmp(argv[0],"testmode") && argc == 2){ g_fTestMode = yesnotoi(argv[1]); + } else if (!strcasecmp(argv[0],"rdbfuzz-mode")) { + // NOP, handled in main } else { err = "Bad directive or wrong number of arguments"; goto loaderr; } diff --git a/src/server.cpp b/src/server.cpp index fda6d5db7..4e0239a8d 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -4852,6 +4852,12 @@ void redisOutOfMemoryHandler(size_t allocation_size) { serverPanic("Redis aborting for OUT OF MEMORY"); } +void fuzzOutOfMemoryHandler(size_t allocation_size) { + serverLog(LL_WARNING,"Out Of Memory allocating %zu bytes!", + allocation_size); + exit(EXIT_FAILURE); // don't crash because it causes false positives +} + void redisSetProcTitle(const char *title) { #ifdef USE_SETPROCTITLE const char *server_mode = ""; @@ -5208,9 +5214,12 @@ int main(int argc, char **argv) { ACLLoadUsersAtStartup(); // special case of FUZZING load from stdin then quit - if (strstr(argv[0],"keydb-fuzz-rdb") != NULL) + if (argc > 1 && strstr(argv[1],"rdbfuzz-mode") != NULL) { - serverAssert(GlobalLocksAcquired()); + 
zmalloc_set_oom_handler(fuzzOutOfMemoryHandler); +#ifdef __AFL_HAVE_MANUAL_CONTROL + __AFL_INIT(); +#endif rio rdb; rdbSaveInfo rsi = RDB_SAVE_INFO_INIT; startLoading(stdin); From b11d3c4b82f9dcf0f69516e56cd24359a4e75fed Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 30 Sep 2019 00:01:23 -0400 Subject: [PATCH 76/76] Expire DEL command place holder Former-commit-id: 64aa4b80e62fac75d5b5598515585c425472c537 --- src/server.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/server.cpp b/src/server.cpp index 4e0239a8d..2db38dceb 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -229,6 +229,10 @@ struct redisCommand redisCommandTable[] = { "write @keyspace", 0,NULL,1,-1,1,0,0,0}, + {"expdel",delCommand,-2, + "write @keyspace", + 0,NULL,1,-1,1,0,0,0}, + {"unlink",unlinkCommand,-2, "write fast @keyspace", 0,NULL,1,-1,1,0,0,0},