diff --git a/CONTRIBUTING b/CONTRIBUTING
index 7dee24c74..000edbeaf 100644
--- a/CONTRIBUTING
+++ b/CONTRIBUTING
@@ -14,9 +14,7 @@ each source file that you contribute.
  PLEASE DO NOT POST GENERAL QUESTIONS that are not about bugs or suspected
  bugs in the Github issues system. We'll be very happy to help you and provide
-  all the support at the Reddit sub:
-
-  http://reddit.com/r/redis
+  all the support in the mailing list.

  There is also an active community of Redis users at Stack Overflow:

@@ -24,7 +22,12 @@ each source file that you contribute.

# How to provide a patch for a new feature

-1. If it is a major feature or a semantical change, please post it as a new submission in r/redis on Reddit at http://reddit.com/r/redis. Try to be passionate about why the feature is needed, make users upvote your proposal to gain traction and so forth. Read feedbacks about the community. But in this first step **please don't write code yet**.
+1. If it is a major feature or a semantic change, please don't start coding
+straight away: if your feature is not a conceptual fit you'll lose a lot of
+time writing code for no reason. Start by posting in the mailing list
+and creating an issue at GitHub with a description of exactly what you want
+to accomplish and why. Use cases are important for features to be accepted.
+Here you'll see if there is consensus about your idea.

2. If in step 1 you get an acknowledgment from the project leaders, use the
   following procedure to submit a patch:
@@ -35,6 +38,13 @@ each source file that you contribute.
   d. Initiate a pull request on github ( https://help.github.com/articles/creating-a-pull-request/ )
   e. Done :)

-For minor fixes just open a pull request on Github.
+3. Keep in mind that we are very overloaded, so issues and PRs sometimes wait
+for a *very* long time. However, this is not a lack of interest: as the project
+gets more and more users, we find ourselves in constant need to prioritize
+certain issues/PRs over others. If you think your issue/PR is very important,
+try to popularize it, have other users comment and share their point of
+view, and so forth. This helps.
+
+4. For minor fixes just open a pull request on GitHub.

Thanks!

diff --git a/README.md b/README.md
index 1f23f2c04..18db3c875 100644
--- a/README.md
+++ b/README.md
@@ -2,40 +2,47 @@

[![Build Status](https://travis-ci.org/JohnSully/KeyDB.svg?branch=unstable)](https://travis-ci.org/JohnSully/KeyDB) [![Join the chat at https://gitter.im/KeyDB/community](https://badges.gitter.im/KeyDB/community.svg)](https://gitter.im/KeyDB/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![StackShare](http://img.shields.io/badge/tech-stack-0690fa.svg?style=flat)](https://stackshare.io/eq-alpha-technology-inc/eq-alpha-technology-inc)

+##### Need Help? Check out our extensive [documentation](https://docs.keydb.dev).
+
What is KeyDB?
--------------

-KeyDB is a high performance fork of Redis focusing on multithreading, memory efficiency, and high throughput. In addition to multithreading KeyDB also has features only available in Redis Enterprise such as [Active Replication](https://github.com/JohnSully/KeyDB/wiki/Active-Replication), [FLASH storage](https://github.com/JohnSully/KeyDB/wiki/FLASH-Storage) support, and some not available at all such as direct backup to AWS S3.
+KeyDB is a high performance fork of Redis with a focus on multithreading, memory efficiency, and high throughput.
In addition to multithreading, KeyDB also has features only available in Redis Enterprise such as [Active Replication](https://github.com/JohnSully/KeyDB/wiki/Active-Replication), [FLASH storage](https://github.com/JohnSully/KeyDB/wiki/FLASH-Storage) support, and some not available at all such as direct backup to AWS S3.

-On the same hardware KeyDB can perform twice as many queries per second as Redis, with 60% lower latency.

+KeyDB maintains full compatibility with the Redis protocol, modules, and scripts. This includes the atomicity guarantees for scripts and transactions. Because KeyDB keeps in sync with Redis development, KeyDB is a superset of Redis functionality, making it a drop-in replacement for existing Redis deployments.
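As a concrete illustration of the compatibility claim above, a module written against the stock Redis modules API should build and load on KeyDB unchanged. A minimal sketch (the module and command names here are hypothetical, not part of this patch):

```cpp
/* hellokeydb.cpp - toy module; compiles against the standard redismodule.h */
#include "redismodule.h"

int PingCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
    REDISMODULE_NOT_USED(argv);
    REDISMODULE_NOT_USED(argc);
    return RedisModule_ReplyWithSimpleString(ctx, "PONG");
}

extern "C" int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
    REDISMODULE_NOT_USED(argv);
    REDISMODULE_NOT_USED(argc);
    if (RedisModule_Init(ctx, "hellokeydb", 1, REDISMODULE_APIVER_1) == REDISMODULE_ERR)
        return REDISMODULE_ERR;
    if (RedisModule_CreateCommand(ctx, "hellokeydb.ping", PingCommand,
                                  "fast", 0, 0, 0) == REDISMODULE_ERR)
        return REDISMODULE_ERR;
    return REDISMODULE_OK;
}
```

Loading it with `loadmodule /path/to/hellokeydb.so` should behave identically on Redis and KeyDB.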
-KeyDB has full compatibility with the Redis protocol, modules, and scripts. This includes full support for transactions, and atomic execution of scripts. For more information see our architecture section below.

+On the same hardware KeyDB can perform twice as many queries per second as Redis, with 60% lower latency. Active-Replication simplifies hot-spare failover, allowing you to easily distribute writes over replicas and use simple TCP-based load balancing/failover. KeyDB's higher performance allows you to do more on less hardware, which reduces operational costs and complexity.
+
+
+
+Why fork Redis?
+---------------
+
+KeyDB has a different philosophy on how the codebase should evolve. We feel that ease of use, high performance, and a "batteries included" approach is the best way to create a good user experience. While we have great respect for the Redis maintainers, it is our opinion that the Redis approach focuses too much on simplicity of the code base at the expense of complexity for the user. This results in the need for external components and workarounds to solve common problems, resulting in more complexity overall.
+
+Because of this difference of opinion, features that are right for KeyDB may not be appropriate for Redis. A fork allows us to explore this new development path and implement features which may never be a part of Redis. KeyDB keeps in sync with upstream Redis changes, and where applicable we upstream bug fixes and changes. It is our hope that the two projects can continue to grow and learn from each other.
+
+Additional Resources
+--------------------

Try our docker container: https://hub.docker.com/r/eqalpha/keydb

Talk on Gitter: https://gitter.im/KeyDB

+Visit our Website: https://keydb.dev
+
+See options for channel partners and support contracts: https://keydb.dev/support.html
+
+Learn with KeyDB’s official documentation site: https://docs.keydb.dev
+
[Subscribe to the KeyDB mailing list](https://eqalpha.us20.list-manage.com/subscribe/post?u=978f486c2f95589b24591a9cc&id=4ab9220500)

Management GUI: We recommend [FastoNoSQL](https://fastonosql.com/) which has official KeyDB support.

-New: Active Replica Support
----------------------------
-
-New! KeyDB now has support for Active Replicas. This feature greatly simplifies hot-spare failover and allows you to distribute writes over replicas instead of just a single master. For more information [see the wiki page](https://github.com/JohnSully/KeyDB/wiki/Active-Replication).
-
-Why fork Redis?
----------------
-
-The Redis maintainers have continually reiterated that they do not plan to support multithreading. While we have great respect for the redis team, we feel the analysis justifying this decision is incorrect. In addition we wanted open source implementations of features currently only available in proprietary modules. We feel a fork is the best way to accelerate development in the areas of most interest to us.
-
-We plan to track the Redis repo closely and hope our projects can learn from each other.

Benchmarking KeyDB
------------------
-
-
 Please note keydb-benchmark and redis-benchmark are currently single threaded and too slow to properly benchmark KeyDB. We recommend using a redis cluster benchmark tool such as [memtier](https://github.com/RedisLabs/memtier_benchmark). Please ensure your machine has enough cores for both KeyDB and memtier if testing locally. KeyDB expects exclusive use of any cores assigned to it.

For more details on how we benchmarked KeyDB along with performance numbers check out our blog post: [Redis Should Be Multithreaded](https://medium.com/@john_63123/redis-should-be-multi-threaded-e28319cab744?source=friends_link&sk=7ce8e9fe3ec8224a4d27ef075d085457)

@@ -58,6 +65,10 @@ If you would like to use the [FLASH backed](https://github.com/JohnSully/KeyDB/w

If you would like KeyDB to dump and load directly to AWS S3 this option specifies the bucket. Using this option with the traditional RDB options will result in KeyDB backing up twice to both locations. If both are specified KeyDB will first attempt to load from the local dump file and if that fails load from S3. This requires the AWS CLI tools to be installed and configured which are used under the hood to transfer the data.

+    active-replica yes
+
+If you are using active-active replication, set the `active-replica` option to `yes`. This will enable both instances to accept reads and writes while remaining synced. [Click here](https://docs.keydb.dev/docs/active-rep/) to see more on active replication in our docs section. There are also [docker examples](https://docs.keydb.dev/docs/docker-active-rep/) in the docs.
+
All other configuration options behave as you'd expect. Your existing configuration files should continue to work unchanged.

Building KeyDB
--------------

@@ -67,16 +78,19 @@ KeyDB can be compiled and is tested for use on Linux. KeyDB currently relies on

Install dependencies:

-    % sudo apt install build-essential nasm autotools-dev autoconf libjemalloc-dev tcl tcl-dev uuid-dev
+    % sudo apt install build-essential nasm autotools-dev autoconf libjemalloc-dev tcl tcl-dev uuid-dev libcurl4-openssl-dev

Compiling is as simple as:

    % make

-You can enable flash support with (Note: autoconf and autotools must be installed):
+You can enable flash support with:

    % make MALLOC=memkind

+***Note that the following dependencies may be needed:***
+    % sudo apt-get install autoconf autotools-dev libnuma-dev libtool
+
Fixing build problems with dependencies or cached build options
---------

@@ -179,7 +193,7 @@ then in another terminal try the following:
    (integer) 2
    keydb>

-You can find the list of all the available commands at http://redis.io/commands.
+You can find the list of all the available commands at https://docs.keydb.dev/docs/commands/

Installing KeyDB
-----------------

@@ -222,23 +236,18 @@ Future work:

Docker Build
------------
-
-Run the following commands for a full source download and build:
-
+Build the latest binaries from the GitHub unstable branch within a Docker container. Note this is built for Ubuntu 18.04.
+Simply make a directory you would like to have the latest binaries dumped in, then run the following command with your updated path:
```
-git clone git@github.com:JohnSully/KeyDB.git
-docker run -it --rm -v `pwd`/KeyDB:/build -w /build devopsdood/keydb-builder make
+$ docker run -it --rm -v /path-to-dump-binaries:/keydb_bin eqalpha/keydb-build-bin
```
+You should receive the following files: keydb-benchmark, keydb-check-aof, keydb-check-rdb, keydb-cli, keydb-sentinel, keydb-server

-Then you have fresh binaries built, you can also pass any other options to the make command above after the word make. E.g.
-
-```docker run -it --rm -v `pwd`/KeyDB:/build -w /build devopsdood/keydb-builder make MAllOC=memkind```
-
-The above commands will build you binaries in the src directory. Standard `make install` without Docker command will work after if you wish to install
-
-If you'd prefer you can build the Dockerfile in the repo instead of pulling the above container for use:
-
-`docker build -t KeyDB .`
+If you are looking to enable flash support with the build (make MALLOC=memkind), then use the following command:
```
+$ docker run -it --rm -v /path-to-dump-binaries:/keydb_bin eqalpha/keydb-build-bin:flash
```
+Please note that you will need libcurl4-openssl-dev in order to run KeyDB. With the flash version you may need libnuma-dev and libtool installed in order to run the binaries. Keep this in mind especially when running in a container. For a copy of all our Dockerfiles, please see the [docs](https://docs.keydb.dev/docs/dockerfiles/).

Code contributions
-----------------

@@ -252,3 +261,4 @@ source distribution.

Please see the CONTRIBUTING file in this source
distribution for more information.
+
diff --git a/fuzz/rdb/dict.txt b/fuzz/rdb/dict.txt
new file mode 100644
index 000000000..d854f3319
--- /dev/null
+++ b/fuzz/rdb/dict.txt
@@ -0,0 +1,13 @@
+="repl-stream-db"
+="repl-id"
+="repl-offset"
+="lua"
+="redis-ver"
+="ctime"
+="used-mem"
+="aof-preamble"
+="redis-bits"
+="mvcc-tstamp"
+="keydb-subexpire-key"
+="keydb-subexpire-when"
+
diff --git a/fuzz/rdb/testcases/empty.rdb b/fuzz/rdb/testcases/empty.rdb
new file mode 100644
index 000000000..c490bd4c4
Binary files /dev/null and b/fuzz/rdb/testcases/empty.rdb differ
diff --git a/fuzz/rdb/testcases/expire.rdb b/fuzz/rdb/testcases/expire.rdb
new file mode 100644
index 000000000..291d25214
Binary files /dev/null and b/fuzz/rdb/testcases/expire.rdb differ
diff --git a/fuzz/rdb/testcases/expireonload.rdb b/fuzz/rdb/testcases/expireonload.rdb
new file mode 100644
index 000000000..4e98f5f81
Binary files /dev/null and b/fuzz/rdb/testcases/expireonload.rdb differ
diff --git a/fuzz/rdb/testcases/integer.rdb b/fuzz/rdb/testcases/integer.rdb
new file mode 100644
index 000000000..379ad2e21
Binary files /dev/null and b/fuzz/rdb/testcases/integer.rdb differ
diff --git a/fuzz/rdb/testcases/integer_shared.rdb b/fuzz/rdb/testcases/integer_shared.rdb
new file mode 100644
index 000000000..aff442e75
Binary files /dev/null and b/fuzz/rdb/testcases/integer_shared.rdb differ
diff --git a/fuzz/rdb/testcases/multidb.rdb b/fuzz/rdb/testcases/multidb.rdb
new file mode 100644
index 000000000..0481143b8
Binary files /dev/null and b/fuzz/rdb/testcases/multidb.rdb differ
diff --git a/fuzz/rdb/testcases/replica.rdb b/fuzz/rdb/testcases/replica.rdb
new file mode 100644
index 000000000..52ec1a962
Binary files /dev/null and b/fuzz/rdb/testcases/replica.rdb differ
diff --git a/fuzz/rdb/testcases/set.rdb b/fuzz/rdb/testcases/set.rdb
new file mode 100644
index 000000000..9f760eafd
Binary files /dev/null and b/fuzz/rdb/testcases/set.rdb differ diff --git a/fuzz/rdb/testcases/string.rdb b/fuzz/rdb/testcases/string.rdb new file mode 100644 index 000000000..b3b5f6dc5 Binary files /dev/null and b/fuzz/rdb/testcases/string.rdb differ diff --git a/src/Makefile b/src/Makefile index 3a9365f78..be4c07369 100644 --- a/src/Makefile +++ b/src/Makefile @@ -209,7 +209,7 @@ endif REDIS_SERVER_NAME=keydb-server REDIS_SENTINEL_NAME=keydb-sentinel -REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o acl.o storage.o rdb-s3.o fastlock.o new.o $(ASM_OBJ) +REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o acl.o storage.o rdb-s3.o fastlock.o new.o tracking.o $(ASM_OBJ) REDIS_CLI_NAME=keydb-cli REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o redis-cli-cpphelper.o zmalloc.o release.o anet.o ae.o crc64.o siphash.o crc16.o storage-lite.o fastlock.o new.o $(ASM_OBJ) REDIS_BENCHMARK_NAME=keydb-benchmark diff --git a/src/acl.cpp b/src/acl.cpp index 259d9fb61..7493300d5 100644 --- a/src/acl.cpp +++ b/src/acl.cpp @@ -295,7 +295,7 @@ int ACLGetCommandBitCoordinates(uint64_t id, uint64_t *word, uint64_t *bit) { * Note that this function does not check the ALLCOMMANDS flag of the user * but just the lowlevel bitmask. * - * If the bit overflows the user internal represetation, zero is returned + * If the bit overflows the user internal representation, zero is returned * in order to disallow the execution of the command in such edge case. */ int ACLGetUserCommandBit(user *u, unsigned long id) { uint64_t word, bit; @@ -311,7 +311,7 @@ int ACLUserCanExecuteFutureCommands(user *u) { } /* Set the specified command bit for the specified user to 'value' (0 or 1). - * If the bit overflows the user internal represetation, no operation + * If the bit overflows the user internal representation, no operation * is performed. As a side effect of calling this function with a value of * zero, the user flag ALLCOMMANDS is cleared since it is no longer possible * to skip the command bit explicit test. */ @@ -350,7 +350,7 @@ int ACLSetUserCommandBitsForCategory(user *u, const char *category, int value) { /* Return the number of commands allowed (on) and denied (off) for the user 'u' * in the subset of commands flagged with the specified category name. 
- * If the categoty name is not valid, C_ERR is returend, otherwise C_OK is + * If the category name is not valid, C_ERR is returned, otherwise C_OK is * returned and on and off are populated by reference. */ int ACLCountCategoryBitsForUser(user *u, unsigned long *on, unsigned long *off, const char *category) @@ -626,7 +626,7 @@ void ACLAddAllowedSubcommand(user *u, unsigned long id, const char *sub) { * It is possible to specify multiple patterns. * allkeys Alias for ~* * resetkeys Flush the list of allowed keys patterns. - * > Add this passowrd to the list of valid password for the user. + * > Add this password to the list of valid password for the user. * For example >mypass will add "mypass" to the list. * This directive clears the "nopass" flag (see later). * < Remove this password from the list of valid passwords. @@ -949,9 +949,9 @@ user *ACLGetUserByName(const char *name, size_t namelen) { return (user*)myuser; } -/* Check if the command ready to be excuted in the client 'c', and already - * referenced by c->cmd, can be executed by this client according to the - * ACls associated to the client user c->user. +/* Check if the command is ready to be executed in the client 'c', already + * referenced by c->cmd, and can be executed by this client according to the + * ACLs associated to the client user c->user. * * If the user can execute the command ACL_OK is returned, otherwise * ACL_DENIED_CMD or ACL_DENIED_KEY is returned: the first in case the @@ -1122,7 +1122,7 @@ int ACLLoadConfiguredUsers(void) { } /* This function loads the ACL from the specified filename: every line - * is validated and shold be either empty or in the format used to specify + * is validated and should be either empty or in the format used to specify * users in the redis.conf configuration or in the ACL file, that is: * * user ... rules ... @@ -1172,7 +1172,7 @@ sds ACLLoadFromFile(const char *filename) { * to the real user mentioned in the ACL line. */ user *fakeuser = ACLCreateUnlinkedUser(); - /* We do all the loading in a fresh insteance of the Users radix tree, + /* We do all the loading in a fresh instance of the Users radix tree, * so if there are errors loading the ACL file we can rollback to the * old version. */ rax *old_users = Users; @@ -1248,7 +1248,7 @@ sds ACLLoadFromFile(const char *filename) { } /* Note that the same rules already applied to the fake user, so - * we just assert that everything goess well: it should. */ + * we just assert that everything goes well: it should. 
*/ for (j = 2; j < argc; j++) serverAssert(ACLSetUser(u,argv[j],sdslen(argv[j])) == C_OK); @@ -1611,7 +1611,7 @@ void addReplyCommandCategories(client *c, struct redisCommand *cmd) { setDeferredSetLen(c, flaglen, flagcount); } -/* AUTH +/* AUTH * AUTH (Redis >= 6.0 form) * * When the user is omitted it means that we are trying to authenticate diff --git a/src/aelocker.h b/src/aelocker.h index d5c8186bf..eca15f491 100644 --- a/src/aelocker.h +++ b/src/aelocker.h @@ -61,6 +61,11 @@ public: return m_fArmed; } + void release() + { + m_fArmed = false; + } + ~AeLocker() { if (m_fArmed) diff --git a/src/aof.cpp b/src/aof.cpp index 69e32fc3b..637b2ce34 100644 --- a/src/aof.cpp +++ b/src/aof.cpp @@ -1319,13 +1319,12 @@ int rewriteAppendOnlyFileRio(rio *aof) { while((de = dictNext(di)) != NULL) { sds keystr; robj key, *o; - long long expiretime; keystr = (sds)dictGetKey(de); o = (robj*)dictGetVal(de); initStaticStringObject(key,keystr); - expiretime = getExpire(db,&key); + expireEntry *pexpire = getExpire(db,&key); /* Save the key and associated value */ if (o->type == OBJ_STRING) { @@ -1351,11 +1350,23 @@ int rewriteAppendOnlyFileRio(rio *aof) { serverPanic("Unknown object type"); } /* Save the expire time */ - if (expiretime != -1) { - char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n"; - if (rioWrite(aof,cmd,sizeof(cmd)-1) == 0) goto werr; - if (rioWriteBulkObject(aof,&key) == 0) goto werr; - if (rioWriteBulkLongLong(aof,expiretime) == 0) goto werr; + if (pexpire != nullptr) { + for (auto &subExpire : *pexpire) { + if (subExpire.subkey() == nullptr) + { + char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n"; + if (rioWrite(aof,cmd,sizeof(cmd)-1) == 0) goto werr; + if (rioWriteBulkObject(aof,&key) == 0) goto werr; + } + else + { + char cmd[]="*4\r\n$12\r\nEXPIREMEMBER\r\n"; + if (rioWrite(aof,cmd,sizeof(cmd)-1) == 0) goto werr; + if (rioWriteBulkObject(aof,&key) == 0) goto werr; + if (rioWrite(aof,subExpire.subkey(),sdslen(subExpire.subkey())) == 0) goto werr; + } + if (rioWriteBulkLongLong(aof,subExpire.when()) == 0) goto werr; // common + } } /* Read some diff from the parent process from time to time. 
*/ if (aof->processed_bytes > processed+AOF_READ_DIFF_INTERVAL_BYTES) { diff --git a/src/bio.cpp b/src/bio.cpp index 62f6615a6..97fa7cf18 100644 --- a/src/bio.cpp +++ b/src/bio.cpp @@ -85,7 +85,7 @@ struct bio_job { void *bioProcessBackgroundJobs(void *arg); void lazyfreeFreeObjectFromBioThread(robj *o); -void lazyfreeFreeDatabaseFromBioThread(dict *ht1, dict *ht2); +void lazyfreeFreeDatabaseFromBioThread(dict *ht1, expireset *set); void lazyfreeFreeSlotsMapFromBioThread(rax *rt); /* Make sure we have enough stack to perform all the things we do in the @@ -196,7 +196,7 @@ void *bioProcessBackgroundJobs(void *arg) { if (job->arg1) lazyfreeFreeObjectFromBioThread((robj*)job->arg1); else if (job->arg2 && job->arg3) - lazyfreeFreeDatabaseFromBioThread((dict*)job->arg2,(dict*)job->arg3); + lazyfreeFreeDatabaseFromBioThread((dict*)job->arg2,(expireset*)job->arg3); else if (job->arg3) lazyfreeFreeSlotsMapFromBioThread((rax*)job->arg3); } else { diff --git a/src/bitops.cpp b/src/bitops.cpp index 02034f377..bcb8840b3 100644 --- a/src/bitops.cpp +++ b/src/bitops.cpp @@ -396,6 +396,8 @@ void printBits(unsigned char *p, unsigned long count) { #define BITOP_OR 1 #define BITOP_XOR 2 #define BITOP_NOT 3 +#define BITOP_LSHIFT 4 +#define BITOP_RSHIFT 5 #define BITFIELDOP_GET 0 #define BITFIELDOP_SET 1 @@ -592,7 +594,8 @@ void bitopCommand(client *c) { char *opname = szFromObj(c->argv[1]); robj *targetkey = c->argv[2]; robj_roptr o; - unsigned long op, j, numkeys; + int op; + unsigned long j, numkeys; robj_roptr *objects; /* Array of source objects. */ unsigned char **src; /* Array of source strings pointers. */ unsigned long *len, maxlen = 0; /* Array of length of src strings, @@ -609,6 +612,10 @@ void bitopCommand(client *c) { op = BITOP_XOR; else if((opname[0] == 'n' || opname[0] == 'N') && !strcasecmp(opname,"not")) op = BITOP_NOT; + else if (!strcasecmp(opname, "lshift")) + op = BITOP_LSHIFT; + else if (!strcasecmp(opname, "rshift")) + op = BITOP_RSHIFT; else { addReply(c,shared.syntaxerr); return; @@ -620,8 +627,25 @@ void bitopCommand(client *c) { return; } + bool fShiftOp = (op == BITOP_LSHIFT) || (op == BITOP_RSHIFT); + long long shift = 0; + + /* Sanity check: SHIFTS only accept a single arg and an integer */ + if (fShiftOp) { + if (c->argc != 5) { + addReplyError(c,"BITOP SHIFT must be called with a single source key and an integer shift."); + return; + } + if (getLongLongFromObject(c->argv[4], &shift) != C_OK) { + addReplyError(c, "BITOP SHIFT's last parameter must be an integer"); + return; + } + if (op == BITOP_RSHIFT) + shift = -shift; + } + /* Lookup keys, and store pointers to the string objects into an array. */ - numkeys = c->argc - 3; + numkeys = c->argc - (fShiftOp ? 4 : 3); src = (unsigned char**)zmalloc(sizeof(unsigned char*) * numkeys, MALLOC_LOCAL); len = (unsigned long*)zmalloc(sizeof(long) * numkeys, MALLOC_LOCAL); objects = (robj_roptr*)zmalloc(sizeof(robj_roptr) * numkeys, MALLOC_LOCAL); @@ -654,94 +678,143 @@ void bitopCommand(client *c) { if (j == 0 || len[j] < minlen) minlen = len[j]; } - /* Compute the bit operation, if at least one string is not empty. */ - if (maxlen) { - res = (unsigned char*) sdsnewlen(NULL,maxlen); - unsigned char output, byte; - unsigned long i; + if (fShiftOp) + { + long newlen = (long)maxlen + shift/CHAR_BIT; + if (shift > 0 && (shift % CHAR_BIT) != 0) + newlen++; - /* Fast path: as far as we have data for all the input bitmaps we - * can take a fast path that performs much better than the - * vanilla algorithm. 
On ARM we skip the fast path since it will - * result in GCC compiling the code using multiple-words load/store - * operations that are not supported even in ARM >= v6. */ - j = 0; - #ifndef USE_ALIGNED_ACCESS - if (minlen >= sizeof(unsigned long)*4 && numkeys <= 16) { - unsigned long *lp[16]; - unsigned long *lres = (unsigned long*) res; + if (newlen < 0) + newlen = 0; + + if (newlen) + { + res = (unsigned char*) sdsnewlen(NULL,newlen); + if (shift >= 0) + { // left shift + long byteoffset = shift/CHAR_BIT; + memset(res, 0, byteoffset); + long srcLen = newlen - byteoffset - ((shift % CHAR_BIT) ? 1 : 0); - /* Note: sds pointer is always aligned to 8 byte boundary. */ - memcpy(lp,src,sizeof(unsigned long*)*numkeys); - memcpy(res,src[0],minlen); - - /* Different branches per different operations for speed (sorry). */ - if (op == BITOP_AND) { - while(minlen >= sizeof(unsigned long)*4) { - for (i = 1; i < numkeys; i++) { - lres[0] &= lp[i][0]; - lres[1] &= lp[i][1]; - lres[2] &= lp[i][2]; - lres[3] &= lp[i][3]; - lp[i]+=4; - } - lres+=4; - j += sizeof(unsigned long)*4; - minlen -= sizeof(unsigned long)*4; + // now the bitshift+copy + unsigned bitshift = shift % CHAR_BIT; + unsigned char carry = 0; + for (long iSrc = 0; iSrc < srcLen; ++iSrc) + { + res[byteoffset+iSrc] = (src[0][iSrc] << bitshift) | carry; + carry = src[0][iSrc] >> (CHAR_BIT - bitshift); } - } else if (op == BITOP_OR) { - while(minlen >= sizeof(unsigned long)*4) { - for (i = 1; i < numkeys; i++) { - lres[0] |= lp[i][0]; - lres[1] |= lp[i][1]; - lres[2] |= lp[i][2]; - lres[3] |= lp[i][3]; - lp[i]+=4; - } - lres+=4; - j += sizeof(unsigned long)*4; - minlen -= sizeof(unsigned long)*4; - } - } else if (op == BITOP_XOR) { - while(minlen >= sizeof(unsigned long)*4) { - for (i = 1; i < numkeys; i++) { - lres[0] ^= lp[i][0]; - lres[1] ^= lp[i][1]; - lres[2] ^= lp[i][2]; - lres[3] ^= lp[i][3]; - lp[i]+=4; - } - lres+=4; - j += sizeof(unsigned long)*4; - minlen -= sizeof(unsigned long)*4; - } - } else if (op == BITOP_NOT) { - while(minlen >= sizeof(unsigned long)*4) { - lres[0] = ~lres[0]; - lres[1] = ~lres[1]; - lres[2] = ~lres[2]; - lres[3] = ~lres[3]; - lres+=4; - j += sizeof(unsigned long)*4; - minlen -= sizeof(unsigned long)*4; + if (bitshift) + res[newlen-1] = carry; + } + else + { // right shift + long byteoffset = -shift/CHAR_BIT; + unsigned bitshift = -shift % CHAR_BIT; + if (bitshift) + ++byteoffset; + res[0] = (src[0][byteoffset] << (CHAR_BIT-bitshift)); + if (byteoffset > 0) + res[0] |= (src[0][byteoffset-1] >> bitshift); + for (long idx = 1; idx < newlen; ++idx) + { + res[idx] = (src[0][byteoffset+idx] << (CHAR_BIT-bitshift)) | (src[0][byteoffset+idx-1] >> bitshift); } } } - #endif + maxlen = newlen; // this is to ensure we DEL below if newlen was 0 + } + else + { + /* Compute the bit operation, if at least one string is not empty. */ + if (maxlen) { + res = (unsigned char*) sdsnewlen(NULL,maxlen); + unsigned char output, byte; + unsigned long i; - /* j is set to the next byte to process by the previous loop. */ - for (; j < maxlen; j++) { - output = (len[0] <= j) ? 0 : src[0][j]; - if (op == BITOP_NOT) output = ~output; - for (i = 1; i < numkeys; i++) { - byte = (len[i] <= j) ? 0 : src[i][j]; - switch(op) { - case BITOP_AND: output &= byte; break; - case BITOP_OR: output |= byte; break; - case BITOP_XOR: output ^= byte; break; + /* Fast path: as far as we have data for all the input bitmaps we + * can take a fast path that performs much better than the + * vanilla algorithm. 
On ARM we skip the fast path since it will + * result in GCC compiling the code using multiple-words load/store + * operations that are not supported even in ARM >= v6. */ + j = 0; + #ifndef USE_ALIGNED_ACCESS + if (minlen >= sizeof(unsigned long)*4 && numkeys <= 16) { + unsigned long *lp[16]; + unsigned long *lres = (unsigned long*) res; + + /* Note: sds pointer is always aligned to 8 byte boundary. */ + memcpy(lp,src,sizeof(unsigned long*)*numkeys); + memcpy(res,src[0],minlen); + + /* Different branches per different operations for speed (sorry). */ + if (op == BITOP_AND) { + while(minlen >= sizeof(unsigned long)*4) { + for (i = 1; i < numkeys; i++) { + lres[0] &= lp[i][0]; + lres[1] &= lp[i][1]; + lres[2] &= lp[i][2]; + lres[3] &= lp[i][3]; + lp[i]+=4; + } + lres+=4; + j += sizeof(unsigned long)*4; + minlen -= sizeof(unsigned long)*4; + } + } else if (op == BITOP_OR) { + while(minlen >= sizeof(unsigned long)*4) { + for (i = 1; i < numkeys; i++) { + lres[0] |= lp[i][0]; + lres[1] |= lp[i][1]; + lres[2] |= lp[i][2]; + lres[3] |= lp[i][3]; + lp[i]+=4; + } + lres+=4; + j += sizeof(unsigned long)*4; + minlen -= sizeof(unsigned long)*4; + } + } else if (op == BITOP_XOR) { + while(minlen >= sizeof(unsigned long)*4) { + for (i = 1; i < numkeys; i++) { + lres[0] ^= lp[i][0]; + lres[1] ^= lp[i][1]; + lres[2] ^= lp[i][2]; + lres[3] ^= lp[i][3]; + lp[i]+=4; + } + lres+=4; + j += sizeof(unsigned long)*4; + minlen -= sizeof(unsigned long)*4; + } + } else if (op == BITOP_NOT) { + while(minlen >= sizeof(unsigned long)*4) { + lres[0] = ~lres[0]; + lres[1] = ~lres[1]; + lres[2] = ~lres[2]; + lres[3] = ~lres[3]; + lres+=4; + j += sizeof(unsigned long)*4; + minlen -= sizeof(unsigned long)*4; + } } } - res[j] = output; + #endif + + /* j is set to the next byte to process by the previous loop. */ + for (; j < maxlen; j++) { + output = (len[0] <= j) ? 0 : src[0][j]; + if (op == BITOP_NOT) output = ~output; + for (i = 1; i < numkeys; i++) { + byte = (len[i] <= j) ? 0 : src[i][j]; + switch(op) { + case BITOP_AND: output &= byte; break; + case BITOP_OR: output |= byte; break; + case BITOP_XOR: output ^= byte; break; + } + } + res[j] = output; + } } } for (j = 0; j < numkeys; j++) { diff --git a/src/cluster.cpp b/src/cluster.cpp index 79cb0972d..619ce3b3a 100644 --- a/src/cluster.cpp +++ b/src/cluster.cpp @@ -4949,7 +4949,7 @@ void restoreCommand(client *c) { dbAdd(c->db,c->argv[1],obj); if (ttl) { if (!absttl) ttl+=mstime(); - setExpire(c,c->db,c->argv[1],ttl); + setExpire(c,c->db,c->argv[1],nullptr,ttl); } objectSetLRUOrLFU(obj,lfu_freq,lru_idle,lru_clock); signalModifiedKey(c->db,c->argv[1]); @@ -5194,7 +5194,10 @@ try_again: /* Create RESTORE payload and generate the protocol to call the command. 
*/
    for (j = 0; j < num_keys; j++) {
        long long ttl = 0;
-        long long expireat = getExpire(c->db,kv[j]);
+        expireEntry *pexpire = getExpire(c->db,kv[j]);
+        long long expireat = -1;
+        if (pexpire != nullptr)
+            pexpire->FGetPrimaryExpire(&expireat);

        if (expireat != -1) {
            ttl = expireat-mstime();
diff --git a/src/compactvector.h b/src/compactvector.h
new file mode 100644
index 000000000..daa8ad9fc
--- /dev/null
+++ b/src/compactvector.h
@@ -0,0 +1,153 @@
+#pragma once
+
+#include <assert.h>
+#include <type_traits>
+
+/*************************************************
+ * compactvector - similar to std::vector but optimized for minimal memory
+ *
+ * Notable differences:
+ * - Limited to 2^32 elements
+ * - Grows linearly not exponentially
+ *
+ *************************************************/
+
+template<typename T, bool MEMMOVE_SAFE = false>
+class compactvector
+{
+    static_assert(MEMMOVE_SAFE || std::is_trivially_copyable<T>::value, "compactvector requires trivially copyable types");
+    T *m_data = nullptr;
+    unsigned m_celem = 0;
+    unsigned m_max = 0;
+
+public:
+    typedef T* iterator;
+
+    compactvector() noexcept = default;
+    ~compactvector() noexcept
+    {
+        clear();    // call dtors
+        zfree(m_data);
+    }
+
+    compactvector(compactvector &) noexcept = delete;
+
+    compactvector(compactvector &&src) noexcept
+    {
+        m_data = src.m_data;
+        m_celem = src.m_celem;
+        m_max = src.m_max;
+        src.m_data = nullptr;
+        src.m_celem = 0;
+        src.m_max = 0;
+    }
+
+    compactvector &operator=(const compactvector&) noexcept = delete;
+    compactvector &operator=(compactvector &&src) noexcept
+    {
+        zfree(m_data);
+        m_data = src.m_data;
+        m_celem = src.m_celem;
+        m_max = src.m_max;
+        src.m_data = nullptr;
+        src.m_celem = 0;
+        src.m_max = 0;
+        return *this;
+    }
+
+    inline T* begin() { return m_data; }
+    inline const T* begin() const { return m_data; }
+
+    inline T* end() { return m_data + m_celem; }
+    inline const T* end() const { return m_data + m_celem; }
+
+    T* insert(T* where, T &val)
+    {
+        assert(where >= m_data);
+        size_t idx = where - m_data;
+        if (m_celem >= m_max)
+        {
+            if (m_max < 2)
+                m_max = 2;
+            else
+                m_max = m_max + 4;
+
+            m_data = (T*)zrealloc(m_data, sizeof(T) * m_max, MALLOC_LOCAL);
+            m_max = zmalloc_usable(m_data) / sizeof(T);
+        }
+        assert(idx < m_max);
+        where = m_data + idx;
+        memmove(reinterpret_cast<void*>(m_data + idx + 1), reinterpret_cast<void*>(m_data + idx), (m_celem - idx)*sizeof(T));
+        new(m_data + idx) T(std::move(val));
+        ++m_celem;
+        return where;
+    }
+
+    T &operator[](size_t idx)
+    {
+        assert(idx < m_celem);
+        return m_data[idx];
+    }
+    const T &operator[](size_t idx) const
+    {
+        assert(idx < m_celem);
+        return m_data[idx];
+    }
+
+    T& back() { assert(m_celem > 0); return m_data[m_celem-1]; }
+    const T& back() const { assert(m_celem > 0); return m_data[m_celem-1]; }
+
+    void erase(T* where)
+    {
+        assert(where >= m_data);
+        size_t idx = where - m_data;
+        assert(idx < m_celem);
+        where->~T();
+        memmove(reinterpret_cast<void*>(where), reinterpret_cast<void*>(where+1), ((m_celem - idx - 1)*sizeof(T)));
+        --m_celem;
+
+        if (m_celem == 0)
+        {
+            zfree(m_data);
+            m_data = nullptr;
+            m_max = 0;
+        }
+    }
+
+    void shrink_to_fit()
+    {
+        if (m_max == m_celem)
+            return;
+        m_data = (T*)zrealloc(m_data, sizeof(T) * m_celem, MALLOC_LOCAL);
+        m_max = m_celem;    // NOTE: We do not get the usable size here, because this could cause us to continually realloc
+    }
+
+    size_t bytes_used() const
+    {
+        return sizeof(this) + (m_max * sizeof(T));
+    }
+
+    void clear()
+    {
+        for (size_t idx = 0; idx < m_celem; ++idx)
+            m_data[idx].~T();
+        zfree(m_data);
+        m_data = nullptr;
+        m_celem = 0;
+        m_max = 0;
+    }
+
+    bool empty() const noexcept
+    {
+        return m_celem == 0;
+    }
+
+    size_t size() const noexcept
+    {
+        return m_celem;
+    }
+
+    T* data() noexcept { return m_data; }
+    const T* data() const noexcept { return m_data; }
+};
+static_assert(sizeof(compactvector<void*>) <= 16, "not compact");
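Since `compactvector` is a new header, a usage sketch may help; this is hypothetical test code rather than part of the patch, and it assumes KeyDB's `zmalloc.h` (which provides the `zmalloc`/`zrealloc`/`zmalloc_usable` calls the container uses internally) is available and linked in:

```cpp
#include <algorithm>
#include <cstdio>
#include "zmalloc.h"         // assumption: supplies zmalloc/zrealloc/MALLOC_LOCAL
#include "compactvector.h"

int main() {
    compactvector<int> v;    // MEMMOVE_SAFE defaults to false; int is trivially copyable

    // insert() takes a position and an lvalue; std::lower_bound keeps it sorted
    for (int x : {5, 1, 3})
        v.insert(std::lower_bound(v.begin(), v.end(), x), x);

    v.erase(v.begin());      // drop the smallest element; the rest memmove down
    v.shrink_to_fit();       // reallocate down to exactly size() elements
    std::printf("%zu elements, %zu bytes\n", v.size(), v.bytes_used());
    return 0;
}
```

The linear (rather than exponential) growth policy costs more reallocations on large collections, but keeps per-collection memory headroom small, which suits many small per-key structures.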
diff --git a/src/config.cpp b/src/config.cpp
index fd284e5d8..0519e44ff 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -100,6 +100,47 @@ clientBufferLimitsConfig clientBufferLimitsDefaults[CLIENT_TYPE_OBUF_COUNT] = {
    {1024*1024*32, 1024*1024*8, 60}  /* pubsub */
};

+/* Configuration values that require no special handling to set, get, load or
+ * rewrite. */
+typedef struct configYesNo {
+    const char *name; /* The user visible name of this config */
+    const char *alias; /* An alias that can also be used for this config */
+    int *config; /* The pointer to the server config this value is stored in */
+    const int modifiable; /* Can this value be updated by CONFIG SET? */
+    const int default_value; /* The default value of the config on rewrite */
+} configYesNo;
+
+configYesNo configs_yesno[] = {
+    /* Non-Modifiable */
+    {"rdbchecksum",NULL,&g_pserver->rdb_checksum,0,CONFIG_DEFAULT_RDB_CHECKSUM},
+    {"daemonize",NULL,&cserver.daemonize,0,0},
+    {"always-show-logo",NULL,&g_pserver->always_show_logo,0,CONFIG_DEFAULT_ALWAYS_SHOW_LOGO},
+    /* Modifiable */
+    {"protected-mode",NULL,&g_pserver->protected_mode,1,CONFIG_DEFAULT_PROTECTED_MODE},
+    {"rdbcompression",NULL,&g_pserver->rdb_compression,1,CONFIG_DEFAULT_RDB_COMPRESSION},
+    {"activerehashing",NULL,&g_pserver->activerehashing,1,CONFIG_DEFAULT_ACTIVE_REHASHING},
+    {"stop-writes-on-bgsave-error",NULL,&g_pserver->stop_writes_on_bgsave_err,1,CONFIG_DEFAULT_STOP_WRITES_ON_BGSAVE_ERROR},
+    {"dynamic-hz",NULL,&g_pserver->dynamic_hz,1,CONFIG_DEFAULT_DYNAMIC_HZ},
+    {"lazyfree-lazy-eviction",NULL,&g_pserver->lazyfree_lazy_eviction,1,CONFIG_DEFAULT_LAZYFREE_LAZY_EVICTION},
+    {"lazyfree-lazy-expire",NULL,&g_pserver->lazyfree_lazy_expire,1,CONFIG_DEFAULT_LAZYFREE_LAZY_EXPIRE},
+    {"lazyfree-lazy-server-del",NULL,&g_pserver->lazyfree_lazy_server_del,1,CONFIG_DEFAULT_LAZYFREE_LAZY_SERVER_DEL},
+    {"repl-disable-tcp-nodelay",NULL,&g_pserver->repl_disable_tcp_nodelay,1,CONFIG_DEFAULT_REPL_DISABLE_TCP_NODELAY},
+    {"repl-diskless-sync",NULL,&g_pserver->repl_diskless_sync,1,CONFIG_DEFAULT_REPL_DISKLESS_SYNC},
+    {"aof-rewrite-incremental-fsync",NULL,&g_pserver->aof_rewrite_incremental_fsync,1,CONFIG_DEFAULT_AOF_REWRITE_INCREMENTAL_FSYNC},
+    {"no-appendfsync-on-rewrite",NULL,&g_pserver->aof_no_fsync_on_rewrite,1,CONFIG_DEFAULT_AOF_NO_FSYNC_ON_REWRITE},
+    {"cluster-require-full-coverage",NULL,&g_pserver->cluster_require_full_coverage,1,CLUSTER_DEFAULT_REQUIRE_FULL_COVERAGE},
+    {"rdb-save-incremental-fsync",NULL,&g_pserver->rdb_save_incremental_fsync,1,CONFIG_DEFAULT_RDB_SAVE_INCREMENTAL_FSYNC},
+    {"aof-load-truncated",NULL,&g_pserver->aof_load_truncated,1,CONFIG_DEFAULT_AOF_LOAD_TRUNCATED},
+    {"aof-use-rdb-preamble",NULL,&g_pserver->aof_use_rdb_preamble,1,CONFIG_DEFAULT_AOF_USE_RDB_PREAMBLE},
+    {"cluster-replica-no-failover","cluster-slave-no-failover",&g_pserver->cluster_slave_no_failover,1,CLUSTER_DEFAULT_SLAVE_NO_FAILOVER},
+    {"replica-lazy-flush","slave-lazy-flush",&g_pserver->repl_slave_lazy_flush,1,CONFIG_DEFAULT_SLAVE_LAZY_FLUSH},
+    {"replica-serve-stale-data","slave-serve-stale-data",&g_pserver->repl_serve_stale_data,1,CONFIG_DEFAULT_SLAVE_SERVE_STALE_DATA},
+    {"replica-read-only","slave-read-only",&g_pserver->repl_slave_ro,1,CONFIG_DEFAULT_SLAVE_READ_ONLY},
+    {"replica-ignore-maxmemory","slave-ignore-maxmemory",&g_pserver->repl_slave_ignore_maxmemory,1,CONFIG_DEFAULT_SLAVE_IGNORE_MAXMEMORY},
+    {"multi-master",NULL,&g_pserver->enable_multimaster,false,CONFIG_DEFAULT_ENABLE_MULTIMASTER},
+    {NULL, NULL, NULL, 0, 0}
+};
+
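The table-driven pattern above is easiest to see in miniature. Below is a self-contained model of the same idea (the option names and globals are illustrative, not KeyDB's): one generic lookup replaces a hand-written else-if branch per boolean option in the load, `CONFIG SET`, `CONFIG GET`, and rewrite paths.

```cpp
#include <cstdio>
#include <strings.h>   // strcasecmp

struct configYesNo { const char *name; const char *alias; int *config; int modifiable; int default_value; };

static int protected_mode = 1;
static int daemonize = 0;

static configYesNo configs_yesno[] = {
    {"protected-mode", nullptr, &protected_mode, 1, 1},
    {"daemonize",      nullptr, &daemonize,      0, 0},   // not settable at runtime
    {nullptr, nullptr, nullptr, 0, 0}
};

static int yesnotoi(const char *s) {
    if (!strcasecmp(s, "yes")) return 1;
    if (!strcasecmp(s, "no"))  return 0;
    return -1;
}

// Generic CONFIG SET handler for every yes/no option in the table.
static bool setYesNoConfig(const char *name, const char *value) {
    for (configYesNo *c = configs_yesno; c->name != nullptr; c++) {
        if (strcasecmp(name, c->name) && (!c->alias || strcasecmp(name, c->alias)))
            continue;
        int v = yesnotoi(value);
        if (v == -1 || !c->modifiable) return false;
        *(c->config) = v;
        return true;
    }
    return false;   // unknown option
}

int main() {
    std::printf("%d\n", setYesNoConfig("protected-mode", "no"));  // 1: value updated
    std::printf("%d\n", setYesNoConfig("daemonize", "yes"));      // 0: not modifiable
    std::printf("protected_mode=%d\n", protected_mode);           // 0
}
```

Adding a new yes/no option then becomes a single table row instead of four separate code sites.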
 /*-----------------------------------------------------------------------------
 * Enum access functions
 *----------------------------------------------------------------------------*/
@@ -203,6 +244,26 @@ void loadServerConfigFromString(char *config) {
        }
        sdstolower(argv[0]);

+        /* Iterate the configs that are standard */
+        int match = 0;
+        for (configYesNo *config = configs_yesno; config->name != NULL; config++) {
+            if ((!strcasecmp(argv[0],config->name) ||
+                (config->alias && !strcasecmp(argv[0],config->alias))) &&
+                (argc == 2))
+            {
+                if ((*(config->config) = yesnotoi(argv[1])) == -1) {
+                    err = "argument must be 'yes' or 'no'"; goto loaderr;
+                }
+                match = 1;
+                break;
+            }
+        }
+
+        if (match) {
+            sdsfreesplitres(argv,argc);
+            continue;
+        }
+
        /* Execute config directives */
        if (!strcasecmp(argv[0],"timeout") && argc == 2) {
            cserver.maxidletime = atoi(argv[1]);
@@ -214,10 +275,6 @@
            if (cserver.tcpkeepalive < 0) {
                err = "Invalid tcp-keepalive value"; goto loaderr;
            }
-        } else if (!strcasecmp(argv[0],"protected-mode") && argc == 2) {
-            if ((g_pserver->protected_mode = yesnotoi(argv[1])) == -1) {
-                err = "argument must be 'yes' or 'no'"; goto loaderr;
-            }
        } else if (!strcasecmp(argv[0],"port") && argc == 2) {
            g_pserver->port = atoi(argv[1]);
            if (g_pserver->port < 0 || g_pserver->port > 65535) {
@@ -288,10 +345,6 @@
        } else if (!strcasecmp(argv[0],"aclfile") && argc == 2) {
            zfree(g_pserver->acl_filename);
            g_pserver->acl_filename = zstrdup(argv[1]);
-        } else if (!strcasecmp(argv[0],"always-show-logo") && argc == 2) {
-            if ((g_pserver->always_show_logo = yesnotoi(argv[1])) == -1) {
-                err = "argument must be 'yes' or 'no'"; goto loaderr;
-            }
        } else if (!strcasecmp(argv[0],"syslog-enabled") && argc == 2) {
            if ((g_pserver->syslog_enabled = yesnotoi(argv[1])) == -1) {
                err = "argument must be 'yes' or 'no'"; goto loaderr;
            }
@@ -368,14 +421,6 @@
                err = "repl-timeout must be 1 or greater";
                goto loaderr;
            }
-        } else if (!strcasecmp(argv[0],"repl-disable-tcp-nodelay") && argc==2) {
-            if ((g_pserver->repl_disable_tcp_nodelay = yesnotoi(argv[1])) == -1) {
-                err = "argument must be 'yes' or 'no'"; goto loaderr;
-            }
-        } else if (!strcasecmp(argv[0],"repl-diskless-sync") && argc==2) {
-            if ((g_pserver->repl_diskless_sync = yesnotoi(argv[1])) == -1) {
-                err = "argument must be 'yes' or 'no'"; goto loaderr;
-            }
        } else if (!strcasecmp(argv[0],"repl-diskless-sync-delay") && argc==2) {
            g_pserver->repl_diskless_sync_delay = atoi(argv[1]);
            if (g_pserver->repl_diskless_sync_delay < 0) {
@@ -403,57 +448,6 @@
            cserver.default_masterauth = argv[1][0] ?
zstrdup(argv[1]) : NULL; // Loop through all existing master infos and update them (in case this came after the replicaof config) updateMasterAuth(); - } else if ((!strcasecmp(argv[0],"slave-serve-stale-data") || - !strcasecmp(argv[0],"replica-serve-stale-data")) - && argc == 2) - { - if ((g_pserver->repl_serve_stale_data = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if ((!strcasecmp(argv[0],"slave-read-only") || - !strcasecmp(argv[0],"replica-read-only")) - && argc == 2) - { - if ((g_pserver->repl_slave_ro = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if ((!strcasecmp(argv[0],"slave-ignore-maxmemory") || - !strcasecmp(argv[0],"replica-ignore-maxmemory")) - && argc == 2) - { - if ((g_pserver->repl_slave_ignore_maxmemory = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) { - if ((g_pserver->rdb_compression = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"rdbchecksum") && argc == 2) { - if ((g_pserver->rdb_checksum = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) { - if ((g_pserver->activerehashing = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"lazyfree-lazy-eviction") && argc == 2) { - if ((g_pserver->lazyfree_lazy_eviction = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"lazyfree-lazy-expire") && argc == 2) { - if ((g_pserver->lazyfree_lazy_expire = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"lazyfree-lazy-server-del") && argc == 2){ - if ((g_pserver->lazyfree_lazy_server_del = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if ((!strcasecmp(argv[0],"slave-lazy-flush") || - !strcasecmp(argv[0],"replica-lazy-flush")) && argc == 2) - { - if ((g_pserver->repl_slave_lazy_flush = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } } else if (!strcasecmp(argv[0],"activedefrag") && argc == 2) { if ((cserver.active_defrag_enabled = yesnotoi(argv[1])) == -1) { err = "argument must be 'yes' or 'no'"; goto loaderr; @@ -463,14 +457,6 @@ void loadServerConfigFromString(char *config) { err = "active defrag can't be enabled without proper jemalloc support"; goto loaderr; #endif } - } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) { - if ((cserver.daemonize = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"dynamic-hz") && argc == 2) { - if ((g_pserver->dynamic_hz = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } } else if (!strcasecmp(argv[0],"hz") && argc == 2) { g_pserver->config_hz = atoi(argv[1]); if (g_pserver->config_hz < CONFIG_MIN_HZ) g_pserver->config_hz = CONFIG_MIN_HZ; @@ -489,11 +475,6 @@ void loadServerConfigFromString(char *config) { } zfree(g_pserver->aof_filename); g_pserver->aof_filename = zstrdup(argv[1]); - } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite") - && argc == 2) { - if ((g_pserver->aof_no_fsync_on_rewrite= yesnotoi(argv[1])) == -1) { - err = "argument 
must be 'yes' or 'no'"; goto loaderr; - } } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) { g_pserver->aof_fsync = configEnumGetValue(aof_fsync_enum,argv[1]); if (g_pserver->aof_fsync == INT_MIN) { @@ -512,28 +493,6 @@ void loadServerConfigFromString(char *config) { argc == 2) { g_pserver->aof_rewrite_min_size = memtoll(argv[1],NULL); - } else if (!strcasecmp(argv[0],"aof-rewrite-incremental-fsync") && - argc == 2) - { - if ((g_pserver->aof_rewrite_incremental_fsync = - yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"rdb-save-incremental-fsync") && - argc == 2) - { - if ((g_pserver->rdb_save_incremental_fsync = - yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"aof-load-truncated") && argc == 2) { - if ((g_pserver->aof_load_truncated = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"aof-use-rdb-preamble") && argc == 2) { - if ((g_pserver->aof_use_rdb_preamble = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) { if (strlen(argv[1]) > CONFIG_AUTHPASS_MAX_LEN) { err = "Password is longer than CONFIG_AUTHPASS_MAX_LEN"; @@ -670,13 +629,6 @@ void loadServerConfigFromString(char *config) { { err = "Invalid port"; goto loaderr; } - } else if (!strcasecmp(argv[0],"cluster-require-full-coverage") && - argc == 2) - { - if ((g_pserver->cluster_require_full_coverage = yesnotoi(argv[1])) == -1) - { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } } else if (!strcasecmp(argv[0],"cluster-node-timeout") && argc == 2) { g_pserver->cluster_node_timeout = strtoll(argv[1],NULL,10); if (g_pserver->cluster_node_timeout <= 0) { @@ -699,15 +651,6 @@ void loadServerConfigFromString(char *config) { err = "cluster replica validity factor must be zero or positive"; goto loaderr; } - } else if ((!strcasecmp(argv[0],"cluster-slave-no-failover") || - !strcasecmp(argv[0],"cluster-replica-no-failover")) && - argc == 2) - { - g_pserver->cluster_slave_no_failover = yesnotoi(argv[1]); - if (g_pserver->cluster_slave_no_failover == -1) { - err = "argument must be 'yes' or 'no'"; - goto loaderr; - } } else if (!strcasecmp(argv[0],"lua-time-limit") && argc == 2) { g_pserver->lua_time_limit = strtoll(argv[1],NULL,10); } else if (!strcasecmp(argv[0],"lua-replicate-commands") && argc == 2) { @@ -748,11 +691,6 @@ void loadServerConfigFromString(char *config) { cserver.client_obuf_limits[type].hard_limit_bytes = hard; cserver.client_obuf_limits[type].soft_limit_bytes = soft; cserver.client_obuf_limits[type].soft_limit_seconds = soft_seconds; - } else if (!strcasecmp(argv[0],"stop-writes-on-bgsave-error") && - argc == 2) { - if ((g_pserver->stop_writes_on_bgsave_err = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } } else if ((!strcasecmp(argv[0],"slave-priority") || !strcasecmp(argv[0],"replica-priority")) && argc == 2) { @@ -857,15 +795,13 @@ void loadServerConfigFromString(char *config) { g_pserver->fActiveReplica = CONFIG_DEFAULT_ACTIVE_REPLICA; err = "argument must be 'yes' or 'no'"; goto loaderr; } - } else if (!strcasecmp(argv[0],"multi-master") && argc == 2){ - if ((g_pserver->enable_multimaster = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } } else if (!strcasecmp(argv[0], "version-override") && argc == 2) { 
                KEYDB_SET_VERSION = zstrdup(argv[1]);
                serverLog(LL_WARNING, "Warning version is overridden to: %s\n", KEYDB_SET_VERSION);
        } else if (!strcasecmp(argv[0],"testmode") && argc == 2){
            g_fTestMode = yesnotoi(argv[1]);
+        } else if (!strcasecmp(argv[0],"rdbfuzz-mode")) {
+            // NOP, handled in main
        } else {
            err = "Bad directive or wrong number of arguments"; goto loaderr;
        }
@@ -974,6 +910,19 @@ void configSetCommand(client *c) {
    serverAssertWithInfo(c,c->argv[3],sdsEncodedObject(c->argv[3]));
    o = c->argv[3];

+    /* Iterate the configs that are standard */
+    for (configYesNo *config = configs_yesno; config->name != NULL; config++) {
+        if(config->modifiable && (!strcasecmp(szFromObj(c->argv[2]),config->name) ||
+            (config->alias && !strcasecmp(szFromObj(c->argv[2]),config->alias))))
+        {
+            int yn = yesnotoi(szFromObj(o));
+            if (yn == -1) goto badfmt;
+            *(config->config) = yn;
+            addReply(c,shared.ok);
+            return;
+        }
+    }
+
    if (0) { /* this starts the config_set macros else-if chain. */

    /* Special fields that can't be handled with general macros. */
@@ -1141,40 +1090,6 @@ void configSetCommand(client *c) {

    /* Boolean fields.
     * config_set_bool_field(name,var). */
-    } config_set_bool_field(
-      "rdbcompression", g_pserver->rdb_compression) {
-    } config_set_bool_field(
-      "repl-disable-tcp-nodelay",g_pserver->repl_disable_tcp_nodelay) {
-    } config_set_bool_field(
-      "repl-diskless-sync",g_pserver->repl_diskless_sync) {
-    } config_set_bool_field(
-      "cluster-require-full-coverage",g_pserver->cluster_require_full_coverage) {
-    } config_set_bool_field(
-      "cluster-slave-no-failover",g_pserver->cluster_slave_no_failover) {
-    } config_set_bool_field(
-      "cluster-replica-no-failover",g_pserver->cluster_slave_no_failover) {
-    } config_set_bool_field(
-      "aof-rewrite-incremental-fsync",g_pserver->aof_rewrite_incremental_fsync) {
-    } config_set_bool_field(
-      "rdb-save-incremental-fsync",g_pserver->rdb_save_incremental_fsync) {
-    } config_set_bool_field(
-      "aof-load-truncated",g_pserver->aof_load_truncated) {
-    } config_set_bool_field(
-      "aof-use-rdb-preamble",g_pserver->aof_use_rdb_preamble) {
-    } config_set_bool_field(
-      "slave-serve-stale-data",g_pserver->repl_serve_stale_data) {
-    } config_set_bool_field(
-      "replica-serve-stale-data",g_pserver->repl_serve_stale_data) {
-    } config_set_bool_field(
-      "slave-read-only",g_pserver->repl_slave_ro) {
-    } config_set_bool_field(
-      "replica-read-only",g_pserver->repl_slave_ro) {
-    } config_set_bool_field(
-      "slave-ignore-maxmemory",g_pserver->repl_slave_ignore_maxmemory) {
-    } config_set_bool_field(
-      "replica-ignore-maxmemory",g_pserver->repl_slave_ignore_maxmemory) {
-    } config_set_bool_field(
-      "activerehashing",g_pserver->activerehashing) {
    } config_set_bool_field(
      "activedefrag",cserver.active_defrag_enabled) {
#ifndef HAVE_DEFRAG
@@ -1188,24 +1103,6 @@ void configSetCommand(client *c) {
        return;
    }
#endif
-    } config_set_bool_field(
-      "protected-mode",g_pserver->protected_mode) {
-    } config_set_bool_field(
-      "stop-writes-on-bgsave-error",g_pserver->stop_writes_on_bgsave_err) {
-    } config_set_bool_field(
-      "lazyfree-lazy-eviction",g_pserver->lazyfree_lazy_eviction) {
-    } config_set_bool_field(
-      "lazyfree-lazy-expire",g_pserver->lazyfree_lazy_expire) {
-    } config_set_bool_field(
-      "lazyfree-lazy-server-del",g_pserver->lazyfree_lazy_server_del) {
-    } config_set_bool_field(
-      "slave-lazy-flush",g_pserver->repl_slave_lazy_flush) {
-    } config_set_bool_field(
-      "replica-lazy-flush",g_pserver->repl_slave_lazy_flush) {
-    } config_set_bool_field(
-
"no-appendfsync-on-rewrite",g_pserver->aof_no_fsync_on_rewrite) { - } config_set_bool_field( - "dynamic-hz",g_pserver->dynamic_hz) { /* Numerical fields. * config_set_numerical_field(name,var,min,max) */ @@ -1342,8 +1239,6 @@ void configSetCommand(client *c) { "loglevel",cserver.verbosity,loglevel_enum) { } config_set_enum_field( "maxmemory-policy",g_pserver->maxmemory_policy,maxmemory_policy_enum) { - } config_set_bool_field( - "multi-master", g_pserver->enable_multimaster) { } config_set_enum_field( "appendfsync",g_pserver->aof_fsync,aof_fsync_enum) { @@ -1496,58 +1391,15 @@ void configGetCommand(client *c) { config_get_numerical_field("tcp-keepalive",cserver.tcpkeepalive); /* Bool (yes/no) values */ - config_get_bool_field("cluster-require-full-coverage", - g_pserver->cluster_require_full_coverage); - config_get_bool_field("cluster-slave-no-failover", - g_pserver->cluster_slave_no_failover); - config_get_bool_field("cluster-replica-no-failover", - g_pserver->cluster_slave_no_failover); - config_get_bool_field("no-appendfsync-on-rewrite", - g_pserver->aof_no_fsync_on_rewrite); - config_get_bool_field("slave-serve-stale-data", - g_pserver->repl_serve_stale_data); - config_get_bool_field("replica-serve-stale-data", - g_pserver->repl_serve_stale_data); - config_get_bool_field("slave-read-only", - g_pserver->repl_slave_ro); - config_get_bool_field("replica-read-only", - g_pserver->repl_slave_ro); - config_get_bool_field("slave-ignore-maxmemory", - g_pserver->repl_slave_ignore_maxmemory); - config_get_bool_field("replica-ignore-maxmemory", - g_pserver->repl_slave_ignore_maxmemory); - config_get_bool_field("stop-writes-on-bgsave-error", - g_pserver->stop_writes_on_bgsave_err); - config_get_bool_field("daemonize", cserver.daemonize); - config_get_bool_field("rdbcompression", g_pserver->rdb_compression); - config_get_bool_field("rdbchecksum", g_pserver->rdb_checksum); - config_get_bool_field("activerehashing", g_pserver->activerehashing); + /* Iterate the configs that are standard */ + for (configYesNo *config = configs_yesno; config->name != NULL; config++) { + config_get_bool_field(config->name, *(config->config)); + if (config->alias) { + config_get_bool_field(config->alias, *(config->config)); + } + } + config_get_bool_field("activedefrag", cserver.active_defrag_enabled); - config_get_bool_field("protected-mode", g_pserver->protected_mode); - config_get_bool_field("repl-disable-tcp-nodelay", - g_pserver->repl_disable_tcp_nodelay); - config_get_bool_field("repl-diskless-sync", - g_pserver->repl_diskless_sync); - config_get_bool_field("aof-rewrite-incremental-fsync", - g_pserver->aof_rewrite_incremental_fsync); - config_get_bool_field("rdb-save-incremental-fsync", - g_pserver->rdb_save_incremental_fsync); - config_get_bool_field("aof-load-truncated", - g_pserver->aof_load_truncated); - config_get_bool_field("aof-use-rdb-preamble", - g_pserver->aof_use_rdb_preamble); - config_get_bool_field("lazyfree-lazy-eviction", - g_pserver->lazyfree_lazy_eviction); - config_get_bool_field("lazyfree-lazy-expire", - g_pserver->lazyfree_lazy_expire); - config_get_bool_field("lazyfree-lazy-server-del", - g_pserver->lazyfree_lazy_server_del); - config_get_bool_field("slave-lazy-flush", - g_pserver->repl_slave_lazy_flush); - config_get_bool_field("replica-lazy-flush", - g_pserver->repl_slave_lazy_flush); - config_get_bool_field("dynamic-hz", - g_pserver->dynamic_hz); /* Enum values */ config_get_enum_field("maxmemory-policy", @@ -2282,7 +2134,11 @@ int rewriteConfig(char *path) { /* Step 2: rewrite every single 
option, replacing or appending it inside * the rewrite state. */ - rewriteConfigYesNoOption(state,"daemonize",cserver.daemonize,0); + /* Iterate the configs that are standard */ + for (configYesNo *config = configs_yesno; config->name != NULL; config++) { + rewriteConfigYesNoOption(state,config->name,*(config->config),config->default_value); + } + rewriteConfigStringOption(state,"pidfile",cserver.pidfile,CONFIG_DEFAULT_PID_FILE); rewriteConfigNumericalOption(state,"port",g_pserver->port,CONFIG_DEFAULT_SERVER_PORT); rewriteConfigNumericalOption(state,"cluster-announce-port",g_pserver->cluster_announce_port,CONFIG_DEFAULT_CLUSTER_ANNOUNCE_PORT); @@ -2303,9 +2159,6 @@ int rewriteConfig(char *path) { rewriteConfigSaveOption(state); rewriteConfigUserOption(state); rewriteConfigNumericalOption(state,"databases",cserver.dbnum,CONFIG_DEFAULT_DBNUM); - rewriteConfigYesNoOption(state,"stop-writes-on-bgsave-error",g_pserver->stop_writes_on_bgsave_err,CONFIG_DEFAULT_STOP_WRITES_ON_BGSAVE_ERROR); - rewriteConfigYesNoOption(state,"rdbcompression",g_pserver->rdb_compression,CONFIG_DEFAULT_RDB_COMPRESSION); - rewriteConfigYesNoOption(state,"rdbchecksum",g_pserver->rdb_checksum,CONFIG_DEFAULT_RDB_CHECKSUM); rewriteConfigStringOption(state,"dbfilename",g_pserver->rdb_filename,CONFIG_DEFAULT_RDB_FILENAME); rewriteConfigDirOption(state); rewriteConfigSlaveofOption(state,"replicaof"); @@ -2313,15 +2166,10 @@ int rewriteConfig(char *path) { rewriteConfigStringOption(state,"masteruser",cserver.default_masteruser,NULL); rewriteConfigStringOption(state,"masterauth",cserver.default_masterauth,NULL); rewriteConfigStringOption(state,"cluster-announce-ip",g_pserver->cluster_announce_ip,NULL); - rewriteConfigYesNoOption(state,"replica-serve-stale-data",g_pserver->repl_serve_stale_data,CONFIG_DEFAULT_SLAVE_SERVE_STALE_DATA); - rewriteConfigYesNoOption(state,"replica-read-only",g_pserver->repl_slave_ro,CONFIG_DEFAULT_SLAVE_READ_ONLY); - rewriteConfigYesNoOption(state,"replica-ignore-maxmemory",g_pserver->repl_slave_ignore_maxmemory,CONFIG_DEFAULT_SLAVE_IGNORE_MAXMEMORY); rewriteConfigNumericalOption(state,"repl-ping-replica-period",g_pserver->repl_ping_slave_period,CONFIG_DEFAULT_REPL_PING_SLAVE_PERIOD); rewriteConfigNumericalOption(state,"repl-timeout",g_pserver->repl_timeout,CONFIG_DEFAULT_REPL_TIMEOUT); rewriteConfigBytesOption(state,"repl-backlog-size",g_pserver->repl_backlog_size,CONFIG_DEFAULT_REPL_BACKLOG_SIZE); rewriteConfigBytesOption(state,"repl-backlog-ttl",g_pserver->repl_backlog_time_limit,CONFIG_DEFAULT_REPL_BACKLOG_TIME_LIMIT); - rewriteConfigYesNoOption(state,"repl-disable-tcp-nodelay",g_pserver->repl_disable_tcp_nodelay,CONFIG_DEFAULT_REPL_DISABLE_TCP_NODELAY); - rewriteConfigYesNoOption(state,"repl-diskless-sync",g_pserver->repl_diskless_sync,CONFIG_DEFAULT_REPL_DISKLESS_SYNC); rewriteConfigNumericalOption(state,"repl-diskless-sync-delay",g_pserver->repl_diskless_sync_delay,CONFIG_DEFAULT_REPL_DISKLESS_SYNC_DELAY); rewriteConfigNumericalOption(state,"replica-priority",g_pserver->slave_priority,CONFIG_DEFAULT_SLAVE_PRIORITY); rewriteConfigNumericalOption(state,"min-replicas-to-write",g_pserver->repl_min_slaves_to_write,CONFIG_DEFAULT_MIN_SLAVES_TO_WRITE); @@ -2344,14 +2192,11 @@ int rewriteConfig(char *path) { rewriteConfigYesNoOption(state,"appendonly",g_pserver->aof_state != AOF_OFF,0); rewriteConfigStringOption(state,"appendfilename",g_pserver->aof_filename,CONFIG_DEFAULT_AOF_FILENAME); rewriteConfigEnumOption(state,"appendfsync",g_pserver->aof_fsync,aof_fsync_enum,CONFIG_DEFAULT_AOF_FSYNC); - 
rewriteConfigYesNoOption(state,"no-appendfsync-on-rewrite",g_pserver->aof_no_fsync_on_rewrite,CONFIG_DEFAULT_AOF_NO_FSYNC_ON_REWRITE); rewriteConfigNumericalOption(state,"auto-aof-rewrite-percentage",g_pserver->aof_rewrite_perc,AOF_REWRITE_PERC); rewriteConfigBytesOption(state,"auto-aof-rewrite-min-size",g_pserver->aof_rewrite_min_size,AOF_REWRITE_MIN_SIZE); rewriteConfigNumericalOption(state,"lua-time-limit",g_pserver->lua_time_limit,LUA_SCRIPT_TIME_LIMIT); rewriteConfigYesNoOption(state,"cluster-enabled",g_pserver->cluster_enabled,0); rewriteConfigStringOption(state,"cluster-config-file",g_pserver->cluster_configfile,CONFIG_DEFAULT_CLUSTER_CONFIG_FILE); - rewriteConfigYesNoOption(state,"cluster-require-full-coverage",g_pserver->cluster_require_full_coverage,CLUSTER_DEFAULT_REQUIRE_FULL_COVERAGE); - rewriteConfigYesNoOption(state,"cluster-replica-no-failover",g_pserver->cluster_slave_no_failover,CLUSTER_DEFAULT_SLAVE_NO_FAILOVER); rewriteConfigNumericalOption(state,"cluster-node-timeout",g_pserver->cluster_node_timeout,CLUSTER_DEFAULT_NODE_TIMEOUT); rewriteConfigNumericalOption(state,"cluster-migration-barrier",g_pserver->cluster_migration_barrier,CLUSTER_DEFAULT_MIGRATION_BARRIER); rewriteConfigNumericalOption(state,"cluster-replica-validity-factor",g_pserver->cluster_slave_validity_factor,CLUSTER_DEFAULT_SLAVE_VALIDITY); @@ -2369,23 +2214,11 @@ int rewriteConfig(char *path) { rewriteConfigNumericalOption(state,"zset-max-ziplist-entries",g_pserver->zset_max_ziplist_entries,OBJ_ZSET_MAX_ZIPLIST_ENTRIES); rewriteConfigNumericalOption(state,"zset-max-ziplist-value",g_pserver->zset_max_ziplist_value,OBJ_ZSET_MAX_ZIPLIST_VALUE); rewriteConfigNumericalOption(state,"hll-sparse-max-bytes",g_pserver->hll_sparse_max_bytes,CONFIG_DEFAULT_HLL_SPARSE_MAX_BYTES); - rewriteConfigYesNoOption(state,"activerehashing",g_pserver->activerehashing,CONFIG_DEFAULT_ACTIVE_REHASHING); rewriteConfigYesNoOption(state,"activedefrag",cserver.active_defrag_enabled,CONFIG_DEFAULT_ACTIVE_DEFRAG); - rewriteConfigYesNoOption(state,"protected-mode",g_pserver->protected_mode,CONFIG_DEFAULT_PROTECTED_MODE); rewriteConfigClientoutputbufferlimitOption(state); rewriteConfigNumericalOption(state,"hz",g_pserver->config_hz,CONFIG_DEFAULT_HZ); - rewriteConfigYesNoOption(state,"aof-rewrite-incremental-fsync",g_pserver->aof_rewrite_incremental_fsync,CONFIG_DEFAULT_AOF_REWRITE_INCREMENTAL_FSYNC); - rewriteConfigYesNoOption(state,"rdb-save-incremental-fsync",g_pserver->rdb_save_incremental_fsync,CONFIG_DEFAULT_RDB_SAVE_INCREMENTAL_FSYNC); - rewriteConfigYesNoOption(state,"aof-load-truncated",g_pserver->aof_load_truncated,CONFIG_DEFAULT_AOF_LOAD_TRUNCATED); - rewriteConfigYesNoOption(state,"aof-use-rdb-preamble",g_pserver->aof_use_rdb_preamble,CONFIG_DEFAULT_AOF_USE_RDB_PREAMBLE); rewriteConfigEnumOption(state,"supervised",cserver.supervised_mode,supervised_mode_enum,SUPERVISED_NONE); - rewriteConfigYesNoOption(state,"lazyfree-lazy-eviction",g_pserver->lazyfree_lazy_eviction,CONFIG_DEFAULT_LAZYFREE_LAZY_EVICTION); - rewriteConfigYesNoOption(state,"lazyfree-lazy-expire",g_pserver->lazyfree_lazy_expire,CONFIG_DEFAULT_LAZYFREE_LAZY_EXPIRE); - rewriteConfigYesNoOption(state,"lazyfree-lazy-server-del",g_pserver->lazyfree_lazy_server_del,CONFIG_DEFAULT_LAZYFREE_LAZY_SERVER_DEL); - rewriteConfigYesNoOption(state,"replica-lazy-flush",g_pserver->repl_slave_lazy_flush,CONFIG_DEFAULT_SLAVE_LAZY_FLUSH); - rewriteConfigYesNoOption(state,"dynamic-hz",g_pserver->dynamic_hz,CONFIG_DEFAULT_DYNAMIC_HZ); 
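/* Aside (illustrative sketch, not part of the patch): the configs_yesno
 * table that configGetCommand() and rewriteConfig() now iterate is assumed
 * to pair each yes/no directive with its storage and default, roughly:
 *
 *   typedef struct configYesNo {
 *       const char *name;       // primary name of the directive
 *       const char *alias;      // deprecated alias, NULL if none
 *       int *config;            // pointer to the live boolean setting
 *       int default_value;      // default used when rewriting the file
 *   } configYesNo;
 *
 *   static configYesNo configs_yesno[] = {
 *       {"rdbcompression", NULL, &g_pserver->rdb_compression, CONFIG_DEFAULT_RDB_COMPRESSION},
 *       {"replica-read-only", "slave-read-only", &g_pserver->repl_slave_ro, CONFIG_DEFAULT_SLAVE_READ_ONLY},
 *       {NULL, NULL, NULL, 0}   // sentinel the loops stop on
 *   };
 *
 * Only the fields actually dereferenced in this diff (name, alias, config,
 * default_value) are shown; the real definition may carry more. */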
    rewriteConfigYesNoOption(state,"active-replica",g_pserver->fActiveReplica,CONFIG_DEFAULT_ACTIVE_REPLICA);
-    rewriteConfigYesNoOption(state,"multi-master",g_pserver->enable_multimaster,CONFIG_DEFAULT_ENABLE_MULTIMASTER);
    rewriteConfigStringOption(state, "version-override",KEYDB_SET_VERSION,KEYDB_REAL_VERSION);

    /* Rewrite Sentinel config if in Sentinel mode. */
diff --git a/src/db.cpp b/src/db.cpp
index 3e0be4f88..b4ac46a2a 100644
--- a/src/db.cpp
+++ b/src/db.cpp
@@ -30,6 +30,7 @@
 #include "server.h"
 #include "cluster.h"
 #include "atomicvar.h"
+#include "aelocker.h"
 #include <signal.h>
 #include <ctype.h>

@@ -39,6 +40,8 @@
 *----------------------------------------------------------------------------*/

int keyIsExpired(redisDb *db, robj *key);
+int expireIfNeeded(redisDb *db, robj *key, robj *o);
+void dbOverwriteCore(redisDb *db, dictEntry *de, robj *key, robj *val, bool fUpdateMvcc, bool fRemoveExpire);

/* Update LFU when an object is accessed.
 * Firstly, decrement the counter if the decrement time is reached.
@@ -49,6 +52,20 @@ void updateLFU(robj *val) {
    val->lru = (LFUGetTimeInMinutes()<<8) | counter;
}

+void updateExpire(redisDb *db, sds key, robj *valOld, robj *valNew)
+{
+    serverAssert(valOld->FExpires());
+    serverAssert(!valNew->FExpires());
+
+    auto itr = db->setexpire->find(key);
+    serverAssert(itr != db->setexpire->end());
+
+    valNew->SetFExpires(true);
+    valOld->SetFExpires(false);
+    return;
+}
+
+
/* Low level key lookup API, not actually called directly from commands
 * implementations that should instead rely on lookupKeyRead(),
 * lookupKeyWrite() and lookupKeyReadWithFlags(). */
@@ -160,8 +177,10 @@ robj_roptr lookupKeyRead(redisDb *db, robj *key) {
 * Returns the linked value object if the key exists or NULL if the key
 * does not exist in the specified DB.
*/ robj *lookupKeyWrite(redisDb *db, robj *key) { - expireIfNeeded(db,key); - return lookupKey(db,key,LOOKUP_UPDATEMVCC); + robj *o = lookupKey(db,key,LOOKUP_UPDATEMVCC); + if (expireIfNeeded(db,key)) + o = NULL; + return o; } robj_roptr lookupKeyReadOrReply(client *c, robj *key, robj *reply) { @@ -177,6 +196,7 @@ robj *lookupKeyWriteOrReply(client *c, robj *key, robj *reply) { } int dbAddCore(redisDb *db, robj *key, robj *val) { + serverAssert(!val->FExpires()); sds copy = sdsdup(szFromObj(key)); int retval = dictAdd(db->pdict, copy, val); val->mvcc_tstamp = key->mvcc_tstamp = getMvccTstamp(); @@ -206,15 +226,30 @@ void dbAdd(redisDb *db, robj *key, robj *val) serverAssertWithInfo(NULL,key,retval == DICT_OK); } -void dbOverwriteCore(redisDb *db, dictEntry *de, robj *val, bool fUpdateMvcc) +void dbOverwriteCore(redisDb *db, dictEntry *de, robj *key, robj *val, bool fUpdateMvcc, bool fRemoveExpire) { dictEntry auxentry = *de; robj *old = (robj*)dictGetVal(de); + + if (old->FExpires()) { + if (fRemoveExpire) { + removeExpire(db, key); + } + else { + if (val->getrefcount(std::memory_order_relaxed) == OBJ_SHARED_REFCOUNT) + val = dupStringObject(val); + updateExpire(db, (sds)dictGetKey(de), old, val); + } + } + if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LFU) { val->lru = old->lru; } - if (fUpdateMvcc) + if (fUpdateMvcc) { + if (val->getrefcount(std::memory_order_relaxed) == OBJ_SHARED_REFCOUNT) + val = dupStringObject(val); val->mvcc_tstamp = getMvccTstamp(); + } dictSetVal(db->pdict, de, val); @@ -235,7 +270,7 @@ void dbOverwrite(redisDb *db, robj *key, robj *val) { dictEntry *de = dictFind(db->pdict,ptrFromObj(key)); serverAssertWithInfo(NULL,key,de != NULL); - dbOverwriteCore(db, de, val, true); + dbOverwriteCore(db, de, key, val, !!g_pserver->fActiveReplica, false); } /* Insert a key, handling duplicate keys according to fReplace */ @@ -250,7 +285,7 @@ int dbMerge(redisDb *db, robj *key, robj *val, int fReplace) robj *old = (robj*)dictGetVal(de); if (old->mvcc_tstamp <= val->mvcc_tstamp) { - dbOverwriteCore(db, de, val, false); + dbOverwriteCore(db, de, key, val, false, true); return true; } @@ -271,13 +306,13 @@ int dbMerge(redisDb *db, robj *key, robj *val, int fReplace) * * All the new keys in the database should be created via this interface. */ void setKey(redisDb *db, robj *key, robj *val) { - if (lookupKeyWrite(db,key) == NULL) { + dictEntry *de = dictFind(db->pdict, ptrFromObj(key)); + if (de == NULL) { dbAdd(db,key,val); } else { - dbOverwrite(db,key,val); + dbOverwriteCore(db,de,key,val,!!g_pserver->fActiveReplica,true); } incrRefCount(val); - removeExpire(db,key); signalModifiedKey(db,key); } @@ -292,7 +327,7 @@ int dbExists(redisDb *db, robj *key) { robj *dbRandomKey(redisDb *db) { dictEntry *de; int maxtries = 100; - int allvolatile = dictSize(db->pdict) == dictSize(db->expires); + int allvolatile = dictSize(db->pdict) == db->setexpire->size(); while(1) { sds key; @@ -303,23 +338,30 @@ robj *dbRandomKey(redisDb *db) { key = (sds)dictGetKey(de); keyobj = createStringObject(key,sdslen(key)); - if (dictFind(db->expires,key)) { + + if (((robj*)dictGetVal(de))->FExpires()) + { if (allvolatile && listLength(g_pserver->masters) && --maxtries == 0) { /* If the DB is composed only of keys with an expire set, - * it could happen that all the keys are already logically - * expired in the slave, so the function cannot stop because - * expireIfNeeded() is false, nor it can stop because - * dictGetRandomKey() returns NULL (there are keys to return). 
- * To prevent the infinite loop we do some tries, but if there - * are the conditions for an infinite loop, eventually we - * return a key name that may be already expired. */ + * it could happen that all the keys are already logically + * expired in the slave, so the function cannot stop because + * expireIfNeeded() is false, nor it can stop because + * dictGetRandomKey() returns NULL (there are keys to return). + * To prevent the infinite loop we do some tries, but if there + * are the conditions for an infinite loop, eventually we + * return a key name that may be already expired. */ return keyobj; } - if (expireIfNeeded(db,keyobj)) { + } + + if (((robj*)dictGetVal(de))->FExpires()) + { + if (expireIfNeeded(db,keyobj)) { decrRefCount(keyobj); continue; /* search for another key. This expired. */ - } + } } + return keyobj; } } @@ -328,7 +370,10 @@ robj *dbRandomKey(redisDb *db) { int dbSyncDelete(redisDb *db, robj *key) { /* Deleting an entry from the expires dict will not free the sds of * the key, because it is shared with the main dictionary. */ - if (dictSize(db->expires) > 0) dictDelete(db->expires,ptrFromObj(key)); + + dictEntry *de = dictFind(db->pdict, szFromObj(key)); + if (de != nullptr && ((robj*)dictGetVal(de))->FExpires()) + removeExpireCore(db, key, de); if (dictDelete(db->pdict,ptrFromObj(key)) == DICT_OK) { if (g_pserver->cluster_enabled) slotToKeyDel(key); return 1; @@ -373,7 +418,7 @@ int dbDelete(redisDb *db, robj *key) { */ robj *dbUnshareStringValue(redisDb *db, robj *key, robj *o) { serverAssert(o->type == OBJ_STRING); - if (o->refcount != 1 || o->encoding != OBJ_ENCODING_RAW) { + if (o->getrefcount(std::memory_order_relaxed) != 1 || o->encoding != OBJ_ENCODING_RAW) { robj *decoded = getDecodedObject(o); o = createRawStringObject(szFromObj(decoded), sdslen(szFromObj(decoded))); decrRefCount(decoded); @@ -419,7 +464,9 @@ long long emptyDb(int dbnum, int flags, void(callback)(void*)) { emptyDbAsync(&g_pserver->db[j]); } else { dictEmpty(g_pserver->db[j].pdict,callback); - dictEmpty(g_pserver->db[j].expires,callback); + delete g_pserver->db[j].setexpire; + g_pserver->db[j].setexpire = new (MALLOC_LOCAL) expireset(); + g_pserver->db[j].expireitr = g_pserver->db[j].setexpire->end(); } } if (g_pserver->cluster_enabled) { @@ -451,6 +498,7 @@ int selectDb(client *c, int id) { void signalModifiedKey(redisDb *db, robj *key) { touchWatchedKey(db,key); + if (g_pserver->tracking_clients) trackingInvalidateKey(key); } void signalFlushedDb(int dbid) { @@ -595,6 +643,8 @@ void keysCommand(client *c) { unsigned long numkeys = 0; void *replylen = addReplyDeferredLen(c); + aeReleaseLock(); + di = dictGetSafeIterator(c->db->pdict); allkeys = (pattern[0] == '*' && pattern[1] == '\0'); while((de = dictNext(di)) != NULL) { @@ -612,6 +662,12 @@ void keysCommand(client *c) { } dictReleaseIterator(di); setDeferredArrayLen(c,replylen,numkeys); + + fastlock_unlock(&c->db->lock); // we must release the DB lock before acquiring the AE lock to prevent deadlocks + AeLocker lock; + lock.arm(c); + fastlock_lock(&c->db->lock); // we still need the DB lock + lock.release(); } /* This callback is used by scanGenericCommand in order to collect elements @@ -665,7 +721,7 @@ int parseScanCursorOrReply(client *c, robj *o, unsigned long *cursor) { } /* This command implements SCAN, HSCAN and SSCAN commands. 
- * If object 'o' is passed, then it must be a Hash or Set object, otherwise
+ * If object 'o' is passed, then it must be a Hash, Set or Zset object, otherwise
 * if 'o' is NULL the command will operate on the dictionary associated with
 * the current database.
 *
@@ -681,6 +737,7 @@ void scanGenericCommand(client *c, robj_roptr o, unsigned long cursor) {
    listNode *node, *nextnode;
    long count = 10;
    sds pat = NULL;
+    sds type = NULL;
    int patlen = 0, use_pattern = 0;
    dict *ht;

@@ -717,6 +774,10 @@ void scanGenericCommand(client *c, robj_roptr o, unsigned long cursor) {
            use_pattern = !(pat[0] == '*' && patlen == 1);

            i += 2;
+        } else if (!strcasecmp(szFromObj(c->argv[i]), "type") && o == nullptr && j >= 2) {
+            /* SCAN for a particular type only applies to the db dict */
+            type = szFromObj(c->argv[i+1]);
+            i += 2;
        } else {
            addReply(c,shared.syntaxerr);
            goto cleanup;
@@ -811,6 +872,13 @@ void scanGenericCommand(client *c, robj_roptr o, unsigned long cursor) {
            }
        }

+        /* Filter an element if it isn't the type we want. */
+        if (!filter && o == nullptr && type) {
+            robj_roptr typecheck = lookupKeyReadWithFlags(c->db, kobj, LOOKUP_NOTOUCH);
+            const char* typeT = getObjectTypeName(typecheck);
+            if (strcasecmp((char*) type, typeT)) filter = 1;
+        }
+
        /* Filter element if it is an expired key. */
        if (!filter && o == nullptr && expireIfNeeded(c->db, kobj)) filter = 1;

@@ -867,10 +935,8 @@ void lastsaveCommand(client *c) {
    addReplyLongLong(c,g_pserver->lastsave);
}

-void typeCommand(client *c) {
-    const char *type;
-
-    robj_roptr o = lookupKeyReadWithFlags(c->db,c->argv[1],LOOKUP_NOTOUCH);
+const char* getObjectTypeName(robj_roptr o) {
+    const char* type;
    if (o == nullptr) {
        type = "none";
    } else {
@@ -888,7 +954,12 @@ void typeCommand(client *c) {
        default: type = "unknown"; break;
        }
    }
-    addReplyStatus(c,type);
+    return type;
+}
+
+void typeCommand(client *c) {
+    robj_roptr o = lookupKeyReadWithFlags(c->db,c->argv[1],LOOKUP_NOTOUCH);
+    addReplyStatus(c, getObjectTypeName(o));
}

void shutdownCommand(client *c) {
@@ -921,7 +992,6 @@ void shutdownCommand(client *c) {

void renameGenericCommand(client *c, int nx) {
    robj *o;
-    long long expire;
    int samekey = 0;

    /* When source and dest key is the same, no operation is performed,
@@ -937,7 +1007,15 @@ void renameGenericCommand(client *c, int nx) {
    }

    incrRefCount(o);
-    expire = getExpire(c->db,c->argv[1]);
+
+    std::unique_ptr<expireEntry> spexpire;
+
+    {   // scope pexpireOld since it will be invalid soon
+        expireEntry *pexpireOld = getExpire(c->db,c->argv[1]);
+        if (pexpireOld != nullptr)
+            spexpire = std::make_unique<expireEntry>(std::move(*pexpireOld));
+    }
+
    if (lookupKeyWrite(c->db,c->argv[2]) != NULL) {
        if (nx) {
            decrRefCount(o);
@@ -948,9 +1026,10 @@ void renameGenericCommand(client *c, int nx) {
         * with the same name.
 */
        dbDelete(c->db,c->argv[2]);
    }
-    dbAdd(c->db,c->argv[2],o);
-    if (expire != -1) setExpire(c,c->db,c->argv[2],expire);
    dbDelete(c->db,c->argv[1]);
+    dbAdd(c->db,c->argv[2],o);
+    if (spexpire != nullptr)
+        setExpire(c,c->db,c->argv[2],std::move(*spexpire));
    signalModifiedKey(c->db,c->argv[1]);
    signalModifiedKey(c->db,c->argv[2]);
    notifyKeyspaceEvent(NOTIFY_GENERIC,"rename_from",
@@ -973,7 +1052,7 @@ void moveCommand(client *c) {
    robj *o;
    redisDb *src, *dst;
    int srcid;
-    long long dbid, expire;
+    long long dbid;

    if (g_pserver->cluster_enabled) {
        addReplyError(c,"MOVE is not allowed in cluster mode");
@@ -1007,7 +1086,19 @@ void moveCommand(client *c) {
        addReply(c,shared.czero);
        return;
    }
-    expire = getExpire(c->db,c->argv[1]);
+
+    std::unique_ptr<expireEntry> spexpire;
+    {   // scope pexpireOld
+        expireEntry *pexpireOld = getExpire(c->db,c->argv[1]);
+        if (pexpireOld != nullptr)
+            spexpire = std::make_unique<expireEntry>(std::move(*pexpireOld));
+    }
+    if (o->FExpires())
+        removeExpire(c->db,c->argv[1]);
+    serverAssert(!o->FExpires());
+    incrRefCount(o);
+    dbDelete(src,c->argv[1]);
+    g_pserver->dirty++;

    /* Return zero if the key already exists in the target DB */
    if (lookupKeyWrite(dst,c->argv[1]) != NULL) {
@@ -1015,12 +1106,8 @@ void moveCommand(client *c) {
        addReply(c,shared.czero);
        return;
    }
    dbAdd(dst,c->argv[1],o);
-    if (expire != -1) setExpire(c,dst,c->argv[1],expire);
-    incrRefCount(o);
+    if (spexpire != nullptr) setExpire(c,dst,c->argv[1],std::move(*spexpire));

-    /* OK! key moved, free the entry in the source DB */
-    dbDelete(src,c->argv[1]);
-    g_pserver->dirty++;
    addReply(c,shared.cone);
}

@@ -1054,19 +1141,24 @@ int dbSwapDatabases(int id1, int id2) {
    if (id1 < 0 || id1 >= cserver.dbnum ||
        id2 < 0 || id2 >= cserver.dbnum) return C_ERR;
    if (id1 == id2) return C_OK;
-    redisDb aux = g_pserver->db[id1];
+    redisDb aux;
+    memcpy(&aux, &g_pserver->db[id1], sizeof(redisDb));
    redisDb *db1 = &g_pserver->db[id1], *db2 = &g_pserver->db[id2];

    /* Swap hash tables. Note that we don't swap blocking_keys,
     * ready_keys and watched_keys, since we want clients to
     * remain in the same DB they were. */
    db1->pdict = db2->pdict;
-    db1->expires = db2->expires;
+    db1->setexpire = db2->setexpire;
+    db1->expireitr = db2->expireitr;
    db1->avg_ttl = db2->avg_ttl;
+    db1->last_expire_set = db2->last_expire_set;

    db2->pdict = aux.pdict;
-    db2->expires = aux.expires;
+    db2->setexpire = aux.setexpire;
+    db2->expireitr = aux.expireitr;
    db2->avg_ttl = aux.avg_ttl;
+    db2->last_expire_set = aux.last_expire_set;

    /* Now we need to handle clients blocked on lists: as an effect
     * of swapping the two DBs, a client that was waiting for list
@@ -1114,46 +1206,128 @@ void swapdbCommand(client *c) {
/*-----------------------------------------------------------------------------
 * Expires API
 *----------------------------------------------------------------------------*/
-
int removeExpire(redisDb *db, robj *key) {
+    dictEntry *de = dictFind(db->pdict,ptrFromObj(key));
+    return removeExpireCore(db, key, de);
+}
+int removeExpireCore(redisDb *db, robj *key, dictEntry *de) {
    /* An expire may only be removed if there is a corresponding entry in the
     * main dict. Otherwise, the key will never be freed.
*/ - serverAssertWithInfo(NULL,key,dictFind(db->pdict,ptrFromObj(key)) != NULL); - return dictDelete(db->expires,ptrFromObj(key)) == DICT_OK; + serverAssertWithInfo(NULL,key,de != NULL); + + robj *val = (robj*)dictGetVal(de); + if (!val->FExpires()) + return 0; + + auto itr = db->setexpire->find((sds)dictGetKey(de)); + serverAssert(itr != db->setexpire->end()); + serverAssert(itr->key() == (sds)dictGetKey(de)); + db->setexpire->erase(itr); + val->SetFExpires(false); + return 1; } /* Set an expire to the specified key. If the expire is set in the context * of an user calling a command 'c' is the client, otherwise 'c' is set * to NULL. The 'when' parameter is the absolute unix time in milliseconds * after which the key will no longer be considered valid. */ -void setExpire(client *c, redisDb *db, robj *key, long long when) { - dictEntry *kde, *de; +void setExpire(client *c, redisDb *db, robj *key, robj *subkey, long long when) { + dictEntry *kde; + serverAssert(GlobalLocksAcquired()); /* Reuse the sds from the main dict in the expire dict */ kde = dictFind(db->pdict,ptrFromObj(key)); serverAssertWithInfo(NULL,key,kde != NULL); - de = dictAddOrFind(db->expires,dictGetKey(kde)); - dictSetSignedIntegerVal(de,when); + + if (((robj*)dictGetVal(kde))->getrefcount(std::memory_order_relaxed) == OBJ_SHARED_REFCOUNT) + { + // shared objects cannot have the expire bit set, create a real object + dictSetVal(db->pdict, kde, dupStringObject((robj*)dictGetVal(kde))); + } + + /* Update TTL stats (exponential moving average) */ + /* Note: We never have to update this on expiry since we reduce it by the current elapsed time here */ + long long now = g_pserver->mstime; + db->avg_ttl -= (now - db->last_expire_set); // reduce the TTL by the time that has elapsed + if (db->setexpire->empty()) + db->avg_ttl = 0; + else + db->avg_ttl -= db->avg_ttl / db->setexpire->size(); // slide one entry out the window + if (db->avg_ttl < 0) + db->avg_ttl = 0; // TTLs are never negative + db->avg_ttl += (double)(when-now) / (db->setexpire->size()+1); // add the new entry + db->last_expire_set = now; + + /* Update the expire set */ + const char *szSubKey = (subkey != nullptr) ? 
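/* Aside (worked example of the running-average update above): with N keys
 * already in the expire set, a previous estimate A, elapsed time dt since
 * the last expire was set, and a new entry due in T ms, the code computes
 *
 *   A' = (A - dt) - (A - dt)/N + T/(N + 1)
 *
 * e.g. N = 9, A = 1000, dt = 0, T = 2000 gives
 *   A' = 1000 - 111 + 200 ≈ 1089 ms,
 * sliding one sample out of the window and mixing the new TTL in. */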
szFromObj(subkey) : nullptr; + if (((robj*)dictGetVal(kde))->FExpires()) { + auto itr = db->setexpire->find((sds)dictGetKey(kde)); + serverAssert(itr != db->setexpire->end()); + expireEntry eNew(std::move(*itr)); + eNew.update(szSubKey, when); + db->setexpire->erase(itr); + db->setexpire->insert(eNew); + } + else + { + expireEntry e((sds)dictGetKey(kde), szSubKey, when); + ((robj*)dictGetVal(kde))->SetFExpires(true); + db->setexpire->insert(e); + } int writable_slave = listLength(g_pserver->masters) && g_pserver->repl_slave_ro == 0; if (c && writable_slave && !(c->flags & CLIENT_MASTER)) rememberSlaveKeyWithExpire(db,key); } -/* Return the expire time of the specified key, or -1 if no expire +void setExpire(client *c, redisDb *db, robj *key, expireEntry &&e) +{ + dictEntry *kde; + + serverAssert(GlobalLocksAcquired()); + + /* Reuse the sds from the main dict in the expire dict */ + kde = dictFind(db->pdict,ptrFromObj(key)); + serverAssertWithInfo(NULL,key,kde != NULL); + + if (((robj*)dictGetVal(kde))->getrefcount(std::memory_order_relaxed) == OBJ_SHARED_REFCOUNT) + { + // shared objects cannot have the expire bit set, create a real object + dictSetVal(db->pdict, kde, dupStringObject((robj*)dictGetVal(kde))); + } + + if (((robj*)dictGetVal(kde))->FExpires()) + removeExpire(db, key); + + e.setKeyUnsafe((sds)dictGetKey(kde)); + db->setexpire->insert(e); + ((robj*)dictGetVal(kde))->SetFExpires(true); + + + int writable_slave = listLength(g_pserver->masters) && g_pserver->repl_slave_ro == 0; + if (c && writable_slave && !(c->flags & CLIENT_MASTER)) + rememberSlaveKeyWithExpire(db,key); +} + +/* Return the expire time of the specified key, or null if no expire * is associated with this key (i.e. the key is non volatile) */ -long long getExpire(redisDb *db, robj_roptr key) { +expireEntry *getExpire(redisDb *db, robj_roptr key) { dictEntry *de; /* No expire? return ASAP */ - if (dictSize(db->expires) == 0 || - (de = dictFind(db->expires,ptrFromObj(key))) == NULL) return -1; + if (db->setexpire->size() == 0) + return nullptr; - /* The entry was found in the expire dict, this means it should also - * be present in the main dict (safety check). */ - serverAssertWithInfo(NULL,key,dictFind(db->pdict,ptrFromObj(key)) != NULL); - return dictGetSignedIntegerVal(de); + de = dictFind(db->pdict, ptrFromObj(key)); + if (de == NULL) + return nullptr; + robj *obj = (robj*)dictGetVal(de); + if (!obj->FExpires()) + return nullptr; + + auto itr = db->setexpire->find((sds)dictGetKey(de)); + return itr.operator->(); } /* Propagate expires into slaves and the AOF file. @@ -1181,15 +1355,28 @@ void propagateExpire(redisDb *db, robj *key, int lazy) { decrRefCount(argv[1]); } -/* Check if the key is expired. */ +/* Check if the key is expired. Note, this does not check subexpires */ int keyIsExpired(redisDb *db, robj *key) { - mstime_t when = getExpire(db,key); + expireEntry *pexpire = getExpire(db,key); - if (when < 0) return 0; /* No expire for this key */ + if (pexpire == nullptr) return 0; /* No expire for this key */ /* Don't expire anything while loading. It will be done later. */ if (g_pserver->loading) return 0; + long long when = -1; + for (auto &exp : *pexpire) + { + if (exp.subkey() == nullptr) + { + when = exp.when(); + break; + } + } + + if (when == -1) + return 0; + /* If we are in the context of a Lua script, we pretend that time is * blocked to when the Lua script started. 
This way a key can expire
 * only the first time it is accessed and not in the middle of the
diff --git a/src/debug.cpp b/src/debug.cpp
index 2ded21ea4..3a4520776 100644
--- a/src/debug.cpp
+++ b/src/debug.cpp
@@ -126,9 +126,13 @@ void mixStringObjectDigest(unsigned char *digest, robj_roptr o) {
void xorObjectDigest(redisDb *db, robj_roptr keyobj, unsigned char *digest, robj_roptr o) {
    uint32_t aux = htonl(o->type);
    mixDigest(digest,&aux,sizeof(aux));
-    long long expiretime = getExpire(db,keyobj);
+    expireEntry *pexpire = getExpire(db,keyobj);
+    long long expiretime = -1;
    char buf[128];

+    if (pexpire != nullptr)
+        pexpire->FGetPrimaryExpire(&expiretime);
+
    /* Save the key and associated value */
    if (o->type == OBJ_STRING) {
        mixStringObjectDigest(digest,o);
@@ -438,7 +442,7 @@ NULL
            "Value at:%p refcount:%d "
            "encoding:%s serializedlength:%zu "
            "lru:%d lru_seconds_idle:%llu%s",
-            (void*)val, static_cast<int>(val->refcount),
+            (void*)val, static_cast<int>(val->getrefcount(std::memory_order_relaxed)),
            strenc, rdbSavedObjectLen(val),
            val->lru, estimateObjectIdleTime(val)/1000, extra);
    } else if (!strcasecmp(szFromObj(c->argv[1]),"sdslen") && c->argc == 3) {
@@ -640,9 +644,9 @@ NULL
            dictGetStats(buf,sizeof(buf),g_pserver->db[dbid].pdict);
            stats = sdscat(stats,buf);

-            stats = sdscatprintf(stats,"[Expires HT]\n");
-            dictGetStats(buf,sizeof(buf),g_pserver->db[dbid].expires);
-            stats = sdscat(stats,buf);
+            stats = sdscatprintf(stats,"[Expires set]\n");
+            g_pserver->db[dbid].setexpire->getstats(buf, sizeof(buf));
+            stats = sdscat(stats, buf);

        addReplyBulkSds(c,stats);
    } else if (!strcasecmp(szFromObj(c->argv[1]),"htstats-key") && c->argc == 3) {
@@ -678,10 +682,12 @@ NULL
        changeReplicationId();
        clearReplicationId2();
        addReply(c,shared.ok);
-    } else if (!strcasecmp(szFromObj(c->argv[1]),"stringmatch-test") && c->argc == 2)
-    {
+    } else if (!strcasecmp(szFromObj(c->argv[1]),"stringmatch-test") && c->argc == 2) {
        stringmatchlen_fuzz_test();
        addReplyStatus(c,"Apparently Redis did not crash: test passed");
+    } else if (!strcasecmp(szFromObj(c->argv[1]), "force-master") && c->argc == 2) {
+        c->flags |= CLIENT_MASTER | CLIENT_MASTER_FORCE_REPLY;
+        addReply(c, shared.ok);
    } else {
        addReplySubcommandSyntaxError(c);
        return;
@@ -708,7 +714,7 @@ void _serverAssertPrintClientInfo(const client *c) {
    bugReportStart();
    serverLog(LL_WARNING,"=== ASSERTION FAILED CLIENT CONTEXT ===");
-    serverLog(LL_WARNING,"client->flags = %d", static_cast<int>(c->flags));
+    serverLog(LL_WARNING,"client->flags = %llu", static_cast<unsigned long long>(c->flags));
    serverLog(LL_WARNING,"client->fd = %d", c->fd);
    serverLog(LL_WARNING,"client->argc = %d", c->argc);
    for (j=0; j < c->argc; j++) {
@@ -723,14 +729,14 @@ void _serverAssertPrintClientInfo(const client *c) {
            arg = buf;
        }
        serverLog(LL_WARNING,"client->argv[%d] = \"%s\" (refcount: %d)",
-            j, arg, static_cast<int>(c->argv[j]->refcount));
+            j, arg, static_cast<int>(c->argv[j]->getrefcount(std::memory_order_relaxed)));
    }
}

void serverLogObjectDebugInfo(robj_roptr o) {
    serverLog(LL_WARNING,"Object type: %d", o->type);
    serverLog(LL_WARNING,"Object encoding: %d", o->encoding);
-    serverLog(LL_WARNING,"Object refcount: %d", static_cast<int>(o->refcount));
+    serverLog(LL_WARNING,"Object refcount: %d", static_cast<int>(o->getrefcount(std::memory_order_relaxed)));
    if (o->type == OBJ_STRING && sdsEncodedObject(o)) {
        serverLog(LL_WARNING,"Object raw string len: %zu", sdslen(szFromObj(o)));
        if (sdslen(szFromObj(o)) < 4096) {
diff --git a/src/defrag.cpp b/src/defrag.cpp
index 2e9abd290..c49cd2665 100644
--- a/src/defrag.cpp
+++ b/src/defrag.cpp
@@
-48,6 +48,7 @@ extern "C" int je_get_defrag_hint(void* ptr, int *bin_util, int *run_util); /* forward declarations*/ void defragDictBucketCallback(void *privdata, dictEntry **bucketref); dictEntry* replaceSateliteDictKeyPtrAndOrDefragDictEntry(dict *d, sds oldkey, sds newkey, uint64_t hash, long *defragged); +void replaceSateliteOSetKeyPtr(expireset &set, sds oldkey, sds newkey); /* Defrag helper for generic allocations. * @@ -102,7 +103,7 @@ sds activeDefragSds(sds sdsptr) { * and should NOT be accessed. */ robj *activeDefragStringOb(robj* ob, long *defragged) { robj *ret = NULL; - if (ob->refcount!=1) + if (ob->getrefcount(std::memory_order_relaxed)!=1) return NULL; /* try to defrag robj (only if not an EMBSTR type (handled below). */ @@ -406,6 +407,17 @@ dictEntry* replaceSateliteDictKeyPtrAndOrDefragDictEntry(dict *d, sds oldkey, sd return NULL; } +void replaceSateliteOSetKeyPtr(expireset &set, sds oldkey, sds newkey) { + auto itr = set.find(oldkey); + if (itr != set.end()) + { + expireEntry eNew(std::move(*itr)); + eNew.setKeyUnsafe(newkey); + set.erase(itr); + set.insert(eNew); + } +} + long activeDefragQuickListNodes(quicklist *ql) { quicklistNode *node = ql->head, *newnode; long defragged = 0; @@ -769,12 +781,8 @@ long defragKey(redisDb *db, dictEntry *de) { newsds = activeDefragSds(keysds); if (newsds) defragged++, de->key = newsds; - if (dictSize(db->expires)) { - /* Dirty code: - * I can't search in db->expires for that key after i already released - * the pointer it holds it won't be able to do the string compare */ - uint64_t hash = dictGetHash(db->pdict, de->key); - replaceSateliteDictKeyPtrAndOrDefragDictEntry(db->expires, keysds, newsds, hash, &defragged); + if (!db->setexpire->empty()) { + replaceSateliteOSetKeyPtr(*db->setexpire, keysds, newsds); } /* Try to defrag robj and / or string value. */ diff --git a/src/evict.cpp b/src/evict.cpp index 4be6bf761..8cf24dd5e 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -150,6 +150,84 @@ void evictionPoolAlloc(void) { EvictionPoolLRU = ep; } +void processEvictionCandidate(int dbid, sds key, robj *o, const expireEntry *e, struct evictionPoolEntry *pool) +{ + unsigned long long idle; + + /* Calculate the idle time according to the policy. This is called + * idle just because the code initially handled LRU, but is in fact + * just a score where an higher score means better candidate. */ + if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LRU) { + idle = (o != nullptr) ? estimateObjectIdleTime(o) : 0; + } else if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LFU) { + /* When we use an LRU policy, we sort the keys by idle time + * so that we expire keys starting from greater idle time. + * However when the policy is an LFU one, we have a frequency + * estimation, and we want to evict keys with lower frequency + * first. So inside the pool we put objects using the inverted + * frequency subtracting the actual frequency to the maximum + * frequency of 255. */ + idle = 255-LFUDecrAndReturn(o); + } else if (g_pserver->maxmemory_policy == MAXMEMORY_VOLATILE_TTL) { + /* In this case the sooner the expire the better. */ + idle = ULLONG_MAX - e->when(); + } else { + serverPanic("Unknown eviction policy in evictionPoolPopulate()"); + } + + /* Insert the element inside the pool. + * First, find the first empty bucket or the first populated + * bucket that has an idle time smaller than our idle time. 
*/ + int k = 0; + while (k < EVPOOL_SIZE && + pool[k].key && + pool[k].idle < idle) k++; + if (k == 0 && pool[EVPOOL_SIZE-1].key != NULL) { + /* Can't insert if the element is < the worst element we have + * and there are no empty buckets. */ + return; + } else if (k < EVPOOL_SIZE && pool[k].key == NULL) { + /* Inserting into empty position. No setup needed before insert. */ + } else { + /* Inserting in the middle. Now k points to the first element + * greater than the element to insert. */ + if (pool[EVPOOL_SIZE-1].key == NULL) { + /* Free space on the right? Insert at k shifting + * all the elements from k to end to the right. */ + + /* Save SDS before overwriting. */ + sds cached = pool[EVPOOL_SIZE-1].cached; + memmove(pool+k+1,pool+k, + sizeof(pool[0])*(EVPOOL_SIZE-k-1)); + pool[k].cached = cached; + } else { + /* No free space on right? Insert at k-1 */ + k--; + /* Shift all elements on the left of k (included) to the + * left, so we discard the element with smaller idle time. */ + sds cached = pool[0].cached; /* Save SDS before overwriting. */ + if (pool[0].key != pool[0].cached) sdsfree(pool[0].key); + memmove(pool,pool+1,sizeof(pool[0])*k); + pool[k].cached = cached; + } + } + + /* Try to reuse the cached SDS string allocated in the pool entry, + * because allocating and deallocating this object is costly + * (according to the profiler, not my fantasy. Remember: + * premature optimizbla bla bla bla. */ + int klen = sdslen(key); + if (klen > EVPOOL_CACHED_SDS_SIZE) { + pool[k].key = sdsdup(key); + } else { + memcpy(pool[k].cached,key,klen+1); + sdssetlen(pool[k].cached,klen); + pool[k].key = pool[k].cached; + } + pool[k].idle = idle; + pool[k].dbid = dbid; +} + /* This is an helper function for freeMemoryIfNeeded(), it is used in order * to populate the evictionPool with a few entries every time we want to * expire a key. Keys with idle time smaller than one of the current @@ -159,100 +237,36 @@ void evictionPoolAlloc(void) { * idle time are on the left, and keys with the higher idle time on the * right. */ -void evictionPoolPopulate(int dbid, dict *sampledict, dict *keydict, struct evictionPoolEntry *pool) { - int j, k, count; - dictEntry **samples = (dictEntry**)alloca(g_pserver->maxmemory_samples * sizeof(dictEntry*)); +struct visitFunctor +{ + int dbid; + dict *dbdict; + struct evictionPoolEntry *pool; + int count; - count = dictGetSomeKeys(sampledict,samples,g_pserver->maxmemory_samples); - for (j = 0; j < count; j++) { - unsigned long long idle; - sds key; - robj *o = nullptr; - dictEntry *de; - - de = samples[j]; - key = (sds)dictGetKey(de); - - /* If the dictionary we are sampling from is not the main - * dictionary (but the expires one) we need to lookup the key - * again in the key dictionary to obtain the value object. 
*/ - if (g_pserver->maxmemory_policy != MAXMEMORY_VOLATILE_TTL) { - if (sampledict != keydict) de = dictFind(keydict, key); - o = (robj*)dictGetVal(de); + bool operator()(const expireEntry &e) + { + dictEntry *de = dictFind(dbdict, e.key()); + processEvictionCandidate(dbid, (sds)dictGetKey(de), (robj*)dictGetVal(de), &e, pool); + ++count; + return count < g_pserver->maxmemory_samples; + } +}; +void evictionPoolPopulate(int dbid, dict *dbdict, expireset *setexpire, struct evictionPoolEntry *pool) +{ + if (setexpire != nullptr) + { + visitFunctor visitor { dbid, dbdict, pool, 0 }; + setexpire->random_visit(visitor); + } + else + { + dictEntry **samples = (dictEntry**)alloca(g_pserver->maxmemory_samples * sizeof(dictEntry*)); + int count = dictGetSomeKeys(dbdict,samples,g_pserver->maxmemory_samples); + for (int j = 0; j < count; j++) { + robj *o = (robj*)dictGetVal(samples[j]); + processEvictionCandidate(dbid, (sds)dictGetKey(samples[j]), o, nullptr, pool); } - - /* Calculate the idle time according to the policy. This is called - * idle just because the code initially handled LRU, but is in fact - * just a score where an higher score means better candidate. */ - if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LRU) { - idle = (o != nullptr) ? estimateObjectIdleTime(o) : 0; - } else if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LFU) { - /* When we use an LRU policy, we sort the keys by idle time - * so that we expire keys starting from greater idle time. - * However when the policy is an LFU one, we have a frequency - * estimation, and we want to evict keys with lower frequency - * first. So inside the pool we put objects using the inverted - * frequency subtracting the actual frequency to the maximum - * frequency of 255. */ - idle = 255-LFUDecrAndReturn(o); - } else if (g_pserver->maxmemory_policy == MAXMEMORY_VOLATILE_TTL) { - /* In this case the sooner the expire the better. */ - idle = ULLONG_MAX - (long)dictGetVal(de); - } else { - serverPanic("Unknown eviction policy in evictionPoolPopulate()"); - } - - /* Insert the element inside the pool. - * First, find the first empty bucket or the first populated - * bucket that has an idle time smaller than our idle time. */ - k = 0; - while (k < EVPOOL_SIZE && - pool[k].key && - pool[k].idle < idle) k++; - if (k == 0 && pool[EVPOOL_SIZE-1].key != NULL) { - /* Can't insert if the element is < the worst element we have - * and there are no empty buckets. */ - continue; - } else if (k < EVPOOL_SIZE && pool[k].key == NULL) { - /* Inserting into empty position. No setup needed before insert. */ - } else { - /* Inserting in the middle. Now k points to the first element - * greater than the element to insert. */ - if (pool[EVPOOL_SIZE-1].key == NULL) { - /* Free space on the right? Insert at k shifting - * all the elements from k to end to the right. */ - - /* Save SDS before overwriting. */ - sds cached = pool[EVPOOL_SIZE-1].cached; - memmove(pool+k+1,pool+k, - sizeof(pool[0])*(EVPOOL_SIZE-k-1)); - pool[k].cached = cached; - } else { - /* No free space on right? Insert at k-1 */ - k--; - /* Shift all elements on the left of k (included) to the - * left, so we discard the element with smaller idle time. */ - sds cached = pool[0].cached; /* Save SDS before overwriting. 
*/ - if (pool[0].key != pool[0].cached) sdsfree(pool[0].key); - memmove(pool,pool+1,sizeof(pool[0])*k); - pool[k].cached = cached; - } - } - - /* Try to reuse the cached SDS string allocated in the pool entry, - * because allocating and deallocating this object is costly - * (according to the profiler, not my fantasy. Remember: - * premature optimizbla bla bla bla. */ - int klen = sdslen(key); - if (klen > EVPOOL_CACHED_SDS_SIZE) { - pool[k].key = sdsdup(key); - } else { - memcpy(pool[k].cached,key,klen+1); - sdssetlen(pool[k].cached,klen); - pool[k].key = pool[k].cached; - } - pool[k].idle = idle; - pool[k].dbid = dbid; } } @@ -474,8 +488,6 @@ int freeMemoryIfNeeded(void) { sds bestkey = NULL; int bestdbid; redisDb *db; - dict *dict; - dictEntry *de; if (g_pserver->maxmemory_policy & (MAXMEMORY_FLAG_LRU|MAXMEMORY_FLAG_LFU) || g_pserver->maxmemory_policy == MAXMEMORY_VOLATILE_TTL) @@ -490,10 +502,18 @@ int freeMemoryIfNeeded(void) { * every DB. */ for (i = 0; i < cserver.dbnum; i++) { db = g_pserver->db+i; - dict = (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) ? - db->pdict : db->expires; - if ((keys = dictSize(dict)) != 0) { - evictionPoolPopulate(i, dict, db->pdict, pool); + if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) + { + if ((keys = dictSize(db->pdict)) != 0) { + evictionPoolPopulate(i, db->pdict, nullptr, pool); + total_keys += keys; + } + } + else + { + keys = db->setexpire->size(); + if (keys != 0) + evictionPoolPopulate(i, db->pdict, db->setexpire, pool); total_keys += keys; } } @@ -503,14 +523,11 @@ int freeMemoryIfNeeded(void) { for (k = EVPOOL_SIZE-1; k >= 0; k--) { if (pool[k].key == NULL) continue; bestdbid = pool[k].dbid; + sds key = nullptr; - if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) { - de = dictFind(g_pserver->db[pool[k].dbid].pdict, - pool[k].key); - } else { - de = dictFind(g_pserver->db[pool[k].dbid].expires, - pool[k].key); - } + dictEntry *de = dictFind(g_pserver->db[pool[k].dbid].pdict,pool[k].key); + if (de != nullptr && (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS || ((robj*)dictGetVal(de))->FExpires())) + key = (sds)dictGetKey(de); /* Remove the entry from the pool. */ if (pool[k].key != pool[k].cached) @@ -520,8 +537,8 @@ int freeMemoryIfNeeded(void) { /* If the key exists, is our pick. Otherwise it is * a ghost and we need to try the next element. */ - if (de) { - bestkey = (sds)dictGetKey(de); + if (key) { + bestkey = key; break; } else { /* Ghost... Iterate again. */ @@ -540,13 +557,23 @@ int freeMemoryIfNeeded(void) { for (i = 0; i < cserver.dbnum; i++) { j = (++next_db) % cserver.dbnum; db = g_pserver->db+j; - dict = (g_pserver->maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM) ? 
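/* Aside (sketch, not part of the patch): with the expires dict gone, the
 * volatile-random path samples the new expire set directly. Assuming only
 * the expireset API already used in this diff (empty(), random_value(),
 * key()), the replacement pattern is:
 *
 *   if (!db->setexpire->empty()) {
 *       sds victim = (sds)db->setexpire->random_value().key();
 *       // key() is the sds shared with the main dict, so the victim can
 *       // be looked up in db->pdict and deleted as usual
 *   }
 */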
- db->pdict : db->expires; - if (dictSize(dict) != 0) { - de = dictGetRandomKey(dict); - bestkey = (sds)dictGetKey(de); - bestdbid = j; - break; + if (g_pserver->maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM) + { + if (dictSize(db->pdict) != 0) { + dictEntry *de = dictGetRandomKey(db->pdict); + bestkey = (sds)dictGetKey(de); + bestdbid = j; + break; + } + } + else + { + if (!db->setexpire->empty()) + { + bestkey = (sds)db->setexpire->random_value().key(); + bestdbid = j; + break; + } } } } diff --git a/src/expire.cpp b/src/expire.cpp index 64a430389..ba0b99284 100644 --- a/src/expire.cpp +++ b/src/expire.cpp @@ -32,6 +32,21 @@ #include "server.h" +void activeExpireCycleExpireFullKey(redisDb *db, const char *key) { + robj *keyobj = createStringObject(key,sdslen(key)); + + propagateExpire(db,keyobj,g_pserver->lazyfree_lazy_expire); + if (g_pserver->lazyfree_lazy_expire) + dbAsyncDelete(db,keyobj); + else + dbSyncDelete(db,keyobj); + notifyKeyspaceEvent(NOTIFY_EXPIRED, + "expired",keyobj,db->id); + if (g_pserver->tracking_clients) trackingInvalidateKey(keyobj); + decrRefCount(keyobj); + g_pserver->stat_expiredkeys++; +} + /*----------------------------------------------------------------------------- * Incremental collection of expired keys. * @@ -51,25 +66,102 @@ * * The parameter 'now' is the current time in milliseconds as is passed * to the function to avoid too many gettimeofday() syscalls. */ -int activeExpireCycleTryExpire(redisDb *db, dictEntry *de, long long now) { - long long t = dictGetSignedIntegerVal(de); - if (now > t) { - sds key = (sds)dictGetKey(de); - robj *keyobj = createStringObject(key,sdslen(key)); - - propagateExpire(db,keyobj,g_pserver->lazyfree_lazy_expire); - if (g_pserver->lazyfree_lazy_expire) - dbAsyncDelete(db,keyobj); - else - dbSyncDelete(db,keyobj); - notifyKeyspaceEvent(NOTIFY_EXPIRED, - "expired",keyobj,db->id); - decrRefCount(keyobj); - g_pserver->stat_expiredkeys++; - return 1; - } else { - return 0; +void activeExpireCycleExpire(redisDb *db, expireEntry &e, long long now) { + if (!e.FFat()) + { + activeExpireCycleExpireFullKey(db, e.key()); + return; } + + expireEntryFat *pfat = e.pfatentry(); + dictEntry *de = dictFind(db->pdict, e.key()); + robj *val = (robj*)dictGetVal(de); + int deleted = 0; + while (!pfat->FEmpty()) + { + if (pfat->nextExpireEntry().when > now) + break; + + // Is it the full key expiration? + if (pfat->nextExpireEntry().spsubkey == nullptr) + { + activeExpireCycleExpireFullKey(db, e.key()); + return; + } + + switch (val->type) + { + case OBJ_SET: + if (setTypeRemove(val,pfat->nextExpireEntry().spsubkey.get())) { + deleted++; + if (setTypeSize(val) == 0) { + activeExpireCycleExpireFullKey(db, e.key()); + return; + } + } + break; + case OBJ_LIST: + case OBJ_ZSET: + case OBJ_HASH: + default: + serverAssert(false); + } + pfat->popfrontExpireEntry(); + } + + if (deleted) + { + robj objT; + switch (val->type) + { + case OBJ_SET: + initStaticStringObject(objT, (char*)e.key()); + signalModifiedKey(db,&objT); + notifyKeyspaceEvent(NOTIFY_SET,"srem",&objT,db->id); + break; + } + } + + if (pfat->FEmpty()) + { + robj *keyobj = createStringObject(e.key(),sdslen(e.key())); + removeExpire(db, keyobj); + decrRefCount(keyobj); + } +} + +void expireMemberCommand(client *c) +{ + long long when; + if (getLongLongFromObjectOrReply(c, c->argv[3], &when, NULL) != C_OK) + return; + + when *= 1000; + when += mstime(); + + /* No key, return zero. 
*/ + dictEntry *de = dictFind(c->db->pdict, szFromObj(c->argv[1])); + if (de == NULL) { + addReply(c,shared.czero); + return; + } + + robj *val = (robj*)dictGetVal(de); + + switch (val->type) + { + case OBJ_SET: + // these types are safe + break; + + default: + addReplyError(c, "object type is unsupported"); + return; + } + + setExpire(c, c->db, c->argv[1], c->argv[2], when); + + addReply(c, shared.ok); } /* Try to expire a few timed out keys. The algorithm used is adaptive and @@ -147,7 +239,6 @@ void activeExpireCycle(int type) { long total_expired = 0; for (j = 0; j < dbs_per_call && timelimit_exit == 0; j++) { - int expired; redisDb *db = g_pserver->db+(current_db % cserver.dbnum); /* Increment the DB now so we are sure if we run out of time @@ -155,78 +246,46 @@ void activeExpireCycle(int type) { * distribute the time evenly across DBs. */ current_db++; - /* Continue to expire if at the end of the cycle more than 25% - * of the keys were expired. */ - do { - unsigned long num, slots; - long long now, ttl_sum; - int ttl_samples; - iteration++; + long long now; + iteration++; + now = mstime(); - /* If there is nothing to expire try next DB ASAP. */ - if ((num = dictSize(db->expires)) == 0) { - db->avg_ttl = 0; - break; + /* If there is nothing to expire try next DB ASAP. */ + if (db->setexpire->empty()) + { + db->avg_ttl = 0; + db->last_expire_set = now; + continue; + } + + size_t expired = 0; + size_t tried = 0; + long long check = ACTIVE_EXPIRE_CYCLE_FAST_DURATION; // assume a check is roughly 1us. It isn't but good enough + db->expireitr = db->setexpire->enumerate(db->expireitr, now, [&](expireEntry &e) __attribute__((always_inline)) { + if (e.when() < now) + { + activeExpireCycleExpire(db, e, now); + ++expired; } - slots = dictSlots(db->expires); - now = mstime(); + ++tried; - /* When there are less than 1% filled slots getting random - * keys is expensive, so stop here waiting for better times... - * The dictionary will be resized asap. */ - if (num && slots > DICT_HT_INITIAL_SIZE && - (num*100/slots < 1)) break; - - /* The main collection cycle. Sample random keys among keys - * with an expire set, checking for expired ones. */ - expired = 0; - ttl_sum = 0; - ttl_samples = 0; - - if (num > ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP) - num = ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP; - - while (num--) { - dictEntry *de; - long long ttl; - - if ((de = dictGetRandomKey(db->expires)) == NULL) break; - ttl = dictGetSignedIntegerVal(de)-now; - if (activeExpireCycleTryExpire(db,de,now)) expired++; - if (ttl > 0) { - /* We want the average TTL of keys yet not expired. */ - ttl_sum += ttl; - ttl_samples++; - } - total_sampled++; - } - total_expired += expired; - - /* Update the average TTL stats for this database. */ - if (ttl_samples) { - long long avg_ttl = ttl_sum/ttl_samples; - - /* Do a simple running average with a few samples. - * We just use the current estimate with a weight of 2% - * and the previous estimate with a weight of 98%. */ - if (db->avg_ttl == 0) db->avg_ttl = avg_ttl; - db->avg_ttl = (db->avg_ttl/50)*49 + (avg_ttl/50); - } - - /* We can't block forever here even if there are many keys to - * expire. So after a given amount of milliseconds return to the - * caller waiting for the other active expire cycle. */ - if ((iteration & 0xf) == 0) { /* check once every 16 iterations. */ + if ((tried % ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP) == 0) + { + /* We can't block forever here even if there are many keys to + * expire. 
So after a given amount of milliseconds return to the
+                 * caller waiting for the other active expire cycle. */
                elapsed = ustime()-start;
                if (elapsed > timelimit) {
                    timelimit_exit = 1;
                    g_pserver->stat_expired_time_cap_reached_count++;
-                    break;
+                    return false;
                }
+                check = ACTIVE_EXPIRE_CYCLE_FAST_DURATION;
            }
-            /* We don't repeat the cycle if there are less than 25% of keys
-             * found expired in the current DB. */
-        } while (expired > ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP/4);
+            return true;
+        }, &check);
+
+        total_expired += expired;
    }

    elapsed = ustime()-start;
@@ -300,20 +359,27 @@ void expireSlaveKeys(void) {
        while(dbids && dbid < cserver.dbnum) {
            if ((dbids & 1) != 0) {
                redisDb *db = g_pserver->db+dbid;
-                dictEntry *expire = dictFind(db->expires,keyname);
+
+                // the expire is hashed based on the key pointer, so we need the pointer in the main db
+                dictEntry *deMain = dictFind(db->pdict, keyname);
+                auto itr = db->setexpire->end();
+                if (deMain != nullptr)
+                    itr = db->setexpire->find((sds)dictGetKey(deMain));
                int expired = 0;
-                if (expire &&
-                    activeExpireCycleTryExpire(g_pserver->db+dbid,expire,start))
+                if (itr != db->setexpire->end())
                {
-                    expired = 1;
+                    if (itr->when() < start) {
+                        activeExpireCycleExpire(g_pserver->db+dbid,*itr,start);
+                        expired = 1;
+                    }
                }

                /* If the key was not expired in this DB, we need to set the
                 * corresponding bit in the new bitmap we set as value.
                 * At the end of the loop if the bitmap is zero, it means we
                 * no longer need to keep track of this key. */
-                if (expire && !expired) {
+                if (itr != db->setexpire->end() && !expired) {
                    noexpire++;
                    new_dbids |= (uint64_t)1 << dbid;
                }
@@ -440,7 +506,7 @@ void expireGenericCommand(client *c, long long basetime, int unit) {
        addReply(c, shared.cone);
        return;
    } else {
-        setExpire(c,c->db,key,when);
+        setExpire(c,c->db,key,nullptr,when);
        addReply(c,shared.cone);
        signalModifiedKey(c->db,key);
        notifyKeyspaceEvent(NOTIFY_GENERIC,"expire",key,c->db->id);
@@ -471,7 +537,7 @@ void pexpireatCommand(client *c) {

/* Implements TTL and PTTL */
void ttlGenericCommand(client *c, int output_ms) {
-    long long expire, ttl = -1;
+    long long expire = -1, ttl = -1;

    /* If the key does not exist at all, return -2 */
    if (lookupKeyReadWithFlags(c->db,c->argv[1],LOOKUP_NOTOUCH) == nullptr) {
@@ -480,7 +546,10 @@ void ttlGenericCommand(client *c, int output_ms) {
    }
    /* The key exists. Return -1 if it has no expire, or the actual
     * TTL value otherwise. */
-    expire = getExpire(c->db,c->argv[1]);
+    expireEntry *pexpire = getExpire(c->db,c->argv[1]);
+    if (pexpire != nullptr)
+        pexpire->FGetPrimaryExpire(&expire);
+
    if (expire != -1) {
        ttl = expire-mstime();
        if (ttl < 0) ttl = 0;
diff --git a/src/help.h b/src/help.h
index 184d76724..01b856b9d 100644
--- a/src/help.h
+++ b/src/help.h
@@ -343,6 +343,9 @@ struct commandHelp {
    "Set the expiration for a key as a UNIX timestamp",
    0,
    "1.2.0" },
+    { "EXPIREMEMBER",
+    "key subkey seconds",
+    "Set a subkey's time to live in seconds"},
    { "FLUSHALL",
    "[ASYNC]",
    "Remove all keys from all databases",
diff --git a/src/hyperloglog.cpp b/src/hyperloglog.cpp
index 344fd219f..0b1239965 100644
--- a/src/hyperloglog.cpp
+++ b/src/hyperloglog.cpp
@@ -710,6 +710,7 @@ int hllSparseSet(robj *o, long index, uint8_t count) {
        first += span;
    }
    if (span == 0) return -1; /* Invalid format. */
+    if (p >= end) return -1; /* Invalid format. */

    next = HLL_SPARSE_IS_XZERO(p) ?
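/* Aside on the added bounds check above: after the seek loop 'p' can land
 * exactly on 'end' when the sparse stream is truncated or corrupt, and
 * HLL_SPARSE_IS_XZERO(p) dereferences p (XZERO opcodes are two bytes,
 * 01xxxxxx yyyyyyyy, versus one byte for ZERO and VAL), so without the
 * check the opcode test below would read past the buffer. */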
p+2 : p+1; if (next >= end) next = NULL; diff --git a/src/lazyfree.cpp b/src/lazyfree.cpp index 6d56ec86d..91577cb85 100644 --- a/src/lazyfree.cpp +++ b/src/lazyfree.cpp @@ -52,16 +52,19 @@ size_t lazyfreeGetFreeEffort(robj *obj) { * will be reclaimed in a different bio.c thread. */ #define LAZYFREE_THRESHOLD 64 int dbAsyncDelete(redisDb *db, robj *key) { - /* Deleting an entry from the expires dict will not free the sds of - * the key, because it is shared with the main dictionary. */ - if (dictSize(db->expires) > 0) dictDelete(db->expires,ptrFromObj(key)); - /* If the value is composed of a few allocations, to free in a lazy way * is actually just slower... So under a certain limit we just free * the object synchronously. */ dictEntry *de = dictUnlink(db->pdict,ptrFromObj(key)); if (de) { robj *val = (robj*)dictGetVal(de); + if (val->FExpires()) + { + /* Deleting an entry from the expires dict will not free the sds of + * the key, because it is shared with the main dictionary. */ + removeExpireCore(db,key,de); + } + size_t free_effort = lazyfreeGetFreeEffort(val); /* If releasing the object is too much work, do it in the background @@ -72,7 +75,7 @@ int dbAsyncDelete(redisDb *db, robj *key) { * objects, and then call dbDelete(). In this case we'll fall * through and reach the dictFreeUnlinkedEntry() call, that will be * equivalent to just calling decrRefCount(). */ - if (free_effort > LAZYFREE_THRESHOLD && val->refcount == 1) { + if (free_effort > LAZYFREE_THRESHOLD && val->getrefcount(std::memory_order_relaxed) == 1) { atomicIncr(lazyfree_objects,1); bioCreateBackgroundJob(BIO_LAZY_FREE,val,NULL,NULL); dictSetVal(db->pdict,de,NULL); @@ -93,7 +96,7 @@ int dbAsyncDelete(redisDb *db, robj *key) { /* Free an object, if the object is huge enough, free it in async way. */ void freeObjAsync(robj *o) { size_t free_effort = lazyfreeGetFreeEffort(o); - if (free_effort > LAZYFREE_THRESHOLD && o->refcount == 1) { + if (free_effort > LAZYFREE_THRESHOLD && o->getrefcount(std::memory_order_relaxed) == 1) { atomicIncr(lazyfree_objects,1); bioCreateBackgroundJob(BIO_LAZY_FREE,o,NULL,NULL); } else { @@ -105,11 +108,13 @@ void freeObjAsync(robj *o) { * create a new empty set of hash tables and scheduling the old ones for * lazy freeing. */ void emptyDbAsync(redisDb *db) { - dict *oldht1 = db->pdict, *oldht2 = db->expires; + dict *oldht1 = db->pdict; + auto *set = db->setexpire; + db->setexpire = new (MALLOC_LOCAL) expireset(); + db->expireitr = db->setexpire->end(); db->pdict = dictCreate(&dbDictType,NULL); - db->expires = dictCreate(&keyptrDictType,NULL); atomicIncr(lazyfree_objects,dictSize(oldht1)); - bioCreateBackgroundJob(BIO_LAZY_FREE,NULL,oldht1,oldht2); + bioCreateBackgroundJob(BIO_LAZY_FREE,NULL,oldht1,set); } /* Empty the slots-keys map of Redis CLuster by creating a new empty one @@ -136,10 +141,10 @@ void lazyfreeFreeObjectFromBioThread(robj *o) { * when the database was logically deleted. 'sl' is a skiplist used by * Redis Cluster in order to take the hash slots -> keys mapping. This * may be NULL if Redis Cluster is disabled. 
 */
-void lazyfreeFreeDatabaseFromBioThread(dict *ht1, dict *ht2) {
+void lazyfreeFreeDatabaseFromBioThread(dict *ht1, expireset *set) {
    size_t numkeys = dictSize(ht1);
    dictRelease(ht1);
-    dictRelease(ht2);
+    delete set;
    atomicDecr(lazyfree_objects,numkeys);
}

diff --git a/src/module.cpp b/src/module.cpp
index f071ae4e1..1d00d2b1d 100644
--- a/src/module.cpp
+++ b/src/module.cpp
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include <sys/stat.h>

 #define REDISMODULE_CORE 1
 #include "redismodule.h"
@@ -565,7 +566,7 @@ void RedisModuleCommandDispatcher(client *c) {
        for (int i = 0; i < c->argc; i++) {
            /* Only do the work if the module took ownership of the object:
             * in that case the refcount is no longer 1. */
-            if (c->argv[i]->refcount > 1)
+            if (c->argv[i]->getrefcount(std::memory_order_relaxed) > 1)
                trimStringObjectIfNeeded(c->argv[i]);
        }
    }
@@ -1036,7 +1037,7 @@ int RM_StringCompare(RedisModuleString *a, RedisModuleString *b) {
/* Return the (possibly modified in encoding) input 'str' object if
 * the string is unshared, otherwise NULL is returned. */
RedisModuleString *moduleAssertUnsharedString(RedisModuleString *str) {
-    if (str->refcount != 1) {
+    if (str->getrefcount(std::memory_order_relaxed) != 1) {
        serverLog(LL_WARNING,
            "Module attempted to use an in-place string modify operation "
            "with a string referenced multiple times. Please check the code "
@@ -1252,6 +1253,17 @@ int RM_ReplyWithStringBuffer(RedisModuleCtx *ctx, const char *buf, size_t len) {
    return REDISMODULE_OK;
}

+/* Reply with a bulk string, taking in input a C buffer pointer that is
+ * assumed to be null-terminated.
+ *
+ * The function always returns REDISMODULE_OK. */
+int RM_ReplyWithCString(RedisModuleCtx *ctx, const char *buf) {
+    client *c = moduleGetReplyClient(ctx);
+    if (c == NULL) return REDISMODULE_OK;
+    addReplyBulkCString(c,(char*)buf);
+    return REDISMODULE_OK;
+}
+
/* Reply with a bulk string, taking in input a RedisModuleString object.
 *
 * The function always returns REDISMODULE_OK. */
@@ -1465,6 +1477,9 @@ int RM_GetContextFlags(RedisModuleCtx *ctx) {
    if (g_pserver->cluster_enabled)
        flags |= REDISMODULE_CTX_FLAGS_CLUSTER;

+    if (g_pserver->loading)
+        flags |= REDISMODULE_CTX_FLAGS_LOADING;
+
    /* Maxmemory and eviction policy */
    if (g_pserver->maxmemory > 0) {
        flags |= REDISMODULE_CTX_FLAGS_MAXMEMORY;
@@ -1629,7 +1644,11 @@ int RM_UnlinkKey(RedisModuleKey *key) {
 * If no TTL is associated with the key or if the key is empty,
 * REDISMODULE_NO_EXPIRE is returned. */
mstime_t RM_GetExpire(RedisModuleKey *key) {
-    mstime_t expire = getExpire(key->db,key->key);
+    expireEntry *pexpire = getExpire(key->db,key->key);
+    mstime_t expire = -1;
+    if (pexpire != nullptr)
+        pexpire->FGetPrimaryExpire(&expire);
+
    if (expire == -1 || key->value == NULL)
        return -1;
    expire -= mstime();
    return expire >= 0 ?
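/* Aside (sketch of the caller pattern this patch adopts): getExpire() now
 * returns an expireEntry* rather than a raw timestamp, so callers that only
 * want the whole-key TTL, as here, do:
 *
 *   expireEntry *pexpire = getExpire(db, key);
 *   long long when = -1;                      // -1 == no primary expire
 *   if (pexpire != nullptr)
 *       pexpire->FGetPrimaryExpire(&when);    // presumably leaves -1 if only
 *                                             // subkey expires exist
 *
 * 'when' is then an absolute unix time in milliseconds, matching the old
 * convention. */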
expire : 0; @@ -1649,7 +1668,7 @@ int RM_SetExpire(RedisModuleKey *key, mstime_t expire) { return REDISMODULE_ERR; if (expire != REDISMODULE_NO_EXPIRE) { expire += mstime(); - setExpire(key->ctx->client,key->db,key->key,expire); + setExpire(key->ctx->client,key->db,key->key,nullptr,expire); } else { removeExpire(key->db,key->key); } @@ -5216,6 +5235,15 @@ int moduleLoad(const char *path, void **module_argv, int module_argc) { int (*onload)(void *, void **, int); void *handle; RedisModuleCtx ctx = REDISMODULE_CTX_INIT; + + struct stat st; + if (stat(path, &st) == 0) + { // this check is best effort + if (!(st.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) { + serverLog(LL_WARNING, "Module %s failed to load: It does not have execute permissions.", path); + return C_ERR; + } + } handle = dlopen(path,RTLD_NOW|RTLD_LOCAL); if (handle == NULL) { diff --git a/src/networking.cpp b/src/networking.cpp index baaf796fd..0588745b2 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -174,6 +174,7 @@ client *createClient(int fd, int iel) { c->bufAsync = NULL; c->buflenAsync = 0; c->bufposAsync = 0; + c->client_tracking_redirection = 0; c->casyncOpsPending = 0; memset(c->uuid, 0, UUID_BINARY_LEN); @@ -246,7 +247,7 @@ void clientInstallAsyncWriteHandler(client *c) { * data should be appended to the output buffers. */ int prepareClientToWrite(client *c, bool fAsync) { fAsync = fAsync && !FCorrectThread(c); // Not async if we're on the right thread - serverAssert(!fAsync || GlobalLocksAcquired()); + serverAssert(FCorrectThread(c) || fAsync); serverAssert(c->fd <= 0 || c->lock.fOwnLock()); if (c->flags & CLIENT_FORCE_REPLY) return C_OK; // FORCE REPLY means we're doing something else with the buffer. @@ -1264,6 +1265,9 @@ void unlinkClient(client *c) { serverAssert(fFound); c->fPendingAsyncWrite = FALSE; } + + /* Clear the tracking status. 
*/ + if (c->flags & CLIENT_TRACKING) disableTracking(c); } bool freeClient(client *c) { @@ -2294,6 +2298,8 @@ sds catClientInfoString(sds s, client *client) { if (client->flags & CLIENT_PUBSUB) *p++ = 'P'; if (client->flags & CLIENT_MULTI) *p++ = 'x'; if (client->flags & CLIENT_BLOCKED) *p++ = 'b'; + if (client->flags & CLIENT_TRACKING) *p++ = 't'; + if (client->flags & CLIENT_TRACKING_BROKEN_REDIR) *p++ = 'R'; if (client->flags & CLIENT_DIRTY_CAS) *p++ = 'd'; if (client->flags & CLIENT_CLOSE_AFTER_REPLY) *p++ = 'c'; if (client->flags & CLIENT_UNBLOCKED) *p++ = 'u'; @@ -2406,6 +2412,7 @@ void clientCommand(client *c) { "reply (on|off|skip) -- Control the replies sent to the current connection.", "setname -- Assign the name to the current connection.", "unblock [TIMEOUT|ERROR] -- Unblock the specified blocked client.", +"tracking (on|off) [REDIRECT <id>] -- Enable client keys tracking for client side caching.", NULL }; addReplyHelp(c, help); @@ -2564,21 +2571,57 @@ NULL } else { addReply(c,shared.czero); } - } else if (!strcasecmp((const char*)ptrFromObj(c->argv[1]),"setname") && c->argc == 3) { + } else if (!strcasecmp(szFromObj(c->argv[1]),"setname") && c->argc == 3) { + /* CLIENT SETNAME */ if (clientSetNameOrReply(c,c->argv[2]) == C_OK) addReply(c,shared.ok); - } else if (!strcasecmp((const char*)ptrFromObj(c->argv[1]),"getname") && c->argc == 2) { + } else if (!strcasecmp(szFromObj(c->argv[1]),"getname") && c->argc == 2) { + /* CLIENT GETNAME */ if (c->name) addReplyBulk(c,c->name); else addReplyNull(c, shared.nullbulk); - } else if (!strcasecmp((const char*)ptrFromObj(c->argv[1]),"pause") && c->argc == 3) { + } else if (!strcasecmp(szFromObj(c->argv[1]),"pause") && c->argc == 3) { + /* CLIENT PAUSE */ long long duration; - if (getTimeoutFromObjectOrReply(c,c->argv[2],&duration,UNIT_MILLISECONDS) - != C_OK) return; + if (getTimeoutFromObjectOrReply(c,c->argv[2],&duration, + UNIT_MILLISECONDS) != C_OK) return; pauseClients(duration); addReply(c,shared.ok); + } else if (!strcasecmp(szFromObj(c->argv[1]),"tracking") && + (c->argc == 3 || c->argc == 5)) + { + /* CLIENT TRACKING (on|off) [REDIRECT <id>] */ + long long redir = 0; + + /* Parse the redirection option: we'll require the client with + * the specified ID to exist right now, even if it is possible + * it will get disconnected later. */ + if (c->argc == 5) { + if (strcasecmp(szFromObj(c->argv[3]),"redirect") != 0) { + addReply(c,shared.syntaxerr); + return; + } else { + if (getLongLongFromObjectOrReply(c,c->argv[4],&redir,NULL) != + C_OK) return; + if (lookupClientByID(redir) == NULL) { + addReplyError(c,"The client ID you want to redirect to " + "does not exist"); + return; + } + } + } + + if (!strcasecmp(szFromObj(c->argv[2]),"on")) { + enableTracking(c,redir); + } else if (!strcasecmp(szFromObj(c->argv[2]),"off")) { + disableTracking(c); + } else { + addReply(c,shared.syntaxerr); + return; + } + addReply(c,shared.ok); } else { addReplyErrorFormat(c, "Unknown subcommand or wrong number of arguments for '%s'. Try CLIENT HELP", (char*)ptrFromObj(c->argv[1])); } @@ -2745,15 +2788,8 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { } } -/* This function returns the number of bytes that Redis is virtually +/* This function returns the number of bytes that Redis is * using to store the reply still not read by the client. - * It is "virtual" since the reply output list may contain objects that - * are shared and are not really using additional memory.
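The CLIENT TRACKING plumbing added above is easiest to see end to end from a client. Below is a minimal sketch, assuming the stock hiredis API; the host, port, and variable names are illustrative only:

```cpp
// One connection subscribes to the invalidation channel; a second enables
// tracking and redirects its invalidation messages to the first.
#include <hiredis/hiredis.h>
#include <cstdio>

int main() {
    redisContext *listener = redisConnect("127.0.0.1", 6379);
    redisContext *worker = redisConnect("127.0.0.1", 6379);
    if (!listener || listener->err || !worker || worker->err) return 1;

    // The listener's client ID is the REDIRECT target.
    redisReply *r = (redisReply*)redisCommand(listener, "CLIENT ID");
    long long listenerId = r->integer;
    freeReplyObject(r);

    // Invalidation messages arrive as Pub/Sub messages on this channel
    // (TrackingChannelName in tracking.cpp later in this diff).
    r = (redisReply*)redisCommand(listener, "SUBSCRIBE __redis__:invalidate");
    freeReplyObject(r);

    // CLIENT TRACKING on REDIRECT <id> fails if the target ID is unknown.
    r = (redisReply*)redisCommand(worker, "CLIENT TRACKING on REDIRECT %lld",
                                  listenerId);
    printf("tracking: %s\n", r->str ? r->str : "(error)");
    freeReplyObject(r);

    redisFree(worker);
    redisFree(listener);
    return 0;
}
```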
- * - * The function returns the total sum of the length of all the objects - * stored in the output list, plus the memory used to allocate every - * list node. The static reply buffer is not taken into account since it - * is allocated anyway. * * Note: this function is very fast so can be called as many time as * the caller wishes. The main usage of this function currently is diff --git a/src/object.cpp b/src/object.cpp index 6e65ec52b..ce6265ad1 100644 --- a/src/object.cpp +++ b/src/object.cpp @@ -39,11 +39,11 @@ /* ===================== Creation and parsing of objects ==================== */ robj *createObject(int type, void *ptr) { - robj *o = (robj*)zmalloc(sizeof(*o), MALLOC_SHARED); + robj *o = (robj*)zcalloc(sizeof(*o), MALLOC_SHARED); o->type = type; o->encoding = OBJ_ENCODING_RAW; o->m_ptr = ptr; - o->refcount.store(1, std::memory_order_relaxed); + o->setrefcount(1); o->mvcc_tstamp = OBJ_MVCC_INVALID; /* Set the LRU to the current lruclock (minutes resolution), or @@ -68,8 +68,9 @@ robj *createObject(int type, void *ptr) { * */ robj *makeObjectShared(robj *o) { - serverAssert(o->refcount == 1); - o->refcount.store(OBJ_SHARED_REFCOUNT, std::memory_order_relaxed); + serverAssert(o->getrefcount(std::memory_order_relaxed) == 1); + serverAssert(!o->FExpires()); + o->setrefcount(OBJ_SHARED_REFCOUNT); return o; } @@ -86,12 +87,12 @@ robj *createEmbeddedStringObject(const char *ptr, size_t len) { size_t allocsize = sizeof(struct sdshdr8)+len+1; if (allocsize < sizeof(void*)) allocsize = sizeof(void*); - robj *o = (robj*)zmalloc(sizeof(robj)+allocsize-sizeof(o->m_ptr), MALLOC_SHARED); + robj *o = (robj*)zcalloc(sizeof(robj)+allocsize-sizeof(o->m_ptr), MALLOC_SHARED); struct sdshdr8 *sh = (sdshdr8*)(&o->m_ptr); o->type = OBJ_STRING; o->encoding = OBJ_ENCODING_EMBSTR; - o->refcount.store(1, std::memory_order_relaxed); + o->setrefcount(1); o->mvcc_tstamp = OBJ_MVCC_INVALID; if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LFU) { @@ -352,11 +353,14 @@ void freeStreamObject(robj_roptr o) { } void incrRefCount(robj_roptr o) { - if (o->refcount != OBJ_SHARED_REFCOUNT) o->refcount.fetch_add(1, std::memory_order_acquire); + if (o->getrefcount(std::memory_order_relaxed) != OBJ_SHARED_REFCOUNT) o->addref(); } void decrRefCount(robj_roptr o) { - if (o->refcount.load(std::memory_order_acquire) == 1) { + if (o->getrefcount(std::memory_order_relaxed) == OBJ_SHARED_REFCOUNT) + return; + unsigned prev = o->release(); + if (prev == 1) { switch(o->type) { case OBJ_STRING: freeStringObject(o); break; case OBJ_LIST: freeListObject(o); break; @@ -369,8 +373,7 @@ void decrRefCount(robj_roptr o) { } zfree(o.unsafe_robjcast()); } else { - if (o->refcount <= 0) serverPanic("decrRefCount against refcount <= 0"); - if (o->refcount != OBJ_SHARED_REFCOUNT) o->refcount.fetch_sub(1, std::memory_order_acquire); + if (prev <= 0) serverPanic("decrRefCount against refcount <= 0"); } } @@ -394,7 +397,7 @@ void decrRefCountVoid(const void *o) { * decrRefCount(obj); */ robj *resetRefCount(robj *obj) { - obj->refcount = 0; + obj->setrefcount(0); return obj; } @@ -452,7 +455,7 @@ robj *tryObjectEncoding(robj *o) { /* It's not safe to encode shared objects: shared objects can be shared * everywhere in the "object space" of Redis and may end in places where * they are not handled. We handle them only as values in the keyspace. */ - if (o->refcount > 1) return o; + if (o->getrefcount(std::memory_order_relaxed) > 1) return o; /* Check if we can represent this string as a long integer. 
* Note that we are sure that a string larger than 20 chars is not @@ -1064,8 +1067,7 @@ struct redisMemOverhead *getMemoryOverheadData(void) { mh->db[mh->num_dbs].overhead_ht_main = mem; mem_total+=mem; - mem = dictSize(db->expires) * sizeof(dictEntry) + - dictSlots(db->expires) * sizeof(dictEntry*); + mem = db->setexpire->bytes_used(); mh->db[mh->num_dbs].overhead_ht_expires = mem; mem_total+=mem; @@ -1275,7 +1277,7 @@ NULL } else if (!strcasecmp(szFromObj(c->argv[1]),"refcount") && c->argc == 3) { if ((o = objectCommandLookupOrReply(c,c->argv[2],shared.null[c->resp])) == NULL) return; - addReplyLongLong(c,o->refcount); + addReplyLongLong(c,o->getrefcount(std::memory_order_relaxed)); } else if (!strcasecmp(szFromObj(c->argv[1]),"encoding") && c->argc == 3) { if ((o = objectCommandLookupOrReply(c,c->argv[2],shared.null[c->resp])) == NULL) return; @@ -1474,3 +1476,18 @@ NULL addReplyErrorFormat(c, "Unknown subcommand or wrong number of arguments for '%s'. Try MEMORY HELP", (char*)ptrFromObj(c->argv[1])); } } + +void redisObject::SetFExpires(bool fExpire) +{ + serverAssert(this->refcount != OBJ_SHARED_REFCOUNT); + if (fExpire) + this->refcount.fetch_or(1U << 31, std::memory_order_relaxed); + else + this->refcount.fetch_and(~(1U << 31), std::memory_order_relaxed); +} + +void redisObject::setrefcount(unsigned ref) +{ + serverAssert(!FExpires()); + refcount.store(ref, std::memory_order_relaxed); +} diff --git a/src/rdb.cpp b/src/rdb.cpp index d446109fa..ee61b3f19 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -278,8 +278,8 @@ void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags, size_t *lenptr) { v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24); val = (int32_t)v; } else { - val = 0; /* anti-warning */ rdbExitReportCorruptRDB("Unknown RDB integer encoding type %d",enctype); + return nullptr; /* Never reached. */ } if (plain || sds) { char buf[LONG_STR_SIZE], *p; @@ -382,8 +382,7 @@ void *rdbLoadLzfStringObject(rio *rdb, int flags, size_t *lenptr) { /* Load the compressed representation and uncompress it to target. */ if (rioRead(rdb,c,clen) == 0) goto err; if (lzf_decompress(c,clen,val,len) == 0) { - if (rdbCheckMode) rdbCheckSetError("Invalid LZF compressed string"); - goto err; + rdbExitReportCorruptRDB("Invalid LZF compressed string"); } zfree(c); @@ -497,6 +496,7 @@ void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr) { return rdbLoadLzfStringObject(rdb,flags,lenptr); default: rdbExitReportCorruptRDB("Unknown RDB string encoding type %d",len); + return nullptr; /* Never reached. */ } } @@ -1031,12 +1031,13 @@ size_t rdbSavedObjectLen(robj *o) { * On error -1 is returned. * On success if the key was actually saved 1 is returned, otherwise 0 * is returned (the key was already expired). 
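The object.cpp hunks above route every refcount access through accessors because bit 31 of the refcount now doubles as the FExpires flag (see the SetFExpires and setrefcount definitions just added). A standalone illustration of the packing, with hypothetical names; the logic mirrors the accessors in this diff:

```cpp
// Bit 31 of one 32-bit atomic carries the "has an expire" flag while the
// low 31 bits carry the reference count.
#include <atomic>
#include <cassert>

class PackedRefCount {
    static constexpr unsigned FLAG_EXPIRES = 1U << 31;
    mutable std::atomic<unsigned> refcount {0};
public:
    bool FExpires() const {
        return refcount.load(std::memory_order_relaxed) >> 31;
    }
    void SetFExpires(bool f) {
        if (f) refcount.fetch_or(FLAG_EXPIRES, std::memory_order_relaxed);
        else refcount.fetch_and(~FLAG_EXPIRES, std::memory_order_relaxed);
    }
    unsigned getrefcount() const {
        return refcount.load(std::memory_order_relaxed) & ~FLAG_EXPIRES;
    }
    void addref() const { refcount.fetch_add(1, std::memory_order_acq_rel); }
    // Returns the count *before* the decrement with the flag masked off,
    // mirroring release() in this diff.
    unsigned release() const {
        return refcount.fetch_sub(1, std::memory_order_acq_rel) & ~FLAG_EXPIRES;
    }
};

int main() {
    PackedRefCount rc;
    rc.addref();
    rc.SetFExpires(true);
    assert(rc.getrefcount() == 1 && rc.FExpires());
    assert(rc.release() == 1);   // count was 1 before the decrement
    assert(rc.getrefcount() == 0 && rc.FExpires());
    return 0;
}
```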
*/ -int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, long long expiretime) { +int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, expireEntry *pexpire) { int savelru = g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LRU; int savelfu = g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LFU; /* Save the expire time */ - if (expiretime != -1) { + long long expiretime = -1; + if (pexpire != nullptr && pexpire->FGetPrimaryExpire(&expiretime)) { if (rdbSaveType(rdb,RDB_OPCODE_EXPIRETIME_MS) == -1) return -1; if (rdbSaveMillisecondTime(rdb,expiretime) == -1) return -1; } @@ -1061,14 +1062,29 @@ if (rdbWriteRaw(rdb,buf,1) == -1) return -1; } - char szMvcc[32]; - snprintf(szMvcc, 32, "%" PRIu64, val->mvcc_tstamp); - if (rdbSaveAuxFieldStrStr(rdb,"mvcc-tstamp", szMvcc) == -1) return -1; + char szT[32]; + snprintf(szT, 32, "%" PRIu64, val->mvcc_tstamp); + if (rdbSaveAuxFieldStrStr(rdb,"mvcc-tstamp", szT) == -1) return -1; /* Save type, key, value */ if (rdbSaveObjectType(rdb,val) == -1) return -1; if (rdbSaveStringObject(rdb,key) == -1) return -1; if (rdbSaveObject(rdb,val,key) == -1) return -1; + + /* Save the expire entry afterward, as it will apply to the previously loaded key */ + /* This is because we update the expire data structure directly without buffering */ + if (pexpire != nullptr) + { + for (auto itr : *pexpire) + { + if (itr.subkey() == nullptr) + continue; // already saved + snprintf(szT, 32, "%lld", itr.when()); + rdbSaveAuxFieldStrStr(rdb,"keydb-subexpire-key",itr.subkey()); + rdbSaveAuxFieldStrStr(rdb,"keydb-subexpire-when",szT); + } + } + return 1; } @@ -1096,6 +1112,28 @@ int rdbSaveInfoAuxFields(rio *rdb, int flags, rdbSaveInfo *rsi) { return 1; } +int saveKey(rio *rdb, redisDb *db, int flags, size_t *processed, const char *keystr, robj *o) +{ + robj key; + + initStaticStringObject(key,(char*)keystr); + expireEntry *pexpire = getExpire(db, &key); + + if (rdbSaveKeyValuePair(rdb,&key,o,pexpire) == -1) + return 0; + + /* When this RDB is produced as part of an AOF rewrite, move + * accumulated diff from parent to child while rewriting in + * order to have a smaller final write. */ + if (flags & RDB_SAVE_AOF_PREAMBLE && + rdb->processed_bytes > *processed+AOF_READ_DIFF_INTERVAL_BYTES) + { + *processed = rdb->processed_bytes; + aofReadDiffFromParent(); + } + return 1; +} + /* Produces a dump of the database in RDB format sending it to the specified * Redis I/O channel. On success C_OK is returned, otherwise C_ERR * is returned and part of the output, or all the output, can be @@ -1134,31 +1172,24 @@ int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) { * these sizes are just hints to resize the hash tables.
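saveKey() above also shows the new calling convention used throughout this patch: getExpire() now returns an expireEntry pointer instead of a millisecond timestamp. A minimal sketch of reading a key-level TTL under that contract (the helper name is hypothetical, and it only compiles inside the KeyDB tree):

```cpp
// Hypothetical helper: extract the primary (key-level) expire from the
// new getExpire() result. FGetPrimaryExpire() returns false when the
// entry only carries subkey expires.
mstime_t keyLevelExpire(redisDb *db, robj *key) {
    expireEntry *pexpire = getExpire(db, key);
    long long when = -1;
    if (pexpire != nullptr)
        pexpire->FGetPrimaryExpire(&when);
    return (mstime_t)when;   // -1 means no key-level expire
}
```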
*/ uint64_t db_size, expires_size; db_size = dictSize(db->pdict); - expires_size = dictSize(db->expires); + expires_size = db->setexpire->size(); if (rdbSaveType(rdb,RDB_OPCODE_RESIZEDB) == -1) goto werr; if (rdbSaveLen(rdb,db_size) == -1) goto werr; if (rdbSaveLen(rdb,expires_size) == -1) goto werr; - + /* Iterate this DB writing every entry */ + size_t ckeysExpired = 0; while((de = dictNext(di)) != NULL) { sds keystr = (sds)dictGetKey(de); - robj key, *o = (robj*)dictGetVal(de); - long long expire; + robj *o = (robj*)dictGetVal(de); - initStaticStringObject(key,keystr); - expire = getExpire(db,&key); - if (rdbSaveKeyValuePair(rdb,&key,o,expire) == -1) goto werr; - - /* When this RDB is produced as part of an AOF rewrite, move - * accumulated diff from parent to child while rewriting in - * order to have a smaller final write. */ - if (flags & RDB_SAVE_AOF_PREAMBLE && - rdb->processed_bytes > processed+AOF_READ_DIFF_INTERVAL_BYTES) - { - processed = rdb->processed_bytes; - aofReadDiffFromParent(); - } + if (o->FExpires()) + ++ckeysExpired; + + if (!saveKey(rdb, db, flags, &processed, keystr, o)) + goto werr; } + serverAssert(ckeysExpired == db->setexpire->size()); dictReleaseIterator(di); di = NULL; /* So that we don't release it again on error. */ } @@ -1822,6 +1853,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, robj *key, uint64_t mvcc_tstamp) { } o->mvcc_tstamp = mvcc_tstamp; + serverAssert(!o->FExpires()); return o; } @@ -1890,6 +1922,8 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { long long lru_idle = -1, lfu_freq = -1, expiretime = -1, now = mstime(); long long lru_clock = 0; uint64_t mvcc_tstamp = OBJ_MVCC_INVALID; + robj *subexpireKey = nullptr; + robj *key = nullptr; rdb->update_cksum = rdbLoadProgressCallback; rdb->max_processing_chunk = g_pserver->loading_process_events_interval_bytes; @@ -1909,9 +1943,9 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { now = mstime(); lru_clock = LRU_CLOCK(); - + while(1) { - robj *key, *val; + robj *val; /* Read type. */ if ((type = rdbLoadType(rdb)) == -1) goto eoferr; @@ -1965,7 +1999,6 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { if ((expires_size = rdbLoadLen(rdb,NULL)) == RDB_LENERR) goto eoferr; dictExpand(db->pdict,db_size); - dictExpand(db->expires,expires_size); continue; /* Read next opcode. */ } else if (type == RDB_OPCODE_AUX) { /* AUX: generic string-string fields. Use to add state to RDB @@ -2020,6 +2053,18 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { } else if (!strcasecmp(szFromObj(auxkey),"mvcc-tstamp")) { static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "Ensure long long is 64-bits"); mvcc_tstamp = strtoull(szFromObj(auxval), nullptr, 10); + } else if (!strcasecmp(szFromObj(auxkey), "keydb-subexpire-key")) { + subexpireKey = auxval; + incrRefCount(subexpireKey); + } else if (!strcasecmp(szFromObj(auxkey), "keydb-subexpire-when")) { + if (key == nullptr || subexpireKey == nullptr) { + serverLog(LL_WARNING, "Corrupt subexpire entry in RDB, skipping."); + } + else { + setExpire(NULL, db, key, subexpireKey, strtoll(szFromObj(auxval), nullptr, 10)); + decrRefCount(subexpireKey); + subexpireKey = nullptr; + } } else { /* We ignore fields we don't understand, as by AUX field * contract.
*/ @@ -2061,6 +2106,12 @@ } /* Read key */ + if (key != nullptr) + { + decrRefCount(key); + key = nullptr; + } + if ((key = rdbLoadStringObject(rdb)) == NULL) goto eoferr; /* Read value */ if ((val = rdbLoadObject(type,rdb,key, mvcc_tstamp)) == NULL) goto eoferr; @@ -2071,27 +2122,26 @@ * snapshot taken by the master may not be reflected on the slave. */ if (listLength(g_pserver->masters) == 0 && !loading_aof && expiretime != -1 && expiretime < now) { decrRefCount(key); + key = nullptr; decrRefCount(val); + val = nullptr; } else { /* Add the new object in the hash table */ - int fInserted = dbMerge(db, key, val, rsi->fForceSetKey); + int fInserted = dbMerge(db, key, val, rsi->fForceSetKey); // Note: dbMerge will incrRef if (fInserted) { /* Set the expire time if needed */ - if (expiretime != -1) setExpire(NULL,db,key,expiretime); + if (expiretime != -1) + setExpire(NULL,db,key,nullptr,expiretime); /* Set usage information (for eviction). */ objectSetLRUOrLFU(val,lfu_freq,lru_idle,lru_clock); - - /* Decrement the key refcount since dbMerge() will take its - * own reference. */ - decrRefCount(key); } else { - decrRefCount(key); decrRefCount(val); + val = nullptr; } } @@ -2101,6 +2151,17 @@ lfu_freq = -1; lru_idle = -1; } + + if (key != nullptr) + decrRefCount(key); + + if (subexpireKey != nullptr) + { + serverLog(LL_WARNING, "Corrupt subexpire entry in RDB."); + decrRefCount(subexpireKey); + subexpireKey = nullptr; + } + /* Verify the checksum if RDB version is >= 5 */ if (rdbver >= 5) { uint64_t cksum, expected = rdb->cksum; @@ -2124,7 +2185,6 eoferr: /* unexpected end of file is handled here with a fatal exit */ return C_ERR; /* Just to avoid warning */ } -int rdbLoadFile(char *filename, rdbSaveInfo *rsi); int rdbLoad(rdbSaveInfo *rsi) { int err = C_ERR; @@ -2144,7 +2204,7 @@ * * If you pass an 'rsi' structure initialized with RDB_SAVE_OPTION_INIT, the * loading code will fill the information fields in the structure.
*/ -int rdbLoadFile(char *filename, rdbSaveInfo *rsi) { +int rdbLoadFile(const char *filename, rdbSaveInfo *rsi) { FILE *fp; rio rdb; int retval; diff --git a/src/rdb.h b/src/rdb.h index 0ee2cad92..edf43d422 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -136,6 +136,7 @@ int rdbLoadLenByRef(rio *rdb, int *isencoded, uint64_t *lenptr); int rdbSaveObjectType(rio *rdb, robj_roptr o); int rdbLoadObjectType(rio *rdb); int rdbLoad(rdbSaveInfo *rsi); +int rdbLoadFile(const char *filename, rdbSaveInfo *rsi); int rdbSaveBackground(rdbSaveInfo *rsi); int rdbSaveToSlavesSockets(rdbSaveInfo *rsi); void rdbRemoveTempFile(pid_t childpid); diff --git a/src/redis-benchmark.cpp b/src/redis-benchmark.cpp index 45170f037..19aa7a892 100644 --- a/src/redis-benchmark.cpp +++ b/src/redis-benchmark.cpp @@ -1546,7 +1546,10 @@ int main(int argc, const char **argv) { if (node->name) printf("%s ", node->name); printf("%s:%d\n", node->ip, node->port); node->redis_config = getRedisConfig(node->ip, node->port, NULL); - if (node->redis_config == NULL) exit(1); + if (node->redis_config == NULL) { + fprintf(stderr, "WARN: could not fetch node CONFIG %s:%d\n", + node->ip, node->port); + } } printf("\n"); /* Automatically set thread number to node count if not specified @@ -1556,7 +1559,8 @@ int main(int argc, const char **argv) { } else { config.redis_config = getRedisConfig(config.hostip, config.hostport, config.hostsocket); - if (config.redis_config == NULL) exit(1); + if (config.redis_config == NULL) + fprintf(stderr, "WARN: could not fetch server CONFIG\n"); } if (config.num_threads > 0) { diff --git a/src/redismodule.h b/src/redismodule.h index 217025319..fae755b0a 100644 --- a/src/redismodule.h +++ b/src/redismodule.h @@ -91,6 +91,8 @@ extern "C" { #define REDISMODULE_CTX_FLAGS_OOM_WARNING (1<<11) /* The command was sent over the replication link. */ #define REDISMODULE_CTX_FLAGS_REPLICATED (1<<12) +/* Redis is currently loading either from AOF or RDB. 
*/ +#define REDISMODULE_CTX_FLAGS_LOADING (1<<13) #define REDISMODULE_NOTIFY_GENERIC (1<<2) /* g */ @@ -230,6 +232,7 @@ int REDISMODULE_API_FUNC(RedisModule_ReplyWithSimpleString)(RedisModuleCtx *ctx, int REDISMODULE_API_FUNC(RedisModule_ReplyWithArray)(RedisModuleCtx *ctx, long len); void REDISMODULE_API_FUNC(RedisModule_ReplySetArrayLength)(RedisModuleCtx *ctx, long len); int REDISMODULE_API_FUNC(RedisModule_ReplyWithStringBuffer)(RedisModuleCtx *ctx, const char *buf, size_t len); +int REDISMODULE_API_FUNC(RedisModule_ReplyWithCString)(RedisModuleCtx *ctx, const char *buf); int REDISMODULE_API_FUNC(RedisModule_ReplyWithString)(RedisModuleCtx *ctx, RedisModuleString *str); int REDISMODULE_API_FUNC(RedisModule_ReplyWithNull)(RedisModuleCtx *ctx); int REDISMODULE_API_FUNC(RedisModule_ReplyWithDouble)(RedisModuleCtx *ctx, double d); @@ -380,6 +383,7 @@ static int RedisModule_Init(RedisModuleCtx *ctx, const char *name, int ver, int REDISMODULE_GET_API(ReplyWithArray); REDISMODULE_GET_API(ReplySetArrayLength); REDISMODULE_GET_API(ReplyWithStringBuffer); + REDISMODULE_GET_API(ReplyWithCString); REDISMODULE_GET_API(ReplyWithString); REDISMODULE_GET_API(ReplyWithNull); REDISMODULE_GET_API(ReplyWithCallReply); diff --git a/src/replication.cpp b/src/replication.cpp index 52ba568a6..5cd527259 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -31,6 +31,7 @@ #include "server.h" +#include "cluster.h" #include #include @@ -40,6 +41,7 @@ #include #include #include +#include void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, int newfd); @@ -321,9 +323,13 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { char uuid[40] = {'\0'}; uuid_unparse(cserver.uuid, uuid); char proto[1024]; - int cchProto = snprintf(proto, sizeof(proto), "*3\r\n$7\r\nRREPLAY\r\n$%d\r\n%s\r\n$%lld\r\n", (int)strlen(uuid), uuid, cchbuf); + int cchProto = snprintf(proto, sizeof(proto), "*4\r\n$7\r\nRREPLAY\r\n$%d\r\n%s\r\n$%lld\r\n", (int)strlen(uuid), uuid, cchbuf); cchProto = std::min((int)sizeof(proto), cchProto); long long master_repl_offset_start = g_pserver->master_repl_offset; + + serverAssert(dictid >= 0); + char szDbNum[128]; + int cchDbNum = snprintf(szDbNum, sizeof(szDbNum), "$%d\r\n%d\r\n", (dictid/10)+1, dictid); /* Write the command to the replication backlog if any. */ if (g_pserver->repl_backlog) @@ -366,6 +372,7 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { } const char *crlf = "\r\n"; feedReplicationBacklog(crlf, 2); + feedReplicationBacklog(szDbNum, cchDbNum); } } @@ -394,7 +401,10 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { addReplyProtoAsync(slave, reply->buf(), reply->used); } if (!fSendRaw) + { addReplyAsync(slave,shared.crlf); + addReplyProtoAsync(slave, szDbNum, cchDbNum); + } } freeClient(fake); @@ -1218,6 +1228,24 @@ void changeReplicationId(void) { g_pserver->replid[CONFIG_RUN_ID_SIZE] = '\0'; } + +int hexchToInt(char ch) +{ + if (ch >= '0' && ch <= '9') + return ch - '0'; + if (ch >= 'a' && ch <= 'f') + return (ch - 'a') + 10; + return (ch - 'A') + 10; +} +void mergeReplicationId(const char *id) +{ + for (int i = 0; i < CONFIG_RUN_ID_SIZE; ++i) + { + const char *charset = "0123456789abcdef"; + g_pserver->replid[i] = charset[hexchToInt(g_pserver->replid[i]) ^ hexchToInt(id[i])]; + } +} + /* Clear (invalidate) the secondary replication ID. This happens, for * example, after a full resynchronization, when we start a new replication * history. 
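The mergeReplicationId() helper added above folds the master's replication ID into the local one rather than adopting it outright. Because the merge is a digit-wise XOR over the 40-character hex ID, it is order-independent. A free-standing copy of the logic with an illustrative driver (buffer handling is simplified versus the g_pserver->replid field):

```cpp
#include <cstdio>
#include <cstring>

static int hexchToInt(char ch) {
    if (ch >= '0' && ch <= '9') return ch - '0';
    if (ch >= 'a' && ch <= 'f') return (ch - 'a') + 10;
    return (ch - 'A') + 10;
}

// XOR each hex digit of 'src' into 'dst'; merging A into B yields the
// same ID as merging B into A.
static void mergeReplId(char *dst, const char *src, size_t cch) {
    const char *charset = "0123456789abcdef";
    for (size_t i = 0; i < cch; ++i)
        dst[i] = charset[hexchToInt(dst[i]) ^ hexchToInt(src[i])];
}

int main() {
    char a[] = "0123456789abcdef0123456789abcdef01234567";
    char b[] = "fedcba9876543210fedcba9876543210fedcba98";
    mergeReplId(a, b, strlen(a));
    printf("%s\n", a);   // prints 40 'f' characters
    return 0;
}
```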
*/ @@ -1491,12 +1519,19 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { killRDBChild(); } - if (rename(mi->repl_transfer_tmpfile,g_pserver->rdb_filename) == -1) { - serverLog(LL_WARNING,"Failed trying to rename the temp DB into %s in MASTER <-> REPLICA synchronization: %s", - g_pserver->rdb_filename, strerror(errno)); - cancelReplicationHandshake(mi); - return; + const char *rdb_filename = mi->repl_transfer_tmpfile; + + if (!fUpdate) + { + if (rename(mi->repl_transfer_tmpfile,g_pserver->rdb_filename) == -1) { + serverLog(LL_WARNING,"Failed trying to rename the temp DB into %s in MASTER <-> REPLICA synchronization: %s", + g_pserver->rdb_filename, strerror(errno)); + cancelReplicationHandshake(mi); + return; + } + rdb_filename = g_pserver->rdb_filename; } + serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: %s", fUpdate ? "Keeping old data" : "Flushing old data"); /* We need to stop any AOFRW fork before flushing and parsing * RDB, otherwise we'll create a copy-on-write disaster. */ @@ -1517,7 +1552,7 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { aeDeleteFileEvent(el,mi->repl_transfer_s,AE_READABLE); serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Loading DB in memory"); rdbSaveInfo rsi = RDB_SAVE_INFO_INIT; - if (rdbLoad(&rsi) != C_OK) { + if (rdbLoadFile(rdb_filename, &rsi) != C_OK) { serverLog(LL_WARNING,"Failed trying to load the MASTER synchronization DB from disk"); cancelReplicationHandshake(mi); /* Re-enable the AOF if we disabled it earlier, in order to restore @@ -1526,16 +1561,25 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { return; } /* Final setup of the connected slave <- master link */ + if (fUpdate) + unlink(mi->repl_transfer_tmpfile); // if we're not updating, this became the backup RDB zfree(mi->repl_transfer_tmpfile); close(mi->repl_transfer_fd); replicationCreateMasterClient(mi, mi->repl_transfer_s,rsi.repl_stream_db); mi->repl_state = REPL_STATE_CONNECTED; mi->repl_down_since = 0; - /* After a full resynchronization we use the replication ID and - * offset of the master. The secondary ID / offset are cleared since - * we are starting a new history. */ - memcpy(g_pserver->replid,mi->master->replid,sizeof(g_pserver->replid)); - g_pserver->master_repl_offset = mi->master->reploff; + if (fUpdate) + { + mergeReplicationId(mi->master->replid); + } + else + { + /* After a full resynchronization we use the replication ID and + * offset of the master. The secondary ID / offset are cleared since + * we are starting a new history. */ + memcpy(g_pserver->replid,mi->master->replid,sizeof(g_pserver->replid)); + g_pserver->master_repl_offset = mi->master->reploff; + } clearReplicationId2(); /* Let's create the replication backlog if needed.
Slaves need to * accumulate the backlog regardless of the fact they have sub-slaves @@ -2122,8 +2166,10 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) { /* Prepare a suitable temp file for bulk transfer */ while(maxtries--) { + auto dt = std::chrono::system_clock::now().time_since_epoch(); + auto dtMillisecond = std::chrono::duration_cast(dt); snprintf(tmpfile,256, - "temp-%d.%ld.rdb",(int)g_pserver->unixtime,(long int)getpid()); + "temp-%d.%ld.rdb",(int)dtMillisecond.count(),(long int)getpid()); dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644); if (dfd != -1) break; sleep(1); } @@ -2973,10 +3019,21 @@ void replicationCron(void) { if ((replication_cron_loops % g_pserver->repl_ping_slave_period) == 0 && listLength(g_pserver->slaves)) { - ping_argv[0] = createStringObject("PING",4); - replicationFeedSlaves(g_pserver->slaves, g_pserver->slaveseldb, - ping_argv, 1); - decrRefCount(ping_argv[0]); + /* Note that we don't send the PING if the clients are paused during + * a Redis Cluster manual failover: the PING we send will otherwise + * alter the replication offsets of master and slave, and will no longer + * match the one stored into 'mf_master_offset' state. */ + int manual_failover_in_progress = + g_pserver->cluster_enabled && + g_pserver->cluster->mf_end && + clientsArePaused(); + + if (!manual_failover_in_progress) { + ping_argv[0] = createStringObject("PING",4); + replicationFeedSlaves(g_pserver->slaves, g_pserver->slaveseldb, + ping_argv, 1); + decrRefCount(ping_argv[0]); + } } /* Second, send a newline to all the slaves in pre-synchronization @@ -3217,6 +3274,7 @@ void replicaReplayCommand(client *c) // the replay command contains the following arguments: // 1: The UUID of the source // 2: The raw command buffer to be replayed + // 3: (OPTIONAL) the database ID the command should apply to if (!(c->flags & CLIENT_MASTER)) { @@ -3249,6 +3307,17 @@ return; } + if (c->argc >= 4) + { + long long db; + if (getLongLongFromObject(c->argv[3], &db) != C_OK || db >= cserver.dbnum || selectDb(c, (int)db) != C_OK) + { + addReplyError(c, "Invalid database ID"); + s_pstate->Cancel(); + return; + } + } + if (FSameUuidNoNil(uuid, cserver.uuid)) { addReply(c, shared.ok); diff --git a/src/scripting.cpp b/src/scripting.cpp index 1548044e2..5ba336374 100644 --- a/src/scripting.cpp +++ b/src/scripting.cpp @@ -665,7 +665,7 @@ cleanup: * The object must be small, SDS-encoded, and with refcount = 1 * (we must be the only owner) for us to cache it. */ if (j < LUA_CMD_OBJCACHE_SIZE && - o->refcount == 1 && + o->getrefcount(std::memory_order_relaxed) == 1 && (o->encoding == OBJ_ENCODING_RAW || o->encoding == OBJ_ENCODING_EMBSTR) && sdslen((sds)ptrFromObj(o)) <= LUA_CMD_OBJCACHE_MAX_LEN) diff --git a/src/semiorderedset.h b/src/semiorderedset.h new file mode 100644 index 000000000..00a1f1d91 --- /dev/null +++ b/src/semiorderedset.h @@ -0,0 +1,345 @@ +#pragma once +#include +#include "compactvector.h" + +/**************************************** + * semiorderedset.h: + * + * The ordered set is a hash set that maintains semi-ordering; that is, you can iterate in sub-linear time over the set, comparing against a value. + * It has a few other useful properties vs the traditional set: + * 1. The key need not be the underlying type; the only requirement is that the value type is castable to the key + * 2. The key need not have total ordering.
The set will iterate until it finds an exact match with operator== on the value + * This provides additional flexibility on insert allowing us to optimize this case. + * + */ + +extern uint64_t dictGenHashFunction(const void *key, int len); + +template +class semiorderedset +{ + friend struct setiter; + std::vector> m_data; + size_t celem = 0; + static const size_t bits_min = 8; + size_t bits = bits_min; + size_t idxRehash = (1ULL << bits_min); + bool fPauseRehash = false; + + constexpr size_t targetElementsPerBucket() + { + // Aim for roughly 4 cache lines per bucket (determined by empirical testing) + // lower values are faster but use more memory + return std::max((64/sizeof(T))*8, (size_t)2); + } + +public: + semiorderedset() + { + m_data.resize((1ULL << bits)); + } + + struct setiter + { + semiorderedset *set; + size_t idxPrimary = 0; + size_t idxSecondary = 0; + + setiter(semiorderedset *set) + { + this->set = set; + } + + bool operator==(const setiter &other) const + { + return (idxPrimary == other.idxPrimary) && (idxSecondary == other.idxSecondary); + } + + bool operator!=(const setiter &other) const { return !operator==(other); } + + inline T &operator*() { return set->m_data[idxPrimary][idxSecondary]; } + inline const T &operator*() const { return set->m_data[idxPrimary][idxSecondary]; } + + inline T *operator->() { return &set->m_data[idxPrimary][idxSecondary]; } + inline const T *operator->() const { return &set->m_data[idxPrimary][idxSecondary]; } + }; + + setiter find(const T_KEY &key) + { + RehashStep(); + setiter itr(this); + itr.idxPrimary = idxFromObj(key); + + for (int hashset = 0; hashset < 2; ++hashset) // rehashing may only be 1 resize behind, so we check up to two slots + { + auto &vecBucket = m_data[itr.idxPrimary]; + + auto itrFind = std::find(vecBucket.begin(), vecBucket.end(), key); + if (itrFind != vecBucket.end()) + { + itr.idxSecondary = itrFind - vecBucket.begin(); + return itr; + } + + // See if we have to check the older slot + size_t mask = (hashmask() >> 1); + if (itr.idxPrimary == (itr.idxPrimary & mask)) + break; // same bucket we just checked + itr.idxPrimary &= mask; + if (FRehashedRow(itr.idxPrimary)) + break; + } + + return end(); + } + + setiter end() + { + setiter itr(this); + itr.idxPrimary = m_data.size(); + return itr; + } + + void insert(T &e, bool fRehash = false) + { + if (!fRehash) + RehashStep(); + + auto idx = idxFromObj(static_cast(e)); + if (!fRehash) + ++celem; + + typename compactvector::iterator itrInsert; + if (!m_data[idx].empty() && !(e < m_data[idx].back())) + itrInsert = m_data[idx].end(); + else + itrInsert = std::upper_bound(m_data[idx].begin(), m_data[idx].end(), e); + itrInsert = m_data[idx].insert(itrInsert, e); + + if (celem > ((1ULL << bits)*targetElementsPerBucket())) + grow(); + } + + // enumeration starting from the 'itrStart'th key. Note that the iter is a hint, and need not be valid anymore + template + setiter enumerate(const setiter &itrStart, const T_MAX &max, T_VISITOR fn, long long *pccheck) + { + setiter itr(itrStart); + + if (itrStart.set == this) // really if this case isn't true it's probably a bug + itr = itrStart; // but why crash the program when we can easily fix this?
+ fPauseRehash = true; + if (itr.idxPrimary >= m_data.size()) + itr.idxPrimary = 0; + + for (size_t ibucket = 0; ibucket < m_data.size(); ++ibucket) + { + if (!enumerate_bucket(itr, max, fn, pccheck)) + break; + itr.idxSecondary = 0; + + ++itr.idxPrimary; + if (itr.idxPrimary >= m_data.size()) + itr.idxPrimary = 0; + } + fPauseRehash = false; + return itr; + } + + // This will "randomly" visit nodes biased towards lower values first + template + size_t random_visit(T_VISITOR &fn) + { + bool fSawAny = true; + size_t visited = 0; + size_t basePrimary = rand() % m_data.size(); + for (size_t idxSecondary = 0; fSawAny; ++idxSecondary) + { + fSawAny = false; + for (size_t idxPrimaryCount = 0; idxPrimaryCount < m_data.size(); ++idxPrimaryCount) + { + size_t idxPrimary = (basePrimary + idxPrimaryCount) % m_data.size(); + if (idxSecondary < m_data[idxPrimary].size()) + { + ++visited; + fSawAny = true; + if (!fn(m_data[idxPrimary][idxSecondary])) + return visited; + } + } + } + return visited; + } + + const T& random_value() const + { + assert(!empty()); + for (;;) + { + size_t idxPrimary = rand() % m_data.size(); + if (m_data[idxPrimary].empty()) + continue; + + return m_data[idxPrimary][rand() % m_data[idxPrimary].size()]; + } + } + + void erase(const setiter &itr) + { + auto &vecRow = m_data[itr.idxPrimary]; + vecRow.erase(vecRow.begin() + itr.idxSecondary); + --celem; + RehashStep(); + } + + void clear() + { + m_data = decltype(m_data)(); + bits = bits_min; + m_data.resize(1ULL << bits); + idxRehash = m_data.size(); + } + + bool empty() const noexcept { return celem == 0; } + size_t size() const noexcept { return celem; } + + size_t bytes_used() const + { + size_t cb = sizeof(this) + (m_data.capacity()-m_data.size())*sizeof(T); + for (auto &vec : m_data) + { + cb += vec.bytes_used(); + } + return cb; + } + + #define DICT_STATS_VECTLEN 50 + size_t getstats(char *buf, size_t bufsize) const + { + unsigned long i, slots = 0, chainlen, maxchainlen = 0; + unsigned long totchainlen = 0; + unsigned long clvector[DICT_STATS_VECTLEN] = {0}; + size_t l = 0; + + if (empty()) { + return snprintf(buf,bufsize, + "No stats available for empty dictionaries\n"); + } + + /* Compute stats. */ + for (auto &vec : m_data) { + if (vec.empty()) { + clvector[0]++; + continue; + } + slots++; + /* For each hash entry on this slot... */ + chainlen = vec.size(); + + clvector[(chainlen < DICT_STATS_VECTLEN) ? chainlen : (DICT_STATS_VECTLEN-1)]++; + if (chainlen > maxchainlen) maxchainlen = chainlen; + totchainlen += chainlen; + } + + size_t used = m_data.size()-clvector[0]; + /* Generate human readable stats. */ + l += snprintf(buf+l,bufsize-l, + "semiordered set stats:\n" + " table size: %ld\n" + " number of slots: %ld\n" + " used slots: %ld\n" + " max chain length: %ld\n" + " avg chain length (counted): %.02f\n" + " avg chain length (computed): %.02f\n" + " Chain length distribution:\n", + size(), used, slots, maxchainlen, + (float)totchainlen/slots, (float)size()/m_data.size()); + + for (i = 0; i < DICT_STATS_VECTLEN; i++) { + if (clvector[i] == 0) continue; + if (l >= bufsize) break; + l += snprintf(buf+l,bufsize-l, + " %s%ld: %ld (%.02f%%)\n", + (i == DICT_STATS_VECTLEN-1)?">= ":"", + i, clvector[i], ((float)clvector[i]/m_data.size())*100); + } + + /* Unlike snprintf(), return the number of characters actually written.
*/ + if (bufsize) buf[bufsize-1] = '\0'; + return strlen(buf); + } + +private: + inline size_t hashmask() const { return (1ULL << bits) - 1; } + + size_t idxFromObj(const T_KEY &key) + { + size_t v = (size_t)dictGenHashFunction(&key, sizeof(key)); + return v & hashmask(); + } + + bool FRehashedRow(size_t idx) const + { + return (idx >= (m_data.size()/2)) || (idx < idxRehash); + } + + void RehashStep() + { + if (fPauseRehash) + return; + + int steps = 0; + for (; idxRehash < (m_data.size()/2); ++idxRehash) + { + compactvector vecT; + std::swap(m_data[idxRehash], vecT); + + for (auto &v : vecT) + insert(v, true); + + if (++steps > 1024) + break; + } + } + + void grow() + { + assert(idxRehash >= (m_data.size()/2)); // we should have finished rehashing by the time we need to grow again + + ++bits; + m_data.resize(1ULL << bits); + idxRehash = 0; + RehashStep(); + } + + template + inline bool enumerate_bucket(setiter &itr, const T_MAX &max, T_VISITOR &fn, long long *pcheckLimit) + { + auto &vec = m_data[itr.idxPrimary]; + for (; itr.idxSecondary < vec.size(); ++itr.idxSecondary) + { + // Assert we're ordered by T_MAX + assert((itr.idxSecondary+1) >= vec.size() + || static_cast(vec[itr.idxSecondary]) <= static_cast(vec[itr.idxSecondary+1])); + + (*pcheckLimit)--; + if (max < static_cast(*itr)) + return *pcheckLimit > 0; + + size_t sizeBefore = vec.size(); + if (!fn(*itr)) + { + itr.idxSecondary++; // we still visited this node + return false; + } + if (vec.size() != sizeBefore) + { + assert(vec.size() == (sizeBefore-1)); // they may only remove the element passed to them + --itr.idxSecondary; // they deleted the element + } + } + vec.shrink_to_fit(); + return *pcheckLimit > 0; + } +}; diff --git a/src/server.cpp b/src/server.cpp index 1aca94717..2db38dceb 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -59,6 +59,7 @@ #include #include #include +#include #include "aelocker.h" int g_fTestMode = false; @@ -228,6 +229,10 @@ struct redisCommand redisCommandTable[] = { "write @keyspace", 0,NULL,1,-1,1,0,0,0}, + {"expdel",delCommand,-2, + "write @keyspace", + 0,NULL,1,-1,1,0,0,0}, + {"unlink",unlinkCommand,-2, "write fast @keyspace", 0,NULL,1,-1,1,0,0,0}, @@ -618,6 +623,10 @@ struct redisCommand redisCommandTable[] = { "write fast @keyspace", 0,NULL,1,1,1,0,0,0}, + {"expiremember", expireMemberCommand, 4, + "write fast @keyspace", + 0,NULL,1,1,1,0,0,0}, + {"pexpire",pexpireCommand,3, "write fast @keyspace", 0,NULL,1,1,1,0,0,0}, @@ -1428,8 +1437,6 @@ int htNeedsResize(dict *dict) { void tryResizeHashTables(int dbid) { if (htNeedsResize(g_pserver->db[dbid].pdict)) dictResize(g_pserver->db[dbid].pdict); - if (htNeedsResize(g_pserver->db[dbid].expires)) - dictResize(g_pserver->db[dbid].expires); } /* Our hash table implementation performs rehashing incrementally while @@ -1445,11 +1452,6 @@ int incrementallyRehash(int dbid) { dictRehashMilliseconds(g_pserver->db[dbid].pdict,1); return 1; /* already used our millisecond for this loop... */ } - /* Expires */ - if (dictIsRehashing(g_pserver->db[dbid].expires)) { - dictRehashMilliseconds(g_pserver->db[dbid].expires,1); - return 1; /* already used our millisecond for this loop... 
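Stepping out of the hunks briefly: a hypothetical usage sketch of the semiorderedset introduced above. The template parameter list is garbled in this patch text, so the sketch assumes it reads semiorderedset<T, T_KEY>, and it only builds inside the KeyDB tree (compactvector.h plus dictGenHashFunction):

```cpp
// Elements are hashed into buckets by T_KEY and kept sorted within each
// bucket by operator<, so enumerate() can abandon a bucket's tail as soon
// as it passes the bound.
#include "semiorderedset.h"
#include <cstdio>

struct TimedItem {
    long long when;   // ordering value within a bucket
    int id;           // doubles as the hash key (T_KEY)

    bool operator<(const TimedItem &o) const { return when < o.when; }
    bool operator==(int key) const { return id == key; }
    explicit operator int() const { return id; }           // T -> T_KEY
    explicit operator long long() const { return when; }   // T -> T_MAX
};

int main() {
    semiorderedset<TimedItem, int> set;
    for (int i = 0; i < 100; ++i) {
        TimedItem item = { 1000 - i, i };
        set.insert(item);
    }

    // Visit every item with when <= 950 (ids 50..99), bucket by bucket.
    long long checkBudget = 1000;
    set.enumerate(set.end(), (long long)950, [](TimedItem &item) {
        printf("id=%d when=%lld\n", item.id, item.when);
        return true;   // returning false stops the enumeration early
    }, &checkBudget);
    return 0;
}
```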
*/ - } return 0; } @@ -1889,7 +1891,7 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { size = dictSlots(g_pserver->db[j].pdict); used = dictSize(g_pserver->db[j].pdict); - vkeys = dictSize(g_pserver->db[j].expires); + vkeys = g_pserver->db[j].setexpire->size(); if (used || vkeys) { serverLog(LL_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size); /* dictPrintStats(g_pserver->dict); */ @@ -2925,13 +2927,16 @@ void initServer(void) { /* Create the Redis databases, and initialize other internal state. */ for (int j = 0; j < cserver.dbnum; j++) { + new (&g_pserver->db[j]) redisDb; g_pserver->db[j].pdict = dictCreate(&dbDictType,NULL); - g_pserver->db[j].expires = dictCreate(&keyptrDictType,NULL); + g_pserver->db[j].setexpire = new(MALLOC_LOCAL) expireset(); + g_pserver->db[j].expireitr = g_pserver->db[j].setexpire->end(); g_pserver->db[j].blocking_keys = dictCreate(&keylistDictType,NULL); g_pserver->db[j].ready_keys = dictCreate(&objectKeyPointerValueDictType,NULL); g_pserver->db[j].watched_keys = dictCreate(&keylistDictType,NULL); g_pserver->db[j].id = j; g_pserver->db[j].avg_ttl = 0; + g_pserver->db[j].last_expire_set = 0; g_pserver->db[j].defrag_later = listCreate(); } @@ -3376,6 +3381,7 @@ void call(client *c, int flags) { latencyAddSampleIfNeeded(latency_event,duration/1000); slowlogPushEntryIfNeeded(c,c->argv,c->argc,duration); } + if (flags & CMD_CALL_STATS) { /* use the real command that was executed (cmd and lastamc) may be * different, in case of MULTI-EXEC or re-written commands such as @@ -3449,6 +3455,16 @@ void call(client *c, int flags) { ProcessPendingAsyncWrites(); g_pserver->also_propagate = prev_also_propagate; + + /* If the client has keys tracking enabled for client side caching, + * make sure to remember the keys it fetched via this command. */ + if (c->cmd->flags & CMD_READONLY) { + client *caller = (c->flags & CLIENT_LUA && g_pserver->lua_caller) ? 
+ g_pserver->lua_caller : c; + if (caller->flags & CLIENT_TRACKING) + trackingRememberKeys(caller); + } + g_pserver->stat_numcommands++; } @@ -3524,7 +3540,7 @@ int processCommand(client *c, int callFlags) { if (acl_retval == ACL_DENIED_CMD) addReplyErrorFormat(c, "-NOPERM this user has no permissions to run " - "the '%s' command or its subcommnad", c->cmd->name); + "the '%s' command or its subcommand", c->cmd->name); else addReplyErrorFormat(c, "-NOPERM this user has no permissions to access " @@ -3686,6 +3702,7 @@ int processCommand(client *c, int callFlags) { queueMultiCommand(c); addReply(c,shared.queued); } else { + std::unique_lockdb->lock)> ulock(c->db->lock); call(c,callFlags); c->woff = g_pserver->master_repl_offset; if (listLength(g_pserver->ready_keys)) @@ -4087,10 +4104,12 @@ sds genRedisInfoString(const char *section) { "connected_clients:%lu\r\n" "client_recent_max_input_buffer:%zu\r\n" "client_recent_max_output_buffer:%zu\r\n" - "blocked_clients:%d\r\n", + "blocked_clients:%d\r\n" + "current_client_thread:%d\r\n", listLength(g_pserver->clients)-listLength(g_pserver->slaves), maxin, maxout, - g_pserver->blocked_clients); + g_pserver->blocked_clients, + static_cast(serverTL - g_pserver->rgthreadvar)); for (int ithread = 0; ithread < cserver.cthreads; ++ithread) { info = sdscatprintf(info, @@ -4561,11 +4580,18 @@ sds genRedisInfoString(const char *section) { long long keys, vkeys; keys = dictSize(g_pserver->db[j].pdict); - vkeys = dictSize(g_pserver->db[j].expires); + vkeys = g_pserver->db[j].setexpire->size(); + + // Adjust TTL by the current time + g_pserver->db[j].avg_ttl -= (g_pserver->mstime - g_pserver->db[j].last_expire_set); + if (g_pserver->db[j].avg_ttl < 0) + g_pserver->db[j].avg_ttl = 0; + g_pserver->db[j].last_expire_set = g_pserver->mstime; + if (keys || vkeys) { info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld,avg_ttl=%lld\r\n", - j, keys, vkeys, g_pserver->db[j].avg_ttl); + j, keys, vkeys, static_cast(g_pserver->db[j].avg_ttl)); } } } @@ -4830,6 +4856,12 @@ void redisOutOfMemoryHandler(size_t allocation_size) { serverPanic("Redis aborting for OUT OF MEMORY"); } +void fuzzOutOfMemoryHandler(size_t allocation_size) { + serverLog(LL_WARNING,"Out Of Memory allocating %zu bytes!", + allocation_size); + exit(EXIT_FAILURE); // don't crash because it causes false positives +} + void redisSetProcTitle(const char *title) { #ifdef USE_SETPROCTITLE const char *server_mode = ""; @@ -5184,6 +5216,23 @@ int main(int argc, char **argv) { #endif moduleLoadFromQueue(); ACLLoadUsersAtStartup(); + + // special case of FUZZING load from stdin then quit + if (argc > 1 && strstr(argv[1],"rdbfuzz-mode") != NULL) + { + zmalloc_set_oom_handler(fuzzOutOfMemoryHandler); +#ifdef __AFL_HAVE_MANUAL_CONTROL + __AFL_INIT(); +#endif + rio rdb; + rdbSaveInfo rsi = RDB_SAVE_INFO_INIT; + startLoading(stdin); + rioInitWithFile(&rdb,stdin); + rdbLoadRio(&rdb,&rsi,0); + stopLoading(); + return EXIT_SUCCESS; + } + loadDataFromDisk(); if (g_pserver->cluster_enabled) { if (verifyClusterConfigWithData() == C_ERR) { @@ -5220,7 +5269,7 @@ int main(int argc, char **argv) { aeReleaseLock(); //Finally we can dump the lock moduleReleaseGIL(true); - + serverAssert(cserver.cthreads > 0 && cserver.cthreads <= MAX_EVENT_LOOPS); pthread_t rgthread[MAX_EVENT_LOOPS]; for (int iel = 0; iel < cserver.cthreads; ++iel) diff --git a/src/server.h b/src/server.h index 23f0d7aa0..6a2bda9fa 100644 --- a/src/server.h +++ b/src/server.h @@ -53,6 +53,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { 
#include @@ -81,6 +82,7 @@ typedef long long mstime_t; /* millisecond time type. */ N-elements flat arrays */ #include "rax.h" /* Radix tree */ #include "uuid.h" +#include "semiorderedset.h" /* Following includes allow test functions to be called from Redis main() */ #include "zipmap.h" @@ -243,7 +245,7 @@ public: #define CONFIG_DEFAULT_ACTIVE_REPLICA 0 #define CONFIG_DEFAULT_ENABLE_MULTIMASTER 0 -#define ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP 20 /* Lookups per loop. */ +#define ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP 64 /* Lookups per loop. */ #define ACTIVE_EXPIRE_CYCLE_FAST_DURATION 1000 /* Microseconds */ #define ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC 25 /* CPU max % for keys collection */ #define ACTIVE_EXPIRE_CYCLE_SLOW 0 @@ -327,8 +329,8 @@ public: #define AOF_WAIT_REWRITE 2 /* AOF waits rewrite to start appending */ /* Client flags */ -#define CLIENT_SLAVE (1<<0) /* This client is a slave server */ -#define CLIENT_MASTER (1<<1) /* This client is a master server */ +#define CLIENT_SLAVE (1<<0) /* This client is a replica */ +#define CLIENT_MASTER (1<<1) /* This client is a master */ #define CLIENT_MONITOR (1<<2) /* This client is a slave monitor, see MONITOR */ #define CLIENT_MULTI (1<<3) /* This client is in a MULTI context */ #define CLIENT_BLOCKED (1<<4) /* The client is waiting in a blocking operation */ @@ -359,7 +361,17 @@ public: #define CLIENT_LUA_DEBUG_SYNC (1<<26) /* EVAL debugging without fork() */ #define CLIENT_MODULE (1<<27) /* Non connected client used by some module. */ #define CLIENT_PROTECTED (1<<28) /* Client should not be freed for now. */ -#define CLIENT_FORCE_REPLY (1<<29) /* Should addReply be forced to write the text? */ +#define CLIENT_PENDING_READ (1<<29) /* The client has pending reads and was put + in the list of clients we can read + from. */ +#define CLIENT_PENDING_COMMAND (1<<30) /* Used in threaded I/O to signal after + we return single threaded that the + client has already pending commands + to be executed. */ +#define CLIENT_TRACKING (1<<31) /* Client enabled keys tracking in order to + perform client side caching. */ +#define CLIENT_TRACKING_BROKEN_REDIR (1ULL<<32) /* Target client is invalid. */ +#define CLIENT_FORCE_REPLY (1ULL<<33) /* Should addReply be forced to write the text? */ /* Client block type (btype field in client structure) * if CLIENT_BLOCKED flag is set. */ @@ -707,7 +719,7 @@ typedef struct RedisModuleDigest { #define LRU_CLOCK_MAX ((1<lru */ #define LRU_CLOCK_RESOLUTION 1000 /* LRU clock resolution in ms */ -#define OBJ_SHARED_REFCOUNT INT_MAX +#define OBJ_SHARED_REFCOUNT (0x7FFFFFFF) #define OBJ_MVCC_INVALID (0xFFFFFFFFFFFFFFFFULL) typedef struct redisObject { @@ -716,11 +728,21 @@ typedef struct redisObject { unsigned lru:LRU_BITS; /* LRU time (relative to global lru_clock) or * LFU data (least significant 8 bits frequency * and most significant 16 bits access time.
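The widened flag block above is also why the client's flags field becomes a 64-bit atomic later in this diff: bits 32 and 33 cannot be expressed as plain int literals. A two-assert illustration with hypothetical names:

```cpp
#include <cstdint>
#include <cassert>

// (1 << 32) would overflow a 32-bit int, so flags past bit 31 need 1ULL
// and a 64-bit field to land in.
#define DEMO_TRACKING_BROKEN_REDIR (1ULL << 32)
#define DEMO_FORCE_REPLY (1ULL << 33)

int main() {
    uint64_t flags = 0;
    flags |= DEMO_TRACKING_BROKEN_REDIR;
    assert((flags >> 32) == 1);
    assert((flags & DEMO_FORCE_REPLY) == 0);
    return 0;
}
```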
*/ - mutable std::atomic refcount; +private: + mutable std::atomic refcount {0}; +public: uint64_t mvcc_tstamp; void *m_ptr; -} robj; + inline bool FExpires() const { return refcount.load(std::memory_order_relaxed) >> 31; } + void SetFExpires(bool fExpires); + + void setrefcount(unsigned ref); + unsigned getrefcount(std::memory_order order) const { return (refcount.load(order) & ~(1U << 31)); } + void addref() const { refcount.fetch_add(1, std::memory_order_acq_rel); } + unsigned release() const { return refcount.fetch_sub(1, std::memory_order_acq_rel) & ~(1U << 31); } +} robj; +static_assert(sizeof(redisObject) == 24, "object size is critical, don't increase"); __attribute__((always_inline)) inline const void *ptrFromObj(robj_roptr &o) { @@ -746,12 +768,227 @@ __attribute__((always_inline)) inline char *szFromObj(const robj *o) return (char*)ptrFromObj(o); } +class expireEntryFat +{ + friend class expireEntry; +public: + struct subexpireEntry + { + long long when; + std::unique_ptr spsubkey; + + subexpireEntry(long long when, const char *subkey) + : when(when), spsubkey(subkey, sdsfree) + {} + + bool operator<(long long when) const noexcept { return this->when < when; } + bool operator<(const subexpireEntry &se) { return this->when < se.when; } + }; + +private: + sds m_keyPrimary; + std::vector m_vecexpireEntries; // Note a NULL for the sds portion means the expire is for the primary key + +public: + expireEntryFat(sds keyPrimary) + : m_keyPrimary(keyPrimary) + {} + long long when() const noexcept { return m_vecexpireEntries.front().when; } + const char *key() const noexcept { return m_keyPrimary; } + + bool operator<(long long when) const noexcept { return this->when() < when; } + + void expireSubKey(const char *szSubkey, long long when) + { + auto itrInsert = std::lower_bound(m_vecexpireEntries.begin(), m_vecexpireEntries.end(), when); + const char *subkey = (szSubkey) ? 
sdsdup(szSubkey) : nullptr; + m_vecexpireEntries.emplace(itrInsert, when, subkey); + } + + bool FEmpty() const noexcept { return m_vecexpireEntries.empty(); } + const subexpireEntry &nextExpireEntry() const noexcept { return m_vecexpireEntries.front(); } + void popfrontExpireEntry() { m_vecexpireEntries.erase(m_vecexpireEntries.begin()); } + const subexpireEntry &operator[](size_t idx) { return m_vecexpireEntries[idx]; } + size_t size() const noexcept { return m_vecexpireEntries.size(); } +}; + +class expireEntry { + union + { + sds m_key; + expireEntryFat *m_pfatentry; + } u; + long long m_when; // LLONG_MIN means this is a fat entry and we should use the pointer + +public: + class iter + { + expireEntry *m_pentry = nullptr; + size_t m_idx = 0; + + public: + iter(expireEntry *pentry, size_t idx) + : m_pentry(pentry), m_idx(idx) + {} + + iter &operator++() { ++m_idx; return *this; } + + const char *subkey() const + { + if (m_pentry->FFat()) + return (*m_pentry->pfatentry())[m_idx].spsubkey.get(); + return nullptr; + } + long long when() const + { + if (m_pentry->FFat()) + return (*m_pentry->pfatentry())[m_idx].when; + return m_pentry->when(); + } + + bool operator!=(const iter &other) + { + return m_idx != other.m_idx; + } + + const iter &operator*() const { return *this; } + }; + + expireEntry(sds key, const char *subkey, long long when) + { + if (subkey != nullptr) + { + m_when = LLONG_MIN; + u.m_pfatentry = new (MALLOC_LOCAL) expireEntryFat(key); + u.m_pfatentry->expireSubKey(subkey, when); + } + else + { + u.m_key = key; + m_when = when; + } + } + + expireEntry(expireEntryFat *pfatentry) + { + u.m_pfatentry = pfatentry; + m_when = LLONG_MIN; + } + + expireEntry(expireEntry &&e) + { + u.m_key = e.u.m_key; + m_when = e.m_when; + e.u.m_key = (char*)key(); // we do this so it can still be found in the set + e.m_when = 0; + } + + ~expireEntry() + { + if (FFat()) + delete u.m_pfatentry; + } + + void setKeyUnsafe(sds key) + { + if (FFat()) + u.m_pfatentry->m_keyPrimary = key; + else + u.m_key = key; + } + + inline bool FFat() const noexcept { return m_when == LLONG_MIN; } + expireEntryFat *pfatentry() { assert(FFat()); return u.m_pfatentry; } + + + bool operator==(const char *key) const noexcept + { + return this->key() == key; + } + + bool operator<(const expireEntry &e) const noexcept + { + return when() < e.when(); + } + bool operator<(long long when) const noexcept + { + return this->when() < when; + } + + const char *key() const noexcept + { + if (FFat()) + return u.m_pfatentry->key(); + return u.m_key; + } + long long when() const noexcept + { + if (FFat()) + return u.m_pfatentry->when(); + return m_when; + } + + void update(const char *subkey, long long when) + { + if (!FFat()) + { + if (subkey == nullptr) + { + m_when = when; + return; + } + else + { + // we have to upgrade to a fat entry + long long whenT = m_when; + sds keyPrimary = u.m_key; + m_when = LLONG_MIN; + u.m_pfatentry = new (MALLOC_LOCAL) expireEntryFat(keyPrimary); + u.m_pfatentry->expireSubKey(nullptr, whenT); + // at this point we're fat so fall through + } + } + u.m_pfatentry->expireSubKey(subkey, when); + } + + iter begin() { return iter(this, 0); } + iter end() + { + if (FFat()) + return iter(this, u.m_pfatentry->size()); + return iter(this, 1); + } + + bool FGetPrimaryExpire(long long *pwhen) + { + *pwhen = -1; + for (auto itr : *this) + { + if (itr.subkey() == nullptr) + { + *pwhen = itr.when(); + return true; + } + } + return false; + } + + explicit operator const char*() const noexcept { return key(); } + 
explicit operator long long() const noexcept { return when(); } +}; +typedef semiorderedset expireset; + +/* The string name for an object's type as listed above + * Native types are checked against the OBJ_STRING, OBJ_LIST, OBJ_* defines, + * and Module types have their registered name returned. */ +const char *getObjectTypeName(robj_roptr o); + /* Macro used to initialize a Redis object allocated on the stack. * Note that this macro is taken near the structure definition to make sure * we'll update it when the structure is changed, to avoid bugs like * bug #85 introduced exactly in this way. */ #define initStaticStringObject(_var,_ptr) do { \ - _var.refcount = 1; \ + _var.setrefcount(1); \ _var.type = OBJ_STRING; \ _var.encoding = OBJ_ENCODING_RAW; \ _var.m_ptr = _ptr; \ @@ -777,14 +1014,22 @@ typedef struct clientReplyBlock { * by integers from 0 (the default database) up to the max configured * database. The database number is the 'id' field in the structure. */ typedef struct redisDb { + redisDb() + : expireitr(nullptr) + {}; dict *pdict; /* The keyspace for this DB */ - dict *expires; /* Timeout of keys with a timeout set */ + expireset *setexpire; + expireset::setiter expireitr; + dict *blocking_keys; /* Keys with clients waiting for data (BLPOP)*/ dict *ready_keys; /* Blocked keys that received a PUSH */ dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */ int id; /* Database ID */ - long long avg_ttl; /* Average TTL, just for stats */ + long long last_expire_set; /* when the last expire was set */ + double avg_ttl; /* Average TTL, just for stats */ list *defrag_later; /* List of key names to attempt to defrag one by one, gradually. */ + + fastlock lock; } redisDb; /* Client MULTI/EXEC state */ @@ -924,7 +1169,7 @@ typedef struct client { time_t ctime; /* Client creation time. */ time_t lastinteraction; /* Time of the last interaction, used for timeout */ time_t obuf_soft_limit_reached_time; - std::atomic flags; /* Client flags: CLIENT_* macros. */ + std::atomic flags; /* Client flags: CLIENT_* macros. */ int casyncOpsPending; int fPendingAsyncWrite; /* NOTE: Not a flag because it is written to outside of the client lock (locked by the global lock instead) */ int authenticated; /* Needed when the default user requires auth. */ @@ -961,6 +1206,11 @@ typedef struct client { /* UUIDs are transient and lost when the server is shut down */ unsigned char uuid[UUID_BINARY_LEN]; + /* If this client is in tracking mode and this field is non zero, + * invalidation messages for keys fetched by this client will be sent to + * the specified client ID. */ + uint64_t client_tracking_redirection; + /* Response buffer */ int bufpos; char buf[PROTO_REPLY_CHUNK_BYTES]; @@ -1456,6 +1706,8 @@ struct redisServer { unsigned int blocked_clients; /* # of clients executing a blocking cmd.*/ unsigned int blocked_clients_by_type[BLOCKED_NUM]; list *ready_keys; /* List of readyList structures for BLPOP & co */ + /* Client side caching. */ + unsigned int tracking_clients; /* # of clients with tracking enabled.*/ /* Sort parameters - qsort_r() is only available under BSD so we * have to take this state global, in order to pass it to sortCompare() */ int sort_desc; @@ -1792,6 +2044,7 @@ void addReplyPushLenAsync(client *c, long length); void addReplyLongLongAsync(client *c, long long ll); void ProcessPendingAsyncWrites(void); +client *lookupClientByID(uint64_t id); #ifdef __GNUC__ void addReplyErrorFormat(client *c, const char *fmt, ...)
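The expireEntry/expireEntryFat pair defined above keeps the common case, a single key-level TTL, free of heap allocation and upgrades in place once subkey expires appear. A hypothetical walk-through of that life cycle (compiles only inside the tree; in KeyDB the key sds is owned by the main dict, so the demo frees it by hand):

```cpp
#include "server.h"
#include <cassert>

void expireEntryDemo() {
    sds key = sdsnew("mykey");
    sds field = sdsnew("field1");   // subkeys are duplicated internally

    // Key-level expire only: stays in the compact (non-fat) union form.
    expireEntry e(key, nullptr, 12345);
    assert(!e.FFat() && e.when() == 12345);

    // The first subkey expire upgrades the entry to expireEntryFat.
    e.update(field, 10000);
    assert(e.FFat());
    assert(e.when() == 10000);   // entries are kept sorted, earliest first

    // The primary key's own expire is the element whose subkey() is null.
    long long whenPrimary = -1;
    assert(e.FGetPrimaryExpire(&whenPrimary) && whenPrimary == 12345);

    sdsfree(field);
    sdsfree(key);
}
```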
@@ -1803,6 +2056,12 @@ void addReplyErrorFormat(client *c, const char *fmt, ...); void addReplyStatusFormat(client *c, const char *fmt, ...); #endif +/* Client side caching (tracking mode) */ +void enableTracking(client *c, uint64_t redirect_to); +void disableTracking(client *c); +void trackingRememberKeys(client *c); +void trackingInvalidateKey(robj *keyobj); + /* List data type */ void listTypeTryConversion(robj *subject, robj *value); void listTypePush(robj *subject, robj *value, int where); @@ -1912,6 +2171,7 @@ long long getPsyncInitialOffset(void); int replicationSetupSlaveForFullResync(client *slave, long long offset); void changeReplicationId(void); void clearReplicationId2(void); +void mergeReplicationId(const char *); void chopReplicationBacklog(void); void replicationCacheMasterUsingMyself(struct redisMaster *mi); void feedReplicationBacklog(const void *ptr, size_t len); @@ -2130,6 +2390,7 @@ int pubsubUnsubscribeAllPatterns(client *c, int notify); void freePubsubPattern(const void *p); int listMatchPubsubPattern(void *a, void *b); int pubsubPublishMessage(robj *channel, robj *message); +void addReplyPubsubMessage(client *c, robj *channel, robj *msg); /* Keyspace events notification */ void notifyKeyspaceEvent(int type, const char *event, robj *key, int dbid); @@ -2146,10 +2407,12 @@ int rewriteConfig(char *path); /* db.c -- Keyspace access API */ int removeExpire(redisDb *db, robj *key); +int removeExpireCore(redisDb *db, robj *key, dictEntry *de); void propagateExpire(redisDb *db, robj *key, int lazy); int expireIfNeeded(redisDb *db, robj *key); -long long getExpire(redisDb *db, robj_roptr key); -void setExpire(client *c, redisDb *db, robj *key, long long when); +expireEntry *getExpire(redisDb *db, robj_roptr key); +void setExpire(client *c, redisDb *db, robj *key, robj *subkey, long long when); +void setExpire(client *c, redisDb *db, robj *key, expireEntry &&entry); robj_roptr lookupKeyRead(redisDb *db, robj *key); robj *lookupKeyWrite(redisDb *db, robj *key); robj_roptr lookupKeyReadOrReply(client *c, robj *key, robj *reply); @@ -2344,6 +2607,7 @@ void mgetCommand(client *c); void monitorCommand(client *c); void expireCommand(client *c); void expireatCommand(client *c); +void expireMemberCommand(client *c); void pexpireCommand(client *c); void pexpireatCommand(client *c); void getsetCommand(client *c); diff --git a/src/slowlog.cpp b/src/slowlog.cpp index 4f338b341..08a2e62e9 100644 --- a/src/slowlog.cpp +++ b/src/slowlog.cpp @@ -72,7 +72,7 @@ slowlogEntry *slowlogCreateEntry(client *c, robj **argv, int argc, long long dur (unsigned long) sdslen(szFromObj(argv[j])) - SLOWLOG_ENTRY_MAX_STRING); se->argv[j] = createObject(OBJ_STRING,s); - } else if (argv[j]->refcount == OBJ_SHARED_REFCOUNT) { + } else if (argv[j]->getrefcount(std::memory_order_relaxed) == OBJ_SHARED_REFCOUNT) { se->argv[j] = argv[j]; } else { /* Here we need to duplicate the string objects composing the diff --git a/src/t_string.cpp b/src/t_string.cpp index 4cb30eac6..8b79097c0 100644 --- a/src/t_string.cpp +++ b/src/t_string.cpp @@ -85,7 +85,7 @@ void setGenericCommand(client *c, int flags, robj *key, robj *val, robj *expire, } setKey(c->db,key,val); g_pserver->dirty++; - if (expire) setExpire(c,c->db,key,mstime()+milliseconds); + if (expire) setExpire(c,c->db,key,nullptr,mstime()+milliseconds); notifyKeyspaceEvent(NOTIFY_STRING,"set",key,c->db->id); if (expire) notifyKeyspaceEvent(NOTIFY_GENERIC, "expire",key,c->db->id); @@ -353,7 +353,7 @@ void incrDecrCommand(client *c, long long incr) { } value += incr; -
-    if (o && o->refcount == 1 && o->encoding == OBJ_ENCODING_INT &&
+    if (o && o->getrefcount(std::memory_order_relaxed) == 1 && o->encoding == OBJ_ENCODING_INT &&
         (value < 0 || value >= OBJ_SHARED_INTEGERS) &&
         value >= LONG_MIN && value <= LONG_MAX) {
diff --git a/src/tracking.cpp b/src/tracking.cpp
new file mode 100644
index 000000000..bcc98eac7
--- /dev/null
+++ b/src/tracking.cpp
@@ -0,0 +1,174 @@
+/* tracking.cpp - Client side caching: keys tracking and invalidation
+ *
+ * Copyright (c) 2019, Salvatore Sanfilippo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *   * Neither the name of Redis nor the names of its contributors may be used
+ *     to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "server.h"
+
+/* The tracking table is made up of 2^24 radix trees (each tree, and the
+ * table itself, are allocated lazily, only when needed) tracking the
+ * clients that may have certain keys in their local, client-side, cache.
+ *
+ * Keys are grouped into 2^24 slots, in a way similar to Redis Cluster hash
+ * slots; however, the function used here is crc64, taking the least
+ * significant 24 bits of the output.
+ *
+ * When a client enables tracking with "CLIENT TRACKING on", each key served to
+ * the client is hashed to one of these slots, and Redis will remember which
+ * clients may have keys in that slot. Later, when a key in a given slot is
+ * modified, all the clients that may have local copies of keys in that slot
+ * will receive an invalidation message. There is no distinction by database
+ * number: a single table is used.
+ *
+ * Clients will normally keep frequently requested objects in memory, removing
+ * them when invalidation messages are received. A strategy clients may use is
+ * to just cache objects in a dictionary, associating with each cached object
+ * an incremental epoch, or just a timestamp. When invalidation messages are
+ * received, clients may store, in a different table, the timestamp (or epoch)
+ * of the invalidation of the given slot: later, when accessing objects, stale
+ * objects can be evicted lazily by checking whether the cached object's
+ * timestamp is older than the invalidation timestamp for its slot.
+ *
+ * The output space of the 24-bit hash function is very large (more than 16
+ * million possible slots), so clients that want to use fewer resources may
+ * use only the most significant bits instead of the full 24 bits. */
+#define TRACKING_TABLE_SIZE (1<<24)
+rax **TrackingTable = NULL;
+robj *TrackingChannelName;
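+
+/* A sketch of that client-side bookkeeping (hypothetical client code, shown
+ * for illustration only; the names are not part of this patch):
+ *
+ *   struct CachedObj { std::string value; uint64_t epoch; };
+ *   std::unordered_map<std::string, CachedObj> cache;  // key -> cached value
+ *   std::vector<uint64_t> invalidated_at(1 << 24);     // slot -> epoch
+ *   uint64_t local_epoch = 0;
+ *
+ *   // On an "invalidate <slot>" message:
+ *   invalidated_at[slot] = ++local_epoch;
+ *
+ *   // On lookup, lazily evict anything cached before the invalidation:
+ *   auto it = cache.find(key);
+ *   if (it != cache.end() && it->second.epoch <= invalidated_at[slot_of(key)])
+ *       cache.erase(it);
+ */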
+
+/* Remove the tracking state from the client 'c'. Note that there is not much
+ * for us to do here other than decrementing the counter of clients in
+ * tracking mode: we only store the ID of the client in the tracking table,
+ * so the ID reference is removed in a lazy way. Otherwise, when a client
+ * with many entries in the table is removed, the cleanup would cost a lot
+ * of time. */
+void disableTracking(client *c) {
+    if (c->flags & CLIENT_TRACKING) {
+        g_pserver->tracking_clients--;
+        c->flags &= ~(CLIENT_TRACKING|CLIENT_TRACKING_BROKEN_REDIR);
+    }
+}
+
+/* Enable the tracking state for the client 'c', and as a side effect allocate
+ * the tracking table if needed. If the 'redirect_to' argument is non-zero,
+ * the invalidation messages for this client will be sent to the client ID
+ * specified by the 'redirect_to' argument. Note that if that client
+ * eventually gets freed, we'll send a message to the original client to
+ * inform it of the condition. Multiple clients can redirect the invalidation
+ * messages to the same client ID. */
+void enableTracking(client *c, uint64_t redirect_to) {
+    if (c->flags & CLIENT_TRACKING) return;
+    c->flags |= CLIENT_TRACKING;
+    c->flags &= ~CLIENT_TRACKING_BROKEN_REDIR;
+    c->client_tracking_redirection = redirect_to;
+    g_pserver->tracking_clients++;
+    if (TrackingTable == NULL) {
+        TrackingTable = (rax**)zcalloc(sizeof(rax*) * TRACKING_TABLE_SIZE, MALLOC_LOCAL);
+        TrackingChannelName = createStringObject("__redis__:invalidate",20);
+    }
+}
+
+/* This function is called after the execution of a readonly command when the
+ * client 'c' has key tracking enabled. It populates the tracking
+ * invalidation table according to the keys the user fetched, so that Redis
+ * will know which clients should receive an invalidation message when
+ * certain groups of keys are modified. */
+void trackingRememberKeys(client *c) {
+    int numkeys;
+    int *keys = getKeysFromCommand(c->cmd,c->argv,c->argc,&numkeys);
+    if (keys == NULL) return;
+
+    for(int j = 0; j < numkeys; j++) {
+        int idx = keys[j];
+        sds sdskey = (sds)ptrFromObj(c->argv[idx]);
+        uint64_t hash = crc64(0,
+            (unsigned char*)sdskey,sdslen(sdskey))&(TRACKING_TABLE_SIZE-1);
+        if (TrackingTable[hash] == NULL)
+            TrackingTable[hash] = raxNew();
+        raxTryInsert(TrackingTable[hash],
+            (unsigned char*)&c->id,sizeof(c->id),NULL,NULL);
+    }
+    getKeysFreeResult(keys);
+}
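+
+/* For reference, the slot a key belongs to, here and in
+ * trackingInvalidateKey(), is computed as:
+ *
+ *   uint64_t slot = crc64(0,(unsigned char*)key,keylen) & (TRACKING_TABLE_SIZE-1);
+ *
+ * A client wanting to track fewer than 2^24 slots can (as the design comment
+ * above suggests) keep only the top N bits, e.g. bucket = slot >> (24-N);
+ * this trades memory for coarser, more aggressive invalidation. */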
+
+/* This function is called from signalModifiedKey() or other places in Redis
+ * when a key changes value. In the context of keys tracking, our task here is
+ * to send a notification to every client that may have keys in the modified
+ * key's caching slot. */
+void trackingInvalidateKey(robj *keyobj) {
+    sds sdskey = (sds)ptrFromObj(keyobj);
+    uint64_t hash = crc64(0,
+        (unsigned char*)sdskey,sdslen(sdskey))&(TRACKING_TABLE_SIZE-1);
+    if (TrackingTable == NULL || TrackingTable[hash] == NULL) return;
+
+    raxIterator ri;
+    raxStart(&ri,TrackingTable[hash]);
+    raxSeek(&ri,"^",NULL,0);
+    while(raxNext(&ri)) {
+        uint64_t id;
+        memcpy(&id,ri.key,ri.key_len);
+        client *c = lookupClientByID(id);
+        /* IDs are removed from the table lazily, so the client may no
+         * longer exist: skip stale entries. */
+        if (c == NULL) continue;
+        int using_redirection = 0;
+        if (c->client_tracking_redirection) {
+            client *redir = lookupClientByID(c->client_tracking_redirection);
+            if (!redir) {
+                /* We need to signal to the original connection that we
+                 * are unable to send invalidation messages to the redirected
+                 * connection, because the client no longer exists. */
+                if (c->resp > 2) {
+                    addReplyPushLen(c,3);
+                    addReplyBulkCBuffer(c,"tracking-redir-broken",21);
+                    addReplyLongLong(c,c->client_tracking_redirection);
+                }
+                continue;
+            }
+            c = redir;
+            using_redirection = 1;
+        }
+
+        /* Only send this info to clients using RESP version 3 or greater.
+         * However, if redirection is active and the connection we redirect
+         * to is in Pub/Sub mode, we can support the feature with RESP 2 as
+         * well, by sending Pub/Sub messages to the __redis__:invalidate
+         * channel. */
+        if (c->resp > 2) {
+            addReplyPushLen(c,2);
+            addReplyBulkCBuffer(c,"invalidate",10);
+            addReplyLongLong(c,hash);
+        } else if (using_redirection && c->flags & CLIENT_PUBSUB) {
+            robj *msg = createStringObjectFromLongLong(hash);
+            addReplyPubsubMessage(c,TrackingChannelName,msg);
+            decrRefCount(msg);
+        }
+    }
+    raxStop(&ri);
+
+    /* Free this slot's radix tree: we'll create and populate it again if
+     * more keys are modified in this caching slot. */
+    raxFree(TrackingTable[hash]);
+    TrackingTable[hash] = NULL;
+}
diff --git a/tests/integration/replication-active.tcl b/tests/integration/replication-active.tcl
index 99e0dc006..2ba761766 100644
--- a/tests/integration/replication-active.tcl
+++ b/tests/integration/replication-active.tcl
@@ -93,5 +93,18 @@ start_server {tags {"active-repl"} overrides {active-replica yes}} {
         assert_equal {0} [$master del testkey1]
         assert_equal {0} [$slave del testkey1]
     }
+
+    test {Active replica different databases} {
+        $master select 3
+        $master set testkey abcd
+        $master select 2
+        $master del testkey
+        $slave select 3
+        wait_for_condition 50 1000 {
+            [string match abcd [$slave get testkey]]
+        } else {
+            fail "Replication failed to propagate DB 3"
+        }
+    }
 }
}
diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl
index 6abbddbbe..a06afca3e 100644
--- a/tests/test_helper.tcl
+++ b/tests/test_helper.tcl
@@ -35,6 +35,7 @@ set ::all_tests {
     unit/quit
     unit/aofrw
     unit/acl
+    unit/rreplay
    integration/block-repl
    integration/replication
    integration/replication-2
diff --git a/tests/unit/bitops.tcl b/tests/unit/bitops.tcl
index 926f38295..f8a5cbe18 100644
--- a/tests/unit/bitops.tcl
+++ b/tests/unit/bitops.tcl
@@ -214,6 +214,64 @@ start_server {tags {"bitops"}} {
         r bitop or x a b
     } {32}
 
+    test {BITOP lshift size} {
+        r set a " "
+        r bitop lshift x a 1
+    } {2}
+
+    test {BITOP rshift size} {
+        r set a " "
+        r bitop rshift x a 1
+    } {1}
+
+    test {BITOP rshift 0 byte} {
+        r set a " "
+        r bitop rshift x a 8
+    } {0}
+
+    test {BITOP rshift underflow} {
+        r set a " "
+        r bitop rshift x a 65
+    } {0}
+
+    test {BITOP lshift string} {
+        r set a "abcdefg"
+        r bitop lshift x a 8
+        r get x
+    } "\x00abcdefg"
+
+    test {BITOP lshift char} {
+        r set a "\xAA"
+        r bitop lshift x a 4
+        r get x
+    } "\xA0\x0A"
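+
+    # The expected values in these shift tests are consistent with the input
+    # string being treated as a single little-endian integer (an inference
+    # from the tests themselves, not a documented spec): 0xAA << 4 = 0xAA0 is
+    # stored low byte first as "\xA0\x0A", and 0xFF << 1 = 0x1FE becomes
+    # "\xFE\x01", the carry spilling into a newly appended byte.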
+
+    test {BITOP rshift char} {
+        r set a "\xAA"
+        r bitop rshift x a 3
+        r get x
+    } "\x15"
+
+    test {BITOP lshift carry} {
+        r set a "\xFF"
+        r bitop lshift x a 1
+        r get x
+    } "\xFE\x01"
+
+    test {BITOP rshift carry} {
+        r set a "\x00\xFF"
+        r bitop rshift x a 1
+        r get x
+    } "\x80\x7F"
+
+    test {BITOP rshift reciprocal} {
+        r flushdb
+        r set a "abcdefg"
+        r bitop lshift b a 14
+        r bitop rshift res b 14
+        r get res
+    } "abcdefg\x00"
+
     test {BITPOS bit=0 with empty key returns 0} {
         r del str
         r bitpos str 0
diff --git a/tests/unit/obuf-limits.tcl b/tests/unit/obuf-limits.tcl
index b205eb31b..c45bf8e86 100644
--- a/tests/unit/obuf-limits.tcl
+++ b/tests/unit/obuf-limits.tcl
@@ -15,7 +15,7 @@ start_server {tags {"obuf-limits"}} {
             if {![regexp {omem=([0-9]+)} $c - omem]} break
             if {$omem > 200000} break
         }
-        assert {$omem >= 80000 && $omem < 200000}
+        assert {$omem >= 70000 && $omem < 200000}
         $rd1 close
     }
 
diff --git a/tests/unit/rreplay.tcl b/tests/unit/rreplay.tcl
new file mode 100644
index 000000000..2029f521d
--- /dev/null
+++ b/tests/unit/rreplay.tcl
@@ -0,0 +1,30 @@
+start_server {tags {"rreplay"}} {
+
+    test {RREPLAY use current db} {
+        r debug force-master
+        r select 4
+        r set dbnum invalid
+        r rreplay "f4d5b2b5-4f07-4ee5-a4f2-5dc98507dfce" "*3\r\n\$3\r\nSET\r\n\$5\r\ndbnum\r\n\$4\r\nfour\r\n"
+        r get dbnum
+    } {four}
+    reconnect
+
+    test {RREPLAY db different} {
+        r debug force-master
+        r select 4
+        r set testkey four
+        r rreplay "f4d5b2b5-4f07-4ee5-a4f2-5dc98507dfce" "*3\r\n\$3\r\nSET\r\n\$7\r\ntestkey\r\n\$4\r\nbebe\r\n" 2
+        r select 4
+        assert { [r get testkey] == "four" }
+        r select 2
+        r get testkey
+    } {bebe}
+
+    reconnect
+
+    test {RREPLAY not master} {
+        assert_error "*master*" {r rreplay "f4d5b2b5-4f07-4ee5-a4f2-5dc98507dfce" "*3\r\n\$3\r\nSET\r\n\$7\r\ntestkey\r\n\$4\r\nbebe\r\n" 2}
+    }
+
+    r flushdb
+}
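The quoted RREPLAY payloads are plain RESP. Decoding the first one byte by byte gives the command the replica replays:

    *3\r\n            an array of three bulk strings
    $3\r\nSET\r\n     "SET"
    $5\r\ndbnum\r\n   "dbnum"
    $4\r\nfour\r\n    "four"

That is, SET dbnum four, applied to the connection's current database in the first test and, when the optional trailing argument is given, to the database it names (2 in the second test).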
diff --git a/tests/unit/scan.tcl b/tests/unit/scan.tcl
index c0f4349d2..25549c4ac 100644
--- a/tests/unit/scan.tcl
+++ b/tests/unit/scan.tcl
@@ -53,6 +53,51 @@ start_server {tags {"scan"}} {
         assert_equal 100 [llength $keys]
     }
 
+    test "SCAN TYPE" {
+        r flushdb
+        # populate only creates strings
+        r debug populate 1000
+
+        # Check non-strings are excluded
+        set cur 0
+        set keys {}
+        while 1 {
+            set res [r scan $cur type "list"]
+            set cur [lindex $res 0]
+            set k [lindex $res 1]
+            lappend keys {*}$k
+            if {$cur == 0} break
+        }
+
+        assert_equal 0 [llength $keys] "non-strings excluded"
+
+        # Check strings are included
+        set cur 0
+        set keys {}
+        while 1 {
+            set res [r scan $cur type "string"]
+            set cur [lindex $res 0]
+            set k [lindex $res 1]
+            lappend keys {*}$k
+            if {$cur == 0} break
+        }
+
+        assert_equal 1000 [llength $keys] "strings included"
+
+        # Check all three args work together
+        set cur 0
+        set keys {}
+        while 1 {
+            set res [r scan $cur type "string" match "key:*" count 10]
+            set cur [lindex $res 0]
+            set k [lindex $res 1]
+            lappend keys {*}$k
+            if {$cur == 0} break
+        }
+
+        assert_equal 1000 [llength $keys]
+    }
+
     foreach enc {intset hashtable} {
         test "SSCAN with encoding $enc" {
             # Create the Set
diff --git a/tests/unit/slowlog.tcl b/tests/unit/slowlog.tcl
index dbd7a1547..22f088103 100644
--- a/tests/unit/slowlog.tcl
+++ b/tests/unit/slowlog.tcl
@@ -80,9 +80,11 @@ start_server {tags {"slowlog"} overrides {slowlog-log-slower-than 1000000}} {
     }
 
     test {SLOWLOG - can be disabled} {
+        r config set slowlog-max-len 1
         r config set slowlog-log-slower-than 1
         r slowlog reset
-        assert_equal [r slowlog len] 1
+        r debug sleep 0.2
+        assert_equal [r slowlog len] 1
         r config set slowlog-log-slower-than -1
         r slowlog reset
         r debug sleep 0.2