diff --git a/.gitignore b/.gitignore index 2b2e15eba..f1c0ecf4b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .*.swp +core *.o *.log dump.rdb diff --git a/.vscode/settings.json b/.vscode/settings.json index 6fac65a3d..56bf76d11 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,6 +1,56 @@ { "files.associations": { "zmalloc.h": "c", - "stat.h": "c" + "stat.h": "c", + "array": "cpp", + "atomic": "cpp", + "*.tcc": "cpp", + "cctype": "cpp", + "chrono": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "condition_variable": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "deque": "cpp", + "list": "cpp", + "unordered_map": "cpp", + "vector": "cpp", + "exception": "cpp", + "fstream": "cpp", + "functional": "cpp", + "future": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "memory": "cpp", + "mutex": "cpp", + "new": "cpp", + "numeric": "cpp", + "optional": "cpp", + "ostream": "cpp", + "ratio": "cpp", + "scoped_allocator": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "thread": "cpp", + "cinttypes": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "typeinfo": "cpp", + "utility": "cpp" } -} \ No newline at end of file +} diff --git a/redis.conf b/redis.conf index 2d0de9610..356dbcca8 100644 --- a/redis.conf +++ b/redis.conf @@ -291,6 +291,17 @@ dir ./ # refuse the replica request. # # masterauth +# +# However this is not enough if you are using Redis ACLs (for Redis version +# 6 or greater), and the default user is not capable of running the PSYNC +# command and/or other commands needed for replication. 
In this case it's +# better to configure a special user to use with replication, and specify the +# masteruser configuration as such: +# +# masteruser <username> +# +# When masteruser is specified, the replica will authenticate against its +# master using the new AUTH form: AUTH <username> <password>. # When a replica loses its connection with the master, or when the replication # is still in progress, the replica can act in two different ways: @@ -501,6 +512,94 @@ replica-priority 100 # can be easily a long string from /dev/urandom or whatever, so by using a # long and unguessable password no brute force attack will be possible. +# Redis ACL users are defined in the following format: +# +# user <username> ... acl rules ... +# +# For example: +# +# user worker +@list +@connection ~jobs:* on >ffa9203c493aa99 +# +# The special username "default" is used for new connections. If this user +# has the "nopass" rule, then new connections will be immediately authenticated +# as the "default" user without the need of any password provided via the +# AUTH command. Otherwise if the "default" user is not flagged with "nopass" +# the connections will start in not authenticated state, and will require +# AUTH (or the HELLO command AUTH option) in order to be authenticated and +# start to work. +# +# The ACL rules that describe what a user can do are the following: +# +# on Enable the user: it is possible to authenticate as this user. +# off Disable the user: it's no longer possible to authenticate +# with this user, however the already authenticated connections +# will still work. +# +<command> Allow the execution of that command +# -<command> Disallow the execution of that command +# +@<category> Allow the execution of all the commands in such category; +# valid categories are for example @admin, @set, @sortedset, ... +# and so forth, see the full list in the server.c file where +# the Redis command table is described and defined. 
+# The special category @all means all the commands, both those currently +# present in the server and those that will be loaded in the future +# via modules. +# +<command>|subcommand Allow a specific subcommand of an otherwise +# disabled command. Note that this form is not +# allowed as negative like -DEBUG|SEGFAULT, but +# only additive starting with "+". +# allcommands Alias for +@all. Note that it implies the ability to execute +# all the future commands loaded via the modules system. +# nocommands Alias for -@all. +# ~<pattern> Add a pattern of keys that can be mentioned as part of +# commands. For instance ~* allows all the keys. The pattern +# is a glob-style pattern like the one of KEYS. +# It is possible to specify multiple patterns. +# allkeys Alias for ~* +# resetkeys Flush the list of allowed keys patterns. +# ><password> Add this password to the list of valid passwords for the user. +# For example >mypass will add "mypass" to the list. +# This directive clears the "nopass" flag (see later). +# <<password> Remove this password from the list of valid passwords. +# nopass All the set passwords of the user are removed, and the user +# is flagged as requiring no password: it means that every +# password will work against this user. If this directive is +# used for the default user, every new connection will be +# immediately authenticated with the default user without +# any explicit AUTH command required. Note that the "resetpass" +# directive will clear this condition. +# resetpass Flush the list of allowed passwords. Moreover removes the +# "nopass" status. After "resetpass" the user has no associated +# passwords and there is no way to authenticate without adding +# some password (or setting it as "nopass" later). +# reset Performs the following actions: resetpass, resetkeys, off, +# -@all. The user returns to the same state it has immediately +# after its creation. +# +# ACL rules can be specified in any order: for instance you can start with +# passwords, then flags, or key patterns. 
However note that the additive +# and subtractive rules will CHANGE MEANING depending on the ordering. +# For instance see the following example: +# +# user alice on +@all -DEBUG ~* >somepassword +# +# This will allow "alice" to use all the commands with the exception of the +# DEBUG command, since +@all added all the commands to the set of the commands +# alice can use, and later DEBUG was removed. However if we invert the order +# of two ACL rules the result will be different: +# +# user alice on -DEBUG +@all ~* >somepassword +# +# Now DEBUG was removed when alice had yet no commands in the set of allowed +# commands, later all the commands are added, so the user will be able to +# execute everything. +# +# Basically ACL rules are processed left-to-right. +# +# For more information about ACL configuration please refer to +# the Redis web site at https://redis.io/topics/acl + +# Using an external ACL file +# # Instead of configuring users here in this file, it is possible to use # a stand-alone file just listing users. The two methods cannot be mixed: # if you configure users here and at the same time you activate the exteranl @@ -1399,3 +1498,8 @@ rdb-save-incremental-fsync yes # reduces memory requirements by storing rarely accessed data on disk # instead of RAM. A temporary file will be created in this directory. # scratch-file-path /tmp/ + +# Number of worker threads serving requests. This number should be related to the performance +# of your network hardware, not the number of cores on your machine. We don't recommend going +# above 4 at this time. By default this is set 1. 
+server-threads 2 diff --git a/src/Makefile b/src/Makefile index b2b5f5833..4258f47a5 100644 --- a/src/Makefile +++ b/src/Makefile @@ -21,7 +21,7 @@ NODEPS:=clean distclean # Default settings STD=-std=c99 -pedantic -DREDIS_STATIC='' -CXX_STD=-std=c++14 -pedantic +CXX_STD=-std=c++14 -pedantic -fno-rtti ifneq (,$(findstring clang,$(CC))) ifneq (,$(findstring FreeBSD,$(uname_S))) STD+=-Wno-c11-extensions @@ -39,7 +39,7 @@ MALLOC=libc ifneq ($(uname_M),armv6l) ifneq ($(uname_M),armv7l) ifeq ($(uname_S),Linux) - MALLOC=memkind + MALLOC=jemalloc endif endif endif @@ -134,23 +134,27 @@ FINAL_CXXFLAGS+= -I../deps/hiredis -I../deps/linenoise -I../deps/lua/src ifeq ($(MALLOC),tcmalloc) FINAL_CFLAGS+= -DUSE_TCMALLOC + FINAL_CXXFLAGS+= -DUSE_TCMALLOC FINAL_LIBS+= -ltcmalloc endif ifeq ($(MALLOC),tcmalloc_minimal) FINAL_CFLAGS+= -DUSE_TCMALLOC + FINAL_CXXFLAGS+= -DUSE_TCMALLOC FINAL_LIBS+= -ltcmalloc_minimal endif ifeq ($(MALLOC),jemalloc) DEPENDENCY_TARGETS+= jemalloc FINAL_CFLAGS+= -DUSE_JEMALLOC -I../deps/jemalloc/include + FINAL_CXXFLAGS+= -DUSE_JEMALLOC -I../deps/jemalloc/include FINAL_LIBS := ../deps/jemalloc/lib/libjemalloc.a $(FINAL_LIBS) endif ifeq ($(MALLOC),memkind) DEPENDENCY_TARGETS+= memkind FINAL_CFLAGS+= -DUSE_MEMKIND -I../deps/memkind/src/include + FINAL_CXXFLAGS+= -DUSE_MEMKIND -I../deps/memkind/src/include FINAL_LIBS := ../deps/memkind/src/.libs/libmemkind.a -lnuma $(FINAL_LIBS) endif diff --git a/src/acl.c b/src/acl.c index 42cd0c734..b5b9f46a7 100644 --- a/src/acl.c +++ b/src/acl.c @@ -28,6 +28,7 @@ */ #include "server.h" +#include /* ============================================================================= * Global state for ACLs @@ -90,6 +91,7 @@ struct ACLUserFlag { void ACLResetSubcommandsForCommand(user *u, unsigned long id); void ACLResetSubcommands(user *u); +void ACLAddAllowedSubcommand(user *u, unsigned long id, const char *sub); /* ============================================================================= * Helper functions for the rest of 
the ACL implementation @@ -163,6 +165,11 @@ void ACLListFreeSds(void *item) { sdsfree(item); } +/* Method to duplicate list elements from ACL users password/patterns lists. */ +void *ACLListDupSds(void *item) { + return sdsdup(item); +} + /* Create a new user with the specified name, store it in the list * of users (the Users global radix tree), and returns a reference to * the structure representing the user. @@ -178,13 +185,32 @@ user *ACLCreateUser(const char *name, size_t namelen) { u->patterns = listCreate(); listSetMatchMethod(u->passwords,ACLListMatchSds); listSetFreeMethod(u->passwords,ACLListFreeSds); + listSetDupMethod(u->passwords,ACLListDupSds); listSetMatchMethod(u->patterns,ACLListMatchSds); listSetFreeMethod(u->patterns,ACLListFreeSds); + listSetDupMethod(u->patterns,ACLListDupSds); memset(u->allowed_commands,0,sizeof(u->allowed_commands)); raxInsert(Users,(unsigned char*)name,namelen,u,NULL); return u; } +/* This function should be called when we need an unlinked "fake" user + * we can use in order to validate ACL rules or for other similar reasons. + * The user will not get linked to the Users radix tree. The returned + * user should be released with ACLFreeUser() as usual. */ +user *ACLCreateUnlinkedUser(void) { + char username[64]; + for (int j = 0; ; j++) { + snprintf(username,sizeof(username),"__fakeuser:%d__",j); + user *fakeuser = ACLCreateUser(username,strlen(username)); + if (fakeuser == NULL) continue; + int retval = raxRemove(Users,(unsigned char*) username, + strlen(username),NULL); + serverAssert(retval != 0); + return fakeuser; + } +} + /* Release the memory used by the user structure. Note that this function * will not remove the user from the Users global radix tree. */ void ACLFreeUser(user *u) { @@ -195,6 +221,62 @@ void ACLFreeUser(user *u) { zfree(u); } +/* When a user is deleted we need to cycle the active + * connections in order to kill all the pending ones that + * are authenticated with such user. 
*/ +void ACLFreeUserAndKillClients(user *u) { + listIter li; + listNode *ln; + listRewind(server.clients,&li); + while ((ln = listNext(&li)) != NULL) { + client *c = listNodeValue(ln); + if (c->puser == u) { + /* We'll free the connection asynchronously, so + * in theory to set a different user is not needed. + * However if there are bugs in Redis, sooner or later + * this may result in some security hole: it's much + * more defensive to set the default user and put + * it in non authenticated mode. */ + c->puser = DefaultUser; + c->authenticated = 0; + freeClientAsync(c); + } + } + ACLFreeUser(u); +} + +/* Copy the user ACL rules from the source user 'src' to the destination + * user 'dst' so that at the end of the process they'll have exactly the + * same rules (but the names will continue to be the original ones). */ +void ACLCopyUser(user *dst, user *src) { + listRelease(dst->passwords); + listRelease(dst->patterns); + dst->passwords = listDup(src->passwords); + dst->patterns = listDup(src->patterns); + memcpy(dst->allowed_commands,src->allowed_commands, + sizeof(dst->allowed_commands)); + dst->flags = src->flags; + ACLResetSubcommands(dst); + /* Copy the allowed subcommands array of array of SDS strings. */ + if (src->allowed_subcommands) { + for (int j = 0; j < USER_COMMAND_BITS_COUNT; j++) { + if (src->allowed_subcommands[j]) { + for (int i = 0; src->allowed_subcommands[j][i]; i++) + { + ACLAddAllowedSubcommand(dst, j, + src->allowed_subcommands[j][i]); + } + } + } + } +} + +/* Free all the users registered in the radix tree 'users' and free the + * radix tree itself. 
*/ +void ACLFreeUsersSet(rax *users) { + raxFreeWithCallback(users,(void(*)(void*))ACLFreeUserAndKillClients); +} + /* Given a command ID, this function set by reference 'word' and 'bit' * so that user->allowed_commands[word] will address the right word * where the corresponding bit for the provided ID is stored, and @@ -256,6 +338,7 @@ int ACLSetUserCommandBitsForCategory(user *u, const char *category, int value) { dictEntry *de; while ((de = dictNext(di)) != NULL) { struct redisCommand *cmd = dictGetVal(de); + if (cmd->flags & CMD_MODULE) continue; /* Ignore modules commands. */ if (cmd->flags & cflag) { ACLSetUserCommandBit(u,cmd->id,value); ACLResetSubcommandsForCommand(u,cmd->id); @@ -579,6 +662,7 @@ void ACLAddAllowedSubcommand(user *u, unsigned long id, const char *sub) { * fully added. * EEXIST: You are adding a key pattern after "*" was already added. This is * almost surely an error on the user side. + * ENODEV: The password you are trying to remove from the user does not exist. */ int ACLSetUser(user *u, const char *op, ssize_t oplen) { if (oplen == -1) oplen = strlen(op); @@ -623,8 +707,13 @@ int ACLSetUser(user *u, const char *op, ssize_t oplen) { } else if (op[0] == '<') { sds delpass = sdsnewlen(op+1,oplen-1); listNode *ln = listSearchKey(u->passwords,delpass); - if (ln) listDelNode(u->passwords,ln); sdsfree(delpass); + if (ln) { + listDelNode(u->passwords,ln); + } else { + errno = ENODEV; + return C_ERR; + } } else if (op[0] == '~') { if (u->flags & USER_FLAG_ALLKEYS) { errno = EEXIST; @@ -728,6 +817,9 @@ char *ACLSetUserStringError(void) { "'allkeys' flag) is not valid and does not have any " "effect. Try 'resetkeys' to start with an empty " "list of patterns"; + else if (errno == ENODEV) + errmsg = "The password you are trying to remove from the user does " + "not exist"; return errmsg; } @@ -741,10 +833,9 @@ sds ACLDefaultUserFirstPassword(void) { return listNodeValue(first); } -/* Initialization of the ACL subsystem. 
*/ -void ACLInit(void) { - Users = raxNew(); - UsersToLoad = listCreate(); +/* Initialize the default user, that will always exist for all the process + * lifetime. */ +void ACLInitDefaultUser(void) { DefaultUser = ACLCreateUser("default",7); ACLSetUser(DefaultUser,"+@all",-1); ACLSetUser(DefaultUser,"~*",-1); @@ -752,6 +843,13 @@ void ACLInit(void) { ACLSetUser(DefaultUser,"nopass",-1); } +/* Initialization of the ACL subsystem. */ +void ACLInit(void) { + Users = raxNew(); + UsersToLoad = listCreate(); + ACLInitDefaultUser(); +} + /* Check the username and password pair and return C_OK if they are valid, * otherwise C_ERR is returned and errno is set to: * @@ -944,11 +1042,7 @@ int ACLAppendUserForLoading(sds *argv, int argc, int *argc_err) { /* Try to apply the user rules in a fake user to see if they * are actually valid. */ - char *funame = "__fakeuser__"; - user *fakeuser = ACLCreateUser(funame,strlen(funame)); - serverAssert(fakeuser != NULL); - int retval = raxRemove(Users,(unsigned char*) funame,strlen(funame),NULL); - serverAssert(retval != 0); + user *fakeuser = ACLCreateUnlinkedUser(); for (int j = 2; j < argc; j++) { if (ACLSetUser(fakeuser,argv[j],sdslen(argv[j])) == C_ERR) { @@ -1009,15 +1103,275 @@ int ACLLoadConfiguredUsers(void) { return C_OK; } +/* This function loads the ACL from the specified filename: every line + * is validated and shold be either empty or in the format used to specify + * users in the redis.conf configuration or in the ACL file, that is: + * + * user ... rules ... + * + * Note that this function considers comments starting with '#' as errors + * because the ACL file is meant to be rewritten, and comments would be + * lost after the rewrite. Yet empty lines are allowed to avoid being too + * strict. 
+ * + * One important part of implementing ACL LOAD, that uses this function, is + * to avoid ending with broken rules if the ACL file is invalid for some + * reason, so the function will attempt to validate the rules before loading + * each user. For every line that will be found broken the function will + * collect an error message. + * + * IMPORTANT: If there is at least a single error, nothing will be loaded + * and the rules will remain exactly as they were. + * + * At the end of the process, if no errors were found in the whole file then + * NULL is returned. Otherwise an SDS string describing in a single line + * a description of all the issues found is returned. */ +sds ACLLoadFromFile(const char *filename) { + FILE *fp; + char buf[1024]; + + /* Open the ACL file. */ + if ((fp = fopen(filename,"r")) == NULL) { + sds errors = sdscatprintf(sdsempty(), + "Error loading ACLs, opening file '%s': %s", + filename, strerror(errno)); + return errors; + } + + /* Load the whole file as a single string in memory. */ + sds acls = sdsempty(); + while(fgets(buf,sizeof(buf),fp) != NULL) + acls = sdscat(acls,buf); + fclose(fp); + + /* Split the file into lines and attempt to load each line. */ + int totlines; + sds *lines, errors = sdsempty(); + lines = sdssplitlen(acls,strlen(acls),"\n",1,&totlines); + sdsfree(acls); + + /* We need a fake user to validate the rules before making changes + * to the real user mentioned in the ACL line. */ + user *fakeuser = ACLCreateUnlinkedUser(); + + /* We do all the loading in a fresh insteance of the Users radix tree, + * so if there are errors loading the ACL file we can rollback to the + * old version. */ + rax *old_users = Users; + user *old_default_user = DefaultUser; + Users = raxNew(); + ACLInitDefaultUser(); + + /* Load each line of the file. 
*/ + for (int i = 0; i < totlines; i++) { + sds *argv; + int argc; + int linenum = i+1; + + lines[i] = sdstrim(lines[i]," \t\r\n"); + + /* Skip blank lines */ + if (lines[i][0] == '\0') continue; + + /* Split into arguments */ + argv = sdssplitargs(lines[i],&argc); + if (argv == NULL) { + errors = sdscatprintf(errors, + "%s:%d: unbalanced quotes in acl line. ", + server.acl_filename, linenum); + continue; + } + + /* Skip this line if the resulting command vector is empty. */ + if (argc == 0) { + sdsfreesplitres(argv,argc); + continue; + } + + /* The line should start with the "user" keyword. */ + if (strcmp(argv[0],"user") || argc < 2) { + errors = sdscatprintf(errors, + "%s:%d should start with user keyword followed " + "by the username. ", server.acl_filename, + linenum); + sdsfreesplitres(argv,argc); + continue; + } + + /* Try to process the line using the fake user to validate iif + * the rules are able to apply cleanly. */ + ACLSetUser(fakeuser,"reset",-1); + int j; + for (j = 2; j < argc; j++) { + if (ACLSetUser(fakeuser,argv[j],sdslen(argv[j])) != C_OK) { + char *errmsg = ACLSetUserStringError(); + errors = sdscatprintf(errors, + "%s:%d: %s. ", + server.acl_filename, linenum, errmsg); + continue; + } + } + + /* Apply the rule to the new users set only if so far there + * are no errors, otherwise it's useless since we are going + * to discard the new users set anyway. */ + if (sdslen(errors) != 0) { + sdsfreesplitres(argv,argc); + continue; + } + + /* We can finally lookup the user and apply the rule. If the + * user already exists we always reset it to start. */ + user *u = ACLCreateUser(argv[1],sdslen(argv[1])); + if (!u) { + u = ACLGetUserByName(argv[1],sdslen(argv[1])); + serverAssert(u != NULL); + ACLSetUser(u,"reset",-1); + } + + /* Note that the same rules already applied to the fake user, so + * we just assert that everything goess well: it should. 
*/ + for (j = 2; j < argc; j++) + serverAssert(ACLSetUser(u,argv[j],sdslen(argv[j])) == C_OK); + + sdsfreesplitres(argv,argc); + } + + ACLFreeUser(fakeuser); + sdsfreesplitres(lines,totlines); + DefaultUser = old_default_user; /* This pointer must never change. */ + + /* Check if we found errors and react accordingly. */ + if (sdslen(errors) == 0) { + /* The default user pointer is referenced in different places: instead + * of replacing such occurrences it is much simpler to copy the new + * default user configuration in the old one. */ + user *new = ACLGetUserByName("default",7); + serverAssert(new != NULL); + ACLCopyUser(DefaultUser,new); + ACLFreeUser(new); + raxInsert(Users,(unsigned char*)"default",7,DefaultUser,NULL); + raxRemove(old_users,(unsigned char*)"default",7,NULL); + ACLFreeUsersSet(old_users); + sdsfree(errors); + return NULL; + } else { + ACLFreeUsersSet(Users); + Users = old_users; + errors = sdscat(errors,"WARNING: ACL errors detected, no change to the previously active ACL rules was performed"); + return errors; + } +} + +/* Generate a copy of the ACLs currently in memory in the specified filename. + * Returns C_OK on success or C_ERR if there was an error during the I/O. + * When C_ERR is returned a log is produced with hints about the issue. */ +int ACLSaveToFile(const char *filename) { + sds acl = sdsempty(); + int fd = -1; + sds tmpfilename = NULL; + int retval = C_ERR; + + /* Let's generate an SDS string containing the new version of the + * ACL file. */ + raxIterator ri; + raxStart(&ri,Users); + raxSeek(&ri,"^",NULL,0); + while(raxNext(&ri)) { + user *u = ri.data; + /* Return information in the configuration file format. 
*/ + sds user = sdsnew("user "); + user = sdscatsds(user,u->name); + user = sdscatlen(user," ",1); + sds descr = ACLDescribeUser(u); + user = sdscatsds(user,descr); + sdsfree(descr); + acl = sdscatsds(acl,user); + acl = sdscatlen(acl,"\n",1); + sdsfree(user); + } + raxStop(&ri); + + /* Create a temp file with the new content. */ + tmpfilename = sdsnew(filename); + tmpfilename = sdscatfmt(tmpfilename,".tmp-%i-%I", + (int)getpid(),(long long)mstime()); /* %I consumes a 64 bit integer, %i a plain int */ + if ((fd = open(tmpfilename,O_WRONLY|O_CREAT,0644)) == -1) { + serverLog(LL_WARNING,"Opening temp ACL file for ACL SAVE: %s", + strerror(errno)); + goto cleanup; + } + + /* Write it. */ + if (write(fd,acl,sdslen(acl)) != (ssize_t)sdslen(acl)) { + serverLog(LL_WARNING,"Writing ACL file for ACL SAVE: %s", + strerror(errno)); + goto cleanup; + } + close(fd); fd = -1; + + /* Let's replace the old file with the new one: rename() does it atomically. */ + if (rename(tmpfilename,filename) == -1) { + serverLog(LL_WARNING,"Renaming ACL file for ACL SAVE: %s", + strerror(errno)); + goto cleanup; + } + sdsfree(tmpfilename); tmpfilename = NULL; + retval = C_OK; /* If we reached this point, everything is fine. */ + +cleanup: + if (fd != -1) close(fd); + if (tmpfilename) unlink(tmpfilename); + sdsfree(tmpfilename); + sdsfree(acl); + return retval; +} + +/* This function is called once the server is already running, modules are + * loaded, and we are ready to start, in order to load the ACLs either from + * the pending list of users defined in redis.conf, or from the ACL file. + * The function will just exit with an error if the user is trying to mix + * both the loading methods. */ +void ACLLoadUsersAtStartup(void) { + if (server.acl_filename[0] != '\0' && listLength(UsersToLoad) != 0) { + serverLog(LL_WARNING, + "Configuring Redis with users defined in redis.conf and at " + "the same setting an ACL file path is invalid. 
This setup " + "is very likely to lead to configuration errors and security " + "holes, please define either an ACL file or declare users " + "directly in your redis.conf, but not both."); + exit(1); + } + + if (ACLLoadConfiguredUsers() == C_ERR) { + serverLog(LL_WARNING, + "Critical error while loading ACLs. Exiting."); + exit(1); + } + + if (server.acl_filename[0] != '\0') { + sds errors = ACLLoadFromFile(server.acl_filename); + if (errors) { + serverLog(LL_WARNING, + "Aborting Redis startup because of ACL errors: %s", errors); + sdsfree(errors); + exit(1); + } + } +} + /* ============================================================================= * ACL related commands * ==========================================================================*/ /* ACL -- show and modify the configuration of ACL users. * ACL HELP + * ACL LOAD * ACL LIST - * ACL SETUSER ... user attribs ... - * ACL DELUSER + * ACL USERS + * ACL CAT [] + * ACL SETUSER ... acl rules ... + * ACL DELUSER [...] * ACL GETUSER */ void aclCommand(client *c) { @@ -1045,32 +1399,16 @@ void aclCommand(client *c) { addReplyError(c,"The 'default' user cannot be removed"); return; } + } + + for (int j = 2; j < c->argc; j++) { + sds username = ptrFromObj(c->argv[j]); user *u; if (raxRemove(Users,(unsigned char*)username, sdslen(username), (void**)&u)) { - /* When a user is deleted we need to cycle the active - * connections in order to kill all the pending ones that - * are authenticated with such user. */ - ACLFreeUser(u); - listIter li; - listNode *ln; - listRewind(server.clients,&li); - while ((ln = listNext(&li)) != NULL) { - client *c = listNodeValue(ln); - if (c->puser == u) { - /* We'll free the conenction asynchronously, so - * in theory to set a different user is not needed. - * However if there are bugs in Redis, soon or later - * this may result in some security hole: it's much - * more defensive to set the default user and put - * it in non authenticated mode. 
*/ - c->puser = DefaultUser; - c->authenticated = 0; - freeClientAsync(c); - } - } + ACLFreeUserAndKillClients(u); deleted++; } } @@ -1151,19 +1489,69 @@ void aclCommand(client *c) { } } raxStop(&ri); - } else if (!strcasecmp(sub,"whoami")) { + } else if (!strcasecmp(sub,"whoami") && c->argc == 2) { if (c->puser != NULL) { addReplyBulkCBuffer(c,c->puser->name,sdslen(c->puser->name)); } else { addReplyNull(c); } + } else if (server.acl_filename[0] == '\0' && + (!strcasecmp(sub,"load") || !strcasecmp(sub,"save"))) + { + addReplyError(c,"This Redis instance is not configured to use an ACL file. You may want to specify users via the ACL SETUSER command and then issue a CONFIG REWRITE (assuming you have a Redis configuration file set) in order to store users in the Redis configuration."); + return; + } else if (!strcasecmp(sub,"load") && c->argc == 2) { + sds errors = ACLLoadFromFile(server.acl_filename); + if (errors == NULL) { + addReply(c,shared.ok); + } else { + addReplyError(c,errors); + sdsfree(errors); + } + } else if (!strcasecmp(sub,"save") && c->argc == 2) { + if (ACLSaveToFile(server.acl_filename) == C_OK) { + addReply(c,shared.ok); + } else { + addReplyError(c,"There was an error trying to save the ACLs. 
" + "Please check the server logs for more " + "information"); + } + } else if (!strcasecmp(sub,"cat") && c->argc == 2) { + void *dl = addReplyDeferredLen(c); + int j; + for (j = 0; ACLCommandCategories[j].flag != 0; j++) + addReplyBulkCString(c,ACLCommandCategories[j].name); + setDeferredArrayLen(c,dl,j); + } else if (!strcasecmp(sub,"cat") && c->argc == 3) { + uint64_t cflag = ACLGetCommandCategoryFlagByName(ptrFromObj(c->argv[2])); + if (cflag == 0) { + addReplyErrorFormat(c, "Unknown category '%s'", (char*)ptrFromObj(c->argv[2])); + return; + } + int arraylen = 0; + void *dl = addReplyDeferredLen(c); + dictIterator *di = dictGetIterator(server.orig_commands); + dictEntry *de; + while ((de = dictNext(di)) != NULL) { + struct redisCommand *cmd = dictGetVal(de); + if (cmd->flags & CMD_MODULE) continue; + if (cmd->flags & cflag) { + addReplyBulkCString(c,cmd->name); + arraylen++; + } + } + dictReleaseIterator(di); + setDeferredArrayLen(c,dl,arraylen); } else if (!strcasecmp(sub,"help")) { const char *help[] = { +"LOAD -- Reload users from the ACL file.", "LIST -- Show user details in config file format.", "USERS -- List all the registered usernames.", "SETUSER [attribs ...] -- Create or modify a user.", "GETUSER -- Get the user details.", -"DELUSER -- Delete a user.", +"DELUSER [...] 
-- Delete a list of users.", +"CAT -- List available categories.", +"CAT -- List commands inside category.", "WHOAMI -- Return the current connection username.", NULL }; @@ -1172,3 +1560,15 @@ NULL addReplySubcommandSyntaxError(c); } } + +void addReplyCommandCategories(client *c, struct redisCommand *cmd) { + int flagcount = 0; + void *flaglen = addReplyDeferredLen(c); + for (int j = 0; ACLCommandCategories[j].flag != 0; j++) { + if (cmd->flags & ACLCommandCategories[j].flag) { + addReplyStatusFormat(c, "@%s", ACLCommandCategories[j].name); + flagcount++; + } + } + setDeferredSetLen(c, flaglen, flagcount); +} diff --git a/src/adlist.h b/src/adlist.h index c954fac87..e9de81ceb 100644 --- a/src/adlist.h +++ b/src/adlist.h @@ -31,6 +31,10 @@ #ifndef __ADLIST_H__ #define __ADLIST_H__ +#ifdef __cplusplus +extern "C" { +#endif + /* Node, List, and Iterator are the only data structures used currently. */ typedef struct listNode { @@ -92,4 +96,8 @@ void listJoin(list *l, list *o); #define AL_START_HEAD 0 #define AL_START_TAIL 1 +#ifdef __cplusplus +} +#endif + #endif /* __ADLIST_H__ */ diff --git a/src/ae.cpp b/src/ae.cpp index 17408e316..60e22db83 100644 --- a/src/ae.cpp +++ b/src/ae.cpp @@ -30,7 +30,11 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ +#include +#include +#include #include +#include #include #include #include @@ -41,11 +45,49 @@ #include #include "ae.h" +#include "fastlock.h" extern "C" { #include "zmalloc.h" #include "config.h" } +#ifdef USE_MUTEX +thread_local int cOwnLock = 0; +class mutex_wrapper +{ + std::recursive_mutex m_mutex; +public: + void lock() { + m_mutex.lock(); + cOwnLock++; + } + + void unlock() { + cOwnLock--; + m_mutex.unlock(); + } + + bool try_lock() { + if (m_mutex.try_lock()) { + cOwnLock++; + return true; + } + return false; + } + + bool fOwnLock() { + return cOwnLock > 0; + } +}; +mutex_wrapper g_lock; + +#else +fastlock g_lock; +#endif +thread_local aeEventLoop *g_eventLoopThisThread = NULL; + +#define AE_ASSERT(x) do { if (!(x)) { fprintf(stderr, "AE_ASSER FAILURE\n"); *((volatile int*)0) = 1; } } while(0) /* one statement: cannot capture a following else */ + /* Include the best multiplexing layer supported by this system. * The following should be ordered by performances, descending. */ #ifdef HAVE_EVPORT @@ -62,6 +104,178 @@ extern "C" { #endif #endif +enum class AE_ASYNC_OP +{ + PostFunction, + PostCppFunction, + DeleteFileEvent, + CreateFileEvent, +}; + +struct aeCommandControl +{ + std::condition_variable cv; + std::atomic<int> rval; + std::mutex mutexcv; +}; + +struct aeCommand +{ + AE_ASYNC_OP op; + int fd; + int mask; + union { + aePostFunctionProc *proc; + aeFileProc *fproc; + std::function<void()> *pfn; + }; + void *clientData; + aeCommandControl *pctl; +}; + +void aeProcessCmd(aeEventLoop *eventLoop, int fd, void *, int ) +{ + aeCommand cmd; + for (;;) + { + auto cb = read(fd, &cmd, sizeof(aeCommand)); + if (cb != sizeof(cmd)) + { + AE_ASSERT(errno == EAGAIN); + break; + } + switch (cmd.op) + { + case AE_ASYNC_OP::DeleteFileEvent: + aeDeleteFileEvent(eventLoop, cmd.fd, cmd.mask); + break; + + case AE_ASYNC_OP::CreateFileEvent: + { + if (cmd.pctl != nullptr) + { + cmd.pctl->mutexcv.lock(); + std::atomic_store(&cmd.pctl->rval, aeCreateFileEvent(eventLoop, cmd.fd, cmd.mask, cmd.fproc, cmd.clientData)); + 
cmd.pctl->cv.notify_all(); + cmd.pctl->mutexcv.unlock(); + } + else + { + aeCreateFileEvent(eventLoop, cmd.fd, cmd.mask, cmd.fproc, cmd.clientData); + } + } + break; + + case AE_ASYNC_OP::PostFunction: + { + std::unique_lock<decltype(g_lock)> ulock(g_lock); + ((aePostFunctionProc*)cmd.proc)(cmd.clientData); + break; + } + + case AE_ASYNC_OP::PostCppFunction: + { + if (cmd.pctl != nullptr) + cmd.pctl->mutexcv.lock(); + + std::unique_lock<decltype(g_lock)> ulock(g_lock); + (*cmd.pfn)(); + + if (cmd.pctl != nullptr) + { + cmd.pctl->cv.notify_all(); + cmd.pctl->mutexcv.unlock(); + } + delete cmd.pfn; + } + break; + } + } +} + +int aeCreateRemoteFileEvent(aeEventLoop *eventLoop, int fd, int mask, + aeFileProc *proc, void *clientData, int fSynchronous) +{ + if (eventLoop == g_eventLoopThisThread) + return aeCreateFileEvent(eventLoop, fd, mask, proc, clientData); + + int ret = AE_OK; + + aeCommand cmd; + cmd.op = AE_ASYNC_OP::CreateFileEvent; + cmd.fd = fd; + cmd.mask = mask; + cmd.fproc = proc; + cmd.clientData = clientData; + cmd.pctl = nullptr; + if (fSynchronous) + cmd.pctl = new aeCommandControl(); + + std::unique_lock<std::mutex> ulock; /* stays empty for asynchronous calls, where pctl is NULL */ + if (fSynchronous) + ulock = std::unique_lock<std::mutex>(cmd.pctl->mutexcv); /* own the mutex before posting so the notification cannot be missed */ + auto size = write(eventLoop->fdCmdWrite, &cmd, sizeof(cmd)); + if (size != sizeof(cmd)) + { + AE_ASSERT(errno == EAGAIN); + ret = AE_ERR; + } + + if (fSynchronous && ret == AE_OK) /* nobody will signal us if the command was never queued */ + { + cmd.pctl->cv.wait(ulock); + ret = cmd.pctl->rval; + } + if (ulock.owns_lock()) ulock.unlock(); /* a mutex must be unlocked before it is destroyed */ + delete cmd.pctl; /* NULL, hence a no-op, for asynchronous calls */ + return ret; +} + +int aePostFunction(aeEventLoop *eventLoop, aePostFunctionProc *proc, void *arg) +{ + if (eventLoop == g_eventLoopThisThread) + { + proc(arg); + return AE_OK; + } + aeCommand cmd; + cmd.op = AE_ASYNC_OP::PostFunction; + cmd.proc = proc; + cmd.clientData = arg; + auto size = write(eventLoop->fdCmdWrite, &cmd, sizeof(cmd)); + AE_ASSERT(size == sizeof(cmd)); + return AE_OK; +} + +int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fSynchronous) +{ + if (eventLoop == g_eventLoopThisThread) + { + fn(); + return AE_OK; + } + + 
aeCommand cmd; + cmd.op = AE_ASYNC_OP::PostCppFunction; + cmd.pfn = new std::function(fn); + cmd.pctl = nullptr; + if (fSynchronous) + cmd.pctl = new aeCommandControl(); + std::unique_lock ulock(cmd.pctl->mutexcv, std::defer_lock); + if (fSynchronous) + cmd.pctl->mutexcv.lock(); + auto size = write(eventLoop->fdCmdWrite, &cmd, sizeof(cmd)); + AE_ASSERT(size == sizeof(cmd)); + int ret = AE_OK; + if (fSynchronous) + { + cmd.pctl->cv.wait(ulock); + ret = cmd.pctl->rval; + delete cmd.pctl; + } + return ret; +} + aeEventLoop *aeCreateEventLoop(int setsize) { aeEventLoop *eventLoop; int i; @@ -83,6 +297,18 @@ aeEventLoop *aeCreateEventLoop(int setsize) { * vector with it. */ for (i = 0; i < setsize; i++) eventLoop->events[i].mask = AE_NONE; + + fastlock_init(&eventLoop->flock); + int rgfd[2]; + if (pipe(rgfd) < 0) + goto err; + eventLoop->fdCmdRead = rgfd[0]; + eventLoop->fdCmdWrite = rgfd[1]; + fcntl(eventLoop->fdCmdWrite, F_SETFL, O_NONBLOCK); + fcntl(eventLoop->fdCmdRead, F_SETFL, O_NONBLOCK); + eventLoop->cevents = 0; + aeCreateFileEvent(eventLoop, eventLoop->fdCmdRead, AE_READABLE|AE_READ_THREADSAFE, aeProcessCmd, NULL); + return eventLoop; err: @@ -107,6 +333,7 @@ int aeGetSetSize(aeEventLoop *eventLoop) { * * Otherwise AE_OK is returned and the operation is successful. 
*/
 int aeResizeSetSize(aeEventLoop *eventLoop, int setsize) {
+    AE_ASSERT(g_eventLoopThisThread == NULL || g_eventLoopThisThread == eventLoop);
     int i;
 
     if (setsize == eventLoop->setsize) return AE_OK;
@@ -129,19 +356,25 @@ extern "C" void aeDeleteEventLoop(aeEventLoop *eventLoop) {
     zfree(eventLoop->events);
     zfree(eventLoop->fired);
+    fastlock_free(&eventLoop->flock);   /* members must be torn down before zfree(eventLoop) below */
+    close(eventLoop->fdCmdRead);
+    close(eventLoop->fdCmdWrite);
     zfree(eventLoop);
 }
 
 extern "C" void aeStop(aeEventLoop *eventLoop) {
+    AE_ASSERT(g_eventLoopThisThread == NULL || g_eventLoopThisThread == eventLoop);
     eventLoop->stop = 1;
 }
 
 extern "C" int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
         aeFileProc *proc, void *clientData)
 {
+    AE_ASSERT(g_eventLoopThisThread == NULL || g_eventLoopThisThread == eventLoop);
     if (fd >= eventLoop->setsize) {
         errno = ERANGE;
         return AE_ERR;
     }
+
     aeFileEvent *fe = &eventLoop->events[fd];
 
     if (aeApiAddEvent(eventLoop, fd, mask) == -1)
@@ -155,8 +388,21 @@ extern "C" int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
     return AE_OK;
 }
 
+void aeDeleteFileEventAsync(aeEventLoop *eventLoop, int fd, int mask)
+{
+    if (eventLoop == g_eventLoopThisThread)
+        return aeDeleteFileEvent(eventLoop, fd, mask);
+    aeCommand cmd;
+    cmd.op = AE_ASYNC_OP::DeleteFileEvent;
+    cmd.fd = fd;
+    cmd.mask = mask;
+    auto cb = write(eventLoop->fdCmdWrite, &cmd, sizeof(cmd));
+    AE_ASSERT(cb == sizeof(cmd));
+}
+
 extern "C" void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask)
 {
+    AE_ASSERT(g_eventLoopThisThread == NULL || g_eventLoopThisThread == eventLoop);
     if (fd >= eventLoop->setsize) return;
     aeFileEvent *fe = &eventLoop->events[fd];
     if (fe->mask == AE_NONE) return;
@@ -165,6 +411,9 @@ extern "C" void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask)
      * is removed.
*/ if (mask & AE_WRITABLE) mask |= AE_BARRIER; + if (mask & AE_WRITABLE) mask |= AE_WRITE_THREADSAFE; + if (mask & AE_READABLE) mask |= AE_READ_THREADSAFE; + aeApiDelEvent(eventLoop, fd, mask); fe->mask = fe->mask & (~mask); if (fd == eventLoop->maxfd && fe->mask == AE_NONE) { @@ -211,6 +460,7 @@ extern "C" long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long millise aeTimeProc *proc, void *clientData, aeEventFinalizerProc *finalizerProc) { + AE_ASSERT(g_eventLoopThisThread == NULL || g_eventLoopThisThread == eventLoop); long long id = eventLoop->timeEventNextId++; aeTimeEvent *te; @@ -231,6 +481,7 @@ extern "C" long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long millise extern "C" int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id) { + AE_ASSERT(g_eventLoopThisThread == NULL || g_eventLoopThisThread == eventLoop); aeTimeEvent *te = eventLoop->timeEventHead; while(te) { if (te->id == id) { @@ -255,6 +506,7 @@ extern "C" int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id) */ static aeTimeEvent *aeSearchNearestTimer(aeEventLoop *eventLoop) { + AE_ASSERT(g_eventLoopThisThread == NULL || g_eventLoopThisThread == eventLoop); aeTimeEvent *te = eventLoop->timeEventHead; aeTimeEvent *nearest = NULL; @@ -270,6 +522,7 @@ static aeTimeEvent *aeSearchNearestTimer(aeEventLoop *eventLoop) /* Process time events */ static int processTimeEvents(aeEventLoop *eventLoop) { + std::unique_lock ulock(g_lock); int processed = 0; aeTimeEvent *te; long long maxId; @@ -343,6 +596,62 @@ static int processTimeEvents(aeEventLoop *eventLoop) { return processed; } +extern "C" void ProcessEventCore(aeEventLoop *eventLoop, aeFileEvent *fe, int mask, int fd) +{ +#define LOCK_IF_NECESSARY(fe, tsmask) \ + std::unique_lock ulock(g_lock, std::defer_lock); \ + if (!(fe->mask & tsmask)) \ + ulock.lock() + + int fired = 0; /* Number of events fired for current fd. */ + + /* Normally we execute the readable event first, and the writable + * event laster. 
This is useful as sometimes we may be able + * to serve the reply of a query immediately after processing the + * query. + * + * However if AE_BARRIER is set in the mask, our application is + * asking us to do the reverse: never fire the writable event + * after the readable. In such a case, we invert the calls. + * This is useful when, for instance, we want to do things + * in the beforeSleep() hook, like fsynching a file to disk, + * before replying to a client. */ + int invert = fe->mask & AE_BARRIER; + + /* Note the "fe->mask & mask & ..." code: maybe an already + * processed event removed an element that fired and we still + * didn't processed, so we check if the event is still valid. + * + * Fire the readable event if the call sequence is not + * inverted. */ + if (!invert && fe->mask & mask & AE_READABLE) { + LOCK_IF_NECESSARY(fe, AE_READ_THREADSAFE); + fe->rfileProc(eventLoop,fd,fe->clientData,mask | (fe->mask & AE_READ_THREADSAFE)); + fired++; + } + + /* Fire the writable event. */ + if (fe->mask & mask & AE_WRITABLE) { + if (!fired || fe->wfileProc != fe->rfileProc) { + LOCK_IF_NECESSARY(fe, AE_WRITE_THREADSAFE); + fe->wfileProc(eventLoop,fd,fe->clientData,mask | (fe->mask & AE_WRITE_THREADSAFE)); + fired++; + } + } + + /* If we have to invert the call, fire the readable event now + * after the writable one. */ + if (invert && fe->mask & mask & AE_READABLE) { + if (!fired || fe->wfileProc != fe->rfileProc) { + LOCK_IF_NECESSARY(fe, AE_READ_THREADSAFE); + fe->rfileProc(eventLoop,fd,fe->clientData,mask | (fe->mask & AE_READ_THREADSAFE)); + fired++; + } + } + +#undef LOCK_IF_NECESSARY +} + /* Process every pending time event, then every pending file event * (that may be registered by time event callbacks just processed). * Without special flags the function sleeps until some file event @@ -359,6 +668,7 @@ static int processTimeEvents(aeEventLoop *eventLoop) { * The function returns the number of events processed. 
*/
 int aeProcessEvents(aeEventLoop *eventLoop, int flags)
 {
+    AE_ASSERT(g_eventLoopThisThread == NULL || g_eventLoopThisThread == eventLoop);
     int processed = 0, numevents;
 
     /* Nothing to do? return ASAP */
@@ -413,55 +723,19 @@ int aeProcessEvents(aeEventLoop *eventLoop, int flags)
         numevents = aeApiPoll(eventLoop, tvp);
 
         /* After sleep callback. */
-        if (eventLoop->aftersleep != NULL && flags & AE_CALL_AFTER_SLEEP)
+        if (eventLoop->aftersleep != NULL && flags & AE_CALL_AFTER_SLEEP) {
+            std::unique_lock ulock(g_lock, std::defer_lock);
+            if (!(eventLoop->aftersleepFlags & AE_SLEEP_THREADSAFE)) /* the aftersleep callback's own flags, not beforesleep's */
+                ulock.lock();
             eventLoop->aftersleep(eventLoop);
+        }
 
         for (j = 0; j < numevents; j++) {
             aeFileEvent *fe = &eventLoop->events[eventLoop->fired[j].fd];
             int mask = eventLoop->fired[j].mask;
             int fd = eventLoop->fired[j].fd;
-            int fired = 0; /* Number of events fired for current fd. */
 
-            /* Normally we execute the readable event first, and the writable
-             * event laster. This is useful as sometimes we may be able
-             * to serve the reply of a query immediately after processing the
-             * query.
-             *
-             * However if AE_BARRIER is set in the mask, our application is
-             * asking us to do the reverse: never fire the writable event
-             * after the readable. In such a case, we invert the calls.
-             * This is useful when, for instance, we want to do things
-             * in the beforeSleep() hook, like fsynching a file to disk,
-             * before replying to a client. */
-            int invert = fe->mask & AE_BARRIER;
-
-            /* Note the "fe->mask & mask & ..." code: maybe an already
-             * processed event removed an element that fired and we still
-             * didn't processed, so we check if the event is still valid.
-             *
-             * Fire the readable event if the call sequence is not
-             * inverted. */
-            if (!invert && fe->mask & mask & AE_READABLE) {
-                fe->rfileProc(eventLoop,fd,fe->clientData,mask);
-                fired++;
-            }
-
-            /* Fire the writable event.
*/ - if (fe->mask & mask & AE_WRITABLE) { - if (!fired || fe->wfileProc != fe->rfileProc) { - fe->wfileProc(eventLoop,fd,fe->clientData,mask); - fired++; - } - } - - /* If we have to invert the call, fire the readable event now - * after the writable one. */ - if (invert && fe->mask & mask & AE_READABLE) { - if (!fired || fe->wfileProc != fe->rfileProc) { - fe->rfileProc(eventLoop,fd,fe->clientData,mask); - fired++; - } - } + ProcessEventCore(eventLoop, fe, mask, fd); processed++; } @@ -470,6 +744,7 @@ int aeProcessEvents(aeEventLoop *eventLoop, int flags) if (flags & AE_TIME_EVENTS) processed += processTimeEvents(eventLoop); + eventLoop->cevents += processed; return processed; /* return the number of processed file/time events */ } @@ -497,10 +772,17 @@ int aeWait(int fd, int mask, long long milliseconds) { void aeMain(aeEventLoop *eventLoop) { eventLoop->stop = 0; + g_eventLoopThisThread = eventLoop; while (!eventLoop->stop) { - if (eventLoop->beforesleep != NULL) + if (eventLoop->beforesleep != NULL) { + std::unique_lock ulock(g_lock, std::defer_lock); + if (!(eventLoop->beforesleepFlags & AE_SLEEP_THREADSAFE)) + ulock.lock(); eventLoop->beforesleep(eventLoop); + } + AE_ASSERT(!aeThreadOwnsLock()); // we should have relinquished it after processing aeProcessEvents(eventLoop, AE_ALL_EVENTS|AE_CALL_AFTER_SLEEP); + AE_ASSERT(!aeThreadOwnsLock()); // we should have relinquished it after processing } } @@ -508,10 +790,32 @@ const char *aeGetApiName(void) { return aeApiName(); } -void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep) { +void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep, int flags) { eventLoop->beforesleep = beforesleep; + eventLoop->beforesleepFlags = flags; } -void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *aftersleep) { +void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *aftersleep, int flags) { eventLoop->aftersleep = aftersleep; + 
eventLoop->aftersleepFlags = flags; +} + +void aeAcquireLock() +{ + g_lock.lock(); +} + +int aeTryAcquireLock() +{ + return g_lock.try_lock(); +} + +void aeReleaseLock() +{ + g_lock.unlock(); +} + +int aeThreadOwnsLock() +{ + return g_lock.fOwnLock(); } diff --git a/src/ae.h b/src/ae.h index a6ee1d05b..f08c49dd8 100644 --- a/src/ae.h +++ b/src/ae.h @@ -33,7 +33,11 @@ #ifndef __AE_H__ #define __AE_H__ +#ifdef __cplusplus +#include +#endif #include +#include "fastlock.h" #ifdef __cplusplus extern "C" { @@ -50,7 +54,9 @@ extern "C" { loop iteration. Useful when you want to persist things to disk before sending replies, and want to do that in a group fashion. */ -#define AE_THREADSAFE 8 /* Ok to run concurrently */ +#define AE_READ_THREADSAFE 8 +#define AE_WRITE_THREADSAFE 16 +#define AE_SLEEP_THREADSAFE 32 #define AE_FILE_EVENTS 1 #define AE_TIME_EVENTS 2 @@ -71,6 +77,7 @@ typedef void aeFileProc(struct aeEventLoop *eventLoop, int fd, void *clientData, typedef int aeTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData); typedef void aeEventFinalizerProc(struct aeEventLoop *eventLoop, void *clientData); typedef void aeBeforeSleepProc(struct aeEventLoop *eventLoop); +typedef void aePostFunctionProc(void *pvArgs); /* File event structure */ typedef struct aeFileEvent { @@ -110,16 +117,33 @@ typedef struct aeEventLoop { int stop; void *apidata; /* This is used for polling API specific data */ aeBeforeSleepProc *beforesleep; + int beforesleepFlags; aeBeforeSleepProc *aftersleep; + int aftersleepFlags; + struct fastlock flock; + int fdCmdWrite; + int fdCmdRead; + int cevents; } aeEventLoop; /* Prototypes */ aeEventLoop *aeCreateEventLoop(int setsize); +int aePostFunction(aeEventLoop *eventLoop, aePostFunctionProc *proc, void *arg); +#ifdef __cplusplus +} // EXTERN C +int aePostFunction(aeEventLoop *eventLoop, std::function fn, bool fSynchronous = false); +extern "C" { +#endif void aeDeleteEventLoop(aeEventLoop *eventLoop); void aeStop(aeEventLoop 
*eventLoop); int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask, aeFileProc *proc, void *clientData); + +int aeCreateRemoteFileEvent(aeEventLoop *eventLoop, int fd, int mask, + aeFileProc *proc, void *clientData, int fSynchronous); + void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask); +void aeDeleteFileEventAsync(aeEventLoop *eventLoop, int fd, int mask); int aeGetFileEvents(aeEventLoop *eventLoop, int fd); long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds, aeTimeProc *proc, void *clientData, @@ -129,11 +153,16 @@ int aeProcessEvents(aeEventLoop *eventLoop, int flags); int aeWait(int fd, int mask, long long milliseconds); void aeMain(aeEventLoop *eventLoop); const char *aeGetApiName(void); -void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep); -void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *aftersleep); +void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep, int flags); +void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *aftersleep, int flags); int aeGetSetSize(aeEventLoop *eventLoop); int aeResizeSetSize(aeEventLoop *eventLoop, int setsize); +void aeAcquireLock(); +int aeTryAcquireLock(); +void aeReleaseLock(); +int aeThreadOwnsLock(); + #ifdef __cplusplus } #endif diff --git a/src/ae_epoll.cpp b/src/ae_epoll.cpp index cadcc3f51..05638ebdc 100644 --- a/src/ae_epoll.cpp +++ b/src/ae_epoll.cpp @@ -83,7 +83,11 @@ static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) { if (mask & AE_READABLE) ee.events |= EPOLLIN; if (mask & AE_WRITABLE) ee.events |= EPOLLOUT; ee.data.fd = fd; - if (epoll_ctl(state->epfd,op,fd,&ee) == -1) return -1; + if (epoll_ctl(state->epfd,op,fd,&ee) == -1) + { + perror("epoll_ctl failed"); + return -1; + } return 0; } diff --git a/src/anet.c b/src/anet.c index 2981fca13..91ab94efd 100644 --- a/src/anet.c +++ b/src/anet.c @@ -246,6 +246,16 @@ static int anetSetReuseAddr(char *err, int fd) { 
return ANET_OK; } +static int anetSetReusePort(char *err, int fd) { + int yes = 1; + /* Let us load balance listen()s from multiple threads */ + if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &yes, sizeof(yes)) == -1) { + anetSetError(err, "setsockopt SO_REUSEPORT: %s", strerror(errno)); + return ANET_ERR; + } + return ANET_OK; +} + static int anetCreateSocket(char *err, int domain) { int s; if ((s = socket(domain, SOCK_STREAM, 0)) == -1) { @@ -265,6 +275,7 @@ static int anetCreateSocket(char *err, int domain) { #define ANET_CONNECT_NONE 0 #define ANET_CONNECT_NONBLOCK 1 #define ANET_CONNECT_BE_BINDING 2 /* Best effort binding. */ +#define ANET_CONNECT_REUSEPORT 4 static int anetTcpGenericConnect(char *err, char *addr, int port, char *source_addr, int flags) { @@ -287,7 +298,10 @@ static int anetTcpGenericConnect(char *err, char *addr, int port, * the next entry in servinfo. */ if ((s = socket(p->ai_family,p->ai_socktype,p->ai_protocol)) == -1) continue; - if (anetSetReuseAddr(err,s) == ANET_ERR) goto error; + if (anetSetReuseAddr(err,s) == ANET_ERR) + goto error; + if (flags & ANET_CONNECT_REUSEPORT && anetSetReusePort(err, s) != ANET_OK) + goto error; if (flags & ANET_CONNECT_NONBLOCK && anetNonBlock(err,s) != ANET_OK) goto error; if (source_addr) { @@ -462,7 +476,7 @@ static int anetV6Only(char *err, int s) { return ANET_OK; } -static int _anetTcpServer(char *err, int port, char *bindaddr, int af, int backlog) +static int _anetTcpServer(char *err, int port, char *bindaddr, int af, int backlog, int fReusePort) { int s = -1, rv; char _port[6]; /* strlen("65535") */ @@ -484,6 +498,7 @@ static int _anetTcpServer(char *err, int port, char *bindaddr, int af, int backl if (af == AF_INET6 && anetV6Only(err,s) == ANET_ERR) goto error; if (anetSetReuseAddr(err,s) == ANET_ERR) goto error; + if (fReusePort && anetSetReusePort(err,s) == ANET_ERR) goto error; if (anetListen(err,s,p->ai_addr,p->ai_addrlen,backlog) == ANET_ERR) s = ANET_ERR; goto end; } @@ -500,14 +515,14 @@ end: 
return s; } -int anetTcpServer(char *err, int port, char *bindaddr, int backlog) +int anetTcpServer(char *err, int port, char *bindaddr, int backlog, int fReusePort) { - return _anetTcpServer(err, port, bindaddr, AF_INET, backlog); + return _anetTcpServer(err, port, bindaddr, AF_INET, backlog, fReusePort); } -int anetTcp6Server(char *err, int port, char *bindaddr, int backlog) +int anetTcp6Server(char *err, int port, char *bindaddr, int backlog, int fReusePort) { - return _anetTcpServer(err, port, bindaddr, AF_INET6, backlog); + return _anetTcpServer(err, port, bindaddr, AF_INET6, backlog, fReusePort); } int anetUnixServer(char *err, char *path, mode_t perm, int backlog) diff --git a/src/anet.h b/src/anet.h index 7142f78d2..44c57b4cd 100644 --- a/src/anet.h +++ b/src/anet.h @@ -33,6 +33,10 @@ #include +#ifdef __cplusplus +extern "C" { +#endif + #define ANET_OK 0 #define ANET_ERR -1 #define ANET_ERR_LEN 256 @@ -58,8 +62,8 @@ int anetUnixNonBlockConnect(char *err, char *path); int anetRead(int fd, char *buf, int count); int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len); int anetResolveIP(char *err, char *host, char *ipbuf, size_t ipbuf_len); -int anetTcpServer(char *err, int port, char *bindaddr, int backlog); -int anetTcp6Server(char *err, int port, char *bindaddr, int backlog); +int anetTcpServer(char *err, int port, char *bindaddr, int backlog, int fReusePort); +int anetTcp6Server(char *err, int port, char *bindaddr, int backlog, int fReusePort); int anetUnixServer(char *err, char *path, mode_t perm, int backlog); int anetTcpAccept(char *err, int serversock, char *ip, size_t ip_len, int *port); int anetUnixAccept(char *err, int serversock); @@ -77,4 +81,8 @@ int anetFormatAddr(char *fmt, size_t fmt_len, char *ip, int port); int anetFormatPeer(int fd, char *fmt, size_t fmt_len); int anetFormatSock(int fd, char *fmt, size_t fmt_len); +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/aof.c b/src/aof.c index c71f88aa6..f19affc64 100644 --- 
a/src/aof.c +++ b/src/aof.c @@ -96,6 +96,8 @@ void aofChildWriteDiffData(aeEventLoop *el, int fd, void *privdata, int mask) { listNode *ln; aofrwblock *block; ssize_t nwritten; + serverAssert(aeThreadOwnsLock()); + UNUSED(el); UNUSED(fd); UNUSED(privdata); @@ -105,7 +107,7 @@ void aofChildWriteDiffData(aeEventLoop *el, int fd, void *privdata, int mask) { ln = listFirst(server.aof_rewrite_buf_blocks); block = ln ? ln->value : NULL; if (server.aof_stop_sending_diff || !block) { - aeDeleteFileEvent(server.el,server.aof_pipe_write_data_to_child, + aeDeleteFileEvent(el,server.aof_pipe_write_data_to_child, AE_WRITABLE); return; } @@ -162,8 +164,8 @@ void aofRewriteBufferAppend(unsigned char *s, unsigned long len) { /* Install a file event to send data to the rewrite child if there is * not one already. */ - if (aeGetFileEvents(server.el,server.aof_pipe_write_data_to_child) == 0) { - aeCreateFileEvent(server.el, server.aof_pipe_write_data_to_child, + if (aeGetFileEvents(serverTL->el,server.aof_pipe_write_data_to_child) == 0) { + aeCreateFileEvent(serverTL->el, server.aof_pipe_write_data_to_child, AE_WRITABLE, aofChildWriteDiffData, NULL); } } @@ -631,6 +633,7 @@ struct client *createFakeClient(void) { selectDb(c,0); c->fd = -1; + c->iel = IDX_EVENT_LOOP_MAIN; c->name = NULL; c->querybuf = sdsempty(); c->querybuf_peak = 0; @@ -638,6 +641,7 @@ struct client *createFakeClient(void) { c->argv = NULL; c->bufpos = 0; c->flags = 0; + c->fPendingAsyncWrite = FALSE; c->btype = BLOCKED_NONE; /* We set the fake client as a slave waiting for the synchronization * so that Redis will not try to send replies to this client. 
*/ @@ -651,6 +655,8 @@ struct client *createFakeClient(void) { c->puser = NULL; listSetFreeMethod(c->reply,freeClientReplyValue); listSetDupMethod(c->reply,dupClientReplyValue); + fastlock_init(&c->lock); + fastlock_lock(&c->lock); initClientMultiState(c); return c; } @@ -668,6 +674,8 @@ void freeFakeClient(struct client *c) { listRelease(c->reply); listRelease(c->watched_keys); freeClientMultiState(c); + fastlock_unlock(&c->lock); + fastlock_free(&c->lock); zfree(c); } @@ -682,6 +690,7 @@ int loadAppendOnlyFile(char *filename) { long loops = 0; off_t valid_up_to = 0; /* Offset of latest well-formed command loaded. */ off_t valid_before_multi = 0; /* Offset before MULTI command loaded. */ + serverAssert(serverTL != NULL); // This happens early in boot, ensure serverTL was setup if (fp == NULL) { serverLog(LL_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno)); @@ -738,7 +747,7 @@ int loadAppendOnlyFile(char *filename) { /* Serve the clients from time to time */ if (!(loops++ % 1000)) { loadingProgress(ftello(fp)); - processEventsWhileBlocked(); + processEventsWhileBlocked(serverTL - server.rgthreadvar); } if (fgets(buf,sizeof(buf),fp) == NULL) { @@ -1470,7 +1479,7 @@ void aofChildPipeReadable(aeEventLoop *el, int fd, void *privdata, int mask) { } /* Remove the handler since this can be called only one time during a * rewrite. */ - aeDeleteFileEvent(server.el,server.aof_pipe_read_ack_from_child,AE_READABLE); + aeDeleteFileEventAsync(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,server.aof_pipe_read_ack_from_child,AE_READABLE); } /* Create the pipes used for parent - child process IPC during rewrite. @@ -1488,12 +1497,13 @@ int aofCreatePipes(void) { /* Parent -> children data is non blocking. 
*/ if (anetNonBlock(NULL,fds[0]) != ANET_OK) goto error; if (anetNonBlock(NULL,fds[1]) != ANET_OK) goto error; - if (aeCreateFileEvent(server.el, fds[2], AE_READABLE, aofChildPipeReadable, NULL) == AE_ERR) goto error; + if (aeCreateFileEvent(serverTL->el, fds[2], AE_READABLE, aofChildPipeReadable, NULL) == AE_ERR) goto error; server.aof_pipe_write_data_to_child = fds[1]; server.aof_pipe_read_data_from_parent = fds[0]; server.aof_pipe_write_ack_to_parent = fds[3]; server.aof_pipe_read_ack_from_child = fds[2]; + server.el_alf_pip_read_ack_from_child = serverTL->el; server.aof_pipe_write_ack_to_child = fds[5]; server.aof_pipe_read_ack_from_parent = fds[4]; server.aof_stop_sending_diff = 0; @@ -1507,8 +1517,8 @@ error: } void aofClosePipes(void) { - aeDeleteFileEvent(server.el,server.aof_pipe_read_ack_from_child,AE_READABLE); - aeDeleteFileEvent(server.el,server.aof_pipe_write_data_to_child,AE_WRITABLE); + aeDeleteFileEventAsync(server.el_alf_pip_read_ack_from_child,server.aof_pipe_read_ack_from_child,AE_READABLE); + aeDeleteFileEventAsync(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,server.aof_pipe_write_data_to_child,AE_WRITABLE); close(server.aof_pipe_write_data_to_child); close(server.aof_pipe_read_data_from_parent); close(server.aof_pipe_write_ack_to_parent); diff --git a/src/blocked.c b/src/blocked.c index 2ac57b5db..ad7113d52 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -100,6 +100,7 @@ int getTimeoutFromObjectOrReply(client *c, robj *object, mstime_t *timeout, int * flag is set client query buffer is not longer processed, but accumulated, * and will be processed when the client is unblocked. 
*/ void blockClient(client *c, int btype) { + serverAssert(aeThreadOwnsLock()); c->flags |= CLIENT_BLOCKED; c->btype = btype; server.blocked_clients++; @@ -109,15 +110,22 @@ void blockClient(client *c, int btype) { /* This function is called in the beforeSleep() function of the event loop * in order to process the pending input buffer of clients that were * unblocked after a blocking operation. */ -void processUnblockedClients(void) { +void processUnblockedClients(int iel) { + serverAssert(aeThreadOwnsLock()); + listNode *ln; client *c; + list *unblocked_clients = server.rgthreadvar[iel].unblocked_clients; + serverAssert(iel == (serverTL - server.rgthreadvar)); - while (listLength(server.unblocked_clients)) { - ln = listFirst(server.unblocked_clients); + while (listLength(unblocked_clients)) { + ln = listFirst(unblocked_clients); serverAssert(ln != NULL); c = ln->value; - listDelNode(server.unblocked_clients,ln); + listDelNode(unblocked_clients,ln); + AssertCorrectThread(c); + + fastlock_lock(&c->lock); c->flags &= ~CLIENT_UNBLOCKED; /* Process remaining data in the input buffer, unless the client @@ -129,6 +137,7 @@ void processUnblockedClients(void) { processInputBufferAndReplicate(c); } } + fastlock_unlock(&c->lock); } } @@ -151,15 +160,19 @@ void processUnblockedClients(void) { void queueClientForReprocessing(client *c) { /* The client may already be into the unblocked list because of a previous * blocking operation, don't add back it into the list multiple times. */ + serverAssert(aeThreadOwnsLock()); + fastlock_lock(&c->lock); if (!(c->flags & CLIENT_UNBLOCKED)) { c->flags |= CLIENT_UNBLOCKED; - listAddNodeTail(server.unblocked_clients,c); + listAddNodeTail(server.rgthreadvar[c->iel].unblocked_clients,c); } + fastlock_unlock(&c->lock); } /* Unblock a client calling the right function depending on the kind * of operation the client is blocking for. 
*/ void unblockClient(client *c) { + serverAssert(aeThreadOwnsLock()); if (c->btype == BLOCKED_LIST || c->btype == BLOCKED_ZSET || c->btype == BLOCKED_STREAM) { @@ -205,20 +218,23 @@ void replyToBlockedClientTimedOut(client *c) { * The semantics is to send an -UNBLOCKED error to the client, disconnecting * it at the same time. */ void disconnectAllBlockedClients(void) { + serverAssert(aeThreadOwnsLock()); listNode *ln; listIter li; listRewind(server.clients,&li); while((ln = listNext(&li))) { client *c = listNodeValue(ln); - + + fastlock_lock(&c->lock); if (c->flags & CLIENT_BLOCKED) { - addReplySds(c,sdsnew( + addReplySdsAsync(c,sdsnew( "-UNBLOCKED force unblock from blocking operation, " "instance state changed (master -> replica?)\r\n")); unblockClient(c); c->flags |= CLIENT_CLOSE_AFTER_REPLY; } + fastlock_unlock(&c->lock); } } @@ -244,6 +260,7 @@ void disconnectAllBlockedClients(void) { * be used only for a single type, like virtually any Redis application will * do, the function is already fair. */ void handleClientsBlockedOnKeys(void) { + serverAssert(aeThreadOwnsLock()); while(listLength(server.ready_keys) != 0) { list *l; @@ -297,6 +314,7 @@ void handleClientsBlockedOnKeys(void) { * freed by the next unblockClient() * call. */ if (dstkey) incrRefCount(dstkey); + fastlock_lock(&receiver->lock); unblockClient(receiver); if (serveClientBlockedOnList(receiver, @@ -309,6 +327,7 @@ void handleClientsBlockedOnKeys(void) { } if (dstkey) decrRefCount(dstkey); + fastlock_unlock(&receiver->lock); decrRefCount(value); } else { break; @@ -348,6 +367,7 @@ void handleClientsBlockedOnKeys(void) { continue; } + fastlock_lock(&receiver->lock); int where = (receiver->lastcmd && receiver->lastcmd->proc == bzpopminCommand) ? 
ZSET_MIN : ZSET_MAX; @@ -365,6 +385,7 @@ void handleClientsBlockedOnKeys(void) { incrRefCount(rl->key); propagate(cmd,receiver->db->id, argv,2,PROPAGATE_AOF|PROPAGATE_REPL); + fastlock_unlock(&receiver->lock); decrRefCount(argv[0]); decrRefCount(argv[1]); } @@ -407,10 +428,12 @@ void handleClientsBlockedOnKeys(void) { /* If the group was not found, send an error * to the consumer. */ if (!group) { - addReplyError(receiver, + fastlock_lock(&receiver->lock); + addReplyErrorAsync(receiver, "-NOGROUP the consumer group this client " "was blocked on no longer exists"); unblockClient(receiver); + fastlock_unlock(&receiver->lock); continue; } else { *gt = group->last_id; @@ -432,17 +455,19 @@ void handleClientsBlockedOnKeys(void) { noack = receiver->bpop.xread_group_noack; } + fastlock_lock(&receiver->lock); + /* Emit the two elements sub-array consisting of * the name of the stream and the data we * extracted from it. Wrapped in a single-item * array, since we have just one key. */ if (receiver->resp == 2) { - addReplyArrayLen(receiver,1); - addReplyArrayLen(receiver,2); + addReplyArrayLenAsync(receiver,1); + addReplyArrayLenAsync(receiver,2); } else { - addReplyMapLen(receiver,1); + addReplyMapLenAsync(receiver,1); } - addReplyBulk(receiver,rl->key); + addReplyBulkAsync(receiver,rl->key); streamPropInfo pi = { rl->key, @@ -457,6 +482,7 @@ void handleClientsBlockedOnKeys(void) { * valid, so we must do the setup above before * this call. 
*/ unblockClient(receiver); + fastlock_unlock(&receiver->lock); } } } diff --git a/src/cluster.c b/src/cluster.c index 11eb170fc..946332fa1 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -486,14 +486,14 @@ void clusterInit(void) { } if (listenToPort(server.port+CLUSTER_PORT_INCR, - server.cfd,&server.cfd_count) == C_ERR) + server.cfd,&server.cfd_count, 0 /*fReusePort*/) == C_ERR) { exit(1); } else { int j; for (j = 0; j < server.cfd_count; j++) { - if (aeCreateFileEvent(server.el, server.cfd[j], AE_READABLE, + if (aeCreateFileEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el, server.cfd[j], AE_READABLE, clusterAcceptHandler, NULL) == AE_ERR) serverPanic("Unrecoverable error creating Redis Cluster " "file event."); @@ -601,7 +601,7 @@ clusterLink *createClusterLink(clusterNode *node) { * with this link will have the 'link' field set to NULL. */ void freeClusterLink(clusterLink *link) { if (link->fd != -1) { - aeDeleteFileEvent(server.el, link->fd, AE_READABLE|AE_WRITABLE); + aeDeleteFileEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el, link->fd, AE_READABLE|AE_WRITABLE); } sdsfree(link->sndbuf); sdsfree(link->rcvbuf); @@ -645,7 +645,7 @@ void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { * node identity. */ link = createClusterLink(NULL); link->fd = cfd; - aeCreateFileEvent(server.el,cfd,AE_READABLE,clusterReadHandler,link); + aeCreateFileEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,cfd,AE_READABLE,clusterReadHandler,link); } } @@ -2132,7 +2132,7 @@ void clusterWriteHandler(aeEventLoop *el, int fd, void *privdata, int mask) { } sdsrange(link->sndbuf,nwritten,-1); if (sdslen(link->sndbuf) == 0) - aeDeleteFileEvent(server.el, link->fd, AE_WRITABLE); + aeDeleteFileEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el, link->fd, AE_WRITABLE); } /* Read data. 
Try to read the first field of the header first to check the @@ -2208,7 +2208,7 @@ void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask) { * from event handlers that will do stuff with the same link later. */ void clusterSendMessage(clusterLink *link, unsigned char *msg, size_t msglen) { if (sdslen(link->sndbuf) == 0 && msglen != 0) - aeCreateFileEvent(server.el,link->fd,AE_WRITABLE|AE_BARRIER, + aeCreateFileEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,link->fd,AE_WRITABLE|AE_BARRIER, clusterWriteHandler,link); link->sndbuf = sdscatlen(link->sndbuf, msg, msglen); @@ -3402,7 +3402,7 @@ void clusterCron(void) { link = createClusterLink(node); link->fd = fd; node->link = link; - aeCreateFileEvent(server.el,link->fd,AE_READABLE, + aeCreateFileEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,link->fd,AE_READABLE, clusterReadHandler,link); /* Queue a PING in the new connection ASAP: this is crucial * to avoid false positives in failure detection. @@ -5390,6 +5390,7 @@ socket_err: * the target instance. See the Redis Cluster specification for more * information. */ void askingCommand(client *c) { + serverAssert(aeThreadOwnsLock()); if (server.cluster_enabled == 0) { addReplyError(c,"This instance has cluster support disabled"); return; @@ -5402,6 +5403,7 @@ void askingCommand(client *c) { * In this mode slaves will not redirect clients as long as clients access * with read-only commands to keys that are served by the slave's master. */ void readonlyCommand(client *c) { + serverAssert(aeThreadOwnsLock()); if (server.cluster_enabled == 0) { addReplyError(c,"This instance has cluster support disabled"); return; @@ -5412,6 +5414,7 @@ void readonlyCommand(client *c) { /* The READWRITE command just clears the READONLY command state. 
*/ void readwriteCommand(client *c) { + serverAssert(aeThreadOwnsLock()); c->flags &= ~CLIENT_READONLY; addReply(c,shared.ok); } @@ -5455,6 +5458,11 @@ clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, in multiState *ms, _ms; multiCmd mc; int i, slot = 0, migrating_slot = 0, importing_slot = 0, missing_keys = 0; + serverAssert(aeThreadOwnsLock()); + + /* Allow any key to be set if a module disabled cluster redirections. */ + if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION) + return myself; /* Allow any key to be set if a module disabled cluster redirections. */ if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION) @@ -5663,6 +5671,7 @@ void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_co * longer handles, the client is sent a redirection error, and the function * returns 1. Otherwise 0 is returned and no operation is performed. */ int clusterRedirectBlockedClientIfNeeded(client *c) { + serverAssert(aeThreadOwnsLock()); if (c->flags & CLIENT_BLOCKED && (c->btype == BLOCKED_LIST || c->btype == BLOCKED_ZSET || diff --git a/src/cluster.h b/src/cluster.h index 571b9c543..ea4f51c78 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -1,6 +1,10 @@ #ifndef __CLUSTER_H #define __CLUSTER_H +#ifdef __cplusplus +extern "C" { +#endif + /*----------------------------------------------------------------------------- * Redis cluster data structures, defines, exported API. 
*----------------------------------------------------------------------------*/ @@ -287,4 +291,8 @@ clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, in int clusterRedirectBlockedClientIfNeeded(client *c); void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code); +#ifdef __cplusplus +} +#endif + #endif /* __CLUSTER_H */ diff --git a/src/config.c b/src/config.c index 7e9b19d76..59ae23303 100644 --- a/src/config.c +++ b/src/config.c @@ -395,6 +395,9 @@ void loadServerConfigFromString(char *config) { err = "repl-backlog-ttl can't be negative "; goto loaderr; } + } else if (!strcasecmp(argv[0],"masteruser") && argc == 2) { + zfree(server.masteruser); + server.masteruser = argv[1][0] ? zstrdup(argv[1]) : NULL; } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) { zfree(server.masterauth); server.masterauth = argv[1][0] ? zstrdup(argv[1]) : NULL; @@ -821,7 +824,18 @@ void loadServerConfigFromString(char *config) { if (err) goto loaderr; } } else if (!strcasecmp(argv[0],"scratch-file-path")) { +#ifdef USE_MEMKIND storage_init(argv[1], server.maxmemory); +#else + err = "KeyDB not compliled with scratch-file support."; + goto loaderr; +#endif + } else if (!strcasecmp(argv[0],"server-threads") && argc == 2) { + server.cthreads = atoi(argv[1]); + if (server.cthreads <= 0 || server.cthreads > MAX_EVENT_LOOPS) { + err = "Invalid number of threads specified"; + goto loaderr; + } } else { err = "Bad directive or wrong number of arguments"; goto loaderr; } @@ -948,6 +962,9 @@ void configSetCommand(client *c) { sds aclop = sdscatprintf(sdsempty(),">%s",(char*)ptrFromObj(o)); ACLSetUser(DefaultUser,aclop,sdslen(aclop)); sdsfree(aclop); + } config_set_special_field("masteruser") { + zfree(server.masteruser); + server.masteruser = ((char*)ptrFromObj(o))[0] ? zstrdup(ptrFromObj(o)) : NULL; } config_set_special_field("masterauth") { zfree(server.masterauth); server.masterauth = ((char*)ptrFromObj(o))[0] ? 
zstrdup(ptrFromObj(o)) : NULL; @@ -961,6 +978,7 @@ void configSetCommand(client *c) { /* Try to check if the OS is capable of supporting so many FDs. */ server.maxclients = ll; + serverAssert(FALSE); if (ll > orig_value) { adjustOpenFilesLimit(); if (server.maxclients != ll) { @@ -968,15 +986,18 @@ void configSetCommand(client *c) { server.maxclients = orig_value; return; } - if ((unsigned int) aeGetSetSize(server.el) < + if ((unsigned int) aeGetSetSize(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el) < server.maxclients + CONFIG_FDSET_INCR) { - if (aeResizeSetSize(server.el, - server.maxclients + CONFIG_FDSET_INCR) == AE_ERR) + for (int iel = 0; iel < server.cthreads; ++iel) { - addReplyError(c,"The event loop API used by Redis is not able to handle the specified number of clients"); - server.maxclients = orig_value; - return; + if (aeResizeSetSize(server.rgthreadvar[iel].el, + server.maxclients + CONFIG_FDSET_INCR) == AE_ERR) + { + addReplyError(c,"The event loop API used by Redis is not able to handle the specified number of clients"); + server.maxclients = orig_value; + return; + } } } } @@ -1359,6 +1380,7 @@ void configGetCommand(client *c) { /* String values */ config_get_string_field("dbfilename",server.rdb_filename); + config_get_string_field("masteruser",server.masteruser); config_get_string_field("masterauth",server.masterauth); config_get_string_field("cluster-announce-ip",server.cluster_announce_ip); config_get_string_field("unixsocket",server.unixsocket); @@ -2019,7 +2041,7 @@ void rewriteConfigClientoutputbufferlimitOption(struct rewriteConfigState *state rewriteConfigFormatMemory(soft,sizeof(soft), server.client_obuf_limits[j].soft_limit_bytes); - char *typename = getClientTypeName(j); + const char *typename = getClientTypeName(j); if (!strcmp(typename,"slave")) typename = "replica"; line = sdscatprintf(sdsempty(),"%s %s %s %s %ld", option, typename, hard, soft, @@ -2237,6 +2259,7 @@ int rewriteConfig(char *path) { rewriteConfigDirOption(state); 
rewriteConfigSlaveofOption(state,"replicaof"); rewriteConfigStringOption(state,"replica-announce-ip",server.slave_announce_ip,CONFIG_DEFAULT_SLAVE_ANNOUNCE_IP); + rewriteConfigStringOption(state,"masteruser",server.masteruser,NULL); rewriteConfigStringOption(state,"masterauth",server.masterauth,NULL); rewriteConfigStringOption(state,"cluster-announce-ip",server.cluster_announce_ip,NULL); rewriteConfigYesNoOption(state,"replica-serve-stale-data",server.repl_serve_stale_data,CONFIG_DEFAULT_SLAVE_SERVE_STALE_DATA); diff --git a/src/crc64.h b/src/crc64.h index c9fca519d..e63cbc2e3 100644 --- a/src/crc64.h +++ b/src/crc64.h @@ -3,10 +3,18 @@ #include +#ifdef __cplusplus +extern "C" { +#endif + uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l); #ifdef REDIS_TEST int crc64Test(int argc, char *argv[]); #endif +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/db.c b/src/db.c index c8553c985..67631a597 100644 --- a/src/db.c +++ b/src/db.c @@ -99,6 +99,7 @@ robj *lookupKey(redisDb *db, robj *key, int flags) { * expiring our key via DELs in the replication link. */ robj *lookupKeyReadWithFlags(redisDb *db, robj *key, int flags) { robj *val; + serverAssert(aeThreadOwnsLock()); if (expireIfNeeded(db,key) == 1) { /* Key expired. If we are in the context of a master, expireIfNeeded() @@ -1072,6 +1073,7 @@ int removeExpire(redisDb *db, robj *key) { * after which the key will no longer be considered valid. */ void setExpire(client *c, redisDb *db, robj *key, long long when) { dictEntry *kde, *de; + serverAssert(aeThreadOwnsLock()); /* Reuse the sds from the main dict in the expire dict */ kde = dictFind(db->pdict,ptrFromObj(key)); @@ -1108,6 +1110,7 @@ long long getExpire(redisDb *db, robj *key) { * will be consistent even if we allow write operations against expiring * keys. */ void propagateExpire(redisDb *db, robj *key, int lazy) { + serverAssert(aeThreadOwnsLock()); robj *argv[2]; argv[0] = lazy ? 
shared.unlink : shared.del; diff --git a/src/debug.c b/src/debug.c index 51e5f39f5..d24c9ef9c 100644 --- a/src/debug.c +++ b/src/debug.c @@ -803,7 +803,7 @@ static void *getMcontextEip(ucontext_t *uc) { #endif #elif defined(__linux__) /* Linux */ - #if defined(__i386__) + #if defined(__i386__) || defined(__ILP32__) return (void*) uc->uc_mcontext.gregs[14]; /* Linux 32 */ #elif defined(__X86_64__) || defined(__x86_64__) return (void*) uc->uc_mcontext.gregs[16]; /* Linux 64 */ @@ -915,7 +915,7 @@ void logRegisters(ucontext_t *uc) { /* Linux */ #elif defined(__linux__) /* Linux x86 */ - #if defined(__i386__) + #if defined(__i386__) || defined(__ILP32__) serverLog(LL_WARNING, "\n" "EAX:%08lx EBX:%08lx ECX:%08lx EDX:%08lx\n" diff --git a/src/defrag.c b/src/defrag.c index a6d64d065..af12289f3 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -116,17 +116,15 @@ robj *activeDefragStringOb(robj* ob, long *defragged) { /* try to defrag string object */ if (ob->type == OBJ_STRING) { if(ob->encoding==OBJ_ENCODING_RAW) { - sds newsds = activeDefragSds((sds)ob->ptr); + sds newsds = activeDefragSds((sds)ptrFromObj(ob)); if (newsds) { - ob->ptr = newsds; + ob->m_ptr = newsds; (*defragged)++; } } else if (ob->encoding==OBJ_ENCODING_EMBSTR) { /* The sds is embedded in the object allocation, calculate the * offset and update the pointer in the new allocation. 
*/ - long ofs = (intptr_t)ob->ptr - (intptr_t)ob; if ((ret = activeDefragAlloc(ob))) { - ret->ptr = (void*)((intptr_t)ret + ofs); (*defragged)++; } } else if (ob->encoding!=OBJ_ENCODING_INT) { @@ -441,7 +439,7 @@ void defragLater(redisDb *db, dictEntry *kde) { } long scanLaterList(robj *ob) { - quicklist *ql = ob->ptr; + quicklist *ql = ptrFromObj(ob); if (ob->type != OBJ_LIST || ob->encoding != OBJ_ENCODING_QUICKLIST) return 0; server.stat_active_defrag_scanned+=ql->len; @@ -463,7 +461,7 @@ void scanLaterZsetCallback(void *privdata, const dictEntry *_de) { long scanLaterZset(robj *ob, unsigned long *cursor) { if (ob->type != OBJ_ZSET || ob->encoding != OBJ_ENCODING_SKIPLIST) return 0; - zset *zs = (zset*)ob->ptr; + zset *zs = (zset*)ptrFromObj(ob); dict *d = zs->pdict; scanLaterZsetData data = {zs, 0}; *cursor = dictScan(d, *cursor, scanLaterZsetCallback, defragDictBucketCallback, &data); @@ -483,7 +481,7 @@ long scanLaterSet(robj *ob, unsigned long *cursor) { long defragged = 0; if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HT) return 0; - dict *d = ob->ptr; + dict *d = ptrFromObj(ob); *cursor = dictScan(d, *cursor, scanLaterSetCallback, defragDictBucketCallback, &defragged); return defragged; } @@ -504,7 +502,7 @@ long scanLaterHash(robj *ob, unsigned long *cursor) { long defragged = 0; if (ob->type != OBJ_HASH || ob->encoding != OBJ_ENCODING_HT) return 0; - dict *d = ob->ptr; + dict *d = ptrFromObj(ob); *cursor = dictScan(d, *cursor, scanLaterHashCallback, defragDictBucketCallback, &defragged); return defragged; } @@ -512,10 +510,10 @@ long scanLaterHash(robj *ob, unsigned long *cursor) { long defragQuicklist(redisDb *db, dictEntry *kde) { robj *ob = dictGetVal(kde); long defragged = 0; - quicklist *ql = ob->ptr, *newql; + quicklist *ql = ptrFromObj(ob), *newql; serverAssert(ob->type == OBJ_LIST && ob->encoding == OBJ_ENCODING_QUICKLIST); if ((newql = activeDefragAlloc(ql))) - defragged++, ob->ptr = ql = newql; + defragged++, ob->m_ptr = ql = newql; 
if (ql->len > server.active_defrag_max_scan_fields) defragLater(db, kde); else @@ -526,7 +524,7 @@ long defragQuicklist(redisDb *db, dictEntry *kde) { long defragZsetSkiplist(redisDb *db, dictEntry *kde) { robj *ob = dictGetVal(kde); long defragged = 0; - zset *zs = (zset*)ob->ptr; + zset *zs = (zset*)ptrFromObj(ob); zset *newzs; zskiplist *newzsl; dict *newdict; @@ -534,7 +532,7 @@ long defragZsetSkiplist(redisDb *db, dictEntry *kde) { struct zskiplistNode *newheader; serverAssert(ob->type == OBJ_ZSET && ob->encoding == OBJ_ENCODING_SKIPLIST); if ((newzs = activeDefragAlloc(zs))) - defragged++, ob->ptr = zs = newzs; + defragged++, ob->m_ptr = zs = newzs; if ((newzsl = activeDefragAlloc(zs->zsl))) defragged++, zs->zsl = newzsl; if ((newheader = activeDefragAlloc(zs->zsl->header))) @@ -561,16 +559,16 @@ long defragHash(redisDb *db, dictEntry *kde) { robj *ob = dictGetVal(kde); dict *d, *newd; serverAssert(ob->type == OBJ_HASH && ob->encoding == OBJ_ENCODING_HT); - d = ob->ptr; + d = ptrFromObj(ob); if (dictSize(d) > server.active_defrag_max_scan_fields) defragLater(db, kde); else defragged += activeDefragSdsDict(d, DEFRAG_SDS_DICT_VAL_IS_SDS); /* handle the dict struct */ - if ((newd = activeDefragAlloc(ob->ptr))) - defragged++, ob->ptr = newd; + if ((newd = activeDefragAlloc(ptrFromObj(ob)))) + defragged++, ob->m_ptr = newd; /* defrag the dict tables */ - defragged += dictDefragTables(ob->ptr); + defragged += dictDefragTables(ptrFromObj(ob)); return defragged; } @@ -579,16 +577,16 @@ long defragSet(redisDb *db, dictEntry *kde) { robj *ob = dictGetVal(kde); dict *d, *newd; serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HT); - d = ob->ptr; + d = ptrFromObj(ob); if (dictSize(d) > server.active_defrag_max_scan_fields) defragLater(db, kde); else defragged += activeDefragSdsDict(d, DEFRAG_SDS_DICT_NO_VAL); /* handle the dict struct */ - if ((newd = activeDefragAlloc(ob->ptr))) - defragged++, ob->ptr = newd; + if ((newd = 
activeDefragAlloc(ptrFromObj(ob)))) + defragged++, ob->m_ptr = newd; /* defrag the dict tables */ - defragged += dictDefragTables(ob->ptr); + defragged += dictDefragTables(ptrFromObj(ob)); return defragged; } @@ -613,11 +611,11 @@ int scanLaterStraemListpacks(robj *ob, unsigned long *cursor, long long endtime, return 0; } - stream *s = ob->ptr; - raxStart(&ri,s->rax); + stream *s = ptrFromObj(ob); + raxStart(&ri,s->prax); if (*cursor == 0) { /* if cursor is 0, we start new iteration */ - defragRaxNode(&s->rax->head); + defragRaxNode(&s->prax->head); /* assign the iterator node callback before the seek, so that the * initial nodes that are processed till the first item are covered */ ri.node_cb = defragRaxNode; @@ -738,19 +736,19 @@ long defragStream(redisDb *db, dictEntry *kde) { long defragged = 0; robj *ob = dictGetVal(kde); serverAssert(ob->type == OBJ_STREAM && ob->encoding == OBJ_ENCODING_STREAM); - stream *s = ob->ptr, *news; + stream *s = ptrFromObj(ob), *news; /* handle the main struct */ if ((news = activeDefragAlloc(s))) - defragged++, ob->ptr = s = news; + defragged++, ob->m_ptr = s = news; - if (raxSize(s->rax) > server.active_defrag_max_scan_fields) { - rax *newrax = activeDefragAlloc(s->rax); + if (raxSize(s->prax) > server.active_defrag_max_scan_fields) { + rax *newrax = activeDefragAlloc(s->prax); if (newrax) - defragged++, s->rax = newrax; + defragged++, s->prax = newrax; defragLater(db, kde); } else - defragged += defragRadixTree(&s->rax, 1, NULL, NULL); + defragged += defragRadixTree(&s->prax, 1, NULL, NULL); if (s->cgroups) defragged += defragRadixTree(&s->cgroups, 1, defragStreamConsumerGroup, NULL); @@ -792,8 +790,8 @@ long defragKey(redisDb *db, dictEntry *de) { if (ob->encoding == OBJ_ENCODING_QUICKLIST) { defragged += defragQuicklist(db, de); } else if (ob->encoding == OBJ_ENCODING_ZIPLIST) { - if ((newzl = activeDefragAlloc(ob->ptr))) - defragged++, ob->ptr = newzl; + if ((newzl = activeDefragAlloc(ptrFromObj(ob)))) + defragged++, 
ob->m_ptr = newzl; } else { serverPanic("Unknown list encoding"); } @@ -801,16 +799,16 @@ long defragKey(redisDb *db, dictEntry *de) { if (ob->encoding == OBJ_ENCODING_HT) { defragged += defragSet(db, de); } else if (ob->encoding == OBJ_ENCODING_INTSET) { - intset *newis, *is = ob->ptr; + intset *newis, *is = ptrFromObj(ob); if ((newis = activeDefragAlloc(is))) - defragged++, ob->ptr = newis; + defragged++, ob->m_ptr = newis; } else { serverPanic("Unknown set encoding"); } } else if (ob->type == OBJ_ZSET) { if (ob->encoding == OBJ_ENCODING_ZIPLIST) { - if ((newzl = activeDefragAlloc(ob->ptr))) - defragged++, ob->ptr = newzl; + if ((newzl = activeDefragAlloc(ptrFromObj(ob)))) + defragged++, ob->m_ptr = newzl; } else if (ob->encoding == OBJ_ENCODING_SKIPLIST) { defragged += defragZsetSkiplist(db, de); } else { @@ -818,8 +816,8 @@ long defragKey(redisDb *db, dictEntry *de) { } } else if (ob->type == OBJ_HASH) { if (ob->encoding == OBJ_ENCODING_ZIPLIST) { - if ((newzl = activeDefragAlloc(ob->ptr))) - defragged++, ob->ptr = newzl; + if ((newzl = activeDefragAlloc(ptrFromObj(ob)))) + defragged++, ob->m_ptr = newzl; } else if (ob->encoding == OBJ_ENCODING_HT) { defragged += defragHash(db, de); } else { diff --git a/src/dict.c b/src/dict.c index 3560eb3d3..9b5aba452 100644 --- a/src/dict.c +++ b/src/dict.c @@ -739,6 +739,30 @@ unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count) { return stored; } +/* This is like dictGetRandomKey() from the POV of the API, but will do more + * work to ensure a better distribution of the returned element. + * + * This function improves the distribution because the dictGetRandomKey() + * problem is that it selects a random bucket, then it selects a random + * element from the chain in the bucket. However elements being in different + * chain lengths will have different probabilities of being reported. 
With + * this function instead what we do is to consider a "linear" range of the table + * that may be constituted of N buckets with chains of different lengths + * appearing one after the other. Then we report a random element in the range. + * In this way we smooth away the problem of different chain lenghts. */ +#define GETFAIR_NUM_ENTRIES 15 +dictEntry *dictGetFairRandomKey(dict *d) { + dictEntry *entries[GETFAIR_NUM_ENTRIES]; + unsigned int count = dictGetSomeKeys(d,entries,GETFAIR_NUM_ENTRIES); + /* Note that dictGetSomeKeys() may return zero elements in an unlucky + * run() even if there are actually elements inside the hash table. So + * when we get zero, we call the true dictGetRandomKey() that will always + * yeld the element if the hash table has at least one. */ + if (count == 0) return dictGetRandomKey(d); + unsigned int idx = rand() % count; + return entries[idx]; +} + /* Function to reverse bits. Algorithm from: * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel */ static unsigned long rev(unsigned long v) { diff --git a/src/dict.h b/src/dict.h index 62018cc44..4befb9b66 100644 --- a/src/dict.h +++ b/src/dict.h @@ -35,6 +35,10 @@ #include +#ifdef __cplusplus +extern "C" { +#endif + #ifndef __DICT_H #define __DICT_H @@ -166,6 +170,7 @@ dictIterator *dictGetSafeIterator(dict *d); dictEntry *dictNext(dictIterator *iter); void dictReleaseIterator(dictIterator *iter); dictEntry *dictGetRandomKey(dict *d); +dictEntry *dictGetFairRandomKey(dict *d); unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count); void dictGetStats(char *buf, size_t bufsize, dict *d); uint64_t dictGenHashFunction(const void *key, int len); @@ -186,4 +191,8 @@ extern dictType dictTypeHeapStringCopyKey; extern dictType dictTypeHeapStrings; extern dictType dictTypeHeapStringCopyKeyValue; +#ifdef __cplusplus +} +#endif + #endif /* __DICT_H */ diff --git a/src/endianconv.h b/src/endianconv.h index 475f72b08..3c8aef14f 100644 --- a/src/endianconv.h 
+++ b/src/endianconv.h @@ -36,6 +36,10 @@ #include "config.h" #include +#ifdef __cplusplus +extern "C" { +#endif + void memrev16(void *p); void memrev32(void *p); void memrev64(void *p); @@ -75,4 +79,8 @@ uint64_t intrev64(uint64_t v); int endianconvTest(int argc, char *argv[]); #endif +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/evict.c b/src/evict.c index 28cd73f6f..48d6d0387 100644 --- a/src/evict.c +++ b/src/evict.c @@ -350,6 +350,7 @@ unsigned long LFUDecrAndReturn(robj *o) { * used memory: the eviction should use mostly data size. This function * returns the sum of AOF and slaves buffer. */ size_t freeMemoryGetNotCountedMemory(void) { + serverAssert(aeThreadOwnsLock()); size_t overhead = 0; int slaves = listLength(server.slaves); @@ -444,6 +445,7 @@ int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *lev * Otehrwise if we are over the memory limit, but not enough memory * was freed to return back under the limit, the function returns C_ERR. */ int freeMemoryIfNeeded(void) { + serverAssert(aeThreadOwnsLock()); /* By default replicas should ignore maxmemory * and just be masters exact copies. */ if (server.masterhost && server.repl_slave_ignore_maxmemory) return C_OK; diff --git a/src/fastlock.cpp b/src/fastlock.cpp index 42741d0a1..4a4fb2962 100644 --- a/src/fastlock.cpp +++ b/src/fastlock.cpp @@ -1,25 +1,135 @@ +/* + * Copyright (c) 2019, John Sully + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + #include "fastlock.h" #include +#include +#include +#include +#include +#include + +/**************************************************** + * + * Implementation of a fair spinlock. 
To promote fairness we + * use a ticket lock instead of a raw spinlock + * + ****************************************************/ + +static_assert(sizeof(pid_t) <= sizeof(fastlock::m_pidOwner), "fastlock::m_pidOwner not large enough"); + +static pid_t gettid() +{ + static thread_local int pidCache = -1; + if (pidCache == -1) + pidCache = syscall(SYS_gettid); + return pidCache; +} extern "C" void fastlock_init(struct fastlock *lock) { - lock->m_lock = 0; + lock->m_ticket.m_active = 0; + lock->m_ticket.m_avail = 0; + lock->m_depth = 0; + lock->m_pidOwner = -1; } extern "C" void fastlock_lock(struct fastlock *lock) { - while (!__sync_bool_compare_and_swap(&lock->m_lock, 0, 1)) + if ((int)__atomic_load_4(&lock->m_pidOwner, __ATOMIC_ACQUIRE) == gettid()) { + ++lock->m_depth; + return; } + + unsigned myticket = __atomic_fetch_add(&lock->m_ticket.m_avail, 1, __ATOMIC_RELEASE); + + int cloops = 0; + while (__atomic_load_2(&lock->m_ticket.m_active, __ATOMIC_ACQUIRE) != myticket) + { + if ((++cloops % (1024*1024)) == 0) /* parenthesized: % and * have equal precedence, so "% 1024*1024" tested cloops % 1024 and yielded every 1024 spins */ + sched_yield(); + } + + lock->m_depth = 1; + __atomic_store_4(&lock->m_pidOwner, gettid(), __ATOMIC_RELEASE); + std::atomic_thread_fence(std::memory_order_acquire); +} + +extern "C" int fastlock_trylock(struct fastlock *lock) +{ + if ((int)__atomic_load_4(&lock->m_pidOwner, __ATOMIC_ACQUIRE) == gettid()) + { + ++lock->m_depth; + return true; + } + + // cheap test + if (lock->m_ticket.m_active != lock->m_ticket.m_avail) + return false; + + uint16_t active = __atomic_load_2(&lock->m_ticket.m_active, __ATOMIC_RELAXED); + uint16_t next = active + 1; + + struct ticket ticket_expect { active, active }; + struct ticket ticket_setiflocked { active, next }; + if (__atomic_compare_exchange(&lock->m_ticket, &ticket_expect, &ticket_setiflocked, true /*strong*/, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE)) + { + lock->m_depth = 1; + __atomic_store_4(&lock->m_pidOwner, gettid(), __ATOMIC_RELEASE); + return true; + } + return false; } extern "C" void fastlock_unlock(struct fastlock
*lock) { - __sync_bool_compare_and_swap(&lock->m_lock, 1, 0); + --lock->m_depth; + if (lock->m_depth == 0) + { + assert((int)__atomic_load_4(&lock->m_pidOwner, __ATOMIC_RELAXED) >= 0); // unlock after free + lock->m_pidOwner = -1; + std::atomic_thread_fence(std::memory_order_acquire); + __atomic_fetch_add(&lock->m_ticket.m_active, 1, __ATOMIC_ACQ_REL); + } } extern "C" void fastlock_free(struct fastlock *lock) { // NOP - (void)lock; + assert((lock->m_ticket.m_active == lock->m_ticket.m_avail) // Asser the lock is unlocked + || (lock->m_pidOwner == gettid() && (lock->m_ticket.m_active == lock->m_ticket.m_avail-1))); // OR we own the lock and nobody else is waiting + lock->m_pidOwner = -2; // sentinal value indicating free +} + + +bool fastlock::fOwnLock() +{ + return gettid() == m_pidOwner; } \ No newline at end of file diff --git a/src/fastlock.h b/src/fastlock.h index 864c86822..b5a70c530 100644 --- a/src/fastlock.h +++ b/src/fastlock.h @@ -1,4 +1,5 @@ #pragma once +#include #ifdef __cplusplus extern "C" { @@ -8,6 +9,7 @@ extern "C" { struct fastlock; void fastlock_init(struct fastlock *lock); void fastlock_lock(struct fastlock *lock); +int fastlock_trylock(struct fastlock *lock); void fastlock_unlock(struct fastlock *lock); void fastlock_free(struct fastlock *lock); @@ -16,19 +18,39 @@ void fastlock_free(struct fastlock *lock); } #endif +struct ticket +{ + uint16_t m_active; + uint16_t m_avail; +}; struct fastlock { - int m_lock; + volatile struct ticket m_ticket; + + volatile int m_pidOwner; + volatile int m_depth; #ifdef __cplusplus + fastlock() + { + fastlock_init(this); + } + void lock() { fastlock_lock(this); } + bool try_lock() + { + return !!fastlock_trylock(this); + } + void unlock() { fastlock_unlock(this); } + + bool fOwnLock(); // true if this thread owns the lock, NOTE: not 100% reliable, use for debugging only #endif }; diff --git a/src/fmacros.h b/src/fmacros.h index 3b1bc5eb8..a56bb9331 100644 --- a/src/fmacros.h +++ b/src/fmacros.h @@ -30,13 
+30,11 @@ #ifndef _REDIS_FMACRO_H #define _REDIS_FMACRO_H -#define _BSD_SOURCE +#define _DEFAULT_SOURCE 1 #if defined(__linux__) -#ifndef __cplusplus -#define _GNU_SOURCE -#define _DEFAULT_SOURCE -#endif +#define _GNU_SOURCE 1 +#define _DEFAULT_SOURCE 1 #endif #if defined(_AIX) diff --git a/src/intset.h b/src/intset.h index 6849abff9..9bf172d5d 100644 --- a/src/intset.h +++ b/src/intset.h @@ -41,7 +41,9 @@ typedef struct intset { uint32_t encoding; uint32_t length; - int8_t contents[ZERO_LENGTH_ARRAY_LENGTH]; +#ifndef __cplusplus + int8_t contents[]; +#endif } intset; intset *intsetNew(void); diff --git a/src/module.c b/src/module.c index 45fa4e293..54a36e0c1 100644 --- a/src/module.c +++ b/src/module.c @@ -484,6 +484,7 @@ void moduleFreeContext(RedisModuleCtx *ctx) { * details needed to correctly replicate commands. */ void moduleHandlePropagationAfterCommandCallback(RedisModuleCtx *ctx) { client *c = ctx->client; + serverAssert(aeThreadOwnsLock()); if (c->flags & CLIENT_LUA) return; @@ -2696,7 +2697,7 @@ RedisModuleCallReply *RM_Call(RedisModuleCtx *ctx, const char *cmdname, const ch /* Create the client and dispatch the command. */ va_start(ap, fmt); - c = createClient(-1); + c = createClient(-1, IDX_EVENT_LOOP_MAIN); c->puser = NULL; /* Root user. */ argv = moduleCreateArgvFromUserFormat(cmdname,fmt,&argc,&flags,ap); replicate = flags & REDISMODULE_ARGV_REPLICATE; @@ -3546,7 +3547,7 @@ RedisModuleBlockedClient *RM_BlockClient(RedisModuleCtx *ctx, RedisModuleCmdFunc bc->disconnect_callback = NULL; /* Set by RM_SetDisconnectCallback() */ bc->free_privdata = free_privdata; bc->privdata = NULL; - bc->reply_client = createClient(-1); + bc->reply_client = createClient(-1, IDX_EVENT_LOOP_MAIN); bc->reply_client->flags |= CLIENT_MODULE; bc->dbid = c->db->id; c->bpop.timeout = timeout_ms ? 
(mstime()+timeout_ms) : 0; @@ -3623,6 +3624,7 @@ void RM_SetDisconnectCallback(RedisModuleBlockedClient *bc, RedisModuleDisconnec void moduleHandleBlockedClients(void) { listNode *ln; RedisModuleBlockedClient *bc; + serverAssert(aeThreadOwnsLock()); pthread_mutex_lock(&moduleUnblockedClientsMutex); /* Here we unblock all the pending clients blocked in modules operations @@ -3633,9 +3635,16 @@ void moduleHandleBlockedClients(void) { ln = listFirst(moduleUnblockedClients); bc = ln->value; client *c = bc->client; + serverAssert(c == NULL || c->iel == IDX_EVENT_LOOP_MAIN); /* c is NULL-checked before locking, so it must not be dereferenced unconditionally here */ listDelNode(moduleUnblockedClients,ln); pthread_mutex_unlock(&moduleUnblockedClientsMutex); + if (c) + { + AssertCorrectThread(c); + fastlock_lock(&c->lock); + } + /* Release the lock during the loop, as long as we don't * touch the shared list. */ @@ -3692,13 +3701,15 @@ void moduleHandleBlockedClients(void) { !(c->flags & CLIENT_PENDING_WRITE)) { c->flags |= CLIENT_PENDING_WRITE; - listAddNodeHead(server.clients_pending_write,c); + AssertCorrectThread(c); + listAddNodeHead(server.rgthreadvar[c->iel].clients_pending_write,c); } } /* Free 'bc' only after unblocking the client, since it is * referenced in the client blocking context, and must be valid * when calling unblockClient(). */ + if (c) fastlock_unlock(&c->lock); /* the lock was only taken when c != NULL */ zfree(bc); /* Lock again before to iterate the loop. */ @@ -3794,7 +3805,7 @@ RedisModuleCtx *RM_GetThreadSafeContext(RedisModuleBlockedClient *bc) { * access it safely from another thread, so we create a fake client here * in order to keep things like the currently selected database and similar * things. */ - ctx->client = createClient(-1); + ctx->client = createClient(-1, IDX_EVENT_LOOP_MAIN); if (bc) selectDb(ctx->client,bc->dbid); return ctx; } @@ -4300,7 +4311,7 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod if (memcmp(ri.key,&key,sizeof(key)) == 0) { /* This is the first key, we need to re-install the timer according * to the just added event.
*/ - aeDeleteTimeEvent(server.el,aeTimer); + aeDeleteTimeEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimer); aeTimer = -1; } raxStop(&ri); @@ -4309,7 +4320,7 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod /* If we have no main timer (the old one was invalidated, or this is the * first module timer we have), install one. */ if (aeTimer == -1) - aeTimer = aeCreateTimeEvent(server.el,period,moduleTimerHandler,NULL,NULL); + aeTimer = aeCreateTimeEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,period,moduleTimerHandler,NULL,NULL); return key; } @@ -4659,7 +4670,7 @@ void moduleInitModulesSystem(void) { /* Set up the keyspace notification susbscriber list and static client */ moduleKeyspaceSubscribers = listCreate(); - moduleFreeContextReusedClient = createClient(-1); + moduleFreeContextReusedClient = createClient(-1, IDX_EVENT_LOOP_MAIN); moduleFreeContextReusedClient->flags |= CLIENT_MODULE; moduleFreeContextReusedClient->puser = NULL; /* root user. */ diff --git a/src/multi.c b/src/multi.c index 6d722b8af..4f7711f6c 100644 --- a/src/multi.c +++ b/src/multi.c @@ -72,6 +72,7 @@ void queueMultiCommand(client *c) { } void discardTransaction(client *c) { + serverAssert(aeThreadOwnsLock()); freeClientMultiState(c); initClientMultiState(c); c->flags &= ~(CLIENT_MULTI|CLIENT_DIRTY_CAS|CLIENT_DIRTY_EXEC); @@ -81,11 +82,13 @@ void discardTransaction(client *c) { /* Flag the transacation as DIRTY_EXEC so that EXEC will fail. * Should be called every time there is an error while queueing a command. */ void flagTransaction(client *c) { + serverAssert(aeThreadOwnsLock()); if (c->flags & CLIENT_MULTI) c->flags |= CLIENT_DIRTY_EXEC; } void multiCommand(client *c) { + serverAssert(aeThreadOwnsLock()); if (c->flags & CLIENT_MULTI) { addReplyError(c,"MULTI calls can not be nested"); return; @@ -291,6 +294,7 @@ void unwatchAllKeys(client *c) { /* "Touch" a key, so that if this key is being WATCHed by some client the * next EXEC will fail. 
*/ void touchWatchedKey(redisDb *db, robj *key) { + serverAssert(aeThreadOwnsLock()); list *clients; listIter li; listNode *ln; @@ -316,6 +320,7 @@ void touchWatchedKey(redisDb *db, robj *key) { void touchWatchedKeysOnFlush(int dbid) { listIter li1, li2; listNode *ln; + serverAssert(aeThreadOwnsLock()); /* For every client, check all the waited keys */ listRewind(server.clients,&li1); @@ -350,6 +355,7 @@ void watchCommand(client *c) { void unwatchCommand(client *c) { unwatchAllKeys(c); + serverAssert(aeThreadOwnsLock()); c->flags &= (~CLIENT_DIRTY_CAS); addReply(c,shared.ok); } diff --git a/src/networking.c b/src/networking.cpp similarity index 77% rename from src/networking.c rename to src/networking.cpp index 1c917af2a..2cddccf92 100644 --- a/src/networking.c +++ b/src/networking.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2019 John Sully * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,8 +33,67 @@ #include #include #include +#include +#include static void setProtocolError(const char *errstr, client *c); +void addReplyLongLongWithPrefixCore(client *c, long long ll, char prefix, bool fAsync); +void addReplyBulkCStringCore(client *c, const char *s, bool fAsync); + +class AeLocker +{ + bool m_fArmed = false; + +public: + AeLocker() + { + } + + void arm(client *c) // if a client is passed, then the client is already locked + { + if (c != nullptr) + { + serverAssert(!m_fArmed); + serverAssert(c->lock.fOwnLock()); + + bool fClientLocked = true; + while (!aeTryAcquireLock()) + { + if (fClientLocked) c->lock.unlock(); + fClientLocked = false; + aeAcquireLock(); + if (!c->lock.try_lock()) + { + aeReleaseLock(); + } + else + { + break; + } + } + + m_fArmed = true; + } + else if (!m_fArmed) + { + m_fArmed = true; + aeAcquireLock(); + } + } + + void disarm() + { + serverAssert(m_fArmed); + m_fArmed = false; + aeReleaseLock(); + } + + ~AeLocker() + { + if (m_fArmed) + 
aeReleaseLock(); + } +}; /* Return the size consumed from the allocator, for the specified SDS string, * including internal fragmentation. This function is used in order to compute @@ -48,7 +108,7 @@ size_t sdsZmallocSize(sds s) { size_t getStringObjectSdsUsedMemory(robj *o) { serverAssertWithInfo(NULL,o,o->type == OBJ_STRING); switch(o->encoding) { - case OBJ_ENCODING_RAW: return sdsZmallocSize(ptrFromObj(o)); + case OBJ_ENCODING_RAW: return sdsZmallocSize((sds)ptrFromObj(o)); case OBJ_ENCODING_EMBSTR: return zmalloc_size(o)-sizeof(robj); default: return 0; /* Just integer encoding for now. */ } @@ -56,8 +116,8 @@ size_t getStringObjectSdsUsedMemory(robj *o) { /* Client.reply list dup and free methods. */ void *dupClientReplyValue(void *o) { - clientReplyBlock *old = o; - clientReplyBlock *buf = zmalloc(sizeof(clientReplyBlock) + old->size, MALLOC_LOCAL); + clientReplyBlock *old = (clientReplyBlock*)o; + clientReplyBlock *buf = (clientReplyBlock*)zmalloc(sizeof(clientReplyBlock) + old->size, MALLOC_LOCAL); memcpy(buf, o, sizeof(clientReplyBlock) + old->size); return buf; } @@ -67,7 +127,7 @@ void freeClientReplyValue(void *o) { } int listMatchObjects(void *a, void *b) { - return equalStringObjects(a,b); + return equalStringObjects((robj*)a,(robj*)b); } /* This function links the client to the global linked list of clients. @@ -82,9 +142,10 @@ void linkClient(client *c) { raxInsert(server.clients_index,(unsigned char*)&id,sizeof(id),c,NULL); } -client *createClient(int fd) { - client *c = zmalloc(sizeof(client), MALLOC_LOCAL); +client *createClient(int fd, int iel) { + client *c = (client*)zmalloc(sizeof(client), MALLOC_LOCAL); + c->iel = iel; /* passing -1 as fd it is possible to create a non connected client. * This is useful since all the commands needs to be executed * in the context of a client. 
When commands are executed in other @@ -94,7 +155,7 @@ client *createClient(int fd) { anetEnableTcpNoDelay(NULL,fd); if (server.tcpkeepalive) anetKeepAlive(NULL,fd,server.tcpkeepalive); - if (aeCreateFileEvent(server.el,fd,AE_READABLE, + if (aeCreateFileEvent(server.rgthreadvar[iel].el,fd,AE_READABLE|AE_READ_THREADSAFE, readQueryFromClient, c) == AE_ERR) { close(fd); @@ -106,6 +167,8 @@ client *createClient(int fd) { selectDb(c,0); uint64_t client_id; atomicGetIncr(server.next_client_id,client_id,1); + c->iel = iel; + fastlock_init(&c->lock); c->id = client_id; c->resp = 2; c->fd = fd; @@ -123,7 +186,9 @@ client *createClient(int fd) { c->multibulklen = 0; c->bulklen = -1; c->sentlen = 0; + c->sentlenAsync = 0; c->flags = 0; + c->fPendingAsyncWrite = FALSE; c->ctime = c->lastinteraction = server.unixtime; /* If the default user does not require authentication, the user is * directly authenticated. */ @@ -157,10 +222,15 @@ client *createClient(int fd) { c->pubsub_patterns = listCreate(); c->peerid = NULL; c->client_list_node = NULL; + c->bufAsync = NULL; + c->buflenAsync = 0; + c->bufposAsync = 0; + listSetFreeMethod(c->pubsub_patterns,decrRefCountVoid); listSetMatchMethod(c->pubsub_patterns,listMatchObjects); if (fd != -1) linkClient(c); initClientMultiState(c); + AssertCorrectThread(c); return c; } @@ -179,6 +249,8 @@ void clientInstallWriteHandler(client *c) { (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) { + AssertCorrectThread(c); + serverAssert(c->lock.fOwnLock()); /* Here instead of installing the write handler, we just flag the * client and put it into a list of clients that have something * to write to the socket. This way before re-entering the event @@ -186,7 +258,15 @@ void clientInstallWriteHandler(client *c) { * a system call. We'll only really install the write handler if * we'll not be able to write the whole reply at once. 
*/ c->flags |= CLIENT_PENDING_WRITE; - listAddNodeHead(server.clients_pending_write,c); + listAddNodeHead(server.rgthreadvar[c->iel].clients_pending_write,c); + } +} + +void clientInstallAsyncWriteHandler(client *c) { + serverAssert(aeThreadOwnsLock()); + if (!(c->fPendingAsyncWrite)) { + c->fPendingAsyncWrite = TRUE; + listAddNodeHead(serverTL->clients_pending_asyncwrite,c); } } @@ -212,7 +292,11 @@ void clientInstallWriteHandler(client *c) { * Typically gets called every time a reply is built, before adding more * data to the clients output buffers. If the function returns C_ERR no * data should be appended to the output buffers. */ -int prepareClientToWrite(client *c) { +int prepareClientToWrite(client *c, bool fAsync) { + fAsync = fAsync && !FCorrectThread(c); // Not async if we're on the right thread + serverAssert(!fAsync || aeThreadOwnsLock()); + serverAssert(c->lock.fOwnLock()); + /* If it's the Lua client we always return ok without installing any * handler since there is no socket at all. */ if (c->flags & (CLIENT_LUA|CLIENT_MODULE)) return C_OK; @@ -229,7 +313,8 @@ int prepareClientToWrite(client *c) { /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ - if (!clientHasPendingReplies(c)) clientInstallWriteHandler(c); + if (!fAsync && !clientHasPendingReplies(c)) clientInstallWriteHandler(c); + if (fAsync && !(c->fPendingAsyncWrite)) clientInstallAsyncWriteHandler(c); /* Authorize the caller to queue in the output buffer of this client. */ return C_OK; @@ -239,28 +324,46 @@ int prepareClientToWrite(client *c) { * Low level functions to add more data to output buffers. 
* -------------------------------------------------------------------------- */ -int _addReplyToBuffer(client *c, const char *s, size_t len) { - size_t available = sizeof(c->buf)-c->bufpos; - +int _addReplyToBuffer(client *c, const char *s, size_t len, bool fAsync) { if (c->flags & CLIENT_CLOSE_AFTER_REPLY) return C_OK; - /* If there already are entries in the reply list, we cannot - * add anything more to the static buffer. */ - if (listLength(c->reply) > 0) return C_ERR; + fAsync = fAsync && !FCorrectThread(c); // Not async if we're on the right thread + if (fAsync) + { + serverAssert(aeThreadOwnsLock()); + if ((c->buflenAsync - c->bufposAsync) < (int)len) + { + int minsize = len + c->bufposAsync; + c->buflenAsync = std::max(minsize, c->buflenAsync*2 - c->buflenAsync); + c->bufAsync = (char*)zrealloc(c->bufAsync, c->buflenAsync, MALLOC_LOCAL); + c->buflenAsync = zmalloc_usable(c->bufAsync); + } + memcpy(c->bufAsync+c->bufposAsync,s,len); + c->bufposAsync += len; + } + else + { + size_t available = sizeof(c->buf)-c->bufpos; - /* Check that the buffer has enough space available for this string. */ - if (len > available) return C_ERR; + /* If there already are entries in the reply list, we cannot + * add anything more to the static buffer. */ + if (listLength(c->reply) > 0) return C_ERR; - memcpy(c->buf+c->bufpos,s,len); - c->bufpos+=len; + /* Check that the buffer has enough space available for this string. */ + if (len > available) return C_ERR; + + memcpy(c->buf+c->bufpos,s,len); + c->bufpos+=len; + } return C_OK; } void _addReplyProtoToList(client *c, const char *s, size_t len) { if (c->flags & CLIENT_CLOSE_AFTER_REPLY) return; + AssertCorrectThread(c); listNode *ln = listLast(c->reply); - clientReplyBlock *tail = ln? listNodeValue(ln): NULL; + clientReplyBlock *tail = (clientReplyBlock*) (ln? 
listNodeValue(ln): NULL); /* Note that 'tail' may be NULL even if we have a tail node, becuase when * addDeferredMultiBulkLength() is used, it sets a dummy node to NULL just @@ -272,7 +375,7 @@ void _addReplyProtoToList(client *c, const char *s, size_t len) { * new node */ size_t avail = tail->size - tail->used; size_t copy = avail >= len? len: avail; - memcpy(tail->buf + tail->used, s, copy); + memcpy(tail->buf() + tail->used, s, copy); tail->used += copy; s += copy; len -= copy; @@ -281,11 +384,11 @@ void _addReplyProtoToList(client *c, const char *s, size_t len) { /* Create a new node, make sure it is allocated to at * least PROTO_REPLY_CHUNK_BYTES */ size_t size = len < PROTO_REPLY_CHUNK_BYTES? PROTO_REPLY_CHUNK_BYTES: len; - tail = zmalloc(size + sizeof(clientReplyBlock), MALLOC_LOCAL); + tail = (clientReplyBlock*)zmalloc(size + sizeof(clientReplyBlock), MALLOC_LOCAL); /* take over the allocation's internal fragmentation */ tail->size = zmalloc_usable(tail) - sizeof(clientReplyBlock); tail->used = len; - memcpy(tail->buf, s, len); + memcpy(tail->buf(), s, len); listAddNodeTail(c->reply, tail); c->reply_bytes += tail->size; } @@ -296,40 +399,56 @@ void _addReplyProtoToList(client *c, const char *s, size_t len) { * Higher level functions to queue data on the client output buffer. * The following functions are the ones that commands implementations will call. * -------------------------------------------------------------------------- */ - -/* Add the object 'obj' string representation to the client output buffer. 
*/ -void addReply(client *c, robj *obj) { - if (prepareClientToWrite(c) != C_OK) return; +void addReplyCore(client *c, robj *obj, bool fAsync) { + if (prepareClientToWrite(c, fAsync) != C_OK) return; if (sdsEncodedObject(obj)) { - if (_addReplyToBuffer(c,ptrFromObj(obj),sdslen(ptrFromObj(obj))) != C_OK) - _addReplyProtoToList(c,ptrFromObj(obj),sdslen(ptrFromObj(obj))); + if (_addReplyToBuffer(c,(const char*)ptrFromObj(obj),sdslen((sds)ptrFromObj(obj)),fAsync) != C_OK) + _addReplyProtoToList(c,(const char*)ptrFromObj(obj),sdslen((sds)ptrFromObj(obj))); } else if (obj->encoding == OBJ_ENCODING_INT) { /* For integer encoded strings we just convert it into a string * using our optimized function, and attach the resulting string * to the output buffer. */ char buf[32]; size_t len = ll2string(buf,sizeof(buf),(long)ptrFromObj(obj)); - if (_addReplyToBuffer(c,buf,len) != C_OK) + if (_addReplyToBuffer(c,buf,len,fAsync) != C_OK) _addReplyProtoToList(c,buf,len); } else { serverPanic("Wrong obj->encoding in addReply()"); } } +/* Add the object 'obj' string representation to the client output buffer. */ +void addReply(client *c, robj *obj) +{ + addReplyCore(c, obj, false); +} +void addReplyAsync(client *c, robj *obj) +{ + addReplyCore(c, obj, true); +} + /* Add the SDS 's' string to the client output buffer, as a side effect * the SDS string is freed. */ -void addReplySds(client *c, sds s) { - if (prepareClientToWrite(c) != C_OK) { +void addReplySdsCore(client *c, sds s, bool fAsync) { + if (prepareClientToWrite(c, fAsync) != C_OK) { /* The caller expects the sds to be free'd. 
*/ sdsfree(s); return; } - if (_addReplyToBuffer(c,s,sdslen(s)) != C_OK) + if (_addReplyToBuffer(c,s,sdslen(s), fAsync) != C_OK) _addReplyProtoToList(c,s,sdslen(s)); sdsfree(s); } +void addReplySds(client *c, sds s) { + addReplySdsCore(c, s, false); +} + +void addReplySdsAsync(client *c, sds s) { + addReplySdsCore(c, s, true); +} + /* This low level function just adds whatever protocol you send it to the * client buffer, trying the static buffer initially, and using the string * of objects if not possible. @@ -338,12 +457,20 @@ void addReplySds(client *c, sds s) { * if not needed. The object will only be created by calling * _addReplyProtoToList() if we fail to extend the existing tail object * in the list of objects. */ -void addReplyProto(client *c, const char *s, size_t len) { - if (prepareClientToWrite(c) != C_OK) return; - if (_addReplyToBuffer(c,s,len) != C_OK) +void addReplyProtoCore(client *c, const char *s, size_t len, bool fAsync) { + if (prepareClientToWrite(c, fAsync) != C_OK) return; + if (_addReplyToBuffer(c,s,len,fAsync) != C_OK) _addReplyProtoToList(c,s,len); } +void addReplyProto(client *c, const char *s, size_t len) { + addReplyProtoCore(c, s, len, false); +} + +void addReplyProtoAsync(client *c, const char *s, size_t len) { + addReplyProtoCore(c, s, len, true); +} + /* Low level function called by the addReplyError...() functions. * It emits the protocol for a Redis error, in the form: * @@ -352,12 +479,12 @@ void addReplyProto(client *c, const char *s, size_t len) { * If the error code is already passed in the string 's', the error * code provided is used, otherwise the string "-ERR " for the generic * error code is automatically added. */ -void addReplyErrorLength(client *c, const char *s, size_t len) { +void addReplyErrorLengthCore(client *c, const char *s, size_t len, bool fAsync) { /* If the string already starts with "-..." then the error code * is provided by the caller. Otherwise we use "-ERR". 
*/ - if (!len || s[0] != '-') addReplyProto(c,"-ERR ",5); - addReplyProto(c,s,len); - addReplyProto(c,"\r\n",2); + if (!len || s[0] != '-') addReplyProtoCore(c,"-ERR ",5,fAsync); + addReplyProtoCore(c,s,len,fAsync); + addReplyProtoCore(c,"\r\n",2,fAsync); /* Sometimes it could be normal that a slave replies to a master with * an error and this function gets called. Actually the error will never @@ -370,17 +497,26 @@ void addReplyErrorLength(client *c, const char *s, size_t len) { * will produce an error. However it is useful to log such events since * they are rare and may hint at errors in a script or a bug in Redis. */ if (c->flags & (CLIENT_MASTER|CLIENT_SLAVE) && !(c->flags & CLIENT_MONITOR)) { - char* to = c->flags & CLIENT_MASTER? "master": "replica"; - char* from = c->flags & CLIENT_MASTER? "replica": "master"; - char *cmdname = c->lastcmd ? c->lastcmd->name : ""; + const char* to = reinterpret_cast(c->flags & CLIENT_MASTER? "master": "replica"); + const char* from = reinterpret_cast(c->flags & CLIENT_MASTER? "replica": "master"); + const char *cmdname = reinterpret_cast(c->lastcmd ? c->lastcmd->name : ""); serverLog(LL_WARNING,"== CRITICAL == This %s is sending an error " "to its %s: '%s' after processing the command " "'%s'", from, to, s, cmdname); } } +void addReplyErrorLength(client *c, const char *s, size_t len) +{ + addReplyErrorLengthCore(c, s, len, false); +} + void addReplyError(client *c, const char *err) { - addReplyErrorLength(c,err,strlen(err)); + addReplyErrorLengthCore(c,err,strlen(err), false); +} + +void addReplyErrorAsync(client *c, const char *err) { + addReplyErrorLengthCore(c, err, strlen(err), true); } void addReplyErrorFormat(client *c, const char *fmt, ...) { @@ -424,11 +560,18 @@ void *addReplyDeferredLen(client *c) { /* Note that we install the write event here even if the object is not * ready to be sent, since we are sure that before returning to the * event loop setDeferredAggregateLen() will be called. 
*/ - if (prepareClientToWrite(c) != C_OK) return NULL; + if (prepareClientToWrite(c, false) != C_OK) return NULL; listAddNodeTail(c->reply,NULL); /* NULL is our placeholder. */ return listLast(c->reply); } +void *addReplyDeferredLenAsync(client *c) { + if (FCorrectThread(c)) + return addReplyDeferredLen(c); + + return (void*)((ssize_t)c->bufposAsync); +} + /* Populate the length object and try gluing it to the next chunk. */ void setDeferredAggregateLen(client *c, void *node, long length, char prefix) { listNode *ln = (listNode*)node; @@ -451,30 +594,57 @@ void setDeferredAggregateLen(client *c, void *node, long length, char prefix) { * - The next node is non-NULL, * - It has enough room already allocated * - And not too large (avoid large memmove) */ - if (ln->next != NULL && (next = listNodeValue(ln->next)) && + if (ln->next != NULL && (next = (clientReplyBlock*)listNodeValue(ln->next)) && next->size - next->used >= lenstr_len && next->used < PROTO_REPLY_CHUNK_BYTES * 4) { - memmove(next->buf + lenstr_len, next->buf, next->used); - memcpy(next->buf, lenstr, lenstr_len); + memmove(next->buf() + lenstr_len, next->buf(), next->used); + memcpy(next->buf(), lenstr, lenstr_len); next->used += lenstr_len; listDelNode(c->reply,ln); } else { /* Create a new node */ - clientReplyBlock *buf = zmalloc(lenstr_len + sizeof(clientReplyBlock), MALLOC_LOCAL); + clientReplyBlock *buf = (clientReplyBlock*)zmalloc(lenstr_len + sizeof(clientReplyBlock), MALLOC_LOCAL); /* Take over the allocation's internal fragmentation */ buf->size = zmalloc_usable(buf) - sizeof(clientReplyBlock); buf->used = lenstr_len; - memcpy(buf->buf, lenstr, lenstr_len); + memcpy(buf->buf(), lenstr, lenstr_len); listNodeValue(ln) = buf; c->reply_bytes += buf->size; } asyncCloseClientOnOutputBufferLimitReached(c); } +void setDeferredAggregateLenAsync(client *c, void *node, long length, char prefix) +{ + if (FCorrectThread(c)) { + setDeferredAggregateLen(c, node, length, prefix); + return; + } + + char 
lenstr[128]; + int lenstr_len = sprintf(lenstr, "%c%ld\r\n", prefix, length); + + ssize_t idxSplice = (ssize_t)node; + serverAssert(idxSplice <= c->bufposAsync); + if (c->buflenAsync < (c->bufposAsync + lenstr_len)) + { + c->buflenAsync = std::max((int)(c->bufposAsync+lenstr_len), c->buflenAsync*2 - c->buflenAsync); + c->bufAsync = (char*)zrealloc(c->bufAsync, c->buflenAsync, MALLOC_LOCAL); + } + + memmove(c->bufAsync + idxSplice + lenstr_len, c->bufAsync + idxSplice, c->bufposAsync - idxSplice); + memcpy(c->bufAsync + idxSplice, lenstr, lenstr_len); + c->bufposAsync += lenstr_len; +} + void setDeferredArrayLen(client *c, void *node, long length) { setDeferredAggregateLen(c,node,length,'*'); } +void setDeferredArrayLenAsync(client *c, void *node, long length) { + setDeferredAggregateLenAsync(c, node, length, '*'); +} + void setDeferredMapLen(client *c, void *node, long length) { int prefix = c->resp == 2 ? '*' : '%'; if (c->resp == 2) length *= 2; @@ -498,15 +668,15 @@ void setDeferredPushLen(client *c, void *node, long length) { } /* Add a double as a bulk reply */ -void addReplyDouble(client *c, double d) { +void addReplyDoubleCore(client *c, double d, bool fAsync) { if (isinf(d)) { /* Libc in odd systems (Hi Solaris!) will format infinite in a * different way, so better to handle it in an explicit way. */ if (c->resp == 2) { - addReplyBulkCString(c, d > 0 ? "inf" : "-inf"); + addReplyBulkCStringCore(c, d > 0 ? "inf" : "-inf", fAsync); } else { - addReplyProto(c, d > 0 ? ",inf\r\n" : "-inf\r\n", - d > 0 ? 6 : 7); + addReplyProtoCore(c, d > 0 ? ",inf\r\n" : "-inf\r\n", + d > 0 ? 
6 : 7, fAsync); } } else { char dbuf[MAX_LONG_DOUBLE_CHARS+3], @@ -515,14 +685,22 @@ void addReplyDouble(client *c, double d) { if (c->resp == 2) { dlen = snprintf(dbuf,sizeof(dbuf),"%.17g",d); slen = snprintf(sbuf,sizeof(sbuf),"$%d\r\n%s\r\n",dlen,dbuf); - addReplyProto(c,sbuf,slen); + addReplyProtoCore(c,sbuf,slen,fAsync); } else { dlen = snprintf(dbuf,sizeof(dbuf),",%.17g\r\n",d); - addReplyProto(c,dbuf,dlen); + addReplyProtoCore(c,dbuf,dlen,fAsync); } } } +void addReplyDouble(client *c, double d) { + addReplyDoubleCore(c, d, false); +} + +void addReplyDoubleAsync(client *c, double d) { + addReplyDoubleCore(c, d, true); +} + /* Add a long double as a bulk reply, but uses a human readable formatting * of the double instead of exposing the crude behavior of doubles to the * dear user. */ @@ -542,7 +720,7 @@ void addReplyHumanLongDouble(client *c, long double d) { /* Add a long long as integer reply or bulk len / multi bulk count. * Basically this is used to output . */ -void addReplyLongLongWithPrefix(client *c, long long ll, char prefix) { +void addReplyLongLongWithPrefixCore(client *c, long long ll, char prefix, bool fAsync) { char buf[128]; int len; @@ -550,10 +728,10 @@ void addReplyLongLongWithPrefix(client *c, long long ll, char prefix) { * so we have a few shared objects to use if the integer is small * like it is most of the times. 
*/ if (prefix == '*' && ll < OBJ_SHARED_BULKHDR_LEN && ll >= 0) { - addReply(c,shared.mbulkhdr[ll]); + addReplyCore(c,shared.mbulkhdr[ll], fAsync); return; } else if (prefix == '$' && ll < OBJ_SHARED_BULKHDR_LEN && ll >= 0) { - addReply(c,shared.bulkhdr[ll]); + addReplyCore(c,shared.bulkhdr[ll], fAsync); return; } @@ -561,33 +739,65 @@ void addReplyLongLongWithPrefix(client *c, long long ll, char prefix) { len = ll2string(buf+1,sizeof(buf)-1,ll); buf[len+1] = '\r'; buf[len+2] = '\n'; - addReplyProto(c,buf,len+3); + addReplyProtoCore(c,buf,len+3, fAsync); +} + +void addReplyLongLongWithPrefix(client *c, long long ll, char prefix) { + addReplyLongLongWithPrefixCore(c, ll, prefix, false); +} + +void addReplyLongLongCore(client *c, long long ll, bool fAsync) { + if (ll == 0) + addReplyCore(c,shared.czero, fAsync); + else if (ll == 1) + addReplyCore(c,shared.cone, fAsync); + else + addReplyLongLongWithPrefixCore(c,ll,':', fAsync); } void addReplyLongLong(client *c, long long ll) { - if (ll == 0) - addReply(c,shared.czero); - else if (ll == 1) - addReply(c,shared.cone); + addReplyLongLongCore(c, ll, false); +} + +void addReplyLongLongAsync(client *c, long long ll) { + addReplyLongLongCore(c, ll, true); +} + +void addReplyAggregateLenCore(client *c, long length, int prefix, bool fAsync) { + if (prefix == '*' && length < OBJ_SHARED_BULKHDR_LEN) + addReplyCore(c,shared.mbulkhdr[length], fAsync); else - addReplyLongLongWithPrefix(c,ll,':'); + addReplyLongLongWithPrefixCore(c,length,prefix, fAsync); } void addReplyAggregateLen(client *c, long length, int prefix) { - if (prefix == '*' && length < OBJ_SHARED_BULKHDR_LEN) - addReply(c,shared.mbulkhdr[length]); - else - addReplyLongLongWithPrefix(c,length,prefix); + addReplyAggregateLenCore(c, length, prefix, false); +} + +void addReplyArrayLenCore(client *c, long length, bool fAsync) { + addReplyAggregateLenCore(c,length,'*', fAsync); } void addReplyArrayLen(client *c, long length) { - addReplyAggregateLen(c,length,'*'); + 
addReplyArrayLenCore(c, length, false); +} + +void addReplyArrayLenAsync(client *c, long length) { + addReplyArrayLenCore(c, length, true); +} + +void addReplyMapLenCore(client *c, long length, bool fAsync) { + int prefix = c->resp == 2 ? '*' : '%'; + if (c->resp == 2) length *= 2; + addReplyAggregateLenCore(c,length,prefix,fAsync); } void addReplyMapLen(client *c, long length) { - int prefix = c->resp == 2 ? '*' : '%'; - if (c->resp == 2) length *= 2; - addReplyAggregateLen(c,length,prefix); + addReplyMapLenCore(c, length, false); +} + +void addReplyMapLenAsync(client *c, long length) { + addReplyMapLenCore(c, length, true); } void addReplySetLen(client *c, long length) { @@ -601,17 +811,33 @@ void addReplyAttributeLen(client *c, long length) { addReplyAggregateLen(c,length,prefix); } -void addReplyPushLen(client *c, long length) { +void addReplyPushLenCore(client *c, long length, bool fAsync) { int prefix = c->resp == 2 ? '*' : '>'; - addReplyAggregateLen(c,length,prefix); + addReplyAggregateLenCore(c,length,prefix, fAsync); +} + +void addReplyPushLen(client *c, long length) { + addReplyPushLenCore(c, length, false); +} + +void addReplyPushLenAsync(client *c, long length) { + addReplyPushLenCore(c, length, true); +} + +void addReplyNullCore(client *c, bool fAsync) { + if (c->resp == 2) { + addReplyProtoCore(c,"$-1\r\n",5,fAsync); + } else { + addReplyProtoCore(c,"_\r\n",3,fAsync); + } } void addReplyNull(client *c) { - if (c->resp == 2) { - addReplyProto(c,"$-1\r\n",5); - } else { - addReplyProto(c,"_\r\n",3); - } + addReplyNullCore(c, false); +} + +void addReplyNullAsync(client *c) { + addReplyNullCore(c, true); } void addReplyBool(client *c, int b) { @@ -635,61 +861,80 @@ void addReplyNullArray(client *c) { } /* Create the length prefix of a bulk reply, example: $2234 */ -void addReplyBulkLen(client *c, robj *obj) { - size_t len; - - if (sdsEncodedObject(obj)) { - len = sdslen(ptrFromObj(obj)); - } else { - long n = (long)ptrFromObj(obj); - - /* Compute how 
many bytes will take this integer as a radix 10 string */ - len = 1; - if (n < 0) { - len++; - n = -n; - } - while((n = n/10) != 0) { - len++; - } - } +void addReplyBulkLenCore(client *c, robj *obj, bool fAsync) { + size_t len = stringObjectLen(obj); if (len < OBJ_SHARED_BULKHDR_LEN) - addReply(c,shared.bulkhdr[len]); + addReplyCore(c,shared.bulkhdr[len], fAsync); else - addReplyLongLongWithPrefix(c,len,'$'); + addReplyLongLongWithPrefixCore(c,len,'$', fAsync); +} + +void addReplyBulkLen(client *c, robj *obj) +{ + addReplyBulkLenCore(c, obj, false); } /* Add a Redis Object as a bulk reply */ -void addReplyBulk(client *c, robj *obj) { - addReplyBulkLen(c,obj); - addReply(c,obj); - addReply(c,shared.crlf); +void addReplyBulkCore(client *c, robj *obj, bool fAsync) { + addReplyBulkLenCore(c,obj,fAsync); + addReplyCore(c,obj,fAsync); + addReplyCore(c,shared.crlf,fAsync); +} + +void addReplyBulk(client *c, robj *obj) +{ + addReplyBulkCore(c, obj, false); +} + +void addReplyBulkAsync(client *c, robj *obj) +{ + addReplyBulkCore(c, obj, true); } /* Add a C buffer as bulk reply */ +void addReplyBulkCBufferCore(client *c, const void *p, size_t len, bool fAsync) { + addReplyLongLongWithPrefixCore(c,len,'$',fAsync); + addReplyProtoCore(c,(const char*)p,len,fAsync); + addReplyCore(c,shared.crlf,fAsync); +} + void addReplyBulkCBuffer(client *c, const void *p, size_t len) { - addReplyLongLongWithPrefix(c,len,'$'); - addReplyProto(c,p,len); - addReply(c,shared.crlf); + addReplyBulkCBufferCore(c, p, len, false); +} + +void addReplyBulkCBufferAsync(client *c, const void *p, size_t len) { + addReplyBulkCBufferCore(c, p, len, true); } /* Add sds to reply (takes ownership of sds and frees it) */ -void addReplyBulkSds(client *c, sds s) { - addReplyLongLongWithPrefix(c,sdslen(s),'$'); - addReplySds(c,s); - addReply(c,shared.crlf); +void addReplyBulkSdsCore(client *c, sds s, bool fAsync) { + addReplyLongLongWithPrefixCore(c,sdslen(s),'$', fAsync); + addReplySdsCore(c,s,fAsync); + 
addReplyCore(c,shared.crlf,fAsync); +} + +void addReplyBulkSds(client *c, sds s) { + addReplyBulkSdsCore(c, s, false); +} + +void addReplyBulkSdsAsync(client *c, sds s) { + addReplyBulkSdsCore(c, s, true); } /* Add a C null term string as bulk reply */ -void addReplyBulkCString(client *c, const char *s) { +void addReplyBulkCStringCore(client *c, const char *s, bool fAsync) { if (s == NULL) { - addReplyNull(c); + addReplyNullCore(c,fAsync); } else { - addReplyBulkCBuffer(c,s,strlen(s)); + addReplyBulkCBufferCore(c,s,strlen(s),fAsync); } } +void addReplyBulkCString(client *c, const char *s) { + addReplyBulkCStringCore(c, s, false); +} + /* Add a long long as a bulk reply */ void addReplyBulkLongLong(client *c, long long ll) { char buf[64]; @@ -779,9 +1024,9 @@ int clientHasPendingReplies(client *c) { } #define MAX_ACCEPTS_PER_CALL 1000 -static void acceptCommonHandler(int fd, int flags, char *ip) { +static void acceptCommonHandler(int fd, int flags, char *ip, int iel) { client *c; - if ((c = createClient(fd)) == NULL) { + if ((c = createClient(fd, iel)) == NULL) { serverLog(LL_WARNING, "Error registering fd event for the new client: %s (fd=%d)", strerror(errno),fd); @@ -793,7 +1038,7 @@ static void acceptCommonHandler(int fd, int flags, char *ip) { * for this condition, since now the socket is already set in non-blocking * mode and we can send an error for free using the Kernel I/O */ if (listLength(server.clients) > server.maxclients) { - char *err = "-ERR max number of clients reached\r\n"; + const char *err = "-ERR max number of clients reached\r\n"; /* That's a best effort error message, don't check write errors */ if (write(c->fd,err,strlen(err)) == -1) { @@ -815,7 +1060,7 @@ static void acceptCommonHandler(int fd, int flags, char *ip) { ip != NULL) { if (strcmp(ip,"127.0.0.1") && strcmp(ip,"::1")) { - char *err = + const char *err = "-DENIED Redis is running in protected mode because protected " "mode is enabled, no bind address was specified, no " 
"authentication password is requested to clients. In this mode " @@ -852,7 +1097,6 @@ static void acceptCommonHandler(int fd, int flags, char *ip) { void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask) { int cport, cfd, max = MAX_ACCEPTS_PER_CALL; char cip[NET_IP_STR_LEN]; - UNUSED(el); UNUSED(mask); UNUSED(privdata); @@ -865,7 +1109,12 @@ void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask) { return; } serverLog(LL_VERBOSE,"Accepted %s:%d", cip, cport); - acceptCommonHandler(cfd,0,cip); + int ielCur = ielFromEventLoop(el); + + // We always accept on the same thread + aeAcquireLock(); + acceptCommonHandler(cfd,0,cip, ielCur); + aeReleaseLock(); } } @@ -883,8 +1132,13 @@ void acceptUnixHandler(aeEventLoop *el, int fd, void *privdata, int mask) { "Accepting client connection: %s", server.neterr); return; } + int ielCur = ielFromEventLoop(el); serverLog(LL_VERBOSE,"Accepted connection to %s", server.unixsocket); - acceptCommonHandler(cfd,CLIENT_UNIX_SOCKET,NULL); + + aeAcquireLock(); + acceptCommonHandler(cfd,CLIENT_UNIX_SOCKET,NULL, ielCur); + aeReleaseLock(); + } } @@ -900,9 +1154,19 @@ static void freeClientArgv(client *c) { * when we resync with our own master and want to force all our slaves to * resync with us as well. */ void disconnectSlaves(void) { - while (listLength(server.slaves)) { - listNode *ln = listFirst(server.slaves); - freeClient((client*)ln->value); + serverAssert(aeThreadOwnsLock()); + listIter li; + listNode *ln; + + listRewind(server.slaves, &li); + while ((ln = listNext(&li))) { + client *c = (client*)listNodeValue(ln); + if (FCorrectThread(c)) { + freeClient(c); + } + else { + freeClientAsync(c); + } } } @@ -911,6 +1175,9 @@ void disconnectSlaves(void) { * This is used by freeClient() and replicationCacheMaster(). 
*/ void unlinkClient(client *c) { listNode *ln; + AssertCorrectThread(c); + serverAssert(aeThreadOwnsLock()); + serverAssert(c->lock.fOwnLock()); /* If this is marked as current client unset it. */ if (server.current_client == c) server.current_client = NULL; @@ -928,32 +1195,51 @@ void unlinkClient(client *c) { } /* Unregister async I/O handlers and close the socket. */ - aeDeleteFileEvent(server.el,c->fd,AE_READABLE); - aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); + aeDeleteFileEvent(server.rgthreadvar[c->iel].el,c->fd,AE_READABLE); + aeDeleteFileEvent(server.rgthreadvar[c->iel].el,c->fd,AE_WRITABLE); close(c->fd); c->fd = -1; } /* Remove from the list of pending writes if needed. */ if (c->flags & CLIENT_PENDING_WRITE) { - ln = listSearchKey(server.clients_pending_write,c); + ln = listSearchKey(server.rgthreadvar[c->iel].clients_pending_write,c); serverAssert(ln != NULL); - listDelNode(server.clients_pending_write,ln); + listDelNode(server.rgthreadvar[c->iel].clients_pending_write,ln); c->flags &= ~CLIENT_PENDING_WRITE; } /* When client was just unblocked because of a blocking operation, * remove it from the list of unblocked clients. 
*/ if (c->flags & CLIENT_UNBLOCKED) { - ln = listSearchKey(server.unblocked_clients,c); + ln = listSearchKey(server.rgthreadvar[c->iel].unblocked_clients,c); serverAssert(ln != NULL); - listDelNode(server.unblocked_clients,ln); + listDelNode(server.rgthreadvar[c->iel].unblocked_clients,ln); c->flags &= ~CLIENT_UNBLOCKED; } + + if (c->fPendingAsyncWrite) { + ln = NULL; + bool fFound = false; + for (int iel = 0; iel < server.cthreads; ++iel) + { + ln = listSearchKey(server.rgthreadvar[iel].clients_pending_asyncwrite,c); + if (ln) + { + fFound = true; + listDelNode(server.rgthreadvar[iel].clients_pending_asyncwrite,ln); + } + } + serverAssert(fFound); + c->fPendingAsyncWrite = FALSE; + } } void freeClient(client *c) { listNode *ln; + serverAssert(aeThreadOwnsLock()); + AssertCorrectThread(c); + std::unique_locklock)> ulock(c->lock); /* If a client is protected, yet we need to free it right now, make sure * to at least use asynchronous freeing. */ @@ -1045,10 +1331,13 @@ void freeClient(client *c) { /* Release other dynamically allocated client structure fields, * and finally release the client structure itself. */ + zfree(c->bufAsync); if (c->name) decrRefCount(c->name); zfree(c->argv); freeClientMultiState(c); sdsfree(c->peerid); + ulock.unlock(); + fastlock_free(&c->lock); zfree(c); } @@ -1058,18 +1347,27 @@ void freeClient(client *c) { * should be valid for the continuation of the flow of the program. 
*/ void freeClientAsync(client *c) { if (c->flags & CLIENT_CLOSE_ASAP || c->flags & CLIENT_LUA) return; - c->flags |= CLIENT_CLOSE_ASAP; + AeLocker lock; + lock.arm(nullptr); + std::lock_guardlock)> clientlock(c->lock); + c->flags |= CLIENT_CLOSE_ASAP; listAddNodeTail(server.clients_to_close,c); } -void freeClientsInAsyncFreeQueue(void) { - while (listLength(server.clients_to_close)) { - listNode *ln = listFirst(server.clients_to_close); - client *c = listNodeValue(ln); +void freeClientsInAsyncFreeQueue(int iel) { + listIter li; + listNode *ln; + listRewind(server.clients_to_close,&li); + + while((ln = listNext(&li))) { + client *c = (client*)listNodeValue(ln); + if (c->iel != iel) + continue; // wrong thread c->flags &= ~CLIENT_CLOSE_ASAP; freeClient(c); listDelNode(server.clients_to_close,ln); + listRewind(server.clients_to_close,&li); } } @@ -1078,7 +1376,7 @@ void freeClientsInAsyncFreeQueue(void) { * are not registered clients. */ client *lookupClientByID(uint64_t id) { id = htonu64(id); - client *c = raxFind(server.clients_index,(unsigned char*)&id,sizeof(id)); + client *c = (client*)raxFind(server.clients_index,(unsigned char*)&id,sizeof(id)); return (c == raxNotFound) ? NULL : c; } @@ -1086,12 +1384,15 @@ client *lookupClientByID(uint64_t id) { * is still valid after the call, C_ERR if it was freed. 
*/ int writeToClient(int fd, client *c, int handler_installed) { ssize_t nwritten = 0, totwritten = 0; - size_t objlen; clientReplyBlock *o; + AssertCorrectThread(c); + std::unique_locklock)> lock(c->lock); + while(clientHasPendingReplies(c)) { if (c->bufpos > 0) { nwritten = write(fd,c->buf+c->sentlen,c->bufpos-c->sentlen); + if (nwritten <= 0) break; c->sentlen += nwritten; totwritten += nwritten; @@ -1103,27 +1404,27 @@ int writeToClient(int fd, client *c, int handler_installed) { c->sentlen = 0; } } else { - o = listNodeValue(listFirst(c->reply)); - objlen = o->used; - - if (objlen == 0) { + o = (clientReplyBlock*)listNodeValue(listFirst(c->reply)); + if (o->used == 0) { c->reply_bytes -= o->size; listDelNode(c->reply,listFirst(c->reply)); continue; } - nwritten = write(fd, o->buf + c->sentlen, objlen - c->sentlen); - if (nwritten <= 0) break; + nwritten = write(fd, o->buf() + c->sentlen, o->used - c->sentlen); + if (nwritten <= 0) + break; + c->sentlen += nwritten; totwritten += nwritten; - + /* If we fully sent the object on head go to the next one */ - if (c->sentlen == objlen) { + if (c->sentlen == o->used) { c->reply_bytes -= o->size; listDelNode(c->reply,listFirst(c->reply)); c->sentlen = 0; /* If there are no longer objects in the list, we expect - * the count of reply bytes to be exactly zero. */ + * the count of reply bytes to be exactly zero. 
*/ if (listLength(c->reply) == 0) serverAssert(c->reply_bytes == 0); } @@ -1145,14 +1446,26 @@ int writeToClient(int fd, client *c, int handler_installed) { zmalloc_used_memory() < server.maxmemory) && !(c->flags & CLIENT_SLAVE)) break; } - server.stat_net_output_bytes += totwritten; + + __atomic_fetch_add(&server.stat_net_output_bytes, totwritten, __ATOMIC_RELAXED); if (nwritten == -1) { if (errno == EAGAIN) { nwritten = 0; } else { serverLog(LL_VERBOSE, "Error writing to client: %s", strerror(errno)); - freeClient(c); + lock.unlock(); + if (aeTryAcquireLock()) + { + freeClient(c); + aeReleaseLock(); + } + else + { + lock.unlock(); + freeClientAsync(c); + } + return C_ERR; } } @@ -1165,11 +1478,21 @@ int writeToClient(int fd, client *c, int handler_installed) { } if (!clientHasPendingReplies(c)) { c->sentlen = 0; - if (handler_installed) aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); + if (handler_installed) aeDeleteFileEvent(server.rgthreadvar[c->iel].el,c->fd,AE_WRITABLE); /* Close connection after entire reply has been sent. */ if (c->flags & CLIENT_CLOSE_AFTER_REPLY) { - freeClient(c); + lock.unlock(); + if (aeTryAcquireLock()) + { + freeClient(c); + aeReleaseLock(); + } + else + { + lock.unlock(); + freeClientAsync(c); + } return C_ERR; } } @@ -1178,37 +1501,99 @@ int writeToClient(int fd, client *c, int handler_installed) { /* Write event handler. Just send data to the client. 
*/ void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) { - UNUSED(el); UNUSED(mask); - writeToClient(fd,privdata,1); + client *c = (client*)privdata; + + serverAssert(ielFromEventLoop(el) == c->iel); + writeToClient(fd,c,1); +} + +void ProcessPendingAsyncWrites() +{ + serverAssert(aeThreadOwnsLock()); + + while(listLength(serverTL->clients_pending_asyncwrite)) { + client *c = (client*)listNodeValue(listFirst(serverTL->clients_pending_asyncwrite)); + listDelNode(serverTL->clients_pending_asyncwrite, listFirst(serverTL->clients_pending_asyncwrite)); + std::lock_guardlock)> lock(c->lock); + + serverAssert(c->fPendingAsyncWrite); + + // TODO: Append to end of reply block? + + size_t size = c->bufposAsync; + clientReplyBlock *reply = (clientReplyBlock*)zmalloc(size + sizeof(clientReplyBlock), MALLOC_LOCAL); + /* take over the allocation's internal fragmentation */ + reply->size = zmalloc_usable(reply) - sizeof(clientReplyBlock); + reply->used = c->bufposAsync; + memcpy(reply->buf(), c->bufAsync, c->bufposAsync); + listAddNodeTail(c->reply, reply); + c->reply_bytes += reply->size; + + c->bufposAsync = 0; + c->buflenAsync = 0; + zfree(c->bufAsync); + c->bufAsync = nullptr; + c->fPendingAsyncWrite = FALSE; + + // Now install the write event handler + int ae_flags = AE_WRITABLE|AE_WRITE_THREADSAFE; + /* For the fsync=always policy, we want that a given FD is never + * served for reading and writing in the same event loop iteration, + * so that in the middle of receiving the query, and serving it + * to the client, we'll call beforeSleep() that will do the + * actual fsync of AOF to disk. AE_BARRIER ensures that. 
*/ + if (server.aof_state == AOF_ON && + server.aof_fsync == AOF_FSYNC_ALWAYS) + { + ae_flags |= AE_BARRIER; + } + + if (!((c->replstate == REPL_STATE_NONE || + (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack)))) + continue; + + asyncCloseClientOnOutputBufferLimitReached(c); + if (aeCreateRemoteFileEvent(server.rgthreadvar[c->iel].el, c->fd, ae_flags, sendReplyToClient, c, FALSE) == AE_ERR) + continue; // We can retry later in the cron + } } /* This function is called just before entering the event loop, in the hope * we can just write the replies to the client output buffer without any * need to use a syscall in order to install the writable event handler, * get it called, and so forth. */ -int handleClientsWithPendingWrites(void) { +int handleClientsWithPendingWrites(int iel) { listIter li; listNode *ln; - int processed = listLength(server.clients_pending_write); - listRewind(server.clients_pending_write,&li); + list *list = server.rgthreadvar[iel].clients_pending_write; + int processed = listLength(list); + serverAssert(iel == (serverTL - server.rgthreadvar)); + + listRewind(list,&li); while((ln = listNext(&li))) { - client *c = listNodeValue(ln); + client *c = (client*)listNodeValue(ln); + std::unique_locklock)> lock(c->lock); + c->flags &= ~CLIENT_PENDING_WRITE; - listDelNode(server.clients_pending_write,ln); + listDelNode(list,ln); + AssertCorrectThread(c); /* If a client is protected, don't do anything, * that may trigger write error or recreate handler. */ if (c->flags & CLIENT_PROTECTED) continue; /* Try to write buffers to the client socket. */ - if (writeToClient(c->fd,c,0) == C_ERR) continue; + if (writeToClient(c->fd,c,0) == C_ERR) { + lock.release(); // client is free'd + continue; + } /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. 
*/ if (clientHasPendingReplies(c)) { - int ae_flags = AE_WRITABLE; + int ae_flags = AE_WRITABLE|AE_WRITE_THREADSAFE; /* For the fsync=always policy, we want that a given FD is never * served for reading and writing in the same event loop iteration, * so that in the middle of receiving the query, and serving it @@ -1219,13 +1604,16 @@ int handleClientsWithPendingWrites(void) { { ae_flags |= AE_BARRIER; } - if (aeCreateFileEvent(server.el, c->fd, ae_flags, - sendReplyToClient, c) == AE_ERR) - { - freeClientAsync(c); - } + + if (aeCreateFileEvent(server.rgthreadvar[c->iel].el, c->fd, ae_flags, sendReplyToClient, c) == AE_ERR) + freeClientAsync(c); } } + + AeLocker locker; + locker.arm(nullptr); + ProcessPendingAsyncWrites(); + return processed; } @@ -1268,15 +1656,17 @@ void resetClient(client *c) { * path, it is not really released, but only marked for later release. */ void protectClient(client *c) { c->flags |= CLIENT_PROTECTED; - aeDeleteFileEvent(server.el,c->fd,AE_READABLE); - aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); + AssertCorrectThread(c); + aeDeleteFileEvent(server.rgthreadvar[c->iel].el,c->fd,AE_READABLE); + aeDeleteFileEvent(server.rgthreadvar[c->iel].el,c->fd,AE_WRITABLE); } /* This will undo the client protection done by protectClient() */ void unprotectClient(client *c) { + AssertCorrectThread(c); if (c->flags & CLIENT_PROTECTED) { c->flags &= ~CLIENT_PROTECTED; - aeCreateFileEvent(server.el,c->fd,AE_READABLE,readQueryFromClient,c); + aeCreateFileEvent(server.rgthreadvar[c->iel].el,c->fd,AE_READABLE|AE_READ_THREADSAFE,readQueryFromClient,c); if (clientHasPendingReplies(c)) clientInstallWriteHandler(c); } } @@ -1333,7 +1723,7 @@ int processInlineBuffer(client *c) { /* Setup argv array on client structure */ if (argc) { if (c->argv) zfree(c->argv); - c->argv = zmalloc(sizeof(robj*)*argc, MALLOC_LOCAL); + c->argv = (robj**)zmalloc(sizeof(robj*)*argc, MALLOC_LOCAL); } /* Create redis objects for all arguments. 
*/ @@ -1431,7 +1821,7 @@ int processMultibulkBuffer(client *c) { /* Setup argv array on client structure */ if (c->argv) zfree(c->argv); - c->argv = zmalloc(sizeof(robj*)*c->multibulklen, MALLOC_LOCAL); + c->argv = (robj**)zmalloc(sizeof(robj*)*c->multibulklen, MALLOC_LOCAL); } serverAssertWithInfo(c,NULL,c->multibulklen > 0); @@ -1530,8 +1920,9 @@ int processMultibulkBuffer(client *c) { * or because a client was blocked and later reactivated, so there could be * pending query buffer, already representing a full command, to process. */ void processInputBuffer(client *c) { - server.current_client = c; - + AssertCorrectThread(c); + bool fFreed = false; + /* Keep processing while there is something in the input buffer */ while(c->qb_pos < sdslen(c->querybuf)) { /* Return if clients are paused. */ @@ -1574,6 +1965,10 @@ void processInputBuffer(client *c) { if (c->argc == 0) { resetClient(c); } else { + AeLocker locker; + locker.arm(c); + server.current_client = c; + /* Only reset the client when the command was executed. */ if (processCommand(c) == C_OK) { if (c->flags & CLIENT_MASTER && !(c->flags & CLIENT_MULTI)) { @@ -1591,17 +1986,19 @@ void processInputBuffer(client *c) { /* freeMemoryIfNeeded may flush slave output buffers. This may * result into a slave, that may be the active client, to be * freed. 
*/ - if (server.current_client == NULL) break; + if (server.current_client == NULL) { + fFreed = true; + break; + } + server.current_client = NULL; } } /* Trim to pos */ - if (server.current_client != NULL && c->qb_pos) { + if (!fFreed && c->qb_pos) { sdsrange(c->querybuf,c->qb_pos,-1); c->qb_pos = 0; } - - server.current_client = NULL; } /* This is a wrapper for processInputBuffer that also cares about handling @@ -1616,8 +2013,10 @@ void processInputBufferAndReplicate(client *c) { processInputBuffer(c); size_t applied = c->reploff - prev_offset; if (applied) { + aeAcquireLock(); replicationFeedSlavesFromMasterStream(server.slaves, c->pending_querybuf, applied); + aeReleaseLock(); sdsrange(c->pending_querybuf,applied,-1); } } @@ -1629,6 +2028,14 @@ void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) { size_t qblen; UNUSED(el); UNUSED(mask); + serverAssert(mask & AE_READ_THREADSAFE); + serverAssert(c->iel == ielFromEventLoop(el)); + + AeLocker aelock; + AssertCorrectThread(c); + std::unique_locklock)> lock(c->lock, std::defer_lock); + if (!lock.try_lock()) + return; // Process something else while we wait readlen = PROTO_IOBUF_LEN; /* If this is a multi bulk request, and we are processing a bulk reply @@ -1650,17 +2057,23 @@ void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) { qblen = sdslen(c->querybuf); if (c->querybuf_peak < qblen) c->querybuf_peak = qblen; c->querybuf = sdsMakeRoomFor(c->querybuf, readlen); + nread = read(fd, c->querybuf+qblen, readlen); + if (nread == -1) { if (errno == EAGAIN) { return; } else { serverLog(LL_VERBOSE, "Reading from client: %s",strerror(errno)); + lock.unlock(); + aelock.arm(nullptr); freeClient(c); return; } } else if (nread == 0) { serverLog(LL_VERBOSE, "Client closed connection"); + lock.unlock(); + aelock.arm(nullptr); freeClient(c); return; } else if (c->flags & CLIENT_MASTER) { @@ -1682,6 +2095,8 @@ void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) 
{ serverLog(LL_WARNING,"Closing client that reached max query buffer length: %s (qbuf initial bytes: %s)", ci, bytes); sdsfree(ci); sdsfree(bytes); + lock.unlock(); + aelock.arm(nullptr); freeClient(c); return; } @@ -1693,6 +2108,8 @@ void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) { * corresponding part of the replication stream, will be propagated to * the sub-slaves and to the replication backlog. */ processInputBufferAndReplicate(c); + aelock.arm(nullptr); + ProcessPendingAsyncWrites(); } void getClientsMaxBuffers(unsigned long *longest_output_list, @@ -1704,7 +2121,7 @@ void getClientsMaxBuffers(unsigned long *longest_output_list, listRewind(server.clients,&li); while ((ln = listNext(&li)) != NULL) { - c = listNodeValue(ln); + c = (client*)listNodeValue(ln); if (listLength(c->reply) > lol) lol = listLength(c->reply); if (sdslen(c->querybuf) > bib) bib = sdslen(c->querybuf); @@ -1775,7 +2192,7 @@ sds catClientInfoString(sds s, client *client) { if (p == flags) *p++ = 'N'; *p++ = '\0'; - emask = client->fd == -1 ? 0 : aeGetFileEvents(server.el,client->fd); + emask = client->fd == -1 ? 
0 : aeGetFileEvents(server.rgthreadvar[client->iel].el,client->fd); p = events; if (emask & AE_READABLE) *p++ = 'r'; if (emask & AE_WRITABLE) *p++ = 'w'; @@ -1810,7 +2227,7 @@ sds getAllClientsInfoString(int type) { sdsclear(o); listRewind(server.clients,&li); while ((ln = listNext(&li)) != NULL) { - client = listNodeValue(ln); + client = reinterpret_cast(listNodeValue(ln)); if (type != -1 && getClientType(client) != type) continue; o = catClientInfoString(o,client); o = sdscatlen(o,"\n",1); @@ -1823,7 +2240,7 @@ void clientCommand(client *c) { listIter li; client *client; - if (c->argc == 2 && !strcasecmp(ptrFromObj(c->argv[1]),"help")) { + if (c->argc == 2 && !strcasecmp((const char*)ptrFromObj(c->argv[1]),"help")) { const char *help[] = { "id -- Return the ID of the current connection.", "getname -- Return the name of the current connection.", @@ -1841,14 +2258,14 @@ void clientCommand(client *c) { NULL }; addReplyHelp(c, help); - } else if (!strcasecmp(ptrFromObj(c->argv[1]),"id") && c->argc == 2) { + } else if (!strcasecmp((const char*)ptrFromObj(c->argv[1]),"id") && c->argc == 2) { /* CLIENT ID */ addReplyLongLong(c,c->id); - } else if (!strcasecmp(ptrFromObj(c->argv[1]),"list")) { + } else if (!strcasecmp((const char*)ptrFromObj(c->argv[1]),"list")) { /* CLIENT LIST */ int type = -1; - if (c->argc == 4 && !strcasecmp(ptrFromObj(c->argv[2]),"type")) { - type = getClientTypeByName(ptrFromObj(c->argv[3])); + if (c->argc == 4 && !strcasecmp((const char*)ptrFromObj(c->argv[2]),"type")) { + type = getClientTypeByName((char*)ptrFromObj(c->argv[3])); if (type == -1) { addReplyErrorFormat(c,"Unknown client type '%s'", (char*) ptrFromObj(c->argv[3])); @@ -1861,21 +2278,21 @@ NULL sds o = getAllClientsInfoString(type); addReplyBulkCBuffer(c,o,sdslen(o)); sdsfree(o); - } else if (!strcasecmp(ptrFromObj(c->argv[1]),"reply") && c->argc == 3) { + } else if (!strcasecmp((const char*)ptrFromObj(c->argv[1]),"reply") && c->argc == 3) { /* CLIENT REPLY ON|OFF|SKIP */ - if 
(!strcasecmp(ptrFromObj(c->argv[2]),"on")) { + if (!strcasecmp((const char*)ptrFromObj(c->argv[2]),"on")) { c->flags &= ~(CLIENT_REPLY_SKIP|CLIENT_REPLY_OFF); addReply(c,shared.ok); - } else if (!strcasecmp(ptrFromObj(c->argv[2]),"off")) { + } else if (!strcasecmp((const char*)ptrFromObj(c->argv[2]),"off")) { c->flags |= CLIENT_REPLY_OFF; - } else if (!strcasecmp(ptrFromObj(c->argv[2]),"skip")) { + } else if (!strcasecmp((const char*)ptrFromObj(c->argv[2]),"skip")) { if (!(c->flags & CLIENT_REPLY_OFF)) c->flags |= CLIENT_REPLY_SKIP_NEXT; } else { addReply(c,shared.syntaxerr); return; } - } else if (!strcasecmp(ptrFromObj(c->argv[1]),"kill")) { + } else if (!strcasecmp((const char*)ptrFromObj(c->argv[1]),"kill")) { /* CLIENT KILL * CLIENT KILL