diff --git a/.gitignore b/.gitignore
index 2b2e15eba..f1c0ecf4b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .*.swp
+core
 *.o
 *.log
 dump.rdb
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 6fac65a3d..56bf76d11 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,6 +1,56 @@
 {
     "files.associations": {
         "zmalloc.h": "c",
-        "stat.h": "c"
+        "stat.h": "c",
+        "array": "cpp",
+        "atomic": "cpp",
+        "*.tcc": "cpp",
+        "cctype": "cpp",
+        "chrono": "cpp",
+        "clocale": "cpp",
+        "cmath": "cpp",
+        "condition_variable": "cpp",
+        "cstdarg": "cpp",
+        "cstddef": "cpp",
+        "cstdint": "cpp",
+        "cstdio": "cpp",
+        "cstdlib": "cpp",
+        "cstring": "cpp",
+        "ctime": "cpp",
+        "cwchar": "cpp",
+        "cwctype": "cpp",
+        "deque": "cpp",
+        "list": "cpp",
+        "unordered_map": "cpp",
+        "vector": "cpp",
+        "exception": "cpp",
+        "fstream": "cpp",
+        "functional": "cpp",
+        "future": "cpp",
+        "initializer_list": "cpp",
+        "iomanip": "cpp",
+        "iosfwd": "cpp",
+        "iostream": "cpp",
+        "istream": "cpp",
+        "limits": "cpp",
+        "memory": "cpp",
+        "mutex": "cpp",
+        "new": "cpp",
+        "numeric": "cpp",
+        "optional": "cpp",
+        "ostream": "cpp",
+        "ratio": "cpp",
+        "scoped_allocator": "cpp",
+        "sstream": "cpp",
+        "stdexcept": "cpp",
+        "streambuf": "cpp",
+        "string_view": "cpp",
+        "system_error": "cpp",
+        "thread": "cpp",
+        "cinttypes": "cpp",
+        "tuple": "cpp",
+        "type_traits": "cpp",
+        "typeinfo": "cpp",
+        "utility": "cpp"
     }
-}
\ No newline at end of file
+}
diff --git a/redis.conf b/redis.conf
index 2d0de9610..356dbcca8 100644
--- a/redis.conf
+++ b/redis.conf
@@ -291,6 +291,17 @@ dir ./
 # refuse the replica request.
 #
 # masterauth <master-password>
+#
+# However this is not enough if you are using Redis ACLs (for Redis version
+# 6 or greater), and the default user is not capable of running the PSYNC
+# command and/or other commands needed for replication. In this case it's
+# better to configure a special user to use with replication, and specify the
+# masteruser configuration as such:
+#
+# masteruser <username>
+#
+# When masteruser is specified, the replica will authenticate against its
+# master using the new AUTH form: AUTH <username> <password>.
 
 # When a replica loses its connection with the master, or when the replication
 # is still in progress, the replica can act in two different ways:
@@ -501,6 +512,94 @@ replica-priority 100
 # can be easily a long string from /dev/urandom or whatever, so by using a
 # long and unguessable password no brute force attack will be possible.
 
+# Redis ACL users are defined in the following format:
+#
+#   user <username> ... acl rules ...
+#
+# For example:
+#
+#   user worker +@list +@connection ~jobs:* on >ffa9203c493aa99
+#
+# The special username "default" is used for new connections. If this user
+# has the "nopass" rule, then new connections will be immediately authenticated
+# as the "default" user without the need of any password provided via the
+# AUTH command. Otherwise if the "default" user is not flagged with "nopass"
+# the connections will start in not authenticated state, and will require
+# AUTH (or the HELLO command AUTH option) in order to be authenticated and
+# start to work.
+#
+# The ACL rules that describe what a user can do are the following:
+#
+#  on           Enable the user: it is possible to authenticate as this user.
+#  off          Disable the user: it's no longer possible to authenticate
+#               with this user, however the already authenticated connections
+#               will still work.
+#  +<command>   Allow the execution of that command
+#  -<command>   Disallow the execution of that command
+#  +@<category> Allow the execution of all the commands in such category.
+#               Valid categories are @admin, @set, @sortedset, ...
+#               and so forth, see the full list in the server.c file where
+#               the Redis command table is described and defined.
+#               The special category @all means all the commands, both those
+#               currently present in the server and those that will be loaded
+#               in the future via modules.
+#  +<command>|subcommand    Allow a specific subcommand of an otherwise
+#                           disabled command. Note that this form is not
+#                           allowed as negative like -DEBUG|SEGFAULT, but
+#                           only additive starting with "+".
+#  allcommands  Alias for +@all. Note that it implies the ability to execute
+#               all the future commands loaded via the modules system.
+#  nocommands   Alias for -@all.
+#  ~<pattern>   Add a pattern of keys that can be mentioned as part of
+#               commands. For instance ~* allows all the keys. The pattern
+#               is a glob-style pattern like the one of KEYS.
+#               It is possible to specify multiple patterns.
+#  allkeys      Alias for ~*
+#  resetkeys    Flush the list of allowed keys patterns.
+#  ><password>  Add this password to the list of valid passwords for the user.
+#               For example >mypass will add "mypass" to the list.
+#               This directive clears the "nopass" flag (see later).
+#  <<password>  Remove this password from the list of valid passwords.
+#  nopass       All the set passwords of the user are removed, and the user
+#               is flagged as requiring no password: it means that every
+#               password will work against this user. If this directive is
+#               used for the default user, every new connection will be
+#               immediately authenticated with the default user without
+#               any explicit AUTH command required. Note that the "resetpass"
+#               directive will clear this condition.
+#  resetpass    Flush the list of allowed passwords. Moreover removes the
+#               "nopass" status. After "resetpass" the user has no associated
+#               passwords and there is no way to authenticate without adding
+#               some password (or setting it as "nopass" later).
+#  reset        Performs the following actions: resetpass, resetkeys, off,
+#               -@all. The user returns to the same state it has immediately
+#               after its creation.
+#
+# ACL rules can be specified in any order: for instance you can start with
+# passwords, then flags, or key patterns. However note that the additive
+# and subtractive rules will CHANGE MEANING depending on the ordering.
+# For instance see the following example:
+#
+#   user alice on +@all -DEBUG ~* >somepassword
+#
+# This will allow "alice" to use all the commands with the exception of the
+# DEBUG command, since +@all added all the commands to the set of the commands
+# alice can use, and later DEBUG was removed. However if we invert the order
+# of two ACL rules the result will be different:
+#
+#   user alice on -DEBUG +@all ~* >somepassword
+#
+# Now DEBUG was removed when alice had yet no commands in the set of allowed
+# commands, later all the commands are added, so the user will be able to
+# execute everything.
+#
+# Basically ACL rules are processed left-to-right.
+#
+# For more information about ACL configuration please refer to
+# the Redis web site at https://redis.io/topics/acl
+
+# Using an external ACL file
+#
 # Instead of configuring users here in this file, it is possible to use
 # a stand-alone file just listing users. The two methods cannot be mixed:
 # if you configure users here and at the same time you activate the exteranl
@@ -1399,3 +1498,8 @@ rdb-save-incremental-fsync yes
 # reduces memory requirements by storing rarely accessed data on disk 
 # instead of RAM.  A temporary file will be created in this directory.
 # scratch-file-path /tmp/
+
+# Number of worker threads serving requests.  This number should be related to the performance
+# of your network hardware, not the number of cores on your machine.  We don't recommend going
+# above 4 at this time.  By default this is set to 1.
+server-threads 2
diff --git a/src/Makefile b/src/Makefile
index b2b5f5833..4258f47a5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -21,7 +21,7 @@ NODEPS:=clean distclean
 
 # Default settings
 STD=-std=c99 -pedantic -DREDIS_STATIC=''
-CXX_STD=-std=c++14 -pedantic
+CXX_STD=-std=c++14 -pedantic -fno-rtti
 ifneq (,$(findstring clang,$(CC)))
 ifneq (,$(findstring FreeBSD,$(uname_S)))
   STD+=-Wno-c11-extensions
@@ -39,7 +39,7 @@ MALLOC=libc
 ifneq ($(uname_M),armv6l)
 ifneq ($(uname_M),armv7l)
 ifeq ($(uname_S),Linux)
-	MALLOC=memkind
+	MALLOC=jemalloc
 endif
 endif
 endif
@@ -134,23 +134,27 @@ FINAL_CXXFLAGS+= -I../deps/hiredis -I../deps/linenoise -I../deps/lua/src
 
 ifeq ($(MALLOC),tcmalloc)
 	FINAL_CFLAGS+= -DUSE_TCMALLOC
+	FINAL_CXXFLAGS+= -DUSE_TCMALLOC
 	FINAL_LIBS+= -ltcmalloc
 endif
 
 ifeq ($(MALLOC),tcmalloc_minimal)
 	FINAL_CFLAGS+= -DUSE_TCMALLOC
+	FINAL_CXXFLAGS+= -DUSE_TCMALLOC
 	FINAL_LIBS+= -ltcmalloc_minimal
 endif
 
 ifeq ($(MALLOC),jemalloc)
 	DEPENDENCY_TARGETS+= jemalloc
 	FINAL_CFLAGS+= -DUSE_JEMALLOC -I../deps/jemalloc/include
+	FINAL_CXXFLAGS+= -DUSE_JEMALLOC -I../deps/jemalloc/include
 	FINAL_LIBS := ../deps/jemalloc/lib/libjemalloc.a $(FINAL_LIBS)
 endif
 
 ifeq ($(MALLOC),memkind)
 	DEPENDENCY_TARGETS+= memkind
 	FINAL_CFLAGS+= -DUSE_MEMKIND -I../deps/memkind/src/include
+	FINAL_CXXFLAGS+= -DUSE_MEMKIND -I../deps/memkind/src/include
 	FINAL_LIBS := ../deps/memkind/src/.libs/libmemkind.a -lnuma $(FINAL_LIBS)
 endif
 
diff --git a/src/acl.c b/src/acl.c
index 42cd0c734..b5b9f46a7 100644
--- a/src/acl.c
+++ b/src/acl.c
@@ -28,6 +28,7 @@
  */
 
 #include "server.h"
+#include <fcntl.h>
 
 /* =============================================================================
  * Global state for ACLs
@@ -90,6 +91,7 @@ struct ACLUserFlag {
 
 void ACLResetSubcommandsForCommand(user *u, unsigned long id);
 void ACLResetSubcommands(user *u);
+void ACLAddAllowedSubcommand(user *u, unsigned long id, const char *sub);
 
 /* =============================================================================
  * Helper functions for the rest of the ACL implementation
@@ -163,6 +165,11 @@ void ACLListFreeSds(void *item) {
     sdsfree(item);
 }
 
+/* Method to duplicate list elements from ACL users password/patterns lists. */
+void *ACLListDupSds(void *item) {
+    return sdsdup(item);
+}
+
 /* Create a new user with the specified name, store it in the list
  * of users (the Users global radix tree), and returns a reference to
  * the structure representing the user.
@@ -178,13 +185,32 @@ user *ACLCreateUser(const char *name, size_t namelen) {
     u->patterns = listCreate();
     listSetMatchMethod(u->passwords,ACLListMatchSds);
     listSetFreeMethod(u->passwords,ACLListFreeSds);
+    listSetDupMethod(u->passwords,ACLListDupSds);
     listSetMatchMethod(u->patterns,ACLListMatchSds);
     listSetFreeMethod(u->patterns,ACLListFreeSds);
+    listSetDupMethod(u->patterns,ACLListDupSds);
     memset(u->allowed_commands,0,sizeof(u->allowed_commands));
     raxInsert(Users,(unsigned char*)name,namelen,u,NULL);
     return u;
 }
 
+/* This function should be called when we need an unlinked "fake" user
+ * we can use in order to validate ACL rules or for other similar reasons.
+ * The user will not get linked to the Users radix tree. The returned
+ * user should be released with ACLFreeUser() as usual. */
+user *ACLCreateUnlinkedUser(void) {
+    char username[64];
+    for (int j = 0; ; j++) {
+        snprintf(username,sizeof(username),"__fakeuser:%d__",j);
+        user *fakeuser = ACLCreateUser(username,strlen(username));
+        if (fakeuser == NULL) continue; /* Presumably name in use: try next. */
+        int retval = raxRemove(Users,(unsigned char*) username,
+                               strlen(username),NULL); /* Unlink from Users. */
+        serverAssert(retval != 0);
+        return fakeuser;
+    }
+}
+
 /* Release the memory used by the user structure. Note that this function
  * will not remove the user from the Users global radix tree. */
 void ACLFreeUser(user *u) {
@@ -195,6 +221,62 @@ void ACLFreeUser(user *u) {
     zfree(u);
 }
 
+/* When a user is deleted we need to cycle the active
+ * connections in order to kill all the pending ones that
+ * are authenticated with such user. */
+void ACLFreeUserAndKillClients(user *u) {
+    listIter li;
+    listNode *ln;
+    listRewind(server.clients,&li);
+    while ((ln = listNext(&li)) != NULL) {
+        client *c = listNodeValue(ln);
+        if (c->puser == u) {
+            /* We'll free the connection asynchronously, so
+             * in theory to set a different user is not needed.
+             * However if there are bugs in Redis, sooner or later
+             * this may result in some security hole: it's much
+             * more defensive to set the default user and put
+             * it in non authenticated mode. */
+            c->puser = DefaultUser;
+            c->authenticated = 0;
+            freeClientAsync(c);
+        }
+    }
+    ACLFreeUser(u);
+}
+
+/* Copy the user ACL rules from the source user 'src' to the destination
+ * user 'dst' so that at the end of the process they'll have exactly the
+ * same rules (but the names will continue to be the original ones). */
+void ACLCopyUser(user *dst, user *src) {
+    listRelease(dst->passwords); /* dst's own lists are replaced by copies. */
+    listRelease(dst->patterns);
+    dst->passwords = listDup(src->passwords);
+    dst->patterns = listDup(src->patterns);
+    memcpy(dst->allowed_commands,src->allowed_commands,
+           sizeof(dst->allowed_commands));
+    dst->flags = src->flags;
+    ACLResetSubcommands(dst);
+    /* Copy the allowed subcommands: an array (indexed by command ID) of
+     * NULL-terminated arrays of SDS strings. */
+    if (src->allowed_subcommands) {
+        for (int j = 0; j < USER_COMMAND_BITS_COUNT; j++) {
+            if (src->allowed_subcommands[j]) {
+                for (int i = 0; src->allowed_subcommands[j][i]; i++)
+                {
+                    ACLAddAllowedSubcommand(dst, j,
+                        src->allowed_subcommands[j][i]);
+                }
+            }
+        }
+    }
+}
+
+/* Free all the users registered in the radix tree 'users' (killing the clients
+ * authenticated with them, if any) and free the radix tree itself. */
+void ACLFreeUsersSet(rax *users) {
+    raxFreeWithCallback(users,(void(*)(void*))ACLFreeUserAndKillClients);
+}
+
 /* Given a command ID, this function set by reference 'word' and 'bit'
  * so that user->allowed_commands[word] will address the right word
  * where the corresponding bit for the provided ID is stored, and
@@ -256,6 +338,7 @@ int ACLSetUserCommandBitsForCategory(user *u, const char *category, int value) {
     dictEntry *de;
     while ((de = dictNext(di)) != NULL) {
         struct redisCommand *cmd = dictGetVal(de);
+        if (cmd->flags & CMD_MODULE) continue; /* Ignore modules commands. */
         if (cmd->flags & cflag) {
             ACLSetUserCommandBit(u,cmd->id,value);
             ACLResetSubcommandsForCommand(u,cmd->id);
@@ -579,6 +662,7 @@ void ACLAddAllowedSubcommand(user *u, unsigned long id, const char *sub) {
  *         fully added.
  * EEXIST: You are adding a key pattern after "*" was already added. This is
  *         almost surely an error on the user side.
+ * ENODEV: The password you are trying to remove from the user does not exist.
  */
 int ACLSetUser(user *u, const char *op, ssize_t oplen) {
     if (oplen == -1) oplen = strlen(op);
@@ -623,8 +707,13 @@ int ACLSetUser(user *u, const char *op, ssize_t oplen) {
     } else if (op[0] == '<') {
         sds delpass = sdsnewlen(op+1,oplen-1);
         listNode *ln = listSearchKey(u->passwords,delpass);
-        if (ln) listDelNode(u->passwords,ln);
         sdsfree(delpass);
+        if (ln) {
+            listDelNode(u->passwords,ln);
+        } else {
+            errno = ENODEV;
+            return C_ERR;
+        }
     } else if (op[0] == '~') {
         if (u->flags & USER_FLAG_ALLKEYS) {
             errno = EEXIST;
@@ -728,6 +817,9 @@ char *ACLSetUserStringError(void) {
                  "'allkeys' flag) is not valid and does not have any "
                  "effect. Try 'resetkeys' to start with an empty "
                  "list of patterns";
+    else if (errno == ENODEV)
+        errmsg = "The password you are trying to remove from the user does "
+                 "not exist";
     return errmsg;
 }
 
@@ -741,10 +833,9 @@ sds ACLDefaultUserFirstPassword(void) {
     return listNodeValue(first);
 }
 
-/* Initialization of the ACL subsystem. */
-void ACLInit(void) {
-    Users = raxNew();
-    UsersToLoad = listCreate();
+/* Initialize the default user, that will always exist for all the process
+ * lifetime. */
+void ACLInitDefaultUser(void) {
     DefaultUser = ACLCreateUser("default",7);
     ACLSetUser(DefaultUser,"+@all",-1);
     ACLSetUser(DefaultUser,"~*",-1);
@@ -752,6 +843,13 @@ void ACLInit(void) {
     ACLSetUser(DefaultUser,"nopass",-1);
 }
 
+/* Initialization of the ACL subsystem: users tree, load list, default user. */
+void ACLInit(void) {
+    Users = raxNew();
+    UsersToLoad = listCreate();
+    ACLInitDefaultUser();
+}
+
 /* Check the username and password pair and return C_OK if they are valid,
  * otherwise C_ERR is returned and errno is set to:
  *
@@ -944,11 +1042,7 @@ int ACLAppendUserForLoading(sds *argv, int argc, int *argc_err) {
 
     /* Try to apply the user rules in a fake user to see if they
      * are actually valid. */
-    char *funame = "__fakeuser__";
-    user *fakeuser = ACLCreateUser(funame,strlen(funame));
-    serverAssert(fakeuser != NULL);
-    int retval = raxRemove(Users,(unsigned char*) funame,strlen(funame),NULL);
-    serverAssert(retval != 0);
+    user *fakeuser = ACLCreateUnlinkedUser();
 
     for (int j = 2; j < argc; j++) {
         if (ACLSetUser(fakeuser,argv[j],sdslen(argv[j])) == C_ERR) {
@@ -1009,15 +1103,275 @@ int ACLLoadConfiguredUsers(void) {
     return C_OK;
 }
 
+/* This function loads the ACL from the specified filename: every line
+ * is validated and should be either empty or in the format used to specify
+ * users in the redis.conf configuration or in the ACL file, that is:
+ *
+ *  user <username> ... rules ...
+ *
+ * Note that this function considers comments starting with '#' as errors
+ * because the ACL file is meant to be rewritten, and comments would be
+ * lost after the rewrite. Yet empty lines are allowed to avoid being too
+ * strict.
+ *
+ * One important part of implementing ACL LOAD, that uses this function, is
+ * to avoid ending with broken rules if the ACL file is invalid for some
+ * reason, so the function will attempt to validate the rules before loading
+ * each user. For every line that will be found broken the function will
+ * collect an error message.
+ *
+ * IMPORTANT: If there is at least a single error, nothing will be loaded
+ * and the rules will remain exactly as they were.
+ *
+ * At the end of the process, if no errors were found in the whole file then
+ * NULL is returned. Otherwise an SDS string describing in a single line
+ * all the issues found is returned. */
+sds ACLLoadFromFile(const char *filename) {
+    FILE *fp;
+    char buf[1024];
+
+    /* Open the ACL file. */
+    if ((fp = fopen(filename,"r")) == NULL) {
+        sds errors = sdscatprintf(sdsempty(),
+            "Error loading ACLs, opening file '%s': %s",
+            filename, strerror(errno));
+        return errors;
+    }
+
+    /* Load the whole file as a single string in memory. */
+    sds acls = sdsempty();
+    while(fgets(buf,sizeof(buf),fp) != NULL)
+        acls = sdscat(acls,buf);
+    fclose(fp);
+
+    /* Split the file into lines and attempt to load each line. */
+    int totlines;
+    sds *lines, errors = sdsempty();
+    lines = sdssplitlen(acls,strlen(acls),"\n",1,&totlines);
+    sdsfree(acls);
+
+    /* We need a fake user to validate the rules before making changes
+     * to the real user mentioned in the ACL line. */
+    user *fakeuser = ACLCreateUnlinkedUser();
+
+    /* We do all the loading in a fresh instance of the Users radix tree,
+     * so if there are errors loading the ACL file we can rollback to the
+     * old version. */
+    rax *old_users = Users;
+    user *old_default_user = DefaultUser;
+    Users = raxNew();
+    ACLInitDefaultUser();
+
+    /* Load each line of the file. */
+    for (int i = 0; i < totlines; i++) {
+        sds *argv;
+        int argc;
+        int linenum = i+1;
+
+        lines[i] = sdstrim(lines[i]," \t\r\n");
+
+        /* Skip blank lines */
+        if (lines[i][0] == '\0') continue;
+
+        /* Split into arguments */
+        argv = sdssplitargs(lines[i],&argc);
+        if (argv == NULL) {
+            errors = sdscatprintf(errors,
+                     "%s:%d: unbalanced quotes in acl line. ",
+                     server.acl_filename, linenum);
+            continue;
+        }
+
+        /* Skip this line if the resulting command vector is empty. */
+        if (argc == 0) {
+            sdsfreesplitres(argv,argc);
+            continue;
+        }
+
+        /* The line should start with the "user" keyword. */
+        if (strcmp(argv[0],"user") || argc < 2) {
+            errors = sdscatprintf(errors,
+                     "%s:%d should start with user keyword followed "
+                     "by the username. ", server.acl_filename,
+                     linenum);
+            sdsfreesplitres(argv,argc);
+            continue;
+        }
+
+        /* Try to process the line using the fake user to validate if
+         * the rules are able to apply cleanly. */
+        ACLSetUser(fakeuser,"reset",-1);
+        int j;
+        for (j = 2; j < argc; j++) {
+            if (ACLSetUser(fakeuser,argv[j],sdslen(argv[j])) != C_OK) {
+                char *errmsg = ACLSetUserStringError();
+                errors = sdscatprintf(errors,
+                         "%s:%d: %s. ",
+                         server.acl_filename, linenum, errmsg);
+                continue;
+            }
+        }
+
+        /* Apply the rule to the new users set only if so far there
+         * are no errors, otherwise it's useless since we are going
+         * to discard the new users set anyway. */
+        if (sdslen(errors) != 0) {
+            sdsfreesplitres(argv,argc);
+            continue;
+        }
+
+        /* We can finally lookup the user and apply the rule. If the
+         * user already exists we always reset it to start. */
+        user *u = ACLCreateUser(argv[1],sdslen(argv[1]));
+        if (!u) {
+            u = ACLGetUserByName(argv[1],sdslen(argv[1]));
+            serverAssert(u != NULL);
+            ACLSetUser(u,"reset",-1);
+        }
+
+        /* Note that the same rules already applied to the fake user, so
+         * we just assert that everything goes well: it should. */
+        for (j = 2; j < argc; j++)
+            serverAssert(ACLSetUser(u,argv[j],sdslen(argv[j])) == C_OK);
+
+        sdsfreesplitres(argv,argc);
+    }
+
+    ACLFreeUser(fakeuser);
+    sdsfreesplitres(lines,totlines);
+    DefaultUser = old_default_user; /* This pointer must never change. */
+
+    /* Check if we found errors and react accordingly. */
+    if (sdslen(errors) == 0) {
+        /* The default user pointer is referenced in different places: instead
+         * of replacing such occurrences it is much simpler to copy the new
+         * default user configuration in the old one. */
+        user *new = ACLGetUserByName("default",7);
+        serverAssert(new != NULL);
+        ACLCopyUser(DefaultUser,new);
+        ACLFreeUser(new);
+        raxInsert(Users,(unsigned char*)"default",7,DefaultUser,NULL);
+        raxRemove(old_users,(unsigned char*)"default",7,NULL);
+        ACLFreeUsersSet(old_users);
+        sdsfree(errors);
+        return NULL;
+    } else {
+        ACLFreeUsersSet(Users);
+        Users = old_users;
+        errors = sdscat(errors,"WARNING: ACL errors detected, no change to the previously active ACL rules was performed");
+        return errors;
+    }
+}
+
+/* Generate a copy of the ACLs currently in memory in the specified filename.
+ * Returns C_OK on success or C_ERR if there was an error during the I/O.
+ * When C_ERR is returned a log is produced with hints about the issue. */
+int ACLSaveToFile(const char *filename) {
+    sds acl = sdsempty();
+    int fd = -1;
+    sds tmpfilename = NULL;
+    int retval = C_ERR;
+
+    /* Let's generate an SDS string containing the new version of the
+     * ACL file. */
+    raxIterator ri;
+    raxStart(&ri,Users);
+    raxSeek(&ri,"^",NULL,0);
+    while(raxNext(&ri)) {
+        user *u = ri.data;
+        /* Return information in the configuration file format. */
+        sds user = sdsnew("user ");
+        user = sdscatsds(user,u->name);
+        user = sdscatlen(user," ",1);
+        sds descr = ACLDescribeUser(u);
+        user = sdscatsds(user,descr);
+        sdsfree(descr);
+        acl = sdscatsds(acl,user);
+        acl = sdscatlen(acl,"\n",1);
+        sdsfree(user);
+    }
+    raxStop(&ri);
+
+    /* Create a temp file with the new content. */
+    tmpfilename = sdsnew(filename);
+    tmpfilename = sdscatfmt(tmpfilename,".tmp-%i-%I",
+        (int)getpid(),(long long)mstime()); /* %I consumes a long long. */
+    if ((fd = open(tmpfilename,O_WRONLY|O_CREAT,0644)) == -1) {
+        serverLog(LL_WARNING,"Opening temp ACL file for ACL SAVE: %s",
+            strerror(errno));
+        goto cleanup;
+    }
+
+    /* Write it. */
+    if (write(fd,acl,sdslen(acl)) != (ssize_t)sdslen(acl)) {
+        serverLog(LL_WARNING,"Writing ACL file for ACL SAVE: %s",
+            strerror(errno));
+        goto cleanup;
+    }
+    close(fd); fd = -1;
+
+    /* Let's replace the old file with the new one. */
+    if (rename(tmpfilename,filename) == -1) {
+        serverLog(LL_WARNING,"Renaming ACL file for ACL SAVE: %s",
+            strerror(errno));
+        goto cleanup;
+    }
+    sdsfree(tmpfilename); tmpfilename = NULL;
+    retval = C_OK; /* If we reached this point, everything is fine. */
+
+cleanup:
+    if (fd != -1) close(fd);
+    if (tmpfilename) unlink(tmpfilename);
+    sdsfree(tmpfilename);
+    sdsfree(acl);
+    return retval;
+}
+
+/* This function is called once the server is already running, modules are
+ * loaded, and we are ready to start, in order to load the ACLs either from
+ * the pending list of users defined in redis.conf, or from the ACL file.
+ * The function will just exit with an error if the user is trying to mix
+ * both the loading methods. */
+void ACLLoadUsersAtStartup(void) {
+    if (server.acl_filename[0] != '\0' && listLength(UsersToLoad) != 0) {
+        serverLog(LL_WARNING,
+            "Configuring Redis with users defined in redis.conf and at "
+            "the same setting an ACL file path is invalid. This setup "
+            "is very likely to lead to configuration errors and security "
+            "holes, please define either an ACL file or declare users "
+            "directly in your redis.conf, but not both.");
+        exit(1);
+    }
+
+    if (ACLLoadConfiguredUsers() == C_ERR) { /* Users queued in UsersToLoad. */
+        serverLog(LL_WARNING,
+            "Critical error while loading ACLs. Exiting.");
+        exit(1);
+    }
+
+    if (server.acl_filename[0] != '\0') { /* An ACL file is configured. */
+        sds errors = ACLLoadFromFile(server.acl_filename);
+        if (errors) {
+            serverLog(LL_WARNING,
+                "Aborting Redis startup because of ACL errors: %s", errors);
+            sdsfree(errors);
+            exit(1);
+        }
+    }
+}
+
 /* =============================================================================
  * ACL related commands
  * ==========================================================================*/
 
 /* ACL -- show and modify the configuration of ACL users.
  * ACL HELP
+ * ACL LOAD
  * ACL LIST
- * ACL SETUSER <username> ... user attribs ...
- * ACL DELUSER <username>
+ * ACL USERS
+ * ACL CAT [<category>]
+ * ACL SETUSER <username> ... acl rules ...
+ * ACL DELUSER <username> [...]
  * ACL GETUSER <username>
  */
 void aclCommand(client *c) {
@@ -1045,32 +1399,16 @@ void aclCommand(client *c) {
                 addReplyError(c,"The 'default' user cannot be removed");
                 return;
             }
+        }
+
+        for (int j = 2; j < c->argc; j++) {
+            sds username = ptrFromObj(c->argv[j]);
             user *u;
             if (raxRemove(Users,(unsigned char*)username,
                           sdslen(username),
                           (void**)&u))
             {
-                /* When a user is deleted we need to cycle the active
-                 * connections in order to kill all the pending ones that
-                 * are authenticated with such user. */
-                ACLFreeUser(u);
-                listIter li;
-                listNode *ln;
-                listRewind(server.clients,&li);
-                while ((ln = listNext(&li)) != NULL) {
-                    client *c = listNodeValue(ln);
-                    if (c->puser == u) {
-                        /* We'll free the conenction asynchronously, so
-                         * in theory to set a different user is not needed.
-                         * However if there are bugs in Redis, soon or later
-                         * this may result in some security hole: it's much
-                         * more defensive to set the default user and put
-                         * it in non authenticated mode. */
-                        c->puser = DefaultUser;
-                        c->authenticated = 0;
-                        freeClientAsync(c);
-                    }
-                }
+                ACLFreeUserAndKillClients(u);
                 deleted++;
             }
         }
@@ -1151,19 +1489,69 @@ void aclCommand(client *c) {
             }
         }
         raxStop(&ri);
-    } else if (!strcasecmp(sub,"whoami")) {
+    } else if (!strcasecmp(sub,"whoami") && c->argc == 2) {
         if (c->puser != NULL) {
             addReplyBulkCBuffer(c,c->puser->name,sdslen(c->puser->name));
         } else {
             addReplyNull(c);
         }
+    } else if (server.acl_filename[0] == '\0' &&
+               (!strcasecmp(sub,"load") || !strcasecmp(sub,"save")))
+    {
+        addReplyError(c,"This Redis instance is not configured to use an ACL file. You may want to specify users via the ACL SETUSER command and then issue a CONFIG REWRITE (assuming you have a Redis configuration file set) in order to store users in the Redis configuration.");
+        return;
+    } else if (!strcasecmp(sub,"load") && c->argc == 2) {
+        sds errors = ACLLoadFromFile(server.acl_filename);
+        if (errors == NULL) {
+            addReply(c,shared.ok);
+        } else {
+            addReplyError(c,errors);
+            sdsfree(errors);
+        }
+    } else if (!strcasecmp(sub,"save") && c->argc == 2) {
+        if (ACLSaveToFile(server.acl_filename) == C_OK) {
+            addReply(c,shared.ok);
+        } else {
+            addReplyError(c,"There was an error trying to save the ACLs. "
+                            "Please check the server logs for more "
+                            "information");
+        }
+    } else if (!strcasecmp(sub,"cat") && c->argc == 2) {
+        void *dl = addReplyDeferredLen(c);
+        int j;
+        for (j = 0; ACLCommandCategories[j].flag != 0; j++)
+            addReplyBulkCString(c,ACLCommandCategories[j].name);
+        setDeferredArrayLen(c,dl,j);
+    } else if (!strcasecmp(sub,"cat") && c->argc == 3) {
+        uint64_t cflag = ACLGetCommandCategoryFlagByName(ptrFromObj(c->argv[2]));
+        if (cflag == 0) {
+            addReplyErrorFormat(c, "Unknown category '%s'", (char*)ptrFromObj(c->argv[2]));
+            return;
+        }
+        int arraylen = 0;
+        void *dl = addReplyDeferredLen(c);
+        dictIterator *di = dictGetIterator(server.orig_commands);
+        dictEntry *de;
+        while ((de = dictNext(di)) != NULL) {
+            struct redisCommand *cmd = dictGetVal(de);
+            if (cmd->flags & CMD_MODULE) continue;
+            if (cmd->flags & cflag) {
+                addReplyBulkCString(c,cmd->name);
+                arraylen++;
+            }
+        }
+        dictReleaseIterator(di);
+        setDeferredArrayLen(c,dl,arraylen);
     } else if (!strcasecmp(sub,"help")) {
         const char *help[] = {
+"LOAD                              -- Reload users from the ACL file.",
 "LIST                              -- Show user details in config file format.",
 "USERS                             -- List all the registered usernames.",
 "SETUSER <username> [attribs ...]  -- Create or modify a user.",
 "GETUSER <username>                -- Get the user details.",
-"DELUSER <username>                -- Delete a user.",
+"DELUSER <username> [...]          -- Delete a list of users.",
+"CAT                               -- List available categories.",
+"CAT <category>                    -- List commands inside category.",
 "WHOAMI                            -- Return the current connection username.",
 NULL
         };
@@ -1172,3 +1560,15 @@ NULL
         addReplySubcommandSyntaxError(c);
     }
 }
+
+void addReplyCommandCategories(client *c, struct redisCommand *cmd) {
+    /* Emit "@<name>" for every ACL category whose flag bit is set in cmd->flags, as a deferred-length set. */
+    void *deferred = addReplyDeferredLen(c);
+    int matched = 0;
+    for (int idx = 0; ACLCommandCategories[idx].flag != 0; idx++) {
+        if (!(cmd->flags & ACLCommandCategories[idx].flag)) continue;
+        addReplyStatusFormat(c, "@%s", ACLCommandCategories[idx].name);
+        matched++;
+    }
+    setDeferredSetLen(c, deferred, matched);
+}
diff --git a/src/adlist.h b/src/adlist.h
index c954fac87..e9de81ceb 100644
--- a/src/adlist.h
+++ b/src/adlist.h
@@ -31,6 +31,10 @@
 #ifndef __ADLIST_H__
 #define __ADLIST_H__
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* Node, List, and Iterator are the only data structures used currently. */
 
 typedef struct listNode {
@@ -92,4 +96,8 @@ void listJoin(list *l, list *o);
 #define AL_START_HEAD 0
 #define AL_START_TAIL 1
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* __ADLIST_H__ */
diff --git a/src/ae.cpp b/src/ae.cpp
index 17408e316..60e22db83 100644
--- a/src/ae.cpp
+++ b/src/ae.cpp
@@ -30,7 +30,11 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <condition_variable>
+#include <atomic>
+#include <mutex>
 #include <stdio.h>
+#include <fcntl.h>
 #include <sys/time.h>
 #include <sys/types.h>
 #include <unistd.h>
@@ -41,11 +45,49 @@
 #include <errno.h>
 
 #include "ae.h"
+#include "fastlock.h"
 extern "C" {
 #include "zmalloc.h"
 #include "config.h"
 }
 
+#ifdef USE_MUTEX
+thread_local int cOwnLock = 0; // per-thread count of g_lock acquisitions not yet released
+class mutex_wrapper // std::recursive_mutex that also tracks, per thread, whether the caller holds it
+{
+    std::recursive_mutex m_mutex;
+public:
+    void lock() {
+        m_mutex.lock();
+        cOwnLock++; // counted only once the mutex is actually held
+    }
+
+    void unlock() {
+        cOwnLock--; // drop the count first, while the mutex is still held
+        m_mutex.unlock();
+    }
+
+    bool try_lock() {
+        if (m_mutex.try_lock()) {
+            cOwnLock++; // only a successful attempt changes the ownership count
+            return true;
+        }
+        return false;
+    }
+
+    bool fOwnLock() {
+        return cOwnLock > 0; // true when the calling thread has at least one outstanding lock()
+    }
+};
+mutex_wrapper g_lock; // global lock used by the event loop code below
+
+#else
+fastlock g_lock; // same role as above, backed by the project's fastlock (declared in fastlock.h)
+#endif
+thread_local aeEventLoop *g_eventLoopThisThread = NULL;
+
+#define AE_ASSERT(x) do { if (!(x)) { fprintf(stderr, "AE_ASSERT FAILURE\n"); *((volatile int*)0) = 1; } } while(0) /* whole check wrapped in do/while so a following else cannot bind to it; crashes via null write */
+
 /* Include the best multiplexing layer supported by this system.
  * The following should be ordered by performances, descending. */
 #ifdef HAVE_EVPORT
@@ -62,6 +104,178 @@ extern "C" {
     #endif
 #endif
 
+enum class AE_ASYNC_OP // operation codes carried by aeCommand and dispatched in aeProcessCmd
+{
+    PostFunction,       // call a C callback (aeCommand::proc) with clientData
+    PostCppFunction,    // call a heap-allocated std::function (aeCommand::pfn)
+    DeleteFileEvent,    // aeDeleteFileEvent(fd, mask) on the target loop
+    CreateFileEvent,    // aeCreateFileEvent(fd, mask, fproc, clientData) on the target loop
+};
+
+struct aeCommandControl // rendezvous state for callers that wait for a posted command to complete
+{
+    std::condition_variable cv; // notified by aeProcessCmd once the command has been executed
+    std::atomic<int> rval;      // result of aeCreateFileEvent for CreateFileEvent commands
+    std::mutex mutexcv;         // mutex paired with cv
+};
+
+struct aeCommand // fixed-size record written to a loop's fdCmdWrite and read back in aeProcessCmd
+{
+    AE_ASYNC_OP op; // selects which of the fields below are meaningful
+    int fd; // descriptor for the Create/DeleteFileEvent operations
+    int mask; // event mask for the Create/DeleteFileEvent operations
+    union { // exactly one callback pointer is used, depending on op
+        aePostFunctionProc *proc; // PostFunction
+        aeFileProc *fproc; // CreateFileEvent
+        std::function<void()> *pfn; // PostCppFunction; deleted by aeProcessCmd after it runs
+    };
+    void *clientData; // opaque argument handed to proc / fproc
+    aeCommandControl *pctl; // non-null when the sender waits for completion
+};
+
+void aeProcessCmd(aeEventLoop *eventLoop, int fd, void *, int ) // file-event handler: drain and execute every aeCommand queued on fd
+{
+    aeCommand cmd;
+    for (;;)
+    {
+        auto cb = read(fd, &cmd, sizeof(aeCommand));
+        if (cb != sizeof(cmd))
+        {
+            AE_ASSERT(errno == EAGAIN); // the only accepted reason for not getting a full record
+            break;
+        }
+        switch (cmd.op)
+        {
+        case AE_ASYNC_OP::DeleteFileEvent:
+            aeDeleteFileEvent(eventLoop, cmd.fd, cmd.mask);
+            break;
+
+        case AE_ASYNC_OP::CreateFileEvent:
+        {
+            if (cmd.pctl != nullptr)
+            {
+                cmd.pctl->mutexcv.lock(); // sender waits on cv; take its mutex before publishing the result
+                std::atomic_store(&cmd.pctl->rval, aeCreateFileEvent(eventLoop, cmd.fd, cmd.mask, cmd.fproc, cmd.clientData));
+                cmd.pctl->cv.notify_all();
+                cmd.pctl->mutexcv.unlock();
+            }
+            else
+            {
+                aeCreateFileEvent(eventLoop, cmd.fd, cmd.mask, cmd.fproc, cmd.clientData); // fire-and-forget: result is discarded
+            }
+        }
+            break;
+
+        case AE_ASYNC_OP::PostFunction:
+            {
+            std::unique_lock<decltype(g_lock)> ulock(g_lock); // posted callbacks run with the global lock held
+            ((aePostFunctionProc*)cmd.proc)(cmd.clientData);
+            break;
+            }
+
+        case AE_ASYNC_OP::PostCppFunction:
+        {
+            if (cmd.pctl != nullptr)
+                cmd.pctl->mutexcv.lock(); // lock order: sender's mutex first, then g_lock
+            
+            std::unique_lock<decltype(g_lock)> ulock(g_lock); // held until the end of this case block
+            (*cmd.pfn)();
+            
+            if (cmd.pctl != nullptr)
+            {
+                cmd.pctl->cv.notify_all(); // wake the waiting sender
+                cmd.pctl->mutexcv.unlock();
+            }
+            delete cmd.pfn; // allocated by the sender with new
+        }
+            break;
+        }
+    }
+}
+
+/* Register a file event on eventLoop from any thread. When fSynchronous is set, wait for the owning thread's result. */
+int aeCreateRemoteFileEvent(aeEventLoop *eventLoop, int fd, int mask,
+        aeFileProc *proc, void *clientData, int fSynchronous)
+{
+    if (eventLoop == g_eventLoopThisThread)
+        return aeCreateFileEvent(eventLoop, fd, mask, proc, clientData);
+    int ret = AE_OK;
+    aeCommand cmd;
+    cmd.op = AE_ASYNC_OP::CreateFileEvent;
+    cmd.fd = fd;
+    cmd.mask = mask;
+    cmd.fproc = proc;
+    cmd.clientData = clientData;
+    cmd.pctl = nullptr;
+    std::unique_lock<std::mutex> ulock; /* stays empty for async requests: a null pctl is never dereferenced */
+    if (fSynchronous)
+    {
+        cmd.pctl = new aeCommandControl();
+        ulock = std::unique_lock<std::mutex>(cmd.pctl->mutexcv); /* own the mutex before posting so the notify cannot precede our wait */
+    }
+    auto size = write(eventLoop->fdCmdWrite, &cmd, sizeof(cmd));
+    if (size != sizeof(cmd))
+    {
+        AE_ASSERT(errno == EAGAIN);
+        ret = AE_ERR;
+    }
+    if (fSynchronous)
+    {
+        if (ret == AE_OK) { /* only wait when the command was really queued, otherwise nobody will ever notify */
+            cmd.pctl->cv.wait(ulock); /* wait() requires ulock to own the mutex */
+            ret = cmd.pctl->rval;
+        }
+        ulock.unlock(); /* release before the mutex is destroyed */
+        delete cmd.pctl;
+    }
+    return ret;
+}
+
+int aePostFunction(aeEventLoop *eventLoop, aePostFunctionProc *proc, void *arg)
+{
+    /* Run proc(arg) for eventLoop: inline when called on its own thread, otherwise queued on its command pipe. */
+    if (eventLoop == g_eventLoopThisThread) {
+        proc(arg);
+    } else {
+        aeCommand request;
+        request.op = AE_ASYNC_OP::PostFunction;
+        request.proc = proc;
+        request.clientData = arg;
+        auto written = write(eventLoop->fdCmdWrite, &request, sizeof(request));
+        AE_ASSERT(written == sizeof(request));
+    }
+    return AE_OK;
+}
+
+/* Run fn for eventLoop (inline when already on its thread). With fSynchronous, block until fn has run. */
+int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fSynchronous)
+{
+    if (eventLoop == g_eventLoopThisThread)
+    {
+        fn();
+        return AE_OK;
+    }
+    aeCommand cmd;
+    cmd.op = AE_ASYNC_OP::PostCppFunction;
+    cmd.pfn = new std::function<void()>(fn); /* deleted by aeProcessCmd after it has been called */
+    cmd.pctl = nullptr;
+    std::unique_lock<std::mutex> ulock; /* stays empty for async posts: a null pctl is never dereferenced */
+    if (fSynchronous) {
+        cmd.pctl = new aeCommandControl();
+        ulock = std::unique_lock<std::mutex>(cmd.pctl->mutexcv); /* own the mutex before posting so the notify cannot precede our wait */
+    }
+    auto size = write(eventLoop->fdCmdWrite, &cmd, sizeof(cmd));
+    AE_ASSERT(size == sizeof(cmd));
+    int ret = AE_OK;
+    if (fSynchronous) {
+        cmd.pctl->cv.wait(ulock); /* wait() requires ulock to own the mutex */
+        ret = cmd.pctl->rval;
+        ulock.unlock(); /* release before the mutex is destroyed */
+        delete cmd.pctl;
+    }
+    return ret;
+}
+
 aeEventLoop *aeCreateEventLoop(int setsize) {
     aeEventLoop *eventLoop;
     int i;
@@ -83,6 +297,18 @@ aeEventLoop *aeCreateEventLoop(int setsize) {
      * vector with it. */
     for (i = 0; i < setsize; i++)
         eventLoop->events[i].mask = AE_NONE;
+
+    fastlock_init(&eventLoop->flock);
+    int rgfd[2];
+    if (pipe(rgfd) < 0)
+        goto err;
+    eventLoop->fdCmdRead = rgfd[0];
+    eventLoop->fdCmdWrite = rgfd[1];
+    fcntl(eventLoop->fdCmdWrite, F_SETFL, O_NONBLOCK);
+    fcntl(eventLoop->fdCmdRead, F_SETFL, O_NONBLOCK);
+    eventLoop->cevents = 0;
+    aeCreateFileEvent(eventLoop, eventLoop->fdCmdRead, AE_READABLE|AE_READ_THREADSAFE, aeProcessCmd, NULL);
+
     return eventLoop;
 
 err:
@@ -107,6 +333,7 @@ int aeGetSetSize(aeEventLoop *eventLoop) {
  *
  * Otherwise AE_OK is returned and the operation is successful. */
 int aeResizeSetSize(aeEventLoop *eventLoop, int setsize) {
+    AE_ASSERT(g_eventLoopThisThread == NULL || g_eventLoopThisThread == eventLoop);
     int i;
 
     if (setsize == eventLoop->setsize) return AE_OK;
@@ -129,19 +356,25 @@ extern "C" void aeDeleteEventLoop(aeEventLoop *eventLoop) {
     zfree(eventLoop->events);
     zfree(eventLoop->fired);
-    zfree(eventLoop);
+    fastlock_free(&eventLoop->flock);
+    close(eventLoop->fdCmdRead);
+    close(eventLoop->fdCmdWrite);
+    zfree(eventLoop); /* freed last: the three calls above still read members of eventLoop */
 }
 
 extern "C" void aeStop(aeEventLoop *eventLoop) {
+    AE_ASSERT(g_eventLoopThisThread == NULL || g_eventLoopThisThread == eventLoop);
     eventLoop->stop = 1;
 }
 
 extern "C" int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
         aeFileProc *proc, void *clientData)
 {
+    AE_ASSERT(g_eventLoopThisThread == NULL || g_eventLoopThisThread == eventLoop);
     if (fd >= eventLoop->setsize) {
         errno = ERANGE;
         return AE_ERR;
     }
+
     aeFileEvent *fe = &eventLoop->events[fd];
 
     if (aeApiAddEvent(eventLoop, fd, mask) == -1)
@@ -155,8 +388,21 @@ extern "C" int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
     return AE_OK;
 }
 
+void aeDeleteFileEventAsync(aeEventLoop *eventLoop, int fd, int mask)
+{
+    /* On the loop's own thread remove the event directly; from any other thread queue the removal on its command pipe. */
+    if (eventLoop == g_eventLoopThisThread) { aeDeleteFileEvent(eventLoop, fd, mask); return; }
+    aeCommand request;
+    request.op = AE_ASYNC_OP::DeleteFileEvent;
+    request.fd = fd;
+    request.mask = mask;
+    auto written = write(eventLoop->fdCmdWrite, &request, sizeof(request));
+    AE_ASSERT(written == sizeof(request));
+}
+
 extern "C" void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask)
 {
+    AE_ASSERT(g_eventLoopThisThread == NULL || g_eventLoopThisThread == eventLoop);
     if (fd >= eventLoop->setsize) return;
     aeFileEvent *fe = &eventLoop->events[fd];
     if (fe->mask == AE_NONE) return;
@@ -165,6 +411,9 @@ extern "C" void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask)
      * is removed. */
     if (mask & AE_WRITABLE) mask |= AE_BARRIER;
 
+    if (mask & AE_WRITABLE) mask |= AE_WRITE_THREADSAFE;
+    if (mask & AE_READABLE) mask |= AE_READ_THREADSAFE;
+
     aeApiDelEvent(eventLoop, fd, mask);
     fe->mask = fe->mask & (~mask);
     if (fd == eventLoop->maxfd && fe->mask == AE_NONE) {
@@ -211,6 +460,7 @@ extern "C" long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long millise
         aeTimeProc *proc, void *clientData,
         aeEventFinalizerProc *finalizerProc)
 {
+    AE_ASSERT(g_eventLoopThisThread == NULL || g_eventLoopThisThread == eventLoop);
     long long id = eventLoop->timeEventNextId++;
     aeTimeEvent *te;
 
@@ -231,6 +481,7 @@ extern "C" long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long millise
 
 extern "C" int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id)
 {
+    AE_ASSERT(g_eventLoopThisThread == NULL || g_eventLoopThisThread == eventLoop);
     aeTimeEvent *te = eventLoop->timeEventHead;
     while(te) {
         if (te->id == id) {
@@ -255,6 +506,7 @@ extern "C" int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id)
  */
 static aeTimeEvent *aeSearchNearestTimer(aeEventLoop *eventLoop)
 {
+    AE_ASSERT(g_eventLoopThisThread == NULL || g_eventLoopThisThread == eventLoop);
     aeTimeEvent *te = eventLoop->timeEventHead;
     aeTimeEvent *nearest = NULL;
 
@@ -270,6 +522,7 @@ static aeTimeEvent *aeSearchNearestTimer(aeEventLoop *eventLoop)
 
 /* Process time events */
 static int processTimeEvents(aeEventLoop *eventLoop) {
+    std::unique_lock<decltype(g_lock)> ulock(g_lock);
     int processed = 0;
     aeTimeEvent *te;
     long long maxId;
@@ -343,6 +596,62 @@ static int processTimeEvents(aeEventLoop *eventLoop) {
     return processed;
 }
 
+extern "C" void ProcessEventCore(aeEventLoop *eventLoop, aeFileEvent *fe, int mask, int fd)
+{
+#define LOCK_IF_NECESSARY(fe, tsmask) \
+    std::unique_lock<decltype(g_lock)> ulock(g_lock, std::defer_lock); \
+    if (!(fe->mask & tsmask)) \
+        ulock.lock()
+
+    int fired = 0; /* Number of events fired for current fd. */
+
+    /* Dispatch the fired read/write handlers for one fd. Each handler runs
+     * under g_lock unless its AE_*_THREADSAFE bit is set in fe->mask.
+     *
+     * Normally the readable handler runs first and the writable one later,
+     * so a reply can sometimes be served right after processing the query.
+     *
+     * However if AE_BARRIER is set in the mask, the application is asking
+     * for the reverse: never fire the writable event after the readable.
+     * In such a case, we invert the calls. This is useful when, for
+     * instance, we want to do things in the beforeSleep() hook, like
+     * fsyncing a file to disk, before replying to a client. */
+    int invert = fe->mask & AE_BARRIER;
+
+    /* Note the "fe->mask & mask & ..." code: maybe an already
+     * processed event removed an element that fired and that we have
+     * not processed yet, so we check if the event is still valid.
+     *
+     * Fire the readable event if the call sequence is not
+     * inverted. */
+    if (!invert && fe->mask & mask & AE_READABLE) {
+        LOCK_IF_NECESSARY(fe, AE_READ_THREADSAFE);
+        fe->rfileProc(eventLoop,fd,fe->clientData,mask | (fe->mask & AE_READ_THREADSAFE));
+        fired++;
+    } /* ulock, if taken, is released here at the end of the block */
+
+    /* Fire the writable event. */
+    if (fe->mask & mask & AE_WRITABLE) {
+        if (!fired || fe->wfileProc != fe->rfileProc) { /* do not call the same handler twice for one fd */
+            LOCK_IF_NECESSARY(fe, AE_WRITE_THREADSAFE);
+            fe->wfileProc(eventLoop,fd,fe->clientData,mask | (fe->mask & AE_WRITE_THREADSAFE));
+            fired++;
+        }
+    }
+
+    /* If we have to invert the call, fire the readable event now
+     * after the writable one. */
+    if (invert && fe->mask & mask & AE_READABLE) {
+        if (!fired || fe->wfileProc != fe->rfileProc) {
+            LOCK_IF_NECESSARY(fe, AE_READ_THREADSAFE);
+            fe->rfileProc(eventLoop,fd,fe->clientData,mask | (fe->mask & AE_READ_THREADSAFE));
+            fired++;
+        }
+    }
+
+#undef LOCK_IF_NECESSARY
+}
+
 /* Process every pending time event, then every pending file event
  * (that may be registered by time event callbacks just processed).
  * Without special flags the function sleeps until some file event
@@ -359,6 +668,7 @@ static int processTimeEvents(aeEventLoop *eventLoop) {
  * The function returns the number of events processed. */
 int aeProcessEvents(aeEventLoop *eventLoop, int flags)
 {
+    AE_ASSERT(g_eventLoopThisThread == NULL || g_eventLoopThisThread == eventLoop);
     int processed = 0, numevents;
 
     /* Nothing to do? return ASAP */
@@ -413,55 +723,19 @@ int aeProcessEvents(aeEventLoop *eventLoop, int flags)
         numevents = aeApiPoll(eventLoop, tvp);
 
         /* After sleep callback. */
-        if (eventLoop->aftersleep != NULL && flags & AE_CALL_AFTER_SLEEP)
+        if (eventLoop->aftersleep != NULL && flags & AE_CALL_AFTER_SLEEP) {
+            std::unique_lock<decltype(g_lock)> ulock(g_lock, std::defer_lock);
+            if (!(eventLoop->aftersleepFlags & AE_SLEEP_THREADSAFE)) /* consult the after-sleep flags, not the before-sleep ones */
+                ulock.lock();
             eventLoop->aftersleep(eventLoop);
+        }
 
         for (j = 0; j < numevents; j++) {
             aeFileEvent *fe = &eventLoop->events[eventLoop->fired[j].fd];
             int mask = eventLoop->fired[j].mask;
             int fd = eventLoop->fired[j].fd;
-            int fired = 0; /* Number of events fired for current fd. */
 
-            /* Normally we execute the readable event first, and the writable
-             * event laster. This is useful as sometimes we may be able
-             * to serve the reply of a query immediately after processing the
-             * query.
-             *
-             * However if AE_BARRIER is set in the mask, our application is
-             * asking us to do the reverse: never fire the writable event
-             * after the readable. In such a case, we invert the calls.
-             * This is useful when, for instance, we want to do things
-             * in the beforeSleep() hook, like fsynching a file to disk,
-             * before replying to a client. */
-            int invert = fe->mask & AE_BARRIER;
-
-            /* Note the "fe->mask & mask & ..." code: maybe an already
-             * processed event removed an element that fired and we still
-             * didn't processed, so we check if the event is still valid.
-             *
-             * Fire the readable event if the call sequence is not
-             * inverted. */
-            if (!invert && fe->mask & mask & AE_READABLE) {
-                fe->rfileProc(eventLoop,fd,fe->clientData,mask);
-                fired++;
-            }
-
-            /* Fire the writable event. */
-            if (fe->mask & mask & AE_WRITABLE) {
-                if (!fired || fe->wfileProc != fe->rfileProc) {
-                    fe->wfileProc(eventLoop,fd,fe->clientData,mask);
-                    fired++;
-                }
-            }
-
-            /* If we have to invert the call, fire the readable event now
-             * after the writable one. */
-            if (invert && fe->mask & mask & AE_READABLE) {
-                if (!fired || fe->wfileProc != fe->rfileProc) {
-                    fe->rfileProc(eventLoop,fd,fe->clientData,mask);
-                    fired++;
-                }
-            }
+            ProcessEventCore(eventLoop, fe, mask, fd);
 
             processed++;
         }
@@ -470,6 +744,7 @@ int aeProcessEvents(aeEventLoop *eventLoop, int flags)
     if (flags & AE_TIME_EVENTS)
         processed += processTimeEvents(eventLoop);
 
+    eventLoop->cevents += processed;
     return processed; /* return the number of processed file/time events */
 }
 
@@ -497,10 +772,17 @@ int aeWait(int fd, int mask, long long milliseconds) {
 
 void aeMain(aeEventLoop *eventLoop) {
     eventLoop->stop = 0;
+    g_eventLoopThisThread = eventLoop; // bind this loop to the calling thread for the AE_ASSERT ownership checks
     while (!eventLoop->stop) {
-        if (eventLoop->beforesleep != NULL)
+        if (eventLoop->beforesleep != NULL) {
+            std::unique_lock<decltype(g_lock)> ulock(g_lock, std::defer_lock);
+            if (!(eventLoop->beforesleepFlags & AE_SLEEP_THREADSAFE)) // hook not marked thread-safe: run it under the global lock
+                ulock.lock();
             eventLoop->beforesleep(eventLoop);
+        }
+        AE_ASSERT(!aeThreadOwnsLock()); // the before-sleep hook must not leave the global lock held
         aeProcessEvents(eventLoop, AE_ALL_EVENTS|AE_CALL_AFTER_SLEEP);
+        AE_ASSERT(!aeThreadOwnsLock()); // we should have relinquished it after processing
     }
 }
 
@@ -508,10 +790,32 @@ const char *aeGetApiName(void) {
     return aeApiName();
 }
 
-void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep) {
+void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep, int flags) {
     eventLoop->beforesleep = beforesleep;
+    eventLoop->beforesleepFlags = flags; /* AE_SLEEP_THREADSAFE here makes aeMain call the hook without taking the global lock */
 }
 
-void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *aftersleep) {
+void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *aftersleep, int flags) {
     eventLoop->aftersleep = aftersleep;
+    eventLoop->aftersleepFlags = flags; /* locking hint stored alongside the after-sleep hook (AE_SLEEP_THREADSAFE) */
+}
+
+void aeAcquireLock()
+{
+    g_lock.lock(); /* take the global event-loop lock */
+}
+
+int aeTryAcquireLock()
+{
+    return g_lock.try_lock(); /* result of g_lock.try_lock() converted to int: non-zero when the lock was obtained */
+}
+
+void aeReleaseLock()
+{
+    g_lock.unlock(); /* release the global event-loop lock */
+}
+
+int aeThreadOwnsLock()
+{
+    return g_lock.fOwnLock(); /* non-zero when g_lock reports that the calling thread holds it */
 }
diff --git a/src/ae.h b/src/ae.h
index a6ee1d05b..f08c49dd8 100644
--- a/src/ae.h
+++ b/src/ae.h
@@ -33,7 +33,11 @@
 #ifndef __AE_H__
 #define __AE_H__
 
+#ifdef __cplusplus
+#include <functional>
+#endif
 #include <time.h>
+#include "fastlock.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -50,7 +54,9 @@ extern "C" {
                            loop iteration. Useful when you want to persist
                            things to disk before sending replies, and want
                            to do that in a group fashion. */
-#define AE_THREADSAFE 8 /* Ok to run concurrently */
+#define AE_READ_THREADSAFE 8    /* Read handler may run without the global lock. */
+#define AE_WRITE_THREADSAFE 16  /* Write handler may run without the global lock. */
+#define AE_SLEEP_THREADSAFE 32  /* Before/after-sleep hook may run without the global lock. */
 
 #define AE_FILE_EVENTS 1
 #define AE_TIME_EVENTS 2
@@ -71,6 +77,7 @@ typedef void aeFileProc(struct aeEventLoop *eventLoop, int fd, void *clientData,
 typedef int aeTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData);
 typedef void aeEventFinalizerProc(struct aeEventLoop *eventLoop, void *clientData);
 typedef void aeBeforeSleepProc(struct aeEventLoop *eventLoop);
+typedef void aePostFunctionProc(void *pvArgs);
 
 /* File event structure */
 typedef struct aeFileEvent {
@@ -110,16 +117,33 @@ typedef struct aeEventLoop {
     int stop;
     void *apidata; /* This is used for polling API specific data */
     aeBeforeSleepProc *beforesleep;
+    int beforesleepFlags;
     aeBeforeSleepProc *aftersleep;
+    int aftersleepFlags;
+    struct fastlock flock;
+    int fdCmdWrite;
+    int fdCmdRead;
+    int cevents;
 } aeEventLoop;
 
 /* Prototypes */
 aeEventLoop *aeCreateEventLoop(int setsize);
+int aePostFunction(aeEventLoop *eventLoop, aePostFunctionProc *proc, void *arg);
+#ifdef __cplusplus
+}   // EXTERN C
+int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fSynchronous = false);
+extern "C" {
+#endif
 void aeDeleteEventLoop(aeEventLoop *eventLoop);
 void aeStop(aeEventLoop *eventLoop);
 int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
         aeFileProc *proc, void *clientData);
+
+int aeCreateRemoteFileEvent(aeEventLoop *eventLoop, int fd, int mask,
+        aeFileProc *proc, void *clientData, int fSynchronous);
+
 void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask);
+void aeDeleteFileEventAsync(aeEventLoop *eventLoop, int fd, int mask);
 int aeGetFileEvents(aeEventLoop *eventLoop, int fd);
 long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds,
         aeTimeProc *proc, void *clientData,
@@ -129,11 +153,16 @@ int aeProcessEvents(aeEventLoop *eventLoop, int flags);
 int aeWait(int fd, int mask, long long milliseconds);
 void aeMain(aeEventLoop *eventLoop);
 const char *aeGetApiName(void);
-void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep);
-void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *aftersleep);
+void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep, int flags);
+void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *aftersleep, int flags);
 int aeGetSetSize(aeEventLoop *eventLoop);
 int aeResizeSetSize(aeEventLoop *eventLoop, int setsize);
 
+void aeAcquireLock(void);      /* (void): a real prototype for C callers; () would leave the parameters unspecified */
+int aeTryAcquireLock(void);    /* non-zero when the lock was obtained */
+void aeReleaseLock(void);
+int aeThreadOwnsLock(void);    /* non-zero when the calling thread holds the lock */
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/ae_epoll.cpp b/src/ae_epoll.cpp
index cadcc3f51..05638ebdc 100644
--- a/src/ae_epoll.cpp
+++ b/src/ae_epoll.cpp
@@ -83,7 +83,11 @@ static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
     if (mask & AE_READABLE) ee.events |= EPOLLIN;
     if (mask & AE_WRITABLE) ee.events |= EPOLLOUT;
     ee.data.fd = fd;
-    if (epoll_ctl(state->epfd,op,fd,&ee) == -1) return -1;
+    if (epoll_ctl(state->epfd,op,fd,&ee) == -1)
+    {
+        perror("epoll_ctl failed");
+        return -1;
+    }
     return 0;
 }
 
diff --git a/src/anet.c b/src/anet.c
index 2981fca13..91ab94efd 100644
--- a/src/anet.c
+++ b/src/anet.c
@@ -246,6 +246,16 @@ static int anetSetReuseAddr(char *err, int fd) {
     return ANET_OK;
 }
 
+static int anetSetReusePort(char *err, int fd) {
+    /* Let us load balance listen()s from multiple threads */
+    int enabled = 1;
+    if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &enabled, sizeof(enabled)) != -1)
+        return ANET_OK;
+    /* Option could not be set: describe the failure in err. */
+    anetSetError(err, "setsockopt SO_REUSEPORT: %s", strerror(errno));
+    return ANET_ERR;
+}
+
 static int anetCreateSocket(char *err, int domain) {
     int s;
     if ((s = socket(domain, SOCK_STREAM, 0)) == -1) {
@@ -265,6 +275,7 @@ static int anetCreateSocket(char *err, int domain) {
 #define ANET_CONNECT_NONE 0
 #define ANET_CONNECT_NONBLOCK 1
 #define ANET_CONNECT_BE_BINDING 2 /* Best effort binding. */
+#define ANET_CONNECT_REUSEPORT 4
 static int anetTcpGenericConnect(char *err, char *addr, int port,
                                  char *source_addr, int flags)
 {
@@ -287,7 +298,10 @@ static int anetTcpGenericConnect(char *err, char *addr, int port,
          * the next entry in servinfo. */
         if ((s = socket(p->ai_family,p->ai_socktype,p->ai_protocol)) == -1)
             continue;
-        if (anetSetReuseAddr(err,s) == ANET_ERR) goto error;
+        if (anetSetReuseAddr(err,s) == ANET_ERR) 
+            goto error;
+        if (flags & ANET_CONNECT_REUSEPORT && anetSetReusePort(err, s) != ANET_OK)
+            goto error;
         if (flags & ANET_CONNECT_NONBLOCK && anetNonBlock(err,s) != ANET_OK)
             goto error;
         if (source_addr) {
@@ -462,7 +476,7 @@ static int anetV6Only(char *err, int s) {
     return ANET_OK;
 }
 
-static int _anetTcpServer(char *err, int port, char *bindaddr, int af, int backlog)
+static int _anetTcpServer(char *err, int port, char *bindaddr, int af, int backlog, int fReusePort)
 {
     int s = -1, rv;
     char _port[6];  /* strlen("65535") */
@@ -484,6 +498,7 @@ static int _anetTcpServer(char *err, int port, char *bindaddr, int af, int backl
 
         if (af == AF_INET6 && anetV6Only(err,s) == ANET_ERR) goto error;
         if (anetSetReuseAddr(err,s) == ANET_ERR) goto error;
+        if (fReusePort && anetSetReusePort(err,s) == ANET_ERR) goto error;
         if (anetListen(err,s,p->ai_addr,p->ai_addrlen,backlog) == ANET_ERR) s = ANET_ERR;
         goto end;
     }
@@ -500,14 +515,14 @@ end:
     return s;
 }
 
-int anetTcpServer(char *err, int port, char *bindaddr, int backlog)
+int anetTcpServer(char *err, int port, char *bindaddr, int backlog, int fReusePort)
 {
-    return _anetTcpServer(err, port, bindaddr, AF_INET, backlog);
+    return _anetTcpServer(err, port, bindaddr, AF_INET, backlog, fReusePort);
 }
 
-int anetTcp6Server(char *err, int port, char *bindaddr, int backlog)
+int anetTcp6Server(char *err, int port, char *bindaddr, int backlog, int fReusePort)
 {
-    return _anetTcpServer(err, port, bindaddr, AF_INET6, backlog);
+    return _anetTcpServer(err, port, bindaddr, AF_INET6, backlog, fReusePort);
 }
 
 int anetUnixServer(char *err, char *path, mode_t perm, int backlog)
diff --git a/src/anet.h b/src/anet.h
index 7142f78d2..44c57b4cd 100644
--- a/src/anet.h
+++ b/src/anet.h
@@ -33,6 +33,10 @@
 
 #include <sys/types.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define ANET_OK 0
 #define ANET_ERR -1
 #define ANET_ERR_LEN 256
@@ -58,8 +62,8 @@ int anetUnixNonBlockConnect(char *err, char *path);
 int anetRead(int fd, char *buf, int count);
 int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len);
 int anetResolveIP(char *err, char *host, char *ipbuf, size_t ipbuf_len);
-int anetTcpServer(char *err, int port, char *bindaddr, int backlog);
-int anetTcp6Server(char *err, int port, char *bindaddr, int backlog);
+int anetTcpServer(char *err, int port, char *bindaddr, int backlog, int fReusePort);
+int anetTcp6Server(char *err, int port, char *bindaddr, int backlog, int fReusePort);
 int anetUnixServer(char *err, char *path, mode_t perm, int backlog);
 int anetTcpAccept(char *err, int serversock, char *ip, size_t ip_len, int *port);
 int anetUnixAccept(char *err, int serversock);
@@ -77,4 +81,8 @@ int anetFormatAddr(char *fmt, size_t fmt_len, char *ip, int port);
 int anetFormatPeer(int fd, char *fmt, size_t fmt_len);
 int anetFormatSock(int fd, char *fmt, size_t fmt_len);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/aof.c b/src/aof.c
index c71f88aa6..f19affc64 100644
--- a/src/aof.c
+++ b/src/aof.c
@@ -96,6 +96,8 @@ void aofChildWriteDiffData(aeEventLoop *el, int fd, void *privdata, int mask) {
     listNode *ln;
     aofrwblock *block;
     ssize_t nwritten;
+    serverAssert(aeThreadOwnsLock());
+
     UNUSED(el);
     UNUSED(fd);
     UNUSED(privdata);
@@ -105,7 +107,7 @@ void aofChildWriteDiffData(aeEventLoop *el, int fd, void *privdata, int mask) {
         ln = listFirst(server.aof_rewrite_buf_blocks);
         block = ln ? ln->value : NULL;
         if (server.aof_stop_sending_diff || !block) {
-            aeDeleteFileEvent(server.el,server.aof_pipe_write_data_to_child,
+            aeDeleteFileEvent(el,server.aof_pipe_write_data_to_child,
                               AE_WRITABLE);
             return;
         }
@@ -162,8 +164,8 @@ void aofRewriteBufferAppend(unsigned char *s, unsigned long len) {
 
     /* Install a file event to send data to the rewrite child if there is
      * not one already. */
-    if (aeGetFileEvents(server.el,server.aof_pipe_write_data_to_child) == 0) {
-        aeCreateFileEvent(server.el, server.aof_pipe_write_data_to_child,
+    if (aeGetFileEvents(serverTL->el,server.aof_pipe_write_data_to_child) == 0) {
+        aeCreateFileEvent(serverTL->el, server.aof_pipe_write_data_to_child,
             AE_WRITABLE, aofChildWriteDiffData, NULL);
     }
 }
@@ -631,6 +633,7 @@ struct client *createFakeClient(void) {
 
     selectDb(c,0);
     c->fd = -1;
+    c->iel = IDX_EVENT_LOOP_MAIN;
     c->name = NULL;
     c->querybuf = sdsempty();
     c->querybuf_peak = 0;
@@ -638,6 +641,7 @@ struct client *createFakeClient(void) {
     c->argv = NULL;
     c->bufpos = 0;
     c->flags = 0;
+    c->fPendingAsyncWrite = FALSE;
     c->btype = BLOCKED_NONE;
     /* We set the fake client as a slave waiting for the synchronization
      * so that Redis will not try to send replies to this client. */
@@ -651,6 +655,8 @@ struct client *createFakeClient(void) {
     c->puser = NULL;
     listSetFreeMethod(c->reply,freeClientReplyValue);
     listSetDupMethod(c->reply,dupClientReplyValue);
+    fastlock_init(&c->lock);
+    fastlock_lock(&c->lock);
     initClientMultiState(c);
     return c;
 }
@@ -668,6 +674,8 @@ void freeFakeClient(struct client *c) {
     listRelease(c->reply);
     listRelease(c->watched_keys);
     freeClientMultiState(c);
+    fastlock_unlock(&c->lock);
+    fastlock_free(&c->lock);
     zfree(c);
 }
 
@@ -682,6 +690,7 @@ int loadAppendOnlyFile(char *filename) {
     long loops = 0;
     off_t valid_up_to = 0; /* Offset of latest well-formed command loaded. */
     off_t valid_before_multi = 0; /* Offset before MULTI command loaded. */
+    serverAssert(serverTL != NULL); // This happens early in boot, ensure serverTL was setup
 
     if (fp == NULL) {
         serverLog(LL_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
@@ -738,7 +747,7 @@ int loadAppendOnlyFile(char *filename) {
         /* Serve the clients from time to time */
         if (!(loops++ % 1000)) {
             loadingProgress(ftello(fp));
-            processEventsWhileBlocked();
+            processEventsWhileBlocked(serverTL - server.rgthreadvar);
         }
 
         if (fgets(buf,sizeof(buf),fp) == NULL) {
@@ -1470,7 +1479,7 @@ void aofChildPipeReadable(aeEventLoop *el, int fd, void *privdata, int mask) {
     }
     /* Remove the handler since this can be called only one time during a
      * rewrite. */
-    aeDeleteFileEvent(server.el,server.aof_pipe_read_ack_from_child,AE_READABLE);
+    aeDeleteFileEventAsync(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,server.aof_pipe_read_ack_from_child,AE_READABLE);
 }
 
 /* Create the pipes used for parent - child process IPC during rewrite.
@@ -1488,12 +1497,13 @@ int aofCreatePipes(void) {
     /* Parent -> children data is non blocking. */
     if (anetNonBlock(NULL,fds[0]) != ANET_OK) goto error;
     if (anetNonBlock(NULL,fds[1]) != ANET_OK) goto error;
-    if (aeCreateFileEvent(server.el, fds[2], AE_READABLE, aofChildPipeReadable, NULL) == AE_ERR) goto error;
+    if (aeCreateFileEvent(serverTL->el, fds[2], AE_READABLE, aofChildPipeReadable, NULL) == AE_ERR) goto error;
 
     server.aof_pipe_write_data_to_child = fds[1];
     server.aof_pipe_read_data_from_parent = fds[0];
     server.aof_pipe_write_ack_to_parent = fds[3];
     server.aof_pipe_read_ack_from_child = fds[2];
+    server.el_alf_pip_read_ack_from_child = serverTL->el;
     server.aof_pipe_write_ack_to_child = fds[5];
     server.aof_pipe_read_ack_from_parent = fds[4];
     server.aof_stop_sending_diff = 0;
@@ -1507,8 +1517,8 @@ error:
 }
 
 void aofClosePipes(void) {
-    aeDeleteFileEvent(server.el,server.aof_pipe_read_ack_from_child,AE_READABLE);
-    aeDeleteFileEvent(server.el,server.aof_pipe_write_data_to_child,AE_WRITABLE);
+    aeDeleteFileEventAsync(server.el_alf_pip_read_ack_from_child,server.aof_pipe_read_ack_from_child,AE_READABLE);
+    aeDeleteFileEventAsync(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,server.aof_pipe_write_data_to_child,AE_WRITABLE);
     close(server.aof_pipe_write_data_to_child);
     close(server.aof_pipe_read_data_from_parent);
     close(server.aof_pipe_write_ack_to_parent);
diff --git a/src/blocked.c b/src/blocked.c
index 2ac57b5db..ad7113d52 100644
--- a/src/blocked.c
+++ b/src/blocked.c
@@ -100,6 +100,7 @@ int getTimeoutFromObjectOrReply(client *c, robj *object, mstime_t *timeout, int
  * flag is set client query buffer is not longer processed, but accumulated,
  * and will be processed when the client is unblocked. */
 void blockClient(client *c, int btype) {
+    serverAssert(aeThreadOwnsLock());
     c->flags |= CLIENT_BLOCKED;
     c->btype = btype;
     server.blocked_clients++;
@@ -109,15 +110,22 @@ void blockClient(client *c, int btype) {
 /* This function is called in the beforeSleep() function of the event loop
  * in order to process the pending input buffer of clients that were
  * unblocked after a blocking operation. */
-void processUnblockedClients(void) {
+void processUnblockedClients(int iel) {
+    serverAssert(aeThreadOwnsLock());
+
     listNode *ln;
     client *c;
+    list *unblocked_clients = server.rgthreadvar[iel].unblocked_clients;
+    serverAssert(iel == (serverTL - server.rgthreadvar));
 
-    while (listLength(server.unblocked_clients)) {
-        ln = listFirst(server.unblocked_clients);
+    while (listLength(unblocked_clients)) {
+        ln = listFirst(unblocked_clients);
         serverAssert(ln != NULL);
         c = ln->value;
-        listDelNode(server.unblocked_clients,ln);
+        listDelNode(unblocked_clients,ln);
+        AssertCorrectThread(c);
+        
+        fastlock_lock(&c->lock);
         c->flags &= ~CLIENT_UNBLOCKED;
 
         /* Process remaining data in the input buffer, unless the client
@@ -129,6 +137,7 @@ void processUnblockedClients(void) {
                 processInputBufferAndReplicate(c);
             }
         }
+        fastlock_unlock(&c->lock);
     }
 }
 
@@ -151,15 +160,19 @@ void processUnblockedClients(void) {
 void queueClientForReprocessing(client *c) {
     /* The client may already be into the unblocked list because of a previous
      * blocking operation, don't add back it into the list multiple times. */
+    serverAssert(aeThreadOwnsLock());
+    fastlock_lock(&c->lock);
     if (!(c->flags & CLIENT_UNBLOCKED)) {
         c->flags |= CLIENT_UNBLOCKED;
-        listAddNodeTail(server.unblocked_clients,c);
+        listAddNodeTail(server.rgthreadvar[c->iel].unblocked_clients,c);
     }
+    fastlock_unlock(&c->lock);
 }
 
 /* Unblock a client calling the right function depending on the kind
  * of operation the client is blocking for. */
 void unblockClient(client *c) {
+    serverAssert(aeThreadOwnsLock());
     if (c->btype == BLOCKED_LIST ||
         c->btype == BLOCKED_ZSET ||
         c->btype == BLOCKED_STREAM) {
@@ -205,20 +218,23 @@ void replyToBlockedClientTimedOut(client *c) {
  * The semantics is to send an -UNBLOCKED error to the client, disconnecting
  * it at the same time. */
 void disconnectAllBlockedClients(void) {
+    serverAssert(aeThreadOwnsLock());
     listNode *ln;
     listIter li;
 
     listRewind(server.clients,&li);
     while((ln = listNext(&li))) {
         client *c = listNodeValue(ln);
-
+        
+        fastlock_lock(&c->lock);
         if (c->flags & CLIENT_BLOCKED) {
-            addReplySds(c,sdsnew(
+            addReplySdsAsync(c,sdsnew(
                 "-UNBLOCKED force unblock from blocking operation, "
                 "instance state changed (master -> replica?)\r\n"));
             unblockClient(c);
             c->flags |= CLIENT_CLOSE_AFTER_REPLY;
         }
+        fastlock_unlock(&c->lock);
     }
 }
 
@@ -244,6 +260,7 @@ void disconnectAllBlockedClients(void) {
  * be used only for a single type, like virtually any Redis application will
  * do, the function is already fair. */
 void handleClientsBlockedOnKeys(void) {
+    serverAssert(aeThreadOwnsLock());
     while(listLength(server.ready_keys) != 0) {
         list *l;
 
@@ -297,6 +314,7 @@ void handleClientsBlockedOnKeys(void) {
                              * freed by the next unblockClient()
                              * call. */
                             if (dstkey) incrRefCount(dstkey);
+                            fastlock_lock(&receiver->lock);
                             unblockClient(receiver);
 
                             if (serveClientBlockedOnList(receiver,
@@ -309,6 +327,7 @@ void handleClientsBlockedOnKeys(void) {
                             }
 
                             if (dstkey) decrRefCount(dstkey);
+                            fastlock_unlock(&receiver->lock);
                             decrRefCount(value);
                         } else {
                             break;
@@ -348,6 +367,7 @@ void handleClientsBlockedOnKeys(void) {
                             continue;
                         }
 
+                        fastlock_lock(&receiver->lock);
                         int where = (receiver->lastcmd &&
                                      receiver->lastcmd->proc == bzpopminCommand)
                                      ? ZSET_MIN : ZSET_MAX;
@@ -365,6 +385,7 @@ void handleClientsBlockedOnKeys(void) {
                         incrRefCount(rl->key);
                         propagate(cmd,receiver->db->id,
                                   argv,2,PROPAGATE_AOF|PROPAGATE_REPL);
+                        fastlock_unlock(&receiver->lock);
                         decrRefCount(argv[0]);
                         decrRefCount(argv[1]);
                     }
@@ -407,10 +428,12 @@ void handleClientsBlockedOnKeys(void) {
                             /* If the group was not found, send an error
                              * to the consumer. */
                             if (!group) {
-                                addReplyError(receiver,
+                                fastlock_lock(&receiver->lock);
+                                addReplyErrorAsync(receiver,
                                     "-NOGROUP the consumer group this client "
                                     "was blocked on no longer exists");
                                 unblockClient(receiver);
+                                fastlock_unlock(&receiver->lock);
                                 continue;
                             } else {
                                 *gt = group->last_id;
@@ -432,17 +455,19 @@ void handleClientsBlockedOnKeys(void) {
                                 noack = receiver->bpop.xread_group_noack;
                             }
 
+                            fastlock_lock(&receiver->lock);
+
                             /* Emit the two elements sub-array consisting of
                              * the name of the stream and the data we
                              * extracted from it. Wrapped in a single-item
                              * array, since we have just one key. */
                             if (receiver->resp == 2) {
-                                addReplyArrayLen(receiver,1);
-                                addReplyArrayLen(receiver,2);
+                                addReplyArrayLenAsync(receiver,1);
+                                addReplyArrayLenAsync(receiver,2);
                             } else {
-                                addReplyMapLen(receiver,1);
+                                addReplyMapLenAsync(receiver,1);
                             }
-                            addReplyBulk(receiver,rl->key);
+                            addReplyBulkAsync(receiver,rl->key);
 
                             streamPropInfo pi = {
                                 rl->key,
@@ -457,6 +482,7 @@ void handleClientsBlockedOnKeys(void) {
                              * valid, so we must do the setup above before
                              * this call. */
                             unblockClient(receiver);
+                            fastlock_unlock(&receiver->lock);
                         }
                     }
                 }
diff --git a/src/cluster.c b/src/cluster.c
index 11eb170fc..946332fa1 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -486,14 +486,14 @@ void clusterInit(void) {
     }
 
     if (listenToPort(server.port+CLUSTER_PORT_INCR,
-        server.cfd,&server.cfd_count) == C_ERR)
+        server.cfd,&server.cfd_count, 0 /*fReusePort*/) == C_ERR)
     {
         exit(1);
     } else {
         int j;
 
         for (j = 0; j < server.cfd_count; j++) {
-            if (aeCreateFileEvent(server.el, server.cfd[j], AE_READABLE,
+            if (aeCreateFileEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el, server.cfd[j], AE_READABLE,
                 clusterAcceptHandler, NULL) == AE_ERR)
                     serverPanic("Unrecoverable error creating Redis Cluster "
                                 "file event.");
@@ -601,7 +601,7 @@ clusterLink *createClusterLink(clusterNode *node) {
  * with this link will have the 'link' field set to NULL. */
 void freeClusterLink(clusterLink *link) {
     if (link->fd != -1) {
-        aeDeleteFileEvent(server.el, link->fd, AE_READABLE|AE_WRITABLE);
+        aeDeleteFileEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el, link->fd, AE_READABLE|AE_WRITABLE);
     }
     sdsfree(link->sndbuf);
     sdsfree(link->rcvbuf);
@@ -645,7 +645,7 @@ void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
          * node identity. */
         link = createClusterLink(NULL);
         link->fd = cfd;
-        aeCreateFileEvent(server.el,cfd,AE_READABLE,clusterReadHandler,link);
+        aeCreateFileEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,cfd,AE_READABLE,clusterReadHandler,link);
     }
 }
 
@@ -2132,7 +2132,7 @@ void clusterWriteHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
     }
     sdsrange(link->sndbuf,nwritten,-1);
     if (sdslen(link->sndbuf) == 0)
-        aeDeleteFileEvent(server.el, link->fd, AE_WRITABLE);
+        aeDeleteFileEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el, link->fd, AE_WRITABLE);
 }
 
 /* Read data. Try to read the first field of the header first to check the
@@ -2208,7 +2208,7 @@ void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
  * from event handlers that will do stuff with the same link later. */
 void clusterSendMessage(clusterLink *link, unsigned char *msg, size_t msglen) {
     if (sdslen(link->sndbuf) == 0 && msglen != 0)
-        aeCreateFileEvent(server.el,link->fd,AE_WRITABLE|AE_BARRIER,
+        aeCreateFileEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,link->fd,AE_WRITABLE|AE_BARRIER,
                     clusterWriteHandler,link);
 
     link->sndbuf = sdscatlen(link->sndbuf, msg, msglen);
@@ -3402,7 +3402,7 @@ void clusterCron(void) {
             link = createClusterLink(node);
             link->fd = fd;
             node->link = link;
-            aeCreateFileEvent(server.el,link->fd,AE_READABLE,
+            aeCreateFileEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,link->fd,AE_READABLE,
                     clusterReadHandler,link);
             /* Queue a PING in the new connection ASAP: this is crucial
              * to avoid false positives in failure detection.
@@ -5390,6 +5390,7 @@ socket_err:
  * the target instance. See the Redis Cluster specification for more
  * information. */
 void askingCommand(client *c) {
+    serverAssert(aeThreadOwnsLock());
     if (server.cluster_enabled == 0) {
         addReplyError(c,"This instance has cluster support disabled");
         return;
@@ -5402,6 +5403,7 @@ void askingCommand(client *c) {
  * In this mode slaves will not redirect clients as long as clients access
  * with read-only commands to keys that are served by the slave's master. */
 void readonlyCommand(client *c) {
+    serverAssert(aeThreadOwnsLock());
     if (server.cluster_enabled == 0) {
         addReplyError(c,"This instance has cluster support disabled");
         return;
@@ -5412,6 +5414,7 @@ void readonlyCommand(client *c) {
 
 /* The READWRITE command just clears the READONLY command state. */
 void readwriteCommand(client *c) {
+    serverAssert(aeThreadOwnsLock());
     c->flags &= ~CLIENT_READONLY;
     addReply(c,shared.ok);
 }
@@ -5455,6 +5458,7 @@ clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, in
     multiState *ms, _ms;
     multiCmd mc;
     int i, slot = 0, migrating_slot = 0, importing_slot = 0, missing_keys = 0;
+    serverAssert(aeThreadOwnsLock());
 
     /* Allow any key to be set if a module disabled cluster redirections. */
     if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION)
@@ -5663,6 +5667,7 @@ void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_co
  * longer handles, the client is sent a redirection error, and the function
  * returns 1. Otherwise 0 is returned and no operation is performed. */
 int clusterRedirectBlockedClientIfNeeded(client *c) {
+    serverAssert(aeThreadOwnsLock());
     if (c->flags & CLIENT_BLOCKED &&
         (c->btype == BLOCKED_LIST ||
          c->btype == BLOCKED_ZSET ||
diff --git a/src/cluster.h b/src/cluster.h
index 571b9c543..ea4f51c78 100644
--- a/src/cluster.h
+++ b/src/cluster.h
@@ -1,6 +1,10 @@
 #ifndef __CLUSTER_H
 #define __CLUSTER_H
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*-----------------------------------------------------------------------------
  * Redis cluster data structures, defines, exported API.
  *----------------------------------------------------------------------------*/
@@ -287,4 +291,8 @@ clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, in
 int clusterRedirectBlockedClientIfNeeded(client *c);
 void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* __CLUSTER_H */
diff --git a/src/config.c b/src/config.c
index 7e9b19d76..59ae23303 100644
--- a/src/config.c
+++ b/src/config.c
@@ -395,6 +395,9 @@ void loadServerConfigFromString(char *config) {
                 err = "repl-backlog-ttl can't be negative ";
                 goto loaderr;
             }
+        } else if (!strcasecmp(argv[0],"masteruser") && argc == 2) {
+            zfree(server.masteruser);
+            server.masteruser = argv[1][0] ? zstrdup(argv[1]) : NULL;
         } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
             zfree(server.masterauth);
             server.masterauth = argv[1][0] ? zstrdup(argv[1]) : NULL;
@@ -821,7 +824,18 @@ void loadServerConfigFromString(char *config) {
                 if (err) goto loaderr;
             }
-        } else if (!strcasecmp(argv[0],"scratch-file-path")) {
+        } else if (!strcasecmp(argv[0],"scratch-file-path") && argc == 2) {
+#ifdef USE_MEMKIND
             storage_init(argv[1], server.maxmemory);
+#else
+            err = "KeyDB not compiled with scratch-file support.";
+            goto loaderr;
+#endif
+        } else if (!strcasecmp(argv[0],"server-threads") && argc == 2) {
+            server.cthreads = atoi(argv[1]);
+            if (server.cthreads <= 0 || server.cthreads > MAX_EVENT_LOOPS) {
+                err = "Invalid number of threads specified";
+                goto loaderr;
+            }
         } else {
             err = "Bad directive or wrong number of arguments"; goto loaderr;
         }
@@ -948,6 +962,9 @@ void configSetCommand(client *c) {
         sds aclop = sdscatprintf(sdsempty(),">%s",(char*)ptrFromObj(o));
         ACLSetUser(DefaultUser,aclop,sdslen(aclop));
         sdsfree(aclop);
+    } config_set_special_field("masteruser") {
+        zfree(server.masteruser);
+        server.masteruser = ((char*)ptrFromObj(o))[0] ? zstrdup(ptrFromObj(o)) : NULL;
     } config_set_special_field("masterauth") {
         zfree(server.masterauth);
         server.masterauth = ((char*)ptrFromObj(o))[0] ? zstrdup(ptrFromObj(o)) : NULL;
@@ -961,6 +978,7 @@ void configSetCommand(client *c) {
 
         /* Try to check if the OS is capable of supporting so many FDs. */
         server.maxclients = ll;
+        serverAssert(FALSE); /* NOTE(review): unconditional assert -- if serverAssert aborts on failure, the rest of this branch is unreachable; presumably a placeholder until runtime maxclients changes are supported, confirm */
         if (ll > orig_value) {
             adjustOpenFilesLimit();
             if (server.maxclients != ll) {
@@ -968,15 +986,18 @@ void configSetCommand(client *c) {
                 server.maxclients = orig_value;
                 return;
             }
-            if ((unsigned int) aeGetSetSize(server.el) <
+            if ((unsigned int) aeGetSetSize(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el) <
                 server.maxclients + CONFIG_FDSET_INCR)
             {
-                if (aeResizeSetSize(server.el,
-                    server.maxclients + CONFIG_FDSET_INCR) == AE_ERR)
+                for (int iel = 0; iel < server.cthreads; ++iel)
                 {
-                    addReplyError(c,"The event loop API used by Redis is not able to handle the specified number of clients");
-                    server.maxclients = orig_value;
-                    return;
+                    if (aeResizeSetSize(server.rgthreadvar[iel].el,
+                        server.maxclients + CONFIG_FDSET_INCR) == AE_ERR)
+                    {
+                        addReplyError(c,"The event loop API used by Redis is not able to handle the specified number of clients");
+                        server.maxclients = orig_value;
+                        return;
+                    }
                 }
             }
         }
@@ -1359,6 +1380,7 @@ void configGetCommand(client *c) {
 
     /* String values */
     config_get_string_field("dbfilename",server.rdb_filename);
+    config_get_string_field("masteruser",server.masteruser);
     config_get_string_field("masterauth",server.masterauth);
     config_get_string_field("cluster-announce-ip",server.cluster_announce_ip);
     config_get_string_field("unixsocket",server.unixsocket);
@@ -2019,7 +2041,7 @@ void rewriteConfigClientoutputbufferlimitOption(struct rewriteConfigState *state
         rewriteConfigFormatMemory(soft,sizeof(soft),
                 server.client_obuf_limits[j].soft_limit_bytes);
 
-        char *typename = getClientTypeName(j);
+        const char *typename = getClientTypeName(j);
         if (!strcmp(typename,"slave")) typename = "replica";
         line = sdscatprintf(sdsempty(),"%s %s %s %s %ld",
                 option, typename, hard, soft,
@@ -2237,6 +2259,7 @@ int rewriteConfig(char *path) {
     rewriteConfigDirOption(state);
     rewriteConfigSlaveofOption(state,"replicaof");
     rewriteConfigStringOption(state,"replica-announce-ip",server.slave_announce_ip,CONFIG_DEFAULT_SLAVE_ANNOUNCE_IP);
+    rewriteConfigStringOption(state,"masteruser",server.masteruser,NULL);
     rewriteConfigStringOption(state,"masterauth",server.masterauth,NULL);
     rewriteConfigStringOption(state,"cluster-announce-ip",server.cluster_announce_ip,NULL);
     rewriteConfigYesNoOption(state,"replica-serve-stale-data",server.repl_serve_stale_data,CONFIG_DEFAULT_SLAVE_SERVE_STALE_DATA);
diff --git a/src/crc64.h b/src/crc64.h
index c9fca519d..e63cbc2e3 100644
--- a/src/crc64.h
+++ b/src/crc64.h
@@ -3,10 +3,18 @@
 
 #include <stdint.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l);
 
 #ifdef REDIS_TEST
 int crc64Test(int argc, char *argv[]);
 #endif
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/db.c b/src/db.c
index c8553c985..67631a597 100644
--- a/src/db.c
+++ b/src/db.c
@@ -99,6 +99,7 @@ robj *lookupKey(redisDb *db, robj *key, int flags) {
  * expiring our key via DELs in the replication link. */
 robj *lookupKeyReadWithFlags(redisDb *db, robj *key, int flags) {
     robj *val;
+    serverAssert(aeThreadOwnsLock());
 
     if (expireIfNeeded(db,key) == 1) {
         /* Key expired. If we are in the context of a master, expireIfNeeded()
@@ -1072,6 +1073,7 @@ int removeExpire(redisDb *db, robj *key) {
  * after which the key will no longer be considered valid. */
 void setExpire(client *c, redisDb *db, robj *key, long long when) {
     dictEntry *kde, *de;
+    serverAssert(aeThreadOwnsLock());
 
     /* Reuse the sds from the main dict in the expire dict */
     kde = dictFind(db->pdict,ptrFromObj(key));
@@ -1108,6 +1110,7 @@ long long getExpire(redisDb *db, robj *key) {
  * will be consistent even if we allow write operations against expiring
  * keys. */
 void propagateExpire(redisDb *db, robj *key, int lazy) {
+    serverAssert(aeThreadOwnsLock());
     robj *argv[2];
 
     argv[0] = lazy ? shared.unlink : shared.del;
diff --git a/src/debug.c b/src/debug.c
index 51e5f39f5..d24c9ef9c 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -803,7 +803,7 @@ static void *getMcontextEip(ucontext_t *uc) {
     #endif
 #elif defined(__linux__)
     /* Linux */
-    #if defined(__i386__)
+    #if defined(__i386__) || defined(__ILP32__)
     return (void*) uc->uc_mcontext.gregs[14]; /* Linux 32 */
     #elif defined(__X86_64__) || defined(__x86_64__)
     return (void*) uc->uc_mcontext.gregs[16]; /* Linux 64 */
@@ -915,7 +915,7 @@ void logRegisters(ucontext_t *uc) {
 /* Linux */
 #elif defined(__linux__)
     /* Linux x86 */
-    #if defined(__i386__)
+    #if defined(__i386__) || defined(__ILP32__)
     serverLog(LL_WARNING,
     "\n"
     "EAX:%08lx EBX:%08lx ECX:%08lx EDX:%08lx\n"
diff --git a/src/defrag.c b/src/defrag.c
index a6d64d065..af12289f3 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -116,17 +116,15 @@ robj *activeDefragStringOb(robj* ob, long *defragged) {
     /* try to defrag string object */
     if (ob->type == OBJ_STRING) {
         if(ob->encoding==OBJ_ENCODING_RAW) {
-            sds newsds = activeDefragSds((sds)ob->ptr);
+            sds newsds = activeDefragSds((sds)ptrFromObj(ob));
             if (newsds) {
-                ob->ptr = newsds;
+                ob->m_ptr = newsds;
                 (*defragged)++;
             }
         } else if (ob->encoding==OBJ_ENCODING_EMBSTR) {
             /* The sds is embedded in the object allocation, calculate the
              * offset and update the pointer in the new allocation. */
-            long ofs = (intptr_t)ob->ptr - (intptr_t)ob;
             if ((ret = activeDefragAlloc(ob))) {
-                ret->ptr = (void*)((intptr_t)ret + ofs);
                 (*defragged)++;
             }
         } else if (ob->encoding!=OBJ_ENCODING_INT) {
@@ -441,7 +439,7 @@ void defragLater(redisDb *db, dictEntry *kde) {
 }
 
 long scanLaterList(robj *ob) {
-    quicklist *ql = ob->ptr;
+    quicklist *ql = ptrFromObj(ob);
     if (ob->type != OBJ_LIST || ob->encoding != OBJ_ENCODING_QUICKLIST)
         return 0;
     server.stat_active_defrag_scanned+=ql->len;
@@ -463,7 +461,7 @@ void scanLaterZsetCallback(void *privdata, const dictEntry *_de) {
 long scanLaterZset(robj *ob, unsigned long *cursor) {
     if (ob->type != OBJ_ZSET || ob->encoding != OBJ_ENCODING_SKIPLIST)
         return 0;
-    zset *zs = (zset*)ob->ptr;
+    zset *zs = (zset*)ptrFromObj(ob);
     dict *d = zs->pdict;
     scanLaterZsetData data = {zs, 0};
     *cursor = dictScan(d, *cursor, scanLaterZsetCallback, defragDictBucketCallback, &data);
@@ -483,7 +481,7 @@ long scanLaterSet(robj *ob, unsigned long *cursor) {
     long defragged = 0;
     if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HT)
         return 0;
-    dict *d = ob->ptr;
+    dict *d = ptrFromObj(ob);
     *cursor = dictScan(d, *cursor, scanLaterSetCallback, defragDictBucketCallback, &defragged);
     return defragged;
 }
@@ -504,7 +502,7 @@ long scanLaterHash(robj *ob, unsigned long *cursor) {
     long defragged = 0;
     if (ob->type != OBJ_HASH || ob->encoding != OBJ_ENCODING_HT)
         return 0;
-    dict *d = ob->ptr;
+    dict *d = ptrFromObj(ob);
     *cursor = dictScan(d, *cursor, scanLaterHashCallback, defragDictBucketCallback, &defragged);
     return defragged;
 }
@@ -512,10 +510,10 @@ long scanLaterHash(robj *ob, unsigned long *cursor) {
 long defragQuicklist(redisDb *db, dictEntry *kde) {
     robj *ob = dictGetVal(kde);
     long defragged = 0;
-    quicklist *ql = ob->ptr, *newql;
+    quicklist *ql = ptrFromObj(ob), *newql;
     serverAssert(ob->type == OBJ_LIST && ob->encoding == OBJ_ENCODING_QUICKLIST);
     if ((newql = activeDefragAlloc(ql)))
-        defragged++, ob->ptr = ql = newql;
+        defragged++, ob->m_ptr = ql = newql;
     if (ql->len > server.active_defrag_max_scan_fields)
         defragLater(db, kde);
     else
@@ -526,7 +524,7 @@ long defragQuicklist(redisDb *db, dictEntry *kde) {
 long defragZsetSkiplist(redisDb *db, dictEntry *kde) {
     robj *ob = dictGetVal(kde);
     long defragged = 0;
-    zset *zs = (zset*)ob->ptr;
+    zset *zs = (zset*)ptrFromObj(ob);
     zset *newzs;
     zskiplist *newzsl;
     dict *newdict;
@@ -534,7 +532,7 @@ long defragZsetSkiplist(redisDb *db, dictEntry *kde) {
     struct zskiplistNode *newheader;
     serverAssert(ob->type == OBJ_ZSET && ob->encoding == OBJ_ENCODING_SKIPLIST);
     if ((newzs = activeDefragAlloc(zs)))
-        defragged++, ob->ptr = zs = newzs;
+        defragged++, ob->m_ptr = zs = newzs;
     if ((newzsl = activeDefragAlloc(zs->zsl)))
         defragged++, zs->zsl = newzsl;
     if ((newheader = activeDefragAlloc(zs->zsl->header)))
@@ -561,16 +559,16 @@ long defragHash(redisDb *db, dictEntry *kde) {
     robj *ob = dictGetVal(kde);
     dict *d, *newd;
     serverAssert(ob->type == OBJ_HASH && ob->encoding == OBJ_ENCODING_HT);
-    d = ob->ptr;
+    d = ptrFromObj(ob);
     if (dictSize(d) > server.active_defrag_max_scan_fields)
         defragLater(db, kde);
     else
         defragged += activeDefragSdsDict(d, DEFRAG_SDS_DICT_VAL_IS_SDS);
     /* handle the dict struct */
-    if ((newd = activeDefragAlloc(ob->ptr)))
-        defragged++, ob->ptr = newd;
+    if ((newd = activeDefragAlloc(ptrFromObj(ob))))
+        defragged++, ob->m_ptr = newd;
     /* defrag the dict tables */
-    defragged += dictDefragTables(ob->ptr);
+    defragged += dictDefragTables(ptrFromObj(ob));
     return defragged;
 }
 
@@ -579,16 +577,16 @@ long defragSet(redisDb *db, dictEntry *kde) {
     robj *ob = dictGetVal(kde);
     dict *d, *newd;
     serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HT);
-    d = ob->ptr;
+    d = ptrFromObj(ob);
     if (dictSize(d) > server.active_defrag_max_scan_fields)
         defragLater(db, kde);
     else
         defragged += activeDefragSdsDict(d, DEFRAG_SDS_DICT_NO_VAL);
     /* handle the dict struct */
-    if ((newd = activeDefragAlloc(ob->ptr)))
-        defragged++, ob->ptr = newd;
+    if ((newd = activeDefragAlloc(ptrFromObj(ob))))
+        defragged++, ob->m_ptr = newd;
     /* defrag the dict tables */
-    defragged += dictDefragTables(ob->ptr);
+    defragged += dictDefragTables(ptrFromObj(ob));
     return defragged;
 }
 
@@ -613,11 +611,11 @@ int scanLaterStraemListpacks(robj *ob, unsigned long *cursor, long long endtime,
         return 0;
     }
 
-    stream *s = ob->ptr;
-    raxStart(&ri,s->rax);
+    stream *s = ptrFromObj(ob);
+    raxStart(&ri,s->prax);
     if (*cursor == 0) {
         /* if cursor is 0, we start new iteration */
-        defragRaxNode(&s->rax->head);
+        defragRaxNode(&s->prax->head);
         /* assign the iterator node callback before the seek, so that the
          * initial nodes that are processed till the first item are covered */
         ri.node_cb = defragRaxNode;
@@ -738,19 +736,19 @@ long defragStream(redisDb *db, dictEntry *kde) {
     long defragged = 0;
     robj *ob = dictGetVal(kde);
     serverAssert(ob->type == OBJ_STREAM && ob->encoding == OBJ_ENCODING_STREAM);
-    stream *s = ob->ptr, *news;
+    stream *s = ptrFromObj(ob), *news;
 
     /* handle the main struct */
     if ((news = activeDefragAlloc(s)))
-        defragged++, ob->ptr = s = news;
+        defragged++, ob->m_ptr = s = news;
 
-    if (raxSize(s->rax) > server.active_defrag_max_scan_fields) {
-        rax *newrax = activeDefragAlloc(s->rax);
+    if (raxSize(s->prax) > server.active_defrag_max_scan_fields) {
+        rax *newrax = activeDefragAlloc(s->prax);
         if (newrax)
-            defragged++, s->rax = newrax;
+            defragged++, s->prax = newrax;
         defragLater(db, kde);
     } else
-        defragged += defragRadixTree(&s->rax, 1, NULL, NULL);
+        defragged += defragRadixTree(&s->prax, 1, NULL, NULL);
 
     if (s->cgroups)
         defragged += defragRadixTree(&s->cgroups, 1, defragStreamConsumerGroup, NULL);
@@ -792,8 +790,8 @@ long defragKey(redisDb *db, dictEntry *de) {
         if (ob->encoding == OBJ_ENCODING_QUICKLIST) {
             defragged += defragQuicklist(db, de);
         } else if (ob->encoding == OBJ_ENCODING_ZIPLIST) {
-            if ((newzl = activeDefragAlloc(ob->ptr)))
-                defragged++, ob->ptr = newzl;
+            if ((newzl = activeDefragAlloc(ptrFromObj(ob))))
+                defragged++, ob->m_ptr = newzl;
         } else {
             serverPanic("Unknown list encoding");
         }
@@ -801,16 +799,16 @@ long defragKey(redisDb *db, dictEntry *de) {
         if (ob->encoding == OBJ_ENCODING_HT) {
             defragged += defragSet(db, de);
         } else if (ob->encoding == OBJ_ENCODING_INTSET) {
-            intset *newis, *is = ob->ptr;
+            intset *newis, *is = ptrFromObj(ob);
             if ((newis = activeDefragAlloc(is)))
-                defragged++, ob->ptr = newis;
+                defragged++, ob->m_ptr = newis;
         } else {
             serverPanic("Unknown set encoding");
         }
     } else if (ob->type == OBJ_ZSET) {
         if (ob->encoding == OBJ_ENCODING_ZIPLIST) {
-            if ((newzl = activeDefragAlloc(ob->ptr)))
-                defragged++, ob->ptr = newzl;
+            if ((newzl = activeDefragAlloc(ptrFromObj(ob))))
+                defragged++, ob->m_ptr = newzl;
         } else if (ob->encoding == OBJ_ENCODING_SKIPLIST) {
             defragged += defragZsetSkiplist(db, de);
         } else {
@@ -818,8 +816,8 @@ long defragKey(redisDb *db, dictEntry *de) {
         }
     } else if (ob->type == OBJ_HASH) {
         if (ob->encoding == OBJ_ENCODING_ZIPLIST) {
-            if ((newzl = activeDefragAlloc(ob->ptr)))
-                defragged++, ob->ptr = newzl;
+            if ((newzl = activeDefragAlloc(ptrFromObj(ob))))
+                defragged++, ob->m_ptr = newzl;
         } else if (ob->encoding == OBJ_ENCODING_HT) {
             defragged += defragHash(db, de);
         } else {
diff --git a/src/dict.c b/src/dict.c
index 3560eb3d3..9b5aba452 100644
--- a/src/dict.c
+++ b/src/dict.c
@@ -739,6 +739,30 @@ unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count) {
     return stored;
 }
 
+/* This is like dictGetRandomKey() from the POV of the API, but will do more
+ * work to ensure a better distribution of the returned element.
+ *
+ * This function improves the distribution because the dictGetRandomKey()
+ * problem is that it selects a random bucket, then it selects a random
+ * element from the chain in the bucket. However elements being in different
+ * chain lengths will have different probabilities of being reported. With
+ * this function instead what we do is to consider a "linear" range of the table
+ * that may be constituted of N buckets with chains of different lengths
+ * appearing one after the other. Then we report a random element in the range.
+ * In this way we smooth away the problem of different chain lengths. */
+#define GETFAIR_NUM_ENTRIES 15
+dictEntry *dictGetFairRandomKey(dict *d) {
+    dictEntry *entries[GETFAIR_NUM_ENTRIES];
+    unsigned int count = dictGetSomeKeys(d,entries,GETFAIR_NUM_ENTRIES);
+    /* Note that dictGetSomeKeys() may return zero elements in an unlucky
+     * run even if there are actually elements inside the hash table. So
+     * when we get zero, we fall back to dictGetRandomKey(), which is expected
+     * to yield an element if the hash table has at least one. */
+    if (count == 0) return dictGetRandomKey(d);
+    unsigned int idx = rand() % count;
+    return entries[idx];
+}
+
 /* Function to reverse bits. Algorithm from:
  * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel */
 static unsigned long rev(unsigned long v) {
diff --git a/src/dict.h b/src/dict.h
index 62018cc44..4befb9b66 100644
--- a/src/dict.h
+++ b/src/dict.h
@@ -35,6 +35,10 @@
 
 #include <stdint.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #ifndef __DICT_H
 #define __DICT_H
 
@@ -166,6 +170,7 @@ dictIterator *dictGetSafeIterator(dict *d);
 dictEntry *dictNext(dictIterator *iter);
 void dictReleaseIterator(dictIterator *iter);
 dictEntry *dictGetRandomKey(dict *d);
+dictEntry *dictGetFairRandomKey(dict *d);
 unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count);
 void dictGetStats(char *buf, size_t bufsize, dict *d);
 uint64_t dictGenHashFunction(const void *key, int len);
@@ -186,4 +191,8 @@ extern dictType dictTypeHeapStringCopyKey;
 extern dictType dictTypeHeapStrings;
 extern dictType dictTypeHeapStringCopyKeyValue;
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* __DICT_H */
diff --git a/src/endianconv.h b/src/endianconv.h
index 475f72b08..3c8aef14f 100644
--- a/src/endianconv.h
+++ b/src/endianconv.h
@@ -36,6 +36,10 @@
 #include "config.h"
 #include <stdint.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void memrev16(void *p);
 void memrev32(void *p);
 void memrev64(void *p);
@@ -75,4 +79,8 @@ uint64_t intrev64(uint64_t v);
 int endianconvTest(int argc, char *argv[]);
 #endif
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/evict.c b/src/evict.c
index 28cd73f6f..48d6d0387 100644
--- a/src/evict.c
+++ b/src/evict.c
@@ -350,6 +350,7 @@ unsigned long LFUDecrAndReturn(robj *o) {
  * used memory: the eviction should use mostly data size. This function
  * returns the sum of AOF and slaves buffer. */
 size_t freeMemoryGetNotCountedMemory(void) {
+    serverAssert(aeThreadOwnsLock());
     size_t overhead = 0;
     int slaves = listLength(server.slaves);
 
@@ -444,6 +445,7 @@ int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *lev
  * Otehrwise if we are over the memory limit, but not enough memory
  * was freed to return back under the limit, the function returns C_ERR. */
 int freeMemoryIfNeeded(void) {
+    serverAssert(aeThreadOwnsLock());
     /* By default replicas should ignore maxmemory
      * and just be masters exact copies. */
     if (server.masterhost && server.repl_slave_ignore_maxmemory) return C_OK;
diff --git a/src/fastlock.cpp b/src/fastlock.cpp
index 42741d0a1..4a4fb2962 100644
--- a/src/fastlock.cpp
+++ b/src/fastlock.cpp
@@ -1,25 +1,135 @@
+/* 
+ * Copyright (c) 2019, John Sully <john at eqalpha dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *   * Neither the name of Redis nor the names of its contributors may be used
+ *     to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
 #include "fastlock.h"
 #include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sched.h>
+#include <atomic>
+#include <assert.h>
+
+/****************************************************
+ *
+ *      Implementation of a fair spinlock.  To promote fairness we
+ *      use a ticket lock instead of a raw spinlock
+ * 
+ ****************************************************/
+
+static_assert(sizeof(pid_t) <= sizeof(fastlock::m_pidOwner), "fastlock::m_pidOwner not large enough");
+
+static pid_t gettid()
+{
+    static thread_local int pidCache = -1;
+    if (pidCache == -1)
+        pidCache = syscall(SYS_gettid);
+    return pidCache;
+}
 
 extern "C" void fastlock_init(struct fastlock *lock)
 {
-    lock->m_lock = 0;
+    lock->m_ticket.m_active = 0;
+    lock->m_ticket.m_avail = 0;
+    lock->m_depth = 0;
+    lock->m_pidOwner = -1;
 }
 
 extern "C" void fastlock_lock(struct fastlock *lock)
 {
-    while (!__sync_bool_compare_and_swap(&lock->m_lock, 0, 1))
+    if ((int)__atomic_load_4(&lock->m_pidOwner, __ATOMIC_ACQUIRE) == gettid())
     {
+        ++lock->m_depth;
+        return;
     }
+
+    unsigned myticket = __atomic_fetch_add(&lock->m_ticket.m_avail, 1, __ATOMIC_RELEASE);
+
+    unsigned cloops = 0;    // unsigned: wrap-around on a very long wait is well defined
+    while (__atomic_load_2(&lock->m_ticket.m_active, __ATOMIC_ACQUIRE) != myticket)
+    {
+        if ((++cloops % (1024*1024)) == 0)  // '%' and '*' share precedence; unparenthesized this was (cloops % 1024) * 1024
+            sched_yield();
+    }
+
+    lock->m_depth = 1;
+    __atomic_store_4(&lock->m_pidOwner, gettid(), __ATOMIC_RELEASE);
+    std::atomic_thread_fence(std::memory_order_acquire);
+}
+
+extern "C" int fastlock_trylock(struct fastlock *lock)
+{
+    if ((int)__atomic_load_4(&lock->m_pidOwner, __ATOMIC_ACQUIRE) == gettid())
+    {
+        ++lock->m_depth;
+        return true;
+    }
+
+    // cheap test
+    if (lock->m_ticket.m_active != lock->m_ticket.m_avail)
+        return false;
+
+    uint16_t active = __atomic_load_2(&lock->m_ticket.m_active, __ATOMIC_RELAXED);
+    uint16_t next = active + 1;
+
+    struct ticket ticket_expect { active, active };
+    struct ticket ticket_setiflocked { active, next };
+    if (__atomic_compare_exchange(&lock->m_ticket, &ticket_expect, &ticket_setiflocked, true /*strong*/, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE))
+    {
+        lock->m_depth = 1;
+        __atomic_store_4(&lock->m_pidOwner, gettid(), __ATOMIC_RELEASE);
+        return true;
+    }
+    return false;
 }
 
 extern "C" void fastlock_unlock(struct fastlock *lock)
 {
-    __sync_bool_compare_and_swap(&lock->m_lock, 1, 0);
+    --lock->m_depth;
+    if (lock->m_depth == 0)
+    {
+        assert((int)__atomic_load_4(&lock->m_pidOwner, __ATOMIC_RELAXED) >= 0);  // unlock after free
+        lock->m_pidOwner = -1;
+        std::atomic_thread_fence(std::memory_order_acquire);
+        __atomic_fetch_add(&lock->m_ticket.m_active, 1, __ATOMIC_ACQ_REL);
+    }
 }
 
 extern "C" void fastlock_free(struct fastlock *lock)
 {
     // NOP
-    (void)lock;
+    assert((lock->m_ticket.m_active == lock->m_ticket.m_avail)                                        // Assert the lock is unlocked
+        || (lock->m_pidOwner == gettid() && (lock->m_ticket.m_active == lock->m_ticket.m_avail-1)));  // OR we own the lock and nobody else is waiting
+    lock->m_pidOwner = -2;  // sentinel value indicating free
+}
+
+
+bool fastlock::fOwnLock()
+{
+    return gettid() == m_pidOwner;
 }
\ No newline at end of file
diff --git a/src/fastlock.h b/src/fastlock.h
index 864c86822..b5a70c530 100644
--- a/src/fastlock.h
+++ b/src/fastlock.h
@@ -1,4 +1,5 @@
 #pragma once
+#include <inttypes.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -8,6 +9,7 @@ extern "C" {
 struct fastlock;
 void fastlock_init(struct fastlock *lock);
 void fastlock_lock(struct fastlock *lock);
+int fastlock_trylock(struct fastlock *lock);
 void fastlock_unlock(struct fastlock *lock);
 void fastlock_free(struct fastlock *lock);
 
@@ -16,19 +18,39 @@ void fastlock_free(struct fastlock *lock);
 }
 #endif
 
+struct ticket
+{
+    uint16_t m_active;
+    uint16_t m_avail;
+};
 struct fastlock
 {
-    int m_lock;
+    volatile struct ticket m_ticket;
+
+    volatile int m_pidOwner;
+    volatile int m_depth;
 
 #ifdef __cplusplus
+    fastlock()
+    {
+        fastlock_init(this);
+    }
+
     void lock()
     {
         fastlock_lock(this);
     }
 
+    bool try_lock()
+    {
+        return !!fastlock_trylock(this);
+    }
+
     void unlock()
     {
         fastlock_unlock(this);
     }
+
+    bool fOwnLock();   // true if this thread owns the lock, NOTE: not 100% reliable, use for debugging only
 #endif
 };
diff --git a/src/fmacros.h b/src/fmacros.h
index 3b1bc5eb8..a56bb9331 100644
--- a/src/fmacros.h
+++ b/src/fmacros.h
@@ -30,13 +30,11 @@
 #ifndef _REDIS_FMACRO_H
 #define _REDIS_FMACRO_H
 
-#define _BSD_SOURCE
+#define _DEFAULT_SOURCE 1
 
 #if defined(__linux__)
-#ifndef __cplusplus
-#define _GNU_SOURCE
-#define _DEFAULT_SOURCE
-#endif
+#define _GNU_SOURCE 1
+#define _DEFAULT_SOURCE 1
 #endif
 
 #if defined(_AIX)
diff --git a/src/intset.h b/src/intset.h
index 6849abff9..9bf172d5d 100644
--- a/src/intset.h
+++ b/src/intset.h
@@ -41,7 +41,9 @@
 typedef struct intset {
     uint32_t encoding;
     uint32_t length;
-    int8_t contents[ZERO_LENGTH_ARRAY_LENGTH];
+#ifndef __cplusplus
+    int8_t contents[];
+#endif
 } intset;
 
 intset *intsetNew(void);
diff --git a/src/module.c b/src/module.c
index 45fa4e293..54a36e0c1 100644
--- a/src/module.c
+++ b/src/module.c
@@ -484,6 +484,7 @@ void moduleFreeContext(RedisModuleCtx *ctx) {
  * details needed to correctly replicate commands. */
 void moduleHandlePropagationAfterCommandCallback(RedisModuleCtx *ctx) {
     client *c = ctx->client;
+    serverAssert(aeThreadOwnsLock());
 
     if (c->flags & CLIENT_LUA) return;
 
@@ -2696,7 +2697,7 @@ RedisModuleCallReply *RM_Call(RedisModuleCtx *ctx, const char *cmdname, const ch
 
     /* Create the client and dispatch the command. */
     va_start(ap, fmt);
-    c = createClient(-1);
+    c = createClient(-1, IDX_EVENT_LOOP_MAIN);
     c->puser = NULL; /* Root user. */
     argv = moduleCreateArgvFromUserFormat(cmdname,fmt,&argc,&flags,ap);
     replicate = flags & REDISMODULE_ARGV_REPLICATE;
@@ -3546,7 +3547,7 @@ RedisModuleBlockedClient *RM_BlockClient(RedisModuleCtx *ctx, RedisModuleCmdFunc
     bc->disconnect_callback = NULL; /* Set by RM_SetDisconnectCallback() */
     bc->free_privdata = free_privdata;
     bc->privdata = NULL;
-    bc->reply_client = createClient(-1);
+    bc->reply_client = createClient(-1, IDX_EVENT_LOOP_MAIN);
     bc->reply_client->flags |= CLIENT_MODULE;
     bc->dbid = c->db->id;
     c->bpop.timeout = timeout_ms ? (mstime()+timeout_ms) : 0;
@@ -3623,6 +3624,7 @@ void RM_SetDisconnectCallback(RedisModuleBlockedClient *bc, RedisModuleDisconnec
 void moduleHandleBlockedClients(void) {
     listNode *ln;
     RedisModuleBlockedClient *bc;
+    serverAssert(aeThreadOwnsLock());
 
     pthread_mutex_lock(&moduleUnblockedClientsMutex);
     /* Here we unblock all the pending clients blocked in modules operations
@@ -3633,9 +3635,16 @@ void moduleHandleBlockedClients(void) {
         ln = listFirst(moduleUnblockedClients);
         bc = ln->value;
         client *c = bc->client;
+        serverAssert(c->iel == IDX_EVENT_LOOP_MAIN);
         listDelNode(moduleUnblockedClients,ln);
         pthread_mutex_unlock(&moduleUnblockedClientsMutex);
 
+        if (c)
+        {
+            AssertCorrectThread(c);
+            fastlock_lock(&c->lock);
+        }
+
         /* Release the lock during the loop, as long as we don't
          * touch the shared list. */
 
@@ -3692,13 +3701,15 @@ void moduleHandleBlockedClients(void) {
                 !(c->flags & CLIENT_PENDING_WRITE))
             {
                 c->flags |= CLIENT_PENDING_WRITE;
-                listAddNodeHead(server.clients_pending_write,c);
+                AssertCorrectThread(c);
+                listAddNodeHead(server.rgthreadvar[c->iel].clients_pending_write,c);
             }
         }
 
         /* Free 'bc' only after unblocking the client, since it is
          * referenced in the client blocking context, and must be valid
          * when calling unblockClient(). */
+        fastlock_unlock(&c->lock);
         zfree(bc);
 
         /* Lock again before to iterate the loop. */
@@ -3794,7 +3805,7 @@ RedisModuleCtx *RM_GetThreadSafeContext(RedisModuleBlockedClient *bc) {
      * access it safely from another thread, so we create a fake client here
      * in order to keep things like the currently selected database and similar
      * things. */
-    ctx->client = createClient(-1);
+    ctx->client = createClient(-1, IDX_EVENT_LOOP_MAIN);
     if (bc) selectDb(ctx->client,bc->dbid);
     return ctx;
 }
@@ -4300,7 +4311,7 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod
         if (memcmp(ri.key,&key,sizeof(key)) == 0) {
             /* This is the first key, we need to re-install the timer according
              * to the just added event. */
-            aeDeleteTimeEvent(server.el,aeTimer);
+            aeDeleteTimeEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,aeTimer);
             aeTimer = -1;
         }
         raxStop(&ri);
@@ -4309,7 +4320,7 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod
     /* If we have no main timer (the old one was invalidated, or this is the
      * first module timer we have), install one. */
     if (aeTimer == -1)
-        aeTimer = aeCreateTimeEvent(server.el,period,moduleTimerHandler,NULL,NULL);
+        aeTimer = aeCreateTimeEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,period,moduleTimerHandler,NULL,NULL);
 
     return key;
 }
@@ -4659,7 +4670,7 @@ void moduleInitModulesSystem(void) {
 
     /* Set up the keyspace notification susbscriber list and static client */
     moduleKeyspaceSubscribers = listCreate();
-    moduleFreeContextReusedClient = createClient(-1);
+    moduleFreeContextReusedClient = createClient(-1, IDX_EVENT_LOOP_MAIN);
     moduleFreeContextReusedClient->flags |= CLIENT_MODULE;
     moduleFreeContextReusedClient->puser = NULL; /* root user. */
 
diff --git a/src/multi.c b/src/multi.c
index 6d722b8af..4f7711f6c 100644
--- a/src/multi.c
+++ b/src/multi.c
@@ -72,6 +72,7 @@ void queueMultiCommand(client *c) {
 }
 
 void discardTransaction(client *c) {
+    serverAssert(aeThreadOwnsLock());
     freeClientMultiState(c);
     initClientMultiState(c);
     c->flags &= ~(CLIENT_MULTI|CLIENT_DIRTY_CAS|CLIENT_DIRTY_EXEC);
@@ -81,11 +82,13 @@ void discardTransaction(client *c) {
 /* Flag the transacation as DIRTY_EXEC so that EXEC will fail.
  * Should be called every time there is an error while queueing a command. */
 void flagTransaction(client *c) {
+    serverAssert(aeThreadOwnsLock());
     if (c->flags & CLIENT_MULTI)
         c->flags |= CLIENT_DIRTY_EXEC;
 }
 
 void multiCommand(client *c) {
+    serverAssert(aeThreadOwnsLock());
     if (c->flags & CLIENT_MULTI) {
         addReplyError(c,"MULTI calls can not be nested");
         return;
@@ -291,6 +294,7 @@ void unwatchAllKeys(client *c) {
 /* "Touch" a key, so that if this key is being WATCHed by some client the
  * next EXEC will fail. */
 void touchWatchedKey(redisDb *db, robj *key) {
+    serverAssert(aeThreadOwnsLock());
     list *clients;
     listIter li;
     listNode *ln;
@@ -316,6 +320,7 @@ void touchWatchedKey(redisDb *db, robj *key) {
 void touchWatchedKeysOnFlush(int dbid) {
     listIter li1, li2;
     listNode *ln;
+    serverAssert(aeThreadOwnsLock());
 
     /* For every client, check all the waited keys */
     listRewind(server.clients,&li1);
@@ -350,6 +355,7 @@ void watchCommand(client *c) {
 
 void unwatchCommand(client *c) {
     unwatchAllKeys(c);
+    serverAssert(aeThreadOwnsLock());
     c->flags &= (~CLIENT_DIRTY_CAS);
     addReply(c,shared.ok);
 }
diff --git a/src/networking.c b/src/networking.cpp
similarity index 77%
rename from src/networking.c
rename to src/networking.cpp
index 1c917af2a..2cddccf92 100644
--- a/src/networking.c
+++ b/src/networking.cpp
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
+ * Copyright (c) 2019 John Sully <john at eqalpha dot com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -32,8 +33,67 @@
 #include <sys/uio.h>
 #include <math.h>
 #include <ctype.h>
+#include <vector>
+#include <mutex>
 
 static void setProtocolError(const char *errstr, client *c);
+void addReplyLongLongWithPrefixCore(client *c, long long ll, char prefix, bool fAsync);
+void addReplyBulkCStringCore(client *c, const char *s, bool fAsync);
+
+class AeLocker
+{
+    bool m_fArmed = false;
+
+public:
+    AeLocker()
+    {
+    }
+
+    // Acquire the global AE lock for this guard.
+    // If a client is passed it must already be locked by the calling thread; on
+    // return both the global lock and the client lock are held.
+    void arm(client *c)
+    {
+        if (c != nullptr)
+        {
+            serverAssert(!m_fArmed);
+            serverAssert(c->lock.fOwnLock());
+
+            // The client lock is dropped before blocking on the global lock and is
+            // retaken with try_lock() afterwards (presumably to avoid a lock-order deadlock).
+            bool fClientLocked = true;
+            while (!fClientLocked || !aeTryAcquireLock())   // once the client lock was dropped, only leave via the break below
+            {
+                if (fClientLocked) c->lock.unlock();
+                fClientLocked = false;
+                aeAcquireLock();
+                if (c->lock.try_lock())
+                    break;          // both locks held
+                aeReleaseLock();    // client lock busy: release the global lock and retry
+            }
+
+            m_fArmed = true;
+        }
+        else if (!m_fArmed)
+        {
+            m_fArmed = true;
+            aeAcquireLock();
+        }
+    }
+
+    void disarm()
+    {
+        serverAssert(m_fArmed);
+        m_fArmed = false;
+        aeReleaseLock();
+    }
+
+    ~AeLocker()
+    {
+        if (m_fArmed)
+            aeReleaseLock();
+    }
+};
 
 /* Return the size consumed from the allocator, for the specified SDS string,
  * including internal fragmentation. This function is used in order to compute
@@ -48,7 +108,7 @@ size_t sdsZmallocSize(sds s) {
 size_t getStringObjectSdsUsedMemory(robj *o) {
     serverAssertWithInfo(NULL,o,o->type == OBJ_STRING);
     switch(o->encoding) {
-    case OBJ_ENCODING_RAW: return sdsZmallocSize(ptrFromObj(o));
+    case OBJ_ENCODING_RAW: return sdsZmallocSize((sds)ptrFromObj(o));
     case OBJ_ENCODING_EMBSTR: return zmalloc_size(o)-sizeof(robj);
     default: return 0; /* Just integer encoding for now. */
     }
@@ -56,8 +116,8 @@ size_t getStringObjectSdsUsedMemory(robj *o) {
 
 /* Client.reply list dup and free methods. */
 void *dupClientReplyValue(void *o) {
-    clientReplyBlock *old = o;
-    clientReplyBlock *buf = zmalloc(sizeof(clientReplyBlock) + old->size, MALLOC_LOCAL);
+    clientReplyBlock *old = (clientReplyBlock*)o;
+    clientReplyBlock *buf = (clientReplyBlock*)zmalloc(sizeof(clientReplyBlock) + old->size, MALLOC_LOCAL);
     memcpy(buf, o, sizeof(clientReplyBlock) + old->size);
     return buf;
 }
@@ -67,7 +127,7 @@ void freeClientReplyValue(void *o) {
 }
 
 int listMatchObjects(void *a, void *b) {
-    return equalStringObjects(a,b);
+    return equalStringObjects((robj*)a,(robj*)b);
 }
 
 /* This function links the client to the global linked list of clients.
@@ -82,9 +142,10 @@ void linkClient(client *c) {
     raxInsert(server.clients_index,(unsigned char*)&id,sizeof(id),c,NULL);
 }
 
-client *createClient(int fd) {
-    client *c = zmalloc(sizeof(client), MALLOC_LOCAL);
+client *createClient(int fd, int iel) {
+    client *c = (client*)zmalloc(sizeof(client), MALLOC_LOCAL);
 
+    c->iel = iel;
     /* passing -1 as fd it is possible to create a non connected client.
      * This is useful since all the commands needs to be executed
      * in the context of a client. When commands are executed in other
@@ -94,7 +155,7 @@ client *createClient(int fd) {
         anetEnableTcpNoDelay(NULL,fd);
         if (server.tcpkeepalive)
             anetKeepAlive(NULL,fd,server.tcpkeepalive);
-        if (aeCreateFileEvent(server.el,fd,AE_READABLE,
+        if (aeCreateFileEvent(server.rgthreadvar[iel].el,fd,AE_READABLE|AE_READ_THREADSAFE,
             readQueryFromClient, c) == AE_ERR)
         {
             close(fd);
@@ -106,6 +167,8 @@ client *createClient(int fd) {
     selectDb(c,0);
     uint64_t client_id;
     atomicGetIncr(server.next_client_id,client_id,1);
+    c->iel = iel;
+    fastlock_init(&c->lock);
     c->id = client_id;
     c->resp = 2;
     c->fd = fd;
@@ -123,7 +186,9 @@ client *createClient(int fd) {
     c->multibulklen = 0;
     c->bulklen = -1;
     c->sentlen = 0;
+    c->sentlenAsync = 0;
     c->flags = 0;
+    c->fPendingAsyncWrite = FALSE;
     c->ctime = c->lastinteraction = server.unixtime;
     /* If the default user does not require authentication, the user is
      * directly authenticated. */
@@ -157,10 +222,15 @@ client *createClient(int fd) {
     c->pubsub_patterns = listCreate();
     c->peerid = NULL;
     c->client_list_node = NULL;
+    c->bufAsync = NULL;
+    c->buflenAsync = 0;
+    c->bufposAsync = 0;
+
     listSetFreeMethod(c->pubsub_patterns,decrRefCountVoid);
     listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
     if (fd != -1) linkClient(c);
     initClientMultiState(c);
+    AssertCorrectThread(c);
     return c;
 }
 
@@ -179,6 +249,8 @@ void clientInstallWriteHandler(client *c) {
         (c->replstate == REPL_STATE_NONE ||
          (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack)))
     {
+        AssertCorrectThread(c);
+        serverAssert(c->lock.fOwnLock());
         /* Here instead of installing the write handler, we just flag the
          * client and put it into a list of clients that have something
          * to write to the socket. This way before re-entering the event
@@ -186,7 +258,15 @@ void clientInstallWriteHandler(client *c) {
          * a system call. We'll only really install the write handler if
          * we'll not be able to write the whole reply at once. */
         c->flags |= CLIENT_PENDING_WRITE;
-        listAddNodeHead(server.clients_pending_write,c);
+        listAddNodeHead(server.rgthreadvar[c->iel].clients_pending_write,c);
+    }
+}
+
+void clientInstallAsyncWriteHandler(client *c) {
+    serverAssert(aeThreadOwnsLock());
+    if (!(c->fPendingAsyncWrite)) {
+        c->fPendingAsyncWrite = TRUE;
+        listAddNodeHead(serverTL->clients_pending_asyncwrite,c);
     }
 }
 
@@ -212,7 +292,11 @@ void clientInstallWriteHandler(client *c) {
  * Typically gets called every time a reply is built, before adding more
  * data to the clients output buffers. If the function returns C_ERR no
  * data should be appended to the output buffers. */
-int prepareClientToWrite(client *c) {
+int prepareClientToWrite(client *c, bool fAsync) {
+    fAsync = fAsync && !FCorrectThread(c);  // Not async if we're on the right thread
+    serverAssert(!fAsync || aeThreadOwnsLock());
+    serverAssert(c->lock.fOwnLock());
+
     /* If it's the Lua client we always return ok without installing any
      * handler since there is no socket at all. */
     if (c->flags & (CLIENT_LUA|CLIENT_MODULE)) return C_OK;
@@ -229,7 +313,8 @@ int prepareClientToWrite(client *c) {
 
     /* Schedule the client to write the output buffers to the socket, unless
      * it should already be setup to do so (it has already pending data). */
-    if (!clientHasPendingReplies(c)) clientInstallWriteHandler(c);
+    if (!fAsync && !clientHasPendingReplies(c)) clientInstallWriteHandler(c);
+    if (fAsync && !(c->fPendingAsyncWrite)) clientInstallAsyncWriteHandler(c);
 
     /* Authorize the caller to queue in the output buffer of this client. */
     return C_OK;
@@ -239,28 +324,46 @@ int prepareClientToWrite(client *c) {
  * Low level functions to add more data to output buffers.
  * -------------------------------------------------------------------------- */
 
-int _addReplyToBuffer(client *c, const char *s, size_t len) {
-    size_t available = sizeof(c->buf)-c->bufpos;
-
+int _addReplyToBuffer(client *c, const char *s, size_t len, bool fAsync) {
     if (c->flags & CLIENT_CLOSE_AFTER_REPLY) return C_OK;
 
-    /* If there already are entries in the reply list, we cannot
-     * add anything more to the static buffer. */
-    if (listLength(c->reply) > 0) return C_ERR;
+    fAsync = fAsync && !FCorrectThread(c);  // Not async if we're on the right thread
+    if (fAsync)
+    {
+        serverAssert(aeThreadOwnsLock());
+        if ((c->buflenAsync - c->bufposAsync) < (int)len)
+        {
+            int minsize = len + c->bufposAsync;
+            c->buflenAsync = std::max(minsize, c->buflenAsync*2);  // grow geometrically; the old "*2 - buflenAsync" term equalled the current size and never won the max
+            c->bufAsync = (char*)zrealloc(c->bufAsync, c->buflenAsync, MALLOC_LOCAL);
+            c->buflenAsync = zmalloc_usable(c->bufAsync);
+        }
+        memcpy(c->bufAsync+c->bufposAsync,s,len);
+        c->bufposAsync += len;
+    }
+    else
+    {
+        size_t available = sizeof(c->buf)-c->bufpos;
 
-    /* Check that the buffer has enough space available for this string. */
-    if (len > available) return C_ERR;
+        /* If there already are entries in the reply list, we cannot
+        * add anything more to the static buffer. */
+        if (listLength(c->reply) > 0) return C_ERR;
 
-    memcpy(c->buf+c->bufpos,s,len);
-    c->bufpos+=len;
+        /* Check that the buffer has enough space available for this string. */
+        if (len > available) return C_ERR;
+
+        memcpy(c->buf+c->bufpos,s,len);
+        c->bufpos+=len;
+    }
     return C_OK;
 }
 
 void _addReplyProtoToList(client *c, const char *s, size_t len) {
     if (c->flags & CLIENT_CLOSE_AFTER_REPLY) return;
+    AssertCorrectThread(c);
 
     listNode *ln = listLast(c->reply);
-    clientReplyBlock *tail = ln? listNodeValue(ln): NULL;
+    clientReplyBlock *tail = (clientReplyBlock*) (ln? listNodeValue(ln): NULL);
 
     /* Note that 'tail' may be NULL even if we have a tail node, becuase when
      * addDeferredMultiBulkLength() is used, it sets a dummy node to NULL just
@@ -272,7 +375,7 @@ void _addReplyProtoToList(client *c, const char *s, size_t len) {
          * new node */
         size_t avail = tail->size - tail->used;
         size_t copy = avail >= len? len: avail;
-        memcpy(tail->buf + tail->used, s, copy);
+        memcpy(tail->buf() + tail->used, s, copy);
         tail->used += copy;
         s += copy;
         len -= copy;
@@ -281,11 +384,11 @@ void _addReplyProtoToList(client *c, const char *s, size_t len) {
         /* Create a new node, make sure it is allocated to at
          * least PROTO_REPLY_CHUNK_BYTES */
         size_t size = len < PROTO_REPLY_CHUNK_BYTES? PROTO_REPLY_CHUNK_BYTES: len;
-        tail = zmalloc(size + sizeof(clientReplyBlock), MALLOC_LOCAL);
+        tail = (clientReplyBlock*)zmalloc(size + sizeof(clientReplyBlock), MALLOC_LOCAL);
         /* take over the allocation's internal fragmentation */
         tail->size = zmalloc_usable(tail) - sizeof(clientReplyBlock);
         tail->used = len;
-        memcpy(tail->buf, s, len);
+        memcpy(tail->buf(), s, len);
         listAddNodeTail(c->reply, tail);
         c->reply_bytes += tail->size;
     }
@@ -296,40 +399,56 @@ void _addReplyProtoToList(client *c, const char *s, size_t len) {
  * Higher level functions to queue data on the client output buffer.
  * The following functions are the ones that commands implementations will call.
  * -------------------------------------------------------------------------- */
-
-/* Add the object 'obj' string representation to the client output buffer. */
-void addReply(client *c, robj *obj) {
-    if (prepareClientToWrite(c) != C_OK) return;
+void addReplyCore(client *c, robj *obj, bool fAsync) {
+    if (prepareClientToWrite(c, fAsync) != C_OK) return;
 
     if (sdsEncodedObject(obj)) {
-        if (_addReplyToBuffer(c,ptrFromObj(obj),sdslen(ptrFromObj(obj))) != C_OK)
-            _addReplyProtoToList(c,ptrFromObj(obj),sdslen(ptrFromObj(obj)));
+        if (_addReplyToBuffer(c,(const char*)ptrFromObj(obj),sdslen((sds)ptrFromObj(obj)),fAsync) != C_OK)
+            _addReplyProtoToList(c,(const char*)ptrFromObj(obj),sdslen((sds)ptrFromObj(obj)));
     } else if (obj->encoding == OBJ_ENCODING_INT) {
         /* For integer encoded strings we just convert it into a string
          * using our optimized function, and attach the resulting string
          * to the output buffer. */
         char buf[32];
         size_t len = ll2string(buf,sizeof(buf),(long)ptrFromObj(obj));
-        if (_addReplyToBuffer(c,buf,len) != C_OK)
+        if (_addReplyToBuffer(c,buf,len,fAsync) != C_OK)
             _addReplyProtoToList(c,buf,len);
     } else {
         serverPanic("Wrong obj->encoding in addReply()");
     }
 }
 
+/* Add the object 'obj' string representation to the client output buffer. */
+void addReply(client *c, robj *obj)
+{
+    addReplyCore(c, obj, false);
+}
+void addReplyAsync(client *c, robj *obj)
+{
+    addReplyCore(c, obj, true);
+}
+
 /* Add the SDS 's' string to the client output buffer, as a side effect
  * the SDS string is freed. */
-void addReplySds(client *c, sds s) {
-    if (prepareClientToWrite(c) != C_OK) {
+void addReplySdsCore(client *c, sds s, bool fAsync) {
+    if (prepareClientToWrite(c, fAsync) != C_OK) {
         /* The caller expects the sds to be free'd. */
         sdsfree(s);
         return;
     }
-    if (_addReplyToBuffer(c,s,sdslen(s)) != C_OK)
+    if (_addReplyToBuffer(c,s,sdslen(s), fAsync) != C_OK)
         _addReplyProtoToList(c,s,sdslen(s));
     sdsfree(s);
 }
 
+void addReplySds(client *c, sds s) {
+    addReplySdsCore(c, s, false);
+}
+
+void addReplySdsAsync(client *c, sds s) {
+    addReplySdsCore(c, s, true);
+}
+
 /* This low level function just adds whatever protocol you send it to the
  * client buffer, trying the static buffer initially, and using the string
  * of objects if not possible.
@@ -338,12 +457,20 @@ void addReplySds(client *c, sds s) {
  * if not needed. The object will only be created by calling
  * _addReplyProtoToList() if we fail to extend the existing tail object
  * in the list of objects. */
-void addReplyProto(client *c, const char *s, size_t len) {
-    if (prepareClientToWrite(c) != C_OK) return;
-    if (_addReplyToBuffer(c,s,len) != C_OK)
+void addReplyProtoCore(client *c, const char *s, size_t len, bool fAsync) {
+    if (prepareClientToWrite(c, fAsync) != C_OK) return;
+    if (_addReplyToBuffer(c,s,len,fAsync) != C_OK)
         _addReplyProtoToList(c,s,len);
 }
 
+void addReplyProto(client *c, const char *s, size_t len) {
+    addReplyProtoCore(c, s, len, false);
+}
+
+void addReplyProtoAsync(client *c, const char *s, size_t len) {
+    addReplyProtoCore(c, s, len, true);
+}
+
 /* Low level function called by the addReplyError...() functions.
  * It emits the protocol for a Redis error, in the form:
  *
@@ -352,12 +479,12 @@ void addReplyProto(client *c, const char *s, size_t len) {
  * If the error code is already passed in the string 's', the error
  * code provided is used, otherwise the string "-ERR " for the generic
  * error code is automatically added. */
-void addReplyErrorLength(client *c, const char *s, size_t len) {
+void addReplyErrorLengthCore(client *c, const char *s, size_t len, bool fAsync) {
     /* If the string already starts with "-..." then the error code
      * is provided by the caller. Otherwise we use "-ERR". */
-    if (!len || s[0] != '-') addReplyProto(c,"-ERR ",5);
-    addReplyProto(c,s,len);
-    addReplyProto(c,"\r\n",2);
+    if (!len || s[0] != '-') addReplyProtoCore(c,"-ERR ",5,fAsync);
+    addReplyProtoCore(c,s,len,fAsync);
+    addReplyProtoCore(c,"\r\n",2,fAsync);
 
     /* Sometimes it could be normal that a slave replies to a master with
      * an error and this function gets called. Actually the error will never
@@ -370,17 +497,26 @@ void addReplyErrorLength(client *c, const char *s, size_t len) {
      * will produce an error. However it is useful to log such events since
      * they are rare and may hint at errors in a script or a bug in Redis. */
     if (c->flags & (CLIENT_MASTER|CLIENT_SLAVE) && !(c->flags & CLIENT_MONITOR)) {
-        char* to = c->flags & CLIENT_MASTER? "master": "replica";
-        char* from = c->flags & CLIENT_MASTER? "replica": "master";
-        char *cmdname = c->lastcmd ? c->lastcmd->name : "<unknown>";
+        const char* to = reinterpret_cast<const char*>(c->flags & CLIENT_MASTER? "master": "replica");
+        const char* from = reinterpret_cast<const char*>(c->flags & CLIENT_MASTER? "replica": "master");
+        const char *cmdname = reinterpret_cast<const char*>(c->lastcmd ? c->lastcmd->name : "<unknown>");
         serverLog(LL_WARNING,"== CRITICAL == This %s is sending an error "
                              "to its %s: '%s' after processing the command "
                              "'%s'", from, to, s, cmdname);
     }
 }
 
+void addReplyErrorLength(client *c, const char *s, size_t len)
+{
+    addReplyErrorLengthCore(c, s, len, false);
+}
+
 void addReplyError(client *c, const char *err) {
-    addReplyErrorLength(c,err,strlen(err));
+    addReplyErrorLengthCore(c,err,strlen(err), false);
+}
+
+void addReplyErrorAsync(client *c, const char *err) {
+    addReplyErrorLengthCore(c, err, strlen(err), true);
 }
 
 void addReplyErrorFormat(client *c, const char *fmt, ...) {
@@ -424,11 +560,18 @@ void *addReplyDeferredLen(client *c) {
     /* Note that we install the write event here even if the object is not
      * ready to be sent, since we are sure that before returning to the
      * event loop setDeferredAggregateLen() will be called. */
-    if (prepareClientToWrite(c) != C_OK) return NULL;
+    if (prepareClientToWrite(c, false) != C_OK) return NULL;
     listAddNodeTail(c->reply,NULL); /* NULL is our placeholder. */
     return listLast(c->reply);
 }
 
+void *addReplyDeferredLenAsync(client *c) {
+    if (FCorrectThread(c))
+        return addReplyDeferredLen(c); /* FCorrectThread(c) holds: use the regular c->reply placeholder node */
+    /* Otherwise return the current async-buffer write offset disguised as a pointer (not dereferenceable); setDeferredAggregateLenAsync() casts it back to a splice index. */
+    return (void*)((ssize_t)c->bufposAsync);
+}
+
 /* Populate the length object and try gluing it to the next chunk. */
 void setDeferredAggregateLen(client *c, void *node, long length, char prefix) {
     listNode *ln = (listNode*)node;
@@ -451,30 +594,57 @@ void setDeferredAggregateLen(client *c, void *node, long length, char prefix) {
      * - The next node is non-NULL,
      * - It has enough room already allocated
      * - And not too large (avoid large memmove) */
-    if (ln->next != NULL && (next = listNodeValue(ln->next)) &&
+    if (ln->next != NULL && (next = (clientReplyBlock*)listNodeValue(ln->next)) &&
         next->size - next->used >= lenstr_len &&
         next->used < PROTO_REPLY_CHUNK_BYTES * 4) {
-        memmove(next->buf + lenstr_len, next->buf, next->used);
-        memcpy(next->buf, lenstr, lenstr_len);
+        memmove(next->buf() + lenstr_len, next->buf(), next->used);
+        memcpy(next->buf(), lenstr, lenstr_len);
         next->used += lenstr_len;
         listDelNode(c->reply,ln);
     } else {
         /* Create a new node */
-        clientReplyBlock *buf = zmalloc(lenstr_len + sizeof(clientReplyBlock), MALLOC_LOCAL);
+        clientReplyBlock *buf = (clientReplyBlock*)zmalloc(lenstr_len + sizeof(clientReplyBlock), MALLOC_LOCAL);
         /* Take over the allocation's internal fragmentation */
         buf->size = zmalloc_usable(buf) - sizeof(clientReplyBlock);
         buf->used = lenstr_len;
-        memcpy(buf->buf, lenstr, lenstr_len);
+        memcpy(buf->buf(), lenstr, lenstr_len);
         listNodeValue(ln) = buf;
         c->reply_bytes += buf->size;
     }
     asyncCloseClientOnOutputBufferLimitReached(c);
 }
 
+void setDeferredAggregateLenAsync(client *c, void *node, long length, char prefix)
+{
+    if (FCorrectThread(c)) {
+        setDeferredAggregateLen(c, node, length, prefix);
+        return;
+    }
+    /* Here 'node' is not a list node: it is the bufAsync offset handed out by addReplyDeferredLenAsync(). */
+    char lenstr[128];
+    int lenstr_len = snprintf(lenstr, sizeof(lenstr), "%c%ld\r\n", prefix, length);
+    /* Splice the "<prefix><length>\r\n" header into bufAsync at that offset, growing the buffer first if needed. */
+    ssize_t idxSplice = (ssize_t)node;
+    serverAssert(idxSplice <= c->bufposAsync);
+    if (c->buflenAsync < (c->bufposAsync + lenstr_len))
+    {
+        c->buflenAsync = std::max((int)(c->bufposAsync+lenstr_len), c->buflenAsync*2); /* at least double, so repeated splices amortize */
+        c->bufAsync = (char*)zrealloc(c->bufAsync, c->buflenAsync, MALLOC_LOCAL);
+    }
+    /* Shift everything written after the placeholder to the right, then copy the header into the gap. */
+    memmove(c->bufAsync + idxSplice + lenstr_len, c->bufAsync + idxSplice, c->bufposAsync - idxSplice);
+    memcpy(c->bufAsync + idxSplice, lenstr, lenstr_len);
+    c->bufposAsync += lenstr_len;
+}
+
 void setDeferredArrayLen(client *c, void *node, long length) {
     setDeferredAggregateLen(c,node,length,'*');
 }
 
+void setDeferredArrayLenAsync(client *c, void *node, long length) {
+    setDeferredAggregateLenAsync(c, node, length, '*');
+}
+
 void setDeferredMapLen(client *c, void *node, long length) {
     int prefix = c->resp == 2 ? '*' : '%';
     if (c->resp == 2) length *= 2;
@@ -498,15 +668,15 @@ void setDeferredPushLen(client *c, void *node, long length) {
 }
 
 /* Add a double as a bulk reply */
-void addReplyDouble(client *c, double d) {
+void addReplyDoubleCore(client *c, double d, bool fAsync) { /* Emit 'd' as a double reply; fAsync is forwarded to the *Core reply helpers. */
     if (isinf(d)) {
         /* Libc in odd systems (Hi Solaris!) will format infinite in a
          * different way, so better to handle it in an explicit way. */
         if (c->resp == 2) {
-            addReplyBulkCString(c, d > 0 ? "inf" : "-inf");
+            addReplyBulkCStringCore(c, d > 0 ? "inf" : "-inf", fAsync);
         } else {
-            addReplyProto(c, d > 0 ? ",inf\r\n" : "-inf\r\n",
-                              d > 0 ? 6 : 7);
+            addReplyProtoCore(c, d > 0 ? ",inf\r\n" : ",-inf\r\n",
+                              d > 0 ? 6 : 7, fAsync); /* RESP3 doubles start with ','; ",-inf\r\n" is exactly 7 bytes */
         }
     } else {
         char dbuf[MAX_LONG_DOUBLE_CHARS+3],
@@ -515,14 +685,22 @@ void addReplyDouble(client *c, double d) {
         if (c->resp == 2) {
             dlen = snprintf(dbuf,sizeof(dbuf),"%.17g",d);
             slen = snprintf(sbuf,sizeof(sbuf),"$%d\r\n%s\r\n",dlen,dbuf);
-            addReplyProto(c,sbuf,slen);
+            addReplyProtoCore(c,sbuf,slen,fAsync);
         } else {
             dlen = snprintf(dbuf,sizeof(dbuf),",%.17g\r\n",d);
-            addReplyProto(c,dbuf,dlen);
+            addReplyProtoCore(c,dbuf,dlen,fAsync);
         }
     }
 }
 
+void addReplyDouble(client *c, double d) {
+    addReplyDoubleCore(c, d, false);
+}
+
+void addReplyDoubleAsync(client *c, double d) {
+    addReplyDoubleCore(c, d, true);
+}
+
 /* Add a long double as a bulk reply, but uses a human readable formatting
  * of the double instead of exposing the crude behavior of doubles to the
  * dear user. */
@@ -542,7 +720,7 @@ void addReplyHumanLongDouble(client *c, long double d) {
 
 /* Add a long long as integer reply or bulk len / multi bulk count.
  * Basically this is used to output <prefix><long long><crlf>. */
-void addReplyLongLongWithPrefix(client *c, long long ll, char prefix) {
+void addReplyLongLongWithPrefixCore(client *c, long long ll, char prefix, bool fAsync) {
     char buf[128];
     int len;
 
@@ -550,10 +728,10 @@ void addReplyLongLongWithPrefix(client *c, long long ll, char prefix) {
      * so we have a few shared objects to use if the integer is small
      * like it is most of the times. */
     if (prefix == '*' && ll < OBJ_SHARED_BULKHDR_LEN && ll >= 0) {
-        addReply(c,shared.mbulkhdr[ll]);
+        addReplyCore(c,shared.mbulkhdr[ll], fAsync);
         return;
     } else if (prefix == '$' && ll < OBJ_SHARED_BULKHDR_LEN && ll >= 0) {
-        addReply(c,shared.bulkhdr[ll]);
+        addReplyCore(c,shared.bulkhdr[ll], fAsync);
         return;
     }
 
@@ -561,33 +739,65 @@ void addReplyLongLongWithPrefix(client *c, long long ll, char prefix) {
     len = ll2string(buf+1,sizeof(buf)-1,ll);
     buf[len+1] = '\r';
     buf[len+2] = '\n';
-    addReplyProto(c,buf,len+3);
+    addReplyProtoCore(c,buf,len+3, fAsync);
+}
+
+void addReplyLongLongWithPrefix(client *c, long long ll, char prefix) {
+    addReplyLongLongWithPrefixCore(c, ll, prefix, false);
+}
+
+void addReplyLongLongCore(client *c, long long ll, bool fAsync) {
+    if (ll == 0)
+        addReplyCore(c,shared.czero, fAsync);
+    else if (ll == 1)
+        addReplyCore(c,shared.cone, fAsync);
+    else
+        addReplyLongLongWithPrefixCore(c,ll,':', fAsync);
 }
 
 void addReplyLongLong(client *c, long long ll) {
-    if (ll == 0)
-        addReply(c,shared.czero);
-    else if (ll == 1)
-        addReply(c,shared.cone);
+    addReplyLongLongCore(c, ll, false);
+}
+
+void addReplyLongLongAsync(client *c, long long ll) {
+    addReplyLongLongCore(c, ll, true);
+}
+
+void addReplyAggregateLenCore(client *c, long length, int prefix, bool fAsync) {
+    if (prefix == '*' && length < OBJ_SHARED_BULKHDR_LEN)
+        addReplyCore(c,shared.mbulkhdr[length], fAsync);
     else
-        addReplyLongLongWithPrefix(c,ll,':');
+        addReplyLongLongWithPrefixCore(c,length,prefix, fAsync);
 }
 
 void addReplyAggregateLen(client *c, long length, int prefix) {
-    if (prefix == '*' && length < OBJ_SHARED_BULKHDR_LEN)
-        addReply(c,shared.mbulkhdr[length]);
-    else
-        addReplyLongLongWithPrefix(c,length,prefix);
+    addReplyAggregateLenCore(c, length, prefix, false);
+}
+
+void addReplyArrayLenCore(client *c, long length, bool fAsync) {
+    addReplyAggregateLenCore(c,length,'*', fAsync);
 }
 
 void addReplyArrayLen(client *c, long length) {
-    addReplyAggregateLen(c,length,'*');
+    addReplyArrayLenCore(c, length, false);
+}
+
+void addReplyArrayLenAsync(client *c, long length) {
+    addReplyArrayLenCore(c, length, true);
+}
+
+void addReplyMapLenCore(client *c, long length, bool fAsync) {
+    int prefix = c->resp == 2 ? '*' : '%';
+    if (c->resp == 2) length *= 2;
+    addReplyAggregateLenCore(c,length,prefix,fAsync);
 }
 
 void addReplyMapLen(client *c, long length) {
-    int prefix = c->resp == 2 ? '*' : '%';
-    if (c->resp == 2) length *= 2;
-    addReplyAggregateLen(c,length,prefix);
+    addReplyMapLenCore(c, length, false);
+}
+
+void addReplyMapLenAsync(client *c, long length) {
+    addReplyMapLenCore(c, length, true);
 }
 
 void addReplySetLen(client *c, long length) {
@@ -601,17 +811,33 @@ void addReplyAttributeLen(client *c, long length) {
     addReplyAggregateLen(c,length,prefix);
 }
 
-void addReplyPushLen(client *c, long length) {
+void addReplyPushLenCore(client *c, long length, bool fAsync) {
     int prefix = c->resp == 2 ? '*' : '>';
-    addReplyAggregateLen(c,length,prefix);
+    addReplyAggregateLenCore(c,length,prefix, fAsync);
+}
+
+void addReplyPushLen(client *c, long length) {
+    addReplyPushLenCore(c, length, false);
+}
+
+void addReplyPushLenAsync(client *c, long length) {
+    addReplyPushLenCore(c, length, true);
+}
+
+void addReplyNullCore(client *c, bool fAsync) {
+    if (c->resp == 2) {
+        addReplyProtoCore(c,"$-1\r\n",5,fAsync);
+    } else {
+        addReplyProtoCore(c,"_\r\n",3,fAsync);
+    }
 }
 
 void addReplyNull(client *c) {
-    if (c->resp == 2) {
-        addReplyProto(c,"$-1\r\n",5);
-    } else {
-        addReplyProto(c,"_\r\n",3);
-    }
+    addReplyNullCore(c, false);
+}
+
+void addReplyNullAsync(client *c) {
+    addReplyNullCore(c, true);
 }
 
 void addReplyBool(client *c, int b) {
@@ -635,61 +861,80 @@ void addReplyNullArray(client *c) {
 }
 
 /* Create the length prefix of a bulk reply, example: $2234 */
-void addReplyBulkLen(client *c, robj *obj) {
-    size_t len;
-
-    if (sdsEncodedObject(obj)) {
-        len = sdslen(ptrFromObj(obj));
-    } else {
-        long n = (long)ptrFromObj(obj);
-
-        /* Compute how many bytes will take this integer as a radix 10 string */
-        len = 1;
-        if (n < 0) {
-            len++;
-            n = -n;
-        }
-        while((n = n/10) != 0) {
-            len++;
-        }
-    }
+void addReplyBulkLenCore(client *c, robj *obj, bool fAsync) {
+    size_t len = stringObjectLen(obj);
 
     if (len < OBJ_SHARED_BULKHDR_LEN)
-        addReply(c,shared.bulkhdr[len]);
+        addReplyCore(c,shared.bulkhdr[len], fAsync);
     else
-        addReplyLongLongWithPrefix(c,len,'$');
+        addReplyLongLongWithPrefixCore(c,len,'$', fAsync);
+}
+
+void addReplyBulkLen(client *c, robj *obj)
+{
+    addReplyBulkLenCore(c, obj, false);
 }
 
 /* Add a Redis Object as a bulk reply */
-void addReplyBulk(client *c, robj *obj) {
-    addReplyBulkLen(c,obj);
-    addReply(c,obj);
-    addReply(c,shared.crlf);
+void addReplyBulkCore(client *c, robj *obj, bool fAsync) {
+    addReplyBulkLenCore(c,obj,fAsync);
+    addReplyCore(c,obj,fAsync);
+    addReplyCore(c,shared.crlf,fAsync);
+}
+
+void addReplyBulk(client *c, robj *obj)
+{
+    addReplyBulkCore(c, obj, false);
+}
+
+void addReplyBulkAsync(client *c, robj *obj)
+{
+    addReplyBulkCore(c, obj, true);
 }
 
 /* Add a C buffer as bulk reply */
+void addReplyBulkCBufferCore(client *c, const void *p, size_t len, bool fAsync) {
+    addReplyLongLongWithPrefixCore(c,len,'$',fAsync);
+    addReplyProtoCore(c,(const char*)p,len,fAsync);
+    addReplyCore(c,shared.crlf,fAsync);
+}
+
 void addReplyBulkCBuffer(client *c, const void *p, size_t len) {
-    addReplyLongLongWithPrefix(c,len,'$');
-    addReplyProto(c,p,len);
-    addReply(c,shared.crlf);
+    addReplyBulkCBufferCore(c, p, len, false);
+}
+
+void addReplyBulkCBufferAsync(client *c, const void *p, size_t len) {
+    addReplyBulkCBufferCore(c, p, len, true);
 }
 
 /* Add sds to reply (takes ownership of sds and frees it) */
-void addReplyBulkSds(client *c, sds s)  {
-    addReplyLongLongWithPrefix(c,sdslen(s),'$');
-    addReplySds(c,s);
-    addReply(c,shared.crlf);
+void addReplyBulkSdsCore(client *c, sds s, bool fAsync)  {
+    addReplyLongLongWithPrefixCore(c,sdslen(s),'$', fAsync);
+    addReplySdsCore(c,s,fAsync);
+    addReplyCore(c,shared.crlf,fAsync);
+}
+
+void addReplyBulkSds(client *c, sds s) {
+    addReplyBulkSdsCore(c, s, false);
+}
+
+void addReplyBulkSdsAsync(client *c, sds s) {
+    addReplyBulkSdsCore(c, s, true);
 }
 
 /* Add a C null term string as bulk reply */
-void addReplyBulkCString(client *c, const char *s) {
+void addReplyBulkCStringCore(client *c, const char *s, bool fAsync) {
     if (s == NULL) {
-        addReplyNull(c);
+        addReplyNullCore(c,fAsync);
     } else {
-        addReplyBulkCBuffer(c,s,strlen(s));
+        addReplyBulkCBufferCore(c,s,strlen(s),fAsync);
     }
 }
 
+void addReplyBulkCString(client *c, const char *s) {
+    addReplyBulkCStringCore(c, s, false);
+}
+
 /* Add a long long as a bulk reply */
 void addReplyBulkLongLong(client *c, long long ll) {
     char buf[64];
@@ -779,9 +1024,9 @@ int clientHasPendingReplies(client *c) {
 }
 
 #define MAX_ACCEPTS_PER_CALL 1000
-static void acceptCommonHandler(int fd, int flags, char *ip) {
+static void acceptCommonHandler(int fd, int flags, char *ip, int iel) {
     client *c;
-    if ((c = createClient(fd)) == NULL) {
+    if ((c = createClient(fd, iel)) == NULL) {
         serverLog(LL_WARNING,
             "Error registering fd event for the new client: %s (fd=%d)",
             strerror(errno),fd);
@@ -793,7 +1038,7 @@ static void acceptCommonHandler(int fd, int flags, char *ip) {
      * for this condition, since now the socket is already set in non-blocking
      * mode and we can send an error for free using the Kernel I/O */
     if (listLength(server.clients) > server.maxclients) {
-        char *err = "-ERR max number of clients reached\r\n";
+        const char *err = "-ERR max number of clients reached\r\n";
 
         /* That's a best effort error message, don't check write errors */
         if (write(c->fd,err,strlen(err)) == -1) {
@@ -815,7 +1060,7 @@ static void acceptCommonHandler(int fd, int flags, char *ip) {
         ip != NULL)
     {
         if (strcmp(ip,"127.0.0.1") && strcmp(ip,"::1")) {
-            char *err =
+            const char *err =
                 "-DENIED Redis is running in protected mode because protected "
                 "mode is enabled, no bind address was specified, no "
                 "authentication password is requested to clients. In this mode "
@@ -852,7 +1097,6 @@ static void acceptCommonHandler(int fd, int flags, char *ip) {
 void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
     int cport, cfd, max = MAX_ACCEPTS_PER_CALL;
     char cip[NET_IP_STR_LEN];
-    UNUSED(el);
     UNUSED(mask);
     UNUSED(privdata);
 
@@ -865,7 +1109,12 @@ void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
             return;
         }
         serverLog(LL_VERBOSE,"Accepted %s:%d", cip, cport);
-        acceptCommonHandler(cfd,0,cip);
+        int ielCur = ielFromEventLoop(el);
+
+        // We always accept on the same thread
+        aeAcquireLock();
+        acceptCommonHandler(cfd,0,cip, ielCur);
+        aeReleaseLock();
     }
 }
 
@@ -883,8 +1132,13 @@ void acceptUnixHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
                     "Accepting client connection: %s", server.neterr);
             return;
         }
+        int ielCur = ielFromEventLoop(el);
         serverLog(LL_VERBOSE,"Accepted connection to %s", server.unixsocket);
-        acceptCommonHandler(cfd,CLIENT_UNIX_SOCKET,NULL);
+
+        aeAcquireLock();
+        acceptCommonHandler(cfd,CLIENT_UNIX_SOCKET,NULL, ielCur);
+        aeReleaseLock();
+        
     }
 }
 
@@ -900,9 +1154,19 @@ static void freeClientArgv(client *c) {
  * when we resync with our own master and want to force all our slaves to
  * resync with us as well. */
 void disconnectSlaves(void) {
-    while (listLength(server.slaves)) {
-        listNode *ln = listFirst(server.slaves);
-        freeClient((client*)ln->value);
+    serverAssert(aeThreadOwnsLock());
+    listIter li;
+    listNode *ln;
+
+    listRewind(server.slaves, &li);
+    while ((ln = listNext(&li))) {
+        client *c = (client*)listNodeValue(ln);
+        if (FCorrectThread(c)) {
+            freeClient(c);
+        }
+        else {
+            freeClientAsync(c);
+        }
     }
 }
 
@@ -911,6 +1175,9 @@ void disconnectSlaves(void) {
  * This is used by freeClient() and replicationCacheMaster(). */
 void unlinkClient(client *c) {
     listNode *ln;
+    AssertCorrectThread(c);
+    serverAssert(aeThreadOwnsLock());
+    serverAssert(c->lock.fOwnLock());
 
     /* If this is marked as current client unset it. */
     if (server.current_client == c) server.current_client = NULL;
@@ -928,32 +1195,51 @@ void unlinkClient(client *c) {
         }
 
         /* Unregister async I/O handlers and close the socket. */
-        aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
-        aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
+        aeDeleteFileEvent(server.rgthreadvar[c->iel].el,c->fd,AE_READABLE);
+        aeDeleteFileEvent(server.rgthreadvar[c->iel].el,c->fd,AE_WRITABLE);
         close(c->fd);
         c->fd = -1;
     }
 
     /* Remove from the list of pending writes if needed. */
     if (c->flags & CLIENT_PENDING_WRITE) {
-        ln = listSearchKey(server.clients_pending_write,c);
+        ln = listSearchKey(server.rgthreadvar[c->iel].clients_pending_write,c);
         serverAssert(ln != NULL);
-        listDelNode(server.clients_pending_write,ln);
+        listDelNode(server.rgthreadvar[c->iel].clients_pending_write,ln);
         c->flags &= ~CLIENT_PENDING_WRITE;
     }
 
     /* When client was just unblocked because of a blocking operation,
      * remove it from the list of unblocked clients. */
     if (c->flags & CLIENT_UNBLOCKED) {
-        ln = listSearchKey(server.unblocked_clients,c);
+        ln = listSearchKey(server.rgthreadvar[c->iel].unblocked_clients,c);
         serverAssert(ln != NULL);
-        listDelNode(server.unblocked_clients,ln);
+        listDelNode(server.rgthreadvar[c->iel].unblocked_clients,ln);
         c->flags &= ~CLIENT_UNBLOCKED;
     }
+
+    if (c->fPendingAsyncWrite) {
+        ln = NULL;
+        bool fFound = false;
+        for (int iel = 0; iel < server.cthreads; ++iel)
+        {
+            ln = listSearchKey(server.rgthreadvar[iel].clients_pending_asyncwrite,c);
+            if (ln)
+            {
+                fFound = true;
+                listDelNode(server.rgthreadvar[iel].clients_pending_asyncwrite,ln);
+            }
+        }
+        serverAssert(fFound);
+        c->fPendingAsyncWrite = FALSE;
+    }
 }
 
 void freeClient(client *c) {
     listNode *ln;
+    serverAssert(aeThreadOwnsLock());
+    AssertCorrectThread(c);
+    std::unique_lock<decltype(c->lock)> ulock(c->lock);
 
     /* If a client is protected, yet we need to free it right now, make sure
      * to at least use asynchronous freeing. */
@@ -1045,10 +1331,13 @@ void freeClient(client *c) {
 
     /* Release other dynamically allocated client structure fields,
      * and finally release the client structure itself. */
+    zfree(c->bufAsync);
     if (c->name) decrRefCount(c->name);
     zfree(c->argv);
     freeClientMultiState(c);
     sdsfree(c->peerid);
+    ulock.unlock();
+    fastlock_free(&c->lock);
     zfree(c);
 }
 
@@ -1058,18 +1347,27 @@ void freeClient(client *c) {
  * should be valid for the continuation of the flow of the program. */
 void freeClientAsync(client *c) {
     if (c->flags & CLIENT_CLOSE_ASAP || c->flags & CLIENT_LUA) return;
-    c->flags |= CLIENT_CLOSE_ASAP;
+    AeLocker lock;
+    lock.arm(nullptr);
+    std::lock_guard<decltype(c->lock)> clientlock(c->lock);
+    c->flags |= CLIENT_CLOSE_ASAP;    
     listAddNodeTail(server.clients_to_close,c);
 }
 
-void freeClientsInAsyncFreeQueue(void) {
-    while (listLength(server.clients_to_close)) {
-        listNode *ln = listFirst(server.clients_to_close);
-        client *c = listNodeValue(ln);
+void freeClientsInAsyncFreeQueue(int iel) {
+    listIter li;
+    listNode *ln;
+    listRewind(server.clients_to_close,&li);
+
+    while((ln = listNext(&li))) {
+        client *c = (client*)listNodeValue(ln);
+        if (c->iel != iel)
+            continue;   // wrong thread
 
         c->flags &= ~CLIENT_CLOSE_ASAP;
         freeClient(c);
         listDelNode(server.clients_to_close,ln);
+        listRewind(server.clients_to_close,&li);
     }
 }
 
@@ -1078,7 +1376,7 @@ void freeClientsInAsyncFreeQueue(void) {
  * are not registered clients. */
 client *lookupClientByID(uint64_t id) {
     id = htonu64(id);
-    client *c = raxFind(server.clients_index,(unsigned char*)&id,sizeof(id));
+    client *c = (client*)raxFind(server.clients_index,(unsigned char*)&id,sizeof(id));
     return (c == raxNotFound) ? NULL : c;
 }
 
@@ -1086,12 +1384,15 @@ client *lookupClientByID(uint64_t id) {
  * is still valid after the call, C_ERR if it was freed. */
 int writeToClient(int fd, client *c, int handler_installed) {
     ssize_t nwritten = 0, totwritten = 0;
-    size_t objlen;
     clientReplyBlock *o;
+    AssertCorrectThread(c);
 
+    std::unique_lock<decltype(c->lock)> lock(c->lock);
+    /* c->lock is held from here on; it is released exactly once before the client is handed to freeClient()/freeClientAsync(). */
     while(clientHasPendingReplies(c)) {
         if (c->bufpos > 0) {
             nwritten = write(fd,c->buf+c->sentlen,c->bufpos-c->sentlen);
+
             if (nwritten <= 0) break;
             c->sentlen += nwritten;
             totwritten += nwritten;
@@ -1103,27 +1404,27 @@ int writeToClient(int fd, client *c, int handler_installed) {
                 c->sentlen = 0;
             }
         } else {
-            o = listNodeValue(listFirst(c->reply));
-            objlen = o->used;
-
-            if (objlen == 0) {
+            o = (clientReplyBlock*)listNodeValue(listFirst(c->reply));
+            if (o->used == 0) {
                 c->reply_bytes -= o->size;
                 listDelNode(c->reply,listFirst(c->reply));
                 continue;
             }
 
-            nwritten = write(fd, o->buf + c->sentlen, objlen - c->sentlen);
-            if (nwritten <= 0) break;
+            nwritten = write(fd, o->buf() + c->sentlen, o->used - c->sentlen);
+            if (nwritten <= 0)
+                break;
+                
             c->sentlen += nwritten;
             totwritten += nwritten;
-
+            
             /* If we fully sent the object on head go to the next one */
-            if (c->sentlen == objlen) {
+            if (c->sentlen == o->used) {
                 c->reply_bytes -= o->size;
                 listDelNode(c->reply,listFirst(c->reply));
                 c->sentlen = 0;
                 /* If there are no longer objects in the list, we expect
-                 * the count of reply bytes to be exactly zero. */
+                    * the count of reply bytes to be exactly zero. */
                 if (listLength(c->reply) == 0)
                     serverAssert(c->reply_bytes == 0);
             }
@@ -1145,14 +1446,26 @@ int writeToClient(int fd, client *c, int handler_installed) {
              zmalloc_used_memory() < server.maxmemory) &&
             !(c->flags & CLIENT_SLAVE)) break;
     }
-    server.stat_net_output_bytes += totwritten;
+    
+    __atomic_fetch_add(&server.stat_net_output_bytes, totwritten, __ATOMIC_RELAXED);
     if (nwritten == -1) {
         if (errno == EAGAIN) {
             nwritten = 0;
         } else {
             serverLog(LL_VERBOSE,
                 "Error writing to client: %s", strerror(errno));
-            freeClient(c);
+            lock.unlock();
+            if (aeTryAcquireLock())
+            {
+                freeClient(c);
+                aeReleaseLock();
+            }
+            else
+            {
+                /* c->lock was already released above; a second unique_lock::unlock() would throw std::system_error. */
+                freeClientAsync(c);
+            }
+            
             return C_ERR;
         }
     }
@@ -1165,11 +1478,21 @@ int writeToClient(int fd, client *c, int handler_installed) {
     }
     if (!clientHasPendingReplies(c)) {
         c->sentlen = 0;
-        if (handler_installed) aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
+        if (handler_installed) aeDeleteFileEvent(server.rgthreadvar[c->iel].el,c->fd,AE_WRITABLE);
 
         /* Close connection after entire reply has been sent. */
         if (c->flags & CLIENT_CLOSE_AFTER_REPLY) {
-            freeClient(c);
+            lock.unlock();
+            if (aeTryAcquireLock())
+            {
+                freeClient(c);
+                aeReleaseLock();
+            }
+            else
+            {
+                /* c->lock was already released above; a second unique_lock::unlock() would throw std::system_error. */
+                freeClientAsync(c);
+            }
             return C_ERR;
         }
     }
@@ -1178,37 +1501,99 @@ int writeToClient(int fd, client *c, int handler_installed) {
 
 /* Write event handler. Just send data to the client. */
 void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
-    UNUSED(el);
     UNUSED(mask);
-    writeToClient(fd,privdata,1);
+    client *c = (client*)privdata;
+
+    serverAssert(ielFromEventLoop(el) == c->iel);
+    writeToClient(fd,c,1);
+}
+
+void ProcessPendingAsyncWrites()
+{
+    serverAssert(aeThreadOwnsLock());
+    /* Drain serverTL->clients_pending_asyncwrite: move each client's staged bufAsync bytes into its reply list and request a write event. */
+    while(listLength(serverTL->clients_pending_asyncwrite)) {
+        client *c = (client*)listNodeValue(listFirst(serverTL->clients_pending_asyncwrite));
+        listDelNode(serverTL->clients_pending_asyncwrite, listFirst(serverTL->clients_pending_asyncwrite));
+        std::lock_guard<decltype(c->lock)> lock(c->lock);
+
+        serverAssert(c->fPendingAsyncWrite);
+
+        // TODO: Append to end of reply block?
+        /* Copy the staged bytes into a freshly allocated reply block and append it to c->reply. */
+        size_t size = c->bufposAsync;
+        clientReplyBlock *reply = (clientReplyBlock*)zmalloc(size + sizeof(clientReplyBlock), MALLOC_LOCAL);
+        /* take over the allocation's internal fragmentation */
+        reply->size = zmalloc_usable(reply) - sizeof(clientReplyBlock);
+        reply->used = c->bufposAsync;
+        memcpy(reply->buf(), c->bufAsync, c->bufposAsync);
+        listAddNodeTail(c->reply, reply);
+        c->reply_bytes += reply->size;
+        /* The staging buffer has been consumed: free it and clear the pending flag. */
+        c->bufposAsync = 0;
+        c->buflenAsync = 0;
+        zfree(c->bufAsync);
+        c->bufAsync = nullptr;
+        c->fPendingAsyncWrite = FALSE;
+
+        // Now install the write event handler
+        int ae_flags = AE_WRITABLE|AE_WRITE_THREADSAFE;
+        /* For the fsync=always policy, we want that a given FD is never
+            * served for reading and writing in the same event loop iteration,
+            * so that in the middle of receiving the query, and serving it
+            * to the client, we'll call beforeSleep() that will do the
+            * actual fsync of AOF to disk. AE_BARRIER ensures that. */
+        if (server.aof_state == AOF_ON &&
+            server.aof_fsync == AOF_FSYNC_ALWAYS)
+        {
+            ae_flags |= AE_BARRIER;
+        }
+        /* Only request the write event when replstate is NONE, or ONLINE with no put-online-on-ack pending; otherwise the data just stays queued in c->reply. */
+        if (!((c->replstate == REPL_STATE_NONE ||
+         (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))))
+            continue;
+
+        asyncCloseClientOnOutputBufferLimitReached(c);
+        if (aeCreateRemoteFileEvent(server.rgthreadvar[c->iel].el, c->fd, ae_flags, sendReplyToClient, c, FALSE) == AE_ERR)
+            continue;   // We can retry later in the cron
+    }
+}
 
 /* This function is called just before entering the event loop, in the hope
  * we can just write the replies to the client output buffer without any
  * need to use a syscall in order to install the writable event handler,
  * get it called, and so forth. */
-int handleClientsWithPendingWrites(void) {
+int handleClientsWithPendingWrites(int iel) {
     listIter li;
     listNode *ln;
-    int processed = listLength(server.clients_pending_write);
 
-    listRewind(server.clients_pending_write,&li);
+    list *list = server.rgthreadvar[iel].clients_pending_write;
+    int processed = listLength(list);
+    serverAssert(iel == (serverTL - server.rgthreadvar));
+    /* The assert above requires 'iel' to be the index of serverTL within server.rgthreadvar. */
+    listRewind(list,&li);
     while((ln = listNext(&li))) {
-        client *c = listNodeValue(ln);
+        client *c = (client*)listNodeValue(ln);
+        std::unique_lock<decltype(c->lock)> lock(c->lock);
+
         c->flags &= ~CLIENT_PENDING_WRITE;
-        listDelNode(server.clients_pending_write,ln);
+        listDelNode(list,ln);
+        AssertCorrectThread(c);
 
         /* If a client is protected, don't do anything,
          * that may trigger write error or recreate handler. */
         if (c->flags & CLIENT_PROTECTED) continue;
 
         /* Try to write buffers to the client socket. */
-        if (writeToClient(c->fd,c,0) == C_ERR) continue;
+        if (writeToClient(c->fd,c,0) == C_ERR) {
+            lock.release(); // NOTE(review): skips the unlock since the client may be freed, but writeToClient() also returns C_ERR after only freeClientAsync() -- confirm c->lock is not left held then
+            continue;
+        }
 
         /* If after the synchronous writes above we still have data to
          * output to the client, we need to install the writable handler. */
         if (clientHasPendingReplies(c)) {
-            int ae_flags = AE_WRITABLE;
+            int ae_flags = AE_WRITABLE|AE_WRITE_THREADSAFE;
             /* For the fsync=always policy, we want that a given FD is never
              * served for reading and writing in the same event loop iteration,
              * so that in the middle of receiving the query, and serving it
@@ -1219,13 +1604,16 @@ int handleClientsWithPendingWrites(void) {
             {
                 ae_flags |= AE_BARRIER;
             }
-            if (aeCreateFileEvent(server.el, c->fd, ae_flags,
-                sendReplyToClient, c) == AE_ERR)
-            {
-                    freeClientAsync(c);
-            }
+            
+            if (aeCreateFileEvent(server.rgthreadvar[c->iel].el, c->fd, ae_flags, sendReplyToClient, c) == AE_ERR)
+                freeClientAsync(c);
         }
     }
+    /* ProcessPendingAsyncWrites() asserts aeThreadOwnsLock(); the AeLocker below presumably acquires that lock for the rest of this scope. */
+    AeLocker locker;
+    locker.arm(nullptr);
+    ProcessPendingAsyncWrites();
+
     return processed;
 }
 
@@ -1268,15 +1656,17 @@ void resetClient(client *c) {
  *    path, it is not really released, but only marked for later release. */
 void protectClient(client *c) {
     c->flags |= CLIENT_PROTECTED;
-    aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
-    aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
+    AssertCorrectThread(c);
+    aeDeleteFileEvent(server.rgthreadvar[c->iel].el,c->fd,AE_READABLE);
+    aeDeleteFileEvent(server.rgthreadvar[c->iel].el,c->fd,AE_WRITABLE);
 }
 
 /* This will undo the client protection done by protectClient() */
 void unprotectClient(client *c) {
+    AssertCorrectThread(c);
     if (c->flags & CLIENT_PROTECTED) {
         c->flags &= ~CLIENT_PROTECTED;
-        aeCreateFileEvent(server.el,c->fd,AE_READABLE,readQueryFromClient,c);
+        aeCreateFileEvent(server.rgthreadvar[c->iel].el,c->fd,AE_READABLE|AE_READ_THREADSAFE,readQueryFromClient,c);
         if (clientHasPendingReplies(c)) clientInstallWriteHandler(c);
     }
 }
@@ -1333,7 +1723,7 @@ int processInlineBuffer(client *c) {
     /* Setup argv array on client structure */
     if (argc) {
         if (c->argv) zfree(c->argv);
-        c->argv = zmalloc(sizeof(robj*)*argc, MALLOC_LOCAL);
+        c->argv = (robj**)zmalloc(sizeof(robj*)*argc, MALLOC_LOCAL);
     }
 
     /* Create redis objects for all arguments. */
@@ -1431,7 +1821,7 @@ int processMultibulkBuffer(client *c) {
 
         /* Setup argv array on client structure */
         if (c->argv) zfree(c->argv);
-        c->argv = zmalloc(sizeof(robj*)*c->multibulklen, MALLOC_LOCAL);
+        c->argv = (robj**)zmalloc(sizeof(robj*)*c->multibulklen, MALLOC_LOCAL);
     }
 
     serverAssertWithInfo(c,NULL,c->multibulklen > 0);
@@ -1530,8 +1920,9 @@ int processMultibulkBuffer(client *c) {
  * or because a client was blocked and later reactivated, so there could be
  * pending query buffer, already representing a full command, to process. */
 void processInputBuffer(client *c) {
-    server.current_client = c;
-
+    AssertCorrectThread(c);
+    bool fFreed = false;
+    
     /* Keep processing while there is something in the input buffer */
     while(c->qb_pos < sdslen(c->querybuf)) {
         /* Return if clients are paused. */
@@ -1574,6 +1965,10 @@ void processInputBuffer(client *c) {
         if (c->argc == 0) {
             resetClient(c);
         } else {
+            AeLocker locker;
+            locker.arm(c);
+            server.current_client = c;
+
             /* Only reset the client when the command was executed. */
             if (processCommand(c) == C_OK) {
                 if (c->flags & CLIENT_MASTER && !(c->flags & CLIENT_MULTI)) {
@@ -1591,17 +1986,19 @@ void processInputBuffer(client *c) {
             /* freeMemoryIfNeeded may flush slave output buffers. This may
              * result into a slave, that may be the active client, to be
              * freed. */
-            if (server.current_client == NULL) break;
+            if (server.current_client == NULL) {
+                fFreed = true;
+                break;
+            }
+            server.current_client = NULL;
         }
     }
 
     /* Trim to pos */
-    if (server.current_client != NULL && c->qb_pos) {
+    if (!fFreed && c->qb_pos) {
         sdsrange(c->querybuf,c->qb_pos,-1);
         c->qb_pos = 0;
     }
-
-    server.current_client = NULL;
 }
 
 /* This is a wrapper for processInputBuffer that also cares about handling
@@ -1616,8 +2013,10 @@ void processInputBufferAndReplicate(client *c) {
         processInputBuffer(c);
         size_t applied = c->reploff - prev_offset;
         if (applied) {
+            aeAcquireLock();
             replicationFeedSlavesFromMasterStream(server.slaves,
                     c->pending_querybuf, applied);
+            aeReleaseLock();
             sdsrange(c->pending_querybuf,applied,-1);
         }
     }
@@ -1629,6 +2028,14 @@ void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
     size_t qblen;
     UNUSED(el);
     UNUSED(mask);
+    serverAssert(mask & AE_READ_THREADSAFE);
+    serverAssert(c->iel == ielFromEventLoop(el));
+    
+    AeLocker aelock;
+    AssertCorrectThread(c);
+    std::unique_lock<decltype(c->lock)> lock(c->lock, std::defer_lock);
+    if (!lock.try_lock())
+        return; // Process something else while we wait
 
     readlen = PROTO_IOBUF_LEN;
     /* If this is a multi bulk request, and we are processing a bulk reply
@@ -1650,17 +2057,23 @@ void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
     qblen = sdslen(c->querybuf);
     if (c->querybuf_peak < qblen) c->querybuf_peak = qblen;
     c->querybuf = sdsMakeRoomFor(c->querybuf, readlen);
+    
     nread = read(fd, c->querybuf+qblen, readlen);
+    
     if (nread == -1) {
         if (errno == EAGAIN) {
             return;
         } else {
             serverLog(LL_VERBOSE, "Reading from client: %s",strerror(errno));
+            lock.unlock();
+            aelock.arm(nullptr);
             freeClient(c);
             return;
         }
     } else if (nread == 0) {
         serverLog(LL_VERBOSE, "Client closed connection");
+        lock.unlock();
+        aelock.arm(nullptr);
         freeClient(c);
         return;
     } else if (c->flags & CLIENT_MASTER) {
@@ -1682,6 +2095,8 @@ void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
         serverLog(LL_WARNING,"Closing client that reached max query buffer length: %s (qbuf initial bytes: %s)", ci, bytes);
         sdsfree(ci);
         sdsfree(bytes);
+        lock.unlock();
+        aelock.arm(nullptr);
         freeClient(c);
         return;
     }
@@ -1693,6 +2108,8 @@ void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
      * corresponding part of the replication stream, will be propagated to
      * the sub-slaves and to the replication backlog. */
     processInputBufferAndReplicate(c);
+    aelock.arm(nullptr);
+    ProcessPendingAsyncWrites();
 }
 
 void getClientsMaxBuffers(unsigned long *longest_output_list,
@@ -1704,7 +2121,7 @@ void getClientsMaxBuffers(unsigned long *longest_output_list,
 
     listRewind(server.clients,&li);
     while ((ln = listNext(&li)) != NULL) {
-        c = listNodeValue(ln);
+        c = (client*)listNodeValue(ln);
 
         if (listLength(c->reply) > lol) lol = listLength(c->reply);
         if (sdslen(c->querybuf) > bib) bib = sdslen(c->querybuf);
@@ -1775,7 +2192,7 @@ sds catClientInfoString(sds s, client *client) {
     if (p == flags) *p++ = 'N';
     *p++ = '\0';
 
-    emask = client->fd == -1 ? 0 : aeGetFileEvents(server.el,client->fd);
+    emask = client->fd == -1 ? 0 : aeGetFileEvents(server.rgthreadvar[client->iel].el,client->fd);
     p = events;
     if (emask & AE_READABLE) *p++ = 'r';
     if (emask & AE_WRITABLE) *p++ = 'w';
@@ -1810,7 +2227,7 @@ sds getAllClientsInfoString(int type) {
     sdsclear(o);
     listRewind(server.clients,&li);
     while ((ln = listNext(&li)) != NULL) {
-        client = listNodeValue(ln);
+        client = reinterpret_cast<struct client*>(listNodeValue(ln));
         if (type != -1 && getClientType(client) != type) continue;
         o = catClientInfoString(o,client);
         o = sdscatlen(o,"\n",1);
@@ -1823,7 +2240,7 @@ void clientCommand(client *c) {
     listIter li;
     client *client;
 
-    if (c->argc == 2 && !strcasecmp(ptrFromObj(c->argv[1]),"help")) {
+    if (c->argc == 2 && !strcasecmp((const char*)ptrFromObj(c->argv[1]),"help")) {
         const char *help[] = {
 "id                     -- Return the ID of the current connection.",
 "getname                -- Return the name of the current connection.",
@@ -1841,14 +2258,14 @@ void clientCommand(client *c) {
 NULL
         };
         addReplyHelp(c, help);
-    } else if (!strcasecmp(ptrFromObj(c->argv[1]),"id") && c->argc == 2) {
+    } else if (!strcasecmp((const char*)ptrFromObj(c->argv[1]),"id") && c->argc == 2) {
         /* CLIENT ID */
         addReplyLongLong(c,c->id);
-    } else if (!strcasecmp(ptrFromObj(c->argv[1]),"list")) {
+    } else if (!strcasecmp((const char*)ptrFromObj(c->argv[1]),"list")) {
         /* CLIENT LIST */
         int type = -1;
-        if (c->argc == 4 && !strcasecmp(ptrFromObj(c->argv[2]),"type")) {
-            type = getClientTypeByName(ptrFromObj(c->argv[3]));
+        if (c->argc == 4 && !strcasecmp((const char*)ptrFromObj(c->argv[2]),"type")) {
+            type = getClientTypeByName((char*)ptrFromObj(c->argv[3]));
             if (type == -1) {
                 addReplyErrorFormat(c,"Unknown client type '%s'",
                     (char*) ptrFromObj(c->argv[3]));
@@ -1861,21 +2278,21 @@ NULL
         sds o = getAllClientsInfoString(type);
         addReplyBulkCBuffer(c,o,sdslen(o));
         sdsfree(o);
-    } else if (!strcasecmp(ptrFromObj(c->argv[1]),"reply") && c->argc == 3) {
+    } else if (!strcasecmp((const char*)ptrFromObj(c->argv[1]),"reply") && c->argc == 3) {
         /* CLIENT REPLY ON|OFF|SKIP */
-        if (!strcasecmp(ptrFromObj(c->argv[2]),"on")) {
+        if (!strcasecmp((const char*)ptrFromObj(c->argv[2]),"on")) {
             c->flags &= ~(CLIENT_REPLY_SKIP|CLIENT_REPLY_OFF);
             addReply(c,shared.ok);
-        } else if (!strcasecmp(ptrFromObj(c->argv[2]),"off")) {
+        } else if (!strcasecmp((const char*)ptrFromObj(c->argv[2]),"off")) {
             c->flags |= CLIENT_REPLY_OFF;
-        } else if (!strcasecmp(ptrFromObj(c->argv[2]),"skip")) {
+        } else if (!strcasecmp((const char*)ptrFromObj(c->argv[2]),"skip")) {
             if (!(c->flags & CLIENT_REPLY_OFF))
                 c->flags |= CLIENT_REPLY_SKIP_NEXT;
         } else {
             addReply(c,shared.syntaxerr);
             return;
         }
-    } else if (!strcasecmp(ptrFromObj(c->argv[1]),"kill")) {
+    } else if (!strcasecmp((const char*)ptrFromObj(c->argv[1]),"kill")) {
         /* CLIENT KILL <ip:port>
          * CLIENT KILL <option> [value] ... <option> [value] */
         char *addr = NULL;
@@ -1886,7 +2303,7 @@ NULL
 
         if (c->argc == 3) {
             /* Old style syntax: CLIENT KILL <addr> */
-            addr = ptrFromObj(c->argv[2]);
+            addr = (char*)ptrFromObj(c->argv[2]);
             skipme = 0; /* With the old form, you can kill yourself. */
         } else if (c->argc > 3) {
             int i = 2; /* Next option index. */
@@ -1895,25 +2312,25 @@ NULL
             while(i < c->argc) {
                 int moreargs = c->argc > i+1;
 
-                if (!strcasecmp(ptrFromObj(c->argv[i]),"id") && moreargs) {
+                if (!strcasecmp((const char*)ptrFromObj(c->argv[i]),"id") && moreargs) {
                     long long tmp;
 
                     if (getLongLongFromObjectOrReply(c,c->argv[i+1],&tmp,NULL)
                         != C_OK) return;
                     id = tmp;
-                } else if (!strcasecmp(ptrFromObj(c->argv[i]),"type") && moreargs) {
-                    type = getClientTypeByName(ptrFromObj(c->argv[i+1]));
+                } else if (!strcasecmp((const char*)ptrFromObj(c->argv[i]),"type") && moreargs) {
+                    type = getClientTypeByName((const char*)ptrFromObj(c->argv[i+1]));
                     if (type == -1) {
                         addReplyErrorFormat(c,"Unknown client type '%s'",
                             (char*) ptrFromObj(c->argv[i+1]));
                         return;
                     }
-                } else if (!strcasecmp(ptrFromObj(c->argv[i]),"addr") && moreargs) {
-                    addr = ptrFromObj(c->argv[i+1]);
-                } else if (!strcasecmp(ptrFromObj(c->argv[i]),"skipme") && moreargs) {
-                    if (!strcasecmp(ptrFromObj(c->argv[i+1]),"yes")) {
+                } else if (!strcasecmp((const char*)ptrFromObj(c->argv[i]),"addr") && moreargs) {
+                    addr = (char*)ptrFromObj(c->argv[i+1]);
+                } else if (!strcasecmp((const char*)ptrFromObj(c->argv[i]),"skipme") && moreargs) {
+                    if (!strcasecmp((const char*)ptrFromObj(c->argv[i+1]),"yes")) {
                         skipme = 1;
-                    } else if (!strcasecmp(ptrFromObj(c->argv[i+1]),"no")) {
+                    } else if (!strcasecmp((const char*)ptrFromObj(c->argv[i+1]),"no")) {
                         skipme = 0;
                     } else {
                         addReply(c,shared.syntaxerr);
@@ -1933,7 +2350,7 @@ NULL
         /* Iterate clients killing all the matching clients. */
         listRewind(server.clients,&li);
         while ((ln = listNext(&li)) != NULL) {
-            client = listNodeValue(ln);
+            client = (struct client*)listNodeValue(ln);
             if (addr && strcmp(getClientPeerId(client),addr) != 0) continue;
             if (type != -1 && getClientType(client) != type) continue;
             if (id != 0 && client->id != id) continue;
@@ -1943,7 +2360,7 @@ NULL
             if (c == client) {
                 close_this_client = 1;
             } else {
-                freeClient(client);
+                freeClientAsync(client);
             }
             killed++;
         }
@@ -1961,7 +2378,7 @@ NULL
         /* If this client has to be closed, flag it as CLOSE_AFTER_REPLY
          * only after we queued the reply to its output buffers. */
         if (close_this_client) c->flags |= CLIENT_CLOSE_AFTER_REPLY;
-    } else if (!strcasecmp(ptrFromObj(c->argv[1]),"unblock") && (c->argc == 3 ||
+    } else if (!strcasecmp((const char*)ptrFromObj(c->argv[1]),"unblock") && (c->argc == 3 ||
                                                           c->argc == 4))
     {
         /* CLIENT UNBLOCK <id> [timeout|error] */
@@ -1969,9 +2386,9 @@ NULL
         int unblock_error = 0;
 
         if (c->argc == 4) {
-            if (!strcasecmp(ptrFromObj(c->argv[3]),"timeout")) {
+            if (!strcasecmp((const char*)ptrFromObj(c->argv[3]),"timeout")) {
                 unblock_error = 0;
-            } else if (!strcasecmp(ptrFromObj(c->argv[3]),"error")) {
+            } else if (!strcasecmp((const char*)ptrFromObj(c->argv[3]),"error")) {
                 unblock_error = 1;
             } else {
                 addReplyError(c,
@@ -1993,9 +2410,9 @@ NULL
         } else {
             addReply(c,shared.czero);
         }
-    } else if (!strcasecmp(ptrFromObj(c->argv[1]),"setname") && c->argc == 3) {
-        int j, len = sdslen(ptrFromObj(c->argv[2]));
-        char *p = ptrFromObj(c->argv[2]);
+    } else if (!strcasecmp((const char*)ptrFromObj(c->argv[1]),"setname") && c->argc == 3) {
+        int j, len = sdslen((sds)ptrFromObj(c->argv[2]));
+        char *p = (char*)ptrFromObj(c->argv[2]);
 
         /* Setting the client name to an empty string actually removes
          * the current name. */
@@ -2021,12 +2438,12 @@ NULL
         c->name = c->argv[2];
         incrRefCount(c->name);
         addReply(c,shared.ok);
-    } else if (!strcasecmp(ptrFromObj(c->argv[1]),"getname") && c->argc == 2) {
+    } else if (!strcasecmp((const char*)ptrFromObj(c->argv[1]),"getname") && c->argc == 2) {
         if (c->name)
             addReplyBulk(c,c->name);
         else
             addReplyNull(c);
-    } else if (!strcasecmp(ptrFromObj(c->argv[1]),"pause") && c->argc == 3) {
+    } else if (!strcasecmp((const char*)ptrFromObj(c->argv[1]),"pause") && c->argc == 3) {
         long long duration;
 
         if (getTimeoutFromObjectOrReply(c,c->argv[2],&duration,UNIT_MILLISECONDS)
@@ -2123,7 +2540,7 @@ void rewriteClientCommandVector(client *c, int argc, ...) {
     int j;
     robj **argv; /* The new argument vector */
 
-    argv = zmalloc(sizeof(robj*)*argc, MALLOC_LOCAL);
+    argv = (robj**)zmalloc(sizeof(robj*)*argc, MALLOC_LOCAL);
     va_start(ap,argc);
     for (j = 0; j < argc; j++) {
         robj *a;
@@ -2140,7 +2557,7 @@ void rewriteClientCommandVector(client *c, int argc, ...) {
     /* Replace argv and argc with our new versions. */
     c->argv = argv;
     c->argc = argc;
-    c->cmd = lookupCommandOrOriginal(ptrFromObj(c->argv[0]));
+    c->cmd = lookupCommandOrOriginal((sds)ptrFromObj(c->argv[0]));
     serverAssertWithInfo(c,NULL,c->cmd != NULL);
     va_end(ap);
 }
@@ -2151,7 +2568,7 @@ void replaceClientCommandVector(client *c, int argc, robj **argv) {
     zfree(c->argv);
     c->argv = argv;
     c->argc = argc;
-    c->cmd = lookupCommandOrOriginal(ptrFromObj(c->argv[0]));
+    c->cmd = lookupCommandOrOriginal((sds)ptrFromObj(c->argv[0]));
     serverAssertWithInfo(c,NULL,c->cmd != NULL);
 }
 
@@ -2170,7 +2587,7 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) {
     robj *oldval;
 
     if (i >= c->argc) {
-        c->argv = zrealloc(c->argv,sizeof(robj*)*(i+1), MALLOC_LOCAL);
+        c->argv = (robj**)zrealloc(c->argv,sizeof(robj*)*(i+1), MALLOC_LOCAL);
         c->argc = i+1;
         c->argv[i] = NULL;
     }
@@ -2181,7 +2598,7 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) {
 
     /* If this is the command name make sure to fix c->cmd. */
     if (i == 0) {
-        c->cmd = lookupCommandOrOriginal(ptrFromObj(c->argv[0]));
+        c->cmd = lookupCommandOrOriginal((sds)ptrFromObj(c->argv[0]));
         serverAssertWithInfo(c,NULL,c->cmd != NULL);
     }
 }
@@ -2201,7 +2618,7 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) {
  * enforcing the client output length limits. */
 unsigned long getClientOutputBufferMemoryUsage(client *c) {
     unsigned long list_item_size = sizeof(listNode) + sizeof(clientReplyBlock);
-    return c->reply_bytes + (list_item_size*listLength(c->reply));
+    return c->reply_bytes + (list_item_size*listLength(c->reply)) + c->buflenAsync;
 }
 
 /* Get the class of a client, used in order to enforce limits to different
@@ -2221,7 +2638,7 @@ int getClientType(client *c) {
     return CLIENT_TYPE_NORMAL;
 }
 
-int getClientTypeByName(char *name) {
+int getClientTypeByName(const char *name) {
     if (!strcasecmp(name,"normal")) return CLIENT_TYPE_NORMAL;
     else if (!strcasecmp(name,"slave")) return CLIENT_TYPE_SLAVE;
     else if (!strcasecmp(name,"replica")) return CLIENT_TYPE_SLAVE;
@@ -2230,8 +2647,8 @@ int getClientTypeByName(char *name) {
     else return -1;
 }
 
-char *getClientTypeName(int class) {
-    switch(class) {
+const char *getClientTypeName(int clientType) {
+    switch(clientType) {
     case CLIENT_TYPE_NORMAL: return "normal";
     case CLIENT_TYPE_SLAVE:  return "slave";
     case CLIENT_TYPE_PUBSUB: return "pubsub";
@@ -2247,19 +2664,19 @@ char *getClientTypeName(int class) {
  * Return value: non-zero if the client reached the soft or the hard limit.
  *               Otherwise zero is returned. */
 int checkClientOutputBufferLimits(client *c) {
-    int soft = 0, hard = 0, class;
+    int soft = 0, hard = 0;
     unsigned long used_mem = getClientOutputBufferMemoryUsage(c);
 
-    class = getClientType(c);
+    int clientType = getClientType(c);
     /* For the purpose of output buffer limiting, masters are handled
      * like normal clients. */
-    if (class == CLIENT_TYPE_MASTER) class = CLIENT_TYPE_NORMAL;
+    if (clientType == CLIENT_TYPE_MASTER) clientType = CLIENT_TYPE_NORMAL;
 
-    if (server.client_obuf_limits[class].hard_limit_bytes &&
-        used_mem >= server.client_obuf_limits[class].hard_limit_bytes)
+    if (server.client_obuf_limits[clientType].hard_limit_bytes &&
+        used_mem >= server.client_obuf_limits[clientType].hard_limit_bytes)
         hard = 1;
-    if (server.client_obuf_limits[class].soft_limit_bytes &&
-        used_mem >= server.client_obuf_limits[class].soft_limit_bytes)
+    if (server.client_obuf_limits[clientType].soft_limit_bytes &&
+        used_mem >= server.client_obuf_limits[clientType].soft_limit_bytes)
         soft = 1;
 
     /* We need to check if the soft limit is reached continuously for the
@@ -2272,7 +2689,7 @@ int checkClientOutputBufferLimits(client *c) {
             time_t elapsed = server.unixtime - c->obuf_soft_limit_reached_time;
 
             if (elapsed <=
-                server.client_obuf_limits[class].soft_limit_seconds) {
+                server.client_obuf_limits[clientType].soft_limit_seconds) {
                 soft = 0; /* The client still did not reached the max number of
                              seconds for the soft limit to be considered
                              reached. */
@@ -2309,21 +2726,25 @@ void asyncCloseClientOnOutputBufferLimitReached(client *c) {
  * This is also called by SHUTDOWN for a best-effort attempt to send
  * slaves the latest writes. */
 void flushSlavesOutputBuffers(void) {
+    serverAssert(aeThreadOwnsLock());
     listIter li;
     listNode *ln;
 
     listRewind(server.slaves,&li);
     while((ln = listNext(&li))) {
-        client *slave = listNodeValue(ln);
+        client *slave = (client*)listNodeValue(ln);
         int events;
 
+        if (!FCorrectThread(slave))
+            continue;   // we cannot synchronously flush other thread's clients
+
         /* Note that the following will not flush output buffers of slaves
          * in STATE_ONLINE but having put_online_on_ack set to true: in this
          * case the writable event is never installed, since the purpose
          * of put_online_on_ack is to postpone the moment it is installed.
          * This is what we want since slaves in this state should not receive
          * writes before the first ACK. */
-        events = aeGetFileEvents(server.el,slave->fd);
+        events = aeGetFileEvents(server.rgthreadvar[slave->iel].el,slave->fd);
         if (events & AE_WRITABLE &&
             slave->replstate == SLAVE_STATE_ONLINE &&
             clientHasPendingReplies(slave))
@@ -2372,7 +2793,7 @@ int clientsArePaused(void) {
          * force the re-processing of the input buffer if any. */
         listRewind(server.clients,&li);
         while ((ln = listNext(&li)) != NULL) {
-            c = listNodeValue(ln);
+            c = (client*)listNodeValue(ln);
 
             /* Don't touch slaves and blocked clients.
              * The latter pending requests will be processed when unblocked. */
@@ -2395,15 +2816,18 @@ int clientsArePaused(void) {
  * write, close sequence needed to serve a client.
  *
  * The function returns the total number of events processed. */
-int processEventsWhileBlocked(void) {
+int processEventsWhileBlocked(int iel) {
     int iterations = 4; /* See the function top-comment. */
     int count = 0;
+
+    aeReleaseLock();
     while (iterations--) {
         int events = 0;
-        events += aeProcessEvents(server.el, AE_FILE_EVENTS|AE_DONT_WAIT);
-        events += handleClientsWithPendingWrites();
+        events += aeProcessEvents(server.rgthreadvar[iel].el, AE_FILE_EVENTS|AE_DONT_WAIT);
+        events += handleClientsWithPendingWrites(iel);
         if (!events) break;
         count += events;
     }
+    aeAcquireLock();
     return count;
 }
diff --git a/src/object.c b/src/object.c
index 14c43fe1a..15eaf0508 100644
--- a/src/object.c
+++ b/src/object.c
@@ -82,7 +82,10 @@ robj *createRawStringObject(const char *ptr, size_t len) {
  * an object where the sds string is actually an unmodifiable string
  * allocated in the same chunk as the object itself. */
 robj *createEmbeddedStringObject(const char *ptr, size_t len) {
-    robj *o = zmalloc(sizeof(robj)+sizeof(struct sdshdr8)+len+1-sizeof(o->m_ptr), MALLOC_SHARED);
+    size_t allocsize = sizeof(struct sdshdr8)+len+1;
+    if (allocsize < sizeof(void*))
+        allocsize = sizeof(void*);
+    robj *o = zmalloc(sizeof(robj)+allocsize-sizeof(o->m_ptr), MALLOC_SHARED);
     struct sdshdr8 *sh = (void*)(&o->m_ptr);
 
     o->type = OBJ_STRING;
@@ -394,7 +397,7 @@ robj *resetRefCount(robj *obj) {
 
 int checkType(client *c, robj *o, int type) {
     if (o->type != type) {
-        addReply(c,shared.wrongtypeerr);
+        addReplyAsync(c,shared.wrongtypeerr);
         return 1;
     }
     return 0;
@@ -940,6 +943,7 @@ void freeMemoryOverheadData(struct redisMemOverhead *mh) {
  * information used for the MEMORY OVERHEAD and INFO command. The returned
  * structure pointer should be freed calling freeMemoryOverheadData(). */
 struct redisMemOverhead *getMemoryOverheadData(void) {
+    serverAssert(aeThreadOwnsLock());
     int j;
     size_t mem_total = 0;
     size_t mem = 0;
@@ -982,6 +986,8 @@ struct redisMemOverhead *getMemoryOverheadData(void) {
         listRewind(server.slaves,&li);
         while((ln = listNext(&li))) {
             client *c = listNodeValue(ln);
+            if (c->flags & CLIENT_CLOSE_ASAP)
+                continue;
             mem += getClientOutputBufferMemoryUsage(c);
             mem += sdsAllocSize(c->querybuf);
             mem += sizeof(client);
@@ -1077,6 +1083,7 @@ void inputCatSds(void *result, const char *str) {
 /* This implements MEMORY DOCTOR. An human readable analysis of the Redis
  * memory condition. */
 sds getMemoryDoctorReport(void) {
+    serverAssert(aeThreadOwnsLock());
     int empty = 0;          /* Instance is empty or almost empty. */
     int big_peak = 0;       /* Memory peak is much larger than used mem. */
     int high_frag = 0;      /* High fragmentation. */
diff --git a/src/pubsub.c b/src/pubsub.c
index c2c3c9a82..af064d06a 100644
--- a/src/pubsub.c
+++ b/src/pubsub.c
@@ -38,12 +38,12 @@ int clientSubscriptionsCount(client *c);
 /* Send a pubsub message of type "message" to the client. */
 void addReplyPubsubMessage(client *c, robj *channel, robj *msg) {
     if (c->resp == 2)
-        addReply(c,shared.mbulkhdr[3]);
+        addReplyAsync(c,shared.mbulkhdr[3]);
     else
-        addReplyPushLen(c,3);
-    addReply(c,shared.messagebulk);
-    addReplyBulk(c,channel);
-    addReplyBulk(c,msg);
+        addReplyPushLenAsync(c,3);
+    addReplyAsync(c,shared.messagebulk);
+    addReplyBulkAsync(c,channel);
+    addReplyBulkAsync(c,msg);
 }
 
 /* Send a pubsub message of type "pmessage" to the client. The difference
@@ -51,13 +51,13 @@ void addReplyPubsubMessage(client *c, robj *channel, robj *msg) {
  * this message format also includes the pattern that matched the message. */
 void addReplyPubsubPatMessage(client *c, robj *pat, robj *channel, robj *msg) {
     if (c->resp == 2)
-        addReply(c,shared.mbulkhdr[4]);
+        addReplyAsync(c,shared.mbulkhdr[4]);
     else
-        addReplyPushLen(c,4);
-    addReply(c,shared.pmessagebulk);
-    addReplyBulk(c,pat);
-    addReplyBulk(c,channel);
-    addReplyBulk(c,msg);
+        addReplyPushLenAsync(c,4);
+    addReplyAsync(c,shared.pmessagebulk);
+    addReplyBulkAsync(c,pat);
+    addReplyBulkAsync(c,channel);
+    addReplyBulkAsync(c,msg);
 }
 
 /* Send the pubsub subscription notification to the client. */
@@ -293,7 +293,9 @@ int pubsubPublishMessage(robj *channel, robj *message) {
         listRewind(list,&li);
         while ((ln = listNext(&li)) != NULL) {
             client *c = ln->value;
+            fastlock_lock(&c->lock);
             addReplyPubsubMessage(c,channel,message);
+            fastlock_unlock(&c->lock);
             receivers++;
         }
     }
@@ -309,8 +311,10 @@ int pubsubPublishMessage(robj *channel, robj *message) {
                                 (char*)ptrFromObj(channel),
                                 sdslen(ptrFromObj(channel)),0))
             {
+                fastlock_lock(&pat->pclient->lock);
                 addReplyPubsubPatMessage(pat->pclient,
                     pat->pattern,channel,message);
+                fastlock_unlock(&pat->pclient->lock);
                 receivers++;
             }
         }
@@ -325,6 +329,7 @@ int pubsubPublishMessage(robj *channel, robj *message) {
 
 void subscribeCommand(client *c) {
     int j;
+    serverAssert(aeThreadOwnsLock());
 
     for (j = 1; j < c->argc; j++)
         pubsubSubscribeChannel(c,c->argv[j]);
@@ -345,6 +350,7 @@ void unsubscribeCommand(client *c) {
 
 void psubscribeCommand(client *c) {
     int j;
+    serverAssert(aeThreadOwnsLock());
 
     for (j = 1; j < c->argc; j++)
         pubsubSubscribePattern(c,c->argv[j]);
diff --git a/src/quicklist.h b/src/quicklist.h
index 165da0877..2e9ef079f 100644
--- a/src/quicklist.h
+++ b/src/quicklist.h
@@ -67,7 +67,9 @@ typedef struct quicklistNode {
  * When quicklistNode->zl is compressed, node->zl points to a quicklistLZF */
 typedef struct quicklistLZF {
     unsigned int sz; /* LZF size in bytes*/
-    char compressed[ZERO_LENGTH_ARRAY_LENGTH];
+#ifndef __cplusplus
+    char compressed[];
+#endif
 } quicklistLZF;
 
 /* quicklist is a 40 byte struct (on 64-bit systems) describing a quicklist.
diff --git a/src/rand.h b/src/rand.h
index 1dce3e8b0..c6a9ae454 100644
--- a/src/rand.h
+++ b/src/rand.h
@@ -30,9 +30,17 @@
 #ifndef REDIS_RANDOM_H
 #define REDIS_RANDOM_H
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 int32_t redisLrand48();
 void redisSrand48(int32_t seedval);
 
+#ifdef __cplusplus
+}
+#endif
+
 #define REDIS_LRAND48_MAX INT32_MAX
 
 #endif
diff --git a/src/rax.h b/src/rax.h
index 737f1cbb1..aae282ba0 100644
--- a/src/rax.h
+++ b/src/rax.h
@@ -39,6 +39,10 @@
 
 #include <stdint.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* Representation of a radix tree as implemented in this file, that contains
  * the strings "foo", "foobar" and "footer" after the insertion of each
  * word. When the node represents a key inside the radix tree, we write it
@@ -133,7 +137,9 @@ typedef struct raxNode {
      * children, an additional value pointer is present (as you can see
      * in the representation above as "value-ptr" field).
      */
-    unsigned char data[ZERO_LENGTH_ARRAY_LENGTH];
+#ifndef __cplusplus
+    unsigned char data[];
+#endif
 } raxNode;
 
 typedef struct rax {
@@ -219,4 +225,8 @@ void raxSetDebugMsg(int onoff);
  * in a low level way, so this function is exported as well. */
 void raxSetData(raxNode *n, void *data);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/rdb-s3.cpp b/src/rdb-s3.cpp
index d6bf34ae0..bd00bb2bd 100644
--- a/src/rdb-s3.cpp
+++ b/src/rdb-s3.cpp
@@ -1,7 +1,7 @@
 extern "C" {
 #include "rio.h"
-#include "server.h"
 }
+#include "server.h"
 #include <unistd.h>
 #include <sys/wait.h>
 
diff --git a/src/rdb.c b/src/rdb.c
index 849751d32..9940a0d52 100644
--- a/src/rdb.c
+++ b/src/rdb.c
@@ -1862,7 +1862,7 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) {
         if (server.masterhost && server.repl_state == REPL_STATE_TRANSFER)
             replicationSendNewlineToMaster();
         loadingProgress(r->processed_bytes);
-        processEventsWhileBlocked();
+        processEventsWhileBlocked(serverTL - server.rgthreadvar);
     }
 }
 
@@ -2140,6 +2140,7 @@ void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal) {
  * This function covers the case of RDB -> Salves socket transfers for
  * diskless replication. */
 void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) {
+    serverAssert(aeThreadOwnsLock());
     uint64_t *ok_slaves;
 
     if (!bysignal && exitcode == 0) {
@@ -2259,6 +2260,7 @@ void killRDBChild(void) {
 /* Spawn an RDB child that writes the RDB to the sockets of the slaves
  * that are currently in SLAVE_STATE_WAIT_BGSAVE_START state. */
 int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) {
+    serverAssert(aeThreadOwnsLock());
     int *fds;
     uint64_t *clientids;
     int numfds;
diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c
index ae61936da..15d558bb8 100644
--- a/src/redis-benchmark.c
+++ b/src/redis-benchmark.c
@@ -39,6 +39,7 @@
 #include <sys/time.h>
 #include <signal.h>
 #include <assert.h>
+#include <math.h>
 
 #include <sds.h> /* Use hiredis sds. */
 #include "ae.h"
@@ -49,6 +50,7 @@
 
 #define UNUSED(V) ((void) V)
 #define RANDPTR_INITIAL_SIZE 8
+#define MAX_LATENCY_PRECISION 3
 
 static struct config {
     aeEventLoop *el;
@@ -80,6 +82,7 @@ static struct config {
     sds dbnumstr;
     char *tests;
     char *auth;
+    int precision;
 } config;
 
 typedef struct _client {
@@ -429,8 +432,19 @@ static int compareLatency(const void *a, const void *b) {
     return (*(long long*)a)-(*(long long*)b);
 }
 
+static int ipow(int base, int exp) { /* Integer power base^exp by repeated squaring; intended for exp >= 0, no overflow check. */
+    int result = 1;
+    while (exp) {
+        if (exp & 1) result *= base; /* low bit set: fold the current power of base into the result */
+        exp /= 2;
+        base *= base; /* square the base for the next bit of exp */
+    }
+    return result;
+}
+
 static void showLatencyReport(void) {
     int i, curlat = 0;
+    int usbetweenlat = ipow(10, MAX_LATENCY_PRECISION-config.precision);
     float perc, reqpersec;
 
     reqpersec = (float)config.requests_finished/((float)config.totlatency/1000);
@@ -445,10 +459,21 @@ static void showLatencyReport(void) {
 
         qsort(config.latency,config.requests,sizeof(long long),compareLatency);
         for (i = 0; i < config.requests; i++) {
-            if (config.latency[i]/1000 != curlat || i == (config.requests-1)) {
-                curlat = config.latency[i]/1000;
+            if (config.latency[i]/usbetweenlat != curlat ||
+                i == (config.requests-1))
+            {
+                curlat = config.latency[i]/usbetweenlat;
                 perc = ((float)(i+1)*100)/config.requests;
-                printf("%.2f%% <= %d milliseconds\n", perc, curlat);
+                printf("%.2f%% <= %.*f milliseconds\n", perc, config.precision,
+                    curlat/pow(10.0, config.precision));
+
+                /* Past 2 milliseconds, splitting percentages by decimals only adds
+                 * noise, so drop to whole ms (this overwrites config.precision). */
+                if (config.latency[i] > 2000) {
+                    config.precision = 0;
+                    usbetweenlat = ipow(10,
+                        MAX_LATENCY_PRECISION-config.precision);
+                }
             }
         }
         printf("%.2f requests per second\n\n", reqpersec);
@@ -547,6 +572,11 @@ int parseOptions(int argc, const char **argv) {
             if (lastarg) goto invalid;
             config.dbnum = atoi(argv[++i]);
             config.dbnumstr = sdsfromlonglong(config.dbnum);
+        } else if (!strcmp(argv[i],"--precision")) {
+            if (lastarg) goto invalid;
+            config.precision = atoi(argv[++i]);
+            if (config.precision < 0) config.precision = 0;
+            if (config.precision > MAX_LATENCY_PRECISION) config.precision = MAX_LATENCY_PRECISION;
         } else if (!strcmp(argv[i],"--help")) {
             exit_status = 0;
             goto usage;
@@ -586,6 +616,7 @@ usage:
 " -e                 If server replies with errors, show them on stdout.\n"
 "                    (no more than 1 error per second is displayed)\n"
 " -q                 Quiet. Just show query/sec values\n"
+" --precision        Number of decimal places to display in latency output (default 1)\n"
 " --csv              Output in CSV format\n"
 " -l                 Loop. Run the tests forever\n"
 " -t <tests>         Only run the comma separated list of tests. The test\n"
@@ -682,6 +713,7 @@ int main(int argc, const char **argv) {
     config.tests = NULL;
     config.dbnum = 0;
     config.auth = NULL;
+    config.precision = 1;
 
     i = parseOptions(argc,argv);
     argc -= i;
diff --git a/src/redis-cli.c b/src/redis-cli.c
index 03ead55f2..3667a1919 100644
--- a/src/redis-cli.c
+++ b/src/redis-cli.c
@@ -211,6 +211,8 @@ static struct config {
     char *pattern;
     char *rdb_filename;
     int bigkeys;
+    int memkeys;
+    unsigned memkeys_samples;
     int hotkeys;
     int stdinarg; /* get last arg from stdin. (-x option) */
     char *auth;
@@ -1337,6 +1339,12 @@ static int parseOptions(int argc, char **argv) {
             config.pipe_timeout = atoi(argv[++i]);
         } else if (!strcmp(argv[i],"--bigkeys")) {
             config.bigkeys = 1;
+        } else if (!strcmp(argv[i],"--memkeys")) {
+            config.memkeys = 1;
+            config.memkeys_samples = 0; /* use redis default */
+        } else if (!strcmp(argv[i],"--memkeys-samples") && !lastarg) {
+            config.memkeys = 1;
+            config.memkeys_samples = atoi(argv[++i]); /* value required: !lastarg keeps argv[++i] in bounds */
         } else if (!strcmp(argv[i],"--hotkeys")) {
             config.hotkeys = 1;
         } else if (!strcmp(argv[i],"--eval") && !lastarg) {
@@ -1535,7 +1543,10 @@ static void usage(void) {
 "  --pipe-timeout <n> In --pipe mode, abort with error if after sending all data.\n"
 "                     no reply is received within <n> seconds.\n"
 "                     Default timeout: %d. Use 0 to wait forever.\n"
-"  --bigkeys          Sample Redis keys looking for big keys.\n"
+"  --bigkeys          Sample Redis keys looking for keys with many elements (complexity).\n"
+"  --memkeys          Sample Redis keys looking for keys consuming a lot of memory.\n"
+"  --memkeys-samples <n> Sample Redis keys looking for keys consuming a lot of memory.\n"
+"                     And define number of key elements to sample\n"
 "  --hotkeys          Sample Redis keys looking for hot keys.\n"
 "                     only works when maxmemory-policy is *lfu.\n"
 "  --scan             List all keys using the SCAN command.\n"
@@ -4919,6 +4930,12 @@ static int clusterManagerCommandCreate(int argc, char **argv) {
         cursor += slots_per_node;
     }
 
+    /* Rotating the list sometimes helps to get better initial
+     * anti-affinity before the optimizer runs. */
+    clusterManagerNode *first_node = interleaved[0];
+    for (i = 0; i < (interleaved_len - 1); i++)
+        interleaved[i] = interleaved[i + 1];
+    interleaved[interleaved_len - 1] = first_node;
     int assign_unused = 0, available_count = interleaved_len;
 assign_replicas:
     for (i = 0; i < masters_count; i++) {
@@ -6142,9 +6159,31 @@ static void latencyDistMode(void) {
  * Slave mode
  *--------------------------------------------------------------------------- */
 
+#define RDB_EOF_MARK_SIZE 40
+
+void sendReplconf(const char* arg1, const char* arg2) {
+    printf("sending REPLCONF %s %s\n", arg1, arg2);
+    redisReply *reply = redisCommand(context, "REPLCONF %s %s", arg1, arg2);
+
+    /* Handle any error conditions */
+    if(reply == NULL) {
+        fprintf(stderr, "\nI/O error\n");
+        exit(1);
+    } else if(reply->type == REDIS_REPLY_ERROR) {
+        fprintf(stderr, "REPLCONF %s error: %s\n", arg1, reply->str);
+        /* non fatal, old versions may not support it */
+    }
+    freeReplyObject(reply);
+}
+
+void sendCapa(void) {
+    sendReplconf("capa", "eof"); /* issues: REPLCONF capa eof */
+}
+
 /* Sends SYNC and reads the number of bytes in the payload. Used both by
- * slaveMode() and getRDB(). */
-unsigned long long sendSync(int fd) {
+ * slaveMode() and getRDB().
+ * Returns 0 when an EOF marker is used; the marker is copied to out_eof. */
+unsigned long long sendSync(int fd, char *out_eof) {
     /* To start we need to send the SYNC command and return the payload.
      * The hiredis client lib does not understand this part of the protocol
      * and we don't want to mess with its buffers, so everything is performed
@@ -6174,17 +6213,33 @@ unsigned long long sendSync(int fd) {
         printf("SYNC with master failed: %s\n", buf);
         exit(1);
     }
+    if (strncmp(buf+1,"EOF:",4) == 0 && strlen(buf+5) >= RDB_EOF_MARK_SIZE) {
+        memcpy(out_eof, buf+5, RDB_EOF_MARK_SIZE);
+        return 0;
+    }
     return strtoull(buf+1,NULL,10);
 }
 
 static void slaveMode(void) {
     int fd = context->fd;
-    unsigned long long payload = sendSync(fd);
+    static char eofmark[RDB_EOF_MARK_SIZE];
+    static char lastbytes[RDB_EOF_MARK_SIZE];
+    static int usemark = 0;
+    unsigned long long payload = sendSync(fd, eofmark);
     char buf[1024];
     int original_output = config.output;
 
-    fprintf(stderr,"SYNC with master, discarding %llu "
-                   "bytes of bulk transfer...\n", payload);
+    if (payload == 0) {
+        payload = ULLONG_MAX;
+        memset(lastbytes,0,RDB_EOF_MARK_SIZE);
+        usemark = 1;
+        fprintf(stderr,"SYNC with master, discarding "
+                       "bytes of bulk transfer until EOF marker...\n");
+    } else {
+        fprintf(stderr,"SYNC with master, discarding %llu "
+                       "bytes of bulk transfer...\n", payload);
+    }
+
 
     /* Discard the payload. */
     while(payload) {
@@ -6196,8 +6251,29 @@ static void slaveMode(void) {
             exit(1);
         }
         payload -= nread;
+
+        if (usemark) {
+            /* Update the last bytes array, and check if it matches our delimiter.*/
+            if (nread >= RDB_EOF_MARK_SIZE) {
+                memcpy(lastbytes,buf+nread-RDB_EOF_MARK_SIZE,RDB_EOF_MARK_SIZE);
+            } else {
+                int rem = RDB_EOF_MARK_SIZE-nread;
+                memmove(lastbytes,lastbytes+nread,rem);
+                memcpy(lastbytes+rem,buf,nread);
+            }
+            if (memcmp(lastbytes,eofmark,RDB_EOF_MARK_SIZE) == 0)
+                break;
+        }
     }
-    fprintf(stderr,"SYNC done. Logging commands from master.\n");
+
+    if (usemark) {
+        unsigned long long offset = ULLONG_MAX - payload;
+        fprintf(stderr,"SYNC done after %llu bytes. Logging commands from master.\n", offset);
+        /* put the slave online */
+        sleep(1);
+        sendReplconf("ACK", "0");
+    } else
+        fprintf(stderr,"SYNC done. Logging commands from master.\n");
 
     /* Now we can use hiredis to read the incoming protocol. */
     config.output = OUTPUT_CSV;
@@ -6214,11 +6290,22 @@ static void slaveMode(void) {
 static void getRDB(void) {
     int s = context->fd;
     int fd;
-    unsigned long long payload = sendSync(s);
+    static char eofmark[RDB_EOF_MARK_SIZE];
+    static char lastbytes[RDB_EOF_MARK_SIZE];
+    static int usemark = 0;
+    unsigned long long payload = sendSync(s, eofmark);
     char buf[4096];
 
-    fprintf(stderr,"SYNC sent to master, writing %llu bytes to '%s'\n",
-        payload, config.rdb_filename);
+    if (payload == 0) {
+        payload = ULLONG_MAX;
+        memset(lastbytes,0,RDB_EOF_MARK_SIZE);
+        usemark = 1;
+        fprintf(stderr,"SYNC sent to master, writing bytes of bulk transfer until EOF marker to '%s'\n",
+            config.rdb_filename);
+    } else {
+        fprintf(stderr,"SYNC sent to master, writing %llu bytes to '%s'\n",
+            payload, config.rdb_filename);
+    }
 
     /* Write to file. */
     if (!strcmp(config.rdb_filename,"-")) {
@@ -6247,11 +6334,31 @@ static void getRDB(void) {
             exit(1);
         }
         payload -= nread;
+
+        if (usemark) {
+            /* Update the last bytes array, and check if it matches our delimiter.*/
+            if (nread >= RDB_EOF_MARK_SIZE) {
+                memcpy(lastbytes,buf+nread-RDB_EOF_MARK_SIZE,RDB_EOF_MARK_SIZE);
+            } else {
+                int rem = RDB_EOF_MARK_SIZE-nread;
+                memmove(lastbytes,lastbytes+nread,rem);
+                memcpy(lastbytes+rem,buf,nread);
+            }
+            if (memcmp(lastbytes,eofmark,RDB_EOF_MARK_SIZE) == 0)
+                break;
+        }
+    }
+    if (usemark) {
+        payload = ULLONG_MAX - payload - RDB_EOF_MARK_SIZE;
+        if (ftruncate(fd, payload) == -1)
+            fprintf(stderr,"ftruncate failed: %s.\n", strerror(errno));
+        fprintf(stderr,"Transfer finished with success after %llu bytes\n", payload);
+    } else {
+        fprintf(stderr,"Transfer finished with success.\n");
     }
     close(s); /* Close the file descriptor ASAP as fsync() may take time. */
     fsync(fd);
     close(fd);
-    fprintf(stderr,"Transfer finished with success.\n");
     exit(0);
 }
 
@@ -6419,15 +6526,6 @@ static void pipeMode(void) {
  * Find big keys
  *--------------------------------------------------------------------------- */
 
-#define TYPE_STRING 0
-#define TYPE_LIST   1
-#define TYPE_SET    2
-#define TYPE_HASH   3
-#define TYPE_ZSET   4
-#define TYPE_STREAM 5
-#define TYPE_NONE   6
-#define TYPE_COUNT  7
-
 static redisReply *sendScan(unsigned long long *it) {
     redisReply *reply = redisCommand(context, "SCAN %llu", *it);
 
@@ -6474,28 +6572,51 @@ static int getDbSize(void) {
     return size;
 }
 
-static int toIntType(char *key, char *type) {
-    if(!strcmp(type, "string")) {
-        return TYPE_STRING;
-    } else if(!strcmp(type, "list")) {
-        return TYPE_LIST;
-    } else if(!strcmp(type, "set")) {
-        return TYPE_SET;
-    } else if(!strcmp(type, "hash")) {
-        return TYPE_HASH;
-    } else if(!strcmp(type, "zset")) {
-        return TYPE_ZSET;
-    } else if(!strcmp(type, "stream")) {
-        return TYPE_STREAM;
-    } else if(!strcmp(type, "none")) {
-        return TYPE_NONE;
-    } else {
-        fprintf(stderr, "Unknown type '%s' for key '%s'\n", type, key);
-        exit(1);
-    }
+typedef struct {
+    char *name;
+    char *sizecmd;
+    char *sizeunit;
+    unsigned long long biggest;
+    unsigned long long count;
+    unsigned long long totalsize;
+    sds biggest_key;
+} typeinfo;
+
+typeinfo type_string = { "string", "STRLEN", "bytes" };
+typeinfo type_list = { "list", "LLEN", "items" };
+typeinfo type_set = { "set", "SCARD", "members" };
+typeinfo type_hash = { "hash", "HLEN", "fields" };
+typeinfo type_zset = { "zset", "ZCARD", "members" };
+typeinfo type_stream = { "stream", "XLEN", "entries" };
+typeinfo type_other = { "other", NULL, "?" };
+
+static typeinfo* typeinfo_add(dict *types, char* name, typeinfo* type_template) { /* Register a copy of type_template under name; returns the new entry. */
+    typeinfo *info = zmalloc(sizeof(typeinfo), MALLOC_LOCAL);
+    *info = *type_template; /* start from the template's fields */
+    info->name = sdsnew(name); /* own sds copy of the name, replacing the template's pointer */
+    dictAdd(types, info->name, info); /* the same sds serves as the dict key */
+    return info;
 }
 
-static void getKeyTypes(redisReply *keys, int *types) {
+void type_free(void* priv_data, void* val) { /* Value destructor used by typeinfoDictType: frees one typeinfo. */
+    typeinfo *info = val;
+    UNUSED(priv_data);
+    if (info->biggest_key) /* only set once a biggest key was recorded */
+        sdsfree(info->biggest_key);
+    sdsfree(info->name); /* also the dict key; the dict type has no key destructor */
+    zfree(info);
+}
+
+static dictType typeinfoDictType = {
+    dictSdsHash,               /* hash function */
+    NULL,                      /* key dup */
+    NULL,                      /* val dup */
+    dictSdsKeyCompare,         /* key compare */
+    NULL,                      /* key destructor (owned by the value)*/
+    type_free                  /* val destructor */
+};
+
+static void getKeyTypes(dict *types_dict, redisReply *keys, typeinfo **types) {
     redisReply *reply;
     unsigned int i;
 
@@ -6521,32 +6642,47 @@ static void getKeyTypes(redisReply *keys, int *types) {
             exit(1);
         }
 
-        types[i] = toIntType(keys->element[i]->str, reply->str);
+        sds typereply = sdsnew(reply->str);
+        dictEntry *de = dictFind(types_dict, typereply);
+        sdsfree(typereply);
+        typeinfo *type = NULL;
+        if (de)
+            type = dictGetVal(de);
+        else if (strcmp(reply->str, "none")) /* create new types for modules, (but not for deleted keys) */
+            type = typeinfo_add(types_dict, reply->str, &type_other);
+        types[i] = type;
         freeReplyObject(reply);
     }
 }
 
-static void getKeySizes(redisReply *keys, int *types,
-                        unsigned long long *sizes)
+static void getKeySizes(redisReply *keys, typeinfo **types,
+                        unsigned long long *sizes, int memkeys,
+                        unsigned memkeys_samples)
 {
     redisReply *reply;
-    char *sizecmds[] = {"STRLEN","LLEN","SCARD","HLEN","ZCARD"};
     unsigned int i;
 
     /* Pipeline size commands */
     for(i=0;i<keys->elements;i++) {
-        /* Skip keys that were deleted */
-        if(types[i]==TYPE_NONE)
+        /* Skip keys that disappeared between SCAN and TYPE (or unknown types when not in memkeys mode) */
+        if(!types[i] || (!types[i]->sizecmd && !memkeys))
             continue;
 
-        redisAppendCommand(context, "%s %s", sizecmds[types[i]],
-            keys->element[i]->str);
+        if (!memkeys)
+            redisAppendCommand(context, "%s %s",
+                types[i]->sizecmd, keys->element[i]->str);
+        else if (memkeys_samples==0)
+            redisAppendCommand(context, "%s %s %s",
+                "MEMORY", "USAGE", keys->element[i]->str);
+        else
+            redisAppendCommand(context, "%s %s %s SAMPLES %u",
+                "MEMORY", "USAGE", keys->element[i]->str, memkeys_samples);
     }
 
     /* Retrieve sizes */
     for(i=0;i<keys->elements;i++) {
-        /* Skip keys that disappeared between SCAN and TYPE */
-        if(types[i] == TYPE_NONE) {
+        /* Skip keys that disappeared between SCAN and TYPE (or unknown types when not in memkeys mode) */
+        if(!types[i] || (!types[i]->sizecmd && !memkeys)) {
             sizes[i] = 0;
             continue;
         }
@@ -6561,7 +6697,8 @@ static void getKeySizes(redisReply *keys, int *types,
              * added as a different type between TYPE and SIZE */
             fprintf(stderr,
                 "Warning:  %s on '%s' failed (may have changed type)\n",
-                 sizecmds[types[i]], keys->element[i]->str);
+                !memkeys? types[i]->sizecmd: "MEMORY USAGE",
+                keys->element[i]->str);
             sizes[i] = 0;
         } else {
             sizes[i] = reply->integer;
@@ -6571,17 +6708,23 @@ static void getKeySizes(redisReply *keys, int *types,
     }
 }
 
-static void findBigKeys(void) {
-    unsigned long long biggest[TYPE_COUNT] = {0}, counts[TYPE_COUNT] = {0}, totalsize[TYPE_COUNT] = {0};
+static void findBigKeys(int memkeys, unsigned memkeys_samples) {
     unsigned long long sampled = 0, total_keys, totlen=0, *sizes=NULL, it=0;
-    sds maxkeys[TYPE_COUNT] = {0};
-    char *typename[] = {"string","list","set","hash","zset","stream","none"};
-    char *typeunit[] = {"bytes","items","members","fields","members","entries",""};
     redisReply *reply, *keys;
     unsigned int arrsize=0, i;
-    int type, *types=NULL;
+    dictIterator *di;
+    dictEntry *de;
+    typeinfo **types = NULL;
     double pct;
 
+    dict *types_dict = dictCreate(&typeinfoDictType, NULL);
+    typeinfo_add(types_dict, "string", &type_string);
+    typeinfo_add(types_dict, "list", &type_list);
+    typeinfo_add(types_dict, "set", &type_set);
+    typeinfo_add(types_dict, "hash", &type_hash);
+    typeinfo_add(types_dict, "zset", &type_zset);
+    typeinfo_add(types_dict, "stream", &type_stream);
+
     /* Total keys pre scanning */
     total_keys = getDbSize();
 
@@ -6590,15 +6733,6 @@ static void findBigKeys(void) {
     printf("# average sizes per key type.  You can use -i 0.1 to sleep 0.1 sec\n");
     printf("# per 100 SCAN commands (not usually needed).\n\n");
 
-    /* New up sds strings to keep track of overall biggest per type */
-    for(i=0;i<TYPE_NONE; i++) {
-        maxkeys[i] = sdsempty();
-        if(!maxkeys[i]) {
-            fprintf(stderr, "Failed to allocate memory for largest key names!\n");
-            exit(1);
-        }
-    }
-
     /* SCAN loop */
     do {
         /* Calculate approximate percentage completion */
@@ -6622,34 +6756,38 @@ static void findBigKeys(void) {
         }
 
         /* Retrieve types and then sizes */
-        getKeyTypes(keys, types);
-        getKeySizes(keys, types, sizes);
+        getKeyTypes(types_dict, keys, types);
+        getKeySizes(keys, types, sizes, memkeys, memkeys_samples);
 
         /* Now update our stats */
         for(i=0;i<keys->elements;i++) {
-            if((type = types[i]) == TYPE_NONE)
+            typeinfo *type = types[i];
+            /* Skip keys that disappeared between SCAN and TYPE */
+            if(!type)
                 continue;
 
-            totalsize[type] += sizes[i];
-            counts[type]++;
+            type->totalsize += sizes[i];
+            type->count++;
             totlen += keys->element[i]->len;
             sampled++;
 
-            if(biggest[type]<sizes[i]) {
+            if(type->biggest<sizes[i]) {
                 printf(
                    "[%05.2f%%] Biggest %-6s found so far '%s' with %llu %s\n",
-                   pct, typename[type], keys->element[i]->str, sizes[i],
-                   typeunit[type]);
+                   pct, type->name, keys->element[i]->str, sizes[i],
+                   !memkeys? type->sizeunit: "bytes");
 
                 /* Keep track of biggest key name for this type */
-                maxkeys[type] = sdscpy(maxkeys[type], keys->element[i]->str);
-                if(!maxkeys[type]) {
+                if (type->biggest_key)
+                    sdsfree(type->biggest_key);
+                type->biggest_key = sdsnew(keys->element[i]->str);
+                if(!type->biggest_key) {
                     fprintf(stderr, "Failed to allocate memory for key!\n");
                     exit(1);
                 }
 
                 /* Keep track of the biggest size for this type */
-                biggest[type] = sizes[i];
+                type->biggest = sizes[i];
             }
 
             /* Update overall progress */
@@ -6677,26 +6815,29 @@ static void findBigKeys(void) {
        totlen, totlen ? (double)totlen/sampled : 0);
 
     /* Output the biggest keys we found, for types we did find */
-    for(i=0;i<TYPE_NONE;i++) {
-        if(sdslen(maxkeys[i])>0) {
-            printf("Biggest %6s found '%s' has %llu %s\n", typename[i], maxkeys[i],
-               biggest[i], typeunit[i]);
+    di = dictGetIterator(types_dict);
+    while ((de = dictNext(di))) {
+        typeinfo *type = dictGetVal(de);
+        if(type->biggest_key) {
+            printf("Biggest %6s found '%s' has %llu %s\n", type->name, type->biggest_key,
+               type->biggest, !memkeys? type->sizeunit: "bytes");
         }
     }
+    dictReleaseIterator(di);
 
     printf("\n");
 
-    for(i=0;i<TYPE_NONE;i++) {
+    di = dictGetIterator(types_dict);
+    while ((de = dictNext(di))) {
+        typeinfo *type = dictGetVal(de);
         printf("%llu %ss with %llu %s (%05.2f%% of keys, avg size %.2f)\n",
-           counts[i], typename[i], totalsize[i], typeunit[i],
-           sampled ? 100 * (double)counts[i]/sampled : 0,
-           counts[i] ? (double)totalsize[i]/counts[i] : 0);
+           type->count, type->name, type->totalsize, !memkeys? type->sizeunit: "bytes",
+           sampled ? 100 * (double)type->count/sampled : 0,
+           type->count ? (double)type->totalsize/type->count : 0);
     }
+    dictReleaseIterator(di);
 
-    /* Free sds strings containing max keys */
-    for(i=0;i<TYPE_NONE;i++) {
-        sdsfree(maxkeys[i]);
-    }
+    dictRelease(types_dict);
 
     /* Success! */
     exit(0);
@@ -7271,12 +7412,14 @@ int main(int argc, char **argv) {
     /* Slave mode */
     if (config.slave_mode) {
         if (cliConnect(0) == REDIS_ERR) exit(1);
+        sendCapa();
         slaveMode();
     }
 
     /* Get RDB mode. */
     if (config.getrdb_mode) {
         if (cliConnect(0) == REDIS_ERR) exit(1);
+        sendCapa();
         getRDB();
     }
 
@@ -7289,7 +7432,13 @@ int main(int argc, char **argv) {
     /* Find big keys */
     if (config.bigkeys) {
         if (cliConnect(0) == REDIS_ERR) exit(1);
-        findBigKeys();
+        findBigKeys(0, 0);
+    }
+
+    /* Find keys consuming the most memory (--memkeys) */
+    if (config.memkeys) {
+        if (cliConnect(0) == REDIS_ERR) exit(1);
+        findBigKeys(1, config.memkeys_samples);
     }
 
     /* Find hot keys */
diff --git a/src/replication.c b/src/replication.cpp
similarity index 92%
rename from src/replication.c
rename to src/replication.cpp
index e625af854..b1abb285a 100644
--- a/src/replication.c
+++ b/src/replication.cpp
@@ -1,6 +1,7 @@
 /* Asynchronous replication implementation.
  *
  * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
+ * Copyright (c) 2019 John Sully <john at eqalpha dot com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -36,6 +37,7 @@
 #include <fcntl.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
+#include <mutex>
 
 void replicationDiscardCachedMaster(void);
 void replicationResurrectCachedMaster(int newfd);
@@ -76,7 +78,7 @@ char *replicationGetSlaveName(client *c) {
 
 void createReplicationBacklog(void) {
     serverAssert(server.repl_backlog == NULL);
-    server.repl_backlog = zmalloc(server.repl_backlog_size, MALLOC_LOCAL);
+    server.repl_backlog = (char*)zmalloc(server.repl_backlog_size, MALLOC_LOCAL);
     server.repl_backlog_histlen = 0;
     server.repl_backlog_idx = 0;
 
@@ -105,7 +107,7 @@ void resizeReplicationBacklog(long long newsize) {
          * worse often we need to alloc additional space before freeing the
          * old buffer. */
         zfree(server.repl_backlog);
-        server.repl_backlog = zmalloc(server.repl_backlog_size, MALLOC_LOCAL);
+        server.repl_backlog = (char*)zmalloc(server.repl_backlog_size, MALLOC_LOCAL);
         server.repl_backlog_histlen = 0;
         server.repl_backlog_idx = 0;
         /* Next byte we have is... the next since the buffer is empty. */
@@ -114,6 +116,7 @@ void resizeReplicationBacklog(long long newsize) {
 }
 
 void freeReplicationBacklog(void) {
+    serverAssert(aeThreadOwnsLock());
     serverAssert(listLength(server.slaves) == 0);
     zfree(server.repl_backlog);
     server.repl_backlog = NULL;
@@ -124,7 +127,8 @@ void freeReplicationBacklog(void) {
  * server.master_repl_offset, because there is no case where we want to feed
  * the backlog without incrementing the offset. */
 void feedReplicationBacklog(void *ptr, size_t len) {
-    unsigned char *p = ptr;
+    serverAssert(aeThreadOwnsLock());
+    unsigned char *p = (unsigned char*)ptr;
 
     server.master_repl_offset += len;
 
@@ -159,7 +163,7 @@ void feedReplicationBacklogWithObject(robj *o) {
         len = ll2string(llstr,sizeof(llstr),(long)ptrFromObj(o));
         p = llstr;
     } else {
-        len = sdslen(ptrFromObj(o));
+        len = sdslen((sds)ptrFromObj(o));
         p = ptrFromObj(o);
     }
     feedReplicationBacklog(p,len);
@@ -175,6 +179,7 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
     listIter li;
     int j, len;
     char llstr[LONG_STR_SIZE];
+    serverAssert(aeThreadOwnsLock());
 
     /* If the instance is not a top level master, return ASAP: we'll just proxy
      * the stream of data we receive from our master instead, in order to
@@ -190,6 +195,12 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
     /* We can't have slaves attached and no backlog. */
     serverAssert(!(listLength(slaves) != 0 && server.repl_backlog == NULL));
 
+    /* Get the lock on all slaves */
+    listRewind(slaves,&li);
+    while((ln = listNext(&li))) {
+        ((client*)ln->value)->lock.lock();
+    }
+
     /* Send SELECT command to every slave if needed. */
     if (server.slaveseldb != dictid) {
         robj *selectcmd;
@@ -213,9 +224,9 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
         /* Send it to slaves. */
         listRewind(slaves,&li);
         while((ln = listNext(&li))) {
-            client *slave = ln->value;
+            client *slave = (client*)ln->value;
             if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue;
-            addReply(slave,selectcmd);
+            addReplyAsync(slave,selectcmd);
         }
 
         if (dictid < 0 || dictid >= PROTO_SHARED_SELECT_CMDS)
@@ -253,7 +264,7 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
     /* Write the command to every slave. */
     listRewind(slaves,&li);
     while((ln = listNext(&li))) {
-        client *slave = ln->value;
+        client *slave = (client*)ln->value;
 
         /* Don't feed slaves that are still waiting for BGSAVE to start */
         if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue;
@@ -263,12 +274,18 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
          * or are already in sync with the master. */
 
         /* Add the multi bulk length. */
-        addReplyArrayLen(slave,argc);
+        addReplyArrayLenAsync(slave,argc);
 
         /* Finally any additional argument that was not stored inside the
          * static buffer if any (from j to argc). */
         for (j = 0; j < argc; j++)
-            addReplyBulk(slave,argv[j]);
+            addReplyBulkAsync(slave,argv[j]);
+    }
+
+    /* Release the lock on all slaves */
+    listRewind(slaves,&li);
+    while((ln = listNext(&li))) {
+        ((client*)ln->value)->lock.unlock();
     }
 }
 
@@ -292,12 +309,16 @@ void replicationFeedSlavesFromMasterStream(list *slaves, char *buf, size_t bufle
     if (server.repl_backlog) feedReplicationBacklog(buf,buflen);
     listRewind(slaves,&li);
     while((ln = listNext(&li))) {
-        client *slave = ln->value;
+        client *slave = (client*)ln->value;
+        std::lock_guard<decltype(slave->lock)> ulock(slave->lock);
 
         /* Don't feed slaves that are still waiting for BGSAVE to start */
         if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue;
-        addReplyProto(slave,buf,buflen);
+        addReplyProtoAsync(slave,buf,buflen);
     }
+    
+    if (listLength(slaves))
+        ProcessPendingAsyncWrites();    // flush them to their respective threads
 }
 
 void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, int argc) {
@@ -307,6 +328,7 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv,
     sds cmdrepr = sdsnew("+");
     robj *cmdobj;
     struct timeval tv;
+    serverAssert(aeThreadOwnsLock());
 
     gettimeofday(&tv,NULL);
     cmdrepr = sdscatprintf(cmdrepr,"%ld.%06ld ",(long)tv.tv_sec,(long)tv.tv_usec);
@@ -323,7 +345,7 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv,
             cmdrepr = sdscatprintf(cmdrepr, "\"%ld\"", (long)ptrFromObj(argv[j]));
         } else {
             cmdrepr = sdscatrepr(cmdrepr,(char*)ptrFromObj(argv[j]),
-                        sdslen(ptrFromObj(argv[j])));
+                        sdslen((sds)ptrFromObj(argv[j])));
         }
         if (j != argc-1)
             cmdrepr = sdscatlen(cmdrepr," ",1);
@@ -333,8 +355,9 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv,
 
     listRewind(monitors,&li);
     while((ln = listNext(&li))) {
-        client *monitor = ln->value;
-        addReply(monitor,cmdobj);
+        client *monitor = (client*)ln->value;
+        std::lock_guard<decltype(monitor->lock)> lock(monitor->lock);
+        addReplyAsync(monitor,cmdobj);
     }
     decrRefCount(cmdobj);
 }
@@ -445,8 +468,9 @@ int replicationSetupSlaveForFullResync(client *slave, long long offset) {
  * On success return C_OK, otherwise C_ERR is returned and we proceed
  * with the usual full resync. */
 int masterTryPartialResynchronization(client *c) {
+    serverAssert(aeThreadOwnsLock());
     long long psync_offset, psync_len;
-    char *master_replid = ptrFromObj(c->argv[1]);
+    char *master_replid = (char*)ptrFromObj(c->argv[1]);
     char buf[128];
     int buflen;
 
@@ -519,7 +543,10 @@ int masterTryPartialResynchronization(client *c) {
         buflen = snprintf(buf,sizeof(buf),"+CONTINUE\r\n");
     }
     if (write(c->fd,buf,buflen) != buflen) {
-        freeClientAsync(c);
+        if (FCorrectThread(c))
+            freeClient(c);
+        else
+            freeClientAsync(c);
         return C_OK;
     }
     psync_len = addReplyReplicationBacklog(c,psync_offset);
@@ -561,6 +588,7 @@ need_full_resync:
  *
  * Returns C_OK on success or C_ERR otherwise. */
 int startBgsaveForReplication(int mincapa) {
+    serverAssert(aeThreadOwnsLock());
     int retval;
     int socket_target = server.repl_diskless_sync && (mincapa & SLAVE_CAPA_EOF);
     listIter li;
@@ -590,7 +618,7 @@ int startBgsaveForReplication(int mincapa) {
         serverLog(LL_WARNING,"BGSAVE for replication failed");
         listRewind(server.slaves,&li);
         while((ln = listNext(&li))) {
-            client *slave = ln->value;
+            client *slave = (client*)ln->value;
 
             if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) {
                 slave->flags &= ~CLIENT_SLAVE;
@@ -608,7 +636,7 @@ int startBgsaveForReplication(int mincapa) {
     if (!socket_target) {
         listRewind(server.slaves,&li);
         while((ln = listNext(&li))) {
-            client *slave = ln->value;
+            client *slave = (client*)ln->value;
 
             if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) {
                     replicationSetupSlaveForFullResync(slave,
@@ -656,12 +684,12 @@ void syncCommand(client *c) {
      *
      * So the slave knows the new replid and offset to try a PSYNC later
      * if the connection with the master is lost. */
-    if (!strcasecmp(ptrFromObj(c->argv[0]),"psync")) {
+    if (!strcasecmp((const char*)ptrFromObj(c->argv[0]),"psync")) {
         if (masterTryPartialResynchronization(c) == C_OK) {
             server.stat_sync_partial_ok++;
             return; /* No full resync needed, return. */
         } else {
-            char *master_replid = ptrFromObj(c->argv[1]);
+            char *master_replid = (char*)ptrFromObj(c->argv[1]);
 
             /* Increment stats for failed PSYNCs, but only if the
              * replid is not "?", as this is used by slaves to force a full
@@ -711,9 +739,10 @@ void syncCommand(client *c) {
 
         listRewind(server.slaves,&li);
         while((ln = listNext(&li))) {
-            slave = ln->value;
+            slave = (client*)ln->value;
             if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END) break;
         }
+        
         /* To attach this slave, we check that it has at least all the
          * capabilities of the slave that triggered the current BGSAVE. */
         if (ln && ((c->slave_capa & slave->slave_capa) == slave->slave_capa)) {
@@ -785,15 +814,15 @@ void replconfCommand(client *c) {
 
     /* Process every option-value pair. */
     for (j = 1; j < c->argc; j+=2) {
-        if (!strcasecmp(ptrFromObj(c->argv[j]),"listening-port")) {
+        if (!strcasecmp((const char*)ptrFromObj(c->argv[j]),"listening-port")) {
             long port;
 
             if ((getLongFromObjectOrReply(c,c->argv[j+1],
                     &port,NULL) != C_OK))
                 return;
             c->slave_listening_port = port;
-        } else if (!strcasecmp(ptrFromObj(c->argv[j]),"ip-address")) {
-            sds ip = ptrFromObj(c->argv[j+1]);
+        } else if (!strcasecmp((const char*)ptrFromObj(c->argv[j]),"ip-address")) {
+            sds ip = (sds)ptrFromObj(c->argv[j+1]);
             if (sdslen(ip) < sizeof(c->slave_ip)) {
                 memcpy(c->slave_ip,ip,sdslen(ip)+1);
             } else {
@@ -801,13 +830,13 @@ void replconfCommand(client *c) {
                     "replica instance is too long: %zd bytes", sdslen(ip));
                 return;
             }
-        } else if (!strcasecmp(ptrFromObj(c->argv[j]),"capa")) {
+        } else if (!strcasecmp((const char*)ptrFromObj(c->argv[j]),"capa")) {
             /* Ignore capabilities not understood by this master. */
-            if (!strcasecmp(ptrFromObj(c->argv[j+1]),"eof"))
+            if (!strcasecmp((const char*)ptrFromObj(c->argv[j+1]),"eof"))
                 c->slave_capa |= SLAVE_CAPA_EOF;
-            else if (!strcasecmp(ptrFromObj(c->argv[j+1]),"psync2"))
+            else if (!strcasecmp((const char*)ptrFromObj(c->argv[j+1]),"psync2"))
                 c->slave_capa |= SLAVE_CAPA_PSYNC2;
-        } else if (!strcasecmp(ptrFromObj(c->argv[j]),"ack")) {
+        } else if (!strcasecmp((const char*)ptrFromObj(c->argv[j]),"ack")) {
             /* REPLCONF ACK is used by slave to inform the master the amount
              * of replication stream that it processed so far. It is an
              * internal only command that normal clients should never use. */
@@ -826,7 +855,7 @@ void replconfCommand(client *c) {
                 putSlaveOnline(c);
             /* Note: this command does not reply anything! */
             return;
-        } else if (!strcasecmp(ptrFromObj(c->argv[j]),"getack")) {
+        } else if (!strcasecmp((const char*)ptrFromObj(c->argv[j]),"getack")) {
             /* REPLCONF GETACK is used in order to request an ACK ASAP
              * to the slave. */
             if (server.masterhost && server.master) replicationSendAck();
@@ -856,7 +885,8 @@ void putSlaveOnline(client *slave) {
     slave->replstate = SLAVE_STATE_ONLINE;
     slave->repl_put_online_on_ack = 0;
     slave->repl_ack_time = server.unixtime; /* Prevent false timeout. */
-    if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
+    AssertCorrectThread(slave);
+    if (aeCreateFileEvent(server.rgthreadvar[slave->iel].el, slave->fd, AE_WRITABLE|AE_WRITE_THREADSAFE,
         sendReplyToClient, slave) == AE_ERR) {
         serverLog(LL_WARNING,"Unable to register writable event for replica bulk transfer: %s", strerror(errno));
         freeClient(slave);
@@ -868,9 +898,10 @@ void putSlaveOnline(client *slave) {
 }
 
 void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
-    client *slave = privdata;
+    client *slave = (client*)privdata;
     UNUSED(el);
     UNUSED(mask);
+    serverAssert(ielFromEventLoop(el) == slave->iel);
     char buf[PROTO_IOBUF_LEN];
     ssize_t nwritten, buflen;
 
@@ -878,6 +909,7 @@ void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
      * replication process. Currently the preamble is just the bulk count of
      * the file in the form "$<length>\r\n". */
     if (slave->replpreamble) {
+        serverAssert(slave->replpreamble[0] == '$');
         nwritten = write(fd,slave->replpreamble,sdslen(slave->replpreamble));
         if (nwritten == -1) {
             serverLog(LL_VERBOSE,"Write error sending RDB preamble to replica: %s",
@@ -918,7 +950,7 @@ void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
     if (slave->repldboff == slave->repldbsize) {
         close(slave->repldbfd);
         slave->repldbfd = -1;
-        aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
+        aeDeleteFileEvent(el,slave->fd,AE_WRITABLE);
         putSlaveOnline(slave);
     }
 }
@@ -937,20 +969,22 @@ void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
  * otherwise C_ERR is passed to the function.
  * The 'type' argument is the type of the child that terminated
  * (if it had a disk or socket target). */
-void updateSlavesWaitingBgsave(int bgsaveerr, int type) {
+void updateSlavesWaitingBgsave(int bgsaveerr, int type)
+{
     listNode *ln;
+    listIter li;
     int startbgsave = 0;
     int mincapa = -1;
-    listIter li;
+    serverAssert(aeThreadOwnsLock());
 
     listRewind(server.slaves,&li);
     while((ln = listNext(&li))) {
-        client *slave = ln->value;
+        client *slave = (client*)ln->value;
 
         if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) {
             startbgsave = 1;
             mincapa = (mincapa == -1) ? slave->slave_capa :
-                                        (mincapa & slave->slave_capa);
+                        (mincapa & slave->slave_capa);
         } else if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END) {
             struct redis_stat buf;
 
@@ -973,13 +1007,19 @@ void updateSlavesWaitingBgsave(int bgsaveerr, int type) {
                 slave->repl_ack_time = server.unixtime; /* Timeout otherwise. */
             } else {
                 if (bgsaveerr != C_OK) {
-                    freeClient(slave);
+                    if (FCorrectThread(slave))
+                        freeClient(slave);
+                    else
+                        freeClientAsync(slave);
                     serverLog(LL_WARNING,"SYNC failed. BGSAVE child returned an error");
                     continue;
                 }
                 if ((slave->repldbfd = open(server.rdb_filename,O_RDONLY)) == -1 ||
                     redis_fstat(slave->repldbfd,&buf) == -1) {
-                    freeClient(slave);
+                    if (FCorrectThread(slave))
+                        freeClient(slave);
+                    else
+                        freeClientAsync(slave);
                     serverLog(LL_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
                     continue;
                 }
@@ -989,15 +1029,28 @@ void updateSlavesWaitingBgsave(int bgsaveerr, int type) {
                 slave->replpreamble = sdscatprintf(sdsempty(),"$%lld\r\n",
                     (unsigned long long) slave->repldbsize);
 
-                aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
-                if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
-                    freeClient(slave);
-                    continue;
+                if (FCorrectThread(slave))
+                {
+                    aeDeleteFileEvent(server.rgthreadvar[slave->iel].el,slave->fd,AE_WRITABLE);
+                    if (aeCreateFileEvent(server.rgthreadvar[slave->iel].el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
+                        freeClient(slave);
+                    }
+                }
+                else
+                {
+                    aePostFunction(server.rgthreadvar[slave->iel].el, [slave]{
+                        aeDeleteFileEvent(server.rgthreadvar[slave->iel].el,slave->fd,AE_WRITABLE);
+                        if (aeCreateFileEvent(server.rgthreadvar[slave->iel].el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
+                            freeClient(slave);
+                        }
+                    });
                 }
             }
         }
     }
-    if (startbgsave) startBgsaveForReplication(mincapa);
+
+    if (startbgsave)
+        startBgsaveForReplication(mincapa);
 }
 
 /* Change the current instance replication ID with a new, random one.
@@ -1075,7 +1128,7 @@ void replicationEmptyDbCallback(void *privdata) {
  * performed, this function materializes the master client we store
  * at server.master, starting from the specified file descriptor. */
 void replicationCreateMasterClient(int fd, int dbid) {
-    server.master = createClient(fd);
+    server.master = createClient(fd, serverTL - server.rgthreadvar);
     server.master->flags |= CLIENT_MASTER;
     server.master->authenticated = 1;
     server.master->reploff = server.master_initial_offset;
@@ -1112,12 +1165,18 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) {
     UNUSED(privdata);
     UNUSED(mask);
 
+    serverAssert(aeThreadOwnsLock());
+
     /* Static vars used to hold the EOF mark, and the last bytes received
      * form the server: when they match, we reached the end of the transfer. */
     static char eofmark[CONFIG_RUN_ID_SIZE];
     static char lastbytes[CONFIG_RUN_ID_SIZE];
     static int usemark = 0;
 
+    /* When a mark is used, we want to detect EOF asap in order to avoid
+     * writing the EOF mark into the file... */
+    int eof_reached = 0;
+
     /* If repl_transfer_size == -1 we still have to read the bulk length
      * from the master reply. */
     if (server.repl_transfer_size == -1) {
@@ -1190,10 +1249,6 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) {
     }
     server.stat_net_input_bytes += nread;
 
-    /* When a mark is used, we want to detect EOF asap in order to avoid
-     * writing the EOF mark into the file... */
-    int eof_reached = 0;
-
     if (usemark) {
         /* Update the last bytes array, and check if it matches our delimiter.*/
         if (nread >= CONFIG_RUN_ID_SIZE) {
@@ -1276,7 +1331,7 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) {
          * handler, otherwise it will get called recursively since
          * rdbLoad() will call the event loop to process events from time to
          * time for non blocking loading. */
-        aeDeleteFileEvent(server.el,server.repl_transfer_s,AE_READABLE);
+        aeDeleteFileEvent(el,server.repl_transfer_s,AE_READABLE);
         serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Loading DB in memory");
         rdbSaveInfo rsi = RDB_SAVE_INFO_INIT;
         if (rdbLoad(server.rdb_filename,&rsi) != C_OK) {
@@ -1435,8 +1490,8 @@ char *sendSynchronousCommand(int flags, int fd, ...) {
 #define PSYNC_FULLRESYNC 3
 #define PSYNC_NOT_SUPPORTED 4
 #define PSYNC_TRY_LATER 5
-int slaveTryPartialResynchronization(int fd, int read_reply) {
-    char *psync_replid;
+int slaveTryPartialResynchronization(aeEventLoop *el, int fd, int read_reply) {
+    const char *psync_replid;
     char psync_offset[32];
     sds reply;
 
@@ -1464,7 +1519,7 @@ int slaveTryPartialResynchronization(int fd, int read_reply) {
         if (reply != NULL) {
             serverLog(LL_WARNING,"Unable to send PSYNC to master: %s",reply);
             sdsfree(reply);
-            aeDeleteFileEvent(server.el,fd,AE_READABLE);
+            aeDeleteFileEvent(el,fd,AE_READABLE);
             return PSYNC_WRITE_ERROR;
         }
         return PSYNC_WAIT_REPLY;
@@ -1479,7 +1534,7 @@ int slaveTryPartialResynchronization(int fd, int read_reply) {
         return PSYNC_WAIT_REPLY;
     }
 
-    aeDeleteFileEvent(server.el,fd,AE_READABLE);
+    aeDeleteFileEvent(el,fd,AE_READABLE);
 
     if (!strncmp(reply,"+FULLRESYNC",11)) {
         char *replid = NULL, *offset = NULL;
@@ -1528,13 +1583,13 @@ int slaveTryPartialResynchronization(int fd, int read_reply) {
         char *end = reply+9;
         while(end[0] != '\r' && end[0] != '\n' && end[0] != '\0') end++;
         if (end-start == CONFIG_RUN_ID_SIZE) {
-            char new[CONFIG_RUN_ID_SIZE+1];
-            memcpy(new,start,CONFIG_RUN_ID_SIZE);
-            new[CONFIG_RUN_ID_SIZE] = '\0';
+            char sznew[CONFIG_RUN_ID_SIZE+1];
+            memcpy(sznew,start,CONFIG_RUN_ID_SIZE);
+            sznew[CONFIG_RUN_ID_SIZE] = '\0';
 
-            if (strcmp(new,server.cached_master->replid)) {
+            if (strcmp(sznew,server.cached_master->replid)) {
                 /* Master ID changed. */
-                serverLog(LL_WARNING,"Master replication ID changed to %s",new);
+                serverLog(LL_WARNING,"Master replication ID changed to %s",sznew);
 
                 /* Set the old ID as our ID2, up to the current offset+1. */
                 memcpy(server.replid2,server.cached_master->replid,
@@ -1543,8 +1598,8 @@ int slaveTryPartialResynchronization(int fd, int read_reply) {
 
                 /* Update the cached master ID and our own primary ID to the
                  * new one. */
-                memcpy(server.replid,new,sizeof(server.replid));
-                memcpy(server.cached_master->replid,new,sizeof(server.replid));
+                memcpy(server.replid,sznew,sizeof(server.replid));
+                memcpy(server.cached_master->replid,sznew,sizeof(server.replid));
 
                 /* Disconnect all the sub-slaves: they need to be notified. */
                 disconnectSlaves();
@@ -1596,6 +1651,7 @@ int slaveTryPartialResynchronization(int fd, int read_reply) {
 /* This handler fires when the non blocking connect was able to
  * establish a connection with the master. */
 void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
+    serverAssert(aeThreadOwnsLock());
     char tmpfile[256], *err = NULL;
     int dfd = -1, maxtries = 5;
     int sockerr = 0, psync_result;
@@ -1626,7 +1682,7 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
         serverLog(LL_NOTICE,"Non blocking connect for SYNC fired the event.");
         /* Delete the writable event so that the readable event remains
          * registered and we can wait for the PONG reply. */
-        aeDeleteFileEvent(server.el,fd,AE_WRITABLE);
+        aeDeleteFileEvent(el,fd,AE_WRITABLE);
         server.repl_state = REPL_STATE_RECEIVE_PONG;
         /* Send the PING, don't check for errors at all, we have the timeout
          * that will take care about this. */
@@ -1661,7 +1717,13 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
 
     /* AUTH with the master if required. */
     if (server.repl_state == REPL_STATE_SEND_AUTH) {
-        if (server.masterauth) {
+        if (server.masteruser && server.masterauth) {
+            err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"AUTH",
+                                         server.masteruser,server.masterauth,NULL);
+            if (err) goto write_error;
+            server.repl_state = REPL_STATE_RECEIVE_AUTH;
+            return;
+        } else if (server.masterauth) {
             err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"AUTH",server.masterauth,NULL);
             if (err) goto write_error;
             server.repl_state = REPL_STATE_RECEIVE_AUTH;
@@ -1775,7 +1837,7 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
      * and the global offset, to try a partial resync at the next
      * reconnection attempt. */
     if (server.repl_state == REPL_STATE_SEND_PSYNC) {
-        if (slaveTryPartialResynchronization(fd,0) == PSYNC_WRITE_ERROR) {
+        if (slaveTryPartialResynchronization(el,fd,0) == PSYNC_WRITE_ERROR) {
             err = sdsnew("Write error sending the PSYNC command.");
             goto write_error;
         }
@@ -1791,7 +1853,7 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
         goto error;
     }
 
-    psync_result = slaveTryPartialResynchronization(fd,1);
+    psync_result = slaveTryPartialResynchronization(el,fd,1);
     if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */
 
     /* If the master is in an transient error, we should try to PSYNC
@@ -1841,7 +1903,7 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
     }
 
     /* Setup the non blocking download of the bulk file. */
-    if (aeCreateFileEvent(server.el,fd, AE_READABLE,readSyncBulkPayload,NULL)
+    if (aeCreateFileEvent(el,fd, AE_READABLE,readSyncBulkPayload,NULL)
             == AE_ERR)
     {
         serverLog(LL_WARNING,
@@ -1860,7 +1922,7 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
     return;
 
 error:
-    aeDeleteFileEvent(server.el,fd,AE_READABLE|AE_WRITABLE);
+    aeDeleteFileEvent(el,fd,AE_READABLE|AE_WRITABLE);
     if (dfd != -1) close(dfd);
     close(fd);
     server.repl_transfer_s = -1;
@@ -1884,7 +1946,7 @@ int connectWithMaster(void) {
         return C_ERR;
     }
 
-    if (aeCreateFileEvent(server.el,fd,AE_READABLE|AE_WRITABLE,syncWithMaster,NULL) ==
+    if (aeCreateFileEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,fd,AE_READABLE|AE_WRITABLE,syncWithMaster,NULL) ==
             AE_ERR)
     {
         close(fd);
@@ -1905,7 +1967,7 @@ int connectWithMaster(void) {
 void undoConnectWithMaster(void) {
     int fd = server.repl_transfer_s;
 
-    aeDeleteFileEvent(server.el,fd,AE_READABLE|AE_WRITABLE);
+    aeDeleteFileEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,fd,AE_READABLE|AE_WRITABLE);
     close(fd);
     server.repl_transfer_s = -1;
 }
@@ -1952,7 +2014,10 @@ void replicationSetMaster(char *ip, int port) {
     server.masterhost = sdsnew(ip);
     server.masterport = port;
     if (server.master) {
-        freeClient(server.master);
+        if (FCorrectThread(server.master))
+            freeClient(server.master);
+        else
+            freeClientAsync(server.master);
     }
     disconnectAllBlockedClients(); /* Clients blocked in master, now slave. */
 
@@ -1976,7 +2041,12 @@ void replicationUnsetMaster(void) {
      * used as secondary ID up to the current offset, and a new replication
      * ID is created to continue with a new replication history. */
     shiftReplicationId();
-    if (server.master) freeClient(server.master);
+    if (server.master) {
+        if (FCorrectThread(server.master))
+            freeClient(server.master);
+        else
+            freeClientAsync(server.master);
+    }
     replicationDiscardCachedMaster();
     cancelReplicationHandshake();
     /* Disconnecting all the slaves is required: we need to inform slaves
@@ -2020,8 +2090,8 @@ void replicaofCommand(client *c) {
 
     /* The special host/port combination "NO" "ONE" turns the instance
      * into a master. Otherwise the new master address is set. */
-    if (!strcasecmp(ptrFromObj(c->argv[1]),"no") &&
-        !strcasecmp(ptrFromObj(c->argv[2]),"one")) {
+    if (!strcasecmp((const char*)ptrFromObj(c->argv[1]),"no") &&
+        !strcasecmp((const char*)ptrFromObj(c->argv[2]),"one")) {
         if (server.masterhost) {
             replicationUnsetMaster();
             sds client = catClientInfoString(sdsempty(),c);
@@ -2036,21 +2106,21 @@ void replicaofCommand(client *c) {
             return;
 
         /* Check if we are already attached to the specified slave */
-        if (server.masterhost && !strcasecmp(server.masterhost,ptrFromObj(c->argv[1]))
+        if (server.masterhost && !strcasecmp(server.masterhost,(const char*)ptrFromObj(c->argv[1]))
             && server.masterport == port) {
             serverLog(LL_NOTICE,"REPLICAOF would result into synchronization with the master we are already connected with. No operation performed.");
-            addReplySds(c,sdsnew("+OK Already connected to specified master\r\n"));
+            addReplySdsAsync(c,sdsnew("+OK Already connected to specified master\r\n"));
             return;
         }
         /* There was no previous master or the user specified a different one,
          * we can continue. */
-        replicationSetMaster(ptrFromObj(c->argv[1]), port);
+        replicationSetMaster((char*)ptrFromObj(c->argv[1]), port);
         sds client = catClientInfoString(sdsempty(),c);
         serverLog(LL_NOTICE,"REPLICAOF %s:%d enabled (user request from '%s')",
             server.masterhost, server.masterport, client);
         sdsfree(client);
     }
-    addReply(c,shared.ok);
+    addReplyAsync(c,shared.ok);
 }
 
 /* ROLE command: provide information about the role of the instance
@@ -2069,7 +2139,7 @@ void roleCommand(client *c) {
         mbcount = addReplyDeferredLen(c);
         listRewind(server.slaves,&li);
         while((ln = listNext(&li))) {
-            client *slave = ln->value;
+            client *slave = (client*)ln->value;
             char ip[NET_IP_STR_LEN], *slaveip = slave->slave_ip;
 
             if (slaveip[0] == '\0') {
@@ -2086,7 +2156,7 @@ void roleCommand(client *c) {
         }
         setDeferredArrayLen(c,mbcount,slaves);
     } else {
-        char *slavestate = NULL;
+        const char *slavestate = NULL;
 
         addReplyArrayLen(c,5);
         addReplyBulkCBuffer(c,"slave",5);
@@ -2112,7 +2182,8 @@ void roleCommand(client *c) {
 /* Send a REPLCONF ACK command to the master to inform it about the current
  * processed offset. If we are not connected with a master, the command has
  * no effects. */
-void replicationSendAck(void) {
+void replicationSendAck(void) 
+{
     client *c = server.master;
 
     if (c != NULL) {
@@ -2148,6 +2219,8 @@ void replicationSendAck(void) {
 void replicationCacheMaster(client *c) {
     serverAssert(server.master != NULL && server.cached_master == NULL);
     serverLog(LL_NOTICE,"Caching the disconnected master state.");
+    AssertCorrectThread(c);
+    std::lock_guard<decltype(c->lock)> clientlock(c->lock);
 
     /* Unlink the client from the server structures. */
     unlinkClient(c);
@@ -2162,6 +2235,7 @@ void replicationCacheMaster(client *c) {
     if (c->flags & CLIENT_MULTI) discardTransaction(c);
     listEmpty(c->reply);
     c->sentlen = 0;
+    c->sentlenAsync = 0;
     c->reply_bytes = 0;
     c->bufpos = 0;
     resetClient(c);
@@ -2196,6 +2270,7 @@ void replicationCacheMasterUsingMyself(void) {
      * the new master will start its replication stream with SELECT. */
     server.master_initial_offset = server.master_repl_offset;
     replicationCreateMasterClient(-1,-1);
+    std::lock_guard<decltype(server.master->lock)> lock(server.master->lock);
 
     /* Use our own ID / offset. */
     memcpy(server.master->replid, server.replid, sizeof(server.replid));
@@ -2214,7 +2289,10 @@ void replicationDiscardCachedMaster(void) {
 
     serverLog(LL_NOTICE,"Discarding previously cached master state.");
     server.cached_master->flags &= ~CLIENT_MASTER;
-    freeClient(server.cached_master);
+    if (FCorrectThread(server.cached_master))
+        freeClient(server.cached_master);
+    else
+        freeClientAsync(server.cached_master);
     server.cached_master = NULL;
 }
 
@@ -2234,9 +2312,13 @@ void replicationResurrectCachedMaster(int newfd) {
     server.repl_state = REPL_STATE_CONNECTED;
     server.repl_down_since = 0;
 
+    /* Normally changing the thread of a client is a BIG NONO,
+        but this client was unlinked so it's OK here */
+    server.master->iel = serverTL - server.rgthreadvar; // marshal to this thread
+
     /* Re-add to the list of clients. */
     linkClient(server.master);
-    if (aeCreateFileEvent(server.el, newfd, AE_READABLE,
+    if (aeCreateFileEvent(server.rgthreadvar[server.master->iel].el, newfd, AE_READABLE|AE_READ_THREADSAFE,
                           readQueryFromClient, server.master)) {
         serverLog(LL_WARNING,"Error resurrecting the cached master, impossible to add the readable handler: %s", strerror(errno));
         freeClientAsync(server.master); /* Close ASAP. */
@@ -2245,7 +2327,7 @@ void replicationResurrectCachedMaster(int newfd) {
     /* We may also need to install the write handler as well if there is
      * pending data in the write buffers. */
     if (clientHasPendingReplies(server.master)) {
-        if (aeCreateFileEvent(server.el, newfd, AE_WRITABLE,
+        if (aeCreateFileEvent(server.rgthreadvar[server.master->iel].el, newfd, AE_WRITABLE|AE_WRITE_THREADSAFE,
                           sendReplyToClient, server.master)) {
             serverLog(LL_WARNING,"Error resurrecting the cached master, impossible to add the writable handler: %s", strerror(errno));
             freeClientAsync(server.master); /* Close ASAP. */
@@ -2268,7 +2350,7 @@ void refreshGoodSlavesCount(void) {
 
     listRewind(server.slaves,&li);
     while((ln = listNext(&li))) {
-        client *slave = ln->value;
+        client *slave = (client*)ln->value;
         time_t lag = server.unixtime - slave->repl_ack_time;
 
         if (slave->replstate == SLAVE_STATE_ONLINE &&
@@ -2342,7 +2424,7 @@ void replicationScriptCacheAdd(sds sha1) {
     if (listLength(server.repl_scriptcache_fifo) == server.repl_scriptcache_size)
     {
         listNode *ln = listLast(server.repl_scriptcache_fifo);
-        sds oldest = listNodeValue(ln);
+        sds oldest = (sds)listNodeValue(ln);
 
         retval = dictDelete(server.repl_scriptcache_dict,oldest);
         serverAssert(retval == DICT_OK);
@@ -2404,7 +2486,7 @@ int replicationCountAcksByOffset(long long offset) {
 
     listRewind(server.slaves,&li);
     while((ln = listNext(&li))) {
-        client *slave = ln->value;
+        client *slave = (client*)ln->value;
 
         if (slave->replstate != SLAVE_STATE_ONLINE) continue;
         if (slave->repl_ack_off >= offset) count++;
@@ -2471,7 +2553,8 @@ void processClientsWaitingReplicas(void) {
 
     listRewind(server.clients_waiting_acks,&li);
     while((ln = listNext(&li))) {
-        client *c = ln->value;
+        client *c = (client*)ln->value;
+        fastlock_lock(&c->lock);
 
         /* Every time we find a client that is satisfied for a given
          * offset and number of replicas, we remember it so the next client
@@ -2481,7 +2564,7 @@ void processClientsWaitingReplicas(void) {
                            last_numreplicas > c->bpop.numreplicas)
         {
             unblockClient(c);
-            addReplyLongLong(c,last_numreplicas);
+            addReplyLongLongAsync(c,last_numreplicas);
         } else {
             int numreplicas = replicationCountAcksByOffset(c->bpop.reploffset);
 
@@ -2489,9 +2572,10 @@ void processClientsWaitingReplicas(void) {
                 last_offset = c->bpop.reploffset;
                 last_numreplicas = numreplicas;
                 unblockClient(c);
-                addReplyLongLong(c,numreplicas);
+                addReplyLongLongAsync(c,numreplicas);
             }
         }
+        fastlock_unlock(&c->lock);
     }
 }
 
@@ -2519,7 +2603,11 @@ long long replicationGetSlaveOffset(void) {
 
 /* Replication cron function, called 1 time per second. */
 void replicationCron(void) {
+    serverAssert(aeThreadOwnsLock());
     static long long replication_cron_loops = 0;
+    std::unique_lock<decltype(server.master->lock)> ulock;
+    if (server.master != nullptr)
+        ulock = decltype(ulock)(server.master->lock);
 
     /* Non blocking connection timeout? */
     if (server.masterhost &&
@@ -2544,7 +2632,10 @@ void replicationCron(void) {
         (time(NULL)-server.master->lastinteraction) > server.repl_timeout)
     {
         serverLog(LL_WARNING,"MASTER timeout: no data nor PING received...");
-        freeClient(server.master);
+        if (FCorrectThread(server.master))
+            freeClient(server.master);
+        else
+            freeClientAsync(server.master);
     }
 
     /* Check if we should connect to a MASTER */
@@ -2597,7 +2688,7 @@ void replicationCron(void) {
      * timeouts are set at a few seconds (example: PSYNC response). */
     listRewind(server.slaves,&li);
     while((ln = listNext(&li))) {
-        client *slave = ln->value;
+        client *slave = (client*)ln->value;
 
         int is_presync =
             (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START ||
@@ -2618,7 +2709,7 @@ void replicationCron(void) {
 
         listRewind(server.slaves,&li);
         while((ln = listNext(&li))) {
-            client *slave = ln->value;
+            client *slave = (client*)ln->value;
 
             if (slave->replstate != SLAVE_STATE_ONLINE) continue;
             if (slave->flags & CLIENT_PRE_PSYNC) continue;
@@ -2626,7 +2717,10 @@ void replicationCron(void) {
             {
                 serverLog(LL_WARNING, "Disconnecting timedout replica: %s",
                     replicationGetSlaveName(slave));
-                freeClient(slave);
+                if (FCorrectThread(slave))
+                    freeClient(slave);
+                else
+                    freeClientAsync(slave);
             }
         }
     }
@@ -2693,7 +2787,7 @@ void replicationCron(void) {
 
         listRewind(server.slaves,&li);
         while((ln = listNext(&li))) {
-            client *slave = ln->value;
+            client *slave = (client*)ln->value;
             if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) {
                 idle = server.unixtime - slave->lastinteraction;
                 if (idle > max_idle) max_idle = idle;
diff --git a/src/scripting.c b/src/scripting.cpp
similarity index 95%
rename from src/scripting.c
rename to src/scripting.cpp
index 2f44a511d..71d1a2815 100644
--- a/src/scripting.c
+++ b/src/scripting.cpp
@@ -32,11 +32,14 @@
 #include "rand.h"
 #include "cluster.h"
 
+extern "C" {
 #include <lua.h>
 #include <lauxlib.h>
 #include <lualib.h>
+}
 #include <ctype.h>
 #include <math.h>
+#include <mutex>
 
 char *redisProtocolToLuaType_Int(lua_State *lua, char *reply);
 char *redisProtocolToLuaType_Bulk(lua_State *lua, char *reply);
@@ -89,7 +92,7 @@ struct ldbState {
 void sha1hex(char *digest, char *script, size_t len) {
     SHA1_CTX ctx;
     unsigned char hash[20];
-    char *cset = "0123456789abcdef";
+    const char *cset = "0123456789abcdef";
     int j;
 
     SHA1Init(&ctx);
@@ -223,7 +226,7 @@ char *redisProtocolToLuaType_MultiBulk(lua_State *lua, char *reply, int atype) {
  * with a single "err" field set to the error string. Note that this
  * table is never a valid reply by proper commands, since the returned
  * tables are otherwise always indexed by integers, never by strings. */
-void luaPushError(lua_State *lua, char *error) {
+void luaPushError(lua_State *lua, const char *error) {
     lua_Debug dbg;
 
     /* If debugging is active and in step mode, log errors resulting from
@@ -365,9 +368,17 @@ void luaReplyToRedisReply(client *c, lua_State *lua) {
 #define LUA_CMD_OBJCACHE_MAX_LEN 64
 int luaRedisGenericCommand(lua_State *lua, int raise_error) {
     int j, argc = lua_gettop(lua);
+    int acl_retval = 0;
+    int call_flags = CMD_CALL_SLOWLOG | CMD_CALL_STATS;
     struct redisCommand *cmd;
     client *c = server.lua_client;
     sds reply;
+    
+    // Ensure our client is on the right thread
+    serverAssert(!(c->flags & CLIENT_PENDING_WRITE));
+    serverAssert(!(c->flags & CLIENT_UNBLOCKED));
+    serverAssert(aeThreadOwnsLock());
+    c->iel = serverTL - server.rgthreadvar;
 
     /* Cached across calls. */
     static robj **argv = NULL;
@@ -388,7 +399,7 @@ int luaRedisGenericCommand(lua_State *lua, int raise_error) {
      * To make this function reentrant is futile and makes it slower, but
      * we should at least detect such a misuse, and abort. */
     if (inuse) {
-        char *recursion_warning =
+        const char *recursion_warning =
             "luaRedisGenericCommand() recursive call detected. "
             "Are you doing funny stuff with Lua debug hooks?";
         serverLog(LL_WARNING,"%s",recursion_warning);
@@ -396,6 +407,7 @@ int luaRedisGenericCommand(lua_State *lua, int raise_error) {
         return 1;
     }
     inuse++;
+    std::unique_lock<decltype(c->lock)> ulock(c->lock);
 
     /* Require at least one argument */
     if (argc == 0) {
@@ -407,7 +419,7 @@ int luaRedisGenericCommand(lua_State *lua, int raise_error) {
 
     /* Build the arguments vector */
     if (argv_size < argc) {
-        argv = zrealloc(argv,sizeof(robj*)*argc, MALLOC_LOCAL);
+        argv = (robj**)zrealloc(argv,sizeof(robj*)*argc, MALLOC_LOCAL);
         argv_size = argc;
     }
 
@@ -432,7 +444,7 @@ int luaRedisGenericCommand(lua_State *lua, int raise_error) {
         if (j < LUA_CMD_OBJCACHE_SIZE && cached_objects[j] &&
             cached_objects_len[j] >= obj_len)
         {
-            sds s = ptrFromObj(cached_objects[j]);
+            sds s = (sds)ptrFromObj(cached_objects[j]);
             argv[j] = cached_objects[j];
             cached_objects[j] = NULL;
             memcpy(s,obj_s,obj_len+1);
@@ -472,14 +484,14 @@ int luaRedisGenericCommand(lua_State *lua, int raise_error) {
                 break;
             } else {
                 cmdlog = sdscatlen(cmdlog," ",1);
-                cmdlog = sdscatsds(cmdlog,ptrFromObj(c->argv[j]));
+                cmdlog = sdscatsds(cmdlog,(sds)ptrFromObj(c->argv[j]));
             }
         }
         ldbLog(cmdlog);
     }
 
     /* Command lookup */
-    cmd = lookupCommand(ptrFromObj(argv[0]));
+    cmd = lookupCommand((sds)ptrFromObj(argv[0]));
     if (!cmd || ((cmd->arity > 0 && cmd->arity != argc) ||
                    (argc < -cmd->arity)))
     {
@@ -499,7 +511,7 @@ int luaRedisGenericCommand(lua_State *lua, int raise_error) {
     }
 
     /* Check the ACLs. */
-    int acl_retval = ACLCheckCommandPerm(c);
+    acl_retval = ACLCheckCommandPerm(c);
     if (acl_retval != ACL_OK) {
         if (acl_retval == ACL_DENIED_CMD)
             luaPushError(lua, "The user executing the script can't run this "
@@ -524,11 +536,11 @@ int luaRedisGenericCommand(lua_State *lua, int raise_error) {
                    !server.loading &&
                    !(server.lua_caller->flags & CLIENT_MASTER))
         {
-            luaPushError(lua, ptrFromObj(shared.roslaveerr));
+            luaPushError(lua, (char*)ptrFromObj(shared.roslaveerr));
             goto cleanup;
         } else if (deny_write_type != DISK_ERROR_TYPE_NONE) {
             if (deny_write_type == DISK_ERROR_TYPE_RDB) {
-                luaPushError(lua, ptrFromObj(shared.bgsaveerr));
+                luaPushError(lua, (char*)ptrFromObj(shared.bgsaveerr));
             } else {
                 sds aof_write_err = sdscatfmt(sdsempty(),
                     "-MISCONF Errors writing to the AOF file: %s\r\n",
@@ -551,7 +563,7 @@ int luaRedisGenericCommand(lua_State *lua, int raise_error) {
         (cmd->flags & CMD_DENYOOM))
     {
         if (getMaxmemoryState(NULL,NULL,NULL,NULL) != C_OK) {
-            luaPushError(lua, ptrFromObj(shared.oomerr));
+            luaPushError(lua, (char*)ptrFromObj(shared.oomerr));
             goto cleanup;
         }
     }
@@ -592,7 +604,6 @@ int luaRedisGenericCommand(lua_State *lua, int raise_error) {
     }
 
     /* Run the command */
-    int call_flags = CMD_CALL_SLOWLOG | CMD_CALL_STATS;
     if (server.lua_replicate_commands) {
         /* Set flags according to redis.set_repl() settings. */
         if (server.lua_repl & PROPAGATE_AOF)
@@ -616,9 +627,9 @@ int luaRedisGenericCommand(lua_State *lua, int raise_error) {
         reply = sdsnewlen(c->buf,c->bufpos);
         c->bufpos = 0;
         while(listLength(c->reply)) {
-            clientReplyBlock *o = listNodeValue(listFirst(c->reply));
+            clientReplyBlock *o = (clientReplyBlock*)listNodeValue(listFirst(c->reply));
 
-            reply = sdscatlen(reply,o->buf,o->used);
+            reply = sdscatlen(reply,o->buf(),o->used);
             listDelNode(c->reply,listFirst(c->reply));
         }
     }
@@ -652,9 +663,9 @@ cleanup:
             o->refcount == 1 &&
             (o->encoding == OBJ_ENCODING_RAW ||
              o->encoding == OBJ_ENCODING_EMBSTR) &&
-            sdslen(ptrFromObj(o)) <= LUA_CMD_OBJCACHE_MAX_LEN)
+            sdslen((sds)ptrFromObj(o)) <= LUA_CMD_OBJCACHE_MAX_LEN)
         {
-            sds s = ptrFromObj(o);
+            sds s = (sds)ptrFromObj(o);
             if (cached_objects[j]) decrRefCount(cached_objects[j]);
             cached_objects[j] = o;
             cached_objects_len[j] = sdsalloc(s);
@@ -718,7 +729,7 @@ int luaRedisSha1hexCommand(lua_State *lua) {
  * return redis.error_reply("ERR Some Error")
  * return redis.status_reply("ERR Some Error")
  */
-int luaRedisReturnSingleFieldTable(lua_State *lua, char *field) {
+int luaRedisReturnSingleFieldTable(lua_State *lua, const char *field) {
     if (lua_gettop(lua) != 1 || lua_type(lua,-1) != LUA_TSTRING) {
         luaPushError(lua, "wrong number or type of arguments");
         return 1;
@@ -864,10 +875,12 @@ void luaLoadLib(lua_State *lua, const char *libname, lua_CFunction luafunc) {
   lua_call(lua, 1, 0);
 }
 
+extern "C" {
 LUALIB_API int (luaopen_cjson) (lua_State *L);
 LUALIB_API int (luaopen_struct) (lua_State *L);
 LUALIB_API int (luaopen_cmsgpack) (lua_State *L);
 LUALIB_API int (luaopen_bit) (lua_State *L);
+}
 
 void luaLoadLibraries(lua_State *lua) {
     luaLoadLib(lua, "", luaopen_base);
@@ -901,7 +914,7 @@ void luaRemoveUnsupportedFunctions(lua_State *lua) {
  * It should be the last to be called in the scripting engine initialization
  * sequence, because it may interact with creation of globals. */
 void scriptingEnableGlobalsProtection(lua_State *lua) {
-    char *s[32];
+    const char *s[32];
     sds code = sdsempty();
     int j = 0;
 
@@ -1069,7 +1082,7 @@ void scriptingInit(int setup) {
     /* Add a helper function that we use to sort the multi bulk output of non
      * deterministic commands, when containing 'false' elements. */
     {
-        char *compare_func =    "function __redis__compare_helper(a,b)\n"
+        const char *compare_func =    "function __redis__compare_helper(a,b)\n"
                                 "  if a == false then a = '' end\n"
                                 "  if b == false then b = '' end\n"
                                 "  return a<b\n"
@@ -1083,7 +1096,7 @@ void scriptingInit(int setup) {
      * information about the caller, that's what makes sense from the point
      * of view of the user debugging a script. */
     {
-        char *errh_func =       "local dbg = debug\n"
+        const char *errh_func =       "local dbg = debug\n"
                                 "function __redis__err__handler(err)\n"
                                 "  local i = dbg.getinfo(2,'nSl')\n"
                                 "  if i and i.what == 'C' then\n"
@@ -1104,7 +1117,7 @@ void scriptingInit(int setup) {
      * Note: there is no need to create it again when this function is called
      * by scriptingReset(). */
     if (server.lua_client == NULL) {
-        server.lua_client = createClient(-1);
+        server.lua_client = createClient(-1, IDX_EVENT_LOOP_MAIN);
         server.lua_client->flags |= CLIENT_LUA;
     }
 
@@ -1131,12 +1144,12 @@ void scriptingReset(void) {
 
 /* Set an array of Redis String Objects as a Lua array (table) stored into a
  * global variable. */
-void luaSetGlobalArray(lua_State *lua, char *var, robj **elev, int elec) {
+void luaSetGlobalArray(lua_State *lua, const char *var, robj **elev, int elec) {
     int j;
 
     lua_newtable(lua);
     for (j = 0; j < elec; j++) {
-        lua_pushlstring(lua,(char*)ptrFromObj(elev[j]),sdslen(ptrFromObj(elev[j])));
+        lua_pushlstring(lua,(char*)ptrFromObj(elev[j]),sdslen((sds)ptrFromObj(elev[j])));
         lua_rawseti(lua,-2,j+1);
     }
     lua_setglobal(lua,var);
@@ -1212,19 +1225,19 @@ sds luaCreateFunction(client *c, lua_State *lua, robj *body) {
 
     funcname[0] = 'f';
     funcname[1] = '_';
-    sha1hex(funcname+2,ptrFromObj(body),sdslen(ptrFromObj(body)));
+    sha1hex(funcname+2,(char*)ptrFromObj(body),sdslen((sds)ptrFromObj(body)));
 
     sds sha = sdsnewlen(funcname+2,40);
     if ((de = dictFind(server.lua_scripts,sha)) != NULL) {
         sdsfree(sha);
-        return dictGetKey(de);
+        return (sds)dictGetKey(de);
     }
 
     sds funcdef = sdsempty();
     funcdef = sdscat(funcdef,"function ");
     funcdef = sdscatlen(funcdef,funcname,42);
     funcdef = sdscatlen(funcdef,"() ",3);
-    funcdef = sdscatlen(funcdef,ptrFromObj(body),sdslen(ptrFromObj(body)));
+    funcdef = sdscatlen(funcdef,ptrFromObj(body),sdslen((sds)ptrFromObj(body)));
     funcdef = sdscatlen(funcdef,"\nend",4);
 
     if (luaL_loadbuffer(lua,funcdef,sdslen(funcdef),"@user_script")) {
@@ -1278,7 +1291,7 @@ void luaMaskCountHook(lua_State *lua, lua_Debug *ar) {
          * here when the EVAL command will return. */
         protectClient(server.lua_caller);
     }
-    if (server.lua_timedout) processEventsWhileBlocked();
+    if (server.lua_timedout) processEventsWhileBlocked(serverTL - server.rgthreadvar);
     if (server.lua_kill) {
         serverLog(LL_WARNING,"Lua script killed by user with SCRIPT KILL.");
         lua_pushstring(lua,"Script killed by user with SCRIPT KILL...");
@@ -1328,11 +1341,11 @@ void evalGenericCommand(client *c, int evalsha) {
     funcname[1] = '_';
     if (!evalsha) {
         /* Hash the code if this is an EVAL call */
-        sha1hex(funcname+2,ptrFromObj(c->argv[1]),sdslen(ptrFromObj(c->argv[1])));
+        sha1hex(funcname+2,(char*)ptrFromObj(c->argv[1]),sdslen((sds)ptrFromObj(c->argv[1])));
     } else {
         /* We already have the SHA if it is a EVALSHA */
         int j;
-        char *sha = ptrFromObj(c->argv[1]);
+        char *sha = (char*)ptrFromObj(c->argv[1]);
 
         /* Convert to lowercase. We don't use tolower since the function
          * managed to always show up in the profiler output consuming
@@ -1464,13 +1477,13 @@ void evalGenericCommand(client *c, int evalsha) {
      * flush our cache of scripts that can be replicated as EVALSHA, while
      * for AOF we need to do so every time we rewrite the AOF file. */
     if (evalsha && !server.lua_replicate_commands) {
-        if (!replicationScriptCacheExists(ptrFromObj(c->argv[1]))) {
+        if (!replicationScriptCacheExists((sds)ptrFromObj(c->argv[1]))) {
             /* This script is not in our script cache, replicate it as
              * EVAL, then add it into the script cache, as from now on
              * slaves and AOF know about it. */
-            robj *script = dictFetchValue(server.lua_scripts,ptrFromObj(c->argv[1]));
+            robj *script = (robj*)dictFetchValue(server.lua_scripts,ptrFromObj(c->argv[1]));
 
-            replicationScriptCacheAdd(ptrFromObj(c->argv[1]));
+            replicationScriptCacheAdd((sds)ptrFromObj(c->argv[1]));
             serverAssertWithInfo(c,NULL,script != NULL);
 
             /* If the script did not produce any changes in the dataset we want
@@ -1500,7 +1513,7 @@ void evalCommand(client *c) {
 }
 
 void evalShaCommand(client *c) {
-    if (sdslen(ptrFromObj(c->argv[1])) != 40) {
+    if (sdslen((sds)ptrFromObj(c->argv[1])) != 40) {
         /* We know that a match is not possible if the provided SHA is
          * not the right length. So we return an error ASAP, this way
          * evalGenericCommand() can be implemented without string length
@@ -1517,7 +1530,7 @@ void evalShaCommand(client *c) {
 }
 
 void scriptCommand(client *c) {
-    if (c->argc == 2 && !strcasecmp(ptrFromObj(c->argv[1]),"help")) {
+    if (c->argc == 2 && !strcasecmp((const char*)ptrFromObj(c->argv[1]),"help")) {
         const char *help[] = {
 "DEBUG (yes|sync|no) -- Set the debug mode for subsequent scripts executed.",
 "EXISTS <sha1> [<sha1> ...] -- Return information about the existence of the scripts in the script cache.",
@@ -1527,12 +1540,12 @@ void scriptCommand(client *c) {
 NULL
         };
         addReplyHelp(c, help);
-    } else if (c->argc == 2 && !strcasecmp(ptrFromObj(c->argv[1]),"flush")) {
+    } else if (c->argc == 2 && !strcasecmp((const char*)ptrFromObj(c->argv[1]),"flush")) {
         scriptingReset();
         addReply(c,shared.ok);
         replicationScriptCacheFlush();
         server.dirty++; /* Propagating this command is a good idea. */
-    } else if (c->argc >= 2 && !strcasecmp(ptrFromObj(c->argv[1]),"exists")) {
+    } else if (c->argc >= 2 && !strcasecmp((const char*)ptrFromObj(c->argv[1]),"exists")) {
         int j;
 
         addReplyArrayLen(c, c->argc-2);
@@ -1542,12 +1555,12 @@ NULL
             else
                 addReply(c,shared.czero);
         }
-    } else if (c->argc == 3 && !strcasecmp(ptrFromObj(c->argv[1]),"load")) {
+    } else if (c->argc == 3 && !strcasecmp((const char*)ptrFromObj(c->argv[1]),"load")) {
         sds sha = luaCreateFunction(c,server.lua,c->argv[2]);
         if (sha == NULL) return; /* The error was sent by luaCreateFunction(). */
         addReplyBulkCBuffer(c,sha,40);
         forceCommandPropagation(c,PROPAGATE_REPL|PROPAGATE_AOF);
-    } else if (c->argc == 2 && !strcasecmp(ptrFromObj(c->argv[1]),"kill")) {
+    } else if (c->argc == 2 && !strcasecmp((const char*)ptrFromObj(c->argv[1]),"kill")) {
         if (server.lua_caller == NULL) {
             addReplySds(c,sdsnew("-NOTBUSY No scripts in execution right now.\r\n"));
         } else if (server.lua_caller->flags & CLIENT_MASTER) {
@@ -1558,18 +1571,18 @@ NULL
             server.lua_kill = 1;
             addReply(c,shared.ok);
         }
-    } else if (c->argc == 3 && !strcasecmp(ptrFromObj(c->argv[1]),"debug")) {
+    } else if (c->argc == 3 && !strcasecmp((const char*)ptrFromObj(c->argv[1]),"debug")) {
         if (clientHasPendingReplies(c)) {
             addReplyError(c,"SCRIPT DEBUG must be called outside a pipeline");
             return;
         }
-        if (!strcasecmp(ptrFromObj(c->argv[2]),"no")) {
+        if (!strcasecmp((const char*)ptrFromObj(c->argv[2]),"no")) {
             ldbDisable(c);
             addReply(c,shared.ok);
-        } else if (!strcasecmp(ptrFromObj(c->argv[2]),"yes")) {
+        } else if (!strcasecmp((const char*)ptrFromObj(c->argv[2]),"yes")) {
             ldbEnable(c);
             addReply(c,shared.ok);
-        } else if (!strcasecmp(ptrFromObj(c->argv[2]),"sync")) {
+        } else if (!strcasecmp((const char*)ptrFromObj(c->argv[2]),"sync")) {
             ldbEnable(c);
             addReply(c,shared.ok);
             c->flags |= CLIENT_LUA_DEBUG_SYNC;
@@ -1660,8 +1673,8 @@ void ldbSendLogs(void) {
     while(listLength(ldb.logs)) {
         listNode *ln = listFirst(ldb.logs);
         proto = sdscatlen(proto,"+",1);
-        sdsmapchars(ln->value,"\r\n","  ",2);
-        proto = sdscatsds(proto,ln->value);
+        sdsmapchars((sds)ln->value,"\r\n","  ",2);
+        proto = sdscatsds(proto,(sds)ln->value);
         proto = sdscatlen(proto,"\r\n",2);
         listDelNode(ldb.logs,ln);
     }
@@ -1724,7 +1737,7 @@ int ldbStartSession(client *c) {
 
     /* First argument of EVAL is the script itself. We split it into different
      * lines since this is the way the debugger accesses the source code. */
-    sds srcstring = sdsdup(ptrFromObj(c->argv[1]));
+    sds srcstring = sdsdup((sds)ptrFromObj(c->argv[1]));
     size_t srclen = sdslen(srcstring);
     while(srclen && (srcstring[srclen-1] == '\n' ||
                      srcstring[srclen-1] == '\r'))
@@ -1814,7 +1827,7 @@ void evalGenericCommandWithDebugging(client *c, int evalsha) {
 
 /* Return a pointer to ldb.src source code line, considering line to be
  * one-based, and returning a special string for out of range lines. */
-char *ldbGetSourceLine(int line) {
+const char *ldbGetSourceLine(int line) {
     int idx = line-1;
     if (idx < 0 || idx >= ldb.lines) return "<out of range source code line>";
     return ldb.src[idx];
@@ -1862,6 +1875,7 @@ int ldbDelBreakpoint(int line) {
 sds *ldbReplParseCommand(int *argcp) {
     sds *argv = NULL;
     int argc = 0;
+    char *plen = NULL;
     if (sdslen(ldb.cbuf) == 0) return NULL;
 
     /* Working on a copy is simpler in this case. We can modify it freely
@@ -1875,14 +1889,14 @@ sds *ldbReplParseCommand(int *argcp) {
 
     /* Seek and parse *<count>\r\n. */
     p = strchr(p,'*'); if (!p) goto protoerr;
-    char *plen = p+1; /* Multi bulk len pointer. */
+    plen = p+1; /* Multi bulk len pointer. */
     p = strstr(p,"\r\n"); if (!p) goto protoerr;
     *p = '\0'; p += 2;
     *argcp = atoi(plen);
     if (*argcp <= 0 || *argcp > 1024) goto protoerr;
 
     /* Parse each argument. */
-    argv = zmalloc(sizeof(sds)*(*argcp), MALLOC_LOCAL);
+    argv = (sds*)zmalloc(sizeof(sds)*(*argcp), MALLOC_LOCAL);
     argc = 0;
     while(argc < *argcp) {
         if (*p != '$') goto protoerr;
@@ -1907,8 +1921,8 @@ protoerr:
 
 /* Log the specified line in the Lua debugger output. */
 void ldbLogSourceLine(int lnum) {
-    char *line = ldbGetSourceLine(lnum);
-    char *prefix;
+    const char *line = ldbGetSourceLine(lnum);
+    const char *prefix;
     int bp = ldbIsBreakpoint(lnum);
     int current = ldb.currentline == lnum;
 
@@ -2014,12 +2028,12 @@ sds ldbCatStackValueRec(sds s, lua_State *lua, int idx, int level) {
     case LUA_TLIGHTUSERDATA:
         {
         const void *p = lua_topointer(lua,idx);
-        char *typename = "unknown";
-        if (t == LUA_TFUNCTION) typename = "function";
-        else if (t == LUA_TUSERDATA) typename = "userdata";
-        else if (t == LUA_TTHREAD) typename = "thread";
-        else if (t == LUA_TLIGHTUSERDATA) typename = "light-userdata";
-        s = sdscatprintf(s,"\"%s@%p\"",typename,p);
+        const char *tname = "unknown";
+        if (t == LUA_TFUNCTION) tname = "function";
+        else if (t == LUA_TUSERDATA) tname = "userdata";
+        else if (t == LUA_TTHREAD) tname = "thread";
+        else if (t == LUA_TLIGHTUSERDATA) tname = "light-userdata";
+        s = sdscatprintf(s,"\"%s@%p\"",tname,p);
         }
         break;
     default:
@@ -2038,7 +2052,7 @@ sds ldbCatStackValue(sds s, lua_State *lua, int idx) {
 /* Produce a debugger log entry representing the value of the Lua object
  * currently on the top of the stack. The element is ot popped nor modified.
  * Check ldbCatStackValue() for the actual implementation. */
-void ldbLogStackValue(lua_State *lua, char *prefix) {
+void ldbLogStackValue(lua_State *lua, const char *prefix) {
     sds s = sdsnew(prefix);
     s = ldbCatStackValue(s,lua,-1);
     ldbLogWithMaxLen(s);
@@ -2460,7 +2474,7 @@ void luaLdbLineHook(lua_State *lua, lua_Debug *ar) {
     }
 
     if (ldb.step || bp) {
-        char *reason = "step over";
+        const char *reason = "step over";
         if (bp) reason = ldb.luabp ? "redis.breakpoint() called" :
                                      "break point";
         else if (timeout) reason = "timeout reached, infinite loop?";
diff --git a/src/sds.h b/src/sds.h
index e4090d23a..1985fb263 100644
--- a/src/sds.h
+++ b/src/sds.h
@@ -41,9 +41,7 @@ extern const char *SDS_NOINIT;
 #include <stdint.h>
 
 #ifdef __cplusplus
-#define ZERO_LENGTH_ARRAY_LENGTH 1
-#else
-#define ZERO_LENGTH_ARRAY_LENGTH
+extern "C" {
 #endif
 
 typedef char *sds;
@@ -52,31 +50,41 @@ typedef char *sds;
  * However is here to document the layout of type 5 SDS strings. */
 struct __attribute__ ((__packed__)) sdshdr5 {
     unsigned char flags; /* 3 lsb of type, and 5 msb of string length */
-    char buf[ZERO_LENGTH_ARRAY_LENGTH];
+#ifndef __cplusplus /* C only: C++ builds get no buf member at all */
+    char buf[]; /* flexible array member */
+#endif /* !__cplusplus */
 };
 struct __attribute__ ((__packed__)) sdshdr8 {
     uint8_t len; /* used */
     uint8_t alloc; /* excluding the header and null terminator */
     unsigned char flags; /* 3 lsb of type, 5 unused bits */
-    char buf[ZERO_LENGTH_ARRAY_LENGTH];
+#ifndef __cplusplus /* C only: C++ builds get no buf member at all */
+    char buf[]; /* flexible array member */
+#endif /* !__cplusplus */
 };
 struct __attribute__ ((__packed__)) sdshdr16 {
     uint16_t len; /* used */
     uint16_t alloc; /* excluding the header and null terminator */
     unsigned char flags; /* 3 lsb of type, 5 unused bits */
-    char buf[ZERO_LENGTH_ARRAY_LENGTH];
+#ifndef __cplusplus /* C only: C++ builds get no buf member at all */
+    char buf[]; /* flexible array member */
+#endif /* !__cplusplus */
 };
 struct __attribute__ ((__packed__)) sdshdr32 {
     uint32_t len; /* used */
     uint32_t alloc; /* excluding the header and null terminator */
     unsigned char flags; /* 3 lsb of type, 5 unused bits */
-    char buf[ZERO_LENGTH_ARRAY_LENGTH];
+#ifndef __cplusplus /* C only: C++ builds get no buf member at all */
+    char buf[]; /* flexible array member */
+#endif /* !__cplusplus */
 };
 struct __attribute__ ((__packed__)) sdshdr64 {
     uint64_t len; /* used */
     uint64_t alloc; /* excluding the header and null terminator */
     unsigned char flags; /* 3 lsb of type, 5 unused bits */
-    char buf[ZERO_LENGTH_ARRAY_LENGTH];
+#ifndef __cplusplus /* C only: C++ builds get no buf member at all */
+    char buf[]; /* flexible array member */
+#endif /* !__cplusplus */
 };
 
 #define SDS_TYPE_5  0
@@ -284,4 +292,8 @@ void sds_free(void *ptr);
 int sdsTest(int argc, char *argv[]);
 #endif
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/sentinel.c b/src/sentinel.c
index 6fa3cf84f..c3c905058 100644
--- a/src/sentinel.c
+++ b/src/sentinel.c
@@ -2013,7 +2013,7 @@ void sentinelReconnectInstance(sentinelRedisInstance *ri) {
             link->pending_commands = 0;
             link->cc_conn_time = mstime();
             link->cc->data = link;
-            redisAeAttach(server.el,link->cc);
+            redisAeAttach(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,link->cc);
             redisAsyncSetConnectCallback(link->cc,
                     sentinelLinkEstablishedCallback);
             redisAsyncSetDisconnectCallback(link->cc,
@@ -2037,7 +2037,7 @@ void sentinelReconnectInstance(sentinelRedisInstance *ri) {
 
             link->pc_conn_time = mstime();
             link->pc->data = link;
-            redisAeAttach(server.el,link->pc);
+            redisAeAttach(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,link->pc);
             redisAsyncSetConnectCallback(link->pc,
                     sentinelLinkEstablishedCallback);
             redisAsyncSetDisconnectCallback(link->pc,
@@ -3976,6 +3976,7 @@ int sentinelSendSlaveOf(sentinelRedisInstance *ri, char *host, int port) {
 
 /* Setup the master state to start a failover. */
 void sentinelStartFailover(sentinelRedisInstance *master) {
+    serverAssert(aeThreadOwnsLock());
     serverAssert(master->flags & SRI_MASTER);
 
     master->failover_state = SENTINEL_FAILOVER_STATE_WAIT_START;
@@ -4168,6 +4169,7 @@ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) {
 }
 
 void sentinelFailoverSelectSlave(sentinelRedisInstance *ri) {
+    serverAssert(aeThreadOwnsLock());
     sentinelRedisInstance *slave = sentinelSelectSlave(ri);
 
     /* We don't handle the timeout in this state as the function aborts
@@ -4292,6 +4294,7 @@ void sentinelFailoverReconfNextSlave(sentinelRedisInstance *master) {
     dictIterator *di;
     dictEntry *de;
     int in_progress = 0;
+    serverAssert(aeThreadOwnsLock());
 
     di = dictGetIterator(master->slaves);
     while((de = dictNext(di)) != NULL) {
diff --git a/src/server.c b/src/server.c
index 4794571c8..32b7069dd 100644
--- a/src/server.c
+++ b/src/server.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2009-2016, Salvatore Sanfilippo <antirez at gmail dot com>
+ * Copyright (c) 2019 John Sully <john at eqalpha dot com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -71,6 +72,7 @@ double R_Zero, R_PosInf, R_NegInf, R_Nan;
 
 /* Global vars */
 struct redisServer server; /* Server global state */
+__thread struct redisServerThreadVars *serverTL = NULL;   // thread local server vars
 volatile unsigned long lru_clock; /* Server global current LRU time. */
 
 /* Our command table.
@@ -659,7 +661,7 @@ struct redisCommand redisCommandTable[] = {
      0,NULL,0,0,0,0,0,0},
 
     {"lastsave",lastsaveCommand,1,
-     "read-only random fast @admin",
+     "read-only random fast @admin @dangerous",
      0,NULL,0,0,0,0,0,0},
 
     {"type",typeCommand,2,
@@ -1525,6 +1527,7 @@ int clientsCronHandleTimeout(client *c, mstime_t now_ms) {
  *
  * The function always returns 0 as it never terminates the client. */
 int clientsCronResizeQueryBuffer(client *c) {
+    AssertCorrectThread(c);
     size_t querybuf_size = sdsAllocSize(c->querybuf);
     time_t idletime = server.unixtime - c->lastinteraction;
 
@@ -1635,7 +1638,7 @@ void getExpansiveClientsInfo(size_t *in_usage, size_t *out_usage) {
  * of clients per second, turning this function into a source of latency.
  */
 #define CLIENTS_CRON_MIN_ITERATIONS 5
-void clientsCron(void) {
+void clientsCron(int iel) {
     /* Try to process at least numclients/server.hz of clients
      * per call. Since normally (if there are no big latency events) this
      * function is called server.hz times per second, in the average case we
@@ -1661,12 +1664,18 @@ void clientsCron(void) {
         listRotate(server.clients);
         head = listFirst(server.clients);
         c = listNodeValue(head);
-        /* The following functions do different service checks on the client.
-         * The protocol is that they return non-zero if the client was
-         * terminated. */
-        if (clientsCronHandleTimeout(c,now)) continue;
-        if (clientsCronResizeQueryBuffer(c)) continue;
-        if (clientsCronTrackExpansiveClients(c)) continue;
+        if (c->iel == iel)
+        {
+            fastlock_lock(&c->lock);
+            /* The following functions run different service checks on the client and
+            * return non-zero if the client was terminated. NOTE(review): LContinue still
+            * unlocks c->lock after such a termination; confirm c is not already freed. */
+            if (clientsCronHandleTimeout(c,now)) goto LContinue;
+            if (clientsCronResizeQueryBuffer(c)) goto LContinue;
+            if (clientsCronTrackExpansiveClients(c)) goto LContinue;
+        LContinue:
+            fastlock_unlock(&c->lock);
+        }        
     }
 }
 
@@ -1768,6 +1777,8 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
     UNUSED(id);
     UNUSED(clientData);
 
+    ProcessPendingAsyncWrites();    // This is really a bug, but for now catch any laggards that didn't clean up
+        
     /* Software watchdog: deliver the SIGALRM that will reach the signal
      * handler if we don't return here fast enough. */
     if (server.watchdog_period) watchdogScheduleSignal(server.watchdog_period);
@@ -1879,7 +1890,7 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
     }
 
     /* We need to do a few operations on clients asynchronously. */
-    clientsCron();
+    clientsCron(IDX_EVENT_LOOP_MAIN);
 
     /* Handle background operations on Redis databases. */
     databasesCron();
@@ -1984,7 +1995,7 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
     }
 
     /* Close clients that need to be closed asynchronous */
-    freeClientsInAsyncFreeQueue();
+    freeClientsInAsyncFreeQueue(IDX_EVENT_LOOP_MAIN);
 
     /* Clear the paused clients flag if needed. */
     clientsArePaused(); /* Don't check return value, just use the side effect.*/
@@ -2028,6 +2039,25 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
     return 1000/server.hz;
 }
 
+/* serverCron counterpart for worker threads, i.e. any event loop other than the main one. */
+int serverCronLite(struct aeEventLoop *eventLoop, long long id, void *clientData)
+{
+    UNUSED(id);
+    UNUSED(clientData);
+
+    const int ielThis = ielFromEventLoop(eventLoop);
+    serverAssert(ielThis != IDX_EVENT_LOOP_MAIN);
+
+    aeAcquireLock();
+    /* Known wart, kept for now: events should clean up their own async writes. */
+    ProcessPendingAsyncWrites();
+    clientsCron(ielThis);
+    freeClientsInAsyncFreeQueue(ielThis);
+    aeReleaseLock();
+
+    return 1000/server.hz;
+}
+
 /* This function gets called every time Redis is entering the
  * main loop of the event driven library, that is, before to sleep
  * for ready file descriptors. */
@@ -2070,14 +2100,16 @@ void beforeSleep(struct aeEventLoop *eventLoop) {
     moduleHandleBlockedClients();
 
     /* Try to process pending commands for clients that were just unblocked. */
-    if (listLength(server.unblocked_clients))
-        processUnblockedClients();
+    if (listLength(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].unblocked_clients))
+    {
+        processUnblockedClients(IDX_EVENT_LOOP_MAIN);
+    }
 
     /* Write the AOF buffer on disk */
     flushAppendOnlyFile(0);
 
     /* Handle writes with pending output buffers. */
-    handleClientsWithPendingWrites();
+    handleClientsWithPendingWrites(IDX_EVENT_LOOP_MAIN);
 
     /* Before we are going to sleep, let the threads access the dataset by
      * releasing the GIL. Redis main thread will not touch anything at this
@@ -2085,6 +2117,21 @@ void beforeSleep(struct aeEventLoop *eventLoop) {
     if (moduleCount()) moduleReleaseGIL();
 }
 
+void beforeSleepLite(struct aeEventLoop *eventLoop)
+{
+    /* Trimmed-down beforeSleep(): only the unblocked-client and pending-write steps. */
+    const int ielThis = ielFromEventLoop(eventLoop);
+
+    aeAcquireLock();
+    /* Clients that were just unblocked may have commands waiting to run. */
+    if (listLength(server.rgthreadvar[ielThis].unblocked_clients) != 0)
+        processUnblockedClients(ielThis);
+
+    /* Handle writes with pending output buffers. */
+    handleClientsWithPendingWrites(ielThis);
+    aeReleaseLock();
+}
+
 /* This function is called immadiately after the event loop multiplexing
  * API returned, and the control is going to soon return to Redis by invoking
  * the different events callbacks. */
@@ -2221,7 +2268,6 @@ void initServerConfig(void) {
     server.bindaddr_count = 0;
     server.unixsocket = NULL;
     server.unixsocketperm = CONFIG_DEFAULT_UNIX_SOCKET_PERM;
-    server.ipfd_count = 0;
     server.sofd = -1;
     server.protected_mode = CONFIG_DEFAULT_PROTECTED_MODE;
     server.dbnum = CONFIG_DEFAULT_DBNUM;
@@ -2409,6 +2455,9 @@ void initServerConfig(void) {
      * script to the slave / AOF. This is the new way starting from
      * Redis 5. However it is possible to revert it via redis.conf. */
     server.lua_always_replicate_commands = 1;
+
+    /* Multithreading */
+    server.cthreads = CONFIG_DEFAULT_THREADS;
 }
 
 extern char **environ;
@@ -2595,7 +2644,7 @@ void checkTcpBacklogSettings(void) {
  * impossible to bind, or no bind addresses were specified in the server
  * configuration but the function is not able to bind * for at least
  * one of the IPv4 or IPv6 protocols. */
-int listenToPort(int port, int *fds, int *count) {
+int listenToPort(int port, int *fds, int *count, int fReusePort) {
     int j;
 
     /* Force binding of 0.0.0.0 if no bind address is specified, always
@@ -2607,7 +2656,7 @@ int listenToPort(int port, int *fds, int *count) {
             /* Bind * for both IPv6 and IPv4, we enter here only if
              * server.bindaddr_count == 0. */
             fds[*count] = anetTcp6Server(server.neterr,port,NULL,
-                server.tcp_backlog);
+                server.tcp_backlog, fReusePort);
             if (fds[*count] != ANET_ERR) {
                 anetNonBlock(NULL,fds[*count]);
                 (*count)++;
@@ -2619,7 +2668,7 @@ int listenToPort(int port, int *fds, int *count) {
             if (*count == 1 || unsupported) {
                 /* Bind the IPv4 address as well. */
                 fds[*count] = anetTcpServer(server.neterr,port,NULL,
-                    server.tcp_backlog);
+                    server.tcp_backlog, fReusePort);
                 if (fds[*count] != ANET_ERR) {
                     anetNonBlock(NULL,fds[*count]);
                     (*count)++;
@@ -2635,11 +2684,11 @@ int listenToPort(int port, int *fds, int *count) {
         } else if (strchr(server.bindaddr[j],':')) {
             /* Bind IPv6 address. */
             fds[*count] = anetTcp6Server(server.neterr,port,server.bindaddr[j],
-                server.tcp_backlog);
+                server.tcp_backlog, fReusePort);
         } else {
             /* Bind IPv4 address. */
             fds[*count] = anetTcpServer(server.neterr,port,server.bindaddr[j],
-                server.tcp_backlog);
+                server.tcp_backlog, fReusePort);
         }
         if (fds[*count] == ANET_ERR) {
             serverLog(LL_WARNING,
@@ -2695,50 +2744,38 @@ void resetServerStats(void) {
     server.aof_delayed_fsync = 0;
 }
 
-void initServer(void) {
-    int j;
-
-    signal(SIGHUP, SIG_IGN);
-    signal(SIGPIPE, SIG_IGN);
-    setupSignalHandlers();
-
-    if (server.syslog_enabled) {
-        openlog(server.syslog_ident, LOG_PID | LOG_NDELAY | LOG_NOWAIT,
-            server.syslog_facility);
-    }
-
-    server.hz = server.config_hz;
-    server.pid = getpid();
-    server.current_client = NULL;
-    server.clients = listCreate();
-    server.clients_index = raxNew();
-    server.clients_to_close = listCreate();
-    server.slaves = listCreate();
-    server.monitors = listCreate();
-    server.clients_pending_write = listCreate();
-    server.slaveseldb = -1; /* Force to emit the first SELECT command. */
-    server.unblocked_clients = listCreate();
-    server.ready_keys = listCreate();
-    server.clients_waiting_acks = listCreate();
-    server.get_ack_from_slaves = 0;
-    server.clients_paused = 0;
-    server.system_memory_size = zmalloc_get_memory_size();
-
-    createSharedObjects();
-    adjustOpenFilesLimit();
-    server.el = aeCreateEventLoop(server.maxclients+CONFIG_FDSET_INCR);
-    if (server.el == NULL) {
-        serverLog(LL_WARNING,
-            "Failed creating the event loop. Error message: '%s'",
-            strerror(errno));
-        exit(1);
-    }
-    server.db = zmalloc(sizeof(redisDb)*server.dbnum, MALLOC_LOCAL);
-
+static void initNetworkingThread(int iel, int fReusePort)
+{
     /* Open the TCP listening socket for the user commands. */
-    if (server.port != 0 &&
-        listenToPort(server.port,server.ipfd,&server.ipfd_count) == C_ERR)
-        exit(1);
+    if (fReusePort || (iel == IDX_EVENT_LOOP_MAIN))
+    {
+        if (server.port != 0 &&
+            listenToPort(server.port,server.rgthreadvar[iel].ipfd,&server.rgthreadvar[iel].ipfd_count, fReusePort) == C_ERR)
+            exit(1);
+    }
+    else
+    {
+        // We use the main thread's file descriptors
+        memcpy(server.rgthreadvar[iel].ipfd, server.rgthreadvar[IDX_EVENT_LOOP_MAIN].ipfd, sizeof(int)*CONFIG_BINDADDR_MAX);
+        server.rgthreadvar[iel].ipfd_count = server.rgthreadvar[IDX_EVENT_LOOP_MAIN].ipfd_count;
+    }
+
+    /* Create an event handler for accepting new connections in TCP */
+    for (int j = 0; j < server.rgthreadvar[iel].ipfd_count; j++) {
+        if (aeCreateFileEvent(server.rgthreadvar[iel].el, server.rgthreadvar[iel].ipfd[j], AE_READABLE|AE_READ_THREADSAFE,
+            acceptTcpHandler,NULL) == AE_ERR)
+            {
+                serverPanic(
+                    "Unrecoverable error creating server.ipfd file event.");
+            }
+    }
+}
+
+static void initNetworking(int fReusePort)
+{
+    int celListen = (fReusePort) ? server.cthreads : 1;
+    for (int iel = 0; iel < celListen; ++iel)
+        initNetworkingThread(iel, fReusePort);
 
     /* Open the listening Unix domain socket. */
     if (server.unixsocket != NULL) {
@@ -2753,13 +2790,72 @@ void initServer(void) {
     }
 
     /* Abort if there are no listening sockets at all. */
-    if (server.ipfd_count == 0 && server.sofd < 0) {
+    if (server.rgthreadvar[IDX_EVENT_LOOP_MAIN].ipfd_count == 0 && server.sofd < 0) {
         serverLog(LL_WARNING, "Configured to not listen anywhere, exiting.");
         exit(1);
     }
 
+    if (server.sofd > 0 && aeCreateFileEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el,server.sofd,AE_READABLE|AE_READ_THREADSAFE,
+        acceptUnixHandler,NULL) == AE_ERR) serverPanic("Unrecoverable error creating server.sofd file event.");
+}
+
+static void initServerThread(struct redisServerThreadVars *pvar, int fMain)
+{   /* Initialise one thread's lists, ipfd_count and event loop; pvar is the slot to fill. */
+    pvar->clients_pending_write = listCreate();
+    pvar->unblocked_clients = listCreate();
+    pvar->clients_pending_asyncwrite = listCreate();
+    pvar->ipfd_count = 0;
+    pvar->el = aeCreateEventLoop(server.maxclients+CONFIG_FDSET_INCR);
+    if (pvar->el == NULL) {
+        serverLog(LL_WARNING,
+            "Failed creating the event loop. Error message: '%s'",
+            strerror(errno));
+        exit(1);
+    }
+    /* Only loops with fMain == 0 get the serverCronLite timer registered here. */
+    if (!fMain)
+    {
+        if (aeCreateTimeEvent(pvar->el, 1, serverCronLite, NULL, NULL) == AE_ERR) {
+            serverPanic("Can't create event loop timers.");
+            exit(1); /* NOTE(review): presumably unreachable if serverPanic() never returns - confirm */
+        }
+    }
+}
+
+void initServer(void) {
+    signal(SIGHUP, SIG_IGN);
+    signal(SIGPIPE, SIG_IGN);
+    setupSignalHandlers();
+
+    fastlock_init(&server.flock);
+
+    if (server.syslog_enabled) {
+        openlog(server.syslog_ident, LOG_PID | LOG_NDELAY | LOG_NOWAIT,
+            server.syslog_facility);
+    }
+
+    server.hz = server.config_hz;
+    server.pid = getpid();
+    server.current_client = NULL;
+    server.clients = listCreate();
+    server.clients_index = raxNew();
+    server.clients_to_close = listCreate();
+    server.slaves = listCreate();
+    server.monitors = listCreate();
+    server.slaveseldb = -1; /* Force to emit the first SELECT command. */
+    server.ready_keys = listCreate();
+    server.clients_waiting_acks = listCreate();
+    server.get_ack_from_slaves = 0;
+    server.clients_paused = 0;
+    server.system_memory_size = zmalloc_get_memory_size();
+
+    createSharedObjects();
+    adjustOpenFilesLimit();
+
+    server.db = zmalloc(sizeof(redisDb)*server.dbnum, MALLOC_LOCAL);
+
     /* Create the Redis databases, and initialize other internal state. */
-    for (j = 0; j < server.dbnum; j++) {
+    for (int j = 0; j < server.dbnum; j++) {
         server.db[j].pdict = dictCreate(&dbDictType,NULL);
         server.db[j].expires = dictCreate(&keyptrDictType,NULL);
         server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
@@ -2808,28 +2904,24 @@ void initServer(void) {
     /* Create the timer callback, this is our way to process many background
      * operations incrementally, like clients timeout, eviction of unaccessed
      * expired keys and so forth. */
-    if (aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL) == AE_ERR) {
+    if (aeCreateTimeEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el, 1, serverCron, NULL, NULL) == AE_ERR) {
         serverPanic("Can't create event loop timers.");
         exit(1);
     }
 
-    /* Create an event handler for accepting new connections in TCP and Unix
-     * domain sockets. */
-    for (j = 0; j < server.ipfd_count; j++) {
-        if (aeCreateFileEvent(server.el, server.ipfd[j], AE_READABLE,
-            acceptTcpHandler,NULL) == AE_ERR)
-            {
-                serverPanic(
-                    "Unrecoverable error creating server.ipfd file event.");
-            }
+    /* Register a readable event for the pipe used to awake the event loop
+     * when a blocked client in a module needs attention. */
+    if (aeCreateFileEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el, server.module_blocked_pipe[0], AE_READABLE,
+        moduleBlockedClientPipeReadable,NULL) == AE_ERR) {
+            serverPanic(
+                "Error registering the readable event for the module "
+                "blocked clients subsystem.");
     }
-    if (server.sofd > 0 && aeCreateFileEvent(server.el,server.sofd,AE_READABLE,
-        acceptUnixHandler,NULL) == AE_ERR) serverPanic("Unrecoverable error creating server.sofd file event.");
 
 
     /* Register a readable event for the pipe used to awake the event loop
      * when a blocked client in a module needs attention. */
-    if (aeCreateFileEvent(server.el, server.module_blocked_pipe[0], AE_READABLE,
+    if (aeCreateFileEvent(server.rgthreadvar[IDX_EVENT_LOOP_MAIN].el, server.module_blocked_pipe[0], AE_READABLE,
         moduleBlockedClientPipeReadable,NULL) == AE_ERR) {
             serverPanic(
                 "Error registering the readable event for the module "
@@ -2917,10 +3009,10 @@ int populateCommandTableParseFlags(struct redisCommand *c, char *strflags) {
                 return C_ERR;
             }
         }
-
-        /* If it's not @fast is @slow in this binary world. */
-        if (!(c->flags & CMD_CATEGORY_FAST)) c->flags |= CMD_CATEGORY_SLOW;
     }
+    /* If it's not @fast is @slow in this binary world. */
+    if (!(c->flags & CMD_CATEGORY_FAST)) c->flags |= CMD_CATEGORY_SLOW;
+
     sdsfreesplitres(argv,argc);
     return C_OK;
 }
@@ -3044,6 +3136,7 @@ struct redisCommand *lookupCommandOrOriginal(sds name) {
 void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc,
                int flags)
 {
+    serverAssert(aeThreadOwnsLock());
     if (server.aof_state != AOF_OFF && flags & PROPAGATE_AOF)
         feedAppendOnlyFile(cmd,dbid,argv,argc);
     if (flags & PROPAGATE_REPL)
@@ -3144,6 +3237,7 @@ void call(client *c, int flags) {
     long long dirty, start, duration;
     int client_old_flags = c->flags;
     struct redisCommand *real_cmd = c->cmd;
+    serverAssert(aeThreadOwnsLock());
 
     /* Sent the command to clients in MONITOR mode, only if the commands are
      * not generated from reading an AOF. */
@@ -3257,6 +3351,9 @@ void call(client *c, int flags) {
         }
         redisOpArrayFree(&server.also_propagate);
     }
+
+    ProcessPendingAsyncWrites();
+    
     server.also_propagate = prev_also_propagate;
     server.stat_numcommands++;
 }
@@ -3270,6 +3367,7 @@ void call(client *c, int flags) {
  * other operations can be performed by the caller. Otherwise
  * if C_ERR is returned the client was destroyed (i.e. after QUIT). */
 int processCommand(client *c) {
+    serverAssert(aeThreadOwnsLock());
     /* The QUIT command is handled separately. Normal command procs will
      * go through checking for replication and QUIT will cause trouble
      * when FORCE_REPLICATION is enabled and would be implemented in
@@ -3280,6 +3378,9 @@ int processCommand(client *c) {
         return C_ERR;
     }
 
+    AssertCorrectThread(c);
+    serverAssert(aeThreadOwnsLock());
+
     /* Now lookup the command and check ASAP about trivial error conditions
      * such as wrong arity, bad command name and so forth. */
     c->cmd = c->lastcmd = lookupCommand(ptrFromObj(c->argv[0]));
@@ -3301,14 +3402,17 @@ int processCommand(client *c) {
         return C_OK;
     }
 
-    /* Check if the user is authenticated */
-    if (!(DefaultUser->flags & USER_FLAG_NOPASS) &&
-        !c->authenticated &&
-        (c->cmd->proc != authCommand || c->cmd->proc == helloCommand))
-    {
-        flagTransaction(c);
-        addReply(c,shared.noautherr);
-        return C_OK;
+    /* Check if the user is authenticated. This check is skipped in case
+     * the default user is flagged as "nopass" and is active. */
+    int auth_required = !(DefaultUser->flags & USER_FLAG_NOPASS) &&
+                        !c->authenticated;
+    if (auth_required || DefaultUser->flags & USER_FLAG_DISABLED) {
+        /* AUTH and HELLO are valid even in non authenticated state. */
+        if (c->cmd->proc != authCommand && c->cmd->proc != helloCommand) {
+            flagTransaction(c);
+            addReply(c,shared.noautherr);
+            return C_OK;
+        }
     }
 
     /* Check if the user can run this command according to the current
@@ -3490,7 +3594,11 @@ int processCommand(client *c) {
 void closeListeningSockets(int unlink_unix_socket) {
     int j;
 
-    for (j = 0; j < server.ipfd_count; j++) close(server.ipfd[j]);
+    for (int iel = 0; iel < server.cthreads; ++iel)
+    {
+        for (j = 0; j < server.rgthreadvar[iel].ipfd_count; j++) 
+            close(server.rgthreadvar[iel].ipfd[j]);
+    }
     if (server.sofd != -1) close(server.sofd);
     if (server.cluster_enabled)
         for (j = 0; j < server.cfd_count; j++) close(server.cfd[j]);
@@ -3567,7 +3675,7 @@ int prepareForShutdown(int flags) {
     /* Close the listening sockets. Apparently this allows faster restarts. */
     closeListeningSockets(1);
     serverLog(LL_WARNING,"%s is now ready to exit, bye bye...",
-        server.sentinel_mode ? "Sentinel" : "Redis");
+        server.sentinel_mode ? "Sentinel" : "KeyDB");
     return C_OK;
 }
 
@@ -3698,8 +3806,8 @@ void addReplyCommand(client *c, struct redisCommand *cmd) {
     if (!cmd) {
         addReplyNull(c);
     } else {
-        /* We are adding: command name, arg count, flags, first, last, offset */
-        addReplyArrayLen(c, 6);
+        /* We are adding: command name, arg count, flags, first, last, offset, categories */
+        addReplyArrayLen(c, 7);
         addReplyBulkCString(c, cmd->name);
         addReplyLongLong(c, cmd->arity);
 
@@ -3729,6 +3837,8 @@ void addReplyCommand(client *c, struct redisCommand *cmd) {
         addReplyLongLong(c, cmd->firstkey);
         addReplyLongLong(c, cmd->lastkey);
         addReplyLongLong(c, cmd->keystep);
+
+        addReplyCommandCategories(c,cmd);
     }
 }
 
@@ -3953,6 +4063,7 @@ sds genRedisInfoString(char *section) {
         bytesToHuman(maxmemory_hmem,server.maxmemory);
 
         if (sections++) info = sdscat(info,"\r\n");
+        serverLog(LL_WARNING, "OOM max sent used_memory: %zu", zmalloc_used);
         info = sdscatprintf(info,
             "# Memory\r\n"
             "used_memory:%zu\r\n"
@@ -4387,10 +4498,12 @@ void infoCommand(client *c) {
         return;
     }
     addReplyBulkSds(c, genRedisInfoString(section));
+    serverLog(LL_WARNING, "OOM max info command %zu", zmalloc_used_memory());
 }
 
 void monitorCommand(client *c) {
     /* ignore MONITOR if already slave or in monitor mode */
+    serverAssert(aeThreadOwnsLock());
     if (c->flags & CLIENT_SLAVE) return;
 
     c->flags |= (CLIENT_SLAVE|CLIENT_MONITOR);
@@ -4420,7 +4533,7 @@ void linuxMemoryWarnings(void) {
         serverLog(LL_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
     }
     if (THPIsEnabled()) {
-        serverLog(LL_WARNING,"WARNING you have Transparent Huge Pages (THP) support enabled in your kernel. This will create latency and memory usage issues with Redis. To fix this issue run the command 'echo never > /sys/kernel/mm/transparent_hugepage/enabled' as root, and add it to your /etc/rc.local in order to retain the setting after a reboot. Redis must be restarted after THP is disabled.");
+        serverLog(LL_WARNING,"WARNING you have Transparent Huge Pages (THP) support enabled in your kernel. This will create latency and memory usage issues with KeyDB. To fix this issue run the command 'echo never > /sys/kernel/mm/transparent_hugepage/enabled' as root, and add it to your /etc/rc.local in order to retain the setting after a reboot. KeyDB must be restarted after THP is disabled.");
     }
 }
 #endif /* __linux__ */
@@ -4738,12 +4851,28 @@ int redisIsSupervised(int mode) {
     return 0;
 }
 
+void *workerThreadMain(void *parg)
+{   /* Thread entry point: parg carries the event loop index by value; runs that loop until aeMain() returns. */
+    int iel = (int)((int64_t)parg);
+    serverLog(LL_NOTICE, "Thread %d alive.", iel); /* serverLog() takes LL_* levels, not syslog's LOG_INFO */
+    serverTL = server.rgthreadvar+iel;  // set the TLS threadsafe global
+
+    int isMainThread = (iel == IDX_EVENT_LOOP_MAIN);
+    aeEventLoop *el = server.rgthreadvar[iel].el;
+    aeSetBeforeSleepProc(el, isMainThread ? beforeSleep : beforeSleepLite, isMainThread ? 0 : AE_SLEEP_THREADSAFE);
+    aeSetAfterSleepProc(el, isMainThread ? afterSleep : NULL, 0);
+    aeMain(el);
+    aeDeleteEventLoop(el);
+    return NULL;
+}
 
 int main(int argc, char **argv) {
     struct timeval tv;
     int j;
 
+#ifdef USE_MEMKIND
     storage_init(NULL, 0);
+#endif
 
 #ifdef REDIS_TEST
     if (argc == 3 && !strcasecmp(argv[1], "test")) {
@@ -4788,6 +4917,13 @@ int main(int argc, char **argv) {
     dictSetHashFunctionSeed((uint8_t*)hashseed);
     server.sentinel_mode = checkForSentinelMode(argc,argv);
     initServerConfig();
+    for (int iel = 0; iel < MAX_EVENT_LOOPS; ++iel)
+    {
+        initServerThread(server.rgthreadvar+iel, iel == IDX_EVENT_LOOP_MAIN);
+    }
+    serverTL = &server.rgthreadvar[IDX_EVENT_LOOP_MAIN];
+    aeAcquireLock();    // We own the lock on boot
+
     ACLInit(); /* The ACL subsystem must be initialized ASAP because the
                   basic networking code and client creation depends on it. */
     moduleInitModulesSystem();
@@ -4881,9 +5017,9 @@ int main(int argc, char **argv) {
         sdsfree(options);
     }
 
-    serverLog(LL_WARNING, "oO0OoO0OoO0Oo Redis is starting oO0OoO0OoO0Oo");
+    serverLog(LL_WARNING, "oO0OoO0OoO0Oo KeyDB is starting oO0OoO0OoO0Oo");
     serverLog(LL_WARNING,
-        "Redis version=%s, bits=%d, commit=%s, modified=%d, pid=%d, just started",
+        "KeyDB version=%s, bits=%d, commit=%s, modified=%d, pid=%d, just started",
             REDIS_VERSION,
             (sizeof(long) == 8) ? 64 : 32,
             redisGitSHA1(),
@@ -4901,6 +5037,8 @@ int main(int argc, char **argv) {
     if (background) daemonize();
 
     initServer();
+    initNetworking(server.cthreads > 1 /* fReusePort */);
+
     if (background || server.pidfile) createPidFile();
     redisSetProcTitle(argv[0]);
     redisAsciiArt();
@@ -4913,11 +5051,7 @@ int main(int argc, char **argv) {
         linuxMemoryWarnings();
     #endif
         moduleLoadFromQueue();
-        if (ACLLoadConfiguredUsers() == C_ERR) {
-            serverLog(LL_WARNING,
-                "Critical error while loading ACLs. Exiting.");
-            exit(1);
-        }
+        ACLLoadUsersAtStartup();
         loadDataFromDisk();
         if (server.cluster_enabled) {
             if (verifyClusterConfigWithData() == C_ERR) {
@@ -4927,7 +5061,7 @@ int main(int argc, char **argv) {
                 exit(1);
             }
         }
-        if (server.ipfd_count > 0)
+        if (server.rgthreadvar[IDX_EVENT_LOOP_MAIN].ipfd_count > 0)
             serverLog(LL_NOTICE,"Ready to accept connections");
         if (server.sofd > 0)
             serverLog(LL_NOTICE,"The server is now ready to accept connections at %s", server.unixsocket);
@@ -4935,15 +5069,24 @@ int main(int argc, char **argv) {
         sentinelIsRunning();
     }
 
+    if (server.cthreads > 4) {
+        serverLog(LL_WARNING, "Warning: server-threads is set to %d.  This is above the maximum recommend value of 4, please ensure you've verified this is actually faster on your machine.", server.cthreads);
+    }
+
     /* Warning the user about suspicious maxmemory setting. */
     if (server.maxmemory > 0 && server.maxmemory < 1024*1024) {
         serverLog(LL_WARNING,"WARNING: You specified a maxmemory value that is less than 1MB (current value is %llu bytes). Are you sure this is what you really want?", server.maxmemory);
     }
 
-    aeSetBeforeSleepProc(server.el,beforeSleep);
-    aeSetAfterSleepProc(server.el,afterSleep);
-    aeMain(server.el);
-    aeDeleteEventLoop(server.el);
+    aeReleaseLock();    //Finally we can dump the lock
+
+    serverAssert(server.cthreads > 0 && server.cthreads <= MAX_EVENT_LOOPS);
+    pthread_t rgthread[MAX_EVENT_LOOPS];
+    for (int iel = 1; iel < server.cthreads; ++iel)
+    {
+        pthread_create(rgthread + iel, NULL, workerThreadMain, (void*)((int64_t)iel));
+    }
+    workerThreadMain((void*)((int64_t)IDX_EVENT_LOOP_MAIN));
     return 0;
 }
 
diff --git a/src/server.h b/src/server.h
index 7d1f34e40..b8e3fa4e7 100644
--- a/src/server.h
+++ b/src/server.h
@@ -30,6 +30,9 @@
 #ifndef __REDIS_H
 #define __REDIS_H
 
+#define TRUE 1
+#define FALSE 0
+
 #include "fmacros.h"
 #include "config.h"
 #include "solarisfixes.h"
@@ -46,11 +49,18 @@
 #include <pthread.h>
 #include <syslog.h>
 #include <netinet/in.h>
+#ifdef __cplusplus
+extern "C" {
 #include <lua.h>
+}
+#else
+#include <lua.h>
+#endif
 #include <signal.h>
 
 typedef long long mstime_t; /* millisecond time type. */
 
+#include "fastlock.h"
 #include "ae.h"      /* Event driven programming library */
 #include "sds.h"     /* Dynamic safe strings */
 #include "dict.h"    /* Hash tables */
@@ -73,6 +83,10 @@ typedef long long mstime_t; /* millisecond time type. */
 #include "endianconv.h"
 #include "crc64.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* Error codes */
 #define C_OK                    0
 #define C_ERR                   -1
@@ -168,6 +182,8 @@ typedef long long mstime_t; /* millisecond time type. */
 #define CONFIG_DEFAULT_DEFRAG_MAX_SCAN_FIELDS 1000 /* keys with more than 1000 fields will be processed separately */
 #define CONFIG_DEFAULT_PROTO_MAX_BULK_LEN (512ll*1024*1024) /* Bulk request max size */
 
+#define CONFIG_DEFAULT_THREADS 1
+
 #define ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP 20 /* Loopkups per loop. */
 #define ACTIVE_EXPIRE_CYCLE_FAST_DURATION 1000 /* Microseconds */
 #define ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC 25 /* CPU max % for keys collection */
@@ -660,7 +676,14 @@ struct evictionPoolEntry; /* Defined in evict.c */
  * which is actually a linked list of blocks like that, that is: client->reply. */
 typedef struct clientReplyBlock {
     size_t size, used;
-    char buf[ZERO_LENGTH_ARRAY_LENGTH];
+#ifndef __cplusplus
+    char buf[];
+#else
+    __attribute__((always_inline)) char *buf()
+    {
+        return reinterpret_cast<char*>(this+1);
+    }
+#endif
 } clientReplyBlock;
 
 /* Redis database representation. There are multiple databases identified
@@ -810,10 +833,12 @@ typedef struct client {
     unsigned long long reply_bytes; /* Tot bytes of objects in reply list. */
     size_t sentlen;         /* Amount of bytes already sent in the current
                                buffer or object being sent. */
+    size_t sentlenAsync;    /* same as sentlen buf for async buffers (which are a different stream) */
     time_t ctime;           /* Client creation time. */
     time_t lastinteraction; /* Time of the last interaction, used for timeout */
     time_t obuf_soft_limit_reached_time;
     int flags;              /* Client flags: CLIENT_* macros. */
+    int fPendingAsyncWrite; /* NOTE: Not a flag because it is written to outside of the client lock (locked by the global lock instead) */
     int authenticated;      /* Needed when the default user requires auth. */
     int replstate;          /* Replication state if this is a slave. */
     int repl_put_online_on_ack; /* Install slave write handler on ACK. */
@@ -845,6 +870,14 @@ typedef struct client {
     /* Response buffer */
     int bufpos;
     char buf[PROTO_REPLY_CHUNK_BYTES];
+
+    /* Async Response Buffer - other threads write here */
+    int bufposAsync;
+    int buflenAsync;
+    char *bufAsync;
+
+    int iel; /* the event loop index we're registered with */
+    struct fastlock lock;
 } client;
 
 struct saveparam {
@@ -879,10 +912,12 @@ typedef struct zskiplistNode {
     sds ele;
     double score;
     struct zskiplistNode *backward;
+#ifndef __cplusplus
     struct zskiplistLevel {
         struct zskiplistNode *forward;
         unsigned long span;
-    } level[ZERO_LENGTH_ARRAY_LENGTH];
+    } level[];
+#endif
 } zskiplistNode;
 
 typedef struct zskiplist {
@@ -1005,6 +1040,19 @@ struct clusterState;
 #define CHILD_INFO_TYPE_RDB 0
 #define CHILD_INFO_TYPE_AOF 1
 
+#define MAX_EVENT_LOOPS 16
+#define IDX_EVENT_LOOP_MAIN 0
+
+// Per-thread variables that may be accessed without a lock
+struct redisServerThreadVars {
+    aeEventLoop *el;
+    int ipfd[CONFIG_BINDADDR_MAX]; /* TCP socket file descriptors */
+    int ipfd_count;             /* Used slots in ipfd[] */
+    list *clients_pending_write; /* There is to write or install handler. */
+    list *unblocked_clients;     /* list of clients to unblock before next loop NOT THREADSAFE */
+    list *clients_pending_asyncwrite;
+};
+
 struct redisServer {
     /* General */
     pid_t pid;                  /* Main process pid. */
@@ -1019,7 +1067,10 @@ struct redisServer {
     redisDb *db;
     dict *commands;             /* Command table */
     dict *orig_commands;        /* Command table before command renaming. */
-    aeEventLoop *el;
+
+    int cthreads;               /* Number of main worker threads */
+    struct redisServerThreadVars rgthreadvar[MAX_EVENT_LOOPS];
+
     unsigned int lruclock;      /* Clock for LRU eviction */
     int shutdown_asap;          /* SHUTDOWN needed ASAP */
     int activerehashing;        /* Incremental rehash in serverCron() */
@@ -1044,14 +1095,11 @@ struct redisServer {
     int bindaddr_count;         /* Number of addresses in server.bindaddr[] */
     char *unixsocket;           /* UNIX socket path */
     mode_t unixsocketperm;      /* UNIX socket permission */
-    int ipfd[CONFIG_BINDADDR_MAX]; /* TCP socket file descriptors */
-    int ipfd_count;             /* Used slots in ipfd[] */
     int sofd;                   /* Unix socket file descriptor */
     int cfd[CONFIG_BINDADDR_MAX];/* Cluster bus listening socket */
     int cfd_count;              /* Used slots in cfd[] */
     list *clients;              /* List of active clients */
     list *clients_to_close;     /* Clients to close asynchronously */
-    list *clients_pending_write; /* There is to write or install handler. */
     list *slaves, *monitors;    /* List of slaves and MONITORs */
     client *current_client; /* Current client, only used on crash report */
     rax *clients_index;         /* Active clients dictionary by client ID. */
@@ -1162,6 +1210,7 @@ struct redisServer {
     int aof_pipe_read_data_from_parent;
     int aof_pipe_write_ack_to_parent;
     int aof_pipe_read_ack_from_child;
+    aeEventLoop *el_alf_pip_read_ack_from_child;
     int aof_pipe_write_ack_to_child;
     int aof_pipe_read_ack_from_parent;
     int aof_stop_sending_diff;     /* If true stop sending accumulated diffs
@@ -1225,6 +1274,7 @@ struct redisServer {
     int repl_diskless_sync;         /* Send RDB to slaves sockets directly. */
     int repl_diskless_sync_delay;   /* Delay to start a diskless repl BGSAVE. */
     /* Replication (slave) */
+    char *masteruser;               /* AUTH with this user and masterauth with master */
     char *masterauth;               /* AUTH with this password with master */
     char *masterhost;               /* Hostname of master */
     int masterport;                 /* Port of master */
@@ -1272,7 +1322,6 @@ struct redisServer {
     /* Blocked clients */
     unsigned int blocked_clients;   /* # of clients executing a blocking cmd.*/
     unsigned int blocked_clients_by_type[BLOCKED_NUM];
-    list *unblocked_clients; /* list of clients to unblock before next loop */
     list *ready_keys;        /* List of readyList structures for BLPOP & co */
     /* Sort parameters - qsort_r() is only available under BSD so we
      * have to take this state global, in order to pass it to sortCompare() */
@@ -1362,6 +1411,8 @@ struct redisServer {
     pthread_mutex_t lruclock_mutex;
     pthread_mutex_t next_client_id_mutex;
     pthread_mutex_t unixtime_mutex;
+
+    struct fastlock flock;
 };
 
 typedef struct pubsubPattern {
@@ -1456,6 +1507,7 @@ typedef struct {
  *----------------------------------------------------------------------------*/
 
 extern struct redisServer server;
+extern __thread struct redisServerThreadVars *serverTL;   // thread local server vars
 extern struct sharedObjectsStruct shared;
 extern dictType objectKeyPointerValueDictType;
 extern dictType objectKeyHeapPointerValueDictType;
@@ -1504,7 +1556,7 @@ size_t redisPopcount(void *s, long count);
 void redisSetProcTitle(char *title);
 
 /* networking.c -- Networking and Client related operations */
-client *createClient(int fd);
+client *createClient(int fd, int iel);
 void closeTimedoutClients(void);
 void freeClient(client *c);
 void freeClientAsync(client *c);
@@ -1561,18 +1613,18 @@ void rewriteClientCommandVector(client *c, int argc, ...);
 void rewriteClientCommandArgument(client *c, int i, robj *newval);
 void replaceClientCommandVector(client *c, int argc, robj **argv);
 unsigned long getClientOutputBufferMemoryUsage(client *c);
-void freeClientsInAsyncFreeQueue(void);
+void freeClientsInAsyncFreeQueue(int iel);
 void asyncCloseClientOnOutputBufferLimitReached(client *c);
 int getClientType(client *c);
-int getClientTypeByName(char *name);
-char *getClientTypeName(int cclass);
+int getClientTypeByName(const char *name);
+const char *getClientTypeName(int cclass);
 void flushSlavesOutputBuffers(void);
 void disconnectSlaves(void);
-int listenToPort(int port, int *fds, int *count);
+int listenToPort(int port, int *fds, int *count, int fReusePort);
 void pauseClients(mstime_t duration);
 int clientsArePaused(void);
-int processEventsWhileBlocked(void);
-int handleClientsWithPendingWrites(void);
+int processEventsWhileBlocked(int iel);
+int handleClientsWithPendingWrites(int iel);
 int clientHasPendingReplies(client *c);
 void unlinkClient(client *c);
 int writeToClient(int fd, client *c, int handler_installed);
@@ -1580,6 +1632,25 @@ void linkClient(client *c);
 void protectClient(client *c);
 void unprotectClient(client *c);
 
+// Special Thread-safe addReply() commands for posting messages to clients from a different thread
+void addReplyAsync(client *c, robj *obj);
+void addReplyArrayLenAsync(client *c, long length);
+void addReplyProtoAsync(client *c, const char *s, size_t len);
+void addReplyBulkAsync(client *c, robj *obj);
+void addReplyBulkCBufferAsync(client *c, const void *p, size_t len);
+void addReplyErrorAsync(client *c, const char *err);
+void addReplyMapLenAsync(client *c, long length);
+void addReplyNullAsync(client *c);
+void addReplyDoubleAsync(client *c, double d);
+void *addReplyDeferredLenAsync(client *c);
+void setDeferredArrayLenAsync(client *c, void *node, long length);
+void addReplySdsAsync(client *c, sds s);
+void addReplyBulkSdsAsync(client *c, sds s);
+void addReplyPushLenAsync(client *c, long length);
+void addReplyLongLongAsync(client *c, long long ll);
+
+void ProcessPendingAsyncWrites(void);
+
 #ifdef __GNUC__
 void addReplyErrorFormat(client *c, const char *fmt, ...)
     __attribute__((format(printf, 2, 3)));
@@ -1666,7 +1737,7 @@ unsigned long long estimateObjectIdleTime(robj *o);
 #define sdsEncodedObject(objptr) (objptr->encoding == OBJ_ENCODING_RAW || objptr->encoding == OBJ_ENCODING_EMBSTR)
 
 /* Synchronous I/O with timeout */
-ssize_t syncWrite(int fd, char *ptr, ssize_t size, long long timeout);
+ssize_t syncWrite(int fd, const char *ptr, ssize_t size, long long timeout);
 ssize_t syncRead(int fd, char *ptr, ssize_t size, long long timeout);
 ssize_t syncReadLine(int fd, char *ptr, ssize_t size, long long timeout);
 
@@ -1754,6 +1825,8 @@ int ACLAppendUserForLoading(sds *argv, int argc, int *argc_err);
 char *ACLSetUserStringError(void);
 int ACLLoadConfiguredUsers(void);
 sds ACLDescribeUser(user *u);
+void ACLLoadUsersAtStartup(void);
+void addReplyCommandCategories(client *c, struct redisCommand *cmd);
 
 /* Sorted sets data type */
 
@@ -2014,7 +2087,7 @@ int ldbPendingChildren(void);
 sds luaCreateFunction(client *c, lua_State *lua, robj *body);
 
 /* Blocked clients */
-void processUnblockedClients(void);
+void processUnblockedClients(int iel);
 void blockClient(client *c, int btype);
 void unblockClient(client *c);
 void queueClientForReprocessing(client *c);
@@ -2270,9 +2343,31 @@ int memtest_preserving_test(unsigned long *m, size_t bytes, int passes);
 void mixDigest(unsigned char *digest, void *ptr, size_t len);
 void xorDigest(unsigned char *digest, void *ptr, size_t len);
 
+inline int ielFromEventLoop(const aeEventLoop *eventLoop)
+{   /* Returns the index iel for which server.rgthreadvar[iel].el == eventLoop. */
+    int iel = 0;
+    for (; iel < server.cthreads; ++iel)    /* linear scan over the server.cthreads entries */
+    {
+        if (server.rgthreadvar[iel].el == eventLoop)
+            break;
+    }
+    serverAssert(iel < server.cthreads);    /* asserts that a matching entry was found */
+    return iel;
+}
+
+inline int FCorrectThread(client *c)
+{   /* Nonzero when the event loop at index c->iel is the same one as serverTL->el. */
+    return server.rgthreadvar[c->iel].el == serverTL->el;
+}
+#define AssertCorrectThread(c) serverAssert(FCorrectThread(c)) /* serverAssert wrapper around FCorrectThread(c) */
+
 #define redisDebug(fmt, ...) \
     printf("DEBUG %s:%d > " fmt "\n", __FILE__, __LINE__, __VA_ARGS__)
 #define redisDebugMark() \
     printf("-- MARK %s:%d --\n", __FILE__, __LINE__)
 
+#ifdef __cplusplus
+}
 #endif
+
+#endif
\ No newline at end of file
diff --git a/src/sha1.h b/src/sha1.h
index f41691258..e42d4d2d4 100644
--- a/src/sha1.h
+++ b/src/sha1.h
@@ -7,6 +7,10 @@ By Steve Reid <steve@edmweb.com>
 100% Public Domain
 */
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 typedef struct {
     uint32_t state[5];
     uint32_t count[2];
@@ -21,4 +25,9 @@ void SHA1Final(unsigned char digest[20], SHA1_CTX* context);
 #ifdef REDIS_TEST
 int sha1Test(int argc, char **argv);
 #endif
+
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/storage.c b/src/storage.c
index 1987ac4f3..12d7ef3c0 100644
--- a/src/storage.c
+++ b/src/storage.c
@@ -1,5 +1,7 @@
 #include "server.h"
 
+#ifdef USE_MEMKIND
+
 #include <stdlib.h>
 #include <stdio.h>
 #include <memkind.h>
@@ -264,4 +266,6 @@ void handle_postfork_child()
     int fdOriginal = memkind_fd(mkdisk);
     memkind_pmem_remapfd(mkdisk, fdNew);
     close(fdOriginal);
-}
\ No newline at end of file
+}
+
+#endif // USE_MEMKIND
diff --git a/src/syncio.c b/src/syncio.c
index b2843d5fb..c394dbb49 100644
--- a/src/syncio.c
+++ b/src/syncio.c
@@ -46,7 +46,7 @@
  * done within 'timeout' milliseconds the operation succeeds and 'size' is
  * returned. Otherwise the operation fails, -1 is returned, and an unspecified
  * partial write could be performed against the file descriptor. */
-ssize_t syncWrite(int fd, char *ptr, ssize_t size, long long timeout) {
+ssize_t syncWrite(int fd, const char *ptr, ssize_t size, long long timeout) {
     ssize_t nwritten, ret = size;
     long long start = mstime();
     long long remaining = timeout;
diff --git a/src/t_list.c b/src/t_list.c
index e05944a82..c7350887f 100644
--- a/src/t_list.c
+++ b/src/t_list.c
@@ -547,7 +547,7 @@ void lremCommand(client *c) {
  * as well. This command was originally proposed by Ezra Zygmuntowicz.
  */
 
-void rpoplpushHandlePush(client *c, robj *dstkey, robj *dstobj, robj *value) {
+static void rpoplpushHandlePush(client *c, robj *dstkey, robj *dstobj, robj *value) {
     /* Create the list if the key does not exist */
     if (!dstobj) {
         dstobj = createQuicklistObject();
@@ -559,7 +559,7 @@ void rpoplpushHandlePush(client *c, robj *dstkey, robj *dstobj, robj *value) {
     listTypePush(dstobj,value,LIST_HEAD);
     notifyKeyspaceEvent(NOTIFY_LIST,"lpush",dstkey,c->db->id);
     /* Always send the pushed value to the client. */
-    addReplyBulk(c,value);
+    addReplyBulkAsync(c,value);
 }
 
 void rpoplpushCommand(client *c) {
@@ -630,6 +630,7 @@ int serveClientBlockedOnList(client *receiver, robj *key, robj *dstkey, redisDb
     robj *argv[3];
 
     if (dstkey == NULL) {
+        fastlock_lock(&receiver->lock);
         /* Propagate the [LR]POP operation. */
         argv[0] = (where == LIST_HEAD) ? shared.lpop :
                                           shared.rpop;
@@ -637,16 +638,18 @@ int serveClientBlockedOnList(client *receiver, robj *key, robj *dstkey, redisDb
         propagate((where == LIST_HEAD) ?
             server.lpopCommand : server.rpopCommand,
             db->id,argv,2,PROPAGATE_AOF|PROPAGATE_REPL);
-
+       
         /* BRPOP/BLPOP */
-        addReplyArrayLen(receiver,2);
-        addReplyBulk(receiver,key);
-        addReplyBulk(receiver,value);
+        addReplyArrayLenAsync(receiver,2);
+        addReplyBulkAsync(receiver,key);
+        addReplyBulkAsync(receiver,value);
 
         /* Notify event. */
         char *event = (where == LIST_HEAD) ? "lpop" : "rpop";
         notifyKeyspaceEvent(NOTIFY_LIST,event,key,receiver->db->id);
+        fastlock_unlock(&receiver->lock);
     } else {
+        fastlock_lock(&receiver->lock);
         /* BRPOPLPUSH */
         robj *dstobj =
             lookupKeyWrite(receiver->db,dstkey);
@@ -673,9 +676,11 @@ int serveClientBlockedOnList(client *receiver, robj *key, robj *dstkey, redisDb
 
             /* Notify event ("lpush" was notified by rpoplpushHandlePush). */
             notifyKeyspaceEvent(NOTIFY_LIST,"rpop",key,receiver->db->id);
+            fastlock_unlock(&receiver->lock);
         } else {
             /* BRPOPLPUSH failed because of wrong
              * destination type. */
+            fastlock_unlock(&receiver->lock);
             return C_ERR;
         }
     }
diff --git a/src/t_set.c b/src/t_set.c
index 7e96ae38e..99ff6fb47 100644
--- a/src/t_set.c
+++ b/src/t_set.c
@@ -207,7 +207,7 @@ sds setTypeNextObject(setTypeIterator *si) {
  * used field with values which are easy to trap if misused. */
 int setTypeRandomElement(robj *setobj, sds *sdsele, int64_t *llele) {
     if (setobj->encoding == OBJ_ENCODING_HT) {
-        dictEntry *de = dictGetRandomKey(setobj->m_ptr);
+        dictEntry *de = dictGetFairRandomKey(setobj->m_ptr);
         *sdsele = dictGetKey(de);
         *llele = -123456789; /* Not needed. Defensive. */
     } else if (setobj->encoding == OBJ_ENCODING_INTSET) {
diff --git a/src/t_stream.c b/src/t_stream.c
index 63750b857..ebf1dfb9f 100644
--- a/src/t_stream.c
+++ b/src/t_stream.c
@@ -776,11 +776,16 @@ int streamDeleteItem(stream *s, streamID *id) {
 /* Emit a reply in the client output buffer by formatting a Stream ID
  * in the standard <ms>-<seq> format, using the simple string protocol
  * of REPL. */
-void addReplyStreamID(client *c, streamID *id) {
+static void addReplyStreamID(client *c, streamID *id) {
     sds replyid = sdscatfmt(sdsempty(),"%U-%U",id->ms,id->seq);
     addReplyBulkSds(c,replyid);
 }
 
+static void addReplyStreamIDAsync(client *c, streamID *id) {
+    sds replyid = sdscatfmt(sdsempty(),"%U-%U",id->ms,id->seq);
+    addReplyBulkSdsAsync(c,replyid);
+}
+
 /* Similar to the above function, but just creates an object, usually useful
  * for replication purposes to create arguments. */
 robj *createObjectFromStreamID(streamID *id) {
@@ -914,7 +919,7 @@ size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end
     }
 
     if (!(flags & STREAM_RWR_RAWENTRIES))
-        arraylen_ptr = addReplyDeferredLen(c);
+        arraylen_ptr = addReplyDeferredLenAsync(c);
     streamIteratorStart(&si,s,start,end,rev);
     while(streamIteratorGetID(&si,&id,&numfields)) {
         /* Update the group last_id if needed. */
@@ -925,18 +930,18 @@ size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end
 
         /* Emit a two elements array for each item. The first is
          * the ID, the second is an array of field-value pairs. */
-        addReplyArrayLen(c,2);
-        addReplyStreamID(c,&id);
+        addReplyArrayLenAsync(c,2);
+        addReplyStreamIDAsync(c,&id);
 
-        addReplyMapLen(c,numfields);
+        addReplyMapLenAsync(c,numfields);
 
         /* Emit the field-value pairs. */
         while(numfields--) {
             unsigned char *key, *value;
             int64_t key_len, value_len;
             streamIteratorGetField(&si,&key,&value,&key_len,&value_len);
-            addReplyBulkCBuffer(c,key,key_len);
-            addReplyBulkCBuffer(c,value,value_len);
+            addReplyBulkCBufferAsync(c,key,key_len);
+            addReplyBulkCBufferAsync(c,value,value_len);
         }
 
         /* If a group is passed, we need to create an entry in the
@@ -994,7 +999,7 @@ size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end
         if (count && count == arraylen) break;
     }
     streamIteratorStop(&si);
-    if (arraylen_ptr) setDeferredArrayLen(c,arraylen_ptr,arraylen);
+    if (arraylen_ptr) setDeferredArrayLenAsync(c,arraylen_ptr,arraylen);
     return arraylen;
 }
 
diff --git a/src/t_zset.c b/src/t_zset.c
index 5f3f00950..7fbcf3bbf 100644
--- a/src/t_zset.c
+++ b/src/t_zset.c
@@ -3155,15 +3155,15 @@ void genericZpopCommand(client *c, robj **keyv, int keyc, int where, int emitkey
 
     /* No candidate for zpopping, return empty. */
     if (!zobj) {
-        addReplyNull(c);
+        addReplyNullAsync(c);
         return;
     }
 
-    void *arraylen_ptr = addReplyDeferredLen(c);
+    void *arraylen_ptr = addReplyDeferredLenAsync(c);
     long arraylen = 0;
 
     /* We emit the key only for the blocking variant. */
-    if (emitkey) addReplyBulk(c,key);
+    if (emitkey) addReplyBulkAsync(c,key);
 
     /* Remove the element. */
     do {
@@ -3213,8 +3213,8 @@ void genericZpopCommand(client *c, robj **keyv, int keyc, int where, int emitkey
             signalModifiedKey(c->db,key);
         }
 
-        addReplyBulkCBuffer(c,ele,sdslen(ele));
-        addReplyDouble(c,score);
+        addReplyBulkCBufferAsync(c,ele,sdslen(ele));
+        addReplyDoubleAsync(c,score);
         sdsfree(ele);
         arraylen += 2;
 
@@ -3226,7 +3226,7 @@ void genericZpopCommand(client *c, robj **keyv, int keyc, int where, int emitkey
         }
     } while(--count);
 
-    setDeferredArrayLen(c,arraylen_ptr,arraylen + (emitkey != 0));
+    setDeferredArrayLenAsync(c,arraylen_ptr,arraylen + (emitkey != 0));
 }
 
 /* ZPOPMIN key [<count>] */
diff --git a/src/util.h b/src/util.h
index b6c01aa59..97ca9471b 100644
--- a/src/util.h
+++ b/src/util.h
@@ -33,6 +33,10 @@
 #include <stdint.h>
 #include "sds.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* The maximum number of characters needed to represent a long double
  * as a string (long double has a huge range).
  * This should be the size of the buffer given to ld2string */
@@ -58,4 +62,8 @@ int pathIsBaseName(char *path);
 int utilTest(int argc, char **argv);
 #endif
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/zmalloc.c b/src/zmalloc.c
index c081cb90f..090009485 100644
--- a/src/zmalloc.c
+++ b/src/zmalloc.c
@@ -63,17 +63,21 @@ void zlibc_free(void *ptr) {
 #define realloc(ptr, size, type) srealloc(ptr, size, type)
 #define free(ptr) sfree(ptr)
 #elif defined(USE_TCMALLOC)
-#define malloc(size) tc_malloc(size)
-#define calloc(count,size) tc_calloc(count,size)
-#define realloc(ptr,size) tc_realloc(ptr,size)
+#define malloc(size, type) tc_malloc(size)
+#define calloc(count,size, type) tc_calloc(count,size)
+#define realloc(ptr,size, type) tc_realloc(ptr,size)
 #define free(ptr) tc_free(ptr)
 #elif defined(USE_JEMALLOC)
-#define malloc(size) je_malloc(size)
-#define calloc(count,size) je_calloc(count,size)
-#define realloc(ptr,size) je_realloc(ptr,size)
+#define malloc(size, type) je_malloc(size)
+#define calloc(count,size,type) je_calloc(count,size)
+#define realloc(ptr,size,type) je_realloc(ptr,size)
 #define free(ptr) je_free(ptr)
 #define mallocx(size,flags) je_mallocx(size,flags)
 #define dallocx(ptr,flags) je_dallocx(ptr,flags)
+#else
+#define malloc(size, type) malloc(size)
+#define calloc(count,size,type) calloc(count,size)
+#define realloc(ptr,size,type) realloc(ptr,size)
 #endif
 
 #define update_zmalloc_stat_alloc(__n) do { \
@@ -101,12 +105,8 @@ static void zmalloc_default_oom(size_t size) {
 static void (*zmalloc_oom_handler)(size_t) = zmalloc_default_oom;
 
 void *zmalloc(size_t size, enum MALLOC_CLASS class) {
-#ifdef USE_MEMKIND
-    void *ptr = malloc(size+PREFIX_SIZE, class);
-#else
     (void)class;
-    void *ptr = malloc(size+PREFIX_SIZE);
-#endif
+    void *ptr = malloc(size+PREFIX_SIZE, class);
 
     if (!ptr) zmalloc_oom_handler(size);
 #ifdef HAVE_MALLOC_SIZE
@@ -138,12 +138,8 @@ void zfree_no_tcache(void *ptr) {
 #endif
 
 void *zcalloc(size_t size, enum MALLOC_CLASS class) {
-#ifdef USE_MEMKIND
+    (void)(class);
     void *ptr = calloc(1, size+PREFIX_SIZE, class);
-#else
-    (void)class;
-    void *ptr = calloc(1, size+PREFIX_SIZE);
-#endif
 
     if (!ptr) zmalloc_oom_handler(size);
 #ifdef HAVE_MALLOC_SIZE
diff --git a/src/zmalloc.h b/src/zmalloc.h
index 30ba80492..a7b980025 100644
--- a/src/zmalloc.h
+++ b/src/zmalloc.h
@@ -36,14 +36,10 @@
 #define __str(s) #s
 
 #include "storage.h"
-#define USE_MEMKIND 1
 #if defined(USE_MEMKIND)
     #define ZMALLOC_LIB ("memkind")
     #undef USE_JEMALLOC
     #define USE_MALLOC_CLASS 1
-    // Even though memkind supports malloc_usable_size we don't use it for performance reasons
-    //#define HAVE_MALLOC_SIZE 0
-    //#define zmalloc_size(p) salloc_usable_size(p)
 #elif defined(USE_TCMALLOC)
 #define ZMALLOC_LIB ("tcmalloc-" __xstr(TC_VERSION_MAJOR) "." __xstr(TC_VERSION_MINOR))
 #include <google/tcmalloc.h>
@@ -86,6 +82,10 @@
 #define HAVE_DEFRAG
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void *zmalloc(size_t size, enum MALLOC_CLASS mclass);
 void *zcalloc(size_t size, enum MALLOC_CLASS mclass);
 void *zrealloc(void *ptr, size_t size, enum MALLOC_CLASS mclass);
@@ -116,4 +116,8 @@ size_t zmalloc_usable(void *ptr);
 int zmalloc_test(int argc, char **argv);
 #endif
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* __ZMALLOC_H */
diff --git a/tests/unit/lazyfree.tcl b/tests/unit/lazyfree.tcl
index 4e994494b..1e568ed78 100644
--- a/tests/unit/lazyfree.tcl
+++ b/tests/unit/lazyfree.tcl
@@ -19,6 +19,7 @@ start_server {tags {"lazyfree"}} {
     }
 
     test "FLUSHDB ASYNC can reclaim memory in background" {
+        after 500   ;# Sometimes Redis is busy with a prior operation
         set orig_mem [s used_memory]
         set args {}
         for {set i 0} {$i < 100000} {incr i} {
diff --git a/utils/srandmember/README.md b/utils/srandmember/README.md
new file mode 100644
index 000000000..d3da1e82f
--- /dev/null
+++ b/utils/srandmember/README.md
@@ -0,0 +1,14 @@
+The utilities in this directory plot the distribution of SRANDMEMBER to
+evaluate how fair it is.
+
+See http://theshfl.com/redis_sets for more information on the topic that led
+to this investigation and fix.
+
+showdist.rb -- shows the distribution of the frequency elements are returned.
+               The x axis is the number of times elements were returned, and
+               the y axis is how many elements were returned with such
+               frequency.
+
+showfreq.rb -- shows the frequency each element was returned.
+               The x axis is the element number.
+               The y axis is the times it was returned.
diff --git a/utils/srandmember/showdist.rb b/utils/srandmember/showdist.rb
new file mode 100644
index 000000000..243585700
--- /dev/null
+++ b/utils/srandmember/showdist.rb
@@ -0,0 +1,33 @@
+require 'redis'
+
+r = Redis.new
+r.select(9)
+r.del("myset");
+r.sadd("myset",(0..999).to_a)
+freq = {}
+100.times {
+    res = r.pipelined {
+        1000.times {
+            r.srandmember("myset")
+        }
+    }
+    res.each{|ele|
+        freq[ele] = 0 if freq[ele] == nil
+        freq[ele] += 1
+    }
+}
+
+# Convert into frequency distribution
+dist = {}
+freq.each{|item,count|
+    dist[count] = 0 if dist[count] == nil
+    dist[count] += 1
+}
+
+min = dist.keys.min
+max = dist.keys.max
+(min..max).each{|x|
+    count = dist[x]
+    count = 0 if count == nil
+    puts "#{x} -> #{"*"*count}"
+}
diff --git a/utils/srandmember/showfreq.rb b/utils/srandmember/showfreq.rb
new file mode 100644
index 000000000..fd47bc0ca
--- /dev/null
+++ b/utils/srandmember/showfreq.rb
@@ -0,0 +1,23 @@
+require 'redis'
+
+r = Redis.new
+r.select(9)
+r.del("myset");
+r.sadd("myset",(0..999).to_a)
+freq = {}
+500.times {
+    res = r.pipelined {
+        1000.times {
+            r.srandmember("myset")
+        }
+    }
+    res.each{|ele|
+        freq[ele] = 0 if freq[ele] == nil
+        freq[ele] += 1
+    }
+}
+
+# Print the frequency each element was yeld to process it with gnuplot
+freq.each{|item,count|
+    puts "#{item} #{count}"
+}