From 6454867df0f2d63106bb9ba4ec9d8e01e26eeb4d Mon Sep 17 00:00:00 2001 From: Adam Baldwin Date: Thu, 25 Oct 2012 20:27:10 -0700 Subject: [PATCH 0001/2500] Removed dofile() from Lua --- src/scripting.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/scripting.c b/src/scripting.c index 6f9ec2e89..5f7280448 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -451,6 +451,8 @@ void luaLoadLibraries(lua_State *lua) { void luaRemoveUnsupportedFunctions(lua_State *lua) { lua_pushnil(lua); lua_setglobal(lua,"loadfile"); + lua_pushnil(lua); + lua_setglobal(lua,"dofile"); } /* This function installs metamethods in the global table _G that prevent From fad954fd74e3cc23e9308bc061bd2246b2cf2e0b Mon Sep 17 00:00:00 2001 From: charsyam Date: Tue, 20 Nov 2012 02:50:31 +0800 Subject: [PATCH 0002/2500] fix randstring bug --- src/ziplist.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ziplist.c b/src/ziplist.c index d4ac4f9b4..00516f509 100644 --- a/src/ziplist.c +++ b/src/ziplist.c @@ -1041,7 +1041,8 @@ void pop(unsigned char *zl, int where) { } int randstring(char *target, unsigned int min, unsigned int max) { - int p, len = min+rand()%(max-min+1); + int p = 0; + int len = min+rand()%(max-min+1); int minval, maxval; switch(rand() % 3) { case 0: From f08d4ed811ba0dcc7c227c3ed2850ea4f1e22554 Mon Sep 17 00:00:00 2001 From: YAMAMOTO Takashi Date: Thu, 19 Jul 2012 21:25:30 +0900 Subject: [PATCH 0003/2500] don't define _XOPEN_SOURCE for NetBSD on NetBSD, defining _XOPEN_SOURCE hides extensions like inet_aton, strcasecmp, etc. 
--- src/fmacros.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/fmacros.h b/src/fmacros.h index a6cf3578c..c16f5e204 100644 --- a/src/fmacros.h +++ b/src/fmacros.h @@ -36,9 +36,13 @@ #define _GNU_SOURCE #endif -#if defined(__linux__) || defined(__OpenBSD__) || defined(__NetBSD__) +#if defined(__linux__) || defined(__OpenBSD__) #define _XOPEN_SOURCE 700 -#else +/* + * On NetBSD, _XOPEN_SOURCE undefines _NETBSD_SOURCE and + * thus hides inet_aton etc. + */ +#elif !defined(__NetBSD__) #define _XOPEN_SOURCE #endif From ed23c6ff59b218159392d7decb8f412cd4695c5a Mon Sep 17 00:00:00 2001 From: YAMAMOTO Takashi Date: Thu, 19 Jul 2012 21:28:11 +0900 Subject: [PATCH 0004/2500] rename popcount to popcount_binary to avoid a conflict with NetBSD libc NetBSD-current's libc has a function named popcount. hiding these extensions using feature macros is not possible because redis uses other extensions covered by the same feature macro. eg. inet_aton --- src/bitops.c | 4 ++-- src/redis.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bitops.c b/src/bitops.c index 47f768c31..1c2e13ddc 100644 --- a/src/bitops.c +++ b/src/bitops.c @@ -58,7 +58,7 @@ static int getBitOffsetFromArgument(redisClient *c, robj *o, size_t *offset) { /* Count number of bits set in the binary array pointed by 's' and long * 'count' bytes. The implementation of this function is required to * work with a input string length up to 512 MB. 
*/ -size_t popcount(void *s, long count) { +size_t popcount_binary(void *s, long count) { size_t bits = 0; unsigned char *p; uint32_t *p4 = s; @@ -407,6 +407,6 @@ void bitcountCommand(redisClient *c) { } else { long bytes = end-start+1; - addReplyLongLong(c,popcount(p+start,bytes)); + addReplyLongLong(c,popcount_binary(p+start,bytes)); } } diff --git a/src/redis.h b/src/redis.h index 3002dd973..a39f3d81d 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1010,7 +1010,7 @@ long long mstime(void); void getRandomHexChars(char *p, unsigned int len); uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l); void exitFromChild(int retcode); -size_t popcount(void *s, long count); +size_t popcount_binary(void *s, long count); void redisSetProcTitle(char *title); /* networking.c -- Networking and Client related operations */ From 321c1a590553324e9fd8b9d987c483ac90e71d62 Mon Sep 17 00:00:00 2001 From: YAMAMOTO Takashi Date: Thu, 19 Jul 2012 21:37:34 +0900 Subject: [PATCH 0005/2500] use nanosleep instead of usleep SUSv3 says that: The useconds argument shall be less than one million. If the value of useconds is 0, then the call has no effect. NetBSD's implementation actually rejects larger values with EINVAL. Use nanosleep instead, which has no such limitation. 
--- src/debug.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/debug.c b/src/debug.c index 2f62bedb0..6cfa61b91 100644 --- a/src/debug.c +++ b/src/debug.c @@ -329,8 +329,11 @@ void debugCommand(redisClient *c) { } else if (!strcasecmp(c->argv[1]->ptr,"sleep") && c->argc == 3) { double dtime = strtod(c->argv[2]->ptr,NULL); long long utime = dtime*1000000; + struct timespec tv; - usleep(utime); + tv.tv_sec = utime / 1000000; + tv.tv_nsec = (utime % 1000000) * 1000; + nanosleep(&tv, NULL); addReply(c,shared.ok); } else if (!strcasecmp(c->argv[1]->ptr,"set-active-expire") && c->argc == 3) From 2e829fc15d262beafbe92ac59df57b3feac83d69 Mon Sep 17 00:00:00 2001 From: YAMAMOTO Takashi Date: Sun, 28 Oct 2012 13:33:04 +0900 Subject: [PATCH 0006/2500] don't assume time_t == long time_t is always 64bit on recent versions of NetBSD. --- src/config.c | 4 ++-- src/redis.c | 44 ++++++++++++++++++++++---------------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/config.c b/src/config.c index c41d57184..b4b128e8a 100644 --- a/src/config.c +++ b/src/config.c @@ -974,8 +974,8 @@ void configGetCommand(redisClient *c) { int j; for (j = 0; j < server.saveparamslen; j++) { - buf = sdscatprintf(buf,"%ld %d", - server.saveparams[j].seconds, + buf = sdscatprintf(buf,"%jd %d", + (intmax_t)server.saveparams[j].seconds, server.saveparams[j].changes); if (j != server.saveparamslen-1) buf = sdscatlen(buf," ",1); diff --git a/src/redis.c b/src/redis.c index f86dafa53..2045176cf 100644 --- a/src/redis.c +++ b/src/redis.c @@ -2070,8 +2070,8 @@ sds genRedisInfoString(char *section) { "process_id:%ld\r\n" "run_id:%s\r\n" "tcp_port:%d\r\n" - "uptime_in_seconds:%ld\r\n" - "uptime_in_days:%ld\r\n" + "uptime_in_seconds:%jd\r\n" + "uptime_in_days:%jd\r\n" "hz:%d\r\n" "lru_clock:%ld\r\n" "config_file:%s\r\n", @@ -2091,8 +2091,8 @@ sds genRedisInfoString(char *section) { (long) getpid(), server.runid, server.port, - uptime, - uptime/(3600*24), + 
(intmax_t)uptime, + (intmax_t)(uptime/(3600*24)), server.hz, (unsigned long) server.lruclock, server.configfile ? server.configfile : ""); @@ -2149,30 +2149,30 @@ sds genRedisInfoString(char *section) { "loading:%d\r\n" "rdb_changes_since_last_save:%lld\r\n" "rdb_bgsave_in_progress:%d\r\n" - "rdb_last_save_time:%ld\r\n" + "rdb_last_save_time:%jd\r\n" "rdb_last_bgsave_status:%s\r\n" - "rdb_last_bgsave_time_sec:%ld\r\n" - "rdb_current_bgsave_time_sec:%ld\r\n" + "rdb_last_bgsave_time_sec:%jd\r\n" + "rdb_current_bgsave_time_sec:%jd\r\n" "aof_enabled:%d\r\n" "aof_rewrite_in_progress:%d\r\n" "aof_rewrite_scheduled:%d\r\n" - "aof_last_rewrite_time_sec:%ld\r\n" - "aof_current_rewrite_time_sec:%ld\r\n" + "aof_last_rewrite_time_sec:%jd\r\n" + "aof_current_rewrite_time_sec:%jd\r\n" "aof_last_bgrewrite_status:%s\r\n", server.loading, server.dirty, server.rdb_child_pid != -1, - server.lastsave, + (intmax_t)server.lastsave, (server.lastbgsave_status == REDIS_OK) ? "ok" : "err", - server.rdb_save_time_last, - (server.rdb_child_pid == -1) ? - -1 : time(NULL)-server.rdb_save_time_start, + (intmax_t)server.rdb_save_time_last, + (intmax_t)((server.rdb_child_pid == -1) ? + -1 : time(NULL)-server.rdb_save_time_start), server.aof_state != REDIS_AOF_OFF, server.aof_child_pid != -1, server.aof_rewrite_scheduled, - server.aof_rewrite_time_last, - (server.aof_child_pid == -1) ? - -1 : time(NULL)-server.aof_rewrite_time_start, + (intmax_t)server.aof_rewrite_time_last, + (intmax_t)((server.aof_child_pid == -1) ? + -1 : time(NULL)-server.aof_rewrite_time_start), (server.aof_lastbgrewrite_status == REDIS_OK) ? 
"ok" : "err"); if (server.aof_state != REDIS_AOF_OFF) { @@ -2211,16 +2211,16 @@ sds genRedisInfoString(char *section) { } info = sdscatprintf(info, - "loading_start_time:%ld\r\n" + "loading_start_time:%jd\r\n" "loading_total_bytes:%llu\r\n" "loading_loaded_bytes:%llu\r\n" "loading_loaded_perc:%.2f\r\n" - "loading_eta_seconds:%ld\r\n" - ,(unsigned long) server.loading_start_time, + "loading_eta_seconds:%jd\r\n", + (intmax_t) server.loading_start_time, (unsigned long long) server.loading_total_bytes, (unsigned long long) server.loading_loaded_bytes, perc, - eta + (intmax_t)eta ); } } @@ -2299,8 +2299,8 @@ sds genRedisInfoString(char *section) { if (server.repl_state != REDIS_REPL_CONNECTED) { info = sdscatprintf(info, - "master_link_down_since_seconds:%ld\r\n", - (long)server.unixtime-server.repl_down_since); + "master_link_down_since_seconds:%jd\r\n", + (intmax_t)server.unixtime-server.repl_down_since); } info = sdscatprintf(info, "slave_priority:%d\r\n" From d56c9e48fff1086910742e6523e25a6a7b8cd0f7 Mon Sep 17 00:00:00 2001 From: ioddly Date: Wed, 22 May 2013 18:17:58 -0500 Subject: [PATCH 0007/2500] Try to report source of bad Lua API calls --- src/scripting.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/scripting.c b/src/scripting.c index 6661f3748..2ffd92ef7 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -152,9 +152,20 @@ char *redisProtocolToLuaType_MultiBulk(lua_State *lua, char *reply) { } void luaPushError(lua_State *lua, char *error) { + lua_Debug dbg; + lua_newtable(lua); lua_pushstring(lua,"err"); - lua_pushstring(lua, error); + + /* Attempt to figure out where this function was called, if possible */ + if(lua_getstack(lua, 1, &dbg) && lua_getinfo(lua, "nSl", &dbg)) { + sds msg = sdscatprintf(sdsempty(), "%s: %d: %s", + dbg.source, dbg.currentline, error); + lua_pushstring(lua, msg); + sdsfree(msg); + } else { + lua_pushstring(lua, error); + } lua_settable(lua,-3); } @@ -866,9 +877,10 @@ void 
evalGenericCommand(redisClient *c, int evalsha) { delhook = 1; } - /* At this point whatever this script was never seen before or if it was + /* At this point whether this script was never seen before or if it was * already defined, we can call it. We have zero arguments and expect * a single return value. */ + err = lua_pcall(lua,0,1,0); /* Perform some cleanup that we need to do both on error and success. */ From f9eef102c86cbcdb4e21382d7b60b319d5d36302 Mon Sep 17 00:00:00 2001 From: Marc-Antoine Perennou Date: Mon, 10 Dec 2012 18:21:10 +0100 Subject: [PATCH 0008/2500] test-server: only listen to 127.0.0.1 Signed-off-by: Marc-Antoine Perennou --- tests/assets/default.conf | 1 + tests/test_helper.tcl | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/assets/default.conf b/tests/assets/default.conf index 17d21b07c..902a094a5 100644 --- a/tests/assets/default.conf +++ b/tests/assets/default.conf @@ -5,6 +5,7 @@ daemonize no pidfile /var/run/redis.pid port 6379 timeout 0 +bind 127.0.0.1 loglevel verbose logfile '' databases 16 diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 5e2c2ad92..930eba4ee 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -186,7 +186,7 @@ proc test_server_main {} { if {!$::quiet} { puts "Starting test server at port $port" } - socket -server accept_test_clients $port + socket -server accept_test_clients -myaddr 127.0.0.1 $port # Start the client instances set ::clients_pids {} From af2cadc4b7dd98ccb499023d7c6b6a75d0833cad Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 18 Jun 2013 17:33:35 +0200 Subject: [PATCH 0009/2500] Lua scripting: improve error reporting. When calling Lua scripts we try to report not just the error but information about the code line causing the error. 
--- src/scripting.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/src/scripting.c b/src/scripting.c index 2ffd92ef7..06539a4cb 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -620,6 +620,26 @@ void scriptingInit(void) { lua_pcall(lua,0,0,0); } + /* Add a helper function we use for pcall error reporting. + * Note that when the error is in the C function we want to report the + * information about the caller, that's what makes sense from the point + * of view of the user debugging a script. */ + { + char *errh_func = "function __redis__err__handler(err)\n" + " local i = debug.getinfo(2,'nSl')\n" + " if i and i.what == 'C' then\n" + " i = debug.getinfo(3,'nSl')\n" + " end\n" + " if i then\n" + " return err ..': '.. i.source .. ': ' .. i.currentline\n" + " else\n" + " return err\n" + " end\n" + "end\n"; + luaL_loadbuffer(lua,errh_func,strlen(errh_func),"@err_handler_def"); + lua_pcall(lua,0,0,0); + } + /* Create the (non connected) client that we use to execute Redis commands * inside the Lua interpreter. * Note: there is no need to create it again when this function is called @@ -840,21 +860,25 @@ void evalGenericCommand(redisClient *c, int evalsha) { funcname[42] = '\0'; } + /* Push the pcall error handler function on the stack. */ + lua_getglobal(lua, "__redis__err__handler"); + /* Try to lookup the Lua function */ lua_getglobal(lua, funcname); - if (lua_isnil(lua,1)) { + if (lua_isnil(lua,-1)) { lua_pop(lua,1); /* remove the nil from the stack */ /* Function not defined... let's define it if we have the * body of the function. If this is an EVALSHA call we can just * return an error. */ if (evalsha) { + lua_pop(lua,1); /* remove the error handler from the stack. 
*/ addReply(c, shared.noscripterr); return; } if (luaCreateFunction(c,lua,funcname,c->argv[1]) == REDIS_ERR) return; /* Now the following is guaranteed to return non nil */ lua_getglobal(lua, funcname); - redisAssert(!lua_isnil(lua,1)); + redisAssert(!lua_isnil(lua,-1)); } /* Populate the argv and keys table accordingly to the arguments that @@ -881,7 +905,7 @@ void evalGenericCommand(redisClient *c, int evalsha) { * already defined, we can call it. We have zero arguments and expect * a single return value. */ - err = lua_pcall(lua,0,1,0); + err = lua_pcall(lua,0,1,-2); /* Perform some cleanup that we need to do both on error and success. */ if (delhook) lua_sethook(lua,luaMaskCountHook,0,0); /* Disable hook */ From 058541f3be7b271456c25d270df7a76fafbc0410 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 18 Jun 2013 19:30:56 +0200 Subject: [PATCH 0010/2500] Lua script errors format more unified. lua_pcall error handler now formats errors in a way more similar to luaPushError() so that errors generated in different contexts look alike. --- src/scripting.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripting.c b/src/scripting.c index 06539a4cb..a707f1bac 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -631,7 +631,7 @@ void scriptingInit(void) { " i = debug.getinfo(3,'nSl')\n" " end\n" " if i then\n" - " return err ..': '.. i.source .. ': ' .. i.currentline\n" + " return i.source .. ':' .. i.currentline .. ': ' .. err\n" " else\n" " return err\n" " end\n" From f0861f0116f33c2a2772ae4525fabd8b7f9f1ff1 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 19 Jun 2013 14:44:40 +0200 Subject: [PATCH 0011/2500] Fix logStackTrace() when logging to stdout. When the semantics changed from logfile = NULL to logfile = "" to log into standard output, no proper change was made to logStackTrace() to make it able to work with the new setup. This commit fixes the issue. 
--- src/debug.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/debug.c b/src/debug.c index eb08c38d0..32d361200 100644 --- a/src/debug.c +++ b/src/debug.c @@ -619,11 +619,12 @@ void logRegisters(ucontext_t *uc) { void logStackTrace(ucontext_t *uc) { void *trace[100]; int trace_size = 0, fd; + int log_to_stdout = server.logfile[0] == '\0'; /* Open the log file in append mode. */ - fd = server.logfile ? - open(server.logfile, O_APPEND|O_CREAT|O_WRONLY, 0644) : - STDOUT_FILENO; + fd = log_to_stdout ? + STDOUT_FILENO : + open(server.logfile, O_APPEND|O_CREAT|O_WRONLY, 0644); if (fd == -1) return; /* Generate the stack trace */ @@ -637,7 +638,7 @@ void logStackTrace(ucontext_t *uc) { backtrace_symbols_fd(trace, trace_size, fd); /* Cleanup */ - if (server.logfile) close(fd); + if (!log_to_stdout) close(fd); } /* Log information about the "current" client, that is, the client that is From f352985cc1c6d724b289170aa261efba281b9dec Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 19 Jun 2013 18:25:03 +0200 Subject: [PATCH 0012/2500] Allow writes from scripts called by AOF loading in read-only slaves. This fixes issue #1163 --- src/scripting.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scripting.c b/src/scripting.c index a707f1bac..b94627c7f 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -269,6 +269,7 @@ int luaRedisGenericCommand(lua_State *lua, int raise_error) { "Write commands not allowed after non deterministic commands"); goto cleanup; } else if (server.masterhost && server.repl_slave_ro && + !server.loading && !(server.lua_caller->flags & REDIS_MASTER)) { luaPushError(lua, shared.roslaveerr->ptr); From 96574ee6108d54c773fe7f16bffc53afd8f8364b Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 19 Jun 2013 18:31:33 +0200 Subject: [PATCH 0013/2500] Fix comment typo in integration/aof.tcl. 
--- tests/integration/aof.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/aof.tcl b/tests/integration/aof.tcl index ebf9cb564..f255d7ec1 100644 --- a/tests/integration/aof.tcl +++ b/tests/integration/aof.tcl @@ -98,7 +98,7 @@ tags {"aof"} { } } - ## Test that SPOP (that modifies the client its argc/argv) is correctly free'd + ## Test that SPOP (that modifies the client's argc/argv) is correctly free'd create_aof { append_to_aof [formatCommand sadd set foo] append_to_aof [formatCommand sadd set bar] From 5f770dcc2f97a8ff795033bf9e70659d64aef271 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 19 Jun 2013 18:53:07 +0200 Subject: [PATCH 0014/2500] Test: regression test for #1163. --- tests/unit/scripting.tcl | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/unit/scripting.tcl b/tests/unit/scripting.tcl index e42f87725..3e08f630c 100644 --- a/tests/unit/scripting.tcl +++ b/tests/unit/scripting.tcl @@ -281,6 +281,23 @@ start_server {tags {"scripting"}} { assert_equal $rand1 $rand2 assert {$rand2 ne $rand3} } + + test {EVAL processes writes from AOF in read-only slaves} { + r flushall + r config set appendonly yes + r eval {redis.call("set","foo","100")} 0 + r eval {redis.call("incr","foo")} 0 + r eval {redis.call("incr","foo")} 0 + wait_for_condition 50 100 { + [s aof_rewrite_in_progress] == 0 + } else { + fail "AOF rewrite can't complete after CONFIG SET appendonly yes." + } + r config set slave-read-only yes + r slaveof 127.0.0.1 0 + r debug loadaof + r get foo + } {102} } # Start a new server since the last test in this stanza will kill the From 8b49716f0d2562c7687de56c21f5dbd73f952c6a Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 20 Jun 2013 10:21:38 +0200 Subject: [PATCH 0015/2500] Sentinel: parse new INFO replication output correctly. 
Sentinel was not able to detect slaves when connected to a very recent version of Redis master, since a previous non-backward compatible change to INFO broke the parsing of the slaves ip:port INFO output. This fixes issue #1164 --- src/sentinel.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index a4db9408e..ed0978694 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1397,20 +1397,33 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { } } - /* slave0:,, */ + /* old versions: slave0:,, + * new versions: slave0:ip=127.0.0.1,port=9999,... */ if ((ri->flags & SRI_MASTER) && sdslen(l) >= 7 && !memcmp(l,"slave",5) && isdigit(l[5])) { char *ip, *port, *end; - ip = strchr(l,':'); if (!ip) continue; - ip++; /* Now ip points to start of ip address. */ - port = strchr(ip,','); if (!port) continue; - *port = '\0'; /* nul term for easy access. */ - port++; /* Now port points to start of port number. */ - end = strchr(port,','); if (!end) continue; - *end = '\0'; /* nul term for easy access. */ + if (strstr(l,"ip=") == NULL) { + /* Old format. */ + ip = strchr(l,':'); if (!ip) continue; + ip++; /* Now ip points to start of ip address. */ + port = strchr(ip,','); if (!port) continue; + *port = '\0'; /* nul term for easy access. */ + port++; /* Now port points to start of port number. */ + end = strchr(port,','); if (!end) continue; + *end = '\0'; /* nul term for easy access. */ + } else { + /* New format. */ + ip = strstr(l,"ip="); if (!ip) continue; + ip += 3; /* Now ip points to start of ip address. */ + port = strstr(l,"port="); if (!port) continue; + port += 5; /* Now port points to start of port number. */ + /* Nul term both fields for easy access. */ + end = strchr(ip,','); if (end) *end = '\0'; + end = strchr(port,','); if (end) *end = '\0'; + } /* Check if we already have this slave into our table, * otherwise add it. 
*/ From d337302b906f5dcf189ca0fca7a28232d5ca2529 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 20 Jun 2013 15:32:00 +0200 Subject: [PATCH 0016/2500] PUBSUB command implemented. Currently it implements three subcommands: PUBSUB CHANNELS [<pattern>] List channels with non-zero subscribers. PUBSUB NUMSUB [channel_1 ...] List number of subscribers for channels. PUBSUB NUMPAT Return number of subscribed patterns. --- src/pubsub.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ src/redis.c | 1 + src/redis.h | 1 + 3 files changed, 49 insertions(+) diff --git a/src/pubsub.c b/src/pubsub.c index 524cb9c5a..307825679 100644 --- a/src/pubsub.c +++ b/src/pubsub.c @@ -309,3 +309,50 @@ void publishCommand(redisClient *c) { if (server.cluster_enabled) clusterPropagatePublish(c->argv[1],c->argv[2]); addReplyLongLong(c,receivers); } + +/* PUBSUB command for Pub/Sub introspection. */ +void pubsubCommand(redisClient *c) { + if (!strcasecmp(c->argv[1]->ptr,"channels") && + (c->argc == 2 || c->argc ==3)) + { + /* PUBSUB CHANNELS [] */ + sds pat = (c->argc == 2) ? NULL : c->argv[2]->ptr; + dictIterator *di = dictGetIterator(server.pubsub_channels); + dictEntry *de; + long mblen = 0; + void *replylen; + + replylen = addDeferredMultiBulkLength(c); + while((de = dictNext(di)) != NULL) { + robj *cobj = dictGetKey(de); + sds channel = cobj->ptr; + + if (!pat || stringmatchlen(pat, sdslen(pat), + channel, sdslen(channel),0)) + { + addReplyBulk(c,cobj); + mblen++; + } + } + dictReleaseIterator(di); + setDeferredMultiBulkLength(c,replylen,mblen); + } else if (!strcasecmp(c->argv[1]->ptr,"numsub") && c->argc > 2) { + /* PUBSUB NUMSUB Channel_1 [... Channel_N] */ + int j; + + addReplyMultiBulkLen(c,(c->argc-2)*2); + for (j = 2; j < c->argc; j++) { + list *l = dictFetchValue(server.pubsub_channels,c->argv[j]); + + addReplyBulk(c,c->argv[j]); + addReplyBulkLongLong(c,l ? 
listLength(l) : 0); + } + } else if (!strcasecmp(c->argv[1]->ptr,"numpat") && c->argc == 2) { + /* PUBSUB NUMPAT */ + addReplyLongLong(c,listLength(server.pubsub_patterns)); + } else { + addReplyErrorFormat(c, + "Unknown PUBSUB subcommand or wrong number of arguments for '%s'", + (char*)c->argv[1]->ptr); + } +} diff --git a/src/redis.c b/src/redis.c index b974d5ef7..eee6cc53a 100644 --- a/src/redis.c +++ b/src/redis.c @@ -240,6 +240,7 @@ struct redisCommand redisCommandTable[] = { {"psubscribe",psubscribeCommand,-2,"rpslt",0,NULL,0,0,0,0,0}, {"punsubscribe",punsubscribeCommand,-1,"rpslt",0,NULL,0,0,0,0,0}, {"publish",publishCommand,3,"pfltr",0,NULL,0,0,0,0,0}, + {"pubsub",pubsubCommand,-2,"pltrR",0,NULL,0,0,0,0,0}, {"watch",watchCommand,-2,"rs",0,noPreloadGetKeys,1,-1,1,0,0}, {"unwatch",unwatchCommand,1,"rs",0,NULL,0,0,0,0,0}, {"cluster",clusterCommand,-2,"ar",0,NULL,0,0,0,0,0}, diff --git a/src/redis.h b/src/redis.h index 8bc76783e..b97fb0739 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1458,6 +1458,7 @@ void unsubscribeCommand(redisClient *c); void psubscribeCommand(redisClient *c); void punsubscribeCommand(redisClient *c); void publishCommand(redisClient *c); +void pubsubCommand(redisClient *c); void watchCommand(redisClient *c); void unwatchCommand(redisClient *c); void clusterCommand(redisClient *c); From c0a47ea63cde5736412c46e289a692f8ced3bc4c Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 20 Jun 2013 15:34:56 +0200 Subject: [PATCH 0017/2500] Allow PUBSUB NUMSUB without channels. The result is an empty list but it is handy to call it programmatically. 
--- src/pubsub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pubsub.c b/src/pubsub.c index 307825679..add9a4c5f 100644 --- a/src/pubsub.c +++ b/src/pubsub.c @@ -336,8 +336,8 @@ void pubsubCommand(redisClient *c) { } dictReleaseIterator(di); setDeferredMultiBulkLength(c,replylen,mblen); - } else if (!strcasecmp(c->argv[1]->ptr,"numsub") && c->argc > 2) { - /* PUBSUB NUMSUB Channel_1 [... Channel_N] */ + } else if (!strcasecmp(c->argv[1]->ptr,"numsub") && c->argc >= 2) { + /* PUBSUB NUMSUB [Channel_1 ... Channel_N] */ int j; addReplyMultiBulkLen(c,(c->argc-2)*2); From 21afa5dbed6a53b5648c7e4f2286c4da0a7e6c36 Mon Sep 17 00:00:00 2001 From: Jan-Erik Rediger Date: Thu, 20 Jun 2013 17:53:35 +0300 Subject: [PATCH 0018/2500] Initialize char* to NULL to remove compiler warning --- src/config.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/config.c b/src/config.c index c0d99b92a..374377320 100644 --- a/src/config.c +++ b/src/config.c @@ -1332,7 +1332,7 @@ void rewriteConfigOctalOption(struct rewriteConfigState *state, char *option, in * specified. See how the function is used for more information. */ void rewriteConfigEnumOption(struct rewriteConfigState *state, char *option, int value, ...) { va_list ap; - char *enum_name, *matching_name; + char *enum_name, *matching_name = NULL; int enum_val, def_val, force; sds line; @@ -1357,7 +1357,7 @@ void rewriteConfigEnumOption(struct rewriteConfigState *state, char *option, int void rewriteConfigSyslogfacilityOption(struct rewriteConfigState *state) { int value = server.syslog_facility, j; int force = value != LOG_LOCAL0; - char *name, *option = "syslog-facility"; + char *name = NULL, *option = "syslog-facility"; sds line; for (j = 0; validSyslogFacilities[j].name; j++) { From 08dff9fb66b299fd2adb0e158241d52b18a10626 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 21 Jun 2013 12:07:53 +0200 Subject: [PATCH 0019/2500] New API to force propagation. 
The old REDIS_CMD_FORCE_REPLICATION flag was removed from the implementation of Redis, now there is a new API to force specific executions of a command to be propagated to AOF / Replication link: void forceCommandPropagation(int flags); The new API is also compatible with Lua scripting, so a script that will execute commands that are forced to be propagated, will also be propagated itself accordingly even if no change to data is operated. As a side effect, this new design fixes the issue with scripts not able to propagate PUBLISH to slaves (issue #873). --- src/pubsub.c | 1 + src/redis.c | 32 ++++++++++++++++++++++++++++---- src/redis.h | 5 ++++- src/scripting.c | 1 + 4 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/pubsub.c b/src/pubsub.c index add9a4c5f..a596dfc96 100644 --- a/src/pubsub.c +++ b/src/pubsub.c @@ -307,6 +307,7 @@ void punsubscribeCommand(redisClient *c) { void publishCommand(redisClient *c) { int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]); if (server.cluster_enabled) clusterPropagatePublish(c->argv[1],c->argv[2]); + forceCommandPropagation(c,REDIS_PROPAGATE_REPL); addReplyLongLong(c,receivers); } diff --git a/src/redis.c b/src/redis.c index eee6cc53a..d3d426c04 100644 --- a/src/redis.c +++ b/src/redis.c @@ -239,7 +239,7 @@ struct redisCommand redisCommandTable[] = { {"unsubscribe",unsubscribeCommand,-1,"rpslt",0,NULL,0,0,0,0,0}, {"psubscribe",psubscribeCommand,-2,"rpslt",0,NULL,0,0,0,0,0}, {"punsubscribe",punsubscribeCommand,-1,"rpslt",0,NULL,0,0,0,0,0}, - {"publish",publishCommand,3,"pfltr",0,NULL,0,0,0,0,0}, + {"publish",publishCommand,3,"pltr",0,NULL,0,0,0,0,0}, {"pubsub",pubsubCommand,-2,"pltrR",0,NULL,0,0,0,0,0}, {"watch",watchCommand,-2,"rs",0,noPreloadGetKeys,1,-1,1,0,0}, {"unwatch",unwatchCommand,1,"rs",0,NULL,0,0,0,0,0}, @@ -1528,7 +1528,6 @@ void populateCommandTable(void) { case 'm': c->flags |= REDIS_CMD_DENYOOM; break; case 'a': c->flags |= REDIS_CMD_ADMIN; break; case 'p': c->flags |= REDIS_CMD_PUBSUB; 
break; - case 'f': c->flags |= REDIS_CMD_FORCE_REPLICATION; break; case 's': c->flags |= REDIS_CMD_NOSCRIPT; break; case 'R': c->flags |= REDIS_CMD_RANDOM; break; case 'S': c->flags |= REDIS_CMD_SORT_FOR_SCRIPT; break; @@ -1652,9 +1651,18 @@ void alsoPropagate(struct redisCommand *cmd, int dbid, robj **argv, int argc, redisOpArrayAppend(&server.also_propagate,cmd,dbid,argv,argc,target); } +/* It is possible to call the function forceCommandPropagation() inside a + * Redis command implementaiton in order to to force the propagation of a + * specific command execution into AOF / Replication. */ +void forceCommandPropagation(redisClient *c, int flags) { + if (flags & REDIS_PROPAGATE_REPL) c->flags |= REDIS_FORCE_REPL; + if (flags & REDIS_PROPAGATE_AOF) c->flags |= REDIS_FORCE_AOF; +} + /* Call() is the core of Redis execution of a command */ void call(redisClient *c, int flags) { long long dirty, start = ustime(), duration; + int client_old_flags = c->flags; /* Sent the command to clients in MONITOR mode, only if the commands are * not generated from reading an AOF. */ @@ -1666,6 +1674,7 @@ void call(redisClient *c, int flags) { } /* Call the command. */ + c->flags &= ~(REDIS_FORCE_AOF|REDIS_FORCE_REPL); redisOpArrayInit(&server.also_propagate); dirty = server.dirty; c->cmd->proc(c); @@ -1677,6 +1686,16 @@ void call(redisClient *c, int flags) { if (server.loading && c->flags & REDIS_LUA_CLIENT) flags &= ~(REDIS_CALL_SLOWLOG | REDIS_CALL_STATS); + /* If the caller is Lua, we want to force the EVAL caller to propagate + * the script if the command flag or client flag are forcing the + * propagation. */ + if (c->flags & REDIS_LUA_CLIENT && server.lua_caller) { + if (c->flags & REDIS_FORCE_REPL) + server.lua_caller->flags |= REDIS_FORCE_REPL; + if (c->flags & REDIS_FORCE_AOF) + server.lua_caller->flags |= REDIS_FORCE_AOF; + } + /* Log the command into the Slow log if needed, and populate the * per-command statistics that we show in INFO commandstats. 
*/ if (flags & REDIS_CALL_SLOWLOG && c->cmd->proc != execCommand) @@ -1690,14 +1709,19 @@ void call(redisClient *c, int flags) { if (flags & REDIS_CALL_PROPAGATE) { int flags = REDIS_PROPAGATE_NONE; - if (c->cmd->flags & REDIS_CMD_FORCE_REPLICATION) - flags |= REDIS_PROPAGATE_REPL; + if (c->flags & REDIS_FORCE_REPL) flags |= REDIS_PROPAGATE_REPL; + if (c->flags & REDIS_FORCE_AOF) flags |= REDIS_PROPAGATE_AOF; if (dirty) flags |= (REDIS_PROPAGATE_REPL | REDIS_PROPAGATE_AOF); if (flags != REDIS_PROPAGATE_NONE) propagate(c->cmd,c->db->id,c->argv,c->argc,flags); } + /* Restore the old FORCE_AOF/REPL flags, since call can be executed + * recursively. */ + c->flags &= ~(REDIS_FORCE_AOF|REDIS_FORCE_REPL); + c->flags |= client_old_flags & (REDIS_FORCE_AOF|REDIS_FORCE_REPL); + /* Handle the alsoPropagate() API to handle commands that want to propagate * multiple separated commands. */ if (server.also_propagate.numops) { diff --git a/src/redis.h b/src/redis.h index b97fb0739..0c0549198 100644 --- a/src/redis.h +++ b/src/redis.h @@ -139,7 +139,7 @@ #define REDIS_CMD_WRITE 1 /* "w" flag */ #define REDIS_CMD_READONLY 2 /* "r" flag */ #define REDIS_CMD_DENYOOM 4 /* "m" flag */ -#define REDIS_CMD_FORCE_REPLICATION 8 /* "f" flag */ +#define REDIS_CMD_NOT_USED_1 8 /* no longer used flag */ #define REDIS_CMD_ADMIN 16 /* "a" flag */ #define REDIS_CMD_PUBSUB 32 /* "p" flag */ #define REDIS_CMD_NOSCRIPT 64 /* "s" flag */ @@ -217,6 +217,8 @@ #define REDIS_UNIX_SOCKET (1<<11) /* Client connected via Unix domain socket */ #define REDIS_DIRTY_EXEC (1<<12) /* EXEC will fail for errors while queueing */ #define REDIS_MASTER_FORCE_REPLY (1<<13) /* Queue replies even if is master */ +#define REDIS_FORCE_AOF (1<<14) /* Force AOF propagation of current cmd. */ +#define REDIS_FORCE_REPL (1<<15) /* Force replication of current cmd. 
*/ /* Client request types */ #define REDIS_REQ_INLINE 1 @@ -1211,6 +1213,7 @@ struct redisCommand *lookupCommandOrOriginal(sds name); void call(redisClient *c, int flags); void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc, int flags); void alsoPropagate(struct redisCommand *cmd, int dbid, robj **argv, int argc, int target); +void forceCommandPropagation(redisClient *c, int flags); int prepareForShutdown(); #ifdef __GNUC__ void redisLog(int level, const char *fmt, ...) diff --git a/src/scripting.c b/src/scripting.c index b94627c7f..104bd3dde 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -1042,6 +1042,7 @@ void scriptCommand(redisClient *c) { } addReplyBulkCBuffer(c,funcname+2,40); sdsfree(sha); + forceCommandPropagation(c,REDIS_PROPAGATE_REPL); } else if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"kill")) { if (server.lua_caller == NULL) { addReplySds(c,sdsnew("-NOTBUSY No scripts in execution right now.\r\n")); From a9e1c46f4071225efd64d088003007ec2e81c5c7 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 24 Jun 2013 10:26:04 +0200 Subject: [PATCH 0020/2500] Replication of scripts as EVALSHA: sha1 caching implemented. This code is only responsible to take an LRU-evicted fixed length cache of SHA1 that we are sure all the slaves received. In this commit only the implementation is provided, but the Redis core does not use it to actually send EVALSHA to slaves when possible. --- src/redis.c | 12 +++++++ src/redis.h | 5 +++ src/replication.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 96 insertions(+), 1 deletion(-) diff --git a/src/redis.c b/src/redis.c index d3d426c04..5928357ae 100644 --- a/src/redis.c +++ b/src/redis.c @@ -587,6 +587,18 @@ dictType migrateCacheDictType = { NULL /* val destructor */ }; +/* Replication cached script dict (server.repl_scriptcache_dict). + * Keys are sds SHA1 strings, while values are not used at all in the current + * implementation. 
*/ +dictType replScriptCacheDictType = { + dictSdsHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictSdsKeyCompare, /* key compare */ + dictSdsDestructor, /* key destructor */ + NULL /* val destructor */ +}; + int htNeedsResize(dict *dict) { long long size, used; diff --git a/src/redis.h b/src/redis.h index 0c0549198..e2cfbcccd 100644 --- a/src/redis.h +++ b/src/redis.h @@ -862,6 +862,10 @@ struct redisServer { int slave_priority; /* Reported in INFO and used by Sentinel. */ char repl_master_runid[REDIS_RUN_ID_SIZE+1]; /* Master run id for PSYNC. */ long long repl_master_initial_offset; /* Master PSYNC offset. */ + /* Replication script cache. */ + dict *repl_scriptcache_dict; /* SHA1 all slaves are aware of. */ + list *repl_scriptcache_fifo; /* First in, first out LRU eviction. */ + int repl_scriptcache_size; /* Max number of elements. */ /* Limits */ unsigned int maxclients; /* Max number of simultaneous clients */ unsigned long long maxmemory; /* Max number of memory bytes to use */ @@ -1012,6 +1016,7 @@ extern dictType dbDictType; extern dictType shaScriptObjectDictType; extern double R_Zero, R_PosInf, R_NegInf, R_Nan; extern dictType hashDictType; +extern dictType replScriptCacheDictType; /*----------------------------------------------------------------------------- * Functions prototypes diff --git a/src/replication.c b/src/replication.c index fa0eb2b70..e2d5b3ec4 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1432,7 +1432,85 @@ void refreshGoodSlavesCount(void) { server.repl_good_slaves_count = good; } -/* --------------------------- REPLICATION CRON ---------------------------- */ +/* ----------------------- REPLICATION SCRIPT CACHE -------------------------- + * The goal of this code is to keep track of scripts already sent to every + * connected slave, in order to be able to replicate EVALSHA as it is without + * translating it to EVAL every time it is possible. 
+ * + * We use a capped collection implemented by an hash table for fast lookup + * of scripts we can send as EVALSHA, plus a linked list that is used for + * eviction of the oldest entry when the max number of items is reached. + * + * We don't care about taking a different cache for every different slave + * since to fill the cache again is not very costly, the goal of this code + * is to avoid that the same big script is trasmitted a big number of times + * per second wasting bandwidth and processor speed, but it is not a problem + * if we need to rebuild the cache from scratch from time to time, every used + * script will need to be transmitted a single time to reappear in the cache. + * + * This is how the system works: + * + * 1) Every time a new slave connects, we flush the whole script cache. + * 2) We only send as EVALSHA what was sent to the master as EVALSHA, without + * trying to convert EVAL into EVALSHA specifically for slaves. + * 3) Every time we trasmit a script as EVAL to the slaves, we also add the + * corresponding SHA1 of the script into the cache as we are sure every + * slave knows about the script starting from now. + * 4) On SCRIPT FLUSH command, we replicate the command to all the slaves + * and at the same time flush the script cache. + * 5) When the last slave disconnects, flush the cache. + * 6) We handle SCRIPT LOAD as well since that's how scripts are loaded + * in the master sometimes. + */ + +/* Initialize the script cache, only called at startup. */ +void replicationScriptCacheInit(void) { + server.repl_scriptcache_size = 10000; + server.repl_scriptcache_dict = dictCreate(&replScriptCacheDictType,NULL); + server.repl_scriptcache_fifo = listCreate(); +} + +/* Empty the script cache. Should be called every time we are no longer sure + * that every slave knows about all the scripts in our set, for example + * every time a new slave connects to this master and performs a full + * resynchronization. 
There is no need to flush the cache when a partial + * resynchronization is performed. */ +void replicationScriptCacheFlush(void) { + dictEmpty(server.repl_scriptcache_dict); + listRelease(server.repl_scriptcache_fifo); + server.repl_scriptcache_fifo = listCreate(); +} + +/* Add an entry into the script cache, if we reach max number of entries the + * oldest is removed from the list. */ +void replicationScriptCacheAdd(sds sha1) { + int retval; + sds key = sdsdup(sha1); + + /* Evict oldest. */ + if (listLength(server.repl_scriptcache_fifo) == server.repl_scriptcache_size) + { + listNode *ln = listLast(server.repl_scriptcache_fifo); + sds oldest = listNodeValue(ln); + + retval = dictDelete(server.repl_scriptcache_dict,oldest); + redisAssert(retval == DICT_OK); + listDelNode(server.repl_scriptcache_fifo,ln); + } + + /* Add current. */ + retval = dictAdd(server.repl_scriptcache_dict,key,NULL); + listAddNodeHead(server.repl_scriptcache_fifo,key); + redisAssert(retval == DICT_OK); +} + +/* Returns non-zero if the specified entry exists inside the cache, that is, + * if all the slaves are aware of this script SHA1. */ +int replicationScriptCacheExists(sds sha1) { + return dictFetchValue(server.repl_scriptcache_dict,sha1) != NULL; +} + +/* --------------------------- REPLICATION CRON ----------------------------- */ /* Replication cron funciton, called 1 time per second. */ void replicationCron(void) { From eaebabe5648f635e487ee3d0a8dd8ab11bc3c5a7 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 24 Jun 2013 18:57:31 +0200 Subject: [PATCH 0021/2500] Use the RSC to replicate EVALSHA unmodified. This commit uses the Replication Script Cache in order to avoid translating EVALSHA into EVAL whenever possible for both the AOF and slaves. 
--- src/aof.c | 1 + src/redis.c | 4 ++-- src/redis.h | 4 ++++ src/replication.c | 33 ++++++++++++++++++++++++++++----- src/scripting.c | 38 ++++++++++++++++++++++++-------------- 5 files changed, 59 insertions(+), 21 deletions(-) diff --git a/src/aof.c b/src/aof.c index 9e602ce01..9ad85c536 100644 --- a/src/aof.c +++ b/src/aof.c @@ -998,6 +998,7 @@ int rewriteAppendOnlyFileBackground(void) { * accumulated by the parent into server.aof_rewrite_buf will start * with a SELECT statement and it will be safe to merge. */ server.aof_selected_db = -1; + replicationScriptCacheFlush(); return REDIS_OK; } return REDIS_OK; /* unreached */ diff --git a/src/redis.c b/src/redis.c index 5928357ae..fdd30d148 100644 --- a/src/redis.c +++ b/src/redis.c @@ -591,10 +591,10 @@ dictType migrateCacheDictType = { * Keys are sds SHA1 strings, while values are not used at all in the current * implementation. */ dictType replScriptCacheDictType = { - dictSdsHash, /* hash function */ + dictSdsCaseHash, /* hash function */ NULL, /* key dup */ NULL, /* val dup */ - dictSdsKeyCompare, /* key compare */ + dictSdsKeyCaseCompare, /* key compare */ dictSdsDestructor, /* key destructor */ NULL /* val destructor */ }; diff --git a/src/redis.h b/src/redis.h index e2cfbcccd..5d9dcd349 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1165,6 +1165,10 @@ void resizeReplicationBacklog(long long newsize); void replicationSetMaster(char *ip, int port); void replicationUnsetMaster(void); void refreshGoodSlavesCount(void); +void replicationScriptCacheInit(void); +void replicationScriptCacheFlush(void); +void replicationScriptCacheAdd(sds sha1); +int replicationScriptCacheExists(sds sha1); /* Generic persistence functions */ void startLoading(FILE *fp); diff --git a/src/replication.c b/src/replication.c index e2d5b3ec4..04a74dbdc 100644 --- a/src/replication.c +++ b/src/replication.c @@ -546,6 +546,8 @@ void syncCommand(redisClient *c) { return; } c->replstate = REDIS_REPL_WAIT_BGSAVE_END; + /* Flush the script 
cache for the new slave. */ + replicationScriptCacheFlush(); } if (server.repl_disable_tcp_nodelay) @@ -711,6 +713,11 @@ void updateSlavesWaitingBgsave(int bgsaveerr) { } } if (startbgsave) { + /* Since we are starting a new background save for one or more slaves, + * we flush the Replication Script Cache to use EVAL to propagate every + * new EVALSHA for the first time, since all the new slaves don't know + * about previous scripts. */ + replicationScriptCacheFlush(); if (rdbSaveBackground(server.rdb_filename) != REDIS_OK) { listIter li; @@ -1471,10 +1478,16 @@ void replicationScriptCacheInit(void) { } /* Empty the script cache. Should be called every time we are no longer sure - * that every slave knows about all the scripts in our set, for example - * every time a new slave connects to this master and performs a full - * resynchronization. There is no need to flush the cache when a partial - * resynchronization is performed. */ + * that every slave knows about all the scripts in our set, or when the + * current AOF "context" is no longer aware of the script. In general we + * should flush the cache: + * + * 1) Every time a new slave reconnects to this master and performs a + * full SYNC (PSYNC does not require flushing). + * 2) Every time an AOF rewrite is performed. + * 3) Every time we are left without slaves at all, and AOF is off, in order + * to reclaim otherwise unused memory. + */ void replicationScriptCacheFlush(void) { dictEmpty(server.repl_scriptcache_dict); listRelease(server.repl_scriptcache_fifo); @@ -1507,7 +1520,7 @@ void replicationScriptCacheAdd(sds sha1) { /* Returns non-zero if the specified entry exists inside the cache, that is, * if all the slaves are aware of this script SHA1. 
*/ int replicationScriptCacheExists(sds sha1) { - return dictFetchValue(server.repl_scriptcache_dict,sha1) != NULL; + return dictFind(server.repl_scriptcache_dict,sha1) != NULL; } /* --------------------------- REPLICATION CRON ----------------------------- */ @@ -1624,6 +1637,16 @@ void replicationCron(void) { } } + /* If AOF is disabled and we no longer have attached slaves, we can + * free our Replication Script Cache as there is no need to propagate + * EVALSHA at all. */ + if (listLength(server.slaves) == 0 && + server.aof_state == REDIS_AOF_OFF && + listLength(server.repl_scriptcache_fifo) != 0) + { + replicationScriptCacheFlush(); + } + /* Refresh the number of slaves with lag <= min-slaves-max-lag. */ refreshGoodSlavesCount(); } diff --git a/src/scripting.c b/src/scripting.c index 104bd3dde..f30956bd2 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -655,6 +655,10 @@ void scriptingInit(void) { * to global variables. */ scriptingEnableGlobalsProtection(lua); + /* Initialize the Replication Script Cache for EVALSHA propagation to + * slaves and AOF. */ + replicationScriptCacheInit(); + server.lua = lua; } @@ -931,23 +935,29 @@ void evalGenericCommand(redisClient *c, int evalsha) { luaReplyToRedisReply(c,lua); } - /* If we have slaves attached we want to replicate this command as - * EVAL instead of EVALSHA. We do this also in the AOF as currently there - * is no easy way to propagate a command in a different way in the AOF - * and in the replication link. + /* EVALSHA should be propagated to Slave and AOF file as full EVAL, unless + * we are sure that the script was already in the context of all the + * attached slaves *and* the current AOF file if enabled. * - * IMPROVEMENT POSSIBLE: - * 1) Replicate this command as EVALSHA in the AOF. - * 2) Remember what slave already received a given script, and replicate - * the EVALSHA against this slaves when possible. 
- */ + * To do so we use a cache of SHA1s of scripts that we already propagated + * as full EVAL, that's called the Replication Script Cache. + * + * For repliation, everytime a new slave attaches to the master, we need to + * flush our cache of scripts that can be replicated as EVALSHA, while + * for AOF we need to do so every time we rewrite the AOF file. */ if (evalsha) { - robj *script = dictFetchValue(server.lua_scripts,c->argv[1]->ptr); + if (!replicationScriptCacheExists(c->argv[1]->ptr)) { + /* This script is not in our script cache, replicate it as + * EVAL, then add it into the script cache, as from now on + * slaves and AOF know about it. */ + robj *script = dictFetchValue(server.lua_scripts,c->argv[1]->ptr); - redisAssertWithInfo(c,NULL,script != NULL); - rewriteClientCommandArgument(c,0, - resetRefCount(createStringObject("EVAL",4))); - rewriteClientCommandArgument(c,1,script); + replicationScriptCacheAdd(c->argv[1]->ptr); + redisAssertWithInfo(c,NULL,script != NULL); + rewriteClientCommandArgument(c,0, + resetRefCount(createStringObject("EVAL",4))); + rewriteClientCommandArgument(c,1,script); + } } } From 8338e50127854422e937dd520602b7f2f177d3ae Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 24 Jun 2013 19:27:49 +0200 Subject: [PATCH 0022/2500] Move Replication Script Cache initialization in safer place. It should be called just one time at startup and not every time the Lua scripting engine is re-initialized, otherwise memory is leaked. 
--- src/redis.c | 1 + src/scripting.c | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/redis.c b/src/redis.c index fdd30d148..21298e6eb 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1517,6 +1517,7 @@ void initServer() { } if (server.cluster_enabled) clusterInit(); + replicationScriptCacheInit(); scriptingInit(); slowlogInit(); bioInit(); diff --git a/src/scripting.c b/src/scripting.c index f30956bd2..ff94ae165 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -655,10 +655,6 @@ void scriptingInit(void) { * to global variables. */ scriptingEnableGlobalsProtection(lua); - /* Initialize the Replication Script Cache for EVALSHA propagation to - * slaves and AOF. */ - replicationScriptCacheInit(); - server.lua = lua; } From a4a60080b7f9c73921d899c85a99410721f02a07 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 25 Jun 2013 10:56:59 +0200 Subject: [PATCH 0023/2500] SCRIPT FLUSH comment minor pedantic improvement. --- src/scripting.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripting.c b/src/scripting.c index ff94ae165..11ed4c616 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -1020,7 +1020,7 @@ void scriptCommand(redisClient *c) { if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"flush")) { scriptingReset(); addReply(c,shared.ok); - server.dirty++; /* Replicating this command is a good idea. */ + server.dirty++; /* Propagating this command is a good idea. */ } else if (c->argc >= 2 && !strcasecmp(c->argv[1]->ptr,"exists")) { int j; From 32b465b9989298f8dd9678dc6d3dc07a7972c4de Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 25 Jun 2013 12:49:56 +0200 Subject: [PATCH 0024/2500] Force propagation of SCRIPT LOAD to AOF. 
--- src/scripting.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripting.c b/src/scripting.c index 11ed4c616..a794f34ff 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -1048,7 +1048,7 @@ void scriptCommand(redisClient *c) { } addReplyBulkCBuffer(c,funcname+2,40); sdsfree(sha); - forceCommandPropagation(c,REDIS_PROPAGATE_REPL); + forceCommandPropagation(c,REDIS_PROPAGATE_REPL|REDIS_PROPAGATE_AOF); } else if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"kill")) { if (server.lua_caller == NULL) { addReplySds(c,sdsnew("-NOTBUSY No scripts in execution right now.\r\n")); From c43286e5a9349683445ef44b8736a84e42090909 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 25 Jun 2013 15:13:14 +0200 Subject: [PATCH 0025/2500] Test: replication-3 test speedup in master-slave setup. --- tests/integration/replication-3.tcl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/integration/replication-3.tcl b/tests/integration/replication-3.tcl index e660bf4e5..57ed77c18 100644 --- a/tests/integration/replication-3.tcl +++ b/tests/integration/replication-3.tcl @@ -2,9 +2,12 @@ start_server {tags {"repl"}} { start_server {} { test {First server should have role slave after SLAVEOF} { r -1 slaveof [srv 0 host] [srv 0 port] - after 1000 - s -1 role - } {slave} + wait_for_condition 50 100 { + [s -1 master_link_status] eq {up} + } else { + fail "Replication not started." + } + } if {$::accurate} {set numops 50000} else {set numops 5000} From 31f34595c6a33aef18bf7f91910b68fa2385f60e Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 25 Jun 2013 15:32:37 +0200 Subject: [PATCH 0026/2500] Test: randomInt() behavior commented. --- tests/support/util.tcl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/support/util.tcl b/tests/support/util.tcl index 48d06b741..c5a6853b3 100644 --- a/tests/support/util.tcl +++ b/tests/support/util.tcl @@ -91,10 +91,12 @@ proc wait_for_sync r { } } +# Random integer between 0 and max (excluded). 
proc randomInt {max} { expr {int(rand()*$max)} } +# Random signed integer between -max and max (both extremes excluded). proc randomSignedInt {max} { set i [randomInt $max] if {rand() > 0.5} { From 1ff422dc47f36e1ac10e1cd445cb6d36c261d556 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 25 Jun 2013 15:35:48 +0200 Subject: [PATCH 0027/2500] Test: EVALSHA replication. --- tests/integration/replication-3.tcl | 50 +++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tests/integration/replication-3.tcl b/tests/integration/replication-3.tcl index 57ed77c18..48ffc6ec9 100644 --- a/tests/integration/replication-3.tcl +++ b/tests/integration/replication-3.tcl @@ -32,3 +32,53 @@ start_server {tags {"repl"}} { } } } + +start_server {tags {"repl"}} { + start_server {} { + test {First server should have role slave after SLAVEOF} { + r -1 slaveof [srv 0 host] [srv 0 port] + wait_for_condition 50 100 { + [s -1 master_link_status] eq {up} + } else { + fail "Replication not started." + } + } + + set numops 20000 ;# Enough to trigger the Script Cache LRU eviction. + + test {MASTER and SLAVE consistency with EVALSHA replication} { + array set oldsha {} + for {set j 0} {$j < $numops} {incr j} { + set key "key:$j" + # Make sure to create scripts that have different SHA1s + set script "return redis.call('incr','$key')" + set sha1 [r eval "return redis.sha1hex(\"$script\")" 0] + set oldsha($j) $sha1 + r eval $script 0 + set res [r evalsha $sha1 0] + assert {$res == 2} + # Additionally call one of the old scripts as well, at random. 
+ set res [r evalsha $oldsha([randomInt $j]) 0] + assert {$res > 2} + } + + wait_for_condition 50 100 { + [r dbsize] == $numops && + [r -1 dbsize] == $numops && + [r debug digest] eq [r -1 debug digest] + } else { + set csv1 [csvdump r] + set csv2 [csvdump {r -1}] + set fd [open /tmp/repldump1.txt w] + puts -nonewline $fd $csv1 + close $fd + set fd [open /tmp/repldump2.txt w] + puts -nonewline $fd $csv2 + close $fd + puts "Master - Slave inconsistency" + puts "Run diff -u against /tmp/repldump*.txt for more info" + + } + } + } +} From e9f50cb1a1f76d1f02e5b13b629752fd7dcbd503 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 25 Jun 2013 15:36:48 +0200 Subject: [PATCH 0028/2500] Flush the replication script cache after SCRIPT FLUSH. --- src/scripting.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scripting.c b/src/scripting.c index a794f34ff..baf585279 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -1020,6 +1020,7 @@ void scriptCommand(redisClient *c) { if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"flush")) { scriptingReset(); addReply(c,shared.ok); + replicationScriptCacheFlush(); server.dirty++; /* Propagating this command is a good idea. */ } else if (c->argc >= 2 && !strcasecmp(c->argv[1]->ptr,"exists")) { int j; From 600567383aa5890e5354b0526f29323452cc3a25 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 25 Jun 2013 15:49:07 +0200 Subject: [PATCH 0029/2500] Test: add some AOF testing to EVALSHA replication test. --- tests/integration/replication-3.tcl | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/integration/replication-3.tcl b/tests/integration/replication-3.tcl index 48ffc6ec9..0fcbad45b 100644 --- a/tests/integration/replication-3.tcl +++ b/tests/integration/replication-3.tcl @@ -46,6 +46,10 @@ start_server {tags {"repl"}} { set numops 20000 ;# Enough to trigger the Script Cache LRU eviction. + # While we are at it, enable AOF to test it will be consistent as well + # after the test. 
+ r config set appendonly yes + test {MASTER and SLAVE consistency with EVALSHA replication} { array set oldsha {} for {set j 0} {$j < $numops} {incr j} { @@ -60,6 +64,13 @@ start_server {tags {"repl"}} { # Additionally call one of the old scripts as well, at random. set res [r evalsha $oldsha([randomInt $j]) 0] assert {$res > 2} + + # Trigger an AOF rewrite while we are half-way, this also + # forces the flush of the script cache, and we will cover + # more code as a result. + if {$j == $numops / 2} { + catch {r bgrewriteaof} + } } wait_for_condition 50 100 { @@ -79,6 +90,12 @@ start_server {tags {"repl"}} { puts "Run diff -u against /tmp/repldump*.txt for more info" } + + set old_digest [r debug digest] + r config set appendonly no + r debug loadaof + set new_digest [r debug digest] + assert {$old_digest eq $new_digest} } } } From cdaacf03aac2e6ec2ba274977143cc535b632bff Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 26 Jun 2013 10:11:20 +0200 Subject: [PATCH 0030/2500] Don't disconnect pre PSYNC replication clients for timeout. Clients using SYNC to replicate are older implementations, such as redis-cli --slave, and are not designed to acknowledge the master with REPLCONF ACK commands, so we don't have any feedback and should not disconnect them on timeout. --- src/redis.h | 1 + src/replication.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/src/redis.h b/src/redis.h index 5d9dcd349..570a7deff 100644 --- a/src/redis.h +++ b/src/redis.h @@ -219,6 +219,7 @@ #define REDIS_MASTER_FORCE_REPLY (1<<13) /* Queue replies even if is master */ #define REDIS_FORCE_AOF (1<<14) /* Force AOF propagation of current cmd. */ #define REDIS_FORCE_REPL (1<<15) /* Force replication of current cmd. */ +#define REDIS_PRE_PSYNC_SLAVE (1<<16) /* Slave don't understand PSYNC. 
*/ /* Client request types */ #define REDIS_REQ_INLINE 1 diff --git a/src/replication.c b/src/replication.c index 04a74dbdc..196b8d8f3 100644 --- a/src/replication.c +++ b/src/replication.c @@ -505,6 +505,11 @@ void syncCommand(redisClient *c) { * resync. */ if (master_runid[0] != '?') server.stat_sync_partial_err++; } + } else { + /* If a slave uses SYNC, we are dealing with an old implementation + * of the replication protocol (like redis-cli --slave). Flag the client + * so that we don't expect to receive REPLCONF ACK feedbacks. */ + c->flags |= REDIS_PRE_PSYNC_SLAVE; } /* Full resynchronization. */ @@ -1606,6 +1611,7 @@ void replicationCron(void) { redisClient *slave = ln->value; if (slave->replstate != REDIS_REPL_ONLINE) continue; + if (slave->flags & REDIS_PRE_PSYNC_SLAVE) continue; if ((server.unixtime - slave->repl_ack_time) > server.repl_timeout) { char ip[32]; From 1065918c36ce8252364706e654e58dc3ea081bda Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 26 Jun 2013 15:19:06 +0200 Subject: [PATCH 0031/2500] function renamed: popcount_binary -> redisPopcount. --- src/bitops.c | 4 ++-- src/redis.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bitops.c b/src/bitops.c index 1c2e13ddc..c96a9e3c7 100644 --- a/src/bitops.c +++ b/src/bitops.c @@ -58,7 +58,7 @@ static int getBitOffsetFromArgument(redisClient *c, robj *o, size_t *offset) { /* Count number of bits set in the binary array pointed by 's' and long * 'count' bytes. The implementation of this function is required to * work with a input string length up to 512 MB. 
*/ -size_t popcount_binary(void *s, long count) { +size_t redisPopcount(void *s, long count) { size_t bits = 0; unsigned char *p; uint32_t *p4 = s; @@ -407,6 +407,6 @@ void bitcountCommand(redisClient *c) { } else { long bytes = end-start+1; - addReplyLongLong(c,popcount_binary(p+start,bytes)); + addReplyLongLong(c,redisPopcount(p+start,bytes)); } } diff --git a/src/redis.h b/src/redis.h index ccf978128..78d97ca22 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1029,7 +1029,7 @@ long long mstime(void); void getRandomHexChars(char *p, unsigned int len); uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l); void exitFromChild(int retcode); -size_t popcount_binary(void *s, long count); +size_t redisPopcount(void *s, long count); void redisSetProcTitle(char *title); /* networking.c -- Networking and Client related operations */ From e94b5b9359e80c80321465f56fa3fa2a909c75f7 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 27 Jun 2013 12:14:23 +0200 Subject: [PATCH 0032/2500] Allow SHUTDOWN in loading state. --- src/db.c | 6 ++++++ src/redis.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/db.c b/src/db.c index 64be530ee..02f8dd3a7 100644 --- a/src/db.c +++ b/src/db.c @@ -362,6 +362,12 @@ void shutdownCommand(redisClient *c) { return; } } + /* SHUTDOWN can be called even while the server is in "loading" state. + * When this happens we need to make sure no attempt is performed to save + * the dataset on shutdown (otherwise it could overwrite the current DB + * with half-read data). */ + if (server.loading) + flags = (flags & ~REDIS_SHUTDOWN_SAVE) | REDIS_SHUTDOWN_NOSAVE; if (prepareForShutdown(flags) == REDIS_OK) exit(0); addReplyError(c,"Errors trying to SHUTDOWN. 
Check logs."); } diff --git a/src/redis.c b/src/redis.c index cb1943a0d..c23978346 100644 --- a/src/redis.c +++ b/src/redis.c @@ -215,7 +215,7 @@ struct redisCommand redisCommandTable[] = { {"save",saveCommand,1,"ars",0,NULL,0,0,0,0,0}, {"bgsave",bgsaveCommand,1,"ar",0,NULL,0,0,0,0,0}, {"bgrewriteaof",bgrewriteaofCommand,1,"ar",0,NULL,0,0,0,0,0}, - {"shutdown",shutdownCommand,-1,"ar",0,NULL,0,0,0,0,0}, + {"shutdown",shutdownCommand,-1,"arl",0,NULL,0,0,0,0,0}, {"lastsave",lastsaveCommand,1,"rR",0,NULL,0,0,0,0,0}, {"type",typeCommand,2,"r",0,NULL,1,1,1,0,0}, {"multi",multiCommand,1,"rs",0,NULL,0,0,0,0,0}, From 9281336e6ba8ef8873b28dfb457c938b6bed4169 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 28 Jun 2013 16:39:49 +0200 Subject: [PATCH 0033/2500] ae.c event loop: API to resize the fd set size on the run. --- src/ae.c | 30 ++++++++++++++++++++++++++++++ src/ae.h | 2 ++ src/ae_epoll.c | 7 +++++++ src/ae_evport.c | 5 +++++ src/ae_kqueue.c | 8 +++++++- src/ae_select.c | 6 ++++++ 6 files changed, 57 insertions(+), 1 deletion(-) diff --git a/src/ae.c b/src/ae.c index 6ca9a5153..164f8fdeb 100644 --- a/src/ae.c +++ b/src/ae.c @@ -91,6 +91,36 @@ err: return NULL; } +/* Return the current set size. */ +int aeGetSetSize(aeEventLoop *eventLoop) { + return eventLoop->setsize; +} + +/* Resize the maximum set size of the event loop. + * If the requested set size is smaller than the current set size, but + * there is already a file descriptor in use that is >= the requested + * set size minus one, AE_ERR is returned and the operation is not + * performed at all. + * + * Otherwise AE_OK is returned and the operation is successful. 
*/ +int aeResizeSetSize(aeEventLoop *eventLoop, int setsize) { + int i; + + if (setsize == eventLoop->setsize) return AE_OK; + if (eventLoop->maxfd >= setsize) return AE_ERR; + if (aeApiResize(eventLoop,setsize) == -1) return AE_ERR; + + eventLoop->events = zrealloc(eventLoop->events,sizeof(aeFileEvent)*setsize); + eventLoop->fired = zrealloc(eventLoop->fired,sizeof(aeFiredEvent)*setsize); + eventLoop->setsize = setsize; + + /* Make sure that if we created new slots, they are initialized with + * an AE_NONE mask. */ + for (i = eventLoop->maxfd+1; i < setsize; i++) + eventLoop->events[i].mask = AE_NONE; + return AE_OK; +} + void aeDeleteEventLoop(aeEventLoop *eventLoop) { aeApiFree(eventLoop); zfree(eventLoop->events); diff --git a/src/ae.h b/src/ae.h index 4d8950242..15ca1b5e7 100644 --- a/src/ae.h +++ b/src/ae.h @@ -114,5 +114,7 @@ int aeWait(int fd, int mask, long long milliseconds); void aeMain(aeEventLoop *eventLoop); char *aeGetApiName(void); void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep); +int aeGetSetSize(aeEventLoop *eventLoop); +int aeResizeSetSize(aeEventLoop *eventLoop, int setsize); #endif diff --git a/src/ae_epoll.c b/src/ae_epoll.c index 4823c281e..41af3e874 100644 --- a/src/ae_epoll.c +++ b/src/ae_epoll.c @@ -55,6 +55,13 @@ static int aeApiCreate(aeEventLoop *eventLoop) { return 0; } +static int aeApiResize(aeEventLoop *eventLoop, int setsize) { + aeApiState *state = eventLoop->apidata; + + state->events = zrealloc(state->events, sizeof(struct epoll_event)*setsize); + return 0; +} + static void aeApiFree(aeEventLoop *eventLoop) { aeApiState *state = eventLoop->apidata; diff --git a/src/ae_evport.c b/src/ae_evport.c index 94413c132..5c317becb 100644 --- a/src/ae_evport.c +++ b/src/ae_evport.c @@ -94,6 +94,11 @@ static int aeApiCreate(aeEventLoop *eventLoop) { return 0; } +static int aeApiResize(aeEventLoop *eventLoop, int setsize) { + /* Nothing to resize here. 
*/ + return 0; +} + static void aeApiFree(aeEventLoop *eventLoop) { aeApiState *state = eventLoop->apidata; diff --git a/src/ae_kqueue.c b/src/ae_kqueue.c index 458772f7e..dbcc5805f 100644 --- a/src/ae_kqueue.c +++ b/src/ae_kqueue.c @@ -54,10 +54,16 @@ static int aeApiCreate(aeEventLoop *eventLoop) { return -1; } eventLoop->apidata = state; - return 0; } +static int aeApiResize(aeEventLoop *eventLoop, int setsize) { + aeApiState *state = eventLoop->apidata; + + state->events = zrealloc(state->events, sizeof(struct kevent)*setsize); + return 0; +} + static void aeApiFree(aeEventLoop *eventLoop) { aeApiState *state = eventLoop->apidata; diff --git a/src/ae_select.c b/src/ae_select.c index f732e8e1e..e2b7a9e8a 100644 --- a/src/ae_select.c +++ b/src/ae_select.c @@ -48,6 +48,12 @@ static int aeApiCreate(aeEventLoop *eventLoop) { return 0; } +static int aeApiResize(aeEventLoop *eventLoop, int setsize) { + /* Just ensure we have enough room in the fd_set type. */ + if (setsize >= FD_SETSIZE) return -1; + return 0; +} + static void aeApiFree(aeEventLoop *eventLoop) { zfree(eventLoop->apidata); } From f4805fa45a4db6fa114b8e4d964a0de7483bee78 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 28 Jun 2013 17:08:03 +0200 Subject: [PATCH 0034/2500] CONFIG SET maxclients. --- src/config.c | 31 ++++++++++++++++++++++++++++--- src/redis.c | 2 +- src/redis.h | 5 +++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/src/config.c b/src/config.c index cd0ef04e2..ae1c7abc3 100644 --- a/src/config.c +++ b/src/config.c @@ -559,10 +559,35 @@ void configSetCommand(redisClient *c) { } freeMemoryIfNeeded(); } + } else if (!strcasecmp(c->argv[2]->ptr,"maxclients")) { + int orig_value = server.maxclients; + + if (getLongLongFromObject(o,&ll) == REDIS_ERR || ll < 0) goto badfmt; + + /* Try to check if the OS is capable of supporting so many FDs. 
*/ + server.maxclients = ll; + if (ll > orig_value) { + adjustOpenFilesLimit(); + if (server.maxclients != ll) { + addReplyErrorFormat(c,"The operating system is not able to handle the specified number of clients, try with %d", server.maxclients); + server.maxclients = orig_value; + return; + } + if (aeGetSetSize(server.el) < + server.maxclients + REDIS_EVENTLOOP_FDSET_INCR) + { + if (aeResizeSetSize(server.el, + server.maxclients + REDIS_EVENTLOOP_FDSET_INCR) == AE_ERR) + { + addReplyError(c,"The event loop API used by Redis is not able to handle the specified number of clients"); + server.maxclients = orig_value; + return; + } + } + } } else if (!strcasecmp(c->argv[2]->ptr,"hz")) { - if (getLongLongFromObject(o,&ll) == REDIS_ERR || - ll < 0) goto badfmt; - server.hz = (int) ll; + if (getLongLongFromObject(o,&ll) == REDIS_ERR || ll < 0) goto badfmt; + server.hz = ll; if (server.hz < REDIS_MIN_HZ) server.hz = REDIS_MIN_HZ; if (server.hz > REDIS_MAX_HZ) server.hz = REDIS_MAX_HZ; } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory-policy")) { diff --git a/src/redis.c b/src/redis.c index c23978346..e16c9bb2e 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1422,7 +1422,7 @@ void initServer() { createSharedObjects(); adjustOpenFilesLimit(); - server.el = aeCreateEventLoop(server.maxclients+1024); + server.el = aeCreateEventLoop(server.maxclients+REDIS_EVENTLOOP_FDSET_INCR); server.db = zmalloc(sizeof(redisDb)*server.dbnum); if (server.port != 0) { diff --git a/src/redis.h b/src/redis.h index 78d97ca22..dade5e2a2 100644 --- a/src/redis.h +++ b/src/redis.h @@ -130,6 +130,10 @@ #define REDIS_MBULK_BIG_ARG (1024*32) #define REDIS_LONGSTR_SIZE 21 /* Bytes needed for long -> str */ #define REDIS_AOF_AUTOSYNC_BYTES (1024*1024*32) /* fdatasync every 32MB */ +/* When configuring the Redis eventloop, we setup it so that the total number + * of file descriptors we can handle are server.maxclients + FDSET_INCR + * that is our safety margin. 
*/ +#define REDIS_EVENTLOOP_FDSET_INCR 128 /* Hash table parameters */ #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */ @@ -1239,6 +1243,7 @@ int htNeedsResize(dict *dict); void oom(const char *msg); void populateCommandTable(void); void resetCommandTableStats(void); +void adjustOpenFilesLimit(void); /* Set data type */ robj *setTypeCreate(robj *value); From f57a871de008de559baf23a4d9215d912987d073 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 2 Jul 2013 11:56:52 +0200 Subject: [PATCH 0035/2500] getAbsolutePath() moved into utils.c --- src/redis.c | 52 ---------------------------------------------------- src/util.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/util.h | 3 +++ 3 files changed, 55 insertions(+), 52 deletions(-) diff --git a/src/redis.c b/src/redis.c index e16c9bb2e..81601273e 100644 --- a/src/redis.c +++ b/src/redis.c @@ -2844,58 +2844,6 @@ void redisSetProcTitle(char *title) { #endif } -/* Given the filename, return the absolute path as an SDS string, or NULL - * if it fails for some reason. Note that "filename" may be an absolute path - * already, this will be detected and handled correctly. - * - * The function does not try to normalize everything, but only the obvious - * case of one or more "../" appearning at the start of "filename" - * relative path. */ -sds getAbsolutePath(char *filename) { - char cwd[1024]; - sds abspath; - sds relpath = sdsnew(filename); - - relpath = sdstrim(relpath," \r\n\t"); - if (relpath[0] == '/') return relpath; /* Path is already absolute. */ - - /* If path is relative, join cwd and relative path. */ - if (getcwd(cwd,sizeof(cwd)) == NULL) { - sdsfree(relpath); - return NULL; - } - abspath = sdsnew(cwd); - if (sdslen(abspath) && abspath[sdslen(abspath)-1] != '/') - abspath = sdscat(abspath,"/"); - - /* At this point we have the current path always ending with "/", and - * the trimmed relative path. Try to normalize the obvious case of - * trailing ../ elements at the start of the path. 
- * - * For every "../" we find in the filename, we remove it and also remove - * the last element of the cwd, unless the current cwd is "/". */ - while (sdslen(relpath) >= 3 && - relpath[0] == '.' && relpath[1] == '.' && relpath[2] == '/') - { - relpath = sdsrange(relpath,3,-1); - if (sdslen(abspath) > 1) { - char *p = abspath + sdslen(abspath)-2; - int trimlen = 1; - - while(*p != '/') { - p--; - trimlen++; - } - abspath = sdsrange(abspath,0,-(trimlen+1)); - } - } - - /* Finally glue the two parts together. */ - abspath = sdscatsds(abspath,relpath); - sdsfree(relpath); - return abspath; -} - int main(int argc, char **argv) { struct timeval tv; diff --git a/src/util.c b/src/util.c index 24f936b66..4b77e9fef 100644 --- a/src/util.c +++ b/src/util.c @@ -405,6 +405,58 @@ void getRandomHexChars(char *p, unsigned int len) { fclose(fp); } +/* Given the filename, return the absolute path as an SDS string, or NULL + * if it fails for some reason. Note that "filename" may be an absolute path + * already, this will be detected and handled correctly. + * + * The function does not try to normalize everything, but only the obvious + * case of one or more "../" appearning at the start of "filename" + * relative path. */ +sds getAbsolutePath(char *filename) { + char cwd[1024]; + sds abspath; + sds relpath = sdsnew(filename); + + relpath = sdstrim(relpath," \r\n\t"); + if (relpath[0] == '/') return relpath; /* Path is already absolute. */ + + /* If path is relative, join cwd and relative path. */ + if (getcwd(cwd,sizeof(cwd)) == NULL) { + sdsfree(relpath); + return NULL; + } + abspath = sdsnew(cwd); + if (sdslen(abspath) && abspath[sdslen(abspath)-1] != '/') + abspath = sdscat(abspath,"/"); + + /* At this point we have the current path always ending with "/", and + * the trimmed relative path. Try to normalize the obvious case of + * trailing ../ elements at the start of the path. 
+ * + * For every "../" we find in the filename, we remove it and also remove + * the last element of the cwd, unless the current cwd is "/". */ + while (sdslen(relpath) >= 3 && + relpath[0] == '.' && relpath[1] == '.' && relpath[2] == '/') + { + relpath = sdsrange(relpath,3,-1); + if (sdslen(abspath) > 1) { + char *p = abspath + sdslen(abspath)-2; + int trimlen = 1; + + while(*p != '/') { + p--; + trimlen++; + } + abspath = sdsrange(abspath,0,-(trimlen+1)); + } + } + + /* Finally glue the two parts together. */ + abspath = sdscatsds(abspath,relpath); + sdsfree(relpath); + return abspath; +} + #ifdef UTIL_TEST_MAIN #include diff --git a/src/util.h b/src/util.h index 48425245a..8e9b0281d 100644 --- a/src/util.h +++ b/src/util.h @@ -30,6 +30,8 @@ #ifndef __REDIS_UTIL_H #define __REDIS_UTIL_H +#include "sds.h" + int stringmatchlen(const char *p, int plen, const char *s, int slen, int nocase); int stringmatch(const char *p, const char *s, int nocase); long long memtoll(const char *p, int *err); @@ -37,5 +39,6 @@ int ll2string(char *s, size_t len, long long value); int string2ll(const char *s, size_t slen, long long *value); int string2l(const char *s, size_t slen, long *value); int d2string(char *buf, size_t len, double value); +sds getAbsolutePath(char *filename); #endif From 0726c259006c733612e90bcca27a73eb3729ae81 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 2 Jul 2013 12:08:07 +0200 Subject: [PATCH 0036/2500] pathIsBaseName() added to utils.c The function is used to test that the specified string looks like just as the basename of a path, without any absolute or relative path. --- src/util.c | 8 ++++++++ src/util.h | 1 + 2 files changed, 9 insertions(+) diff --git a/src/util.c b/src/util.c index 4b77e9fef..022a6adf4 100644 --- a/src/util.c +++ b/src/util.c @@ -457,6 +457,14 @@ sds getAbsolutePath(char *filename) { return abspath; } +/* Return true if the specified path is just a file basename without any + * relative or absolute path. 
This function just checks that no / or \ + * character exists inside the specified path, that's enough in the + * environments where Redis runs. */ +int pathIsBaseName(char *path) { + return strchr(path,'/') == NULL && strchr(path,'\\') == NULL; +} + #ifdef UTIL_TEST_MAIN #include diff --git a/src/util.h b/src/util.h index 8e9b0281d..b3667cd6f 100644 --- a/src/util.h +++ b/src/util.h @@ -40,5 +40,6 @@ int string2ll(const char *s, size_t slen, long long *value); int string2l(const char *s, size_t slen, long *value); int d2string(char *buf, size_t len, double value); sds getAbsolutePath(char *filename); +int pathIsBaseName(char *path); #endif From 37b5ec9425339fcc0321c817fc774878d5fe3e05 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 2 Jul 2013 12:14:28 +0200 Subject: [PATCH 0037/2500] Only allow basenames for dbfilename and appendfilename. This fixes issue #1094. --- src/config.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/config.c b/src/config.c index ae1c7abc3..01bcafb7a 100644 --- a/src/config.c +++ b/src/config.c @@ -306,6 +306,10 @@ void loadServerConfigFromString(char *config) { } server.aof_state = yes ? 
REDIS_AOF_ON : REDIS_AOF_OFF; } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) { + if (!pathIsBaseName(argv[1])) { + err = "appendfilename can't be a path, just a filename"; + goto loaderr; + } zfree(server.aof_filename); server.aof_filename = zstrdup(argv[1]); } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite") @@ -352,6 +356,10 @@ void loadServerConfigFromString(char *config) { zfree(server.pidfile); server.pidfile = zstrdup(argv[1]); } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) { + if (!pathIsBaseName(argv[1])) { + err = "dbfilename can't be a path, just a filename"; + goto loaderr; + } zfree(server.rdb_filename); server.rdb_filename = zstrdup(argv[1]); } else if (!strcasecmp(argv[0],"hash-max-ziplist-entries") && argc == 2) { @@ -540,6 +548,10 @@ void configSetCommand(redisClient *c) { o = c->argv[3]; if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) { + if (!pathIsBaseName(o->ptr)) { + addReplyError(c, "dbfilename can't be a path, just a filename"); + return; + } zfree(server.rdb_filename); server.rdb_filename = zstrdup(o->ptr); } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) { From 4a6701d7ec2f8faf15578d77b7e911747a73185e Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 2 Jul 2013 17:44:42 +0200 Subject: [PATCH 0038/2500] pqsort.c: remove the "switch to insertion sort" optimization. It causes catastrophic performance for certain inputs. Relevant NetBSD commit: http://cvsweb.netbsd.org/bsdweb.cgi/src/lib/libc/stdlib/qsort.c?rev=1.20&content-type=text/x-cvsweb-markup&only_with_tag=MAIN This fixes issue #968. 
--- src/pqsort.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/pqsort.c b/src/pqsort.c index 9c57aacd0..57c217f94 100644 --- a/src/pqsort.c +++ b/src/pqsort.c @@ -102,10 +102,9 @@ _pqsort(void *a, size_t n, size_t es, { char *pa, *pb, *pc, *pd, *pl, *pm, *pn; size_t d, r; - int swaptype, swap_cnt, cmp_result; + int swaptype, cmp_result; loop: SWAPINIT(a, es); - swap_cnt = 0; if (n < 7) { for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es) for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0; @@ -132,7 +131,6 @@ loop: SWAPINIT(a, es); for (;;) { while (pb <= pc && (cmp_result = cmp(pb, a)) <= 0) { if (cmp_result == 0) { - swap_cnt = 1; swap(pa, pb); pa += es; } @@ -140,7 +138,6 @@ loop: SWAPINIT(a, es); } while (pb <= pc && (cmp_result = cmp(pc, a)) >= 0) { if (cmp_result == 0) { - swap_cnt = 1; swap(pc, pd); pd -= es; } @@ -149,17 +146,9 @@ loop: SWAPINIT(a, es); if (pb > pc) break; swap(pb, pc); - swap_cnt = 1; pb += es; pc -= es; } - if (swap_cnt == 0) { /* Switch to insertion sort */ - for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es) - for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0; - pl -= es) - swap(pl, pl - es); - return; - } pn = (char *) a + n * es; r = min(pa - (char *) a, pb - pa); From 703cd738dcd3b8f6def30f48637a7c8e352692e0 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 3 Jul 2013 11:59:44 +0200 Subject: [PATCH 0039/2500] redis-cli --pipe: send final ECHO in a safer way. If the protocol read from stdin happened to contain garbage (invalid random chars), in the previous implementation it was possible to end up with something like: dksfjdksjflskfjl*2\r\n$4\r\nECHO.... That is invalid as the *2 should start on a new line. Now we prefix the ECHO with a CRLF that has no effect on the server but prevents this issue most of the time. 
Of course if the offending wrong sequence is something like: $3248772349\r\n No one is going to save us as Redis will wait for data in the context of a big argument, so this fix does not cover all the cases. This partially fixes issue #681. --- src/redis-cli.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/redis-cli.c b/src/redis-cli.c index 5914fd2c4..8cf1c4646 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -1204,8 +1204,12 @@ static void pipeMode(void) { ssize_t nread = read(STDIN_FILENO,obuf,sizeof(obuf)); if (nread == 0) { + /* The ECHO sequence starts with a "\r\n" so that if there + * is garbage in the protocol we read from stdin, the ECHO + * will likely still be properly formatted. + * CRLF is ignored by Redis, so it has no effects. */ char echo[] = - "*2\r\n$4\r\nECHO\r\n$20\r\n01234567890123456789\r\n"; + "\r\n*2\r\n$4\r\nECHO\r\n$20\r\n01234567890123456789\r\n"; int j; eof = 1; @@ -1214,7 +1218,7 @@ static void pipeMode(void) { * to make sure everything was read from the server. */ for (j = 0; j < 20; j++) magic[j] = rand() & 0xff; - memcpy(echo+19,magic,20); + memcpy(echo+21,magic,20); memcpy(obuf,echo,sizeof(echo)-1); obuf_len = sizeof(echo)-1; obuf_pos = 0; From 01430e6599e79a6642ad3220c19efd633d01d3cd Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 3 Jul 2013 12:18:55 +0200 Subject: [PATCH 0040/2500] redis-cli: introduced --pipe-timeout. When in --pipe mode, after all the data transfer to the server is complete, redis-cli now waits at most the specified number of seconds (30 by default, use 0 to wait forever) without receiving any reply at all from the server. After this time limit the operation is aborted with an error. That's related to issue #681. 
--- src/redis-cli.c | 74 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 25 deletions(-) diff --git a/src/redis-cli.c b/src/redis-cli.c index 8cf1c4646..d52076f0b 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -58,6 +58,7 @@ #define OUTPUT_RAW 1 #define OUTPUT_CSV 2 #define REDIS_CLI_KEEPALIVE_INTERVAL 15 /* seconds */ +#define REDIS_DEFAULT_PIPE_TIMEOUT 30 /* seconds */ static redisContext *context; static struct config { @@ -77,6 +78,7 @@ static struct config { int cluster_reissue_command; int slave_mode; int pipe_mode; + int pipe_timeout; int getrdb_mode; int stat_mode; char *rdb_filename; @@ -714,6 +716,8 @@ static int parseOptions(int argc, char **argv) { config.rdb_filename = argv[++i]; } else if (!strcmp(argv[i],"--pipe")) { config.pipe_mode = 1; + } else if (!strcmp(argv[i],"--pipe-timeout") && !lastarg) { + config.pipe_timeout = atoi(argv[++i]); } else if (!strcmp(argv[i],"--bigkeys")) { config.bigkeys = 1; } else if (!strcmp(argv[i],"--eval") && !lastarg) { @@ -766,29 +770,32 @@ static void usage() { "redis-cli %s\n" "\n" "Usage: redis-cli [OPTIONS] [cmd [arg [arg ...]]]\n" -" -h Server hostname (default: 127.0.0.1)\n" -" -p Server port (default: 6379)\n" -" -s Server socket (overrides hostname and port)\n" -" -a Password to use when connecting to the server\n" -" -r Execute specified command N times\n" -" -i When -r is used, waits seconds per command.\n" -" It is possible to specify sub-second times like -i 0.1\n" -" -n Database number\n" -" -x Read last argument from STDIN\n" -" -d Multi-bulk delimiter in for raw formatting (default: \\n)\n" -" -c Enable cluster mode (follow -ASK and -MOVED redirections)\n" -" --raw Use raw formatting for replies (default when STDOUT is\n" -" not a tty)\n" -" --latency Enter a special mode continuously sampling latency\n" -" --latency-history Like --latency but tracking latency changes over time.\n" -" Default time interval is 15 sec. 
Change it using -i.\n" -" --slave Simulate a slave showing commands received from the master\n" -" --rdb Transfer an RDB dump from remote server to local file.\n" -" --pipe Transfer raw Redis protocol from stdin to server\n" -" --bigkeys Sample Redis keys looking for big keys\n" -" --eval Send an EVAL command using the Lua script at \n" -" --help Output this help and exit\n" -" --version Output version and exit\n" +" -h Server hostname (default: 127.0.0.1)\n" +" -p Server port (default: 6379)\n" +" -s Server socket (overrides hostname and port)\n" +" -a Password to use when connecting to the server\n" +" -r Execute specified command N times\n" +" -i When -r is used, waits seconds per command.\n" +" It is possible to specify sub-second times like -i 0.1\n" +" -n Database number\n" +" -x Read last argument from STDIN\n" +" -d Multi-bulk delimiter in for raw formatting (default: \\n)\n" +" -c Enable cluster mode (follow -ASK and -MOVED redirections)\n" +" --raw Use raw formatting for replies (default when STDOUT is\n" +" not a tty)\n" +" --latency Enter a special mode continuously sampling latency\n" +" --latency-history Like --latency but tracking latency changes over time.\n" +" Default time interval is 15 sec. Change it using -i.\n" +" --slave Simulate a slave showing commands received from the master\n" +" --rdb Transfer an RDB dump from remote server to local file.\n" +" --pipe Transfer raw Redis protocol from stdin to server\n" +" --pipe-timeout In --pipe mode, abort with error if after sending all data\n" +" no reply is received within seconds.\n" +" Default timeout: %d. 
Use 0 to wait forever.\n" +" --bigkeys Sample Redis keys looking for big keys\n" +" --eval Send an EVAL command using the Lua script at \n" +" --help Output this help and exit\n" +" --version Output version and exit\n" "\n" "Examples:\n" " cat /etc/passwd | redis-cli -x set mypasswd\n" @@ -801,7 +808,7 @@ static void usage() { "When no command is given, redis-cli starts in interactive mode.\n" "Type \"help\" in interactive mode for information on available commands.\n" "\n", - version); + version, REDIS_DEFAULT_PIPE_TIMEOUT); sdsfree(version); exit(1); } @@ -1119,6 +1126,7 @@ static void pipeMode(void) { int eof = 0; /* True once we consumed all the standard input. */ int done = 0; char magic[20]; /* Special reply we recognize. */ + time_t last_read_time = time(NULL); srand(time(NULL)); @@ -1149,7 +1157,10 @@ static void pipeMode(void) { strerror(errno)); exit(1); } - if (nread > 0) redisReaderFeed(reader,ibuf,nread); + if (nread > 0) { + redisReaderFeed(reader,ibuf,nread); + last_read_time = time(NULL); + } } while(nread > 0); /* Consume replies. */ @@ -1235,6 +1246,18 @@ static void pipeMode(void) { if (obuf_len == 0 && eof) break; } } + + /* Handle timeout, that is, we reached EOF, and we are not getting + * replies from the server for a few seconds, nor the final ECHO is + * received. 
*/ + if (eof && config.pipe_timeout > 0 && + time(NULL)-last_read_time > config.pipe_timeout) + { + fprintf(stderr,"No replies for %d seconds: exiting.\n", + config.pipe_timeout); + errors++; + break; + } } redisReaderFree(reader); printf("errors: %lld, replies: %lld\n", errors, replies); @@ -1490,6 +1513,7 @@ int main(int argc, char **argv) { config.getrdb_mode = 0; config.rdb_filename = NULL; config.pipe_mode = 0; + config.pipe_timeout = REDIS_DEFAULT_PIPE_TIMEOUT; config.bigkeys = 0; config.stdinarg = 0; config.auth = NULL; From de40c9df4f83c71198fcb47265dbf6fd992056f9 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 4 Jul 2013 18:30:59 +0200 Subject: [PATCH 0041/2500] sds.c: new function sdsjoin() to join strings. --- src/sds.c | 13 +++++++++++++ src/sds.h | 1 + 2 files changed, 14 insertions(+) diff --git a/src/sds.c b/src/sds.c index 4cf700b9a..aa45fc4f5 100644 --- a/src/sds.c +++ b/src/sds.c @@ -621,6 +621,19 @@ sds sdsmapchars(sds s, const char *from, const char *to, size_t setlen) { return s; } +/* Join an array of C strings using the specified separator (also a C string). + * Returns the result as an sds string. 
*/ +sds sdsjoin(char **argv, int argc, char *sep) { + sds join = sdsempty(); + int j; + + for (j = 0; j < argc; j++) { + join = sdscat(join, argv[j]); + if (j != argc-1) join = sdscat(join,sep); + } + return join; +} + #ifdef SDS_TEST_MAIN #include #include "testhelp.h" diff --git a/src/sds.h b/src/sds.h index c5a4f30a9..46d914fd1 100644 --- a/src/sds.h +++ b/src/sds.h @@ -89,6 +89,7 @@ sds sdsfromlonglong(long long value); sds sdscatrepr(sds s, const char *p, size_t len); sds *sdssplitargs(const char *line, int *argc); sds sdsmapchars(sds s, const char *from, const char *to, size_t setlen); +sds sdsjoin(char **argv, int argc, char *sep); /* Low level functions exposed to the user API */ sds sdsMakeRoomFor(sds s, size_t addlen); From e0be252d5ea895ed0adb43d8d14f2bee233c6953 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 4 Jul 2013 18:48:46 +0200 Subject: [PATCH 0042/2500] anet.c: Allow creation of TCP listening sockets bound to N addresses. --- src/anet.c | 23 +++++++++++++++-------- src/anet.h | 2 +- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/anet.c b/src/anet.c index 963b6688e..7a6b7d4bf 100644 --- a/src/anet.c +++ b/src/anet.c @@ -331,9 +331,9 @@ static int anetListen(char *err, int s, struct sockaddr *sa, socklen_t len) { return ANET_OK; } -int anetTcpServer(char *err, int port, char *bindaddr) +int anetTcpServer(char *err, int port, char **bindaddr, int bindaddr_count) { - int s; + int s, j; struct sockaddr_in sa; if ((s = anetCreateSocket(err,AF_INET)) == ANET_ERR) @@ -343,13 +343,20 @@ int anetTcpServer(char *err, int port, char *bindaddr) sa.sin_family = AF_INET; sa.sin_port = htons(port); sa.sin_addr.s_addr = htonl(INADDR_ANY); - if (bindaddr && inet_aton(bindaddr, &sa.sin_addr) == 0) { - anetSetError(err, "invalid bind address"); - close(s); - return ANET_ERR; + if (bindaddr_count) { + for (j = 0; j < bindaddr_count; j++) { + if (inet_aton(bindaddr[j], &sa.sin_addr) == 0) { + anetSetError(err, "invalid bind address"); + 
close(s); + return ANET_ERR; + } + if (anetListen(err,s,(struct sockaddr*)&sa,sizeof(sa)) == ANET_ERR) + return ANET_ERR; + } + } else { + if (anetListen(err,s,(struct sockaddr*)&sa,sizeof(sa)) == ANET_ERR) + return ANET_ERR; } - if (anetListen(err,s,(struct sockaddr*)&sa,sizeof(sa)) == ANET_ERR) - return ANET_ERR; return s; } diff --git a/src/anet.h b/src/anet.h index 696c2c225..bf76dd24d 100644 --- a/src/anet.h +++ b/src/anet.h @@ -45,7 +45,7 @@ int anetUnixConnect(char *err, char *path); int anetUnixNonBlockConnect(char *err, char *path); int anetRead(int fd, char *buf, int count); int anetResolve(char *err, char *host, char *ipbuf); -int anetTcpServer(char *err, int port, char *bindaddr); +int anetTcpServer(char *err, int port, char **bindaddr, int bindaddr_count); int anetUnixServer(char *err, char *path, mode_t perm); int anetTcpAccept(char *err, int serversock, char *ip, int *port); int anetUnixAccept(char *err, int serversock); From c5e87a13de69dba4f9d9017271ba59fe36231144 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 4 Jul 2013 18:49:49 +0200 Subject: [PATCH 0043/2500] Cluster: use new anet.c listening socket creation API. --- src/cluster.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index 2f77941d4..670aedcb7 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -252,7 +252,8 @@ void clusterInit(void) { if (saveconf) clusterSaveConfigOrDie(); /* We need a listening TCP port for our cluster messaging needs */ server.cfd = anetTcpServer(server.neterr, - server.port+REDIS_CLUSTER_PORT_INCR, server.bindaddr); + server.port+REDIS_CLUSTER_PORT_INCR, server.bindaddr, + server.bindaddr_count); if (server.cfd == -1) { redisLog(REDIS_WARNING, "Opening cluster TCP port: %s", server.neterr); exit(1); From bd9b1251acb85d39a57f9c7758d525132ab5f1cf Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 4 Jul 2013 18:50:15 +0200 Subject: [PATCH 0044/2500] Ability to bind multiple addresses. 
--- src/config.c | 41 +++++++++++++++++++++++++++++++++++++---- src/redis.c | 6 +++--- src/redis.h | 4 +++- 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/src/config.c b/src/config.c index 01bcafb7a..78b458dcc 100644 --- a/src/config.c +++ b/src/config.c @@ -125,8 +125,15 @@ void loadServerConfigFromString(char *config) { if (server.port < 0 || server.port > 65535) { err = "Invalid port"; goto loaderr; } - } else if (!strcasecmp(argv[0],"bind") && argc == 2) { - server.bindaddr = zstrdup(argv[1]); + } else if (!strcasecmp(argv[0],"bind") && argc >= 2) { + int j, addresses = argc-1; + + if (addresses > REDIS_BINDADDR_MAX) { + err = "Too many bind addresses specified"; goto loaderr; + } + for (j = 0; j < addresses; j++) + server.bindaddr[j] = zstrdup(argv[j+1]); + server.bindaddr_count = addresses; } else if (!strcasecmp(argv[0],"unixsocket") && argc == 2) { server.unixsocket = zstrdup(argv[1]); } else if (!strcasecmp(argv[0],"unixsocketperm") && argc == 2) { @@ -917,7 +924,6 @@ void configGetCommand(redisClient *c) { config_get_string_field("dbfilename",server.rdb_filename); config_get_string_field("requirepass",server.requirepass); config_get_string_field("masterauth",server.masterauth); - config_get_string_field("bind",server.bindaddr); config_get_string_field("unixsocket",server.unixsocket); config_get_string_field("logfile",server.logfile); config_get_string_field("pidfile",server.pidfile); @@ -1104,6 +1110,14 @@ void configGetCommand(redisClient *c) { decrRefCount(flagsobj); matches++; } + if (stringmatch(pattern,"bind",0)) { + sds aux = sdsjoin(server.bindaddr,server.bindaddr_count," "); + + addReplyBulkCString(c,"bind"); + addReplyBulkCString(c,aux); + sdsfree(aux); + matches++; + } setDeferredMultiBulkLength(c,replylen,matches*2); } @@ -1495,6 +1509,25 @@ void rewriteConfigClientoutputbufferlimitOption(struct rewriteConfigState *state } } +/* Rewrite the bind option. 
*/ +void rewriteConfigBindOption(struct rewriteConfigState *state) { + int force = 1; + sds line, addresses; + char *option = "bind"; + + /* Nothing to rewrite if we don't have bind addresses. */ + if (server.bindaddr_count == 0) return; + + /* Rewrite as bind ... */ + addresses = sdsjoin(server.bindaddr,server.bindaddr_count," "); + line = sdsnew(option); + line = sdscatlen(line, " ", 1); + line = sdscatsds(line, addresses); + sdsfree(addresses); + + rewriteConfigRewriteLine(state,option,line,force); +} + /* Glue together the configuration lines in the current configuration * rewrite state into a single string, stripping multiple empty lines. */ sds rewriteConfigGetContentFromState(struct rewriteConfigState *state) { @@ -1630,7 +1663,7 @@ int rewriteConfig(char *path) { rewriteConfigYesNoOption(state,"daemonize",server.daemonize,0); rewriteConfigStringOption(state,"pidfile",server.pidfile,REDIS_DEFAULT_PID_FILE); rewriteConfigNumericalOption(state,"port",server.port,REDIS_SERVERPORT); - rewriteConfigStringOption(state,"bind",server.bindaddr,NULL); + rewriteConfigBindOption(state); rewriteConfigStringOption(state,"unixsocket",server.unixsocket,NULL); rewriteConfigOctalOption(state,"unixsocketperm",server.unixsocketperm,REDIS_DEFAULT_UNIX_SOCKET_PERM); rewriteConfigNumericalOption(state,"timeout",server.maxidletime,REDIS_MAXIDLETIME); diff --git a/src/redis.c b/src/redis.c index 81601273e..a78ecfb51 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1221,7 +1221,7 @@ void initServerConfig() { server.runid[REDIS_RUN_ID_SIZE] = '\0'; server.arch_bits = (sizeof(long) == 8) ? 
64 : 32; server.port = REDIS_SERVERPORT; - server.bindaddr = NULL; + server.bindaddr_count = 0; server.unixsocket = NULL; server.unixsocketperm = REDIS_DEFAULT_UNIX_SOCKET_PERM; server.ipfd = -1; @@ -1426,7 +1426,7 @@ void initServer() { server.db = zmalloc(sizeof(redisDb)*server.dbnum); if (server.port != 0) { - server.ipfd = anetTcpServer(server.neterr,server.port,server.bindaddr); + server.ipfd = anetTcpServer(server.neterr,server.port,server.bindaddr,server.bindaddr_count); if (server.ipfd == ANET_ERR) { redisLog(REDIS_WARNING, "Opening port %d: %s", server.port, server.neterr); @@ -2837,7 +2837,7 @@ void redisSetProcTitle(char *title) { #ifdef USE_SETPROCTITLE setproctitle("%s %s:%d", title, - server.bindaddr ? server.bindaddr : "*", + server.bindaddr_count ? server.bindaddr[0] : "*", server.port); #else REDIS_NOTUSED(title); diff --git a/src/redis.h b/src/redis.h index dade5e2a2..1df4084ed 100644 --- a/src/redis.h +++ b/src/redis.h @@ -121,6 +121,7 @@ #define REDIS_DEFAULT_MIN_SLAVES_TO_WRITE 0 #define REDIS_DEFAULT_MIN_SLAVES_MAX_LAG 10 #define REDIS_IP_STR_LEN 16 +#define REDIS_BINDADDR_MAX 16 /* Protocol and I/O related defines */ #define REDIS_MAX_QUERYBUF_LEN (1024*1024*1024) /* 1GB max query buffer. */ @@ -730,7 +731,8 @@ struct redisServer { int sentinel_mode; /* True if this instance is a Sentinel. */ /* Networking */ int port; /* TCP listening port */ - char *bindaddr; /* Bind address or NULL */ + char *bindaddr[REDIS_BINDADDR_MAX]; /* Addresses we should bind to */ + int bindaddr_count; /* Number of addresses in server.bindaddr[] */ char *unixsocket; /* UNIX socket path */ mode_t unixsocketperm; /* UNIX socket permission */ int ipfd; /* TCP socket file descriptor */ From f6c4fa5ff2054a84baeec13b2416af30890d665b Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 5 Jul 2013 11:07:55 +0200 Subject: [PATCH 0045/2500] Revert "anet.c: Allow creation of TCP listening sockets bound to N addresses." 
Bind() can't be called multiple times against the same socket, multiple sockets are required to bind multiple interfaces, silly me. This reverts commit e0be252d5ea895ed0adb43d8d14f2bee233c6953. --- src/anet.c | 23 ++++++++--------------- src/anet.h | 2 +- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/src/anet.c b/src/anet.c index 7a6b7d4bf..963b6688e 100644 --- a/src/anet.c +++ b/src/anet.c @@ -331,9 +331,9 @@ static int anetListen(char *err, int s, struct sockaddr *sa, socklen_t len) { return ANET_OK; } -int anetTcpServer(char *err, int port, char **bindaddr, int bindaddr_count) +int anetTcpServer(char *err, int port, char *bindaddr) { - int s, j; + int s; struct sockaddr_in sa; if ((s = anetCreateSocket(err,AF_INET)) == ANET_ERR) @@ -343,20 +343,13 @@ int anetTcpServer(char *err, int port, char **bindaddr, int bindaddr_count) sa.sin_family = AF_INET; sa.sin_port = htons(port); sa.sin_addr.s_addr = htonl(INADDR_ANY); - if (bindaddr_count) { - for (j = 0; j < bindaddr_count; j++) { - if (inet_aton(bindaddr[j], &sa.sin_addr) == 0) { - anetSetError(err, "invalid bind address"); - close(s); - return ANET_ERR; - } - if (anetListen(err,s,(struct sockaddr*)&sa,sizeof(sa)) == ANET_ERR) - return ANET_ERR; - } - } else { - if (anetListen(err,s,(struct sockaddr*)&sa,sizeof(sa)) == ANET_ERR) - return ANET_ERR; + if (bindaddr && inet_aton(bindaddr, &sa.sin_addr) == 0) { + anetSetError(err, "invalid bind address"); + close(s); + return ANET_ERR; } + if (anetListen(err,s,(struct sockaddr*)&sa,sizeof(sa)) == ANET_ERR) + return ANET_ERR; return s; } diff --git a/src/anet.h b/src/anet.h index bf76dd24d..696c2c225 100644 --- a/src/anet.h +++ b/src/anet.h @@ -45,7 +45,7 @@ int anetUnixConnect(char *err, char *path); int anetUnixNonBlockConnect(char *err, char *path); int anetRead(int fd, char *buf, int count); int anetResolve(char *err, char *host, char *ipbuf); -int anetTcpServer(char *err, int port, char **bindaddr, int bindaddr_count); +int anetTcpServer(char 
*err, int port, char *bindaddr); int anetUnixServer(char *err, char *path, mode_t perm); int anetTcpAccept(char *err, int serversock, char *ip, int *port); int anetUnixAccept(char *err, int serversock); From 8ea3b1e79dec2b2e8859a5c2a13ff7a80b5eca53 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 5 Jul 2013 11:08:44 +0200 Subject: [PATCH 0046/2500] Revert "Cluster: use new anet.c listening socket creation API." This reverts commit c5e87a13de69dba4f9d9017271ba59fe36231144. --- src/cluster.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 670aedcb7..2f77941d4 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -252,8 +252,7 @@ void clusterInit(void) { if (saveconf) clusterSaveConfigOrDie(); /* We need a listening TCP port for our cluster messaging needs */ server.cfd = anetTcpServer(server.neterr, - server.port+REDIS_CLUSTER_PORT_INCR, server.bindaddr, - server.bindaddr_count); + server.port+REDIS_CLUSTER_PORT_INCR, server.bindaddr); if (server.cfd == -1) { redisLog(REDIS_WARNING, "Opening cluster TCP port: %s", server.neterr); exit(1); From d3cde096450bbaebc1b16394a34ef3d8237b7b5d Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 5 Jul 2013 11:47:20 +0200 Subject: [PATCH 0047/2500] Binding multiple IPs done properly with multiple sockets. 
--- src/aof.c | 3 +-- src/cluster.c | 26 +++++++++++++------ src/rdb.c | 3 +-- src/redis.c | 71 +++++++++++++++++++++++++++++++++++++++------------ src/redis.h | 7 +++-- 5 files changed, 79 insertions(+), 31 deletions(-) diff --git a/src/aof.c b/src/aof.c index 9ad85c536..89f17abab 100644 --- a/src/aof.c +++ b/src/aof.c @@ -962,8 +962,7 @@ int rewriteAppendOnlyFileBackground(void) { char tmpfile[256]; /* Child */ - if (server.ipfd > 0) close(server.ipfd); - if (server.sofd > 0) close(server.sofd); + closeListeningSockets(0); redisSetProcTitle("redis-aof-rewrite"); snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid()); if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) { diff --git a/src/cluster.c b/src/cluster.c index 2f77941d4..1682b436d 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -224,7 +224,7 @@ void clusterSaveConfigOrDie(void) { } void clusterInit(void) { - int saveconf = 0; + int saveconf = 0, j; server.cluster = zmalloc(sizeof(clusterState)); server.cluster->myself = NULL; @@ -251,14 +251,24 @@ void clusterInit(void) { } if (saveconf) clusterSaveConfigOrDie(); /* We need a listening TCP port for our cluster messaging needs */ - server.cfd = anetTcpServer(server.neterr, - server.port+REDIS_CLUSTER_PORT_INCR, server.bindaddr); - if (server.cfd == -1) { - redisLog(REDIS_WARNING, "Opening cluster TCP port: %s", server.neterr); - exit(1); + server.cfd_count = 0; + if (server.bindaddr_count == 0) server.bindaddr[0] = NULL; + for (j = 0; j < server.bindaddr_count || j == 0; j++) { + server.cfd[j] = anetTcpServer( + server.neterr, server.port+REDIS_CLUSTER_PORT_INCR, + server.bindaddr[j]); + if (server.cfd[j] == -1) { + redisLog(REDIS_WARNING, + "Opening cluster listening TCP socket %s:%d: %s", + server.bindaddr[j] ? 
server.bindaddr[j] : "*", + server.port+REDIS_CLUSTER_PORT_INCR, + server.neterr); + exit(1); + } + if (aeCreateFileEvent(server.el, server.cfd[j], AE_READABLE, + clusterAcceptHandler, NULL) == AE_ERR) redisPanic("Unrecoverable error creating Redis Cluster file event."); + server.cfd_count++; } - if (aeCreateFileEvent(server.el, server.cfd, AE_READABLE, - clusterAcceptHandler, NULL) == AE_ERR) redisPanic("Unrecoverable error creating Redis Cluster file event."); server.cluster->slots_to_keys = zslCreate(); } diff --git a/src/rdb.c b/src/rdb.c index f403e27c8..c24f2d58f 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -729,8 +729,7 @@ int rdbSaveBackground(char *filename) { int retval; /* Child */ - if (server.ipfd > 0) close(server.ipfd); - if (server.sofd > 0) close(server.sofd); + closeListeningSockets(0); redisSetProcTitle("redis-rdb-bgsave"); retval = rdbSave(filename); if (retval == REDIS_OK) { diff --git a/src/redis.c b/src/redis.c index a78ecfb51..adea459a8 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1224,7 +1224,7 @@ void initServerConfig() { server.bindaddr_count = 0; server.unixsocket = NULL; server.unixsocketperm = REDIS_DEFAULT_UNIX_SOCKET_PERM; - server.ipfd = -1; + server.ipfd_count = 0; server.sofd = -1; server.dbnum = REDIS_DEFAULT_DBNUM; server.verbosity = REDIS_DEFAULT_VERBOSITY; @@ -1425,14 +1425,25 @@ void initServer() { server.el = aeCreateEventLoop(server.maxclients+REDIS_EVENTLOOP_FDSET_INCR); server.db = zmalloc(sizeof(redisDb)*server.dbnum); + /* Open the TCP listening sockets. */ if (server.port != 0) { - server.ipfd = anetTcpServer(server.neterr,server.port,server.bindaddr,server.bindaddr_count); - if (server.ipfd == ANET_ERR) { - redisLog(REDIS_WARNING, "Opening port %d: %s", - server.port, server.neterr); - exit(1); + /* Force binding of 0.0.0.0 if no bind address is specified, always + * entering the loop if j == 0. 
*/ + if (server.bindaddr_count == 0) server.bindaddr[0] = NULL; + for (j = 0; j < server.bindaddr_count || j == 0; j++) { + server.ipfd[server.ipfd_count] = anetTcpServer(server.neterr,server.port,server.bindaddr[j]); + if (server.ipfd[server.ipfd_count] == ANET_ERR) { + redisLog(REDIS_WARNING, + "Creating Server TCP listening socket %s:%d: %s", + server.bindaddr[j] ? server.bindaddr[j] : "*", + server.port, server.neterr); + exit(1); + } + server.ipfd_count++; } } + + /* Open the listening Unix domain socket. */ if (server.unixsocket != NULL) { unlink(server.unixsocket); /* don't care if this fails */ server.sofd = anetUnixServer(server.neterr,server.unixsocket,server.unixsocketperm); @@ -1441,10 +1452,14 @@ void initServer() { exit(1); } } - if (server.ipfd < 0 && server.sofd < 0) { + + /* Abort if there are no listening sockets at all. */ + if (server.ipfd_count == 0 && server.sofd < 0) { redisLog(REDIS_WARNING, "Configured to not listen anywhere, exiting."); exit(1); } + + /* Create the Redis databases, and initialize other internal state. */ for (j = 0; j < server.dbnum; j++) { server.db[j].dict = dictCreate(&dbDictType,NULL); server.db[j].expires = dictCreate(&keyptrDictType,NULL); @@ -1487,15 +1502,28 @@ void initServer() { server.unixtime = time(NULL); server.lastbgsave_status = REDIS_OK; server.repl_good_slaves_count = 0; + + /* Create the serverCron() time event, that's our main way to process + * background operations. */ if(aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL) == AE_ERR) { redisPanic("Can't create the serverCron time event."); exit(1); } - if (server.ipfd > 0 && aeCreateFileEvent(server.el,server.ipfd,AE_READABLE, - acceptTcpHandler,NULL) == AE_ERR) redisPanic("Unrecoverable error creating server.ipfd file event."); + + /* Create an event handler for accepting new connections in TCP and Unix + * domain sockets. 
*/ + for (j = 0; j < server.ipfd_count; j++) { + if (aeCreateFileEvent(server.el, server.ipfd[j], AE_READABLE, + acceptTcpHandler,NULL) == AE_ERR) + { + redisPanic( + "Unrecoverable error creating server.ipfd file event."); + } + } if (server.sofd > 0 && aeCreateFileEvent(server.el,server.sofd,AE_READABLE, acceptUnixHandler,NULL) == AE_ERR) redisPanic("Unrecoverable error creating server.sofd file event."); + /* Open the AOF file if needed. */ if (server.aof_state == REDIS_AOF_ON) { server.aof_fd = open(server.aof_filename, O_WRONLY|O_APPEND|O_CREAT,0644); @@ -1930,6 +1958,21 @@ int processCommand(redisClient *c) { /*================================== Shutdown =============================== */ +/* Close listening sockets. Also unlink the unix domain socket if + * unlink_unix_socket is non-zero. */ +void closeListeningSockets(int unlink_unix_socket) { + int j; + + for (j = 0; j < server.ipfd_count; j++) close(server.ipfd[j]); + if (server.sofd != -1) close(server.sofd); + if (server.cluster_enabled) + for (j = 0; j < server.cfd_count; j++) close(server.cfd[j]); + if (unlink_unix_socket && server.unixsocket) { + redisLog(REDIS_NOTICE,"Removing the unix socket file."); + unlink(server.unixsocket); /* don't care if this fails */ + } +} + int prepareForShutdown(int flags) { int save = flags & REDIS_SHUTDOWN_SAVE; int nosave = flags & REDIS_SHUTDOWN_NOSAVE; @@ -1973,13 +2016,7 @@ int prepareForShutdown(int flags) { unlink(server.pidfile); } /* Close the listening sockets. Apparently this allows faster restarts. 
*/ - if (server.ipfd != -1) close(server.ipfd); - if (server.sofd != -1) close(server.sofd); - if (server.unixsocket) { - redisLog(REDIS_NOTICE,"Removing the unix socket file."); - unlink(server.unixsocket); /* don't care if this fails */ - } - + closeListeningSockets(1); redisLog(REDIS_WARNING,"Redis is now ready to exit, bye bye..."); return REDIS_OK; } @@ -2936,7 +2973,7 @@ int main(int argc, char **argv) { exit(1); } } - if (server.ipfd > 0) + if (server.ipfd_count > 0) redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port); if (server.sofd > 0) redisLog(REDIS_NOTICE,"The server is now ready to accept connections at %s", server.unixsocket); diff --git a/src/redis.h b/src/redis.h index 1df4084ed..e3542967f 100644 --- a/src/redis.h +++ b/src/redis.h @@ -735,9 +735,11 @@ struct redisServer { int bindaddr_count; /* Number of addresses in server.bindaddr[] */ char *unixsocket; /* UNIX socket path */ mode_t unixsocketperm; /* UNIX socket permission */ - int ipfd; /* TCP socket file descriptor */ + int ipfd[REDIS_BINDADDR_MAX]; /* TCP socket file descriptors */ + int ipfd_count; /* Used slots in ipfd[] */ int sofd; /* Unix socket file descriptor */ - int cfd; /* Cluster bus listening socket */ + int cfd[REDIS_BINDADDR_MAX];/* Cluster bus listening socket */ + int cfd_count; /* Used slots in cfd[] */ list *clients; /* List of active clients */ list *clients_to_close; /* Clients to close asynchronously */ list *slaves, *monitors; /* List of slaves and MONITORs */ @@ -1246,6 +1248,7 @@ void oom(const char *msg); void populateCommandTable(void); void resetCommandTableStats(void); void adjustOpenFilesLimit(void); +void closeListeningSockets(int unlink_unix_socket); /* Set data type */ robj *setTypeCreate(robj *value); From eb0d75f21a52a0efe32f4597021d55dc54d105d9 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 8 Jul 2013 10:42:16 +0200 Subject: [PATCH 0048/2500] Example redis.conf: bind to multiple interfaces documented. 
--- redis.conf | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/redis.conf b/redis.conf index 10560eb76..a31d014c4 100644 --- a/redis.conf +++ b/redis.conf @@ -24,9 +24,14 @@ pidfile /var/run/redis.pid # If port 0 is specified Redis will not listen on a TCP socket. port 6379 -# If you want you can bind a single interface, if the bind option is not -# specified all the interfaces will listen for incoming connections. +# By default Redis listens for connections from all the network interfaces +# available on the server. It is possible to listen to just one or multiple +# interfaces using the "bind" configuration directive, followed by one or +# more IP addresses. # +# Examples: +# +# bind 192.168.1.100 10.0.0.1 # bind 127.0.0.1 # Specify the path for the unix socket that will be used to listen for From 13c44e7b07e985c36e8719d86ca1164a0e2f6412 Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Fri, 17 Jun 2011 00:29:49 +0100 Subject: [PATCH 0049/2500] Use getaddrinfo(3) in anetResolve. #apichange Change anetResolve() function to use getaddrinfo(3) to resolve hostnames. Resolved hostnames are limited to those reachable by the AF_INET address family. API Change: anetResolve requires additional argument. additional argument required to specify the length of the character buffer the IP address is written to in order to comply with inet_ntop(3) function semantics. inet_ntop(3) replaces inet_ntoa(3) as it has been designed to be compatible with more address families. 
--- src/anet.c | 28 ++++++++++++++++------------ src/anet.h | 2 +- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/anet.c b/src/anet.c index 963b6688e..afa1d6478 100644 --- a/src/anet.c +++ b/src/anet.c @@ -163,22 +163,26 @@ int anetTcpKeepAlive(char *err, int fd) return ANET_OK; } -int anetResolve(char *err, char *host, char *ipbuf) +int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len) { - struct sockaddr_in sa; + struct addrinfo hints, *info; + void *addr; + int rv; - sa.sin_family = AF_INET; - if (inet_aton(host, &sa.sin_addr) == 0) { - struct hostent *he; + memset(&hints,0,sizeof(hints)); + hints.ai_family = AF_INET; - he = gethostbyname(host); - if (he == NULL) { - anetSetError(err, "can't resolve: %s", host); - return ANET_ERR; - } - memcpy(&sa.sin_addr, he->h_addr, sizeof(struct in_addr)); + if ((rv = getaddrinfo(host, NULL, &hints, &info)) != 0) { + anetSetError(err, "%s", gai_strerror(rv)); + return ANET_ERR; } - strcpy(ipbuf,inet_ntoa(sa.sin_addr)); + if (info->ai_family == AF_INET) { + struct sockaddr_in *sa = (struct sockaddr_in *)info->ai_addr; + addr = &(sa->sin_addr); + } + + inet_ntop(info->ai_family, addr, ipbuf, ipbuf_len); + freeaddrinfo(info); return ANET_OK; } diff --git a/src/anet.h b/src/anet.h index 696c2c225..efaa2cc96 100644 --- a/src/anet.h +++ b/src/anet.h @@ -44,7 +44,7 @@ int anetTcpNonBlockConnect(char *err, char *addr, int port); int anetUnixConnect(char *err, char *path); int anetUnixNonBlockConnect(char *err, char *path); int anetRead(int fd, char *buf, int count); -int anetResolve(char *err, char *host, char *ipbuf); +int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len); int anetTcpServer(char *err, int port, char *bindaddr); int anetUnixServer(char *err, char *path, mode_t perm); int anetTcpAccept(char *err, int serversock, char *ip, int *port); From ac940caf4f20a113b0f185c9afd733d0ed965ad3 Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Fri, 17 Jun 2011 00:55:00 +0100 
Subject: [PATCH 0050/2500] Add anetSetReuseAddr(err, fd) static function. Extract setting SO_REUSEADDR socket option into separate function so the same code can be more easily used by anetCreateSocket and other functions. --- src/anet.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/anet.c b/src/anet.c index afa1d6478..78c24bf25 100644 --- a/src/anet.c +++ b/src/anet.c @@ -186,8 +186,19 @@ int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len) return ANET_OK; } +static int anetSetReuseAddr(char *err, int fd) { + int yes = 1; + /* Make sure connection-intensive things like the redis benckmark + * will be able to close/open sockets a zillion of times */ + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) == -1) { + anetSetError(err, "setsockopt SO_REUSEADDR: %s", strerror(errno)); + return ANET_ERR; + } + return ANET_OK; +} + static int anetCreateSocket(char *err, int domain) { - int s, on = 1; + int s; if ((s = socket(domain, SOCK_STREAM, 0)) == -1) { anetSetError(err, "creating socket: %s", strerror(errno)); return ANET_ERR; @@ -195,8 +206,8 @@ static int anetCreateSocket(char *err, int domain) { /* Make sure connection-intensive things like the redis benchmark * will be able to close/open sockets a zillion of times */ - if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) { - anetSetError(err, "setsockopt SO_REUSEADDR: %s", strerror(errno)); + if (anetSetReuseAddr(err,s) == ANET_ERR) { + close(s); return ANET_ERR; } return s; From 391ebc17de6bd4d1e389f9ff14e31ae6c9c7198e Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Fri, 17 Jun 2011 01:06:19 +0100 Subject: [PATCH 0051/2500] Use getaddrinfo(3) in anetTcpGenericConnect. Change anetTcpGenericConnect() function to use getaddrinfo(3) to perform address resolution, socket creation and connection. Resolved addresses are limited to those reachable by the AF_INET family. 
--- src/anet.c | 66 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 26 deletions(-) diff --git a/src/anet.c b/src/anet.c index 78c24bf25..bbff3e3e9 100644 --- a/src/anet.c +++ b/src/anet.c @@ -217,38 +217,52 @@ static int anetCreateSocket(char *err, int domain) { #define ANET_CONNECT_NONBLOCK 1 static int anetTcpGenericConnect(char *err, char *addr, int port, int flags) { - int s; - struct sockaddr_in sa; + int s, rv; + char _port[6]; /* strlen("65535"); */ + struct addrinfo hints, *servinfo, *p; - if ((s = anetCreateSocket(err,AF_INET)) == ANET_ERR) + snprintf(_port,6,"%d",port); + memset(&hints,0,sizeof(hints)); + hints.ai_family = AF_INET; + hints.ai_socktype = SOCK_STREAM; + + if ((rv = getaddrinfo(addr,_port,&hints,&servinfo)) != 0) { + anetSetError(err, "%s", gai_strerror(rv)); return ANET_ERR; + } + for (p = servinfo; p != NULL; p = p->ai_next) { + if ((s = socket(p->ai_family,p->ai_socktype,p->ai_protocol)) == -1) + continue; - sa.sin_family = AF_INET; - sa.sin_port = htons(port); - if (inet_aton(addr, &sa.sin_addr) == 0) { - struct hostent *he; - - he = gethostbyname(addr); - if (he == NULL) { - anetSetError(err, "can't resolve: %s", addr); - close(s); - return ANET_ERR; + /* if we set err then goto cleanup, otherwise next */ + if (anetSetReuseAddr(err,s) == ANET_ERR) { + goto error; } - memcpy(&sa.sin_addr, he->h_addr, sizeof(struct in_addr)); - } - if (flags & ANET_CONNECT_NONBLOCK) { - if (anetNonBlock(err,s) != ANET_OK) - return ANET_ERR; - } - if (connect(s, (struct sockaddr*)&sa, sizeof(sa)) == -1) { - if (errno == EINPROGRESS && - flags & ANET_CONNECT_NONBLOCK) - return s; + if (flags & ANET_CONNECT_NONBLOCK) { + if (anetNonBlock(err,s) != ANET_OK) + goto error; + } + if (connect(s,p->ai_addr,p->ai_addrlen) == -1) { + if (errno == EINPROGRESS && + flags & ANET_CONNECT_NONBLOCK) + goto end; - anetSetError(err, "connect: %s", strerror(errno)); - close(s); - return ANET_ERR; + close(s); + continue; + } + + /* 
break with the socket */ + goto end; } + if (p == NULL) { + anetSetError(err, "creating socket: %s", strerror(errno)); + goto error; + } + +error: + s = ANET_ERR; +end: + freeaddrinfo(servinfo); return s; } From ebff000e4d683387a9dafea01b79be953987d5fe Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Fri, 17 Jun 2011 01:49:21 +0100 Subject: [PATCH 0052/2500] Use getaddrinfo(3) in a anetTcpServer. Change anetTcpServer() function to use getaddrinfo(3) to perform address resolution, socket creation and binding. Resolved addresses are limited to those reachable by the AF_INET address family. --- src/anet.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/src/anet.c b/src/anet.c index bbff3e3e9..0c33fbf45 100644 --- a/src/anet.c +++ b/src/anet.c @@ -362,23 +362,37 @@ static int anetListen(char *err, int s, struct sockaddr *sa, socklen_t len) { int anetTcpServer(char *err, int port, char *bindaddr) { - int s; - struct sockaddr_in sa; + int s, rv; + char _port[6]; /* strlen("65535") */ + struct addrinfo hints, *servinfo, *p; - if ((s = anetCreateSocket(err,AF_INET)) == ANET_ERR) - return ANET_ERR; + snprintf(_port,6,"%d",port); + memset(&hints,0,sizeof(hints)); + hints.ai_family = AF_INET; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_PASSIVE; /* No effect if bindaddr != NULL */ - memset(&sa,0,sizeof(sa)); - sa.sin_family = AF_INET; - sa.sin_port = htons(port); - sa.sin_addr.s_addr = htonl(INADDR_ANY); - if (bindaddr && inet_aton(bindaddr, &sa.sin_addr) == 0) { - anetSetError(err, "invalid bind address"); - close(s); + if ((rv = getaddrinfo(bindaddr,_port,&hints,&servinfo)) != 0) { + anetSetError(err, "%s", gai_strerror(rv)); return ANET_ERR; } - if (anetListen(err,s,(struct sockaddr*)&sa,sizeof(sa)) == ANET_ERR) - return ANET_ERR; + for (p = servinfo; p != NULL; p = p->ai_next) { + if ((s = socket(p->ai_family,p->ai_socktype,p->ai_protocol)) == -1) + continue; + + if 
(anetListen(err,s,p->ai_addr,p->ai_addrlen) == ANET_ERR) + goto error; /* could continue here? */ + goto end; + } + if (p == NULL) { + anetSetError(err, "unable to bind socket"); + goto error; + } + +error: + s = ANET_ERR; +end: + freeaddrinfo(servinfo); return s; } From aeeffaf3b8f30bc5f00b3e7637308f5a67ca89b2 Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Fri, 17 Jun 2011 19:47:49 +0100 Subject: [PATCH 0053/2500] Use inet_ntop(3) in anet. #apichange Replace inet_ntoa(3) calls with the more future proof inet_ntop(3) function which is capable of handling additional address families. API Change: anetTcpAccept() & anetPeerToString() additional argument additional argument required to specify the length of the character buffer the IP address is written to in order to comply with inet_ntop(3) function semantics. --- src/anet.c | 12 ++++++------ src/anet.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/anet.c b/src/anet.c index 0c33fbf45..358802b1c 100644 --- a/src/anet.c +++ b/src/anet.c @@ -431,14 +431,14 @@ static int anetGenericAccept(char *err, int s, struct sockaddr *sa, socklen_t *l return fd; } -int anetTcpAccept(char *err, int s, char *ip, int *port) { +int anetTcpAccept(char *err, int s, char *ip, size_t ip_len, int *port) { int fd; struct sockaddr_in sa; socklen_t salen = sizeof(sa); if ((fd = anetGenericAccept(err,s,(struct sockaddr*)&sa,&salen)) == ANET_ERR) return ANET_ERR; - if (ip) strcpy(ip,inet_ntoa(sa.sin_addr)); + if (ip) inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),ip,ip_len); if (port) *port = ntohs(sa.sin_port); return fd; } @@ -453,7 +453,7 @@ int anetUnixAccept(char *err, int s) { return fd; } -int anetPeerToString(int fd, char *ip, int *port) { +int anetPeerToString(int fd, char *ip, size_t ip_len, int *port) { struct sockaddr_in sa; socklen_t salen = sizeof(sa); @@ -463,12 +463,12 @@ int anetPeerToString(int fd, char *ip, int *port) { ip[1] = '\0'; return -1; } - if (ip) strcpy(ip,inet_ntoa(sa.sin_addr)); + if 
(ip) inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),ip,ip_len); if (port) *port = ntohs(sa.sin_port); return 0; } -int anetSockName(int fd, char *ip, int *port) { +int anetSockName(int fd, char *ip, size_t ip_len, int *port) { struct sockaddr_in sa; socklen_t salen = sizeof(sa); @@ -478,7 +478,7 @@ int anetSockName(int fd, char *ip, int *port) { ip[1] = '\0'; return -1; } - if (ip) strcpy(ip,inet_ntoa(sa.sin_addr)); + if (ip) inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),ip,ip_len); if (port) *port = ntohs(sa.sin_port); return 0; } diff --git a/src/anet.h b/src/anet.h index efaa2cc96..ccc67c634 100644 --- a/src/anet.h +++ b/src/anet.h @@ -47,14 +47,14 @@ int anetRead(int fd, char *buf, int count); int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len); int anetTcpServer(char *err, int port, char *bindaddr); int anetUnixServer(char *err, char *path, mode_t perm); -int anetTcpAccept(char *err, int serversock, char *ip, int *port); +int anetTcpAccept(char *err, int serversock, char *ip, size_t ip_len, int *port); int anetUnixAccept(char *err, int serversock); int anetWrite(int fd, char *buf, int count); int anetNonBlock(char *err, int fd); int anetEnableTcpNoDelay(char *err, int fd); int anetDisableTcpNoDelay(char *err, int fd); int anetTcpKeepAlive(char *err, int fd); -int anetPeerToString(int fd, char *ip, int *port); +int anetPeerToString(int fd, char *ip, size_t ip_len, int *port); int anetKeepAlive(char *err, int fd, int interval); #endif From 8b2e90acece09ffc76d9d054eb4df74097c88bdd Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Fri, 17 Jun 2011 19:54:17 +0100 Subject: [PATCH 0054/2500] Update anetTcpAccept & anetPeerToString calls. 
Add the additional ip buffer length argument to function calls of anetTcpAccept and anetPeerToString in network.c and cluster.c --- src/cluster.c | 2 +- src/networking.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 1682b436d..a111c0300 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -310,7 +310,7 @@ void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { REDIS_NOTUSED(mask); REDIS_NOTUSED(privdata); - cfd = anetTcpAccept(server.neterr, fd, cip, &cport); + cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport); if (cfd == AE_ERR) { redisLog(REDIS_VERBOSE,"Accepting cluster node: %s", server.neterr); return; diff --git a/src/networking.c b/src/networking.c index 192ca5e0c..b493f16a8 100644 --- a/src/networking.c +++ b/src/networking.c @@ -557,7 +557,7 @@ void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask) { REDIS_NOTUSED(mask); REDIS_NOTUSED(privdata); - cfd = anetTcpAccept(server.neterr, fd, cip, &cport); + cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport); if (cfd == AE_ERR) { redisLog(REDIS_WARNING,"Accepting client connection: %s", server.neterr); return; @@ -1133,7 +1133,7 @@ sds getClientInfoString(redisClient *client) { int emask; if (!(client->flags & REDIS_UNIX_SOCKET)) - anetPeerToString(client->fd,ip,&port); + anetPeerToString(client->fd,ip,sizeof(ip),&port); p = flags; if (client->flags & REDIS_SLAVE) { if (client->flags & REDIS_MONITOR) @@ -1214,7 +1214,7 @@ void clientCommand(redisClient *c) { int port; client = listNodeValue(ln); - if (anetPeerToString(client->fd,ip,&port) == -1) continue; + if (anetPeerToString(client->fd,ip,sizeof(ip),&port) == -1) continue; snprintf(addr,sizeof(addr),"%s:%d",ip,port); if (strcmp(addr,c->argv[2]->ptr) == 0) { addReply(c,shared.ok); From c87105431cc8eebea61a5e84804ea4a561d1a651 Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Fri, 17 Jun 2011 20:35:56 +0100 Subject: [PATCH 0055/2500] 
Use inet_ntop(3) in nodeIp2String & clusterCommand Replace inet_ntoa(3) calls with the more future proof inet_ntop(3) function which is capable of handling additional address families. --- src/cluster.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index a111c0300..f59b4561a 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -758,7 +758,7 @@ void nodeIp2String(char *buf, clusterLink *link) { if (getpeername(link->fd, (struct sockaddr*) &sa, &salen) == -1) redisPanic("getpeername() failed."); - strncpy(buf,inet_ntoa(sa.sin_addr),sizeof(link->node->ip)); + inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),buf,sizeof(link->node->ip)); } @@ -2084,7 +2084,7 @@ void clusterCommand(redisClient *c) { /* Finally add the node to the cluster with a random name, this * will get fixed in the first handshake (ping/pong). */ n = createClusterNode(NULL,REDIS_NODE_HANDSHAKE|REDIS_NODE_MEET); - strncpy(n->ip,inet_ntoa(sa.sin_addr),sizeof(n->ip)); + inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),n->ip,sizeof(n->ip)); n->port = port; clusterAddNode(n); addReply(c,shared.ok); From 9ddaff53a93b4a8df75d768aea8598b4faaf5579 Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Fri, 17 Jun 2011 20:37:45 +0100 Subject: [PATCH 0056/2500] Use inet_pton(3) in clusterCommand. Replace inet_aton(3) call with the more future proof inet_pton(3) function which is capable of handling additional address families. 
--- src/cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index f59b4561a..7531ab7c5 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -2070,7 +2070,7 @@ void clusterCommand(redisClient *c) { long port; /* Perform sanity checks on IP/port */ - if (inet_aton(c->argv[2]->ptr,&sa.sin_addr) == 0) { + if (inet_pton(AF_INET,c->argv[2]->ptr,&(sa.sin_addr)) == 0) { addReplyError(c,"Invalid IP address in MEET"); return; } From 74b773178178530d830d62e94edab3fb8ccda760 Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Sat, 18 Jun 2011 18:59:07 +0100 Subject: [PATCH 0057/2500] Fix cluster.c inet_ntop use of sizeof(n->ip). Using sizeof with an array will only return expected results if the array is created in the scope of the function where sizeof is used. This commit changes the inet_ntop calls so that they use the fixed buffer value as defined in redis.h which is 16. --- src/cluster.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 7531ab7c5..e60d82a64 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -758,7 +758,7 @@ void nodeIp2String(char *buf, clusterLink *link) { if (getpeername(link->fd, (struct sockaddr*) &sa, &salen) == -1) redisPanic("getpeername() failed."); - inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),buf,sizeof(link->node->ip)); + inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),buf,16); } @@ -2084,7 +2084,7 @@ void clusterCommand(redisClient *c) { /* Finally add the node to the cluster with a random name, this * will get fixed in the first handshake (ping/pong).
*/ n = createClusterNode(NULL,REDIS_NODE_HANDSHAKE|REDIS_NODE_MEET); - inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),n->ip,sizeof(n->ip)); + inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),n->ip,16); n->port = port; clusterAddNode(n); addReply(c,shared.ok); From a6c9ad267cbc0f11ae5e83e14d7846f3087a2ba9 Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Sat, 18 Jun 2011 19:06:26 +0100 Subject: [PATCH 0058/2500] Add macro to define clusterNode.ip buffer size. Add REDIS_CLUSTER_IPLEN macro to define the size of the clusterNode ip character array. Additionally use this macro in inet_ntop(3) calls where the size of the array was being defined manually. The REDIS_CLUSTER_IPLEN is defined as INET_ADDRSTRLEN which defines the correct size of a buffer to store an IPv4 address in. The INET_ADDRSTRLEN macro itself is defined in the <netinet/in.h> header file and should be portable across the majority of systems. --- src/cluster.c | 4 ++-- src/redis.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index e60d82a64..dacddbcc0 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -758,7 +758,7 @@ void nodeIp2String(char *buf, clusterLink *link) { if (getpeername(link->fd, (struct sockaddr*) &sa, &salen) == -1) redisPanic("getpeername() failed."); - inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),buf,16); + inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),buf,REDIS_CLUSTER_IPLEN); } @@ -2084,7 +2084,7 @@ void clusterCommand(redisClient *c) { /* Finally add the node to the cluster with a random name, this * will get fixed in the first handshake (ping/pong).
*/ n = createClusterNode(NULL,REDIS_NODE_HANDSHAKE|REDIS_NODE_MEET); - inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),n->ip,16); + inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),n->ip,REDIS_CLUSTER_IPLEN); n->port = port; clusterAddNode(n); addReply(c,shared.ok); diff --git a/src/redis.h b/src/redis.h index e3542967f..2cfcce356 100644 --- a/src/redis.h +++ b/src/redis.h @@ -120,7 +120,7 @@ #define REDIS_DEFAULT_AOF_REWRITE_INCREMENTAL_FSYNC 1 #define REDIS_DEFAULT_MIN_SLAVES_TO_WRITE 0 #define REDIS_DEFAULT_MIN_SLAVES_MAX_LAG 10 -#define REDIS_IP_STR_LEN 16 +#define REDIS_IP_STR_LEN INET6_ADDRSTRLEN #define REDIS_BINDADDR_MAX 16 /* Protocol and I/O related defines */ @@ -564,6 +564,7 @@ typedef struct redisOpArray { #define REDIS_CLUSTER_FAIL 1 /* The cluster can't work */ #define REDIS_CLUSTER_NAMELEN 40 /* sha1 hex length */ #define REDIS_CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */ +#define REDIS_CLUSTER_IPLEN INET_ADDRSTRLEN /* IPv4 address string length */ /* The following defines are amunt of time, sometimes expressed as * multiplicators of the node timeout value (when ending with MULT). */ From cc9c474c604d6e4a0b875a52e7385418eda0ffc0 Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Tue, 20 Sep 2011 00:00:14 +0100 Subject: [PATCH 0059/2500] Add missing includes for getpeername. getpeername(2) requires <sys/socket.h> which on some systems also requires <sys/types.h>. Include both to avoid compilation warnings. --- src/cluster.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cluster.c b/src/cluster.c index dacddbcc0..9f789d940 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -31,6 +31,8 @@ #include "redis.h" #include "endianconv.h" +#include <sys/types.h> +#include <sys/socket.h> #include <arpa/inet.h> #include <fcntl.h> #include <unistd.h> From dc7e8ec27f6f2ffdd2fc77eb0dc8f3c55b49548c Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Wed, 17 Oct 2012 22:32:21 +0100 Subject: [PATCH 0060/2500] Update calls to anetPeerToString to include ip_len.
--- src/redis.c | 2 +- src/replication.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/redis.c b/src/redis.c index adea459a8..81d35852e 100644 --- a/src/redis.c +++ b/src/redis.c @@ -2428,7 +2428,7 @@ sds genRedisInfoString(char *section) { int port; long lag = 0; - if (anetPeerToString(slave->fd,ip,&port) == -1) continue; + if (anetPeerToString(slave->fd,ip,sizeof(ip),&port) == -1) continue; switch(slave->replstate) { case REDIS_REPL_WAIT_BGSAVE_START: case REDIS_REPL_WAIT_BGSAVE_END: diff --git a/src/replication.c b/src/replication.c index 196b8d8f3..4343f6193 100644 --- a/src/replication.c +++ b/src/replication.c @@ -293,7 +293,7 @@ void replicationFeedMonitors(redisClient *c, list *monitors, int dictid, robj ** int j, port; sds cmdrepr = sdsnew("+"); robj *cmdobj; - char ip[32]; + char ip[REDIS_IP_STR_LEN]; struct timeval tv; gettimeofday(&tv,NULL); @@ -303,7 +303,7 @@ void replicationFeedMonitors(redisClient *c, list *monitors, int dictid, robj ** } else if (c->flags & REDIS_UNIX_SOCKET) { cmdrepr = sdscatprintf(cmdrepr,"[%d unix:%s] ",dictid,server.unixsocket); } else { - anetPeerToString(c->fd,ip,&port); + anetPeerToString(c->fd,ip,sizeof(ip),&port); cmdrepr = sdscatprintf(cmdrepr,"[%d %s:%d] ",dictid,ip,port); } From 7d5cb8d1246caa80120053dacea9e9c917852f0e Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Wed, 17 Oct 2012 22:32:48 +0100 Subject: [PATCH 0061/2500] Update calls to anetResolve to include buffer size --- src/sentinel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index ed0978694..cb7008727 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -426,7 +426,7 @@ sentinelAddr *createSentinelAddr(char *hostname, int port) { errno = EINVAL; return NULL; } - if (anetResolve(NULL,hostname,buf) == ANET_ERR) { + if (anetResolve(NULL,hostname,buf,sizeof(buf)) == ANET_ERR) { errno = ENOENT; return NULL; } From d4ea27462aab01496f5baba72c183c47928cd5f0 Mon Sep 17 00:00:00 2001 
From: Geoff Garside Date: Sat, 18 Jun 2011 13:25:31 +0100 Subject: [PATCH 0062/2500] Update anetResolve to resolve AF_INET6 as well. Change the getaddrinfo(3) hints family from AF_INET to AF_UNSPEC to allow resolution of IPv6 addresses as well as IPv4 addresses. The function will return the IP address of whichever address family is preferenced by the operating system. Most current operating systems will preference AF_INET6 over AF_INET. Unfortunately without attempting to establish a connection to the remote address we can't know if the host is capable of using the returned IP address. It might be desirable to have anetResolve accept an additional argument specifying the AF_INET/AF_INET6 address the caller would like to receive. Currently though it does not appear as though the anetResolve function is ever used within Redis. --- src/anet.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/anet.c b/src/anet.c index 358802b1c..d75a4802a 100644 --- a/src/anet.c +++ b/src/anet.c @@ -166,11 +166,11 @@ int anetTcpKeepAlive(char *err, int fd) int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len) { struct addrinfo hints, *info; - void *addr; int rv; memset(&hints,0,sizeof(hints)); - hints.ai_family = AF_INET; + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; /* specify socktype to avoid dups */ if ((rv = getaddrinfo(host, NULL, &hints, &info)) != 0) { anetSetError(err, "%s", gai_strerror(rv)); @@ -178,10 +178,12 @@ int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len) } if (info->ai_family == AF_INET) { struct sockaddr_in *sa = (struct sockaddr_in *)info->ai_addr; - addr = &(sa->sin_addr); + inet_ntop(AF_INET, &(sa->sin_addr), ipbuf, ipbuf_len); + } else { + struct sockaddr_in6 *sa = (struct sockaddr_in6 *)info->ai_addr; + inet_ntop(AF_INET6, &(sa->sin6_addr), ipbuf, ipbuf_len); } - inet_ntop(info->ai_family, addr, ipbuf, ipbuf_len); freeaddrinfo(info); return ANET_OK; } From 
041332c0280a17afd722e1bbd6261bead9554bcf Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Sat, 18 Jun 2011 14:47:38 +0100 Subject: [PATCH 0063/2500] Update anetTcpAccept to handle AF_INET6 addresses. Change the sockaddr_in to sockaddr_storage which is capable of storing both AF_INET and AF_INET6 sockets. Uses the sockaddr_storage ss_family to correctly return the printable IP address and port. --- src/anet.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/anet.c b/src/anet.c index d75a4802a..0d6a358e3 100644 --- a/src/anet.c +++ b/src/anet.c @@ -435,13 +435,20 @@ static int anetGenericAccept(char *err, int s, struct sockaddr *sa, socklen_t *l int anetTcpAccept(char *err, int s, char *ip, size_t ip_len, int *port) { int fd; - struct sockaddr_in sa; + struct sockaddr_storage sa; socklen_t salen = sizeof(sa); if ((fd = anetGenericAccept(err,s,(struct sockaddr*)&sa,&salen)) == ANET_ERR) return ANET_ERR; - if (ip) inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),ip,ip_len); - if (port) *port = ntohs(sa.sin_port); + if (sa.ss_family == AF_INET) { + struct sockaddr_in *s = (struct sockaddr_in *)&sa; + if (ip) inet_ntop(AF_INET,(void*)&(s->sin_addr),ip,ip_len); + if (port) *port = ntohs(s->sin_port); + } else { + struct sockaddr_in6 *s = (struct sockaddr_in6 *)&sa; + if (ip) inet_ntop(AF_INET6,(void*)&(s->sin6_addr),ip,ip_len); + if (port) *port = ntohs(s->sin6_port); + } return fd; } From 1e235da147159f05c4d3e2aff21d8fb587c65acf Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Sat, 18 Jun 2011 16:20:08 +0100 Subject: [PATCH 0064/2500] Update anetPeerToString to handle AF_INET6 addrs. Change the sockaddr_in to sockaddr_storage which is capable of storing both AF_INET and AF_INET6 sockets. Uses the sockaddr_storage ss_family to correctly return the printable IP address and port. 
--- src/anet.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/anet.c b/src/anet.c index 0d6a358e3..1996a487b 100644 --- a/src/anet.c +++ b/src/anet.c @@ -463,7 +463,7 @@ int anetUnixAccept(char *err, int s) { } int anetPeerToString(int fd, char *ip, size_t ip_len, int *port) { - struct sockaddr_in sa; + struct sockaddr_storage sa; socklen_t salen = sizeof(sa); if (getpeername(fd,(struct sockaddr*)&sa,&salen) == -1) { @@ -472,13 +472,20 @@ int anetPeerToString(int fd, char *ip, size_t ip_len, int *port) { ip[1] = '\0'; return -1; } - if (ip) inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),ip,ip_len); - if (port) *port = ntohs(sa.sin_port); + if (sa.ss_family == AF_INET) { + struct sockaddr_in *s = (struct sockaddr_in *)&sa; + if (ip) inet_ntop(AF_INET,(void*)&(s->sin_addr),ip,ip_len); + if (port) *port = ntohs(s->sin_port); + } else { + struct sockaddr_in6 *s = (struct sockaddr_in6 *)&sa; + if (ip) inet_ntop(AF_INET6,(void*)&(s->sin6_addr),ip,ip_len); + if (port) *port = ntohs(s->sin6_port); + } return 0; } int anetSockName(int fd, char *ip, size_t ip_len, int *port) { - struct sockaddr_in sa; + struct sockaddr_storage sa; socklen_t salen = sizeof(sa); if (getsockname(fd,(struct sockaddr*)&sa,&salen) == -1) { @@ -487,7 +494,14 @@ int anetSockName(int fd, char *ip, size_t ip_len, int *port) { ip[1] = '\0'; return -1; } - if (ip) inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),ip,ip_len); - if (port) *port = ntohs(sa.sin_port); + if (sa.ss_family == AF_INET) { + struct sockaddr_in *s = (struct sockaddr_in *)&sa; + if (ip) inet_ntop(AF_INET,(void*)&(s->sin_addr),ip,ip_len); + if (port) *port = ntohs(s->sin_port); + } else { + struct sockaddr_in6 *s = (struct sockaddr_in6 *)&sa; + if (ip) inet_ntop(AF_INET6,(void*)&(s->sin6_addr),ip,ip_len); + if (port) *port = ntohs(s->sin6_port); + } return 0; } From 2c877368d676e5a24978223b3c110fd5db41a1e5 Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Sat, 18 Jun 2011 19:17:22 +0100 
Subject: [PATCH 0065/2500] Update REDIS_CLUSTER_IPLEN to INET6_ADDRSTRLEN. Change REDIS_CLUSTER_IPLEN to INET6_ADDRSTRLEN so that the clusterNode ip character buffer is big enough to hold an IPv6 address. --- src/redis.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/redis.h b/src/redis.h index 2cfcce356..808bb9211 100644 --- a/src/redis.h +++ b/src/redis.h @@ -564,7 +564,7 @@ typedef struct redisOpArray { #define REDIS_CLUSTER_FAIL 1 /* The cluster can't work */ #define REDIS_CLUSTER_NAMELEN 40 /* sha1 hex length */ #define REDIS_CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */ -#define REDIS_CLUSTER_IPLEN INET_ADDRSTRLEN /* IPv4 address string length */ +#define REDIS_CLUSTER_IPLEN INET6_ADDRSTRLEN /* IPv6 address string length */ /* The following defines are amunt of time, sometimes expressed as * multiplicators of the node timeout value (when ending with MULT). */ From 241e41a527d82c89e9b3594b7c621d31440df6be Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Sat, 18 Jun 2011 19:19:10 +0100 Subject: [PATCH 0066/2500] Update nodeIp2String to handle AF_INET6 addresses. Change the sockaddr_in to sockaddr_storage which is capable of storing both AF_INET and AF_INET6 sockets. Uses the sockaddr_storage ss_family to correctly return the printable IP address and port. Function makes the assumption that the buffer is of at least REDIS_CLUSTER_IPLEN bytes in size. --- src/cluster.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 9f789d940..575e46740 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -753,14 +753,21 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { } } -/* IP -> string conversion. 'buf' is supposed to at least be 16 bytes. */ +/* IP -> string conversion. 'buf' is supposed to at least be 46 bytes.
*/ void nodeIp2String(char *buf, clusterLink *link) { - struct sockaddr_in sa; + struct sockaddr_storage sa; socklen_t salen = sizeof(sa); if (getpeername(link->fd, (struct sockaddr*) &sa, &salen) == -1) redisPanic("getpeername() failed."); - inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),buf,REDIS_CLUSTER_IPLEN); + + if (sa.ss_family == AF_INET) { + struct sockaddr_in *s = (struct sockaddr_in *)&sa; + inet_ntop(AF_INET,(void*)&(s->sin_addr),buf,REDIS_CLUSTER_IPLEN); + } else { + struct sockaddr_in6 *s = (struct sockaddr_in6 *)&sa; + inet_ntop(AF_INET6,(void*)&(s->sin6_addr),buf,REDIS_CLUSTER_IPLEN); + } } From 9c994de435b7afb77f300e401035c937236d1295 Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Sat, 18 Jun 2011 19:21:43 +0100 Subject: [PATCH 0067/2500] Update clusterCommand to handle AF_INET6 addresses Changes the sockaddr_in to a sockaddr_storage. Attempts to convert the IP address into an AF_INET or AF_INET6 before returning an "Invalid IP address" error. Handles converting the sockaddr from either AF_INET or AF_INET6 back into a string for storage in the clusterNode ip field. 
--- src/cluster.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 575e46740..f635c389b 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -2075,11 +2075,12 @@ void clusterCommand(redisClient *c) { if (!strcasecmp(c->argv[1]->ptr,"meet") && c->argc == 4) { /* CLUSTER MEET */ clusterNode *n; - struct sockaddr_in sa; + struct sockaddr_storage sa; long port; /* Perform sanity checks on IP/port */ - if (inet_pton(AF_INET,c->argv[0]->ptr,&(sa.sin_addr)) == 0) { + if ((inet_pton(AF_INET,c->argv[0]->ptr,&(((struct sockaddr_in *)&sa)->sin_addr)) || + inet_pton(AF_INET6,c->argv[0]->ptr,&(((struct sockaddr_in6 *)&sa)->sin6_addr))) == 0) { addReplyError(c,"Invalid IP address in MEET"); return; } @@ -2093,7 +2094,9 @@ void clusterCommand(redisClient *c) { /* Finally add the node to the cluster with a random name, this * will get fixed in the first handshake (ping/pong). */ n = createClusterNode(NULL,REDIS_NODE_HANDSHAKE|REDIS_NODE_MEET); - inet_ntop(sa.sin_family,(void*)&(sa.sin_addr),n->ip,REDIS_CLUSTER_IPLEN); + sa.ss_family == AF_INET ? + inet_ntop(AF_INET,(void*)&(((struct sockaddr_in *)&sa)->sin_addr),n->ip,REDIS_CLUSTER_IPLEN) : + inet_ntop(AF_INET6,(void*)&(((struct sockaddr_in6 *)&sa)->sin6_addr),n->ip,REDIS_CLUSTER_IPLEN); n->port = port; clusterAddNode(n); addReply(c,shared.ok); From 15e37522ff3ce8fd6ffd7d30b8ece13187648cd9 Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Sat, 18 Jun 2011 19:34:16 +0100 Subject: [PATCH 0068/2500] Mark ip string buffers which could be reduced. In two places buffers have been created with a size of 128 bytes which could be reduced to INET6_ADDRSTRLEN to still hold a full IP address. These places have been marked as they are presently big enough to handle the needs of storing a printable IPv6 address. 
--- src/cluster.c | 2 +- src/networking.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index f635c389b..b2b5ca43f 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -306,7 +306,7 @@ void freeClusterLink(clusterLink *link) { void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { int cport, cfd; - char cip[128]; + char cip[128]; /* Could use INET6_ADDRSTRLEN here, but its smaller */ clusterLink *link; REDIS_NOTUSED(el); REDIS_NOTUSED(mask); diff --git a/src/networking.c b/src/networking.c index b493f16a8..1d157aec3 100644 --- a/src/networking.c +++ b/src/networking.c @@ -552,7 +552,7 @@ static void acceptCommonHandler(int fd, int flags) { void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask) { int cport, cfd; - char cip[128]; + char cip[128]; /* Could use INET6_ADDRSTRLEN here, but its smaller */ REDIS_NOTUSED(el); REDIS_NOTUSED(mask); REDIS_NOTUSED(privdata); From 0661cf0cf71841c395c67d48e7f612c11f8bfd16 Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Sat, 18 Jun 2011 19:37:45 +0100 Subject: [PATCH 0069/2500] Expand ip char buffers which are too small for v6. Increase the size of character buffers being used to store printable IP addresses so that they can safely store IPv6 addresses. --- src/networking.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/networking.c b/src/networking.c index 1d157aec3..e58db607a 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1128,7 +1128,7 @@ void getClientsMaxBuffers(unsigned long *longest_output_list, /* Turn a Redis client into an sds string representing its state. */ sds getClientInfoString(redisClient *client) { - char ip[32], flags[16], events[3], *p; + char ip[REDIS_IP_STR_LEN], flags[16], events[3], *p; int port = 0; /* initialized to zero for the unix socket case. 
*/ int emask; @@ -1210,7 +1210,8 @@ void clientCommand(redisClient *c) { } else if (!strcasecmp(c->argv[1]->ptr,"kill") && c->argc == 3) { listRewind(server.clients,&li); while ((ln = listNext(&li)) != NULL) { - char ip[32], addr[64]; + /* addr size 64 > INET6_ADDRSTRLEN + : + strlen("65535") */ + char ip[INET6_ADDRSTRLEN], addr[64]; int port; client = listNodeValue(ln); From 5d702e012e62ff2fa6beadc00698804c0ce4096f Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Sat, 18 Jun 2011 19:43:47 +0100 Subject: [PATCH 0070/2500] Mark places that might want changing for IPv6. Any places which I feel might want to be updated to work differently with IPv6 have been marked with a comment starting "IPV6:". Currently the only comments address places where an IP address is combined with a port using the standard : separated form. These may want to be changed when printing IPv6 addresses to wrap the address in [] such as [2001:db8::c0:ffee]:6379 instead of 2001:db8::c0:ffee:6379 as the latter format is a technically valid IPv6 address and it is hard to distinguish the IPv6 address component from the port unless you know the port is supposed to be there. --- src/cluster.c | 1 + src/networking.c | 1 + 2 files changed, 2 insertions(+) diff --git a/src/cluster.c b/src/cluster.c index b2b5ca43f..195cfe5d7 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -317,6 +317,7 @@ void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { redisLog(REDIS_VERBOSE,"Accepting cluster node: %s", server.neterr); return; } + /* IPV6: might want to wrap a v6 address in [] */ redisLog(REDIS_VERBOSE,"Accepted cluster node %s:%d", cip, cport); /* We need to create a temporary node in order to read the incoming * packet in a valid contest. 
This node will be released once we diff --git a/src/networking.c b/src/networking.c index e58db607a..b220055f1 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1216,6 +1216,7 @@ void clientCommand(redisClient *c) { client = listNodeValue(ln); if (anetPeerToString(client->fd,ip,sizeof(ip),&port) == -1) continue; + /* IPV6: might want to wrap a v6 address in [] */ snprintf(addr,sizeof(addr),"%s:%d",ip,port); if (strcmp(addr,c->argv[2]->ptr) == 0) { addReply(c,shared.ok); From 25f6d0eb6ebc72d50645ab5f29ffb20724cb7f0f Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Sat, 18 Jun 2011 19:54:05 +0100 Subject: [PATCH 0071/2500] Change anetTcpGenericConnect to use AF_UNSPEC. This allows anetTcpGenericConnect to try to connect to AF_INET6 addresses in addition to any resolved AF_INET addresses. --- src/anet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anet.c b/src/anet.c index 1996a487b..f86c52493 100644 --- a/src/anet.c +++ b/src/anet.c @@ -225,7 +225,7 @@ static int anetTcpGenericConnect(char *err, char *addr, int port, int flags) snprintf(_port,6,"%d",port); memset(&hints,0,sizeof(hints)); - hints.ai_family = AF_INET; + hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; if ((rv = getaddrinfo(addr,_port,&hints,&servinfo)) != 0) { From e65a14bd3945515efdac33cd35c36f15454417a9 Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Mon, 19 Sep 2011 23:31:41 +0100 Subject: [PATCH 0072/2500] Add static anetV6Only() function. This function sets the IPV6_V6ONLY option to 1 to use separate stack IPv6 sockets. 
--- src/anet.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/anet.c b/src/anet.c index f86c52493..bbefa2fe9 100644 --- a/src/anet.c +++ b/src/anet.c @@ -362,6 +362,16 @@ static int anetListen(char *err, int s, struct sockaddr *sa, socklen_t len) { return ANET_OK; } +static int anetV6Only(char *err, int s) { + int yes = 1; + if (setsockopt(s,IPPROTO_IPV6,IPV6_V6ONLY,&yes,sizeof(yes)) == -1) { + anetSetError(err, "setsockopt: %s", strerror(errno)); + close(s); + return ANET_ERR; + } + return ANET_OK; +} + int anetTcpServer(char *err, int port, char *bindaddr) { int s, rv; From 5998ebfa2a1a82937c4272df49c08ead0911608c Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Mon, 19 Sep 2011 23:32:41 +0100 Subject: [PATCH 0073/2500] Add anetTcp6Server() function. Refactor the common code from anetTcpServer into internal function which can be used by both anetTcpServer and anetTcp6Server. --- src/anet.c | 17 +++++++++++++++-- src/anet.h | 1 + 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/anet.c b/src/anet.c index bbefa2fe9..434d8311e 100644 --- a/src/anet.c +++ b/src/anet.c @@ -372,7 +372,7 @@ static int anetV6Only(char *err, int s) { return ANET_OK; } -int anetTcpServer(char *err, int port, char *bindaddr) +static int _anetTcpServer(char *err, int port, char *bindaddr, int af) { int s, rv; char _port[6]; /* strlen("65535") */ @@ -380,7 +380,7 @@ int anetTcpServer(char *err, int port, char *bindaddr) snprintf(_port,6,"%d",port); memset(&hints,0,sizeof(hints)); - hints.ai_family = AF_INET; + hints.ai_family = af; hints.ai_socktype = SOCK_STREAM; hints.ai_flags = AI_PASSIVE; /* No effect if bindaddr != NULL */ @@ -392,6 +392,9 @@ int anetTcpServer(char *err, int port, char *bindaddr) if ((s = socket(p->ai_family,p->ai_socktype,p->ai_protocol)) == -1) continue; + if (AF_INET6 == af && anetV6Only(err,s) == ANET_ERR) + goto error; /* could continue here? 
*/ + if (anetListen(err,s,p->ai_addr,p->ai_addrlen) == ANET_ERR) goto error; /* could continue here? */ goto end; @@ -408,6 +411,16 @@ end: return s; } +int anetTcpServer(char *err, int port, char *bindaddr) +{ + return _anetTcpServer(err, port, bindaddr, AF_INET); +} + +int anetTcp6Server(char *err, int port, char *bindaddr) +{ + return _anetTcpServer(err, port, bindaddr, AF_INET6); +} + int anetUnixServer(char *err, char *path, mode_t perm) { int s; diff --git a/src/anet.h b/src/anet.h index ccc67c634..ff5897af1 100644 --- a/src/anet.h +++ b/src/anet.h @@ -46,6 +46,7 @@ int anetUnixNonBlockConnect(char *err, char *path); int anetRead(int fd, char *buf, int count); int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len); int anetTcpServer(char *err, int port, char *bindaddr); +int anetTcp6Server(char *err, int port, char *bindaddr); int anetUnixServer(char *err, char *path, mode_t perm); int anetTcpAccept(char *err, int serversock, char *ip, size_t ip_len, int *port); int anetUnixAccept(char *err, int serversock); From e2517f47cdb069df67b6ac2a6a295c717c202305 Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Mon, 19 Sep 2011 23:41:39 +0100 Subject: [PATCH 0074/2500] Document port6 and bind6 config options. Add commented port6 and bind6 options to default redis.conf file. --- redis.conf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/redis.conf b/redis.conf index a31d014c4..a267a5855 100644 --- a/redis.conf +++ b/redis.conf @@ -23,6 +23,7 @@ pidfile /var/run/redis.pid # Accept connections on the specified port, default is 6379. # If port 0 is specified Redis will not listen on a TCP socket. port 6379 +# port6 6379 # By default Redis listens for connections from all the network interfaces # available on the server. It is possible to listen to just one or multiple @@ -33,6 +34,7 @@ port 6379 # # bind 192.168.1.100 10.0.0.1 # bind 127.0.0.1 +# bind6 ::1 # Specify the path for the unix socket that will be used to listen for # incoming connections. 
There is no default, so Redis will not listen From f0eb97325c26820babac9607b92b7f02a3b4d6c7 Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Thu, 7 Jun 2012 19:01:51 +0100 Subject: [PATCH 0075/2500] Fix calls to anetPeerToString() missing buffer size. --- src/redis.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/redis.c b/src/redis.c index 81d35852e..3b522bb10 100644 --- a/src/redis.c +++ b/src/redis.c @@ -2424,7 +2424,7 @@ sds genRedisInfoString(char *section) { while((ln = listNext(&li))) { redisClient *slave = listNodeValue(ln); char *state = NULL; - char ip[32]; + char ip[INET6_ADDRSTRLEN]; int port; long lag = 0; From d61f125b1b6609755a76ba3e4cfecd8bda93bbac Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Wed, 17 Oct 2012 23:26:30 +0100 Subject: [PATCH 0076/2500] Cleanup main() and BACKTRACE mistakenly pulled while rebasing. --- src/redis.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/redis.c b/src/redis.c index 3b522bb10..e19fbf727 100644 --- a/src/redis.c +++ b/src/redis.c @@ -2975,6 +2975,8 @@ int main(int argc, char **argv) { } if (server.ipfd_count > 0) redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port); + if (server.ip6fd > 0) + redisLog(REDIS_NOTICE,"The server is now ready to accept IPv6 connections on port %d", server.port6); if (server.sofd > 0) redisLog(REDIS_NOTICE,"The server is now ready to accept connections at %s", server.unixsocket); } From 074bc28f050235fe9fc8402452c9aa5092409306 Mon Sep 17 00:00:00 2001 From: Geoff Garside Date: Wed, 17 Oct 2012 23:45:44 +0100 Subject: [PATCH 0077/2500] Add IPv6 support to sentinel.c. This has been done by exposing the anetSockName() function from anet.c to be used when the sentinel is publishing its existence to the masters. This implementation is very unintelligent as it will likely break if used with IPv6 as the nested colons will break any parsing of the PUBLISH string by the master.
--- src/anet.h | 1 + src/sentinel.c | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/anet.h b/src/anet.h index ff5897af1..b23411cbb 100644 --- a/src/anet.h +++ b/src/anet.h @@ -57,5 +57,6 @@ int anetDisableTcpNoDelay(char *err, int fd); int anetTcpKeepAlive(char *err, int fd); int anetPeerToString(int fd, char *ip, size_t ip_len, int *port); int anetKeepAlive(char *err, int fd, int interval); +int anetSockName(int fd, char *ip, size_t ip_len, int *port); #endif diff --git a/src/sentinel.c b/src/sentinel.c index cb7008727..e89199fec 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1837,14 +1837,13 @@ void sentinelPingInstance(sentinelRedisInstance *ri) { (now - ri->last_pub_time) > SENTINEL_PUBLISH_PERIOD) { /* PUBLISH hello messages only to masters. */ - struct sockaddr_in sa; - socklen_t salen = sizeof(sa); - - if (getsockname(ri->cc->c.fd,(struct sockaddr*)&sa,&salen) != -1) { + char ip[INET6_ADDRSTRLEN]; + if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) != -1) { char myaddr[128]; + // FIXME: IPv6 will break this due to nested : characters -geoffgarside snprintf(myaddr,sizeof(myaddr),"%s:%d:%s:%d", - inet_ntoa(sa.sin_addr), server.port, server.runid, + ip, server.port, server.runid, (ri->flags & SRI_CAN_FAILOVER) != 0); retval = redisAsyncCommand(ri->cc, sentinelPublishReplyCallback, NULL, "PUBLISH %s %s", From 41cff1e8dd4854dcbe9d94a11b2e03764d749804 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 8 Jul 2013 16:08:43 +0200 Subject: [PATCH 0078/2500] Revert "Document port6 and bind6 config options." IPv6 support is not going to use IPv6 specific options, just it will be possible to specify all the ipv4 / ipv6 addresses of the interfaces to bind, otherwise connections will be accepted from all the interfaces in both IPv4 and IPv6 addresses. This reverts commit e2517f47cdb069df67b6ac2a6a295c717c202305. 
--- redis.conf | 2 -- 1 file changed, 2 deletions(-) diff --git a/redis.conf b/redis.conf index a267a5855..a31d014c4 100644 --- a/redis.conf +++ b/redis.conf @@ -23,7 +23,6 @@ pidfile /var/run/redis.pid # Accept connections on the specified port, default is 6379. # If port 0 is specified Redis will not listen on a TCP socket. port 6379 -# port6 6379 # By default Redis listens for connections from all the network interfaces # available on the server. It is possible to listen to just one or multiple @@ -34,7 +33,6 @@ port 6379 # # bind 192.168.1.100 10.0.0.1 # bind 127.0.0.1 -# bind6 ::1 # Specify the path for the unix socket that will be used to listen for # incoming connections. There is no default, so Redis will not listen From 2fa66d5e76d68c01ee14f217164828e5f8daca95 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 8 Jul 2013 16:11:52 +0200 Subject: [PATCH 0079/2500] Fix old anetPeerToString() API call in replication.c --- src/redis.c | 2 -- src/replication.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/redis.c b/src/redis.c index e19fbf727..3b522bb10 100644 --- a/src/redis.c +++ b/src/redis.c @@ -2975,8 +2975,6 @@ int main(int argc, char **argv) { } if (server.ipfd_count > 0) redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port); - if (server.ip6fd > 0) - redisLog(REDIS_NOTICE,"The server is now ready to accept IPv6 connections on port %d", server.port6); if (server.sofd > 0) redisLog(REDIS_NOTICE,"The server is now ready to accept connections at %s", server.unixsocket); } diff --git a/src/replication.c b/src/replication.c index 4343f6193..4dce2cdcb 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1614,10 +1614,10 @@ void replicationCron(void) { if (slave->flags & REDIS_PRE_PSYNC_SLAVE) continue; if ((server.unixtime - slave->repl_ack_time) > server.repl_timeout) { - char ip[32]; + char ip[REDIS_IP_STR_LEN]; int port; - if (anetPeerToString(slave->fd,ip,&port) != -1) { + if 
(anetPeerToString(slave->fd,ip,sizeof(ip),&port) != -1) { redisLog(REDIS_WARNING, "Disconnecting timedout slave: %s:%d", ip, slave->slave_listening_port); From ef8ca939209824a0af0b9a262e44ea540b24cd56 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 9 Jul 2013 10:47:17 +0200 Subject: [PATCH 0080/2500] IPv6: bind IPv4 and IPv6 interfaces by default. --- src/anet.c | 2 +- src/redis.c | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/anet.c b/src/anet.c index 434d8311e..bf8c92549 100644 --- a/src/anet.c +++ b/src/anet.c @@ -392,7 +392,7 @@ static int _anetTcpServer(char *err, int port, char *bindaddr, int af) if ((s = socket(p->ai_family,p->ai_socktype,p->ai_protocol)) == -1) continue; - if (AF_INET6 == af && anetV6Only(err,s) == ANET_ERR) + if (af == AF_INET6 && anetV6Only(err,s) == ANET_ERR) goto error; /* could continue here? */ if (anetListen(err,s,p->ai_addr,p->ai_addrlen) == ANET_ERR) diff --git a/src/redis.c b/src/redis.c index 3b522bb10..a2bcdd6d0 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1431,7 +1431,18 @@ void initServer() { * entering the loop if j == 0. */ if (server.bindaddr_count == 0) server.bindaddr[0] = NULL; for (j = 0; j < server.bindaddr_count || j == 0; j++) { - server.ipfd[server.ipfd_count] = anetTcpServer(server.neterr,server.port,server.bindaddr[j]); + if (server.bindaddr[j] == NULL) { + /* Bind * for both IPv6 and IPv4. */ + server.ipfd[0] = anetTcp6Server(server.neterr,server.port,NULL); + if (server.ipfd[0] != ANET_ERR) server.ipfd_count++; + server.ipfd[1] = anetTcpServer(server.neterr,server.port,NULL); + } else if (strchr(server.bindaddr[j],':')) { + /* Bind IPv6 address. */ + server.ipfd[server.ipfd_count] = anetTcp6Server(server.neterr,server.port,server.bindaddr[j]); + } else { + /* Bind IPv4 address. 
*/ + server.ipfd[server.ipfd_count] = anetTcpServer(server.neterr,server.port,server.bindaddr[j]); + } if (server.ipfd[server.ipfd_count] == ANET_ERR) { redisLog(REDIS_WARNING, "Creating Server TCP listening socket %s:%d: %s", From e4d2e6fc9d91439782dd77436c1d96724ccec005 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 9 Jul 2013 11:32:52 +0200 Subject: [PATCH 0081/2500] All IP string repr buffers are now REDIS_IP_STR_LEN bytes. --- src/cluster.c | 2 +- src/networking.c | 5 ++--- src/redis.c | 2 +- src/sentinel.c | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 195cfe5d7..76ab67112 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -306,7 +306,7 @@ void freeClusterLink(clusterLink *link) { void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { int cport, cfd; - char cip[128]; /* Could use INET6_ADDRSTRLEN here, but its smaller */ + char cip[REDIS_IP_STR_LEN]; clusterLink *link; REDIS_NOTUSED(el); REDIS_NOTUSED(mask); diff --git a/src/networking.c b/src/networking.c index b220055f1..59f056436 100644 --- a/src/networking.c +++ b/src/networking.c @@ -552,7 +552,7 @@ static void acceptCommonHandler(int fd, int flags) { void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask) { int cport, cfd; - char cip[128]; /* Could use INET6_ADDRSTRLEN here, but its smaller */ + char cip[REDIS_IP_STR_LEN]; REDIS_NOTUSED(el); REDIS_NOTUSED(mask); REDIS_NOTUSED(privdata); @@ -1210,8 +1210,7 @@ void clientCommand(redisClient *c) { } else if (!strcasecmp(c->argv[1]->ptr,"kill") && c->argc == 3) { listRewind(server.clients,&li); while ((ln = listNext(&li)) != NULL) { - /* addr size 64 > INET6_ADDRSTRLEN + : + strlen("65535") */ - char ip[INET6_ADDRSTRLEN], addr[64]; + char ip[REDIS_IP_STR_LEN], addr[REDIS_IP_STR_LEN+64]; int port; client = listNodeValue(ln); diff --git a/src/redis.c b/src/redis.c index a2bcdd6d0..11c69629e 100644 --- a/src/redis.c +++ b/src/redis.c @@ -2435,7 +2435,7 @@ sds 
genRedisInfoString(char *section) { while((ln = listNext(&li))) { redisClient *slave = listNodeValue(ln); char *state = NULL; - char ip[INET6_ADDRSTRLEN]; + char ip[REDIS_IP_STR_LEN]; int port; long lag = 0; diff --git a/src/sentinel.c b/src/sentinel.c index e89199fec..eb729966a 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1837,7 +1837,7 @@ void sentinelPingInstance(sentinelRedisInstance *ri) { (now - ri->last_pub_time) > SENTINEL_PUBLISH_PERIOD) { /* PUBLISH hello messages only to masters. */ - char ip[INET6_ADDRSTRLEN]; + char ip[REDIS_IP_STR_LEN]; if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) != -1) { char myaddr[128]; From 800e918c7fd2090d7bbeac5c58e0c5f0a977cbc9 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 9 Jul 2013 12:49:20 +0200 Subject: [PATCH 0082/2500] getClientPeerID introduced. The function returns an unique identifier for the client, as ip:port for IPv4 and IPv6 clients, or as path:0 for Unix socket clients. See the top comment in the function for more info. --- src/networking.c | 41 ++++++++++++++++++++++++++++++++++------- src/redis.h | 2 ++ 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/src/networking.c b/src/networking.c index 59f056436..458f4f398 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1126,14 +1126,42 @@ void getClientsMaxBuffers(unsigned long *longest_output_list, *biggest_input_buffer = bib; } +/* A Redis "Peer ID" is a colon separated ip:port pair. + * For IPv4 it's in the form x.y.z.k:pork, example: "127.0.0.1:1234". + * For IPv6 addresses we use [] around the IP part, like in "[::1]:1234". + * For Unix socekts we use path:0, like in "/tmp/redis:0". + * + * A Peer ID always fits inside a buffer of REDIS_PEER_ID_LEN bytes, including + * the null term. + * + * The function is always successful, but if the IP or port can't be extracted + * for some reason, "?" and "0" are used (this is the semantics of + * anetPeerToString() from anet.c). In practical terms this should never + * happen. 
*/ +void getClientPeerId(redisClient *client, char *peerid, size_t peerid_len) { + char ip[REDIS_IP_STR_LEN]; + int port; + + if (client->flags & REDIS_UNIX_SOCKET) { + /* Unix socket client. */ + snprintf(peerid,peerid_len,"%s:0",server.unixsocket); + return; + } else { + /* TCP client. */ + anetPeerToString(client->fd,ip,sizeof(ip),&port); + if (strchr(ip,':')) + snprintf(peerid,peerid_len,"[%s]:%d",ip,port); + else + snprintf(peerid,peerid_len,"%s:%d",ip,port); + } +} + /* Turn a Redis client into an sds string representing its state. */ sds getClientInfoString(redisClient *client) { - char ip[REDIS_IP_STR_LEN], flags[16], events[3], *p; - int port = 0; /* initialized to zero for the unix socket case. */ + char peerid[REDIS_PEER_ID_LEN], flags[16], events[3], *p; int emask; - if (!(client->flags & REDIS_UNIX_SOCKET)) - anetPeerToString(client->fd,ip,sizeof(ip),&port); + getClientPeerId(client,peerid,sizeof(peerid)); p = flags; if (client->flags & REDIS_SLAVE) { if (client->flags & REDIS_MONITOR) @@ -1158,9 +1186,8 @@ sds getClientInfoString(redisClient *client) { if (emask & AE_WRITABLE) *p++ = 'w'; *p = '\0'; return sdscatprintf(sdsempty(), - "addr=%s:%d fd=%d name=%s age=%ld idle=%ld flags=%s db=%d sub=%d psub=%d multi=%d qbuf=%lu qbuf-free=%lu obl=%lu oll=%lu omem=%lu events=%s cmd=%s", - (client->flags & REDIS_UNIX_SOCKET) ? server.unixsocket : ip, - port, + "addr=%s fd=%d name=%s age=%ld idle=%ld flags=%s db=%d sub=%d psub=%d multi=%d qbuf=%lu qbuf-free=%lu obl=%lu oll=%lu omem=%lu events=%s cmd=%s", + peerid, client->fd, client->name ? 
(char*)client->name->ptr : "", (long)(server.unixtime - client->ctime), diff --git a/src/redis.h b/src/redis.h index 808bb9211..901969d0d 100644 --- a/src/redis.h +++ b/src/redis.h @@ -121,6 +121,7 @@ #define REDIS_DEFAULT_MIN_SLAVES_TO_WRITE 0 #define REDIS_DEFAULT_MIN_SLAVES_MAX_LAG 10 #define REDIS_IP_STR_LEN INET6_ADDRSTRLEN +#define REDIS_PEER_ID_LEN (REDIS_IP_STR_LEN+32) /* Must be enough for ip:port */ #define REDIS_BINDADDR_MAX 16 /* Protocol and I/O related defines */ @@ -1072,6 +1073,7 @@ void copyClientOutputBuffer(redisClient *dst, redisClient *src); void *dupClientReplyValue(void *o); void getClientsMaxBuffers(unsigned long *longest_output_list, unsigned long *biggest_input_buffer); +void getClientPeerId(redisClient *client, char *peerid, size_t peerid_len); sds getClientInfoString(redisClient *client); sds getAllClientsInfoString(void); void rewriteClientCommandVector(redisClient *c, int argc, ...); From f309368ec2355af92f574070031c395ae93cea02 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 9 Jul 2013 15:28:30 +0200 Subject: [PATCH 0083/2500] getClientPeerId() now reports errors. We now also use it in CLIENT KILL implementation. --- src/networking.c | 26 +++++++++++++------------- src/redis.h | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/networking.c b/src/networking.c index 458f4f398..fff18ab4a 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1134,25 +1134,27 @@ void getClientsMaxBuffers(unsigned long *longest_output_list, * A Peer ID always fits inside a buffer of REDIS_PEER_ID_LEN bytes, including * the null term. * - * The function is always successful, but if the IP or port can't be extracted - * for some reason, "?" and "0" are used (this is the semantics of - * anetPeerToString() from anet.c). In practical terms this should never - * happen. */ -void getClientPeerId(redisClient *client, char *peerid, size_t peerid_len) { + * The function returns REDIS_OK on succcess, and REDIS_ERR on failure. 
+ * + * On failure the function still populates 'peerid' with the "?:0" string + * in case you want to relax error checking or need to display something + * anyway (see anetPeerToString implementation for more info). */ +int getClientPeerId(redisClient *client, char *peerid, size_t peerid_len) { char ip[REDIS_IP_STR_LEN]; int port; if (client->flags & REDIS_UNIX_SOCKET) { /* Unix socket client. */ snprintf(peerid,peerid_len,"%s:0",server.unixsocket); - return; + return REDIS_OK; } else { /* TCP client. */ - anetPeerToString(client->fd,ip,sizeof(ip),&port); + int retval = anetPeerToString(client->fd,ip,sizeof(ip),&port); if (strchr(ip,':')) snprintf(peerid,peerid_len,"[%s]:%d",ip,port); else snprintf(peerid,peerid_len,"%s:%d",ip,port); + return (retval == -1) ? REDIS_ERR : REDIS_OK; } } @@ -1237,14 +1239,12 @@ void clientCommand(redisClient *c) { } else if (!strcasecmp(c->argv[1]->ptr,"kill") && c->argc == 3) { listRewind(server.clients,&li); while ((ln = listNext(&li)) != NULL) { - char ip[REDIS_IP_STR_LEN], addr[REDIS_IP_STR_LEN+64]; - int port; + char peerid[REDIS_PEER_ID_LEN]; client = listNodeValue(ln); - if (anetPeerToString(client->fd,ip,sizeof(ip),&port) == -1) continue; - /* IPV6: might want to wrap a v6 address in [] */ - snprintf(addr,sizeof(addr),"%s:%d",ip,port); - if (strcmp(addr,c->argv[2]->ptr) == 0) { + if (getClientPeerId(client,peerid,sizeof(peerid)) == REDIS_ERR) + continue; + if (strcmp(peerid,c->argv[2]->ptr) == 0) { addReply(c,shared.ok); if (c == client) { client->flags |= REDIS_CLOSE_AFTER_REPLY; diff --git a/src/redis.h b/src/redis.h index 901969d0d..043b0bac9 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1073,7 +1073,7 @@ void copyClientOutputBuffer(redisClient *dst, redisClient *src); void *dupClientReplyValue(void *o); void getClientsMaxBuffers(unsigned long *longest_output_list, unsigned long *biggest_input_buffer); -void getClientPeerId(redisClient *client, char *peerid, size_t peerid_len); +int getClientPeerId(redisClient *client, 
char *peerid, size_t peerid_len); sds getClientInfoString(redisClient *client); sds getAllClientsInfoString(void); void rewriteClientCommandVector(redisClient *c, int argc, ...); From 4c2998e8c4d4098a88a10a76e846d5bc1c92c40f Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 9 Jul 2013 15:46:34 +0200 Subject: [PATCH 0084/2500] getClientPeerId() refactored into two functions. --- src/networking.c | 16 ++++++++++++---- src/redis.h | 1 + 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/networking.c b/src/networking.c index fff18ab4a..77c37af87 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1126,6 +1126,17 @@ void getClientsMaxBuffers(unsigned long *longest_output_list, *biggest_input_buffer = bib; } +/* This is an helper function for getClientPeerId(). + * It writes the specified ip/port to "peerid" as a null termiated string + * in the form ip:port if ip does not contain ":" itself, otherwise + * [ip]:port format is used (for IPv6 addresses basically). */ +void formatPeerId(char *peerid, size_t peerid_len, char *ip, int port) { + if (strchr(ip,':')) + snprintf(peerid,peerid_len,"[%s]:%d",ip,port); + else + snprintf(peerid,peerid_len,"%s:%d",ip,port); +} + /* A Redis "Peer ID" is a colon separated ip:port pair. * For IPv4 it's in the form x.y.z.k:pork, example: "127.0.0.1:1234". * For IPv6 addresses we use [] around the IP part, like in "[::1]:1234". @@ -1150,10 +1161,7 @@ int getClientPeerId(redisClient *client, char *peerid, size_t peerid_len) { } else { /* TCP client. */ int retval = anetPeerToString(client->fd,ip,sizeof(ip),&port); - if (strchr(ip,':')) - snprintf(peerid,peerid_len,"[%s]:%d",ip,port); - else - snprintf(peerid,peerid_len,"%s:%d",ip,port); + formatPeerId(peerid,peerid_len,ip,port); return (retval == -1) ? 
REDIS_ERR : REDIS_OK; } } diff --git a/src/redis.h b/src/redis.h index 043b0bac9..f3ef3fbfd 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1073,6 +1073,7 @@ void copyClientOutputBuffer(redisClient *dst, redisClient *src); void *dupClientReplyValue(void *o); void getClientsMaxBuffers(unsigned long *longest_output_list, unsigned long *biggest_input_buffer); +void formatPeerId(char *peerid, size_t peerid_len, char *ip, int port); int getClientPeerId(redisClient *client, char *peerid, size_t peerid_len); sds getClientInfoString(redisClient *client); sds getAllClientsInfoString(void); From 80993d9892b7ba44e5883e10d33f1caca09db7b2 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 9 Jul 2013 16:21:21 +0200 Subject: [PATCH 0085/2500] Use getClientPeerId() for MONITOR implementation. --- src/replication.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/replication.c b/src/replication.c index 4dce2cdcb..f82c32a8f 100644 --- a/src/replication.c +++ b/src/replication.c @@ -290,10 +290,10 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { void replicationFeedMonitors(redisClient *c, list *monitors, int dictid, robj **argv, int argc) { listNode *ln; listIter li; - int j, port; + int j; sds cmdrepr = sdsnew("+"); robj *cmdobj; - char ip[REDIS_IP_STR_LEN]; + char peerid[REDIS_PEER_ID_LEN]; struct timeval tv; gettimeofday(&tv,NULL); @@ -303,8 +303,8 @@ void replicationFeedMonitors(redisClient *c, list *monitors, int dictid, robj ** } else if (c->flags & REDIS_UNIX_SOCKET) { cmdrepr = sdscatprintf(cmdrepr,"[%d unix:%s] ",dictid,server.unixsocket); } else { - anetPeerToString(c->fd,ip,sizeof(ip),&port); - cmdrepr = sdscatprintf(cmdrepr,"[%d %s:%d] ",dictid,ip,port); + getClientPeerId(c,peerid,sizeof(peerid)); + cmdrepr = sdscatprintf(cmdrepr,"[%d %s] ",dictid,peerid); } for (j = 0; j < argc; j++) { From 40257ef32d7b78971e2d433e92346b626055276b Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 10 Jul 2013 14:34:58 +0200 
Subject: [PATCH 0086/2500] anet.c: use SO_REUSEADDR when creating listening sockets. It used to be ok, but the socket option was removed when adding IPv6 support. --- src/anet.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/anet.c b/src/anet.c index bf8c92549..3726b9f67 100644 --- a/src/anet.c +++ b/src/anet.c @@ -392,11 +392,9 @@ static int _anetTcpServer(char *err, int port, char *bindaddr, int af) if ((s = socket(p->ai_family,p->ai_socktype,p->ai_protocol)) == -1) continue; - if (af == AF_INET6 && anetV6Only(err,s) == ANET_ERR) - goto error; /* could continue here? */ - - if (anetListen(err,s,p->ai_addr,p->ai_addrlen) == ANET_ERR) - goto error; /* could continue here? */ + if (af == AF_INET6 && anetV6Only(err,s) == ANET_ERR) goto error; + if (anetSetReuseAddr(err,s) == ANET_ERR) goto error; + if (anetListen(err,s,p->ai_addr,p->ai_addrlen) == ANET_ERR) goto error; goto end; } if (p == NULL) { From 6130dc4fdecbc1cdd3f2c5ae47c8e82d6af13827 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 10 Jul 2013 14:37:13 +0200 Subject: [PATCH 0087/2500] anet.c: save some vertical space. 
--- src/anet.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/anet.c b/src/anet.c index 3726b9f67..257b491e9 100644 --- a/src/anet.c +++ b/src/anet.c @@ -237,18 +237,11 @@ static int anetTcpGenericConnect(char *err, char *addr, int port, int flags) continue; /* if we set err then goto cleanup, otherwise next */ - if (anetSetReuseAddr(err,s) == ANET_ERR) { + if (anetSetReuseAddr(err,s) == ANET_ERR) goto error; + if (flags & ANET_CONNECT_NONBLOCK && anetNonBlock(err,s) != ANET_OK) goto error; - } - if (flags & ANET_CONNECT_NONBLOCK) { - if (anetNonBlock(err,s) != ANET_OK) - goto error; - } if (connect(s,p->ai_addr,p->ai_addrlen) == -1) { - if (errno == EINPROGRESS && - flags & ANET_CONNECT_NONBLOCK) - goto end; - + if (errno == EINPROGRESS && flags & ANET_CONNECT_NONBLOCK) goto end; close(s); continue; } From 89694bfbd55acd073e438245e2ef592490d1e810 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 10 Jul 2013 14:44:38 +0200 Subject: [PATCH 0088/2500] Sentinel: make sure published addr/id buffer is large enough. With ipv6 support we need more space, so we account for the IP address max size plus what we need for the Run ID, port, flags. --- src/sentinel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index eb729966a..594b7e498 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1839,7 +1839,7 @@ void sentinelPingInstance(sentinelRedisInstance *ri) { /* PUBLISH hello messages only to masters. */ char ip[REDIS_IP_STR_LEN]; if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) != -1) { - char myaddr[128]; + char myaddr[REDIS_IP_STR_LEN+128]; // FIXME: IPv6 will break this due to nested : characters -geoffgarside snprintf(myaddr,sizeof(myaddr),"%s:%d:%s:%d", From c3c836d3f9ebc190ad943f5b1719bb0d178424ec Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 11 Jul 2013 13:11:27 +0200 Subject: [PATCH 0089/2500] hiredis: minimal IPv6 support. 
--- deps/hiredis/net.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/deps/hiredis/net.c b/deps/hiredis/net.c index 82ab2b468..b10eee2e5 100644 --- a/deps/hiredis/net.c +++ b/deps/hiredis/net.c @@ -215,9 +215,17 @@ int redisContextConnectTcp(redisContext *c, const char *addr, int port, struct t hints.ai_family = AF_INET; hints.ai_socktype = SOCK_STREAM; + /* Try with IPv6 if no IPv4 address was found. We do it in this order since + * in a Redis client you can't afford to test if you have IPv6 connectivity + * as this would add latency to every connect. Otherwise a more sensible + * route could be: Use IPv6 if both addresses are available and there is IPv6 + * connectivity. */ if ((rv = getaddrinfo(addr,_port,&hints,&servinfo)) != 0) { - __redisSetError(c,REDIS_ERR_OTHER,gai_strerror(rv)); - return REDIS_ERR; + hints.ai_family = AF_INET6; + if ((rv = getaddrinfo(addr,_port,&hints,&servinfo)) != 0) { + __redisSetError(c,REDIS_ERR_OTHER,gai_strerror(rv)); + return REDIS_ERR; + } } for (p = servinfo; p != NULL; p = p->ai_next) { if ((s = socket(p->ai_family,p->ai_socktype,p->ai_protocol)) == -1) From ca6624668cb88085d143599d2cacd0c2bd49286c Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 11 Jul 2013 16:31:39 +0200 Subject: [PATCH 0090/2500] Sentinel: use comma as separator to publish hello messages. We use comma to play well with IPv6 addresses, but the implementation is still able to parse the old messages separated by colons. --- src/sentinel.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 594b7e498..62473b5c1 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1741,9 +1741,13 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd { int numtokens, port, removed, canfailover; + /* Separator changed from ":" to "," in recent versions in order to + * play well with IPv6 addresses. 
For now we make sure to parse both + * correctly detecting if there is "," inside the string. */ + char *sep = strchr(r->element[2]->str,',') ? "," : ":"; char **token = sdssplitlen(r->element[2]->str, r->element[2]->len, - ":",1,&numtokens); + sep,1,&numtokens); sentinelRedisInstance *sentinel; if (numtokens == 4) { @@ -1841,8 +1845,7 @@ void sentinelPingInstance(sentinelRedisInstance *ri) { if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) != -1) { char myaddr[REDIS_IP_STR_LEN+128]; - // FIXME: IPv6 will break this due to nested : characters -geoffgarside - snprintf(myaddr,sizeof(myaddr),"%s:%d:%s:%d", + snprintf(myaddr,sizeof(myaddr),"%s,%d,%s,%d", ip, server.port, server.runid, (ri->flags & SRI_CAN_FAILOVER) != 0); retval = redisAsyncCommand(ri->cc, From 20c325a9a76c122a205868fcedcd5c6430b23bfb Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 11 Jul 2013 16:38:30 +0200 Subject: [PATCH 0091/2500] Sentinel: embed IPv6 address into [] when naming slave/sentinel instance. --- src/sentinel.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 62473b5c1..659de29b8 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -835,7 +835,9 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * /* For slaves and sentinel we use ip:port as name. */ if (flags & (SRI_SLAVE|SRI_SENTINEL)) { - snprintf(slavename,sizeof(slavename),"%s:%d",hostname,port); + snprintf(slavename,sizeof(slavename), + strchr(hostname,':') ? "[%s]:%d" : "%s:%d", + hostname,port); name = slavename; } @@ -943,7 +945,9 @@ sentinelRedisInstance *sentinelRedisInstanceLookupSlave( sentinelRedisInstance *slave; redisAssert(ri->flags & SRI_MASTER); - key = sdscatprintf(sdsempty(),"%s:%d",ip,port); + key = sdscatprintf(sdsempty(), + strchr(ip,':') ? 
"[%s]:%d" : "%s:%d", + ip,port); slave = dictFetchValue(ri->slaves,key); sdsfree(key); return slave; From b08b0beb0477286af1e1ab84b3c6a9234fef6b62 Mon Sep 17 00:00:00 2001 From: Jan-Erik Rediger Date: Thu, 11 Jul 2013 17:47:55 +0200 Subject: [PATCH 0092/2500] Wrap IPv6 in brackets in the prompt. --- src/redis-cli.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/redis-cli.c b/src/redis-cli.c index d52076f0b..b4ce644f7 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -116,7 +116,8 @@ static void cliRefreshPrompt(void) { len = snprintf(config.prompt,sizeof(config.prompt),"redis %s", config.hostsocket); else - len = snprintf(config.prompt,sizeof(config.prompt),"redis %s:%d", + len = snprintf(config.prompt,sizeof(config.prompt), + strchr(config.hostip,':') ? "[%s]:%d" : "%s:%d", config.hostip, config.hostport); /* Add [dbnum] if needed */ if (config.dbnum != 0) From 759eae01dcf22c54cd02184860e0510cbd52d32f Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 12 Jul 2013 11:56:52 +0200 Subject: [PATCH 0093/2500] Fixed compareStringObject() and introduced collateStringObject(). compareStringObject was not always giving the same result when comparing two exact strings, but encoded as integers or as sds strings, since it switched to strcmp() when at least one of the strings were not sds encoded. For instance the two strings "123" and "123\x00456", where the first string was integer encoded, would result into the old implementation of compareStringObject() to return 0 as if the strings were equal, while instead the second string is "greater" than the first in a binary comparison. The same compasion, but with "123" encoded as sds string, would instead return a value < 0, as it is correct. It is not impossible that the above caused some obscure bug, since the comparison was not always deterministic, and compareStringObject() is used in the implementation of skiplists, hash tables, and so forth. 
At the same time, collateStringObject() was introduced by this commit, so that can be used by SORT command to return sorted strings usign collation instead of binary comparison. See next commit. --- src/object.c | 41 +++++++++++++++++++++++++++++++++-------- src/redis.h | 1 + 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/src/object.c b/src/object.c index 2554656a3..58668da5b 100644 --- a/src/object.c +++ b/src/object.c @@ -332,35 +332,60 @@ robj *getDecodedObject(robj *o) { } } -/* Compare two string objects via strcmp() or alike. +/* Compare two string objects via strcmp() or strcoll() depending on flags. * Note that the objects may be integer-encoded. In such a case we * use ll2string() to get a string representation of the numbers on the stack * and compare the strings, it's much faster than calling getDecodedObject(). * - * Important note: if objects are not integer encoded, but binary-safe strings, - * sdscmp() from sds.c will apply memcmp() so this function ca be considered - * binary safe. */ -int compareStringObjects(robj *a, robj *b) { + * Important note: when REDIS_COMPARE_BINARY is used a binary-safe comparison + * is used. */ + +#define REDIS_COMPARE_BINARY (1<<0) +#define REDIS_COMPARE_COLL (1<<1) + +int compareStringObjectsWithFlags(robj *a, robj *b, int flags) { redisAssertWithInfo(NULL,a,a->type == REDIS_STRING && b->type == REDIS_STRING); char bufa[128], bufb[128], *astr, *bstr; + size_t alen, blen, minlen; int bothsds = 1; if (a == b) return 0; if (a->encoding != REDIS_ENCODING_RAW) { - ll2string(bufa,sizeof(bufa),(long) a->ptr); + alen = ll2string(bufa,sizeof(bufa),(long) a->ptr); astr = bufa; bothsds = 0; } else { astr = a->ptr; + alen = sdslen(astr); } if (b->encoding != REDIS_ENCODING_RAW) { - ll2string(bufb,sizeof(bufb),(long) b->ptr); + blen = ll2string(bufb,sizeof(bufb),(long) b->ptr); bstr = bufb; bothsds = 0; } else { bstr = b->ptr; + blen = sdslen(bstr); } - return bothsds ? 
sdscmp(astr,bstr) : strcmp(astr,bstr); + if (flags & REDIS_COMPARE_COLL) { + return strcoll(astr,bstr); + } else { + int cmp; + + minlen = (alen < blen) ? alen : blen; + cmp = memcmp(astr,bstr,minlen); + if (cmp == 0) return alen-blen; + return cmp; + } +} + +/* Wrapper for compareStringObjectsWithFlags() using binary comparison. */ +int compareStringObjects(robj *a, robj *b) { + return compareStringObjectsWithFlags(a,b,REDIS_COMPARE_BINARY); +} + +/* Wrapper for compareStringObjectsWithFlags() using collation. */ +int collateStringObjects(robj *a, robj *b) { + return compareStringObjectsWithFlags(a,b,REDIS_COMPARE_COLL); } /* Equal string objects return 1 if the two objects are the same from the diff --git a/src/redis.h b/src/redis.h index f3ef3fbfd..0fb8e3028 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1159,6 +1159,7 @@ int getLongDoubleFromObject(robj *o, long double *target); int getLongDoubleFromObjectOrReply(redisClient *c, robj *o, long double *target, const char *msg); char *strEncoding(int encoding); int compareStringObjects(robj *a, robj *b); +int collateStringObjects(robj *a, robj *b); int equalStringObjects(robj *a, robj *b); unsigned long estimateObjectIdleTime(robj *o); From 18dc48aeab291a1bd210a38990f67557070972a7 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 12 Jul 2013 12:02:36 +0200 Subject: [PATCH 0094/2500] SORT ALPHA: use collation instead of binary comparison. Note that we only do it when STORE is not used, otherwise we want an absolutely locale independent and binary safe sorting in order to ensure AOF / replication consistency. This is probably an unexpected behavior violating the least surprise rule, but there is currently no other simple / good alternative. 
--- src/redis.h | 1 + src/sort.c | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/redis.h b/src/redis.h index 0fb8e3028..b067fdf45 100644 --- a/src/redis.h +++ b/src/redis.h @@ -891,6 +891,7 @@ struct redisServer { int sort_desc; int sort_alpha; int sort_bypattern; + int sort_store; /* Zip structure config, see redis.conf for more information */ size_t hash_max_ziplist_entries; size_t hash_max_ziplist_value; diff --git a/src/sort.c b/src/sort.c index 4b5040250..a4b062645 100644 --- a/src/sort.c +++ b/src/sort.c @@ -163,12 +163,22 @@ int sortCompare(const void *s1, const void *s2) { else cmp = 1; } else { - /* We have both the objects, use strcoll */ - cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr); + /* We have both the objects, compare them. */ + if (server.sort_store) { + cmp = compareStringObjects(so1->u.cmpobj,so2->u.cmpobj); + } else { + /* Here we can use strcoll() directly as we are sure that + * the objects are decoded string objects. */ + cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr); + } } } else { /* Compare elements directly. */ - cmp = compareStringObjects(so1->obj,so2->obj); + if (server.sort_store) { + cmp = compareStringObjects(so1->obj,so2->obj); + } else { + cmp = collateStringObjects(so1->obj,so2->obj); + } } } return server.sort_desc ? -cmp : cmp; @@ -432,6 +442,7 @@ void sortCommand(redisClient *c) { server.sort_desc = desc; server.sort_alpha = alpha; server.sort_bypattern = sortby ? 1 : 0; + server.sort_store = storekey ? 1 : 0; if (sortby && (start != 0 || end != vectorlen-1)) pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end); else From 912470cd000a914e62d47a298cfc09e3be91d32b Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 12 Jul 2013 12:06:05 +0200 Subject: [PATCH 0095/2500] Use the environment locale for strcoll() collation. 
--- src/redis.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/redis.c b/src/redis.c index 11c69629e..714d9b65f 100644 --- a/src/redis.c +++ b/src/redis.c @@ -49,6 +49,7 @@ #include #include #include +#include /* Our shared "common" objects */ @@ -2899,6 +2900,7 @@ int main(int argc, char **argv) { #ifdef INIT_SETPROCTITLE_REPLACEMENT spt_init(argc, argv); #endif + setlocale(LC_COLLATE,""); zmalloc_enable_thread_safeness(); zmalloc_set_oom_handler(redisOutOfMemoryHandler); srand(time(NULL)^getpid()); From efaa9d0bc46cdb86a559a53d4179d021fbe2c63f Mon Sep 17 00:00:00 2001 From: Ted Nyman Date: Fri, 12 Jul 2013 14:06:27 -0700 Subject: [PATCH 0096/2500] Make sure the log standardizes on 'timeout' --- src/replication.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/replication.c b/src/replication.c index f82c32a8f..8b4c1b7c5 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1554,7 +1554,7 @@ void replicationCron(void) { if (server.masterhost && server.repl_state == REDIS_REPL_CONNECTED && (time(NULL)-server.master->lastinteraction) > server.repl_timeout) { - redisLog(REDIS_WARNING,"MASTER time out: no data nor PING received..."); + redisLog(REDIS_WARNING,"MASTER timeout: no data nor PING received..."); freeClient(server.master); } From d4d57e000c3c186a7bb8542beab3e4e9193f30d3 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 16 Jul 2013 15:05:13 +0200 Subject: [PATCH 0097/2500] Make sure that ZADD can accept the full range of double values. This fixes issue #1194, that contains many details. 
However in short, it was possible for ZADD to not accept as score values that was however possible to obtain with multiple calls to ZINCRBY, like in the following example: redis 127.0.0.1:6379> zadd k 2.5e-308 m (integer) 1 redis 127.0.0.1:6379> zincrby k -2.4e-308 m "9.9999999999999694e-310" redis 127.0.0.1:6379> zscore k m "9.9999999999999694e-310" redis 127.0.0.1:6379> zadd k 9.9999999999999694e-310 m1 (error) ERR value is not a valid float The problem was due to strtod() returning ERANGE in the following case specified by POSIX: "If the correct value would cause an underflow, a value whose magnitude is no greater than the smallest normalized positive number in the return type shall be returned and errno set to [ERANGE].". Now instead the returned value is accepted even when ERANGE is returned as long as the return value of the function is not negative or positive HUGE_VAL or zero. --- src/object.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/object.c b/src/object.c index 58668da5b..472d4a34b 100644 --- a/src/object.c +++ b/src/object.c @@ -422,8 +422,12 @@ int getDoubleFromObject(robj *o, double *target) { if (o->encoding == REDIS_ENCODING_RAW) { errno = 0; value = strtod(o->ptr, &eptr); - if (isspace(((char*)o->ptr)[0]) || eptr[0] != '\0' || - errno == ERANGE || isnan(value)) + if (isspace(((char*)o->ptr)[0]) || + eptr[0] != '\0' || + (errno == ERANGE && + (value == HUGE_VAL || value == -HUGE_VAL || value == 0)) || + errno == EINVAL || + isnan(value)) return REDIS_ERR; } else if (o->encoding == REDIS_ENCODING_INT) { value = (long)o->ptr; From dddfb15bc059c276e4e87fcc124c1ebec5f0edc3 Mon Sep 17 00:00:00 2001 From: yoav Date: Wed, 12 Dec 2012 15:59:22 +0200 Subject: [PATCH 0098/2500] Chunked loading of RDB to prevent redis from stalling reading very large keys. 
--- src/rdb.c | 23 ++++++++++++++--------- src/redis.c | 1 + src/redis.h | 1 + src/rio.c | 4 ++++ src/rio.h | 31 +++++++++++++++++++++++++------ 5 files changed, 45 insertions(+), 15 deletions(-) diff --git a/src/rdb.c b/src/rdb.c index c24f2d58f..c53c157c5 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1057,21 +1057,32 @@ void stopLoading(void) { server.loading = 0; } +/* Track loading progress in order to serve client's from time to time + and if needed calculate rdb checksum */ +void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { + if (server.rdb_checksum) + rioGenericUpdateChecksum(r, buf, len); + if (server.loading_process_events_interval_bytes && + (r->processed_bytes + len)/server.loading_process_events_interval_bytes > r->processed_bytes/server.loading_process_events_interval_bytes) { + loadingProgress(r->processed_bytes); + aeProcessEvents(server.el, AE_FILE_EVENTS|AE_DONT_WAIT); + } +} + int rdbLoad(char *filename) { uint32_t dbid; int type, rdbver; redisDb *db = server.db+0; char buf[1024]; long long expiretime, now = mstime(); - long loops = 0; FILE *fp; rio rdb; if ((fp = fopen(filename,"r")) == NULL) return REDIS_ERR; rioInitWithFile(&rdb,fp); - if (server.rdb_checksum) - rdb.update_cksum = rioGenericUpdateChecksum; + rdb.update_cksum = rdbLoadProgressCallback; + rdb.max_processing_chunk = server.loading_process_events_interval_bytes; if (rioRead(&rdb,buf,9) == 0) goto eoferr; buf[9] = '\0'; if (memcmp(buf,"REDIS",5) != 0) { @@ -1093,12 +1104,6 @@ int rdbLoad(char *filename) { robj *key, *val; expiretime = -1; - /* Serve the clients from time to time */ - if (!(loops++ % 1000)) { - loadingProgress(rioTell(&rdb)); - aeProcessEvents(server.el, AE_FILE_EVENTS|AE_DONT_WAIT); - } - /* Read type. 
*/ if ((type = rdbLoadType(&rdb)) == -1) goto eoferr; if (type == REDIS_RDB_OPCODE_EXPIRETIME) { diff --git a/src/redis.c b/src/redis.c index 714d9b65f..99955488e 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1290,6 +1290,7 @@ void initServerConfig() { server.lua_client = NULL; server.lua_timedout = 0; server.migrate_cached_sockets = dictCreate(&migrateCacheDictType,NULL); + server.loading_process_events_interval_bytes = (1024*1024*2); updateLRUClock(); resetServerSaveParams(); diff --git a/src/redis.h b/src/redis.h index b067fdf45..e78dc528d 100644 --- a/src/redis.h +++ b/src/redis.h @@ -753,6 +753,7 @@ struct redisServer { off_t loading_total_bytes; off_t loading_loaded_bytes; time_t loading_start_time; + off_t loading_process_events_interval_bytes; /* Fast pointers to often looked up command */ struct redisCommand *delCommand, *multiCommand, *lpushCommand, *lpopCommand, *rpopCommand; diff --git a/src/rio.c b/src/rio.c index b2f46a08b..405e789e6 100644 --- a/src/rio.c +++ b/src/rio.c @@ -108,6 +108,8 @@ static const rio rioBufferIO = { rioBufferTell, NULL, /* update_checksum */ 0, /* current checksum */ + 0, /* bytes read or written */ + 0, /* read/write chunk size */ { { NULL, 0 } } /* union for io-specific vars */ }; @@ -117,6 +119,8 @@ static const rio rioFileIO = { rioFileTell, NULL, /* update_checksum */ 0, /* current checksum */ + 0, /* bytes read or written */ + 0, /* read/write chunk size */ { { NULL, 0 } } /* union for io-specific vars */ }; diff --git a/src/rio.h b/src/rio.h index 3cab66af0..c28b47dc4 100644 --- a/src/rio.h +++ b/src/rio.h @@ -53,6 +53,12 @@ struct _rio { /* The current checksum */ uint64_t cksum; + /* number of bytes read or written */ + size_t processed_bytes; + + /* maximum simgle read or write chunk size */ + size_t max_processing_chunk; + /* Backend-specific vars. */ union { struct { @@ -74,16 +80,29 @@ typedef struct _rio rio; * if needed. 
*/ static inline size_t rioWrite(rio *r, const void *buf, size_t len) { - if (r->update_cksum) r->update_cksum(r,buf,len); - return r->write(r,buf,len); + while (len) { + size_t bytes_to_write = (r->max_processing_chunk && r->max_processing_chunk < len) ? r->max_processing_chunk : len; + if (r->update_cksum) r->update_cksum(r,buf,bytes_to_write); + if (r->write(r,buf,bytes_to_write) == 0) + return 0; + buf = (char*)buf + bytes_to_write; + len -= bytes_to_write; + r->processed_bytes += bytes_to_write; + } + return 1; } static inline size_t rioRead(rio *r, void *buf, size_t len) { - if (r->read(r,buf,len) == 1) { - if (r->update_cksum) r->update_cksum(r,buf,len); - return 1; + while (len) { + size_t bytes_to_read = (r->max_processing_chunk && r->max_processing_chunk < len) ? r->max_processing_chunk : len; + if (r->read(r,buf,bytes_to_read) == 0) + return 0; + if (r->update_cksum) r->update_cksum(r,buf,bytes_to_read); + buf = (char*)buf + bytes_to_read; + len -= bytes_to_read; + r->processed_bytes += bytes_to_read; } - return 0; + return 1; } static inline off_t rioTell(rio *r) { From 2973651071657828babbb9a3fc80741580f70c0e Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 16 Jul 2013 15:43:36 +0200 Subject: [PATCH 0099/2500] Fixed typo in rio.h, simgle -> single. --- src/rio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rio.h b/src/rio.h index c28b47dc4..2d12c6cc7 100644 --- a/src/rio.h +++ b/src/rio.h @@ -56,7 +56,7 @@ struct _rio { /* number of bytes read or written */ size_t processed_bytes; - /* maximum simgle read or write chunk size */ + /* maximum single read or write chunk size */ size_t max_processing_chunk; /* Backend-specific vars. */ From 7f4fa85a62560fac7e133547f61ea55910fac925 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 17 Jul 2013 15:04:22 +0200 Subject: [PATCH 0100/2500] addReplyDouble(): format infinite in a libc agnostic way. 
There are systems that when printing +/- infinte with printf-family functions will not use the usual "inf" "-inf", but different strings. Handle that explicitly. Fixes issue #930. --- src/networking.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/networking.c b/src/networking.c index 77c37af87..e2cbf9ad8 100644 --- a/src/networking.c +++ b/src/networking.c @@ -29,6 +29,7 @@ #include "redis.h" #include +#include static void setProtocolError(redisClient *c, int pos); @@ -415,9 +416,15 @@ void setDeferredMultiBulkLength(redisClient *c, void *node, long length) { void addReplyDouble(redisClient *c, double d) { char dbuf[128], sbuf[128]; int dlen, slen; - dlen = snprintf(dbuf,sizeof(dbuf),"%.17g",d); - slen = snprintf(sbuf,sizeof(sbuf),"$%d\r\n%s\r\n",dlen,dbuf); - addReplyString(c,sbuf,slen); + if (isinf(d)) { + /* Libc in odd systems (Hi Solaris!) will format infinite in a + * different way, so better to handle it in an explicit way. */ + addReplyBulkCString(c, d > 0 ? "inf" : "-inf"); + } else { + dlen = snprintf(dbuf,sizeof(dbuf),"%.17g",d); + slen = snprintf(sbuf,sizeof(sbuf),"$%d\r\n%s\r\n",dlen,dbuf); + addReplyString(c,sbuf,slen); + } } /* Add a long long as integer reply or bulk len / multi bulk count. From aa32f92338c9dde8a7cdd29c3cd76e8a4a1f42ba Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 5 Jun 2012 21:50:10 +0200 Subject: [PATCH 0101/2500] Introduction of a new string encoding: EMBSTR Previously two string encodings were used for string objects: 1) REDIS_ENCODING_RAW: a string object with obj->ptr pointing to an sds stirng. 2) REDIS_ENCODING_INT: a string object where the obj->ptr void pointer is casted to a long. This commit introduces a experimental new encoding called REDIS_ENCODING_EMBSTR that implements an object represented by an sds string that is not modifiable but allocated in the same memory chunk as the robj structure itself. 
The chunk looks like the following: +--------------+-----------+------------+--------+----+ | robj data... | robj->ptr | sds header | string | \0 | +--------------+-----+-----+------------+--------+----+ | ^ +-----------------------+ The robj->ptr points to the contiguous sds string data, so the object can be manipulated with the same functions used to manipulate plain string objects, however we need just one malloc and one free in order to allocate or release this kind of objects. Moreover it has better cache locality. This new allocation strategy should benefit both the memory usage and the performances. A performance gain between 60 and 70% was observed during micro-benchmarks, however there is more work to do to evaluate the performance impact and the memory usage behavior. --- src/aof.c | 2 +- src/bitops.c | 10 ++--- src/cluster.c | 2 +- src/config.c | 6 +-- src/debug.c | 6 +-- src/networking.c | 30 +++++++++---- src/object.c | 115 ++++++++++++++++++++++++++++++++++++++--------- src/rdb.c | 13 +++--- src/redis.c | 2 +- src/redis.h | 4 ++ src/slowlog.c | 2 +- src/sort.c | 2 +- src/t_hash.c | 2 +- src/t_list.c | 4 +- src/t_set.c | 9 +--- src/t_string.c | 4 +- src/t_zset.c | 15 ++++--- 17 files changed, 157 insertions(+), 71 deletions(-) diff --git a/src/aof.c b/src/aof.c index 89f17abab..2a29f72fb 100644 --- a/src/aof.c +++ b/src/aof.c @@ -591,7 +591,7 @@ int rioWriteBulkObject(rio *r, robj *obj) { * in a child process when this function is called). */ if (obj->encoding == REDIS_ENCODING_INT) { return rioWriteBulkLongLong(r,(long)obj->ptr); - } else if (obj->encoding == REDIS_ENCODING_RAW) { + } else if (sdsEncodedObject(obj)) { return rioWriteBulkString(r,obj->ptr,sdslen(obj->ptr)); } else { redisPanic("Unknown string encoding"); diff --git a/src/bitops.c b/src/bitops.c index c96a9e3c7..599d4dd8e 100644 --- a/src/bitops.c +++ b/src/bitops.c @@ -133,7 +133,7 @@ void setbitCommand(redisClient *c) { /* Create a copy when the object is shared or encoded.
*/ if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) { robj *decoded = getDecodedObject(o); - o = createStringObject(decoded->ptr, sdslen(decoded->ptr)); + o = createRawStringObject(decoded->ptr, sdslen(decoded->ptr)); decrRefCount(decoded); dbOverwrite(c->db,c->argv[1],o); } @@ -174,12 +174,12 @@ void getbitCommand(redisClient *c) { byte = bitoffset >> 3; bit = 7 - (bitoffset & 0x7); - if (o->encoding != REDIS_ENCODING_RAW) { - if (byte < (size_t)ll2string(llbuf,sizeof(llbuf),(long)o->ptr)) - bitval = llbuf[byte] & (1 << bit); - } else { + if (sdsEncodedObject(o)) { if (byte < sdslen(o->ptr)) bitval = ((uint8_t*)o->ptr)[byte] & (1 << bit); + } else { + if (byte < (size_t)ll2string(llbuf,sizeof(llbuf),(long)o->ptr)) + bitval = llbuf[byte] & (1 << bit); } addReply(c, bitval ? shared.cone : shared.czero); diff --git a/src/cluster.c b/src/cluster.c index 76ab67112..017989df3 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -2691,7 +2691,7 @@ try_again: rioWriteBulkString(&cmd,"RESTORE-ASKING",14)); else redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"RESTORE",7)); - redisAssertWithInfo(c,NULL,c->argv[3]->encoding == REDIS_ENCODING_RAW); + redisAssertWithInfo(c,NULL,sdsEncodedObject(c->argv[3])); redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,c->argv[3]->ptr,sdslen(c->argv[3]->ptr))); redisAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,ttl)); diff --git a/src/config.c b/src/config.c index 78b458dcc..3a14a6e94 100644 --- a/src/config.c +++ b/src/config.c @@ -550,8 +550,8 @@ void loadServerConfig(char *filename, char *options) { void configSetCommand(redisClient *c) { robj *o; long long ll; - redisAssertWithInfo(c,c->argv[2],c->argv[2]->encoding == REDIS_ENCODING_RAW); - redisAssertWithInfo(c,c->argv[2],c->argv[3]->encoding == REDIS_ENCODING_RAW); + redisAssertWithInfo(c,c->argv[2],sdsEncodedObject(c->argv[2])); + redisAssertWithInfo(c,c->argv[3],sdsEncodedObject(c->argv[3])); o = c->argv[3]; if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) { @@ -918,7 
+918,7 @@ void configGetCommand(redisClient *c) { char *pattern = o->ptr; char buf[128]; int matches = 0; - redisAssertWithInfo(c,o,o->encoding == REDIS_ENCODING_RAW); + redisAssertWithInfo(c,o,sdsEncodedObject(o)); /* String values */ config_get_string_field("dbfilename",server.rdb_filename); diff --git a/src/debug.c b/src/debug.c index 0a947e564..a0352b5dd 100644 --- a/src/debug.c +++ b/src/debug.c @@ -373,9 +373,7 @@ void _redisAssertPrintClientInfo(redisClient *c) { char buf[128]; char *arg; - if (c->argv[j]->type == REDIS_STRING && - c->argv[j]->encoding == REDIS_ENCODING_RAW) - { + if (c->argv[j]->type == REDIS_STRING && sdsEncodedObject(c->argv[j])) { arg = (char*) c->argv[j]->ptr; } else { snprintf(buf,sizeof(buf),"Object type: %d, encoding: %d", @@ -391,7 +389,7 @@ void redisLogObjectDebugInfo(robj *o) { redisLog(REDIS_WARNING,"Object type: %d", o->type); redisLog(REDIS_WARNING,"Object encoding: %d", o->encoding); redisLog(REDIS_WARNING,"Object refcount: %d", o->refcount); - if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_RAW) { + if (o->type == REDIS_STRING && sdsEncodedObject(o)) { redisLog(REDIS_WARNING,"Object raw string len: %zu", sdslen(o->ptr)); if (sdslen(o->ptr) < 4096) { sds repr = sdscatrepr(sdsempty(),o->ptr,sdslen(o->ptr)); diff --git a/src/networking.c b/src/networking.c index e2cbf9ad8..4066d69a8 100644 --- a/src/networking.c +++ b/src/networking.c @@ -184,12 +184,15 @@ void _addReplyObjectToList(redisClient *c, robj *o) { if (listLength(c->reply) == 0) { incrRefCount(o); listAddNodeTail(c->reply,o); - c->reply_bytes += zmalloc_size_sds(o->ptr); + c->reply_bytes += (o->encoding == REDIS_ENCODING_RAW) ? + zmalloc_size_sds(o->ptr) : + sdslen(o->ptr); } else { tail = listNodeValue(listLast(c->reply)); /* Append to this object when possible. 
*/ if (tail->ptr != NULL && + tail->encoding == REDIS_ENCODING_RAW && sdslen(tail->ptr)+sdslen(o->ptr) <= REDIS_REPLY_CHUNK_BYTES) { c->reply_bytes -= zmalloc_size_sds(tail->ptr); @@ -199,7 +202,9 @@ void _addReplyObjectToList(redisClient *c, robj *o) { } else { incrRefCount(o); listAddNodeTail(c->reply,o); - c->reply_bytes += zmalloc_size_sds(o->ptr); + c->reply_bytes += (o->encoding == REDIS_ENCODING_RAW) ? + zmalloc_size_sds(o->ptr) : + sdslen(o->ptr); } } asyncCloseClientOnOutputBufferLimitReached(c); @@ -222,7 +227,7 @@ void _addReplySdsToList(redisClient *c, sds s) { tail = listNodeValue(listLast(c->reply)); /* Append to this object when possible. */ - if (tail->ptr != NULL && + if (tail->ptr != NULL && tail->encoding == REDIS_ENCODING_RAW && sdslen(tail->ptr)+sdslen(s) <= REDIS_REPLY_CHUNK_BYTES) { c->reply_bytes -= zmalloc_size_sds(tail->ptr); @@ -247,12 +252,14 @@ void _addReplyStringToList(redisClient *c, char *s, size_t len) { robj *o = createStringObject(s,len); listAddNodeTail(c->reply,o); - c->reply_bytes += zmalloc_size_sds(o->ptr); + c->reply_bytes += (o->encoding == REDIS_ENCODING_RAW) ? + zmalloc_size_sds(o->ptr) : + sdslen(o->ptr); } else { tail = listNodeValue(listLast(c->reply)); /* Append to this object when possible. */ - if (tail->ptr != NULL && + if (tail->ptr != NULL && tail->encoding == REDIS_ENCODING_RAW && sdslen(tail->ptr)+len <= REDIS_REPLY_CHUNK_BYTES) { c->reply_bytes -= zmalloc_size_sds(tail->ptr); @@ -263,7 +270,9 @@ void _addReplyStringToList(redisClient *c, char *s, size_t len) { robj *o = createStringObject(s,len); listAddNodeTail(c->reply,o); - c->reply_bytes += zmalloc_size_sds(o->ptr); + c->reply_bytes += (o->encoding == REDIS_ENCODING_RAW) ? 
+ zmalloc_size_sds(o->ptr) : + sdslen(o->ptr); } } asyncCloseClientOnOutputBufferLimitReached(c); @@ -284,7 +293,7 @@ void addReply(redisClient *c, robj *obj) { * If the encoding is RAW and there is room in the static buffer * we'll be able to send the object to the client without * messing with its page. */ - if (obj->encoding == REDIS_ENCODING_RAW) { + if (sdsEncodedObject(obj)) { if (_addReplyToBuffer(c,obj->ptr,sdslen(obj->ptr)) != REDIS_OK) _addReplyObjectToList(c,obj); } else if (obj->encoding == REDIS_ENCODING_INT) { @@ -396,6 +405,7 @@ void setDeferredMultiBulkLength(redisClient *c, void *node, long length) { len = listNodeValue(ln); len->ptr = sdscatprintf(sdsempty(),"*%ld\r\n",length); + len->encoding = REDIS_ENCODING_RAW; /* in case it was an EMBSTR. */ c->reply_bytes += zmalloc_size_sds(len->ptr); if (ln->next != NULL) { next = listNodeValue(ln->next); @@ -468,7 +478,7 @@ void addReplyMultiBulkLen(redisClient *c, long length) { void addReplyBulkLen(redisClient *c, robj *obj) { size_t len; - if (obj->encoding == REDIS_ENCODING_RAW) { + if (sdsEncodedObject(obj)) { len = sdslen(obj->ptr); } else { long n = (long)obj->ptr; @@ -765,7 +775,9 @@ void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) { } else { o = listNodeValue(listFirst(c->reply)); objlen = sdslen(o->ptr); - objmem = zmalloc_size_sds(o->ptr); + objmem = (o->encoding == REDIS_ENCODING_RAW) ? + zmalloc_size_sds(o->ptr) : + sdslen(o->ptr); if (objlen == 0) { listDelNode(c->reply,listFirst(c->reply)); diff --git a/src/object.c b/src/object.c index 472d4a34b..9efb590f8 100644 --- a/src/object.c +++ b/src/object.c @@ -44,10 +44,47 @@ robj *createObject(int type, void *ptr) { return o; } -robj *createStringObject(char *ptr, size_t len) { +/* Create a string object with encoding REDIS_ENCODING_RAW, that is a plain + * string object where o->ptr points to a proper sds string. 
*/ +robj *createRawStringObject(char *ptr, size_t len) { return createObject(REDIS_STRING,sdsnewlen(ptr,len)); } +/* Create a string object with encoding REDIS_ENCODING_EMBSTR, that is + * an object where the sds string is actually an unmodifiable string + * allocated in the same chunk as the object itself. */ +robj *createEmbeddedStringObject(char *ptr, size_t len) { + robj *o = zmalloc(sizeof(robj)+sizeof(struct sdshdr)+len+1); + struct sdshdr *sh = (void*)(o+1); + + o->type = REDIS_STRING; + o->encoding = REDIS_ENCODING_EMBSTR; + o->ptr = sh+1; + o->refcount = 1; + o->lru = server.lruclock; + + sh->len = len; + sh->free = 0; + if (ptr) { + memcpy(sh->buf,ptr,len); + sh->buf[len] = '\0'; + } else { + memset(sh->buf,0,len+1); + } + return o; +} + +/* Create a string object with EMBSTR encoding if it is smaller than + * REIDS_ENCODING_EMBSTR_SIZE_LIMIT, otherwise the RAW encoding is + * used. */ +#define REDIS_ENCODING_EMBSTR_SIZE_LIMIT 32 +robj *createStringObject(char *ptr, size_t len) { + if (len <= REDIS_ENCODING_EMBSTR_SIZE_LIMIT) + return createEmbeddedStringObject(ptr,len); + else + return createRawStringObject(ptr,len); +} + robj *createStringObjectFromLongLong(long long value) { robj *o; if (value >= 0 && value < REDIS_SHARED_INTEGERS) { @@ -89,9 +126,33 @@ robj *createStringObjectFromLongDouble(long double value) { return createStringObject(buf,len); } +/* Duplicate a string object, with the guarantee that the returned object + * has the same encoding as the original one. + * + * This function also guarantees that duplicating a small integere object + * (or a string object that contains a representation of a small integer) + * will always result in a fresh object that is unshared (refcount == 1). + * + * The resulting object always has refcount set to 1. 
*/ robj *dupStringObject(robj *o) { - redisAssertWithInfo(NULL,o,o->encoding == REDIS_ENCODING_RAW); - return createStringObject(o->ptr,sdslen(o->ptr)); + robj *d; + + redisAssert(o->type == REDIS_STRING); + + switch(o->encoding) { + case REDIS_ENCODING_RAW: + return createRawStringObject(o->ptr,sdslen(o->ptr)); + case REDIS_ENCODING_EMBSTR: + return createEmbeddedStringObject(o->ptr,sdslen(o->ptr)); + case REDIS_ENCODING_INT: + d = createObject(REDIS_STRING, NULL); + d->encoding = REDIS_ENCODING_INT; + d->ptr = o->ptr; + return d; + default: + redisPanic("Wrong encoding."); + break; + } } robj *createListObject(void) { @@ -279,7 +340,7 @@ robj *tryObjectEncoding(robj *o) { long value; sds s = o->ptr; - if (o->encoding != REDIS_ENCODING_RAW) + if (o->encoding == REDIS_ENCODING_INT) return o; /* Already encoded */ /* It's not safe to encode shared objects: shared objects can be shared @@ -291,7 +352,17 @@ robj *tryObjectEncoding(robj *o) { redisAssertWithInfo(NULL,o,o->type == REDIS_STRING); /* Check if we can represent this string as a long integer */ - if (!string2l(s,sdslen(s),&value)) return o; + if (!string2l(s,sdslen(s),&value)) { + /* Integer encoding not possible. Check if we can use EMBSTR. */ + if (sdslen(s) <= REDIS_ENCODING_EMBSTR_SIZE_LIMIT) { + robj *emb = createEmbeddedStringObject(s,sdslen(s)); + decrRefCount(o); + return emb; + } else { + /* Otherwise return the original object. */ + return o; + } + } /* Ok, this object can be encoded... 
* @@ -305,8 +376,8 @@ robj *tryObjectEncoding(robj *o) { incrRefCount(shared.integers[value]); return shared.integers[value]; } else { + if (o->encoding == REDIS_ENCODING_RAW) sdsfree(o->ptr); o->encoding = REDIS_ENCODING_INT; - sdsfree(o->ptr); o->ptr = (void*) value; return o; } @@ -317,7 +388,7 @@ robj *tryObjectEncoding(robj *o) { robj *getDecodedObject(robj *o) { robj *dec; - if (o->encoding == REDIS_ENCODING_RAW) { + if (sdsEncodedObject(o)) { incrRefCount(o); return o; } @@ -350,21 +421,21 @@ int compareStringObjectsWithFlags(robj *a, robj *b, int flags) { int bothsds = 1; if (a == b) return 0; - if (a->encoding != REDIS_ENCODING_RAW) { + if (sdsEncodedObject(a)) { + astr = a->ptr; + alen = sdslen(astr); + } else { alen = ll2string(bufa,sizeof(bufa),(long) a->ptr); astr = bufa; bothsds = 0; - } else { - astr = a->ptr; - alen = sdslen(astr); } - if (b->encoding != REDIS_ENCODING_RAW) { + if (sdsEncodedObject(b)) { + bstr = b->ptr; + blen = sdslen(bstr); + } else { blen = ll2string(bufb,sizeof(bufb),(long) b->ptr); bstr = bufb; bothsds = 0; - } else { - bstr = b->ptr; - blen = sdslen(bstr); } if (flags & REDIS_COMPARE_COLL) { return strcoll(astr,bstr); @@ -393,7 +464,10 @@ int collateStringObjects(robj *a, robj *b) { * this function is faster then checking for (compareStringObject(a,b) == 0) * because it can perform some more optimization. */ int equalStringObjects(robj *a, robj *b) { - if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){ + if (a->encoding == REDIS_ENCODING_INT && + b->encoding == REDIS_ENCODING_INT){ + /* If both strings are integer encoded just check if the stored + * long is the same. 
*/ return a->ptr == b->ptr; } else { return compareStringObjects(a,b) == 0; @@ -402,7 +476,7 @@ int equalStringObjects(robj *a, robj *b) { size_t stringObjectLen(robj *o) { redisAssertWithInfo(NULL,o,o->type == REDIS_STRING); - if (o->encoding == REDIS_ENCODING_RAW) { + if (sdsEncodedObject(o)) { return sdslen(o->ptr); } else { char buf[32]; @@ -419,7 +493,7 @@ int getDoubleFromObject(robj *o, double *target) { value = 0; } else { redisAssertWithInfo(NULL,o,o->type == REDIS_STRING); - if (o->encoding == REDIS_ENCODING_RAW) { + if (sdsEncodedObject(o)) { errno = 0; value = strtod(o->ptr, &eptr); if (isspace(((char*)o->ptr)[0]) || @@ -461,7 +535,7 @@ int getLongDoubleFromObject(robj *o, long double *target) { value = 0; } else { redisAssertWithInfo(NULL,o,o->type == REDIS_STRING); - if (o->encoding == REDIS_ENCODING_RAW) { + if (sdsEncodedObject(o)) { errno = 0; value = strtold(o->ptr, &eptr); if (isspace(((char*)o->ptr)[0]) || eptr[0] != '\0' || @@ -499,7 +573,7 @@ int getLongLongFromObject(robj *o, long long *target) { value = 0; } else { redisAssertWithInfo(NULL,o,o->type == REDIS_STRING); - if (o->encoding == REDIS_ENCODING_RAW) { + if (sdsEncodedObject(o)) { errno = 0; value = strtoll(o->ptr, &eptr, 10); if (isspace(((char*)o->ptr)[0]) || eptr[0] != '\0' || @@ -554,6 +628,7 @@ char *strEncoding(int encoding) { case REDIS_ENCODING_ZIPLIST: return "ziplist"; case REDIS_ENCODING_INTSET: return "intset"; case REDIS_ENCODING_SKIPLIST: return "skiplist"; + case REDIS_ENCODING_EMBSTR: return "embstr"; default: return "unknown"; } } diff --git a/src/rdb.c b/src/rdb.c index c53c157c5..1c2b0ed0d 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -325,7 +325,7 @@ int rdbSaveStringObject(rio *rdb, robj *obj) { if (obj->encoding == REDIS_ENCODING_INT) { return rdbSaveLongLongAsStringObject(rdb,(long)obj->ptr); } else { - redisAssertWithInfo(NULL,obj,obj->encoding == REDIS_ENCODING_RAW); + redisAssertWithInfo(NULL,obj,sdsEncodedObject(obj)); return 
rdbSaveRawString(rdb,obj->ptr,sdslen(obj->ptr)); } } @@ -795,7 +795,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { /* If we are using a ziplist and the value is too big, convert * the object to a real list. */ if (o->encoding == REDIS_ENCODING_ZIPLIST && - ele->encoding == REDIS_ENCODING_RAW && + sdsEncodedObject(ele) && sdslen(ele->ptr) > server.list_max_ziplist_value) listTypeConvert(o,REDIS_ENCODING_LINKEDLIST); @@ -869,9 +869,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { if (rdbLoadDoubleValue(rdb,&score) == -1) return NULL; /* Don't care about integer-encoded strings. */ - if (ele->encoding == REDIS_ENCODING_RAW && - sdslen(ele->ptr) > maxelelen) - maxelelen = sdslen(ele->ptr); + if (sdsEncodedObject(ele) && sdslen(ele->ptr) > maxelelen) + maxelelen = sdslen(ele->ptr); znode = zslInsert(zs->zsl,score,ele); dictAdd(zs->dict,ele,&znode->score); @@ -903,10 +902,10 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { /* Load raw strings */ field = rdbLoadStringObject(rdb); if (field == NULL) return NULL; - redisAssert(field->encoding == REDIS_ENCODING_RAW); + redisAssert(sdsEncodedObject(field)); value = rdbLoadStringObject(rdb); if (value == NULL) return NULL; - redisAssert(field->encoding == REDIS_ENCODING_RAW); + redisAssert(sdsEncodedObject(value)); /* Add pair to ziplist */ o->ptr = ziplistPush(o->ptr, field->ptr, sdslen(field->ptr), ZIPLIST_TAIL); diff --git a/src/redis.c b/src/redis.c index 99955488e..8a833d509 100644 --- a/src/redis.c +++ b/src/redis.c @@ -465,7 +465,7 @@ int dictEncObjKeyCompare(void *privdata, const void *key1, unsigned int dictEncObjHash(const void *key) { robj *o = (robj*) key; - if (o->encoding == REDIS_ENCODING_RAW) { + if (sdsEncodedObject(o)) { return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr)); } else { if (o->encoding == REDIS_ENCODING_INT) { diff --git a/src/redis.h b/src/redis.h index e78dc528d..57e39ad2b 100644 --- a/src/redis.h +++ b/src/redis.h @@ -174,6 +174,7 @@ #define REDIS_ENCODING_ZIPLIST 5 /* Encoded as ziplist 
*/ #define REDIS_ENCODING_INTSET 6 /* Encoded as intset */ #define REDIS_ENCODING_SKIPLIST 7 /* Encoded as skiplist */ +#define REDIS_ENCODING_EMBSTR 8 /* Embedded sds string encoding */ /* Defines related to the dump file format. To store 32 bits lengths for short * keys requires a lot of space, so we check the most significant 2 bits of @@ -1138,6 +1139,8 @@ void freeZsetObject(robj *o); void freeHashObject(robj *o); robj *createObject(int type, void *ptr); robj *createStringObject(char *ptr, size_t len); +robj *createRawStringObject(char *ptr, size_t len); +robj *createEmbeddedStringObject(char *ptr, size_t len); robj *dupStringObject(robj *o); int isObjectRepresentableAsLongLong(robj *o, long long *llongval); robj *tryObjectEncoding(robj *o); @@ -1164,6 +1167,7 @@ int compareStringObjects(robj *a, robj *b); int collateStringObjects(robj *a, robj *b); int equalStringObjects(robj *a, robj *b); unsigned long estimateObjectIdleTime(robj *o); +#define sdsEncodedObject(objptr) (objptr->encoding == REDIS_ENCODING_RAW || objptr->encoding == REDIS_ENCODING_EMBSTR) /* Synchronous I/O with timeout */ ssize_t syncWrite(int fd, char *ptr, ssize_t size, long long timeout); diff --git a/src/slowlog.c b/src/slowlog.c index fdc18e579..ff6ccf472 100644 --- a/src/slowlog.c +++ b/src/slowlog.c @@ -63,7 +63,7 @@ slowlogEntry *slowlogCreateEntry(robj **argv, int argc, long long duration) { } else { /* Trim too long strings as well... 
*/ if (argv[j]->type == REDIS_STRING && - argv[j]->encoding == REDIS_ENCODING_RAW && + sdsEncodedObject(argv[j]) && sdslen(argv[j]->ptr) > SLOWLOG_ENTRY_MAX_STRING) { sds s = sdsnewlen(argv[j]->ptr, SLOWLOG_ENTRY_MAX_STRING); diff --git a/src/sort.c b/src/sort.c index a4b062645..ebdf5469c 100644 --- a/src/sort.c +++ b/src/sort.c @@ -411,7 +411,7 @@ void sortCommand(redisClient *c) { if (alpha) { if (sortby) vector[j].u.cmpobj = getDecodedObject(byval); } else { - if (byval->encoding == REDIS_ENCODING_RAW) { + if (sdsEncodedObject(byval)) { char *eptr; vector[j].u.score = strtod(byval->ptr,&eptr); diff --git a/src/t_hash.c b/src/t_hash.c index 9484e531b..3b87b92ca 100644 --- a/src/t_hash.c +++ b/src/t_hash.c @@ -43,7 +43,7 @@ void hashTypeTryConversion(robj *o, robj **argv, int start, int end) { if (o->encoding != REDIS_ENCODING_ZIPLIST) return; for (i = start; i <= end; i++) { - if (argv[i]->encoding == REDIS_ENCODING_RAW && + if (sdsEncodedObject(argv[i]) && sdslen(argv[i]->ptr) > server.hash_max_ziplist_value) { hashTypeConvert(o, REDIS_ENCODING_HT); diff --git a/src/t_list.c b/src/t_list.c index 0413dc69b..a8ce9b976 100644 --- a/src/t_list.c +++ b/src/t_list.c @@ -40,7 +40,7 @@ void signalListAsReady(redisClient *c, robj *key); * objects are never too long. 
*/ void listTypeTryConversion(robj *subject, robj *value) { if (subject->encoding != REDIS_ENCODING_ZIPLIST) return; - if (value->encoding == REDIS_ENCODING_RAW && + if (sdsEncodedObject(value) && sdslen(value->ptr) > server.list_max_ziplist_value) listTypeConvert(subject,REDIS_ENCODING_LINKEDLIST); } @@ -234,7 +234,7 @@ void listTypeInsert(listTypeEntry *entry, robj *value, int where) { int listTypeEqual(listTypeEntry *entry, robj *o) { listTypeIterator *li = entry->li; if (li->encoding == REDIS_ENCODING_ZIPLIST) { - redisAssertWithInfo(NULL,o,o->encoding == REDIS_ENCODING_RAW); + redisAssertWithInfo(NULL,o,sdsEncodedObject(o)); return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr)); } else if (li->encoding == REDIS_ENCODING_LINKEDLIST) { return equalStringObjects(o,listNodeValue(entry->ln)); diff --git a/src/t_set.c b/src/t_set.c index a522cd88a..ab65e23f3 100644 --- a/src/t_set.c +++ b/src/t_set.c @@ -495,11 +495,8 @@ void srandmemberWithCountCommand(redisClient *c) { if (encoding == REDIS_ENCODING_INTSET) { retval = dictAdd(d,createStringObjectFromLongLong(llele),NULL); - } else if (ele->encoding == REDIS_ENCODING_RAW) { + } else { retval = dictAdd(d,dupStringObject(ele),NULL); - } else if (ele->encoding == REDIS_ENCODING_INT) { - retval = dictAdd(d, - createStringObjectFromLongLong((long)ele->ptr),NULL); } redisAssert(retval == DICT_OK); } @@ -527,10 +524,8 @@ void srandmemberWithCountCommand(redisClient *c) { encoding = setTypeRandomElement(set,&ele,&llele); if (encoding == REDIS_ENCODING_INTSET) { ele = createStringObjectFromLongLong(llele); - } else if (ele->encoding == REDIS_ENCODING_RAW) { + } else { ele = dupStringObject(ele); - } else if (ele->encoding == REDIS_ENCODING_INT) { - ele = createStringObjectFromLongLong((long)ele->ptr); } /* Try to add the object to the dictionary. 
If it already exists * free it, otherwise increment the number of objects we have diff --git a/src/t_string.c b/src/t_string.c index cbd069d3c..3645ae7c5 100644 --- a/src/t_string.c +++ b/src/t_string.c @@ -217,7 +217,7 @@ void setrangeCommand(redisClient *c) { /* Create a copy when the object is shared or encoded. */ if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) { robj *decoded = getDecodedObject(o); - o = createStringObject(decoded->ptr, sdslen(decoded->ptr)); + o = createRawStringObject(decoded->ptr, sdslen(decoded->ptr)); decrRefCount(decoded); dbOverwrite(c->db,c->argv[1],o); } @@ -436,7 +436,7 @@ void appendCommand(redisClient *c) { /* If the object is shared or encoded, we have to make a copy */ if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) { robj *decoded = getDecodedObject(o); - o = createStringObject(decoded->ptr, sdslen(decoded->ptr)); + o = createRawStringObject(decoded->ptr, sdslen(decoded->ptr)); decrRefCount(decoded); dbOverwrite(c->db,c->argv[1],o); } diff --git a/src/t_zset.c b/src/t_zset.c index 8ef9c5376..291a7eacf 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -646,7 +646,7 @@ unsigned char *zzlInsertAt(unsigned char *zl, unsigned char *eptr, robj *ele, do int scorelen; size_t offset; - redisAssertWithInfo(NULL,ele,ele->encoding == REDIS_ENCODING_RAW); + redisAssertWithInfo(NULL,ele,sdsEncodedObject(ele)); scorelen = d2string(scorebuf,sizeof(scorebuf),score); if (eptr == NULL) { zl = ziplistPush(zl,ele->ptr,sdslen(ele->ptr),ZIPLIST_TAIL); @@ -1363,7 +1363,7 @@ int zuiLongLongFromValue(zsetopval *val) { if (val->ele->encoding == REDIS_ENCODING_INT) { val->ell = (long)val->ele->ptr; val->flags |= OPVAL_VALID_LL; - } else if (val->ele->encoding == REDIS_ENCODING_RAW) { + } else if (sdsEncodedObject(val->ele)) { if (string2ll(val->ele->ptr,sdslen(val->ele->ptr),&val->ell)) val->flags |= OPVAL_VALID_LL; } else { @@ -1398,7 +1398,7 @@ int zuiBufferFromValue(zsetopval *val) { if (val->ele->encoding == REDIS_ENCODING_INT) { 
val->elen = ll2string((char*)val->_buf,sizeof(val->_buf),(long)val->ele->ptr); val->estr = val->_buf; - } else if (val->ele->encoding == REDIS_ENCODING_RAW) { + } else if (sdsEncodedObject(val->ele)) { val->elen = sdslen(val->ele->ptr); val->estr = val->ele->ptr; } else { @@ -1624,9 +1624,10 @@ void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) { dictAdd(dstzset->dict,tmp,&znode->score); incrRefCount(tmp); /* added to dictionary */ - if (tmp->encoding == REDIS_ENCODING_RAW) + if (sdsEncodedObject(tmp)) { if (sdslen(tmp->ptr) > maxelelen) maxelelen = sdslen(tmp->ptr); + } } } } @@ -1666,9 +1667,10 @@ void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) { dictAdd(dstzset->dict,tmp,&znode->score); incrRefCount(zval.ele); /* added to dictionary */ - if (tmp->encoding == REDIS_ENCODING_RAW) + if (sdsEncodedObject(tmp)) { if (sdslen(tmp->ptr) > maxelelen) maxelelen = sdslen(tmp->ptr); + } } } } else { @@ -2146,7 +2148,8 @@ void zrankGenericCommand(redisClient *c, int reverse) { checkType(c,zobj,REDIS_ZSET)) return; llen = zsetLength(zobj); - redisAssertWithInfo(c,ele,ele->encoding == REDIS_ENCODING_RAW); + redisAssertWithInfo(c,ele,sdsEncodedObject(ele)); + if (zobj->encoding == REDIS_ENCODING_ZIPLIST) { unsigned char *zl = zobj->ptr; unsigned char *eptr, *sptr; From 06a0f621d66f1420d7ad2bcd2d5bdd1532a61e21 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 22 Jul 2013 10:36:27 +0200 Subject: [PATCH 0102/2500] Fix replicationFeedSlaves() to use sdsEncodedObject() macro. 
--- src/replication.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/replication.c b/src/replication.c index 8b4c1b7c5..3e8964ddd 100644 --- a/src/replication.c +++ b/src/replication.c @@ -200,10 +200,11 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { char *objptr; if (argv[j]->encoding != REDIS_ENCODING_RAW && - argv[j]->encoding != REDIS_ENCODING_INT) { + argv[j]->encoding != REDIS_ENCODING_INT && + argv[j]->encoding != REDIS_ENCODING_EMBSTR) { redisPanic("Unexpected encoding"); } - if (argv[j]->encoding == REDIS_ENCODING_RAW) { + if (sdsEncodedObject(argv[j])) { objlen = sdslen(argv[j]->ptr); objptr = argv[j]->ptr; } else { From 9ce2c7ba73bab49676d4cd5de2076ec0d415251a Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 22 Jul 2013 11:05:55 +0200 Subject: [PATCH 0103/2500] Fixed a possible bug in client->reply_bytes computation. --- src/networking.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/networking.c b/src/networking.c index 4066d69a8..a9f016862 100644 --- a/src/networking.c +++ b/src/networking.c @@ -781,6 +781,7 @@ void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) { if (objlen == 0) { listDelNode(c->reply,listFirst(c->reply)); + c->reply_bytes -= objmem; continue; } From 0f8f91bb339fadad911488529da5303d50d1f864 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 22 Jul 2013 23:27:56 +0200 Subject: [PATCH 0104/2500] Fix setDeferredMultiBulkLength() c->reply_bytes handling with EMBSTR This function missed proper handling of reply_bytes when gluing to the previous object was used. The issue was introduced with the EMBSTR new string object encoding. This fixes issue #1208. 
--- src/networking.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/networking.c b/src/networking.c index a9f016862..847e80c16 100644 --- a/src/networking.c +++ b/src/networking.c @@ -413,7 +413,10 @@ void setDeferredMultiBulkLength(redisClient *c, void *node, long length) { /* Only glue when the next node is non-NULL (an sds in this case) */ if (next->ptr != NULL) { c->reply_bytes -= zmalloc_size_sds(len->ptr); - c->reply_bytes -= zmalloc_size_sds(next->ptr); + if (next->encoding == REDIS_ENCODING_RAW) + c->reply_bytes -= zmalloc_size_sds(next->ptr); + else + c->reply_bytes -= sdslen(next->ptr); len->ptr = sdscatlen(len->ptr,next->ptr,sdslen(next->ptr)); c->reply_bytes += zmalloc_size_sds(len->ptr); listDelNode(c->reply,ln->next); From 995cb581a960fe1e9daa6fd7b24813f9abd05785 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 22 Jul 2013 23:40:03 +0200 Subject: [PATCH 0105/2500] Test: regression test for issue #1208. --- tests/unit/basic.tcl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/unit/basic.tcl b/tests/unit/basic.tcl index c766b3de9..1f46ba666 100644 --- a/tests/unit/basic.tcl +++ b/tests/unit/basic.tcl @@ -754,4 +754,11 @@ start_server {tags {"basic"}} { set ttl [r ttl foo] assert {$ttl <= 10 && $ttl > 5} } + + test {KEYS * two times with long key, Github issue #1208} { + r flushdb + r set dlskeriewrioeuwqoirueioqwrueoqwrueqw test + r keys * + r keys * + } {dlskeriewrioeuwqoirueioqwrueoqwrueqw} } From be2e9f4f4784d8cb6224f75a6c31587f807ce510 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 23 Jul 2013 11:50:17 +0200 Subject: [PATCH 0106/2500] getStringObjectSdsUsedMemory() function added. Now that EMBSTR encoding exists we calculate the amount of memory used by the SDS part of a Redis String object in two different ways: 1) For raw string object, the size of the allocation is considered. 2) For embstr objects, the length of the string itself is used. The new function takes care of this logic. 
--- src/networking.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/src/networking.c b/src/networking.c index 847e80c16..54dfc025e 100644 --- a/src/networking.c +++ b/src/networking.c @@ -41,6 +41,17 @@ size_t zmalloc_size_sds(sds s) { return zmalloc_size(s-sizeof(struct sdshdr)); } +/* Return the amount of memory used by the sds string at object->ptr + * for a string object. */ +size_t getStringObjectSdsUsedMemory(robj *o) { + redisAssertWithInfo(NULL,o,o->type == REDIS_STRING); + switch(o->encoding) { + case REDIS_ENCODING_RAW: return zmalloc_size_sds(o->ptr); + case REDIS_ENCODING_EMBSTR: return sdslen(o->ptr); + default: return 0; /* Just integer encoding for now. */ + } +} + void *dupClientReplyValue(void *o) { incrRefCount((robj*)o); return o; @@ -184,9 +195,7 @@ void _addReplyObjectToList(redisClient *c, robj *o) { if (listLength(c->reply) == 0) { incrRefCount(o); listAddNodeTail(c->reply,o); - c->reply_bytes += (o->encoding == REDIS_ENCODING_RAW) ? - zmalloc_size_sds(o->ptr) : - sdslen(o->ptr); + c->reply_bytes += getStringObjectSdsUsedMemory(o); } else { tail = listNodeValue(listLast(c->reply)); @@ -202,9 +211,7 @@ void _addReplyObjectToList(redisClient *c, robj *o) { } else { incrRefCount(o); listAddNodeTail(c->reply,o); - c->reply_bytes += (o->encoding == REDIS_ENCODING_RAW) ? - zmalloc_size_sds(o->ptr) : - sdslen(o->ptr); + c->reply_bytes += getStringObjectSdsUsedMemory(o); } } asyncCloseClientOnOutputBufferLimitReached(c); @@ -252,9 +259,7 @@ void _addReplyStringToList(redisClient *c, char *s, size_t len) { robj *o = createStringObject(s,len); listAddNodeTail(c->reply,o); - c->reply_bytes += (o->encoding == REDIS_ENCODING_RAW) ? 
- zmalloc_size_sds(o->ptr) : - sdslen(o->ptr); + c->reply_bytes += getStringObjectSdsUsedMemory(o); } else { tail = listNodeValue(listLast(c->reply)); @@ -270,9 +275,7 @@ void _addReplyStringToList(redisClient *c, char *s, size_t len) { robj *o = createStringObject(s,len); listAddNodeTail(c->reply,o); - c->reply_bytes += (o->encoding == REDIS_ENCODING_RAW) ? - zmalloc_size_sds(o->ptr) : - sdslen(o->ptr); + c->reply_bytes += getStringObjectSdsUsedMemory(o); } } asyncCloseClientOnOutputBufferLimitReached(c); @@ -413,10 +416,7 @@ void setDeferredMultiBulkLength(redisClient *c, void *node, long length) { /* Only glue when the next node is non-NULL (an sds in this case) */ if (next->ptr != NULL) { c->reply_bytes -= zmalloc_size_sds(len->ptr); - if (next->encoding == REDIS_ENCODING_RAW) - c->reply_bytes -= zmalloc_size_sds(next->ptr); - else - c->reply_bytes -= sdslen(next->ptr); + c->reply_bytes -= getStringObjectSdsUsedMemory(next); len->ptr = sdscatlen(len->ptr,next->ptr,sdslen(next->ptr)); c->reply_bytes += zmalloc_size_sds(len->ptr); listDelNode(c->reply,ln->next); @@ -778,9 +778,7 @@ void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) { } else { o = listNodeValue(listFirst(c->reply)); objlen = sdslen(o->ptr); - objmem = (o->encoding == REDIS_ENCODING_RAW) ? - zmalloc_size_sds(o->ptr) : - sdslen(o->ptr); + objmem = getStringObjectSdsUsedMemory(o); if (objlen == 0) { listDelNode(c->reply,listFirst(c->reply)); From 02a22c9be18f8e7772f010b75ad6011fa18550de Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 23 Jul 2013 16:35:55 +0200 Subject: [PATCH 0107/2500] Every function inside sds.c is now commented. 
--- src/sds.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 145 insertions(+), 7 deletions(-) diff --git a/src/sds.c b/src/sds.c index aa45fc4f5..b63f44d94 100644 --- a/src/sds.c +++ b/src/sds.c @@ -36,6 +36,18 @@ #include "sds.h" #include "zmalloc.h" +/* Create a new sds string with the content specified by the 'init' pointer + * and 'initlen'. + * If NULL is used for 'init' the string is initialized with zero bytes. + * + * The string is always null-termined (all the sds strings are, always) so + * even if you create an sds string with: + * + * mystring = sdsnewlen("abc",3"); + * + * You can print the string with printf() as there is an implicit \0 at the + * end of the string. However the string is binary safe and can contain + * \0 characters in the middle, as the length is stored in the sds header. */ sds sdsnewlen(const void *init, size_t initlen) { struct sdshdr *sh; @@ -53,24 +65,43 @@ sds sdsnewlen(const void *init, size_t initlen) { return (char*)sh->buf; } +/* Create an empty (zero length) sds string. Even in this case the string + * always has an implicit null term. */ sds sdsempty(void) { return sdsnewlen("",0); } +/* Create a new sds string starting from a null termined C string. */ sds sdsnew(const char *init) { size_t initlen = (init == NULL) ? 0 : strlen(init); return sdsnewlen(init, initlen); } +/* Duplicate an sds string. */ sds sdsdup(const sds s) { return sdsnewlen(s, sdslen(s)); } +/* Free an sds string. No operation is performed if 's' is NULL. */ void sdsfree(sds s) { if (s == NULL) return; zfree(s-sizeof(struct sdshdr)); } +/* Set the sds string length to the length as obtained with strlen(), so + * considering as content only up to the first null term character. 
+ * + * This function is useful when the sds string is hacked manually in some + * way, like in the following example: + * + * s = sdsnew("foobar"); + * s[2] = '\0'; + * sdsupdatelen(s); + * printf("%d\n", sdslen(s)); + * + * The output will be "2", but if we comment out the call to sdsupdatelen() + * the output will be "6" as the string was modified but the logical length + * remains 6 bytes. */ void sdsupdatelen(sds s) { struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); int reallen = strlen(s); @@ -78,6 +109,10 @@ void sdsupdatelen(sds s) { sh->len = reallen; } +/* Modify an sds string on-place to make it empty (zero length). + * However all the existing buffer is not discarded but set as free space + * so that next append operations will not require allocations up to the + * number of bytes previously available. */ void sdsclear(sds s) { struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); sh->free += sh->len; @@ -89,7 +124,7 @@ void sdsclear(sds s) { * is sure that after calling this function can overwrite up to addlen * bytes after the end of the string, plus one more byte for nul term. * - * Note: this does not change the *size* of the sds string as returned + * Note: this does not change the *length* of the sds string as returned * by sdslen(), but only the free buffer space we have. */ sds sdsMakeRoomFor(sds s, size_t addlen) { struct sdshdr *sh, *newsh; @@ -113,7 +148,10 @@ sds sdsMakeRoomFor(sds s, size_t addlen) { /* Reallocate the sds string so that it has no free space at the end. The * contained string remains not altered, but next concatenation operations - * will require a reallocation. */ + * will require a reallocation. + * + * After the call, the passed sds string is no longer valid and all the + * references must be substituted with the new pointer returned by the call. 
*/ sds sdsRemoveFreeSpace(sds s) { struct sdshdr *sh; @@ -123,6 +161,13 @@ sds sdsRemoveFreeSpace(sds s) { return sh->buf; } +/* Return the total size of the allocation of the specifed sds string, + * including: + * 1) The sds header before the pointer. + * 2) The string. + * 3) The free buffer at the end if any. + * 4) The implicit null term. + */ size_t sdsAllocSize(sds s) { struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); @@ -130,7 +175,7 @@ size_t sdsAllocSize(sds s) { } /* Increment the sds length and decrements the left free space at the - * end of the string accordingly to 'incr'. Also set the null term + * end of the string according to 'incr'. Also set the null term * in the new end of the string. * * This function is used in order to fix the string length after the @@ -140,15 +185,17 @@ size_t sdsAllocSize(sds s) { * Note: it is possible to use a negative increment in order to * right-trim the string. * + * Usage example: + * * Using sdsIncrLen() and sdsMakeRoomFor() it is possible to mount the - * following schema to cat bytes coming from the kernel to the end of an - * sds string new things without copying into an intermediate buffer: + * following schema, to cat bytes coming from the kernel to the end of an + * sds string without copying into an intermediate buffer: * * oldlen = sdslen(s); * s = sdsMakeRoomFor(s, BUFFER_SIZE); * nread = read(fd, s+oldlen, BUFFER_SIZE); * ... check for nread <= 0 and handle it ... - * sdsIncrLen(s, nhread); + * sdsIncrLen(s, nread); */ void sdsIncrLen(sds s, int incr) { struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); @@ -161,7 +208,10 @@ void sdsIncrLen(sds s, int incr) { } /* Grow the sds to have the specified length. Bytes that were not part of - * the original length of the sds will be set to zero. */ + * the original length of the sds will be set to zero. + * + * if the specified length is smaller than the current length, no operation + * is performed. 
*/ sds sdsgrowzero(sds s, size_t len) { struct sdshdr *sh = (void*)(s-(sizeof(struct sdshdr))); size_t totlen, curlen = sh->len; @@ -179,6 +229,11 @@ sds sdsgrowzero(sds s, size_t len) { return s; } +/* Append the specified binary-safe string pointed by 't' of 'len' bytes to the + * end of the specified sds string 's'. + * + * After the call, the passed sds string is no longer valid and all the + * references must be substituted with the new pointer returned by the call. */ sds sdscatlen(sds s, const void *t, size_t len) { struct sdshdr *sh; size_t curlen = sdslen(s); @@ -193,14 +248,24 @@ sds sdscatlen(sds s, const void *t, size_t len) { return s; } +/* Append the specified null termianted C string to the sds string 's'. + * + * After the call, the passed sds string is no longer valid and all the + * references must be substituted with the new pointer returned by the call. */ sds sdscat(sds s, const char *t) { return sdscatlen(s, t, strlen(t)); } +/* Append the specified sds 't' to the existing sds 's'. + * + * After the call, the modified sds string is no longer valid and all the + * references must be substituted with the new pointer returned by the call. */ sds sdscatsds(sds s, const sds t) { return sdscatlen(s, t, sdslen(t)); } +/* Destructively modify the sds string 's' to hold the specified binary + * safe string pointed by 't' of length 'len' bytes. */ sds sdscpylen(sds s, const char *t, size_t len) { struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); size_t totlen = sh->free+sh->len; @@ -218,10 +283,13 @@ sds sdscpylen(sds s, const char *t, size_t len) { return s; } +/* Like sdscpylen() but 't' must be a null-termined string so that the length + * of the string is obtained with strlen(). */ sds sdscpy(sds s, const char *t) { return sdscpylen(s, t, strlen(t)); } +/* Like sdscatpritf() but gets va_list instead of being variadic. 
*/ sds sdscatvprintf(sds s, const char *fmt, va_list ap) { va_list cpy; char *buf, *t; @@ -245,6 +313,22 @@ sds sdscatvprintf(sds s, const char *fmt, va_list ap) { return t; } +/* Append to the sds string 's' a string obtained using printf-alike format + * specifier. + * + * After the call, the modified sds string is no longer valid and all the + * references must be substituted with the new pointer returned by the call. + * + * Example: + * + * s = sdsempty("Sum is: "); + * s = sdscatprintf(s,"%d+%d = %d",a,b,a+b). + * + * Often you need to create a string from scratch with the printf-alike + * format. When this is the need, just use sdsempty() as the target string: + * + * s = sdscatprintf(sdsempty(), "... your format ...", args); + */ sds sdscatprintf(sds s, const char *fmt, ...) { va_list ap; char *t; @@ -254,6 +338,20 @@ sds sdscatprintf(sds s, const char *fmt, ...) { return t; } +/* Remove the part of the string from left and from right composed just of + * contiguous characters found in 'cset', that is a null terminted C string. + * + * After the call, the modified sds string is no longer valid and all the + * references must be substituted with the new pointer returned by the call. + * + * Example: + * + * s = sdsnew("AA...AA.a.aa.aHelloWorld :::"); + * s = sdstrim(s,"A. :"); + * printf("%s\n", s); + * + * Output will be just "Hello World". + */ sds sdstrim(sds s, const char *cset) { struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); char *start, *end, *sp, *ep; @@ -271,6 +369,22 @@ sds sdstrim(sds s, const char *cset) { return s; } +/* Turn the string into a smaller (or equal) string containing only the + * substring specified by the 'start' and 'end' indexes. + * + * start and end can be negative, where -1 means the last character of the + * string, -2 the penultimate character, and so forth. + * + * The interval is inclusive, so the start and end characters will be part + * of the resulting string. + * + * The string is modified in-place. 
+ * + * Example: + * + * s = sdsnew("Hello World"); + * sdstrim(s,1,-1); => "ello Worl" + */ sds sdsrange(sds s, int start, int end) { struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); size_t newlen, len = sdslen(s); @@ -302,18 +416,31 @@ sds sdsrange(sds s, int start, int end) { return s; } +/* Apply tolower() to every character of the sds string 's'. */ void sdstolower(sds s) { int len = sdslen(s), j; for (j = 0; j < len; j++) s[j] = tolower(s[j]); } +/* Apply toupper() to every character of the sds string 's'. */ void sdstoupper(sds s) { int len = sdslen(s), j; for (j = 0; j < len; j++) s[j] = toupper(s[j]); } +/* Compare two sds strings s1 and s2 with memcmp(). + * + * Return value: + * + * 1 if s1 > s2. + * -1 if s1 < s2. + * 0 if s1 and s2 are exactly the same binary string. + * + * If two strings share exactly the same prefix, but one of the two has + * additional characters, the longer string is considered to be greater than + * the smaller one. */ int sdscmp(const sds s1, const sds s2) { size_t l1, l2, minlen; int cmp; @@ -391,6 +518,7 @@ cleanup: } } +/* Free the result returned by sdssplitlen(), or do nothing if 'tokens' is NULL. */ void sdsfreesplitres(sds *tokens, int count) { if (!tokens) return; while(count--) @@ -398,6 +526,10 @@ void sdsfreesplitres(sds *tokens, int count) { zfree(tokens); } +/* Create an sds string from a long long value. It is much faster than: + * + * sdscatprintf(sdsempty(),"%lld\n", value); + */ sds sdsfromlonglong(long long value) { char buf[32], *p; unsigned long long v; @@ -413,6 +545,12 @@ sds sdsfromlonglong(long long value) { return sdsnewlen(p,32-(p-buf)); } +/* Append to the sds string "s" an escaped string representation where + * all the non-printable characters (tested with isprint()) are turned into + * escapes in the form "\n\r\a...." or "\x". + * + * After the call, the modified sds string is no longer valid and all the + * references must be substituted with the new pointer returned by the call. 
*/ sds sdscatrepr(sds s, const char *p, size_t len) { s = sdscatlen(s,"\"",1); while(len--) { From a184ff7d0e9e493bcd5ced9b0392d38ac59f6204 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 24 Jul 2013 10:37:55 +0200 Subject: [PATCH 0108/2500] Inline protocol improved to accept quoted strings. --- src/networking.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/networking.c b/src/networking.c index 54dfc025e..f7cfeb098 100644 --- a/src/networking.c +++ b/src/networking.c @@ -847,7 +847,7 @@ void resetClient(redisClient *c) { int processInlineBuffer(redisClient *c) { char *newline = strstr(c->querybuf,"\r\n"); int argc, j; - sds *argv; + sds *argv, aux; size_t querylen; /* Nothing to do without a \r\n */ @@ -861,7 +861,9 @@ int processInlineBuffer(redisClient *c) { /* Split the input buffer up to the \r\n */ querylen = newline-(c->querybuf); - argv = sdssplitlen(c->querybuf,querylen," ",1,&argc); + aux = sdsnewlen(c->querybuf,querylen); + argv = sdssplitargs(aux,&argc); + sdsfree(aux); /* Leave data after the first line of the query in the buffer */ c->querybuf = sdsrange(c->querybuf,querylen+2,-1); From fc11a993903287ebdbac148d8b61eadaf4f06604 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 24 Jul 2013 11:21:39 +0200 Subject: [PATCH 0109/2500] sdsrange() does not need to return a value. Actaully the string is modified in-place and a reallocation is never needed, so there is no need to return the new sds string pointer as return value of the function, that is now just "void". 
--- src/cluster.c | 2 +- src/networking.c | 10 +++++----- src/sds.c | 5 ++--- src/sds.h | 2 +- src/sentinel.c | 2 +- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 017989df3..89ef24b90 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1153,7 +1153,7 @@ void clusterWriteHandler(aeEventLoop *el, int fd, void *privdata, int mask) { handleLinkIOError(link); return; } - link->sndbuf = sdsrange(link->sndbuf,nwritten,-1); + sdsrange(link->sndbuf,nwritten,-1); if (sdslen(link->sndbuf) == 0) aeDeleteFileEvent(server.el, link->fd, AE_WRITABLE); } diff --git a/src/networking.c b/src/networking.c index f7cfeb098..c131c9c67 100644 --- a/src/networking.c +++ b/src/networking.c @@ -866,7 +866,7 @@ int processInlineBuffer(redisClient *c) { sdsfree(aux); /* Leave data after the first line of the query in the buffer */ - c->querybuf = sdsrange(c->querybuf,querylen+2,-1); + sdsrange(c->querybuf,querylen+2,-1); /* Setup argv array on client structure */ if (c->argv) zfree(c->argv); @@ -895,7 +895,7 @@ static void setProtocolError(redisClient *c, int pos) { sdsfree(client); } c->flags |= REDIS_CLOSE_AFTER_REPLY; - c->querybuf = sdsrange(c->querybuf,pos,-1); + sdsrange(c->querybuf,pos,-1); } int processMultibulkBuffer(redisClient *c) { @@ -933,7 +933,7 @@ int processMultibulkBuffer(redisClient *c) { pos = (newline-c->querybuf)+2; if (ll <= 0) { - c->querybuf = sdsrange(c->querybuf,pos,-1); + sdsrange(c->querybuf,pos,-1); return REDIS_OK; } @@ -982,7 +982,7 @@ int processMultibulkBuffer(redisClient *c) { * try to make it likely that it will start at c->querybuf * boundary so that we can optimized object creation * avoiding a large copy of data. */ - c->querybuf = sdsrange(c->querybuf,pos,-1); + sdsrange(c->querybuf,pos,-1); pos = 0; /* Hint the sds library about the amount of bytes this string is * going to contain. 
*/ @@ -1021,7 +1021,7 @@ int processMultibulkBuffer(redisClient *c) { } /* Trim to pos */ - if (pos) c->querybuf = sdsrange(c->querybuf,pos,-1); + if (pos) sdsrange(c->querybuf,pos,-1); /* We're done when c->multibulk == 0 */ if (c->multibulklen == 0) return REDIS_OK; diff --git a/src/sds.c b/src/sds.c index b63f44d94..d66c1d730 100644 --- a/src/sds.c +++ b/src/sds.c @@ -385,11 +385,11 @@ sds sdstrim(sds s, const char *cset) { * s = sdsnew("Hello World"); * sdstrim(s,1,-1); => "ello Worl" */ -sds sdsrange(sds s, int start, int end) { +void sdsrange(sds s, int start, int end) { struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); size_t newlen, len = sdslen(s); - if (len == 0) return s; + if (len == 0) return; if (start < 0) { start = len+start; if (start < 0) start = 0; @@ -413,7 +413,6 @@ sds sdsrange(sds s, int start, int end) { sh->buf[newlen] = 0; sh->free = sh->free+(sh->len-newlen); sh->len = newlen; - return s; } /* Apply tolower() to every character of the sds string 's'. */ diff --git a/src/sds.h b/src/sds.h index 46d914fd1..6f3201130 100644 --- a/src/sds.h +++ b/src/sds.h @@ -77,7 +77,7 @@ sds sdscatprintf(sds s, const char *fmt, ...); #endif sds sdstrim(sds s, const char *cset); -sds sdsrange(sds s, int start, int end); +void sdsrange(sds s, int start, int end); void sdsupdatelen(sds s); void sdsclear(sds s); int sdscmp(const sds s1, const sds s2); diff --git a/src/sentinel.c b/src/sentinel.c index 659de29b8..b257ad685 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1920,7 +1920,7 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) { if (ri->flags & SRI_RECONF_DONE) flags = sdscat(flags,"reconf_done,"); if (ri->flags & SRI_DEMOTE) flags = sdscat(flags,"demote,"); - if (sdslen(flags) != 0) flags = sdsrange(flags,0,-2); /* remove last "," */ + if (sdslen(flags) != 0) sdsrange(flags,0,-2); /* remove last "," */ addReplyBulkCString(c,flags); sdsfree(flags); fields++; From f1be75e5d081bb4abfab9648a0f5d0a5de669ce1 Mon Sep 
17 00:00:00 2001 From: Allan Date: Wed, 24 Jul 2013 21:34:55 +0800 Subject: [PATCH 0110/2500] fixed bug issue of #1213 --- src/redis.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/redis.c b/src/redis.c index 8a833d509..7e1c2fa5b 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1435,9 +1435,12 @@ void initServer() { for (j = 0; j < server.bindaddr_count || j == 0; j++) { if (server.bindaddr[j] == NULL) { /* Bind * for both IPv6 and IPv4. */ - server.ipfd[0] = anetTcp6Server(server.neterr,server.port,NULL); - if (server.ipfd[0] != ANET_ERR) server.ipfd_count++; - server.ipfd[1] = anetTcpServer(server.neterr,server.port,NULL); + server.ipfd[server.ipfd_count] = anetTcp6Server(server.neterr,server.port,NULL); + if (server.ipfd[server.ipfd_count] != ANET_ERR) server.ipfd_count++; + + server.ipfd[server.ipfd_count] = anetTcpServer(server.neterr,server.port,NULL); + + } else if (strchr(server.bindaddr[j],':')) { /* Bind IPv6 address. */ server.ipfd[server.ipfd_count] = anetTcp6Server(server.neterr,server.port,server.bindaddr[j]); From 31fe0202e492cc6934a3fb8aec06136f10f73995 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 24 Jul 2013 18:59:36 +0200 Subject: [PATCH 0111/2500] Ignore sdsrange return value. --- src/util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/util.c b/src/util.c index 022a6adf4..37534dfb9 100644 --- a/src/util.c +++ b/src/util.c @@ -438,7 +438,7 @@ sds getAbsolutePath(char *filename) { while (sdslen(relpath) >= 3 && relpath[0] == '.' && relpath[1] == '.' 
&& relpath[2] == '/') { - relpath = sdsrange(relpath,3,-1); + sdsrange(relpath,3,-1); if (sdslen(abspath) > 1) { char *p = abspath + sdslen(abspath)-2; int trimlen = 1; @@ -447,7 +447,7 @@ sds getAbsolutePath(char *filename) { p--; trimlen++; } - abspath = sdsrange(abspath,0,-(trimlen+1)); + sdsrange(abspath,0,-(trimlen+1)); } } From d3736ba40075414b937c54dc764c72547d0259b6 Mon Sep 17 00:00:00 2001 From: Allan Date: Thu, 25 Jul 2013 15:28:33 +0800 Subject: [PATCH 0112/2500] fixed initServer failed if no IPV4 or no IPV6 --- src/redis.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/redis.c b/src/redis.c index 7e1c2fa5b..4e842fc62 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1403,6 +1403,7 @@ void adjustOpenFilesLimit(void) { void initServer() { int j; + int ip_count; signal(SIGHUP, SIG_IGN); signal(SIGPIPE, SIG_IGN); @@ -1434,12 +1435,18 @@ void initServer() { if (server.bindaddr_count == 0) server.bindaddr[0] = NULL; for (j = 0; j < server.bindaddr_count || j == 0; j++) { if (server.bindaddr[j] == NULL) { - /* Bind * for both IPv6 and IPv4. */ - server.ipfd[server.ipfd_count] = anetTcp6Server(server.neterr,server.port,NULL); - if (server.ipfd[server.ipfd_count] != ANET_ERR) server.ipfd_count++; + /* Bind * for both IPv6 and IPv4. + * Should consider that someone only has IPV6 and someone only get IPV4 */ + ip_count = 0; + server.ipfd[ip_count] = anetTcp6Server(server.neterr,server.port,NULL); + if (server.ipfd[ip_count] != ANET_ERR) ip_count++; - server.ipfd[server.ipfd_count] = anetTcpServer(server.neterr,server.port,NULL); + server.ipfd[ip_count] = anetTcpServer(server.neterr,server.port,NULL); + if(server.ipfd[ip_count] != ANET_ERR ) ip_count++; + /* It should be ip_count plus one + * because out of this branch, the server.ipfd_count would increase */ + server.ipfd_count = ip_count - 1; } else if (strchr(server.bindaddr[j],':')) { /* Bind IPv6 address. 
*/ From 74ff4fc728a3d107e3d608d37113e78fa839fba8 Mon Sep 17 00:00:00 2001 From: Allan Date: Thu, 25 Jul 2013 15:36:00 +0800 Subject: [PATCH 0113/2500] fixed initServer fail while having no IPv6 nor IPv4 --- src/redis.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/redis.c b/src/redis.c index 4e842fc62..e4afdf221 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1446,7 +1446,7 @@ void initServer() { /* It should be ip_count plus one * because out of this branch, the server.ipfd_count would increase */ - server.ipfd_count = ip_count - 1; + server.ipfd_count += (ip_count - 1); } else if (strchr(server.bindaddr[j],':')) { /* Bind IPv6 address. */ From b55bcdd3ab461272763b18cb2c190469bc41f1ea Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 25 Jul 2013 10:32:19 +0200 Subject: [PATCH 0114/2500] Use latest sds.c in the hiredis library under deps. --- deps/hiredis/hiredis.c | 4 +- deps/hiredis/sds.c | 495 ++++++++++++++++++++++++++++++++--------- deps/hiredis/sds.h | 30 ++- deps/hiredis/zmalloc.h | 13 ++ 4 files changed, 422 insertions(+), 120 deletions(-) create mode 100644 deps/hiredis/zmalloc.h diff --git a/deps/hiredis/hiredis.c b/deps/hiredis/hiredis.c index 4709ee325..0b04935a1 100644 --- a/deps/hiredis/hiredis.c +++ b/deps/hiredis/hiredis.c @@ -650,7 +650,7 @@ int redisReaderGetReply(redisReader *r, void **reply) { /* Discard part of the buffer when we've consumed at least 1k, to avoid * doing unnecessary calls to memmove() in sds.c. 
*/ if (r->pos >= 1024) { - r->buf = sdsrange(r->buf,r->pos,-1); + sdsrange(r->buf,r->pos,-1); r->pos = 0; r->len = sdslen(r->buf); } @@ -1125,7 +1125,7 @@ int redisBufferWrite(redisContext *c, int *done) { sdsfree(c->obuf); c->obuf = sdsempty(); } else { - c->obuf = sdsrange(c->obuf,nwritten,-1); + sdsrange(c->obuf,nwritten,-1); } } } diff --git a/deps/hiredis/sds.c b/deps/hiredis/sds.c index 0af9c6720..d66c1d730 100644 --- a/deps/hiredis/sds.c +++ b/deps/hiredis/sds.c @@ -1,6 +1,6 @@ /* SDSLib, A C dynamic strings library * - * Copyright (c) 2006-2010, Salvatore Sanfilippo + * Copyright (c) 2006-2012, Salvatore Sanfilippo * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,52 +32,76 @@ #include #include #include +#include #include "sds.h" +#include "zmalloc.h" -#ifdef SDS_ABORT_ON_OOM -static void sdsOomAbort(void) { - fprintf(stderr,"SDS: Out Of Memory (SDS_ABORT_ON_OOM defined)\n"); - abort(); -} -#endif - +/* Create a new sds string with the content specified by the 'init' pointer + * and 'initlen'. + * If NULL is used for 'init' the string is initialized with zero bytes. + * + * The string is always null-termined (all the sds strings are, always) so + * even if you create an sds string with: + * + * mystring = sdsnewlen("abc",3"); + * + * You can print the string with printf() as there is an implicit \0 at the + * end of the string. However the string is binary safe and can contain + * \0 characters in the middle, as the length is stored in the sds header. 
*/ sds sdsnewlen(const void *init, size_t initlen) { struct sdshdr *sh; - sh = malloc(sizeof(struct sdshdr)+initlen+1); -#ifdef SDS_ABORT_ON_OOM - if (sh == NULL) sdsOomAbort(); -#else + if (init) { + sh = zmalloc(sizeof(struct sdshdr)+initlen+1); + } else { + sh = zcalloc(sizeof(struct sdshdr)+initlen+1); + } if (sh == NULL) return NULL; -#endif sh->len = initlen; sh->free = 0; - if (initlen) { - if (init) memcpy(sh->buf, init, initlen); - else memset(sh->buf,0,initlen); - } + if (initlen && init) + memcpy(sh->buf, init, initlen); sh->buf[initlen] = '\0'; return (char*)sh->buf; } +/* Create an empty (zero length) sds string. Even in this case the string + * always has an implicit null term. */ sds sdsempty(void) { return sdsnewlen("",0); } +/* Create a new sds string starting from a null termined C string. */ sds sdsnew(const char *init) { size_t initlen = (init == NULL) ? 0 : strlen(init); return sdsnewlen(init, initlen); } +/* Duplicate an sds string. */ sds sdsdup(const sds s) { return sdsnewlen(s, sdslen(s)); } +/* Free an sds string. No operation is performed if 's' is NULL. */ void sdsfree(sds s) { if (s == NULL) return; - free(s-sizeof(struct sdshdr)); + zfree(s-sizeof(struct sdshdr)); } +/* Set the sds string length to the length as obtained with strlen(), so + * considering as content only up to the first null term character. + * + * This function is useful when the sds string is hacked manually in some + * way, like in the following example: + * + * s = sdsnew("foobar"); + * s[2] = '\0'; + * sdsupdatelen(s); + * printf("%d\n", sdslen(s)); + * + * The output will be "2", but if we comment out the call to sdsupdatelen() + * the output will be "6" as the string was modified but the logical length + * remains 6 bytes. 
*/ void sdsupdatelen(sds s) { struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); int reallen = strlen(s); @@ -85,7 +109,24 @@ void sdsupdatelen(sds s) { sh->len = reallen; } -static sds sdsMakeRoomFor(sds s, size_t addlen) { +/* Modify an sds string on-place to make it empty (zero length). + * However all the existing buffer is not discarded but set as free space + * so that next append operations will not require allocations up to the + * number of bytes previously available. */ +void sdsclear(sds s) { + struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); + sh->free += sh->len; + sh->len = 0; + sh->buf[0] = '\0'; +} + +/* Enlarge the free space at the end of the sds string so that the caller + * is sure that after calling this function can overwrite up to addlen + * bytes after the end of the string, plus one more byte for nul term. + * + * Note: this does not change the *length* of the sds string as returned + * by sdslen(), but only the free buffer space we have. */ +sds sdsMakeRoomFor(sds s, size_t addlen) { struct sdshdr *sh, *newsh; size_t free = sdsavail(s); size_t len, newlen; @@ -93,20 +134,84 @@ static sds sdsMakeRoomFor(sds s, size_t addlen) { if (free >= addlen) return s; len = sdslen(s); sh = (void*) (s-(sizeof(struct sdshdr))); - newlen = (len+addlen)*2; - newsh = realloc(sh, sizeof(struct sdshdr)+newlen+1); -#ifdef SDS_ABORT_ON_OOM - if (newsh == NULL) sdsOomAbort(); -#else + newlen = (len+addlen); + if (newlen < SDS_MAX_PREALLOC) + newlen *= 2; + else + newlen += SDS_MAX_PREALLOC; + newsh = zrealloc(sh, sizeof(struct sdshdr)+newlen+1); if (newsh == NULL) return NULL; -#endif newsh->free = newlen - len; return newsh->buf; } +/* Reallocate the sds string so that it has no free space at the end. The + * contained string remains not altered, but next concatenation operations + * will require a reallocation. 
+ * + * After the call, the passed sds string is no longer valid and all the + * references must be substituted with the new pointer returned by the call. */ +sds sdsRemoveFreeSpace(sds s) { + struct sdshdr *sh; + + sh = (void*) (s-(sizeof(struct sdshdr))); + sh = zrealloc(sh, sizeof(struct sdshdr)+sh->len+1); + sh->free = 0; + return sh->buf; +} + +/* Return the total size of the allocation of the specifed sds string, + * including: + * 1) The sds header before the pointer. + * 2) The string. + * 3) The free buffer at the end if any. + * 4) The implicit null term. + */ +size_t sdsAllocSize(sds s) { + struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); + + return sizeof(*sh)+sh->len+sh->free+1; +} + +/* Increment the sds length and decrements the left free space at the + * end of the string according to 'incr'. Also set the null term + * in the new end of the string. + * + * This function is used in order to fix the string length after the + * user calls sdsMakeRoomFor(), writes something after the end of + * the current string, and finally needs to set the new length. + * + * Note: it is possible to use a negative increment in order to + * right-trim the string. + * + * Usage example: + * + * Using sdsIncrLen() and sdsMakeRoomFor() it is possible to mount the + * following schema, to cat bytes coming from the kernel to the end of an + * sds string without copying into an intermediate buffer: + * + * oldlen = sdslen(s); + * s = sdsMakeRoomFor(s, BUFFER_SIZE); + * nread = read(fd, s+oldlen, BUFFER_SIZE); + * ... check for nread <= 0 and handle it ... + * sdsIncrLen(s, nread); + */ +void sdsIncrLen(sds s, int incr) { + struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); + + assert(sh->free >= incr); + sh->len += incr; + sh->free -= incr; + assert(sh->free >= 0); + s[sh->len] = '\0'; +} + /* Grow the sds to have the specified length. Bytes that were not part of - * the original length of the sds will be set to zero. 
*/ + * the original length of the sds will be set to zero. + * + * if the specified length is smaller than the current length, no operation + * is performed. */ sds sdsgrowzero(sds s, size_t len) { struct sdshdr *sh = (void*)(s-(sizeof(struct sdshdr))); size_t totlen, curlen = sh->len; @@ -124,6 +229,11 @@ sds sdsgrowzero(sds s, size_t len) { return s; } +/* Append the specified binary-safe string pointed by 't' of 'len' bytes to the + * end of the specified sds string 's'. + * + * After the call, the passed sds string is no longer valid and all the + * references must be substituted with the new pointer returned by the call. */ sds sdscatlen(sds s, const void *t, size_t len) { struct sdshdr *sh; size_t curlen = sdslen(s); @@ -138,11 +248,25 @@ sds sdscatlen(sds s, const void *t, size_t len) { return s; } +/* Append the specified null termianted C string to the sds string 's'. + * + * After the call, the passed sds string is no longer valid and all the + * references must be substituted with the new pointer returned by the call. */ sds sdscat(sds s, const char *t) { return sdscatlen(s, t, strlen(t)); } -sds sdscpylen(sds s, char *t, size_t len) { +/* Append the specified sds 't' to the existing sds 's'. + * + * After the call, the modified sds string is no longer valid and all the + * references must be substituted with the new pointer returned by the call. */ +sds sdscatsds(sds s, const sds t) { + return sdscatlen(s, t, sdslen(t)); +} + +/* Destructively modify the sds string 's' to hold the specified binary + * safe string pointed by 't' of length 'len' bytes. */ +sds sdscpylen(sds s, const char *t, size_t len) { struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); size_t totlen = sh->free+sh->len; @@ -159,37 +283,52 @@ sds sdscpylen(sds s, char *t, size_t len) { return s; } -sds sdscpy(sds s, char *t) { +/* Like sdscpylen() but 't' must be a null-termined string so that the length + * of the string is obtained with strlen(). 
*/ +sds sdscpy(sds s, const char *t) { return sdscpylen(s, t, strlen(t)); } +/* Like sdscatpritf() but gets va_list instead of being variadic. */ sds sdscatvprintf(sds s, const char *fmt, va_list ap) { va_list cpy; char *buf, *t; size_t buflen = 16; while(1) { - buf = malloc(buflen); -#ifdef SDS_ABORT_ON_OOM - if (buf == NULL) sdsOomAbort(); -#else + buf = zmalloc(buflen); if (buf == NULL) return NULL; -#endif buf[buflen-2] = '\0'; va_copy(cpy,ap); vsnprintf(buf, buflen, fmt, cpy); if (buf[buflen-2] != '\0') { - free(buf); + zfree(buf); buflen *= 2; continue; } break; } t = sdscat(s, buf); - free(buf); + zfree(buf); return t; } +/* Append to the sds string 's' a string obtained using printf-alike format + * specifier. + * + * After the call, the modified sds string is no longer valid and all the + * references must be substituted with the new pointer returned by the call. + * + * Example: + * + * s = sdsempty("Sum is: "); + * s = sdscatprintf(s,"%d+%d = %d",a,b,a+b). + * + * Often you need to create a string from scratch with the printf-alike + * format. When this is the need, just use sdsempty() as the target string: + * + * s = sdscatprintf(sdsempty(), "... your format ...", args); + */ sds sdscatprintf(sds s, const char *fmt, ...) { va_list ap; char *t; @@ -199,6 +338,20 @@ sds sdscatprintf(sds s, const char *fmt, ...) { return t; } +/* Remove the part of the string from left and from right composed just of + * contiguous characters found in 'cset', that is a null terminted C string. + * + * After the call, the modified sds string is no longer valid and all the + * references must be substituted with the new pointer returned by the call. + * + * Example: + * + * s = sdsnew("AA...AA.a.aa.aHelloWorld :::"); + * s = sdstrim(s,"A. :"); + * printf("%s\n", s); + * + * Output will be just "Hello World". 
+ */ sds sdstrim(sds s, const char *cset) { struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); char *start, *end, *sp, *ep; @@ -216,11 +369,27 @@ sds sdstrim(sds s, const char *cset) { return s; } -sds sdsrange(sds s, int start, int end) { +/* Turn the string into a smaller (or equal) string containing only the + * substring specified by the 'start' and 'end' indexes. + * + * start and end can be negative, where -1 means the last character of the + * string, -2 the penultimate character, and so forth. + * + * The interval is inclusive, so the start and end characters will be part + * of the resulting string. + * + * The string is modified in-place. + * + * Example: + * + * s = sdsnew("Hello World"); + * sdstrim(s,1,-1); => "ello Worl" + */ +void sdsrange(sds s, int start, int end) { struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); size_t newlen, len = sdslen(s); - if (len == 0) return s; + if (len == 0) return; if (start < 0) { start = len+start; if (start < 0) start = 0; @@ -244,22 +413,34 @@ sds sdsrange(sds s, int start, int end) { sh->buf[newlen] = 0; sh->free = sh->free+(sh->len-newlen); sh->len = newlen; - return s; } +/* Apply tolower() to every character of the sds string 's'. */ void sdstolower(sds s) { int len = sdslen(s), j; for (j = 0; j < len; j++) s[j] = tolower(s[j]); } +/* Apply toupper() to every character of the sds string 's'. */ void sdstoupper(sds s) { int len = sdslen(s), j; for (j = 0; j < len; j++) s[j] = toupper(s[j]); } -int sdscmp(sds s1, sds s2) { +/* Compare two sds strings s1 and s2 with memcmp(). + * + * Return value: + * + * 1 if s1 > s2. + * -1 if s1 < s2. + * 0 if s1 and s2 are exactly the same binary string. + * + * If two strings share exactly the same prefix, but one of the two has + * additional characters, the longer string is considered to be greater than + * the smaller one. 
*/ +int sdscmp(const sds s1, const sds s2) { size_t l1, l2, minlen; int cmp; @@ -287,14 +468,15 @@ int sdscmp(sds s1, sds s2) { * requires length arguments. sdssplit() is just the * same function but for zero-terminated strings. */ -sds *sdssplitlen(char *s, int len, char *sep, int seplen, int *count) { +sds *sdssplitlen(const char *s, int len, const char *sep, int seplen, int *count) { int elements = 0, slots = 5, start = 0, j; + sds *tokens; + + if (seplen < 1 || len < 0) return NULL; + + tokens = zmalloc(sizeof(sds)*slots); + if (tokens == NULL) return NULL; - sds *tokens = malloc(sizeof(sds)*slots); -#ifdef SDS_ABORT_ON_OOM - if (tokens == NULL) sdsOomAbort(); -#endif - if (seplen < 1 || len < 0 || tokens == NULL) return NULL; if (len == 0) { *count = 0; return tokens; @@ -305,26 +487,14 @@ sds *sdssplitlen(char *s, int len, char *sep, int seplen, int *count) { sds *newtokens; slots *= 2; - newtokens = realloc(tokens,sizeof(sds)*slots); - if (newtokens == NULL) { -#ifdef SDS_ABORT_ON_OOM - sdsOomAbort(); -#else - goto cleanup; -#endif - } + newtokens = zrealloc(tokens,sizeof(sds)*slots); + if (newtokens == NULL) goto cleanup; tokens = newtokens; } /* search the separator */ if ((seplen == 1 && *(s+j) == sep[0]) || (memcmp(s+j,sep,seplen) == 0)) { tokens[elements] = sdsnewlen(s+start,j-start); - if (tokens[elements] == NULL) { -#ifdef SDS_ABORT_ON_OOM - sdsOomAbort(); -#else - goto cleanup; -#endif - } + if (tokens[elements] == NULL) goto cleanup; elements++; start = j+seplen; j = j+seplen-1; /* skip the separator */ @@ -332,35 +502,33 @@ sds *sdssplitlen(char *s, int len, char *sep, int seplen, int *count) { } /* Add the final element. We are sure there is room in the tokens array. 
*/ tokens[elements] = sdsnewlen(s+start,len-start); - if (tokens[elements] == NULL) { -#ifdef SDS_ABORT_ON_OOM - sdsOomAbort(); -#else - goto cleanup; -#endif - } + if (tokens[elements] == NULL) goto cleanup; elements++; *count = elements; return tokens; -#ifndef SDS_ABORT_ON_OOM cleanup: { int i; for (i = 0; i < elements; i++) sdsfree(tokens[i]); - free(tokens); + zfree(tokens); + *count = 0; return NULL; } -#endif } +/* Free the result returned by sdssplitlen(), or do nothing if 'tokens' is NULL. */ void sdsfreesplitres(sds *tokens, int count) { if (!tokens) return; while(count--) sdsfree(tokens[count]); - free(tokens); + zfree(tokens); } +/* Create an sds string from a long long value. It is much faster than: + * + * sdscatprintf(sdsempty(),"%lld\n", value); + */ sds sdsfromlonglong(long long value) { char buf[32], *p; unsigned long long v; @@ -376,10 +544,14 @@ sds sdsfromlonglong(long long value) { return sdsnewlen(p,32-(p-buf)); } -sds sdscatrepr(sds s, char *p, size_t len) { +/* Append to the sds string "s" an escaped string representation where + * all the non-printable characters (tested with isprint()) are turned into + * escapes in the form "\n\r\a...." or "\x". + * + * After the call, the modified sds string is no longer valid and all the + * references must be substituted with the new pointer returned by the call. */ +sds sdscatrepr(sds s, const char *p, size_t len) { s = sdscatlen(s,"\"",1); - if (s == NULL) return NULL; - while(len--) { switch(*p) { case '\\': @@ -399,27 +571,64 @@ sds sdscatrepr(sds s, char *p, size_t len) { break; } p++; - if (s == NULL) return NULL; } return sdscatlen(s,"\"",1); } +/* Helper function for sdssplitargs() that returns non zero if 'c' + * is a valid hex digit. 
*/ +int is_hex_digit(char c) { + return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || + (c >= 'A' && c <= 'F'); +} + +/* Helper function for sdssplitargs() that converts an hex digit into an + * integer from 0 to 15 */ +int hex_digit_to_int(char c) { + switch(c) { + case '0': return 0; + case '1': return 1; + case '2': return 2; + case '3': return 3; + case '4': return 4; + case '5': return 5; + case '6': return 6; + case '7': return 7; + case '8': return 8; + case '9': return 9; + case 'a': case 'A': return 10; + case 'b': case 'B': return 11; + case 'c': case 'C': return 12; + case 'd': case 'D': return 13; + case 'e': case 'E': return 14; + case 'f': case 'F': return 15; + default: return 0; + } +} + /* Split a line into arguments, where every argument can be in the * following programming-language REPL-alike form: * * foo bar "newline are supported\n" and "\xff\x00otherstuff" * * The number of arguments is stored into *argc, and an array - * of sds is returned. The caller should sdsfree() all the returned - * strings and finally free() the array itself. + * of sds is returned. + * + * The caller should free the resulting array of sds strings with + * sdsfreesplitres(). * * Note that sdscatrepr() is able to convert back a string into * a quoted string in the same format sdssplitargs() is able to parse. 
+ * + * The function returns the allocated tokens on success, even when the + * input string is empty, or NULL if the input contains unbalanced + * quotes or closed quotes followed by non space characters + * as in: "foo"bar or "foo' */ -sds *sdssplitargs(char *line, int *argc) { - char *p = line; +sds *sdssplitargs(const char *line, int *argc) { + const char *p = line; char *current = NULL; - char **vector = NULL, **_vector = NULL; + char **vector = NULL; *argc = 0; while(1) { @@ -427,17 +636,24 @@ sds *sdssplitargs(char *line, int *argc) { while(*p && isspace(*p)) p++; if (*p) { /* get a token */ - int inq=0; /* set to 1 if we are in "quotes" */ + int inq=0; /* set to 1 if we are in "quotes" */ + int insq=0; /* set to 1 if we are in 'single quotes' */ int done=0; - if (current == NULL) { - current = sdsempty(); - if (current == NULL) goto err; - } - + if (current == NULL) current = sdsempty(); while(!done) { if (inq) { - if (*p == '\\' && *(p+1)) { + if (*p == '\\' && *(p+1) == 'x' && + is_hex_digit(*(p+2)) && + is_hex_digit(*(p+3))) + { + unsigned char byte; + + byte = (hex_digit_to_int(*(p+2))*16)+ + hex_digit_to_int(*(p+3)); + current = sdscatlen(current,(char*)&byte,1); + p += 3; + } else if (*p == '\\' && *(p+1)) { char c; p++; @@ -451,7 +667,23 @@ sds *sdssplitargs(char *line, int *argc) { } current = sdscatlen(current,&c,1); } else if (*p == '"') { - /* closing quote must be followed by a space */ + /* closing quote must be followed by a space or + * nothing at all. */ + if (*(p+1) && !isspace(*(p+1))) goto err; + done=1; + } else if (!*p) { + /* unterminated quotes */ + goto err; + } else { + current = sdscatlen(current,p,1); + } + } else if (insq) { + if (*p == '\\' && *(p+1) == '\'') { + p++; + current = sdscatlen(current,"'",1); + } else if (*p == '\'') { + /* closing quote must be followed by a space or + * nothing at all. 
*/ if (*(p+1) && !isspace(*(p+1))) goto err; done=1; } else if (!*p) { @@ -472,23 +704,24 @@ sds *sdssplitargs(char *line, int *argc) { case '"': inq=1; break; + case '\'': + insq=1; + break; default: current = sdscatlen(current,p,1); break; } } if (*p) p++; - if (current == NULL) goto err; } /* add the token to the vector */ - _vector = realloc(vector,((*argc)+1)*sizeof(char*)); - if (_vector == NULL) goto err; - - vector = _vector; + vector = zrealloc(vector,((*argc)+1)*sizeof(char*)); vector[*argc] = current; (*argc)++; current = NULL; } else { + /* Even on empty input string return something not NULL. */ + if (vector == NULL) vector = zmalloc(sizeof(void*)); return vector; } } @@ -496,30 +729,55 @@ sds *sdssplitargs(char *line, int *argc) { err: while((*argc)--) sdsfree(vector[*argc]); - if (vector != NULL) free(vector); - if (current != NULL) sdsfree(current); + zfree(vector); + if (current) sdsfree(current); + *argc = 0; return NULL; } +/* Modify the string substituting all the occurrences of the set of + * characters specified in the 'from' string to the corresponding character + * in the 'to' array. + * + * For instance: sdsmapchars(mystring, "ho", "01", 2) + * will have the effect of turning the string "hello" into "0ell1". + * + * The function returns the sds string pointer, that is always the same + * as the input pointer since no resize is needed. */ +sds sdsmapchars(sds s, const char *from, const char *to, size_t setlen) { + size_t j, i, l = sdslen(s); + + for (j = 0; j < l; j++) { + for (i = 0; i < setlen; i++) { + if (s[j] == from[i]) { + s[j] = to[i]; + break; + } + } + } + return s; +} + +/* Join an array of C strings using the specified separator (also a C string). + * Returns the result as an sds string. 
*/ +sds sdsjoin(char **argv, int argc, char *sep) { + sds join = sdsempty(); + int j; + + for (j = 0; j < argc; j++) { + join = sdscat(join, argv[j]); + if (j != argc-1) join = sdscat(join,sep); + } + return join; +} + #ifdef SDS_TEST_MAIN #include - -int __failed_tests = 0; -int __test_num = 0; -#define test_cond(descr,_c) do { \ - __test_num++; printf("%d - %s: ", __test_num, descr); \ - if(_c) printf("PASSED\n"); else {printf("FAILED\n"); __failed_tests++;} \ -} while(0); -#define test_report() do { \ - printf("%d tests, %d passed, %d failed\n", __test_num, \ - __test_num-__failed_tests, __failed_tests); \ - if (__failed_tests) { \ - printf("=== WARNING === We have failed tests here...\n"); \ - } \ -} while(0); +#include "testhelp.h" int main(void) { { + struct sdshdr *sh; sds x = sdsnew("foo"), y; test_cond("Create a string and obtain the length", @@ -599,7 +857,26 @@ int main(void) { x = sdsnew("aar"); y = sdsnew("bar"); test_cond("sdscmp(bar,bar)", sdscmp(x,y) < 0) + + { + int oldfree; + + sdsfree(x); + x = sdsnew("0"); + sh = (void*) (x-(sizeof(struct sdshdr))); + test_cond("sdsnew() free/len buffers", sh->len == 1 && sh->free == 0); + x = sdsMakeRoomFor(x,1); + sh = (void*) (x-(sizeof(struct sdshdr))); + test_cond("sdsMakeRoomFor()", sh->len == 1 && sh->free > 0); + oldfree = sh->free; + x[1] = '1'; + sdsIncrLen(x,1); + test_cond("sdsIncrLen() -- content", x[0] == '0' && x[1] == '1'); + test_cond("sdsIncrLen() -- len", sh->len == 2); + test_cond("sdsIncrLen() -- free", sh->free == oldfree-1); + } } test_report() + return 0; } #endif diff --git a/deps/hiredis/sds.h b/deps/hiredis/sds.h index 94f5871f5..6f3201130 100644 --- a/deps/hiredis/sds.h +++ b/deps/hiredis/sds.h @@ -31,6 +31,8 @@ #ifndef __SDS_H #define __SDS_H +#define SDS_MAX_PREALLOC (1024*1024) + #include #include @@ -54,16 +56,17 @@ static inline size_t sdsavail(const sds s) { sds sdsnewlen(const void *init, size_t initlen); sds sdsnew(const char *init); -sds sdsempty(void); +sds sdsempty(); 
size_t sdslen(const sds s); sds sdsdup(const sds s); void sdsfree(sds s); -size_t sdsavail(sds s); +size_t sdsavail(const sds s); sds sdsgrowzero(sds s, size_t len); sds sdscatlen(sds s, const void *t, size_t len); sds sdscat(sds s, const char *t); -sds sdscpylen(sds s, char *t, size_t len); -sds sdscpy(sds s, char *t); +sds sdscatsds(sds s, const sds t); +sds sdscpylen(sds s, const char *t, size_t len); +sds sdscpy(sds s, const char *t); sds sdscatvprintf(sds s, const char *fmt, va_list ap); #ifdef __GNUC__ @@ -74,15 +77,24 @@ sds sdscatprintf(sds s, const char *fmt, ...); #endif sds sdstrim(sds s, const char *cset); -sds sdsrange(sds s, int start, int end); +void sdsrange(sds s, int start, int end); void sdsupdatelen(sds s); -int sdscmp(sds s1, sds s2); -sds *sdssplitlen(char *s, int len, char *sep, int seplen, int *count); +void sdsclear(sds s); +int sdscmp(const sds s1, const sds s2); +sds *sdssplitlen(const char *s, int len, const char *sep, int seplen, int *count); void sdsfreesplitres(sds *tokens, int count); void sdstolower(sds s); void sdstoupper(sds s); sds sdsfromlonglong(long long value); -sds sdscatrepr(sds s, char *p, size_t len); -sds *sdssplitargs(char *line, int *argc); +sds sdscatrepr(sds s, const char *p, size_t len); +sds *sdssplitargs(const char *line, int *argc); +sds sdsmapchars(sds s, const char *from, const char *to, size_t setlen); +sds sdsjoin(char **argv, int argc, char *sep); + +/* Low level functions exposed to the user API */ +sds sdsMakeRoomFor(sds s, size_t addlen); +void sdsIncrLen(sds s, int incr); +sds sdsRemoveFreeSpace(sds s); +size_t sdsAllocSize(sds s); #endif diff --git a/deps/hiredis/zmalloc.h b/deps/hiredis/zmalloc.h new file mode 100644 index 000000000..99b87ace9 --- /dev/null +++ b/deps/hiredis/zmalloc.h @@ -0,0 +1,13 @@ +/* Drop in replacement for zmalloc.h in order to just use libc malloc without + * any wrappering. 
*/ + +#ifndef ZMALLOC_H +#define ZMALLOC_H + +#define zmalloc malloc +#define zrealloc realloc +#define zcalloc(x) calloc(x,1) +#define zfree free +#define zstrdup strdup + +#endif From 9453949f8ba67af98e932314cf56c28f68a99c1a Mon Sep 17 00:00:00 2001 From: antirez Date: Sun, 28 Jul 2013 11:00:09 +0200 Subject: [PATCH 0115/2500] Remove dead variable bothsds from object.c. Thanks to @run and @badboy for spotting this. Trivia: clang was not able to provide me a warning about that when compiling. This closes #1024 and #1207, committing the change myself as the pull requests no longer apply cleanly after other changes to the same function. --- src/object.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/object.c b/src/object.c index 9efb590f8..159e92ada 100644 --- a/src/object.c +++ b/src/object.c @@ -418,7 +418,6 @@ int compareStringObjectsWithFlags(robj *a, robj *b, int flags) { redisAssertWithInfo(NULL,a,a->type == REDIS_STRING && b->type == REDIS_STRING); char bufa[128], bufb[128], *astr, *bstr; size_t alen, blen, minlen; - int bothsds = 1; if (a == b) return 0; if (sdsEncodedObject(a)) { @@ -427,7 +426,6 @@ int compareStringObjectsWithFlags(robj *a, robj *b, int flags) { } else { alen = ll2string(bufa,sizeof(bufa),(long) a->ptr); astr = bufa; - bothsds = 0; } if (sdsEncodedObject(b)) { bstr = b->ptr; @@ -435,7 +433,6 @@ int compareStringObjectsWithFlags(robj *a, robj *b, int flags) { } else { blen = ll2string(bufb,sizeof(bufb),(long) b->ptr); bstr = bufb; - bothsds = 0; } if (flags & REDIS_COMPARE_COLL) { return strcoll(astr,bstr); From 9efbe0dca010c511546bde6d07a0c58d2533846e Mon Sep 17 00:00:00 2001 From: antirez Date: Sun, 28 Jul 2013 12:49:07 +0200 Subject: [PATCH 0116/2500] Fix replicationFeedSlaves() off-by-one bug. This fixes issue #1221. 
--- src/replication.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/replication.c b/src/replication.c index 3e8964ddd..2d7ee809d 100644 --- a/src/replication.c +++ b/src/replication.c @@ -188,7 +188,7 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { b[0] = '*'; len = ll2string(b+1,REDIS_LONGSTR_SIZE,argc); b += len+1; - buf_left -= len; + buf_left -= len+1; b[0] = '\r'; b[1] = '\n'; b += 2; @@ -219,7 +219,7 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { b[0] = '$'; len = ll2string(b+1,REDIS_LONGSTR_SIZE,objlen); b += len+1; - buf_left -= len; + buf_left -= len+1; b[0] = '\r'; b[1] = '\n'; b += 2; From 1b26e3e1faf876603cb62968bbc70d5e485d075a Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 29 Jul 2013 17:39:19 +0200 Subject: [PATCH 0117/2500] Test: regression test for issue #1221. --- tests/integration/replication-4.tcl | 38 +++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/integration/replication-4.tcl b/tests/integration/replication-4.tcl index f84369f44..6db9ffe2b 100644 --- a/tests/integration/replication-4.tcl +++ b/tests/integration/replication-4.tcl @@ -96,3 +96,41 @@ start_server {tags {"repl"}} { } {NOREPLICAS*} } } + +start_server {tags {"repl"}} { + start_server {} { + set master [srv -1 client] + set master_host [srv -1 host] + set master_port [srv -1 port] + set slave [srv 0 client] + + test {First server should have role slave after SLAVEOF} { + $slave slaveof $master_host $master_port + wait_for_condition 50 100 { + [s 0 role] eq {slave} + } else { + fail "Replication not started." + } + } + + test {Replication: commands with many arguments (issue #1221)} { + # We now issue large MSET commands, that may trigger a specific + # class of bugs, see issue #1221. 
+ for {set j 0} {$j < 100} {incr j} { + set cmd [list mset] + for {set x 0} {$x < 1000} {incr x} { + lappend cmd [randomKey] [randomValue] + } + $master {*}$cmd + } + + set retry 10 + while {$retry && ([$master debug digest] ne [$slave debug digest])}\ + { + after 1000 + incr retry -1 + } + assert {[$master dbsize] > 0} + } + } +} From db3bbb9006a9bc5f9313da4c40bbd0e295a57a42 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 5 Aug 2013 12:05:22 +0200 Subject: [PATCH 0118/2500] Draft #1 of a new expired keys collection algorithm. The main idea here is that when we are no longer able to expire keys at the rate they are created, we can't block more in the normal expire cycle as this would result in too big latency spikes. For this reason the commit introduces a "fast" expire cycle that does not run for more than 1 millisecond but is called in the beforeSleep() hook of the event loop, so much more often, and with a frequency bound to the frequency of executed commands. The fast expire cycle is only called when the standard expiration algorithm runs out of time, that is, consumed more than REDIS_EXPIRELOOKUPS_TIME_PERC of CPU in a given cycle without being able to take the number of already expired keys that are not yet collected to a number smaller than 25% of the number of keys. You can test this commit with different loads, but a simple way is to use the following: Extreme load with pipelining: redis-benchmark -r 100000000 -n 100000000 \ -P 32 set ele:rand:000000000000 foo ex 2 Remove the -P32 in order to avoid the pipelining for a more real-world load. In another terminal tab you can monitor the Redis behavior with: redis-cli -i 0.1 -r -1 info keyspace and redis-cli --latency-history Note: this commit will make Redis print a lot of debug messages, it is not a good idea to use it in production. 
--- src/redis.c | 116 +++++++++++++++++++++++++++++++++++++++++++--------- src/redis.h | 5 ++- 2 files changed, 100 insertions(+), 21 deletions(-) diff --git a/src/redis.c b/src/redis.c index 8a833d509..52f9b1f26 100644 --- a/src/redis.c +++ b/src/redis.c @@ -654,14 +654,51 @@ void updateDictResizePolicy(void) { /* ======================= Cron: called every 100 ms ======================== */ +/* Helper function for the activeExpireCycle() function. + * This function will try to expire the key that is stored in the hash table + * entry 'de' of the 'expires' hash table of a Redis database. + * + * If the key is found to be expired, it is removed from the database and + * 1 is returned. Otherwise no operation is performed and 0 is returned. + * + * When a key is expired, server.stat_expiredkeys is incremented. + * + * The parameter 'now' is the current time in milliseconds as is passed + * to the function to avoid too many gettimeofday() syscalls. */ +int activeExpireCycleTryExpire(redisDb *db, struct dictEntry *de, long long now) { + long long t = dictGetSignedIntegerVal(de); + if (now > t) { + sds key = dictGetKey(de); + robj *keyobj = createStringObject(key,sdslen(key)); + + propagateExpire(db,keyobj); + dbDelete(db,keyobj); + notifyKeyspaceEvent(REDIS_NOTIFY_EXPIRED, + "expired",keyobj,db->id); + decrRefCount(keyobj); + server.stat_expiredkeys++; + return 1; + } else { + return 0; + } +} + /* Try to expire a few timed out keys. The algorithm used is adaptive and * will use few CPU cycles if there are few expiring keys, otherwise * it will get more aggressive to avoid that too much memory is used by * keys that can be removed from the keyspace. * * No more than REDIS_DBCRON_DBS_PER_CALL databases are tested at every - * iteration. */ -void activeExpireCycle(void) { + * iteration. + * + * If fast is non-zero the function will try to expire just one key ASAP + * from the current DB and return. 
This kind of call is used when Redis detects + * that timelimit_exit is true, so there is more work to do, and we do it + * more incrementally from the beforeSleep() function of the event loop. */ + +#define EXPIRED_HISTORY_LEN 10 + +void activeExpireCycle(int fast) { /* This function has some global state in order to continue the work * incrementally across calls. */ static unsigned int current_db = 0; /* Last DB tested. */ @@ -671,6 +708,31 @@ void activeExpireCycle(void) { unsigned int dbs_per_call = REDIS_DBCRON_DBS_PER_CALL; long long start = ustime(), timelimit; +#if 0 + static int expired_history[EXPIRED_HISTORY_LEN]; + static int expired_history_id = 0; + static int expired_perc_avg = 0; +#endif + + if (fast && !timelimit_exit) return; + +#if 0 + if (fast) { + if (!timelimit_exit) return; + + /* Let's try to expire a single key from the previous DB, the one that + * had enough keys expiring to reach the time limit. */ + redisDb *db = server.db+((current_db+server.dbnum-1) % server.dbnum); + dictEntry *de; + + for (j = 0; j < 100; j++) { + if ((de = dictGetRandomKey(db->expires)) == NULL) break; + activeExpireCycleTryExpire(db,de,server.mstime); + } + return; + } +#endif + /* We usually should test REDIS_DBCRON_DBS_PER_CALL per iteration, with * two exceptions: * @@ -689,6 +751,8 @@ void activeExpireCycle(void) { timelimit_exit = 0; if (timelimit <= 0) timelimit = 1; + if (fast) timelimit = 1000; /* 1 millisecond. 
*/ + for (j = 0; j < dbs_per_call; j++) { int expired; redisDb *db = server.db+(current_db % server.dbnum); @@ -722,22 +786,9 @@ void activeExpireCycle(void) { num = REDIS_EXPIRELOOKUPS_PER_CRON; while (num--) { dictEntry *de; - long long t; if ((de = dictGetRandomKey(db->expires)) == NULL) break; - t = dictGetSignedIntegerVal(de); - if (now > t) { - sds key = dictGetKey(de); - robj *keyobj = createStringObject(key,sdslen(key)); - - propagateExpire(db,keyobj); - dbDelete(db,keyobj); - notifyKeyspaceEvent(REDIS_NOTIFY_EXPIRED, - "expired",keyobj,db->id); - decrRefCount(keyobj); - expired++; - server.stat_expiredkeys++; - } + if (activeExpireCycleTryExpire(db,de,now)) expired++; } /* We can't block forever here even if there are many keys to * expire. So after a given amount of milliseconds return to the @@ -747,8 +798,21 @@ void activeExpireCycle(void) { (ustime()-start) > timelimit) { timelimit_exit = 1; - return; } +#if 0 + expired_history_id = (expired_history_id+1) % EXPIRED_HISTORY_LEN; + expired_history[expired_history_id] = expired; + { + int i; + expired_perc_avg = 0; + for (i = 0; i < EXPIRED_HISTORY_LEN; i++) { + expired_perc_avg += expired_history[i]; + } + expired_perc_avg = (expired_perc_avg * 100) / (REDIS_EXPIRELOOKUPS_PER_CRON*EXPIRED_HISTORY_LEN); + // printf("Expired AVG: %d\n", expired_perc_avg); + } +#endif + if (timelimit_exit) return; } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4); } } @@ -868,8 +932,12 @@ void clientsCron(void) { void databasesCron(void) { /* Expire keys by random sampling. Not required for slaves * as master will synthesize DELs for us. 
*/ - if (server.active_expire_enabled && server.masterhost == NULL) - activeExpireCycle(); + if (server.active_expire_enabled && server.masterhost == NULL) { + long long totalex = server.stat_expiredkeys; + activeExpireCycle(0); + if (server.stat_expiredkeys - totalex) + printf("EXPIRED SLOW: %lld\n", server.stat_expiredkeys - totalex); + } /* Perform hash tables rehashing if needed, but only if there are no * other processes saving the DB on disk. Otherwise rehashing is bad @@ -941,6 +1009,7 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { * in objects at every object access, and accuracy is not needed. * To access a global var is faster than calling time(NULL) */ server.unixtime = time(NULL); + server.mstime = mstime(); run_with_period(100) trackOperationsPerSecond(); @@ -1110,6 +1179,14 @@ void beforeSleep(struct aeEventLoop *eventLoop) { listNode *ln; redisClient *c; + /* Run a fast expire cycle. */ + { + long long totalex = server.stat_expiredkeys; + activeExpireCycle(1); + if (server.stat_expiredkeys - totalex) + printf("EXPIRED FAST: %lld\n", server.stat_expiredkeys - totalex); + } + /* Try to process pending commands for clients that were just unblocked. 
*/ while (listLength(server.unblocked_clients)) { ln = listFirst(server.unblocked_clients); @@ -1513,6 +1590,7 @@ void initServer() { server.ops_sec_last_sample_time = mstime(); server.ops_sec_last_sample_ops = 0; server.unixtime = time(NULL); + server.mstime = mstime(); server.lastbgsave_status = REDIS_OK; server.repl_good_slaves_count = 0; diff --git a/src/redis.h b/src/redis.h index 57e39ad2b..7f0ab64e3 100644 --- a/src/redis.h +++ b/src/redis.h @@ -74,7 +74,7 @@ #define REDIS_MAXIDLETIME 0 /* default client timeout: infinite */ #define REDIS_DEFAULT_DBNUM 16 #define REDIS_CONFIGLINE_MAX 1024 -#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */ +#define REDIS_EXPIRELOOKUPS_PER_CRON 20 /* lookup 20 expires per loop */ #define REDIS_EXPIRELOOKUPS_TIME_PERC 25 /* CPU max % for keys collection */ #define REDIS_DBCRON_DBS_PER_CALL 16 #define REDIS_MAX_WRITE_PER_EVENT (1024*64) @@ -902,7 +902,8 @@ struct redisServer { size_t set_max_intset_entries; size_t zset_max_ziplist_entries; size_t zset_max_ziplist_value; - time_t unixtime; /* Unix time sampled every second. */ + time_t unixtime; /* Unix time sampled every cron cycle. */ + long long mstime; /* Like 'unixtime' but with milliseconds resolution. */ /* Pubsub */ dict *pubsub_channels; /* Map channels to list of subscribed clients */ list *pubsub_patterns; /* A list of pubsub_patterns */ From f52610280178c353ccecdbe5d5e8d204d33b78d9 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 5 Aug 2013 16:11:56 +0200 Subject: [PATCH 0119/2500] Draft #2 for key collection algo: more improvements. This commit makes the fast collection cycle time configurable, at the same time it does not allow to run a new fast collection cycle for the same amount of time as the max duration of the fast collection cycle. 
--- src/redis.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/redis.c b/src/redis.c index 52f9b1f26..31456f00a 100644 --- a/src/redis.c +++ b/src/redis.c @@ -697,6 +697,7 @@ int activeExpireCycleTryExpire(redisDb *db, struct dictEntry *de, long long now) * more incrementally from the beforeSleep() function of the event loop. */ #define EXPIRED_HISTORY_LEN 10 +#define EXPIRE_FAST_CYCLE_DURATION 1000 void activeExpireCycle(int fast) { /* This function has some global state in order to continue the work @@ -707,6 +708,7 @@ void activeExpireCycle(int fast) { unsigned int j, iteration = 0; unsigned int dbs_per_call = REDIS_DBCRON_DBS_PER_CALL; long long start = ustime(), timelimit; + static long long last_fast_cycle = 0; #if 0 static int expired_history[EXPIRED_HISTORY_LEN]; @@ -714,7 +716,17 @@ void activeExpireCycle(int fast) { static int expired_perc_avg = 0; #endif - if (fast && !timelimit_exit) return; + if (fast) { + /* Don't start a fast cycle if the previous cycle did not exited + * for time limt. Also don't repeat a fast cycle for the same period + * as the fast cycle total duration itself. */ + if (!timelimit_exit) return; + if (start < last_fast_cycle + EXPIRE_FAST_CYCLE_DURATION) { + printf("CANT START A FAST CYCLE\n"); + return; + } + last_fast_cycle = start; + } #if 0 if (fast) { @@ -751,7 +763,7 @@ void activeExpireCycle(int fast) { timelimit_exit = 0; if (timelimit <= 0) timelimit = 1; - if (fast) timelimit = 1000; /* 1 millisecond. */ + if (fast) timelimit = EXPIRE_FAST_CYCLE_DURATION; /* in microseconds. */ for (j = 0; j < dbs_per_call; j++) { int expired; From 8d70078b45458f216a1f1d8ecd901b533ad99fcb Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 6 Aug 2013 12:36:13 +0200 Subject: [PATCH 0120/2500] Remove dead code and fix comments for new expire code. 
--- src/redis.c | 66 ++++++++--------------------------------------------- 1 file changed, 10 insertions(+), 56 deletions(-) diff --git a/src/redis.c b/src/redis.c index 31456f00a..67621bde9 100644 --- a/src/redis.c +++ b/src/redis.c @@ -691,12 +691,14 @@ int activeExpireCycleTryExpire(redisDb *db, struct dictEntry *de, long long now) * No more than REDIS_DBCRON_DBS_PER_CALL databases are tested at every * iteration. * - * If fast is non-zero the function will try to expire just one key ASAP - * from the current DB and return. This kind of call is used when Redis detects - * that timelimit_exit is true, so there is more work to do, and we do it - * more incrementally from the beforeSleep() function of the event loop. */ + * If fast is non-zero the function will try to run a "fast" expire cycle that + * takes no longer than EXPIRE_FAST_CYCLE_DURATION microseconds, and is not + * repeated again before the same amount of time. + * + * This kind of call is used when Redis detects that timelimit_exit is + * true, so there is more work to do, and we do it more incrementally from + * the beforeSleep() function of the event loop. */ -#define EXPIRED_HISTORY_LEN 10 #define EXPIRE_FAST_CYCLE_DURATION 1000 void activeExpireCycle(int fast) { @@ -710,41 +712,15 @@ void activeExpireCycle(int fast) { long long start = ustime(), timelimit; static long long last_fast_cycle = 0; -#if 0 - static int expired_history[EXPIRED_HISTORY_LEN]; - static int expired_history_id = 0; - static int expired_perc_avg = 0; -#endif - if (fast) { /* Don't start a fast cycle if the previous cycle did not exited * for time limt. Also don't repeat a fast cycle for the same period * as the fast cycle total duration itself. 
*/ if (!timelimit_exit) return; - if (start < last_fast_cycle + EXPIRE_FAST_CYCLE_DURATION) { - printf("CANT START A FAST CYCLE\n"); - return; - } + if (start < last_fast_cycle + EXPIRE_FAST_CYCLE_DURATION) return; last_fast_cycle = start; } -#if 0 - if (fast) { - if (!timelimit_exit) return; - - /* Let's try to expire a single key from the previous DB, the one that - * had enough keys expiring to reach the time limit. */ - redisDb *db = server.db+((current_db+server.dbnum-1) % server.dbnum); - dictEntry *de; - - for (j = 0; j < 100; j++) { - if ((de = dictGetRandomKey(db->expires)) == NULL) break; - activeExpireCycleTryExpire(db,de,server.mstime); - } - return; - } -#endif - /* We usually should test REDIS_DBCRON_DBS_PER_CALL per iteration, with * two exceptions: * @@ -811,19 +787,6 @@ void activeExpireCycle(int fast) { { timelimit_exit = 1; } -#if 0 - expired_history_id = (expired_history_id+1) % EXPIRED_HISTORY_LEN; - expired_history[expired_history_id] = expired; - { - int i; - expired_perc_avg = 0; - for (i = 0; i < EXPIRED_HISTORY_LEN; i++) { - expired_perc_avg += expired_history[i]; - } - expired_perc_avg = (expired_perc_avg * 100) / (REDIS_EXPIRELOOKUPS_PER_CRON*EXPIRED_HISTORY_LEN); - // printf("Expired AVG: %d\n", expired_perc_avg); - } -#endif if (timelimit_exit) return; } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4); } @@ -944,12 +907,8 @@ void clientsCron(void) { void databasesCron(void) { /* Expire keys by random sampling. Not required for slaves * as master will synthesize DELs for us. */ - if (server.active_expire_enabled && server.masterhost == NULL) { - long long totalex = server.stat_expiredkeys; + if (server.active_expire_enabled && server.masterhost == NULL) activeExpireCycle(0); - if (server.stat_expiredkeys - totalex) - printf("EXPIRED SLOW: %lld\n", server.stat_expiredkeys - totalex); - } /* Perform hash tables rehashing if needed, but only if there are no * other processes saving the DB on disk. 
Otherwise rehashing is bad @@ -1192,12 +1151,7 @@ void beforeSleep(struct aeEventLoop *eventLoop) { redisClient *c; /* Run a fast expire cycle. */ - { - long long totalex = server.stat_expiredkeys; - activeExpireCycle(1); - if (server.stat_expiredkeys - totalex) - printf("EXPIRED FAST: %lld\n", server.stat_expiredkeys - totalex); - } + activeExpireCycle(1); /* Try to process pending commands for clients that were just unblocked. */ while (listLength(server.unblocked_clients)) { From 1dd670c34bd3b19f3d339ce9c25c0cf946d747aa Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 6 Aug 2013 12:55:49 +0200 Subject: [PATCH 0121/2500] Some activeExpireCycle() refactoring. --- src/redis.c | 45 ++++++++++++++++++++++++++------------------- src/redis.h | 8 ++++++-- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/src/redis.c b/src/redis.c index 67621bde9..d1dde4d1e 100644 --- a/src/redis.c +++ b/src/redis.c @@ -691,33 +691,37 @@ int activeExpireCycleTryExpire(redisDb *db, struct dictEntry *de, long long now) * No more than REDIS_DBCRON_DBS_PER_CALL databases are tested at every * iteration. * - * If fast is non-zero the function will try to run a "fast" expire cycle that - * takes no longer than EXPIRE_FAST_CYCLE_DURATION microseconds, and is not - * repeated again before the same amount of time. - * * This kind of call is used when Redis detects that timelimit_exit is * true, so there is more work to do, and we do it more incrementally from - * the beforeSleep() function of the event loop. */ + * the beforeSleep() function of the event loop. + * + * Expire cycle type: + * + * If type is ACTIVE_EXPIRE_CYCLE_FAST the function will try to run a + * "fast" expire cycle that takes no longer than EXPIRE_FAST_CYCLE_DURATION + * microseconds, and is not repeated again before the same amount of time. 
+ * + * If type is ACTIVE_EXPIRE_CYCLE_SLOW, that normal expire cycle is + * executed, where the time limit is a percentage of the REDIS_HZ period + * as specified by the REDIS_EXPIRELOOKUPS_TIME_PERC define. */ -#define EXPIRE_FAST_CYCLE_DURATION 1000 - -void activeExpireCycle(int fast) { +void activeExpireCycle(int type) { /* This function has some global state in order to continue the work * incrementally across calls. */ static unsigned int current_db = 0; /* Last DB tested. */ static int timelimit_exit = 0; /* Time limit hit in previous call? */ + static long long last_fast_cycle = 0; /* When last fast cycle ran. */ unsigned int j, iteration = 0; unsigned int dbs_per_call = REDIS_DBCRON_DBS_PER_CALL; long long start = ustime(), timelimit; - static long long last_fast_cycle = 0; - if (fast) { + if (type == ACTIVE_EXPIRE_CYCLE_FAST) { /* Don't start a fast cycle if the previous cycle did not exited * for time limt. Also don't repeat a fast cycle for the same period * as the fast cycle total duration itself. */ if (!timelimit_exit) return; - if (start < last_fast_cycle + EXPIRE_FAST_CYCLE_DURATION) return; + if (start < last_fast_cycle + ACTIVE_EXPIRE_CYCLE_FAST_DURATION) return; last_fast_cycle = start; } @@ -731,15 +735,16 @@ void activeExpireCycle(int fast) { if (dbs_per_call > server.dbnum || timelimit_exit) dbs_per_call = server.dbnum; - /* We can use at max REDIS_EXPIRELOOKUPS_TIME_PERC percentage of CPU time + /* We can use at max ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC percentage of CPU time * per iteration. Since this function gets called with a frequency of * server.hz times per second, the following is the max amount of * microseconds we can spend in this function. */ - timelimit = 1000000*REDIS_EXPIRELOOKUPS_TIME_PERC/server.hz/100; + timelimit = 1000000*ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC/server.hz/100; timelimit_exit = 0; if (timelimit <= 0) timelimit = 1; - if (fast) timelimit = EXPIRE_FAST_CYCLE_DURATION; /* in microseconds. 
*/ + if (type == ACTIVE_EXPIRE_CYCLE_FAST) + timelimit = ACTIVE_EXPIRE_CYCLE_FAST_DURATION; /* in microseconds. */ for (j = 0; j < dbs_per_call; j++) { int expired; @@ -770,8 +775,8 @@ void activeExpireCycle(int fast) { /* The main collection cycle. Sample random keys among keys * with an expire set, checking for expired ones. */ expired = 0; - if (num > REDIS_EXPIRELOOKUPS_PER_CRON) - num = REDIS_EXPIRELOOKUPS_PER_CRON; + if (num > ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP) + num = ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP; while (num--) { dictEntry *de; @@ -788,7 +793,9 @@ void activeExpireCycle(int fast) { timelimit_exit = 1; } if (timelimit_exit) return; - } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4); + /* We don't repeat the cycle if there are less than 25% of keys + * found expired in the current DB. */ + } while (expired > ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP/4); } } @@ -908,7 +915,7 @@ void databasesCron(void) { /* Expire keys by random sampling. Not required for slaves * as master will synthesize DELs for us. */ if (server.active_expire_enabled && server.masterhost == NULL) - activeExpireCycle(0); + activeExpireCycle(ACTIVE_EXPIRE_CYCLE_SLOW); /* Perform hash tables rehashing if needed, but only if there are no * other processes saving the DB on disk. Otherwise rehashing is bad @@ -1151,7 +1158,7 @@ void beforeSleep(struct aeEventLoop *eventLoop) { redisClient *c; /* Run a fast expire cycle. */ - activeExpireCycle(1); + activeExpireCycle(ACTIVE_EXPIRE_CYCLE_FAST); /* Try to process pending commands for clients that were just unblocked. 
*/ while (listLength(server.unblocked_clients)) { diff --git a/src/redis.h b/src/redis.h index 7f0ab64e3..9870a3130 100644 --- a/src/redis.h +++ b/src/redis.h @@ -74,8 +74,6 @@ #define REDIS_MAXIDLETIME 0 /* default client timeout: infinite */ #define REDIS_DEFAULT_DBNUM 16 #define REDIS_CONFIGLINE_MAX 1024 -#define REDIS_EXPIRELOOKUPS_PER_CRON 20 /* lookup 20 expires per loop */ -#define REDIS_EXPIRELOOKUPS_TIME_PERC 25 /* CPU max % for keys collection */ #define REDIS_DBCRON_DBS_PER_CALL 16 #define REDIS_MAX_WRITE_PER_EVENT (1024*64) #define REDIS_SHARED_SELECT_CMDS 10 @@ -124,6 +122,12 @@ #define REDIS_PEER_ID_LEN (REDIS_IP_STR_LEN+32) /* Must be enough for ip:port */ #define REDIS_BINDADDR_MAX 16 +#define ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP 20 /* Loopkups per loop. */ +#define ACTIVE_EXPIRE_CYCLE_FAST_DURATION 1000 /* Microseconds */ +#define ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC 25 /* CPU max % for keys collection */ +#define ACTIVE_EXPIRE_CYCLE_SLOW 0 +#define ACTIVE_EXPIRE_CYCLE_FAST 1 + /* Protocol and I/O related defines */ #define REDIS_MAX_QUERYBUF_LEN (1024*1024*1024) /* 1GB max query buffer. */ #define REDIS_IOBUF_LEN (1024*16) /* Generic I/O buffer size */ From 67b4704bc6bcd0ca74e10a539644f42493e12d36 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 6 Aug 2013 12:59:04 +0200 Subject: [PATCH 0122/2500] activeExpireCycle(): fix about fast cycle early start. We don't want to repeat a fast cycle too soon, the previous code was broken, we need to wait two times the period *since* the start of the previous cycle in order to avoid there is an even space between cycles: .-> start .-> second start | | +-------------+-------------+--------------+ | first cycle | pause | second cycle | +-------------+-------------+--------------+ The second and first start must be PERIOD*2 useconds apart hence the *2 in the new code. 
--- src/redis.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/redis.c b/src/redis.c index d1dde4d1e..c1f1d6642 100644 --- a/src/redis.c +++ b/src/redis.c @@ -721,7 +721,7 @@ void activeExpireCycle(int type) { * for time limt. Also don't repeat a fast cycle for the same period * as the fast cycle total duration itself. */ if (!timelimit_exit) return; - if (start < last_fast_cycle + ACTIVE_EXPIRE_CYCLE_FAST_DURATION) return; + if (start < last_fast_cycle + ACTIVE_EXPIRE_CYCLE_FAST_DURATION*2) return; last_fast_cycle = start; } From 623ad22fa83a128b93a29af17abd5e59cb672252 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 6 Aug 2013 15:00:43 +0200 Subject: [PATCH 0123/2500] Add per-db average TTL information in INFO output. Example: db0:keys=221913,expires=221913,avg_ttl=655 The algorithm uses a running average with only two samples (current and previous). Keys found to be expired are considered at TTL zero even if the actual TTL can be negative. The TTL is reported in milliseconds. --- src/redis.c | 33 +++++++++++++++++++++++++++++---- src/redis.h | 1 + 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/src/redis.c b/src/redis.c index c1f1d6642..e2aaed213 100644 --- a/src/redis.c +++ b/src/redis.c @@ -759,10 +759,14 @@ void activeExpireCycle(int type) { * of the keys were expired. */ do { unsigned long num, slots; - long long now; + long long now, ttl_sum; + int ttl_samples; /* If there is nothing to expire try next DB ASAP. */ - if ((num = dictSize(db->expires)) == 0) break; + if ((num = dictSize(db->expires)) == 0) { + db->avg_ttl = 0; + break; + } slots = dictSlots(db->expires); now = mstime(); @@ -775,14 +779,33 @@ void activeExpireCycle(int type) { /* The main collection cycle. Sample random keys among keys * with an expire set, checking for expired ones. 
*/ expired = 0; + ttl_sum = 0; + ttl_samples = 0; + if (num > ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP) num = ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP; + while (num--) { dictEntry *de; + long long ttl; if ((de = dictGetRandomKey(db->expires)) == NULL) break; + ttl = dictGetSignedIntegerVal(de)-now; if (activeExpireCycleTryExpire(db,de,now)) expired++; + if (ttl < 0) ttl = 0; + ttl_sum += ttl; + ttl_samples++; } + + /* Update the average TTL stats for this database. */ + if (ttl_samples) { + long long avg_ttl = ttl_sum/ttl_samples; + + if (db->avg_ttl == 0) db->avg_ttl = avg_ttl; + /* Smooth the value averaging with the previous one. */ + db->avg_ttl = (db->avg_ttl+avg_ttl)/2; + } + /* We can't block forever here even if there are many keys to * expire. So after a given amount of milliseconds return to the * caller waiting for the other active expire cycle. */ @@ -1530,6 +1553,7 @@ void initServer() { server.db[j].ready_keys = dictCreate(&setDictType,NULL); server.db[j].watched_keys = dictCreate(&keylistDictType,NULL); server.db[j].id = j; + server.db[j].avg_ttl = 0; } server.pubsub_channels = dictCreate(&keylistDictType,NULL); server.pubsub_patterns = listCreate(); @@ -2580,8 +2604,9 @@ sds genRedisInfoString(char *section) { keys = dictSize(server.db[j].dict); vkeys = dictSize(server.db[j].expires); if (keys || vkeys) { - info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n", - j, keys, vkeys); + info = sdscatprintf(info, + "db%d:keys=%lld,expires=%lld,avg_ttl=%lld\r\n", + j, keys, vkeys, server.db[j].avg_ttl); } } } diff --git a/src/redis.h b/src/redis.h index 9870a3130..058fa7d32 100644 --- a/src/redis.h +++ b/src/redis.h @@ -400,6 +400,7 @@ typedef struct redisDb { dict *ready_keys; /* Blocked keys that received a PUSH */ dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */ int id; + long long avg_ttl; /* Average TTL, just for stats */ } redisDb; /* Client MULTI/EXEC state */ From 09c76e6800dc934e0c666dc944f05750b2bbf438 Mon Sep 17 00:00:00 2001 From: 
antirez Date: Tue, 6 Aug 2013 18:50:54 +0200 Subject: [PATCH 0124/2500] redis-benchmark: ability to SELECT a specific db number. --- src/redis-benchmark.c | 46 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c index 69c740242..a1c7b78b6 100644 --- a/src/redis-benchmark.c +++ b/src/redis-benchmark.c @@ -73,6 +73,8 @@ static struct config { int csv; int loop; int idlemode; + int dbnum; + sds dbnumstr; char *tests; } config; @@ -85,6 +87,9 @@ typedef struct _client { long long start; /* start time of a request */ long long latency; /* request latency */ int pending; /* Number of pending requests (sent but no reply received) */ + int selectlen; /* If non-zero, a SELECT of 'selectlen' bytes is currently + used as a prefix of the pipline of commands. This gets + discarded the first time it's sent. */ } *client; /* Prototypes */ @@ -199,6 +204,15 @@ static void readHandler(aeEventLoop *el, int fd, void *privdata, int mask) { freeReplyObject(reply); + if (c->selectlen) { + /* This is the OK from SELECT. Just discard the SELECT + * from the buffer. */ + c->pending--; + sdsrange(c->obuf,c->selectlen,-1); + c->selectlen = 0; + continue; + } + if (config.requests_finished < config.requests) config.latency[config.requests_finished++] = c->latency; c->pending--; @@ -269,13 +283,26 @@ static client createClient(char *cmd, size_t len) { } /* Suppress hiredis cleanup of unused buffers for max speed. */ c->context->reader->maxbuf = 0; + /* Queue N requests accordingly to the pipeline size. */ c->obuf = sdsempty(); + if (config.dbnum != 0) { + /* If a DB number different than zero is selected, prefix our request + * buffer with the SELECT command, that will be discarded the first + * time the replies are received, so if the client is reused the + * SELECT command will not be used again.
*/ + c->obuf = sdscatprintf(c->obuf,"*2\r\n$6\r\nSELECT\r\n$%d\r\n%s\r\n", + (int)sdslen(config.dbnumstr),config.dbnumstr); + c->selectlen = sdslen(c->obuf); + } else { + c->selectlen = 0; + } for (j = 0; j < config.pipeline; j++) c->obuf = sdscatlen(c->obuf,cmd,len); c->randlen = 0; c->written = 0; c->pending = config.pipeline; + if (c->selectlen) c->pending++; /* Find substrings in the output buffer that need to be randomized. */ if (config.randomkeys) { @@ -286,8 +313,6 @@ static client createClient(char *cmd, size_t len) { p += 6; } } - -/* redisSetReplyObjectFunctions(c->context,NULL); */ aeCreateFileEvent(config.el,c->context->fd,AE_WRITABLE,writeHandler,c); listAddNodeTail(config.clients,c); config.liveclients++; @@ -296,9 +321,18 @@ static client createClient(char *cmd, size_t len) { static void createMissingClients(client c) { int n = 0; + char *buf = c->obuf; + size_t buflen = sdslen(c->obuf); + + /* If we are cloning from a client with a SELECT prefix, skip it since the + * client will be created with the prefixed SELECT if needed. 
*/ + if (c->selectlen) { + buf += c->selectlen; + buflen -= c->selectlen; + } while(config.liveclients < config.numclients) { - createClient(c->obuf,sdslen(c->obuf)/config.pipeline); + createClient(buf,buflen/config.pipeline); /* Listen backlog is quite limited on most systems */ if (++n > 64) { @@ -421,6 +455,10 @@ int parseOptions(int argc, const char **argv) { config.tests = sdscat(config.tests,(char*)argv[++i]); config.tests = sdscat(config.tests,","); sdstolower(config.tests); + } else if (!strcmp(argv[i],"--dbnum")) { + if (lastarg) goto invalid; + config.dbnum = atoi(argv[++i]); + config.dbnumstr = sdsfromlonglong(config.dbnum); } else if (!strcmp(argv[i],"--help")) { exit_status = 0; goto usage; @@ -447,6 +485,7 @@ usage: " -c Number of parallel connections (default 50)\n" " -n Total number of requests (default 10000)\n" " -d Data size of SET/GET value in bytes (default 2)\n" +" -dbnum SELECT the specified db number (default 0)\n" " -k 1=keep alive 0=reconnect (default 1)\n" " -r Use random keys for SET/GET/INCR, random values for SADD\n" " Using this option the benchmark will get/set keys\n" @@ -535,6 +574,7 @@ int main(int argc, const char **argv) { config.hostport = 6379; config.hostsocket = NULL; config.tests = NULL; + config.dbnum = 0; i = parseOptions(argc,argv); argc -= i; From e053e250c9be861ea503cd1b2f29ca62f5fd4179 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 6 Aug 2013 19:01:54 +0200 Subject: [PATCH 0125/2500] redis-benchmark: fix db selection when :rand: feature is used. --- src/redis-benchmark.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c index a1c7b78b6..f53b333b1 100644 --- a/src/redis-benchmark.c +++ b/src/redis-benchmark.c @@ -205,10 +205,16 @@ static void readHandler(aeEventLoop *el, int fd, void *privdata, int mask) { freeReplyObject(reply); if (c->selectlen) { + int j; + /* This is the OK from SELECT. Just discard the SELECT * from the buffer. 
*/ c->pending--; sdsrange(c->obuf,c->selectlen,-1); + /* We also need to fix the pointers to the strings + * we need to randomize. */ + for (j = 0; j < c->randlen; j++) + c->randptr[j] -= c->selectlen; c->selectlen = 0; continue; } From 22d35c646c3ea6c722af3d41aff5bebdd27d8b4d Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 7 Aug 2013 15:58:51 +0200 Subject: [PATCH 0126/2500] redis-benchmark: max pipeline length hardcoded limit removed. --- src/redis-benchmark.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c index f53b333b1..c27583388 100644 --- a/src/redis-benchmark.c +++ b/src/redis-benchmark.c @@ -47,6 +47,7 @@ #include "zmalloc.h" #define REDIS_NOTUSED(V) ((void) V) +#define RANDPTR_INITIAL_SIZE 8 static struct config { aeEventLoop *el; @@ -81,12 +82,13 @@ static struct config { typedef struct _client { redisContext *context; sds obuf; - char *randptr[32]; /* needed for MSET against 10 keys */ - size_t randlen; - unsigned int written; /* bytes of 'obuf' already written */ - long long start; /* start time of a request */ - long long latency; /* request latency */ - int pending; /* Number of pending requests (sent but no reply received) */ + char **randptr; /* Pointers to :rand: strings inside the command buf */ + size_t randlen; /* Number of pointers in client->randptr */ + size_t randfree; /* Number of unused pointers in client->randptr */ + unsigned int written; /* Bytes of 'obuf' already written */ + long long start; /* Start time of a request */ + long long latency; /* Request latency */ + int pending; /* Number of pending requests (replies to consume) */ int selectlen; /* If non-zero, a SELECT of 'selectlen' bytes is currently used as a prefix of the pipline of commands. This gets discarded the first time it's sent. 
*/ @@ -306,6 +308,8 @@ static client createClient(char *cmd, size_t len) { for (j = 0; j < config.pipeline; j++) c->obuf = sdscatlen(c->obuf,cmd,len); c->randlen = 0; + c->randfree = RANDPTR_INITIAL_SIZE; + c->randptr = zmalloc(sizeof(char*)*c->randfree); c->written = 0; c->pending = config.pipeline; if (c->selectlen) c->pending++; @@ -314,8 +318,12 @@ static client createClient(char *cmd, size_t len) { if (config.randomkeys) { char *p = c->obuf; while ((p = strstr(p,":rand:")) != NULL) { - assert(c->randlen < (signed)(sizeof(c->randptr)/sizeof(char*))); + if (c->randfree == 0) { + c->randptr = zrealloc(c->randptr,sizeof(char*)*c->randlen*2); + c->randfree += c->randlen; + } c->randptr[c->randlen++] = p+6; + c->randfree--; p += 6; } } From 7ab3495410e9b8794dd621152ab56cb0654a0e17 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 7 Aug 2013 15:59:59 +0200 Subject: [PATCH 0127/2500] redis-benchmark: fix memory leak introduced by 22d35c6 --- src/redis-benchmark.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c index c27583388..dcd77b58e 100644 --- a/src/redis-benchmark.c +++ b/src/redis-benchmark.c @@ -125,6 +125,7 @@ static void freeClient(client c) { aeDeleteFileEvent(config.el,c->context->fd,AE_READABLE); redisFree(c->context); sdsfree(c->obuf); + zfree(c->randptr); zfree(c); config.liveclients--; ln = listSearchKey(config.clients,c); From 05b9bfc79b87f4dbc39c448cb38a8be4b71f43cf Mon Sep 17 00:00:00 2001 From: Jan-Erik Rediger Date: Wed, 7 Aug 2013 16:05:09 +0200 Subject: [PATCH 0128/2500] Little typo --- tests/test_helper.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 930eba4ee..1c3049d32 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -465,7 +465,7 @@ proc close_replication_stream {s} { # With the parallel test running multiple Redis instances at the same time # we need a fast enough computer, otherwise a lot of tests may 
generate # false positives. -# If the computer is too slow we revert the sequetial test without any +# If the computer is too slow we revert the sequential test without any # parallelism, that is, clients == 1. proc is_a_slow_computer {} { set start [clock milliseconds] From 40693737f0415fe26a3c35d7dadb04cffc007cdc Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 8 Aug 2013 14:31:54 +0200 Subject: [PATCH 0129/2500] redis-benchmark: replace snprintf()+memcpy with faster code. This change was profiler-driven, but the actual effect is hard to measure in real-world redis benchmark runs. --- src/redis-benchmark.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c index dcd77b58e..ba885d978 100644 --- a/src/redis-benchmark.c +++ b/src/redis-benchmark.c @@ -152,13 +152,18 @@ static void resetClient(client c) { } static void randomizeClientKey(client c) { - char buf[32]; - size_t i, r; + size_t i; for (i = 0; i < c->randlen; i++) { - r = random() % config.randomkeys_keyspacelen; - snprintf(buf,sizeof(buf),"%012zu",r); - memcpy(c->randptr[i],buf,12); + char *p = c->randptr[i]+11; + size_t r = random() % config.randomkeys_keyspacelen; + size_t j; + + for (j = 0; j < 12; j++) { + *p = '0'+r%10; + r/=10; + p--; + } } } From 2134281040654056db326ea54f991fa7b5ddc849 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 8 Aug 2013 16:42:08 +0200 Subject: [PATCH 0130/2500] redis-benchmark: changes to random arguments substitution. Before this commit redis-benchmark supported random arguments in the form of :rand:000000000000. In every string of that form, the zeros were replaced with a random number of 12 digits at every command invocation. However this was far from perfect as it did not allow generating plain random numbers as arguments: there was always the :rand: prefix. Now instead every argument in the form __rand_int__ is replaced with a 12-digit number.
Note that "__rand_int__" is 12 characters itself. In order to implement the new semantics, it was necessary to change a few things in the internals of redis-benchmark, as new clients are created cloning old clients, so without a stable prefix such as ":rand:" the old way of cloning the client was no longer able to understand, from the old command line, what was the position of the random strings to substitute. Now instead a client structure is passed as a reference for cloning, so that we can directly clone the offsets inside the command line. --- src/redis-benchmark.c | 108 +++++++++++++++++++++++++++++++----------- 1 file changed, 80 insertions(+), 28 deletions(-) diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c index ba885d978..59b906ed7 100644 --- a/src/redis-benchmark.c +++ b/src/redis-benchmark.c @@ -278,7 +278,29 @@ static void writeHandler(aeEventLoop *el, int fd, void *privdata, int mask) { } } -static client createClient(char *cmd, size_t len) { +/* Create a benchmark client, configured to send the command passed as 'cmd' of + * 'len' bytes. + * + * The command is copied N times in the client output buffer (that is reused + * again and again to send the request to the server) accordingly to the configured + * pipeline size. + * + * Also an initial SELECT command is prepended in order to make sure the right + * database is selected, if needed. The initial SELECT will be discarded as soon + * as the first reply is received. + * + * To create a client from scratch, the 'from' pointer is set to NULL. If instead + * we want to create a client using another client as reference, the 'from' pointer + * points to the client to use as reference. In such a case the following + * information is take from the 'from' client: + * + * 1) The command line to use. + * 2) The offsets of the __rand_int__ elements inside the command line, used + * for arguments randomization.
+ * + * Even when cloning another client, the SELECT command is automatically prefixed + * if needed. */ +static client createClient(char *cmd, size_t len, client from) { int j; client c = zmalloc(sizeof(struct _client)); @@ -298,39 +320,65 @@ static client createClient(char *cmd, size_t len) { /* Suppress hiredis cleanup of unused buffers for max speed. */ c->context->reader->maxbuf = 0; - /* Queue N requests accordingly to the pipeline size. */ + /* Build the request buffer: + * Queue N requests accordingly to the pipeline size, or simply clone + * the example client buffer. */ c->obuf = sdsempty(); + + /* If a DB number different than zero is selected, prefix our request + * buffer with the SELECT command, that will be discarded the first + * time the replies are received, so if the client is reused the + * SELECT command will not be used again. */ if (config.dbnum != 0) { - /* If a DB number different than zero is selected, prefix our request - * buffer with the SELECT command, that will be discarded the first - * time the replies are received, so if the client is reused the - * SELECT command will not be used again. */ c->obuf = sdscatprintf(c->obuf,"*2\r\n$6\r\nSELECT\r\n$%d\r\n%s\r\n", (int)sdslen(config.dbnumstr),config.dbnumstr); c->selectlen = sdslen(c->obuf); } else { c->selectlen = 0; } - for (j = 0; j < config.pipeline; j++) - c->obuf = sdscatlen(c->obuf,cmd,len); - c->randlen = 0; - c->randfree = RANDPTR_INITIAL_SIZE; - c->randptr = zmalloc(sizeof(char*)*c->randfree); + + /* Append the request itself. */ + if (from) { + c->obuf = sdscatlen(c->obuf, + from->obuf+from->selectlen, + sdslen(from->obuf)-from->selectlen); + } else { + for (j = 0; j < config.pipeline; j++) + c->obuf = sdscatlen(c->obuf,cmd,len); + } c->written = 0; c->pending = config.pipeline; + c->randptr = NULL; + c->randlen = 0; if (c->selectlen) c->pending++; /* Find substrings in the output buffer that need to be randomized. 
*/ if (config.randomkeys) { - char *p = c->obuf; - while ((p = strstr(p,":rand:")) != NULL) { - if (c->randfree == 0) { - c->randptr = zrealloc(c->randptr,sizeof(char*)*c->randlen*2); - c->randfree += c->randlen; + if (from) { + c->randlen = from->randlen; + c->randfree = 0; + c->randptr = zmalloc(sizeof(char*)*c->randlen); + /* copy the offsets. */ + for (j = 0; j < c->randlen; j++) { + c->randptr[j] = c->obuf + (from->randptr[j]-from->obuf); + /* Adjust for the different select prefix length. */ + c->randptr[j] += c->selectlen - from->selectlen; + } + } else { + char *p = c->obuf; + + c->randlen = 0; + c->randfree = RANDPTR_INITIAL_SIZE; + c->randptr = zmalloc(sizeof(char*)*c->randfree); + while ((p = strstr(p,"__rand_int__")) != NULL) { + if (c->randfree == 0) { + c->randptr = zrealloc(c->randptr,sizeof(char*)*c->randlen*2); + c->randfree += c->randlen; + } + c->randptr[c->randlen++] = p; + c->randfree--; + p += 12; /* 12 is strlen("__rand_int__). */ } - c->randptr[c->randlen++] = p+6; - c->randfree--; - p += 6; } } aeCreateFileEvent(config.el,c->context->fd,AE_WRITABLE,writeHandler,c); @@ -352,7 +400,7 @@ static void createMissingClients(client c) { } while(config.liveclients < config.numclients) { - createClient(buf,buflen/config.pipeline); + createClient(NULL,0,c); /* Listen backlog is quite limited on most systems */ if (++n > 64) { @@ -403,7 +451,7 @@ static void benchmark(char *title, char *cmd, int len) { config.requests_issued = 0; config.requests_finished = 0; - c = createClient(cmd,len); + c = createClient(cmd,len,NULL); createMissingClients(c); config.start = mstime(); @@ -530,8 +578,12 @@ usage: " $ redis-benchmark -t set -n 1000000 -r 100000000\n\n" " Benchmark 127.0.0.1:6379 for a few commands producing CSV output:\n" " $ redis-benchmark -t ping,set,get -n 100000 --csv\n\n" +" Benchmark a specific command line:\n" +" $ redis-benchmark -r 10000 -n 10000 eval 'return redis.call(\"ping\")' 0\n\n" " Fill a list with 10000 random elements:\n" -" $ 
redis-benchmark -r 10000 -n 10000 lpush mylist ele:rand:000000000000\n\n" +" $ redis-benchmark -r 10000 -n 10000 lpush mylist __rand_int__\n\n" +" On user specified command lines __rand_int__ is replaced with a random integer\n" +" with a range of values selected by the -r option.\n" ); exit(exit_status); } @@ -608,7 +660,7 @@ int main(int argc, const char **argv) { if (config.idlemode) { printf("Creating %d idle connections and waiting forever (Ctrl+C when done)\n", config.numclients); - c = createClient("",0); /* will never receive a reply */ + c = createClient("",0,NULL); /* will never receive a reply */ createMissingClients(c); aeMain(config.el); /* and will wait for every */ @@ -647,19 +699,19 @@ int main(int argc, const char **argv) { } if (test_is_selected("set")) { - len = redisFormatCommand(&cmd,"SET foo:rand:000000000000 %s",data); + len = redisFormatCommand(&cmd,"SET key:__rand_int__ %s",data); benchmark("SET",cmd,len); free(cmd); } if (test_is_selected("get")) { - len = redisFormatCommand(&cmd,"GET foo:rand:000000000000"); + len = redisFormatCommand(&cmd,"GET key:__rand_int__"); benchmark("GET",cmd,len); free(cmd); } if (test_is_selected("incr")) { - len = redisFormatCommand(&cmd,"INCR counter:rand:000000000000"); + len = redisFormatCommand(&cmd,"INCR counter:__rand_int__"); benchmark("INCR",cmd,len); free(cmd); } @@ -678,7 +730,7 @@ int main(int argc, const char **argv) { if (test_is_selected("sadd")) { len = redisFormatCommand(&cmd, - "SADD myset counter:rand:000000000000"); + "SADD myset element:__rand_int__"); benchmark("SADD",cmd,len); free(cmd); } @@ -728,7 +780,7 @@ int main(int argc, const char **argv) { const char *argv[21]; argv[0] = "MSET"; for (i = 1; i < 21; i += 2) { - argv[i] = "foo:rand:000000000000"; + argv[i] = "key:__rand_int__"; argv[i+1] = data; } len = redisFormatCommandArgv(&cmd,21,argv,NULL); From 4b8b7cb964a0eae280a142ad5726c977fa88a45e Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 12 Aug 2013 10:29:14 +0200 Subject: [PATCH 
0131/2500] Replication: better way to send a preamble before RDB payload. During the replication full resynchronization process, the RDB file is transferred from the master to the slave. However there is a short preamble to send, that is currently just the bulk payload length of the file in the usual Redis form $..length... This preamble used to be sent with a direct write call, assuming that there was always room in the socket output buffer to hold the few bytes needed, however this does not scale in case we'll need to send more stuff, and is not very robust code in general. This commit introduces a more general mechanism to send a preamble up to 2GB in size (the max length of an sds string) in a non-blocking way. --- src/networking.c | 6 ++++-- src/redis.h | 1 + src/replication.c | 34 +++++++++++++++++++++------------- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/src/networking.c b/src/networking.c index c131c9c67..d128646f1 100644 --- a/src/networking.c +++ b/src/networking.c @@ -701,8 +701,10 @@ void freeClient(redisClient *c) { /* Master/slave cleanup. * Case 1: we lost the connection with a slave. */ if (c->flags & REDIS_SLAVE) { - if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1) - close(c->repldbfd); + if (c->replstate == REDIS_REPL_SEND_BULK) { + if (c->repldbfd != -1) close(c->repldbfd); + if (c->replpreamble) sdsfree(c->replpreamble); + } list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves; ln = listSearchKey(l,c); redisAssert(ln != NULL); diff --git a/src/redis.h b/src/redis.h index 058fa7d32..b5241b3fd 100644 --- a/src/redis.h +++ b/src/redis.h @@ -470,6 +470,7 @@ typedef struct redisClient { int repldbfd; /* replication DB file descriptor */ long repldboff; /* replication DB file offset */ off_t repldbsize; /* replication DB file size */ + sds replpreamble; /* replication DB preamble.
*/ long long reploff; /* replication offset if this is our master */ long long repl_ack_off; /* replication ack offset, if this is a slave */ long long repl_ack_time;/* replication ack time, if this is a slave */ diff --git a/src/replication.c b/src/replication.c index 2d7ee809d..d9e26039c 100644 --- a/src/replication.c +++ b/src/replication.c @@ -628,23 +628,28 @@ void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) { char buf[REDIS_IOBUF_LEN]; ssize_t nwritten, buflen; - if (slave->repldboff == 0) { - /* Write the bulk write count before to transfer the DB. In theory here - * we don't know how much room there is in the output buffer of the - * socket, but in practice SO_SNDLOWAT (the minimum count for output - * operations) will never be smaller than the few bytes we need. */ - sds bulkcount; - - bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long) - slave->repldbsize); - if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount)) - { - sdsfree(bulkcount); + /* Before sending the RDB file, we send the preamble as configured by the + * replication process. Currently the preamble is just the bulk count of + * the file in the form "$\r\n". */ + if (slave->replpreamble) { + nwritten = write(fd,slave->replpreamble,sdslen(slave->replpreamble)); + if (nwritten == -1) { + redisLog(REDIS_VERBOSE,"Write error sending RDB preamble to slave: %s", + strerror(errno)); freeClient(slave); return; } - sdsfree(bulkcount); + sdsrange(slave->replpreamble,nwritten,-1); + if (sdslen(slave->replpreamble) == 0) { + sdsfree(slave->replpreamble); + slave->replpreamble = NULL; + /* fall through sending data. */ + } else { + return; + } } + + /* If the preamble was already transfered, send the RDB bulk data. 
*/ lseek(slave->repldbfd,slave->repldboff,SEEK_SET); buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN); if (buflen <= 0) { @@ -711,6 +716,9 @@ void updateSlavesWaitingBgsave(int bgsaveerr) { slave->repldboff = 0; slave->repldbsize = buf.st_size; slave->replstate = REDIS_REPL_SEND_BULK; + slave->replpreamble = sdscatprintf(sdsempty(),"$%lld\r\n", + (unsigned long long) slave->repldbsize); + aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE); if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) { freeClient(slave); From 09662d780ee181744572479f3e861f4fe97a6078 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 12 Aug 2013 11:38:21 +0200 Subject: [PATCH 0132/2500] Fix sdsempty() prototype in sds.h. --- deps/hiredis/sds.h | 2 +- src/sds.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deps/hiredis/sds.h b/deps/hiredis/sds.h index 6f3201130..615c751cd 100644 --- a/deps/hiredis/sds.h +++ b/deps/hiredis/sds.h @@ -56,7 +56,7 @@ static inline size_t sdsavail(const sds s) { sds sdsnewlen(const void *init, size_t initlen); sds sdsnew(const char *init); -sds sdsempty(); +sds sdsempty(void); size_t sdslen(const sds s); sds sdsdup(const sds s); void sdsfree(sds s); diff --git a/src/sds.h b/src/sds.h index 6f3201130..615c751cd 100644 --- a/src/sds.h +++ b/src/sds.h @@ -56,7 +56,7 @@ static inline size_t sdsavail(const sds s) { sds sdsnewlen(const void *init, size_t initlen); sds sdsnew(const char *init); -sds sdsempty(); +sds sdsempty(void); size_t sdslen(const sds s); sds sdsdup(const sds s); void sdsfree(sds s); From 21cde6ecb73839b98bebddeef1df9320faf66e38 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 12 Aug 2013 11:50:54 +0200 Subject: [PATCH 0133/2500] Fix a PSYNC bug caused by a variable name typo. 
--- src/replication.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/replication.c b/src/replication.c index d9e26039c..487d3d4f9 100644 --- a/src/replication.c +++ b/src/replication.c @@ -255,7 +255,7 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { aux[len+1] = '\r'; aux[len+2] = '\n'; feedReplicationBacklog(aux,len+3); - feedReplicationBacklogWithObject(argv[j]); + feedReplicationBacklogWithObject(argv[i]); feedReplicationBacklogWithObject(shared.crlf); } } From 6268dbdd949051a9f0d22d8d313f189ff5b522d4 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 12 Aug 2013 12:10:38 +0200 Subject: [PATCH 0134/2500] replicationFeedSlave() reworked for correctness and speed. The previous code using a static buffer as an optimization was lame: 1) Premature optimization, actually it was *slower* than naive code because it resulted in the creation / destruction of the object encapsulating the output buffer. 2) The code was very hard to test, since specific tests were needed for command lines exceeding the size of the static buffer. 3) As a result of "2" the code was buggy, as the current tests were not able to stress specific corner cases. It was replaced with easy to understand code that is safer and faster. 
--- src/replication.c | 140 ++++++++++++++-------------------------------- 1 file changed, 42 insertions(+), 98 deletions(-) diff --git a/src/replication.c b/src/replication.c index 487d3d4f9..e0f9646c3 100644 --- a/src/replication.c +++ b/src/replication.c @@ -138,15 +138,11 @@ void feedReplicationBacklogWithObject(robj *o) { feedReplicationBacklog(p,len); } -#define FEEDSLAVE_BUF_SIZE (1024*64) void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { listNode *ln; listIter li; - int j, i, len; - char buf[FEEDSLAVE_BUF_SIZE], *b = buf; + int j, len; char llstr[REDIS_LONGSTR_SIZE]; - int buf_left = FEEDSLAVE_BUF_SIZE; - robj *o; /* If there aren't slaves, and there is no backlog buffer to populate, * we can return ASAP. */ @@ -155,117 +151,66 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { /* We can't have slaves attached and no backlog. */ redisAssert(!(listLength(slaves) != 0 && server.repl_backlog == NULL)); - /* What we do here is to try to write as much data as possible in a static - * buffer "buf" that is used to create an object that is later sent to all - * the slaves. This way we do the decoding only one time for most commands - * not containing big payloads. */ - - /* Create the SELECT command into the static buffer if needed. */ + /* Send SELECT command to every slave if needed. */ if (server.slaveseldb != dictid) { - char *selectcmd; - size_t sclen; + robj *selectcmd; + /* For a few DBs we have pre-computed SELECT command. 
*/ if (dictid >= 0 && dictid < REDIS_SHARED_SELECT_CMDS) { - selectcmd = shared.select[dictid]->ptr; - sclen = sdslen(selectcmd); - memcpy(b,selectcmd,sclen); - b += sclen; - buf_left -= sclen; + selectcmd = shared.select[dictid]; } else { int dictid_len; dictid_len = ll2string(llstr,sizeof(llstr),dictid); - sclen = snprintf(b,buf_left,"*2\r\n$6\r\nSELECT\r\n$%d\r\n%s\r\n", - dictid_len, llstr); - b += sclen; - buf_left -= sclen; + selectcmd = createObject(REDIS_STRING, + sdscatprintf(sdsempty(), + "*2\r\n$6\r\nSELECT\r\n$%d\r\n%s\r\n", + dictid_len, llstr)); } + + /* Add the SELECT command into the backlog. */ + if (server.repl_backlog) feedReplicationBacklogWithObject(selectcmd); + + /* Send it to slaves. */ + listRewind(slaves,&li); + while((ln = listNext(&li))) { + redisClient *slave = ln->value; + addReply(slave,selectcmd); + } + + if (dictid < 0 || dictid >= REDIS_SHARED_SELECT_CMDS) + decrRefCount(selectcmd); } server.slaveseldb = dictid; - - /* Add the multi bulk reply size to the static buffer, that is, the number - * of arguments of the command to send to every slave. */ - b[0] = '*'; - len = ll2string(b+1,REDIS_LONGSTR_SIZE,argc); - b += len+1; - buf_left -= len+1; - b[0] = '\r'; - b[1] = '\n'; - b += 2; - buf_left -= 2; - /* Try to use the static buffer for as much arguments is possible. */ - for (j = 0; j < argc; j++) { - int objlen; - char *objptr; - - if (argv[j]->encoding != REDIS_ENCODING_RAW && - argv[j]->encoding != REDIS_ENCODING_INT && - argv[j]->encoding != REDIS_ENCODING_EMBSTR) { - redisPanic("Unexpected encoding"); - } - if (sdsEncodedObject(argv[j])) { - objlen = sdslen(argv[j]->ptr); - objptr = argv[j]->ptr; - } else { - objlen = ll2string(llstr,REDIS_LONGSTR_SIZE,(long)argv[j]->ptr); - objptr = llstr; - } - /* We need enough space for bulk reply encoding, newlines, and - * the data itself. 
*/ - if (buf_left < objlen+REDIS_LONGSTR_SIZE+32) break; - - /* Write $...CRLF */ - b[0] = '$'; - len = ll2string(b+1,REDIS_LONGSTR_SIZE,objlen); - b += len+1; - buf_left -= len+1; - b[0] = '\r'; - b[1] = '\n'; - b += 2; - buf_left -= 2; - - /* And data plus CRLF */ - memcpy(b,objptr,objlen); - b += objlen; - buf_left -= objlen; - b[0] = '\r'; - b[1] = '\n'; - b += 2; - buf_left -= 2; - } - - /* Create an object with the static buffer content. */ - redisAssert(buf_left < FEEDSLAVE_BUF_SIZE); - o = createStringObject(buf,b-buf); - - /* If we have a backlog, populate it with data and increment - * the global replication offset. */ + /* Write the command to the replication backlog if any. */ if (server.repl_backlog) { - feedReplicationBacklogWithObject(o); - for (i = j; i < argc; i++) { - char aux[REDIS_LONGSTR_SIZE+3]; - long objlen = stringObjectLen(argv[i]); + char aux[REDIS_LONGSTR_SIZE+3]; + + /* Add the multi bulk reply length. */ + aux[0] = '*'; + len = ll2string(aux+1,sizeof(aux-1),argc); + aux[len+1] = '\r'; + aux[len+2] = '\n'; + feedReplicationBacklog(aux,len+3); + + for (j = 0; j < argc; j++) { + long objlen = stringObjectLen(argv[j]); /* We need to feed the buffer with the object as a bulk reply * not just as a plain string, so create the $..CRLF payload len * ad add the final CRLF */ aux[0] = '$'; - len = ll2string(aux+1,objlen,sizeof(aux)-1); + len = ll2string(aux+1,sizeof(aux)-1,objlen); aux[len+1] = '\r'; aux[len+2] = '\n'; feedReplicationBacklog(aux,len+3); - feedReplicationBacklogWithObject(argv[i]); - feedReplicationBacklogWithObject(shared.crlf); + feedReplicationBacklogWithObject(argv[j]); + feedReplicationBacklogWithObject(aux+len+1,2); } } - /* Write data to slaves. Here we do two things: - * 1) We write the "o" object that was created using the accumulated - * static buffer. - * 2) We write any additional argument of the command to replicate that - * was not written inside the static buffer for lack of space. 
- */ + /* Write the command to every slave. */ listRewind(slaves,&li); while((ln = listNext(&li))) { redisClient *slave = ln->value; @@ -277,15 +222,14 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { * are queued in the output buffer until the initial SYNC completes), * or are already in sync with the master. */ - /* First, trasmit the object created from the static buffer. */ - addReply(slave,o); + /* Add the multi bulk length. */ + addReplyMultiBulkLen(slave,argc); /* Finally any additional argument that was not stored inside the * static buffer if any (from j to argc). */ - for (i = j; i < argc; i++) - addReplyBulk(slave,argv[i]); + for (j = 0; j < argc; j++) + addReplyBulk(slave,argv[j]); } - decrRefCount(o); } void replicationFeedMonitors(redisClient *c, list *monitors, int dictid, robj **argv, int argc) { From a33c9fb250c16dad99f56b64d4f87f5607d543b6 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 12 Aug 2013 12:38:52 +0200 Subject: [PATCH 0135/2500] replicationFeedSlaves() func name typo: feedReplicationBacklogWithObject -> feedReplicationBacklog. --- src/replication.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/replication.c b/src/replication.c index e0f9646c3..27187476b 100644 --- a/src/replication.c +++ b/src/replication.c @@ -206,7 +206,7 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { aux[len+2] = '\n'; feedReplicationBacklog(aux,len+3); feedReplicationBacklogWithObject(argv[j]); - feedReplicationBacklogWithObject(aux+len+1,2); + feedReplicationBacklog(aux+len+1,2); } } From 88f51adf22f3b513be2696eba3996069723de3db Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 12 Aug 2013 12:43:26 +0200 Subject: [PATCH 0136/2500] Use precomptued objects for bulk and mbulk prefixes. 
--- src/networking.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/networking.c b/src/networking.c index d128646f1..be78a19f2 100644 --- a/src/networking.c +++ b/src/networking.c @@ -474,7 +474,10 @@ void addReplyLongLong(redisClient *c, long long ll) { } void addReplyMultiBulkLen(redisClient *c, long length) { - addReplyLongLongWithPrefix(c,length,'*'); + if (length < REDIS_SHARED_BULKHDR_LEN) + addReply(c,shared.mbulkhdr[length]); + else + addReplyLongLongWithPrefix(c,length,'*'); } /* Create the length prefix of a bulk reply, example: $2234 */ @@ -496,7 +499,11 @@ void addReplyBulkLen(redisClient *c, robj *obj) { len++; } } - addReplyLongLongWithPrefix(c,len,'$'); + + if (len < REDIS_SHARED_BULKHDR_LEN) + addReply(c,shared.bulkhdr[len]); + else + addReplyLongLongWithPrefix(c,len,'$'); } /* Add a Redis Object as a bulk reply */ From bfaadb0df27107f1c4096977bdabbd2d98bac83b Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 16 Aug 2013 14:08:04 +0200 Subject: [PATCH 0137/2500] dict.c iterator API misuse protection. dict.c allows the user to create unsafe iterators, that are iterators that will not touch the dictionary data structure in any way, preventing copy on write, but at the same time are limited in their usage. The limitation is that when iterating with an unsafe iterator, no call to other dictionary functions must be done inside the iteration loop, otherwise the dictionary may be incrementally rehashed resulting in missing elements in the set of the elements returned by the iterator. However after introducing this kind of iterator a number of bugs were found due to misuses of the API, and we are still finding bugs about this issue. The bugs are not trivial to track because the effect is just missing elements during the iteration. This commit introduces auto-detection of the API misuse. 
The idea is that an unsafe iterator has a contract: from initialization to the release of the iterator the dictionary should not change. So we take a fingerprint of the dictionary state, xoring a few important dict properties when the unsafe iterator is initialized. We later check when the iterator is released if the fingerprint is still the same. If it is not, we found a misuse of the iterator, as disallowed API calls changed the internal state of the dictionary. This code was checked against a real bug, issue #1240. This is what Redis prints (aborting) when a misuse is detected: Assertion failed: (iter->fingerprint == dictFingerprint(iter->d)), function dictReleaseIterator, file dict.c, line 587. --- src/dict.c | 34 ++++++++++++++++++++++++++++++---- src/dict.h | 1 + 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/src/dict.c b/src/dict.c index 2346f5bea..26d0b1ff3 100644 --- a/src/dict.c +++ b/src/dict.c @@ -505,6 +505,24 @@ void *dictFetchValue(dict *d, const void *key) { return he ? dictGetVal(he) : NULL; } +/* A fingerprint is a 64 bit number that represents the state of the dictionary + * at a given time, it's just a few dict properties xored together. + * When an unsafe iterator is initialized, we get the dict fingerprint, and check + * the fingerprint again when the iterator is released. + * If the two fingerprints are different it means that the user of the iterator + * performed forbidden operations against the dictionary while iterating. 
*/ +long long dictFingerprint(dict *d) { + long long fingerprint = 0; + + fingerprint ^= (long long) d->ht[0].table; + fingerprint ^= (long long) d->ht[0].size; + fingerprint ^= (long long) d->ht[0].used; + fingerprint ^= (long long) d->ht[1].table; + fingerprint ^= (long long) d->ht[1].size; + fingerprint ^= (long long) d->ht[1].used; + return fingerprint; +} + dictIterator *dictGetIterator(dict *d) { dictIterator *iter = zmalloc(sizeof(*iter)); @@ -530,8 +548,12 @@ dictEntry *dictNext(dictIterator *iter) while (1) { if (iter->entry == NULL) { dictht *ht = &iter->d->ht[iter->table]; - if (iter->safe && iter->index == -1 && iter->table == 0) - iter->d->iterators++; + if (iter->index == -1 && iter->table == 0) { + if (iter->safe) + iter->d->iterators++; + else + iter->fingerprint = dictFingerprint(iter->d); + } iter->index++; if (iter->index >= (signed) ht->size) { if (dictIsRehashing(iter->d) && iter->table == 0) { @@ -558,8 +580,12 @@ dictEntry *dictNext(dictIterator *iter) void dictReleaseIterator(dictIterator *iter) { - if (iter->safe && !(iter->index == -1 && iter->table == 0)) - iter->d->iterators--; + if (!(iter->index == -1 && iter->table == 0)) { + if (iter->safe) + iter->d->iterators--; + else + assert(iter->fingerprint == dictFingerprint(iter->d)); + } zfree(iter); } diff --git a/src/dict.h b/src/dict.h index 3a311f171..4d750ae85 100644 --- a/src/dict.h +++ b/src/dict.h @@ -88,6 +88,7 @@ typedef struct dictIterator { dict *d; int table, index, safe; dictEntry *entry, *nextEntry; + long long fingerprint; /* unsafe iterator fingerprint for misuse detection */ } dictIterator; /* This is the initial size of every hash table */ From 5173de05250301cca9ff54076409d19510e7c701 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 16 Aug 2013 15:26:44 +0200 Subject: [PATCH 0138/2500] Properly init/release iterators in zunionInterGenericCommand(). 
This commit does mainly two things: 1) It fixes zunionInterGenericCommand() by removing mass-initialization of all the iterators used, so that we don't violate the unsafe iterator API of dictionaries. This fixes issue #1240. 2) Since the zui* APIs required the allocator to be initialized in the zsetopsrc structure in order to use non-iterator related APIs, this commit fixes this strict requirement by accessing objects directly via the op->subject->ptr pointer we have to the object. --- src/t_zset.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/src/t_zset.c b/src/t_zset.c index 291a7eacf..20ff84b2e 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -1270,20 +1270,20 @@ int zuiLength(zsetopsrc *op) { return 0; if (op->type == REDIS_SET) { - iterset *it = &op->iter.set; if (op->encoding == REDIS_ENCODING_INTSET) { - return intsetLen(it->is.is); + return intsetLen(op->subject->ptr); } else if (op->encoding == REDIS_ENCODING_HT) { - return dictSize(it->ht.dict); + dict *ht = op->subject->ptr; + return dictSize(ht); } else { redisPanic("Unknown set encoding"); } } else if (op->type == REDIS_ZSET) { - iterzset *it = &op->iter.zset; if (op->encoding == REDIS_ENCODING_ZIPLIST) { - return zzlLength(it->zl.zl); + return zzlLength(op->subject->ptr); } else if (op->encoding == REDIS_ENCODING_SKIPLIST) { - return it->sl.zs->zsl->length; + zset *zs = op->subject->ptr; + return zs->zsl->length; } else { redisPanic("Unknown sorted set encoding"); } @@ -1419,18 +1419,19 @@ int zuiFind(zsetopsrc *op, zsetopval *val, double *score) { return 0; if (op->type == REDIS_SET) { - iterset *it = &op->iter.set; - if (op->encoding == REDIS_ENCODING_INTSET) { - if (zuiLongLongFromValue(val) && intsetFind(it->is.is,val->ell)) { + if (zuiLongLongFromValue(val) && + intsetFind(op->subject->ptr,val->ell)) + { *score = 1.0; return 1; } else { return 0; } } else if (op->encoding == REDIS_ENCODING_HT) { + dict *ht = op->subject->ptr; 
zuiObjectFromValue(val); - if (dictFind(it->ht.dict,val->ele) != NULL) { + if (dictFind(ht,val->ele) != NULL) { *score = 1.0; return 1; } else { @@ -1440,19 +1441,19 @@ int zuiFind(zsetopsrc *op, zsetopval *val, double *score) { redisPanic("Unknown set encoding"); } } else if (op->type == REDIS_ZSET) { - iterzset *it = &op->iter.zset; zuiObjectFromValue(val); if (op->encoding == REDIS_ENCODING_ZIPLIST) { - if (zzlFind(it->zl.zl,val->ele,score) != NULL) { + if (zzlFind(op->subject->ptr,val->ele,score) != NULL) { /* Score is already set by zzlFind. */ return 1; } else { return 0; } } else if (op->encoding == REDIS_ENCODING_SKIPLIST) { + zset *zs = op->subject->ptr; dictEntry *de; - if ((de = dictFind(it->sl.zs->dict,val->ele)) != NULL) { + if ((de = dictFind(zs->dict,val->ele)) != NULL) { *score = *(double*)dictGetVal(de); return 1; } else { @@ -1580,9 +1581,6 @@ void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) { } } - for (i = 0; i < setnum; i++) - zuiInitIterator(&src[i]); - /* sort sets from the smallest to largest, this will improve our * algorithm's performance */ qsort(src,setnum,sizeof(zsetopsrc),zuiCompareByCardinality); @@ -1596,6 +1594,7 @@ void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) { if (zuiLength(&src[0]) > 0) { /* Precondition: as src[0] is non-empty and the inputs are ordered * by size, all src[i > 0] are non-empty too. 
*/ + zuiInitIterator(&src[0]); while (zuiNext(&src[0],&zval)) { double score, value; @@ -1630,12 +1629,14 @@ void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) { } } } + zuiClearIterator(&src[0]); } } else if (op == REDIS_OP_UNION) { for (i = 0; i < setnum; i++) { if (zuiLength(&src[i]) == 0) continue; + zuiInitIterator(&src[i]); while (zuiNext(&src[i],&zval)) { double score, value; @@ -1672,14 +1673,12 @@ void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) { maxelelen = sdslen(tmp->ptr); } } + zuiClearIterator(&src[i]); } } else { redisPanic("Unknown operator"); } - for (i = 0; i < setnum; i++) - zuiClearIterator(&src[i]); - if (dbDelete(c->db,dstkey)) { signalModifiedKey(c->db,dstkey); touched = 1; From 4bb257b480780d866f08afe11f32a44697395a00 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 16 Aug 2013 15:31:25 +0200 Subject: [PATCH 0139/2500] Fix comments for correctness in zunionInterGenericCommand(). Related to issue #1240. --- src/t_zset.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/t_zset.c b/src/t_zset.c index 20ff84b2e..1fcfd6bb6 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -1640,7 +1640,7 @@ void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) { while (zuiNext(&src[i],&zval)) { double score, value; - /* Skip key when already processed */ + /* Skip an element that when already processed */ if (dictFind(dstzset->dict,zuiObjectFromValue(&zval)) != NULL) continue; @@ -1648,8 +1648,10 @@ void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) { score = src[i].weight * zval.score; if (isnan(score)) score = 0; - /* Because the inputs are sorted by size, it's only possible - * for sets at larger indices to hold this element. */ + /* We need to check only next sets to see if this element + * exists, since we process every element just one time so + * it can't exist in a previous set (otherwise it would be + * already processed). 
*/ for (j = (i+1); j < setnum; j++) { /* It is not safe to access the zset we are * iterating, so explicitly check for equal object. */ From ae1bb62f621abbfd6e6ddd26e742bade48edfa84 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 19 Aug 2013 11:29:18 +0200 Subject: [PATCH 0140/2500] dictFingerprint() fingerprinting made more robust. The previous hashing used the trivial algorithm of xoring the integers together. This is not optimal as it is very likely that different hash table setups will hash the same, for instance an hash table at the start of the rehashing process, and at the end, will have the same fingerprint. Now we hash N integers in a smarter way, by summing every integer to the previous hash, and taking the integer hashing again (see the code for further details). This way it is a lot less likely that we get a collision. Moreover this way of hashing explicitly protects from the same set of integers in a different order to hash to the same number. This commit is related to issue #1240. --- src/dict.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/src/dict.c b/src/dict.c index 26d0b1ff3..ad95d7498 100644 --- a/src/dict.c +++ b/src/dict.c @@ -512,15 +512,35 @@ void *dictFetchValue(dict *d, const void *key) { * If the two fingerprints are different it means that the user of the iterator * performed forbidden operations against the dictionary while iterating. 
*/ long long dictFingerprint(dict *d) { - long long fingerprint = 0; + long long integers[6], hash = 0; + int j; - fingerprint ^= (long long) d->ht[0].table; - fingerprint ^= (long long) d->ht[0].size; - fingerprint ^= (long long) d->ht[0].used; - fingerprint ^= (long long) d->ht[1].table; - fingerprint ^= (long long) d->ht[1].size; - fingerprint ^= (long long) d->ht[1].used; - return fingerprint; + integers[0] = (long long) d->ht[0].table; + integers[1] = d->ht[0].size; + integers[2] = d->ht[0].used; + integers[3] = (long long) d->ht[1].table; + integers[4] = d->ht[1].size; + integers[5] = d->ht[1].used; + + /* We hash N integers by summing every successive integer with the integer + * hashing of the previous sum. Basically: + * + * Result = hash(hash(hash(int1)+int2)+int3) ... + * + * This way the same set of integers in a different order will (likely) hash + * to a different number. */ + for (j = 0; j < 6; j++) { + hash += integers[j]; + /* For the hashing step we use Tomas Wang's 64 bit integer hash. */ + hash = (~hash) + (hash << 21); // hash = (hash << 21) - hash - 1; + hash = hash ^ (hash >> 24); + hash = (hash + (hash << 3)) + (hash << 8); // hash * 265 + hash = hash ^ (hash >> 14); + hash = (hash + (hash << 2)) + (hash << 4); // hash * 21 + hash = hash ^ (hash >> 28); + hash = hash + (hash << 31); + } + return hash; } dictIterator *dictGetIterator(dict *d) From 3a9c595ab583be3ab616622b4d0e9083041006f2 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 19 Aug 2013 14:53:38 +0200 Subject: [PATCH 0141/2500] Added redisassert.h as drop in replacement for assert.h. By using redisassert.h version of assert() you get stack traces in the log instead of a process disappearing on assertions. 
--- src/redisassert.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 src/redisassert.h diff --git a/src/redisassert.h b/src/redisassert.h new file mode 100644 index 000000000..e2cf47cb5 --- /dev/null +++ b/src/redisassert.h @@ -0,0 +1,45 @@ +/* redisassert.h -- Drop in replacemnet assert.h that prints the stack trace + * in the Redis logs. + * + * This file should be included instead of "assert.h" inside libraries used by + * Redis that are using assertions, so instead of Redis disappearing with + * SIGABORT, we get the details and stack trace inside the log file. + * + * ---------------------------------------------------------------------------- + * + * Copyright (c) 2006-2012, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __REDIS_ASSERT_H__ +#define __REDIS_ASSERT_H__ + +#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1))) + +void _redisAssert(char *estr, char *file, int line); + +#endif From ded611636f456e05225c279faff5595571cbcf35 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 19 Aug 2013 14:54:31 +0200 Subject: [PATCH 0142/2500] assert.h replaced with redisassert.h when appropriate. Also a warning was suppressed by including unistd.h in redisassert.h (needed for _exit()). --- src/dict.c | 2 +- src/pqsort.c | 1 - src/redisassert.h | 4 +++- src/ziplist.c | 2 +- src/zipmap.c | 1 - 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/dict.c b/src/dict.c index ad95d7498..dd4156d2f 100644 --- a/src/dict.c +++ b/src/dict.c @@ -39,13 +39,13 @@ #include #include #include -#include #include #include #include #include "dict.h" #include "zmalloc.h" +#include "redisassert.h" /* Using dictEnableResize() / dictDisableResize() we make possible to * enable/disable resizing of the hash table as needed. 
This is very important diff --git a/src/pqsort.c b/src/pqsort.c index 57c217f94..7325a88a4 100644 --- a/src/pqsort.c +++ b/src/pqsort.c @@ -39,7 +39,6 @@ #include -#include #include #include diff --git a/src/redisassert.h b/src/redisassert.h index e2cf47cb5..e5825c0f5 100644 --- a/src/redisassert.h +++ b/src/redisassert.h @@ -38,7 +38,9 @@ #ifndef __REDIS_ASSERT_H__ #define __REDIS_ASSERT_H__ -#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1))) +#include /* for _exit() */ + +#define assert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1))) void _redisAssert(char *estr, char *file, int line); diff --git a/src/ziplist.c b/src/ziplist.c index fdfc2e9ce..d78f8f5da 100644 --- a/src/ziplist.c +++ b/src/ziplist.c @@ -105,12 +105,12 @@ #include #include #include -#include #include #include "zmalloc.h" #include "util.h" #include "ziplist.h" #include "endianconv.h" +#include "redisassert.h" #define ZIP_END 255 #define ZIP_BIGLEN 254 diff --git a/src/zipmap.c b/src/zipmap.c index d9b7c8b31..3cd56e5ff 100644 --- a/src/zipmap.c +++ b/src/zipmap.c @@ -78,7 +78,6 @@ #include #include -#include #include "zmalloc.h" #include "endianconv.h" From d22d557e41151c1d716045e0059550e197d6e526 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 19 Aug 2013 15:10:33 +0200 Subject: [PATCH 0143/2500] Fixed type in dict.c comment: 265 -> 256. --- src/dict.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dict.c b/src/dict.c index dd4156d2f..7af1c30f0 100644 --- a/src/dict.c +++ b/src/dict.c @@ -534,7 +534,7 @@ long long dictFingerprint(dict *d) { /* For the hashing step we use Tomas Wang's 64 bit integer hash. 
*/ hash = (~hash) + (hash << 21); // hash = (hash << 21) - hash - 1; hash = hash ^ (hash >> 24); - hash = (hash + (hash << 3)) + (hash << 8); // hash * 265 + hash = (hash + (hash << 3)) + (hash << 8); // hash * 256 hash = hash ^ (hash >> 14); hash = (hash + (hash << 2)) + (hash << 4); // hash * 21 hash = hash ^ (hash >> 28); From 5824ab54dddba36efbf677d1a482a84b1e1fc045 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 19 Aug 2013 17:25:48 +0200 Subject: [PATCH 0144/2500] Revert "Fixed type in dict.c comment: 265 -> 256." This reverts commit d22d557e41151c1d716045e0059550e197d6e526. --- src/dict.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dict.c b/src/dict.c index 7af1c30f0..dd4156d2f 100644 --- a/src/dict.c +++ b/src/dict.c @@ -534,7 +534,7 @@ long long dictFingerprint(dict *d) { /* For the hashing step we use Tomas Wang's 64 bit integer hash. */ hash = (~hash) + (hash << 21); // hash = (hash << 21) - hash - 1; hash = hash ^ (hash >> 24); - hash = (hash + (hash << 3)) + (hash << 8); // hash * 256 + hash = (hash + (hash << 3)) + (hash << 8); // hash * 265 hash = hash ^ (hash >> 14); hash = (hash + (hash << 2)) + (hash << 4); // hash * 21 hash = hash ^ (hash >> 28); From 13c59cfdc86ea59f0afce4ce6751778731a3a6b7 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 20 Aug 2013 11:49:55 +0200 Subject: [PATCH 0145/2500] dictFingerprint(): cast pointers to integer of same size. 
--- src/dict.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dict.c b/src/dict.c index dd4156d2f..97a2bca43 100644 --- a/src/dict.c +++ b/src/dict.c @@ -515,10 +515,10 @@ long long dictFingerprint(dict *d) { long long integers[6], hash = 0; int j; - integers[0] = (long long) d->ht[0].table; + integers[0] = (long) d->ht[0].table; integers[1] = d->ht[0].size; integers[2] = d->ht[0].used; - integers[3] = (long long) d->ht[1].table; + integers[3] = (long) d->ht[1].table; integers[4] = d->ht[1].size; integers[5] = d->ht[1].used; From 2f47ed9f5f79453286f7e76b0fb9e085a326075c Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 20 Aug 2013 12:04:57 +0200 Subject: [PATCH 0146/2500] Use printf %zu specifier to print private_dirty. --- src/aof.c | 2 +- src/rdb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/aof.c b/src/aof.c index 2a29f72fb..dcbcc62e1 100644 --- a/src/aof.c +++ b/src/aof.c @@ -970,7 +970,7 @@ int rewriteAppendOnlyFileBackground(void) { if (private_dirty) { redisLog(REDIS_NOTICE, - "AOF rewrite: %lu MB of memory used by copy-on-write", + "AOF rewrite: %zu MB of memory used by copy-on-write", private_dirty/(1024*1024)); } exitFromChild(0); diff --git a/src/rdb.c b/src/rdb.c index 1c2b0ed0d..69b24fbba 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -737,7 +737,7 @@ int rdbSaveBackground(char *filename) { if (private_dirty) { redisLog(REDIS_NOTICE, - "RDB: %lu MB of memory used by copy-on-write", + "RDB: %zu MB of memory used by copy-on-write", private_dirty/(1024*1024)); } } From 18317ae6b45bd7a88e4386043b5b70d7838b62b1 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 21 Aug 2013 11:36:09 +0200 Subject: [PATCH 0147/2500] Fix for issue #1214 simplified. 
--- src/redis.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/redis.c b/src/redis.c index 35a7580e5..036fba7a5 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1476,7 +1476,6 @@ void adjustOpenFilesLimit(void) { void initServer() { int j; - int ip_count; signal(SIGHUP, SIG_IGN); signal(SIGPIPE, SIG_IGN); @@ -1508,19 +1507,18 @@ void initServer() { if (server.bindaddr_count == 0) server.bindaddr[0] = NULL; for (j = 0; j < server.bindaddr_count || j == 0; j++) { if (server.bindaddr[j] == NULL) { - /* Bind * for both IPv6 and IPv4. - * Should consider that someone only has IPV6 and someone only get IPV4 */ - ip_count = 0; - server.ipfd[ip_count] = anetTcp6Server(server.neterr,server.port,NULL); - if (server.ipfd[ip_count] != ANET_ERR) ip_count++; - - server.ipfd[ip_count] = anetTcpServer(server.neterr,server.port,NULL); - if(server.ipfd[ip_count] != ANET_ERR ) ip_count++; - - /* It should be ip_count plus one - * because out of this branch, the server.ipfd_count would increase */ - server.ipfd_count += (ip_count - 1); - + /* Bind * for both IPv6 and IPv4, we enter here only if + * server.bindaddr_count == 0, so we try to bind and then + * break to exit the loop ASAP. */ + server.ipfd[server.ipfd_count] = + anetTcp6Server(server.neterr,server.port,NULL); + if (server.ipfd[server.ipfd_count] != ANET_ERR) + server.ipfd_count++; + server.ipfd[server.ipfd_count] = + anetTcpServer(server.neterr,server.port,NULL); + if(server.ipfd[server.ipfd_count] != ANET_ERR) + server.ipfd_count++; + break; } else if (strchr(server.bindaddr[j],':')) { /* Bind IPv6 address. */ server.ipfd[server.ipfd_count] = anetTcp6Server(server.neterr,server.port,server.bindaddr[j]); From 487951c9b4c1ac5967473cc2f784b5d38124c3d8 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 21 Aug 2013 15:51:10 +0200 Subject: [PATCH 0148/2500] Use a safe dict.c iterator in clusterCron(). 
--- src/cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index 89ef24b90..2c813f582 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1585,7 +1585,7 @@ void clusterCron(void) { clusterNode *min_pong_node = NULL; /* Check if we have disconnected nodes and re-establish the connection. */ - di = dictGetIterator(server.cluster->nodes); + di = dictGetSafeIterator(server.cluster->nodes); while((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); From 77c71b20463b8c4cdef5aaa41b4dd79cfd4e8ea0 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 22 Aug 2013 11:53:28 +0200 Subject: [PATCH 0149/2500] Cluster: process MEET packets as PING packets. Somehow a previous commit broke this, so CLUSTER MEET was no longer working. --- src/cluster.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index 2c813f582..c4a68f87d 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -887,7 +887,9 @@ int clusterProcessPacket(clusterLink *link) { } /* PING or PONG: process config information. */ - if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG) { + if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || + type == CLUSTERMSG_TYPE_MEET) + { int update_state = 0; int update_config = 0; From f45f05531dcddaacdf0666540b14c661ce5e1a43 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 22 Aug 2013 11:54:25 +0200 Subject: [PATCH 0150/2500] Cluster: fix CLUSTER MEET ip address validation. This was broken by the IPv6 support patches. 
--- src/cluster.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index c4a68f87d..82a4d74ef 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -2082,8 +2082,15 @@ void clusterCommand(redisClient *c) { long port; /* Perform sanity checks on IP/port */ - if ((inet_pton(AF_INET,c->argv[0]->ptr,&(((struct sockaddr_in *)&sa)->sin_addr)) || - inet_pton(AF_INET6,c->argv[0]->ptr,&(((struct sockaddr_in6 *)&sa)->sin6_addr))) == 0) { + if (inet_pton(AF_INET,c->argv[2]->ptr, + &(((struct sockaddr_in *)&sa)->sin_addr))) + { + sa.ss_family = AF_INET; + } else if (inet_pton(AF_INET6,c->argv[2]->ptr, + &(((struct sockaddr_in6 *)&sa)->sin6_addr))) + { + sa.ss_family = AF_INET6; + } else { addReplyError(c,"Invalid IP address in MEET"); return; } @@ -2097,9 +2104,17 @@ void clusterCommand(redisClient *c) { /* Finally add the node to the cluster with a random name, this * will get fixed in the first handshake (ping/pong). */ n = createClusterNode(NULL,REDIS_NODE_HANDSHAKE|REDIS_NODE_MEET); - sa.ss_family == AF_INET ? - inet_ntop(AF_INET,(void*)&(((struct sockaddr_in *)&sa)->sin_addr),n->ip,REDIS_CLUSTER_IPLEN) : - inet_ntop(AF_INET6,(void*)&(((struct sockaddr_in6 *)&sa)->sin6_addr),n->ip,REDIS_CLUSTER_IPLEN); + + /* Set node->ip as the normalized string representation of the node + * IP address. */ + if (sa.ss_family == AF_INET) + inet_ntop(AF_INET, + (void*)&(((struct sockaddr_in *)&sa)->sin_addr), + n->ip,REDIS_CLUSTER_IPLEN); + else + inet_ntop(AF_INET6, + (void*)&(((struct sockaddr_in6 *)&sa)->sin6_addr), + n->ip,REDIS_CLUSTER_IPLEN); n->port = port; clusterAddNode(n); addReply(c,shared.ok); From d42715de508248c2a0672e6fbae2e713a77386be Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 22 Aug 2013 13:02:59 +0200 Subject: [PATCH 0151/2500] Print error message when can't bind * on any address. 
--- src/redis.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/redis.c b/src/redis.c index 036fba7a5..6ef2da1f8 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1508,8 +1508,7 @@ void initServer() { for (j = 0; j < server.bindaddr_count || j == 0; j++) { if (server.bindaddr[j] == NULL) { /* Bind * for both IPv6 and IPv4, we enter here only if - * server.bindaddr_count == 0, so we try to bind and then - * break to exit the loop ASAP. */ + * server.bindaddr_count == 0. */ server.ipfd[server.ipfd_count] = anetTcp6Server(server.neterr,server.port,NULL); if (server.ipfd[server.ipfd_count] != ANET_ERR) @@ -1518,7 +1517,10 @@ void initServer() { anetTcpServer(server.neterr,server.port,NULL); if(server.ipfd[server.ipfd_count] != ANET_ERR) server.ipfd_count++; - break; + /* Exit the loop if we were able to bind * on IPv4 or IPv6, + * otherwise server.ipfd[server.ipfd_count] will be ANET_ERR + * and we'll print an error and exit. */ + if (server.ipfd_count) break; } else if (strchr(server.bindaddr[j],':')) { /* Bind IPv6 address. */ server.ipfd[server.ipfd_count] = anetTcp6Server(server.neterr,server.port,server.bindaddr[j]); From 85bfeb8feaa5f55d49d141fb1f460a881df76384 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 22 Aug 2013 14:01:16 +0200 Subject: [PATCH 0152/2500] Opening TCP listening ports refactored into a function. --- src/redis.c | 96 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 38 deletions(-) diff --git a/src/redis.c b/src/redis.c index 6ef2da1f8..e68ce2c84 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1474,6 +1474,61 @@ void adjustOpenFilesLimit(void) { } } +/* Initialize a set of file descriptors to listen to the specified 'port' + * binding the addresses specified in the Redis server configuration. + * + * The listening file descriptors are stored in the integer array 'fds' + * and their number is set in '*count'. 
+ * + * The addresses to bind are specified in the global server.bindaddr array + * and their number is server.bindaddr_count. If the server configuration + * contains no specific addresses to bind, this function will try to + * bind * (all addresses) for both the IPv4 and IPv6 protocols. + * + * On success the function returns REDIS_OK. + * + * On error the function returns REDIS_ERR. For the function to be on + * error, at least one of the server.bindaddr addresses was + * impossible to bind, or no bind addresses were specified in the server + * configuration but the function is not able to bind * for at least + * one of the IPv4 or IPv6 protocols. */ +int listenToPort(int port, int *fds, int *count) { + int j; + + /* Force binding of 0.0.0.0 if no bind address is specified, always + * entering the loop if j == 0. */ + if (server.bindaddr_count == 0) server.bindaddr[0] = NULL; + for (j = 0; j < server.bindaddr_count || j == 0; j++) { + if (server.bindaddr[j] == NULL) { + /* Bind * for both IPv6 and IPv4, we enter here only if + * server.bindaddr_count == 0. */ + fds[*count] = anetTcp6Server(server.neterr,port,NULL); + if (fds[*count] != ANET_ERR) (*count)++; + fds[*count] = anetTcpServer(server.neterr,port,NULL); + if (fds[*count] != ANET_ERR) (*count)++; + /* Exit the loop if we were able to bind * on IPv4 or IPv6, + * otherwise fds[*count] will be ANET_ERR and we'll print an + * error and return to the caller with an error. */ + if (*count) break; + } else if (strchr(server.bindaddr[j],':')) { + /* Bind IPv6 address. */ + fds[*count] = anetTcp6Server(server.neterr,port,server.bindaddr[j]); + } else { + /* Bind IPv4 address. */ + fds[*count] = anetTcpServer(server.neterr,port,server.bindaddr[j]); + } + if (fds[*count] == ANET_ERR) { + redisLog(REDIS_WARNING, + "Creating Server TCP listening socket %s:%d: %s", + server.bindaddr[j] ? 
server.bindaddr[j] : "*", + server.port, server.neterr); + return REDIS_ERR; + } + (*count)++; + } + return REDIS_OK; +} + void initServer() { int j; @@ -1500,44 +1555,9 @@ void initServer() { server.el = aeCreateEventLoop(server.maxclients+REDIS_EVENTLOOP_FDSET_INCR); server.db = zmalloc(sizeof(redisDb)*server.dbnum); - /* Open the TCP listening sockets. */ - if (server.port != 0) { - /* Force binding of 0.0.0.0 if no bind address is specified, always - * entering the loop if j == 0. */ - if (server.bindaddr_count == 0) server.bindaddr[0] = NULL; - for (j = 0; j < server.bindaddr_count || j == 0; j++) { - if (server.bindaddr[j] == NULL) { - /* Bind * for both IPv6 and IPv4, we enter here only if - * server.bindaddr_count == 0. */ - server.ipfd[server.ipfd_count] = - anetTcp6Server(server.neterr,server.port,NULL); - if (server.ipfd[server.ipfd_count] != ANET_ERR) - server.ipfd_count++; - server.ipfd[server.ipfd_count] = - anetTcpServer(server.neterr,server.port,NULL); - if(server.ipfd[server.ipfd_count] != ANET_ERR) - server.ipfd_count++; - /* Exit the loop if we were able to bind * on IPv4 or IPv6, - * otherwise server.ipfd[server.ipfd_count] will be ANET_ERR - * and we'll print an error and exit. */ - if (server.ipfd_count) break; - } else if (strchr(server.bindaddr[j],':')) { - /* Bind IPv6 address. */ - server.ipfd[server.ipfd_count] = anetTcp6Server(server.neterr,server.port,server.bindaddr[j]); - } else { - /* Bind IPv4 address. */ - server.ipfd[server.ipfd_count] = anetTcpServer(server.neterr,server.port,server.bindaddr[j]); - } - if (server.ipfd[server.ipfd_count] == ANET_ERR) { - redisLog(REDIS_WARNING, - "Creating Server TCP listening socket %s:%d: %s", - server.bindaddr[j] ? server.bindaddr[j] : "*", - server.port, server.neterr); - exit(1); - } - server.ipfd_count++; - } - } + /* Open the TCP listening socket for the user commands. 
*/ + if (listenToPort(server.port,server.ipfd,&server.ipfd_count) == REDIS_ERR) + exit(1); /* Open the listening Unix domain socket. */ if (server.unixsocket != NULL) { From a8dc4ecd210e819058b02238833fbf2da2fcc287 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 22 Aug 2013 14:05:07 +0200 Subject: [PATCH 0153/2500] Use listenToPort() in cluster.c as well. --- src/cluster.c | 27 +++++++++------------------ src/redis.h | 1 + 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 82a4d74ef..fdd47fe0c 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -226,7 +226,7 @@ void clusterSaveConfigOrDie(void) { } void clusterInit(void) { - int saveconf = 0, j; + int saveconf = 0; server.cluster = zmalloc(sizeof(clusterState)); server.cluster->myself = NULL; @@ -252,25 +252,16 @@ void clusterInit(void) { saveconf = 1; } if (saveconf) clusterSaveConfigOrDie(); - /* We need a listening TCP port for our cluster messaging needs */ + + /* We need a listening TCP port for our cluster messaging needs. */ server.cfd_count = 0; - if (server.bindaddr_count == 0) server.bindaddr[0] = NULL; - for (j = 0; j < server.bindaddr_count || j == 0; j++) { - server.cfd[j] = anetTcpServer( - server.neterr, server.port+REDIS_CLUSTER_PORT_INCR, - server.bindaddr[j]); - if (server.cfd[j] == -1) { - redisLog(REDIS_WARNING, - "Opening cluster listening TCP socket %s:%d: %s", - server.bindaddr[j] ? server.bindaddr[j] : "*", - server.port+REDIS_CLUSTER_PORT_INCR, - server.neterr); - exit(1); - } - if (aeCreateFileEvent(server.el, server.cfd[j], AE_READABLE, - clusterAcceptHandler, NULL) == AE_ERR) redisPanic("Unrecoverable error creating Redis Cluster file event."); - server.cfd_count++; + if (listenToPort(server.port+REDIS_CLUSTER_PORT_INCR, + server.cfd,&server.cfd_count) == REDIS_ERR) + { + exit(1); } + + /* The slots -> keys map is a sorted set. Init it. 
*/ server.cluster->slots_to_keys = zslCreate(); } diff --git a/src/redis.h b/src/redis.h index b5241b3fd..1a0523218 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1096,6 +1096,7 @@ int getClientLimitClassByName(char *name); char *getClientLimitClassName(int class); void flushSlavesOutputBuffers(void); void disconnectSlaves(void); +int listenToPort(int port, int *fds, int *count); #ifdef __GNUC__ void addReplyErrorFormat(redisClient *c, const char *fmt, ...) From d7d90442f5c7af8d614093e2008864ca8eccb5b3 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 22 Aug 2013 14:53:53 +0200 Subject: [PATCH 0154/2500] Cluster: set event handler in cluster bus listening socket. The commit using listenToPort() introduced this bug by no longer creating the event handler to handle incoming messages from the cluster bus. --- src/cluster.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/cluster.c b/src/cluster.c index fdd47fe0c..3710db144 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -259,6 +259,15 @@ void clusterInit(void) { server.cfd,&server.cfd_count) == REDIS_ERR) { exit(1); + } else { + int j; + + for (j = 0; j < server.cfd_count; j++) { + if (aeCreateFileEvent(server.el, server.cfd[j], AE_READABLE, + clusterAcceptHandler, NULL) == AE_ERR) + redisPanic("Unrecoverable error creating Redis Cluster " + "file event."); + } } /* The slots -> keys map is a sorted set. Init it. */ From 33b286bf68097da123e44113cad2ba27ce808a36 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 26 Aug 2013 16:16:25 +0200 Subject: [PATCH 0155/2500] Don't update node pong time via gossip. This feature was implemented in the initial days of the Redis Cluster implementation but is not a good idea at all. 1) It depends on clocks to be synchronized, that is already very bad. 2) Moreover it adds a bug where the pong time is updated via gossip so no new PING is ever sent by the current node, with the effect of no PONG received, no update of tables, no clearing of PFAIL flag. 
In general to trust other nodes about the reachability of other nodes is a broken distributed programming model. --- src/cluster.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 3710db144..2afc62cce 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -704,15 +704,8 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { /* Update our state accordingly to the gossip sections */ node = clusterLookupNode(g->nodename); if (node != NULL) { - /* We already know this node. Let's start updating the last - * time PONG figure if it is newer than our figure. - * Note that it's not a problem if we have a PING already - * in progress against this node. */ - if (node->pong_received < (signed) ntohl(g->pong_received)) { - redisLog(REDIS_DEBUG,"Node pong_received updated by gossip"); - node->pong_received = ntohl(g->pong_received); - } - /* Handle failure reports, only when the sender is a master. */ + /* We already know this node. + Handle failure reports, only when the sender is a master. */ if (sender && sender->flags & REDIS_NODE_MASTER && node != server.cluster->myself) { From a97a8b89546c9dfaf340316f6653d47c4a81ddd4 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 27 Aug 2013 09:31:43 +0200 Subject: [PATCH 0156/2500] Only run the fast active expire cycle if master & enabled. --- src/redis.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/redis.c b/src/redis.c index e68ce2c84..a1ff80758 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1180,8 +1180,10 @@ void beforeSleep(struct aeEventLoop *eventLoop) { listNode *ln; redisClient *c; - /* Run a fast expire cycle. */ - activeExpireCycle(ACTIVE_EXPIRE_CYCLE_FAST); + /* Run a fast expire cycle (the called function will return + * ASAP if a fast cycle is not needed). 
*/ + if (server.active_expire_enabled && server.masterhost == NULL) + activeExpireCycle(ACTIVE_EXPIRE_CYCLE_FAST); /* Try to process pending commands for clients that were just unblocked. */ while (listLength(server.unblocked_clients)) { From 2862182e9eff148e47ef4b5e79bece6b8cdc03d9 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 27 Aug 2013 10:16:01 +0200 Subject: [PATCH 0157/2500] Update server.lastbgsave_status when fork() fails. --- src/rdb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/rdb.c b/src/rdb.c index 69b24fbba..d1804d745 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -746,6 +746,7 @@ int rdbSaveBackground(char *filename) { /* Parent */ server.stat_fork_time = ustime()-start; if (childpid == -1) { + server.lastbgsave_status = REDIS_ERR; redisLog(REDIS_WARNING,"Can't save in background: fork: %s", strerror(errno)); return REDIS_ERR; From 2c3fa3e3920c3de8c3f77b3eb64cf2d97727ad21 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 27 Aug 2013 11:52:12 +0200 Subject: [PATCH 0158/2500] DEBUG SDSLEN added. This command is only useful for low-level debugging of memory issues due to sds wasting memory as empty buffer at the end of the string. 
--- src/debug.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/debug.c b/src/debug.c index a0352b5dd..e8e16cc8b 100644 --- a/src/debug.c +++ b/src/debug.c @@ -296,6 +296,29 @@ void debugCommand(redisClient *c) { (void*)val, val->refcount, strenc, (long long) rdbSavedObjectLen(val), val->lru, estimateObjectIdleTime(val)); + } else if (!strcasecmp(c->argv[1]->ptr,"sdslen") && c->argc == 3) { + dictEntry *de; + robj *val; + sds key; + + if ((de = dictFind(c->db->dict,c->argv[2]->ptr)) == NULL) { + addReply(c,shared.nokeyerr); + return; + } + val = dictGetVal(de); + key = dictGetKey(de); + + if (val->type != REDIS_STRING || !sdsEncodedObject(val)) { + addReplyError(c,"Not an sds encoded string."); + } else { + addReplyStatusFormat(c, + "key_sds_len:%lld, key_sds_avail:%lld, " + "val_sds_len:%lld, val_sds_avail:%lld", + (long long) sdslen(key), + (long long) sdsavail(key), + (long long) sdslen(val->ptr), + (long long) sdsavail(val->ptr)); + } } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) { long keys, j; robj *key, *val; From 77c3c946a134b88352c042f5b80033eee426ab04 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 27 Aug 2013 11:54:38 +0200 Subject: [PATCH 0159/2500] Don't over-allocate the sds string for large bulk requests. The call to sdsMakeRoomFor() did not account for the amount of data already present in the query buffer, resulting in over-allocation. --- src/networking.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/networking.c b/src/networking.c index be78a19f2..d0d0430c0 100644 --- a/src/networking.c +++ b/src/networking.c @@ -989,13 +989,13 @@ int processMultibulkBuffer(redisClient *c) { if (ll >= REDIS_MBULK_BIG_ARG) { /* If we are going to read a large object from network * try to make it likely that it will start at c->querybuf - * boundary so that we can optimized object creation + * boundary so that we can optimize object creation * avoiding a large copy of data. 
*/ sdsrange(c->querybuf,pos,-1); pos = 0; /* Hint the sds library about the amount of bytes this string is * going to contain. */ - c->querybuf = sdsMakeRoomFor(c->querybuf,ll+2); + c->querybuf = sdsMakeRoomFor(c->querybuf,ll+2-sdslen(c->querybuf)); } c->bulklen = ll; } From 99ec8f117d75ec3d9ca8a85fc2e10ab4beeac043 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 27 Aug 2013 11:56:47 +0200 Subject: [PATCH 0160/2500] tryObjectEncoding(): don't call stringl2() for too big strings. We are sure that a string that is longer than 21 chars cannot be represented by a 64 bit signed integer, as -(2^64) is 21 chars: strlen(-18446744073709551616) => 21 --- src/object.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/object.c b/src/object.c index 159e92ada..75a4287ba 100644 --- a/src/object.c +++ b/src/object.c @@ -339,6 +339,7 @@ int isObjectRepresentableAsLongLong(robj *o, long long *llval) { robj *tryObjectEncoding(robj *o) { long value; sds s = o->ptr; + size_t len; if (o->encoding == REDIS_ENCODING_INT) return o; /* Already encoded */ @@ -351,8 +352,11 @@ robj *tryObjectEncoding(robj *o) { /* Currently we try to encode only strings */ redisAssertWithInfo(NULL,o,o->type == REDIS_STRING); - /* Check if we can represent this string as a long integer */ - if (!string2l(s,sdslen(s),&value)) { + /* Check if we can represent this string as a long integer. + * Note that we are sure that a string larger than 21 chars is not + * representable as a 64 bit integer. */ + len = sdslen(s); + if (len > 21 || !string2l(s,len,&value)) { /* Integer encoding not possible. Check if we can use EMBSTR. */ if (sdslen(s) <= REDIS_ENCODING_EMBSTR_SIZE_LIMIT) { robj *emb = createEmbeddedStringObject(s,sdslen(s)); From 3816606eb23842a3b7db61cf8205f7f1077e6c09 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 27 Aug 2013 11:59:34 +0200 Subject: [PATCH 0161/2500] tryObjectEncoding(): optimize sds strings if possible. 
When no encoding is possible, at least try to reallocate the sds string with one that does not waste memory (with free space at the end of the buffer) when the string is large enough. --- src/object.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/object.c b/src/object.c index 75a4287ba..b3acd2cc2 100644 --- a/src/object.c +++ b/src/object.c @@ -363,7 +363,22 @@ robj *tryObjectEncoding(robj *o) { decrRefCount(o); return emb; } else { - /* Otherwise return the original object. */ + /* We can't encode the object... + * + * Do the last try, and at least optimize the SDS string inside + * the string object to require little space, in case there + * is more than 10% of free space at the end of the SDS string. + * + * We do that only for relatively large strings as this branch + * is only entered if the length of the string is greater than + * REDIS_ENCODING_EMBSTR_SIZE_LIMIT. */ + if (len > 64 && + o->encoding == REDIS_ENCODING_RAW && + sdsavail(s) > len/10) + { + o->ptr = sdsRemoveFreeSpace(o->ptr); + } + /* Return the original object. */ return o; } } From acbaa37cbfe0fd93770f603730d1e30c7064440a Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 27 Aug 2013 12:36:52 +0200 Subject: [PATCH 0162/2500] Remove useless check from tryObjectEncoding(). We are sure the string is large, since when the sds optimization branch is entered it means that it was not possible to encode it as EMBSTR for size concerns. --- src/object.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/object.c b/src/object.c index b3acd2cc2..bfbd3ba1a 100644 --- a/src/object.c +++ b/src/object.c @@ -372,8 +372,7 @@ robj *tryObjectEncoding(robj *o) { * We do that only for relatively large strings as this branch * is only entered if the length of the string is greater than * REDIS_ENCODING_EMBSTR_SIZE_LIMIT. 
*/ - if (len > 64 && - o->encoding == REDIS_ENCODING_RAW && + if (o->encoding == REDIS_ENCODING_RAW && sdsavail(s) > len/10) { o->ptr = sdsRemoveFreeSpace(o->ptr); From 82189282e7f7fe51f0ed55b61dd8bf8b0cb25578 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 27 Aug 2013 13:00:06 +0200 Subject: [PATCH 0163/2500] Fix an hypothetical issue in processMultibulkBuffer(). --- src/networking.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/networking.c b/src/networking.c index d0d0430c0..23ef11dc8 100644 --- a/src/networking.c +++ b/src/networking.c @@ -987,15 +987,19 @@ int processMultibulkBuffer(redisClient *c) { pos += newline-(c->querybuf+pos)+2; if (ll >= REDIS_MBULK_BIG_ARG) { + size_t qblen; + /* If we are going to read a large object from network * try to make it likely that it will start at c->querybuf * boundary so that we can optimize object creation * avoiding a large copy of data. */ sdsrange(c->querybuf,pos,-1); pos = 0; + qblen = sdslen(c->querybuf); /* Hint the sds library about the amount of bytes this string is * going to contain. */ - c->querybuf = sdsMakeRoomFor(c->querybuf,ll+2-sdslen(c->querybuf)); + if (qblen < ll+2) + c->querybuf = sdsMakeRoomFor(c->querybuf,ll+2-qblen); } c->bulklen = ll; } From a8ec7abb1b91bf1a5d23a64a8afa08f01530bb91 Mon Sep 17 00:00:00 2001 From: yihuang Date: Tue, 13 Aug 2013 17:47:42 +0800 Subject: [PATCH 0164/2500] fix lua_cmsgpack pack map as array --- deps/lua/src/lua_cmsgpack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/lua/src/lua_cmsgpack.c b/deps/lua/src/lua_cmsgpack.c index 906121361..53dc1cf61 100644 --- a/deps/lua/src/lua_cmsgpack.c +++ b/deps/lua/src/lua_cmsgpack.c @@ -374,7 +374,7 @@ static int table_is_an_array(lua_State *L) { while(lua_next(L,-2)) { /* Stack: ... key value */ lua_pop(L,1); /* Stack: ... 
key */ - if (!lua_isnumber(L,-1)) goto not_array; + if (lua_type(L,-1) != LUA_TNUMBER) goto not_array; n = lua_tonumber(L,-1); idx = n; if (idx != n || idx < 1) goto not_array; From 47f6823a734200d72018c19d5ed27e04447305d3 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 29 Aug 2013 11:49:23 +0200 Subject: [PATCH 0165/2500] Fixed critical memory leak from EVAL. Multiple missing calls to lua_pop prevented the error handler function pushed on the stack for lua_pcall() to be popped before returning, causing a memory leak in almost all the code paths of EVAL (both successful calls and calls returning errors). This caused two issues: Lua leaking memory (and this was very visible from INFO memory output, as the 'used_memory_lua' field reported an always increasing amount of memory used), and as a result slower and slower GC cycles resulting in all the CPU being used. Thanks to Tanguy Le Barzic for noticing something was wrong with his 2.8 slave, and for creating a testing EC2 environment where I was able to investigate the issue. --- src/scripting.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/scripting.c b/src/scripting.c index baf585279..ac1a913f0 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -876,7 +876,12 @@ void evalGenericCommand(redisClient *c, int evalsha) { addReply(c, shared.noscripterr); return; } - if (luaCreateFunction(c,lua,funcname,c->argv[1]) == REDIS_ERR) return; + if (luaCreateFunction(c,lua,funcname,c->argv[1]) == REDIS_ERR) { + lua_pop(lua,1); /* remove the error handler from the stack. */ + /* The error is sent to the client by luaCreateFunction() + * itself when it returns REDIS_ERR. */ + return; + } /* Now the following is guaranteed to return non nil */ lua_getglobal(lua, funcname); redisAssert(!lua_isnil(lua,-1)); @@ -905,7 +910,6 @@ void evalGenericCommand(redisClient *c, int evalsha) { /* At this point whether this script was never seen before or if it was * already defined, we can call it. 
We have zero arguments and expect * a single return value. */ - err = lua_pcall(lua,0,1,-2); /* Perform some cleanup that we need to do both on error and success. */ @@ -924,11 +928,12 @@ void evalGenericCommand(redisClient *c, int evalsha) { if (err) { addReplyErrorFormat(c,"Error running script (call to %s): %s\n", funcname, lua_tostring(lua,-1)); - lua_pop(lua,1); /* Consume the Lua reply. */ + lua_pop(lua,2); /* Consume the Lua reply and remove error handler. */ } else { /* On success convert the Lua return value into Redis protocol, and * send it to * the client. */ - luaReplyToRedisReply(c,lua); + luaReplyToRedisReply(c,lua); /* Convert and consume the reply. */ + lua_pop(lua,1); /* Remove the error handler. */ } /* EVALSHA should be propagated to Slave and AOF file as full EVAL, unless From b3277bab4bf6b0624ebb28b8fab8f7c0b7105f52 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 29 Aug 2013 16:23:57 +0200 Subject: [PATCH 0166/2500] Test: added a memory efficiency test. --- tests/test_helper.tcl | 1 + tests/unit/memefficiency.tcl | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 tests/unit/memefficiency.tcl diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 1c3049d32..d8de34e18 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -45,6 +45,7 @@ set ::all_tests { unit/limits unit/obuf-limits unit/bitops + unit/memefficiency } # Index to the next test to run in the ::all_tests list. 
set ::next_test 0 diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl new file mode 100644 index 000000000..3612f06e5 --- /dev/null +++ b/tests/unit/memefficiency.tcl @@ -0,0 +1,32 @@ +proc test_memory_efficiency {range} { + r flushall + set base_mem [s used_memory] + set written 0 + for {set j 0} {$j < 10000} {incr j} { + set key key:$j + set val [string repeat A [expr {int(rand()*$range)}]] + r set $key $val + incr written [string length $key] + incr written [string length $val] + incr written 2 ;# A separator is the minimum to store key-value data. + } + set current_mem [s used_memory] + set used [expr {$current_mem-$base_mem}] + set efficiency [expr {double($written)/$used}] + return $efficiency +} + +start_server {tags {"memefficiency"}} { + foreach {size_range expected_min_efficiency} { + 32 0.15 + 64 0.25 + 128 0.35 + 1024 0.75 + 16384 0.90 + } { + test "Memory efficiency with values in range $size_range" { + set efficiency [test_memory_efficiency $size_range] + assert {$efficiency >= $expected_min_efficiency} + } + } +} From ebe91a49c9b4f6f6b5c95d3400389691a5bc8bb8 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 30 Aug 2013 08:59:11 +0200 Subject: [PATCH 0167/2500] Test: Lua stack leak regression test added. --- tests/unit/scripting.tcl | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/unit/scripting.tcl b/tests/unit/scripting.tcl index 3e08f630c..ec5230bfe 100644 --- a/tests/unit/scripting.tcl +++ b/tests/unit/scripting.tcl @@ -282,6 +282,21 @@ start_server {tags {"scripting"}} { assert {$rand2 ne $rand3} } + test {EVAL does not leak in the Lua stack} { + r set x 0 + # Use a non blocking client to speedup the loop. 
+ set rd [redis_deferring_client] + for {set j 0} {$j < 10000} {incr j} { + $rd eval {return redis.call("incr",KEYS[1])} 1 x + } + for {set j 0} {$j < 10000} {incr j} { + $rd read + } + assert {[s used_memory_lua] < 1024*100} + $rd close + r get x + } {10000} + test {EVAL processes writes from AOF in read-only slaves} { r flushall r config set appendonly yes From d3726385c2f71904575173582a52a142545f1c0c Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 3 Sep 2013 11:27:01 +0200 Subject: [PATCH 0168/2500] Cluster: fixed a bug in clusterSendPublish() due to inverted statements. The code used to copy the header *after* the 'hdr' pointer was already switched to the new buffer. Of course we need to do the reverse. --- src/cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index 2afc62cce..bc0e34bed 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1380,8 +1380,8 @@ void clusterSendPublish(clusterLink *link, robj *channel, robj *message) { payload = buf; } else { payload = zmalloc(totlen); - hdr = (clusterMsg*) payload; memcpy(payload,hdr,sizeof(*hdr)); + hdr = (clusterMsg*) payload; } memcpy(hdr->data.publish.msg.bulk_data,channel->ptr,sdslen(channel->ptr)); memcpy(hdr->data.publish.msg.bulk_data+sdslen(channel->ptr), From e307150c215da40627cb855b9be4bb6ca146bad1 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 3 Sep 2013 11:42:09 +0200 Subject: [PATCH 0169/2500] Cluster: use non-blocking I/O for the cluster bus. --- src/cluster.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/cluster.c b/src/cluster.c index bc0e34bed..8625cee1f 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -317,6 +317,10 @@ void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { redisLog(REDIS_VERBOSE,"Accepting cluster node: %s", server.neterr); return; } + anetNonBlock(NULL,cfd); + anetEnableTcpNoDelay(NULL,cfd); + + /* Use non-blocking I/O for cluster messages. 
*/ /* IPV6: might want to wrap a v6 address in [] */ redisLog(REDIS_VERBOSE,"Accepted cluster node %s:%d", cip, cport); /* We need to create a temporary node in order to read the incoming From 853defe0718b8504df2e83202846a86cf5121659 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 3 Sep 2013 11:43:07 +0200 Subject: [PATCH 0170/2500] Cluster: clusterReadHandler() reworked to be more correct and simpler to follow. --- src/cluster.c | 83 +++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 42 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 8625cee1f..b0b24c5d8 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1169,53 +1169,52 @@ void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask) { REDIS_NOTUSED(el); REDIS_NOTUSED(mask); -again: - rcvbuflen = sdslen(link->rcvbuf); - if (rcvbuflen < 4) { - /* First, obtain the first four bytes to get the full message - * length. */ - readlen = 4 - rcvbuflen; - } else { - /* Finally read the full message. */ - hdr = (clusterMsg*) link->rcvbuf; - if (rcvbuflen == 4) { - /* Perform some sanity check on the message length. */ - if (ntohl(hdr->totlen) < CLUSTERMSG_MIN_LEN) { - redisLog(REDIS_WARNING, - "Bad message length received from Cluster bus."); - handleLinkIOError(link); - return; + while(1) { /* Read as long as there is data to read. */ + rcvbuflen = sdslen(link->rcvbuf); + if (rcvbuflen < 4) { + /* First, obtain the first four bytes to get the full message + * length. */ + readlen = 4 - rcvbuflen; + } else { + /* Finally read the full message. */ + hdr = (clusterMsg*) link->rcvbuf; + if (rcvbuflen == 4) { + /* Perform some sanity check on the message length. 
*/ + if (ntohl(hdr->totlen) < CLUSTERMSG_MIN_LEN) { + redisLog(REDIS_WARNING, + "Bad message length received from Cluster bus."); + handleLinkIOError(link); + return; + } } + readlen = ntohl(hdr->totlen) - rcvbuflen; + if (readlen > sizeof(buf)) readlen = sizeof(buf); } - readlen = ntohl(hdr->totlen) - rcvbuflen; - } - nread = read(fd,buf,readlen); - if (nread == -1 && errno == EAGAIN) return; /* No more data ready. */ + nread = read(fd,buf,readlen); + if (nread == -1 && errno == EAGAIN) return; /* No more data ready. */ - if (nread <= 0) { - /* I/O error... */ - redisLog(REDIS_DEBUG,"I/O error reading from node link: %s", - (nread == 0) ? "connection closed" : strerror(errno)); - handleLinkIOError(link); - return; - } else { - /* Read data and recast the pointer to the new buffer. */ - link->rcvbuf = sdscatlen(link->rcvbuf,buf,nread); - hdr = (clusterMsg*) link->rcvbuf; - rcvbuflen += nread; - } + if (nread <= 0) { + /* I/O error... */ + redisLog(REDIS_DEBUG,"I/O error reading from node link: %s", + (nread == 0) ? "connection closed" : strerror(errno)); + handleLinkIOError(link); + return; + } else { + /* Read data and recast the pointer to the new buffer. */ + link->rcvbuf = sdscatlen(link->rcvbuf,buf,nread); + hdr = (clusterMsg*) link->rcvbuf; + rcvbuflen += nread; + } - /* Total length obtained? read the payload now instead of burning - * cycles waiting for a new event to fire. */ - if (rcvbuflen == 4) goto again; - - /* Whole packet in memory? We can process it. */ - if (rcvbuflen == ntohl(hdr->totlen)) { - if (clusterProcessPacket(link)) { - sdsfree(link->rcvbuf); - link->rcvbuf = sdsempty(); - rcvbuflen = 0; /* Useless line of code currently... defensive. */ + /* Total length obtained? Process this packet. */ + if (rcvbuflen >= 4 && rcvbuflen == ntohl(hdr->totlen)) { + if (clusterProcessPacket(link)) { + sdsfree(link->rcvbuf); + link->rcvbuf = sdsempty(); + } else { + return; /* Link no longer valid. 
*/ + } } } } } From 1885c6bada9bd76a37aa32e732a61645a37fb72b Mon Sep 17 00:00:00 2001 From: Maxim Zakharov Date: Wed, 28 Aug 2013 15:44:40 +1000 Subject: [PATCH 0171/2500] A mistype fixed --- src/replication.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/replication.c b/src/replication.c index 27187476b..81323cb60 100644 --- a/src/replication.c +++ b/src/replication.c @@ -189,7 +189,7 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { /* Add the multi bulk reply length. */ aux[0] = '*'; - len = ll2string(aux+1,sizeof(aux-1),argc); + len = ll2string(aux+1,sizeof(aux)-1,argc); aux[len+1] = '\r'; aux[len+2] = '\n'; feedReplicationBacklog(aux,len+3); From ff18243fce4cf8ac9c064b2f70b6315e2eb17bcf Mon Sep 17 00:00:00 2001 From: Maxim Zakharov Date: Wed, 28 Aug 2013 15:34:52 +1000 Subject: [PATCH 0172/2500] mistype fixed --- src/zipmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zipmap.h b/src/zipmap.h index acb25d67a..9cf1b2484 100644 --- a/src/zipmap.h +++ b/src/zipmap.h @@ -32,7 +32,7 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef _ZIMMAP_H +#ifndef _ZIPMAP_H #define _ZIPMAP_H unsigned char *zipmapNew(void); From 6e460eac5853ff8fe2e8b08000535cf5e3822a15 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 4 Sep 2013 10:07:50 +0200 Subject: [PATCH 0173/2500] Cluster: always use safe iterators to iterate server.cluster->nodes. --- src/cluster.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index b0b24c5d8..6cd2eed36 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -545,7 +545,7 @@ void clusterDelNode(clusterNode *delnode) { } /* 2) Remove failure reports.
*/ - di = dictGetIterator(server.cluster->nodes); + di = dictGetSafeIterator(server.cluster->nodes); while((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); @@ -1234,7 +1234,7 @@ void clusterBroadcastMessage(void *buf, size_t len) { dictIterator *di; dictEntry *de; - di = dictGetIterator(server.cluster->nodes); + di = dictGetSafeIterator(server.cluster->nodes); while((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); @@ -1346,7 +1346,7 @@ void clusterBroadcastPong(void) { dictIterator *di; dictEntry *de; - di = dictGetIterator(server.cluster->nodes); + di = dictGetSafeIterator(server.cluster->nodes); while((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); @@ -1647,7 +1647,7 @@ void clusterCron(void) { } /* Iterate nodes to check if we need to flag something as failing */ - di = dictGetIterator(server.cluster->nodes); + di = dictGetSafeIterator(server.cluster->nodes); while((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); time_t now = time(NULL); @@ -1849,7 +1849,7 @@ void clusterUpdateState(void) { dictEntry *de; server.cluster->size = 0; - di = dictGetIterator(server.cluster->nodes); + di = dictGetSafeIterator(server.cluster->nodes); while((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); @@ -1978,7 +1978,7 @@ sds clusterGenNodesDescription(void) { dictEntry *de; int j, start; - di = dictGetIterator(server.cluster->nodes); + di = dictGetSafeIterator(server.cluster->nodes); while((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); From 61eb16c4dab6a1261d075e153c6f864343cb442d Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 4 Sep 2013 10:25:26 +0200 Subject: [PATCH 0174/2500] Cluster: don't save HANDSHAKE nodes in nodes.conf. 
--- src/cluster.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 6cd2eed36..d5fbc5e75 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -45,7 +45,7 @@ void clusterSendFail(char *nodename); void clusterSendFailoverAuthIfNeeded(clusterNode *sender); void clusterUpdateState(void); int clusterNodeGetSlotBit(clusterNode *n, int slot); -sds clusterGenNodesDescription(void); +sds clusterGenNodesDescription(int filter); clusterNode *clusterLookupNode(char *name); int clusterNodeAddSlave(clusterNode *master, clusterNode *slave); int clusterAddSlot(clusterNode *n, int slot); @@ -203,7 +203,7 @@ fmterr: * This function writes the node config and returns 0, on error -1 * is returned. */ int clusterSaveConfig(void) { - sds ci = clusterGenNodesDescription(); + sds ci = clusterGenNodesDescription(REDIS_NODE_HANDSHAKE); int fd; if ((fd = open(server.cluster_configfile,O_WRONLY|O_CREAT|O_TRUNC,0644)) @@ -1972,7 +1972,19 @@ void clusterSetMaster(clusterNode *n) { * CLUSTER command * -------------------------------------------------------------------------- */ -sds clusterGenNodesDescription(void) { +/* Generate a csv-alike representation of the nodes we are aware of, + * including the "myself" node, and return an SDS string containing the + * representation (it is up to the caller to free it). + * + * All the nodes matching at least one of the node flags specified in + * "filter" are excluded from the output, so using zero as a filter will + * include all the known nodes in the representation, including nodes in + * the HANDSHAKE state. + * + * The representation obtained using this function is used for the output + * of the CLUSTER NODES function, and as format for the cluster + * configuration file (nodes.conf) for a given node. 
*/ +sds clusterGenNodesDescription(int filter) { sds ci = sdsempty(); dictIterator *di; dictEntry *de; @@ -1982,6 +1994,8 @@ sds clusterGenNodesDescription(void) { while((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); + if (node->flags & filter) continue; + /* Node coordinates */ ci = sdscatprintf(ci,"%.40s %s:%d ", node->name, @@ -2117,7 +2131,7 @@ void clusterCommand(redisClient *c) { } else if (!strcasecmp(c->argv[1]->ptr,"nodes") && c->argc == 2) { /* CLUSTER NODES */ robj *o; - sds ci = clusterGenNodesDescription(); + sds ci = clusterGenNodesDescription(0); o = createObject(REDIS_STRING,ci); addReplyBulk(c,o); From 232f84ec2dd316a347763e3cc6511dda79264ea0 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 4 Sep 2013 10:32:09 +0200 Subject: [PATCH 0175/2500] Cluster: CLUSTER SAVECONFIG command added. --- src/cluster.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/cluster.c b/src/cluster.c index d5fbc5e75..694c49252 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -2314,6 +2314,14 @@ void clusterCommand(redisClient *c) { (unsigned long)sdslen(info))); addReplySds(c,info); addReply(c,shared.crlf); + } else if (!strcasecmp(c->argv[1]->ptr,"saveconfig") && c->argc == 2) { + int retval = clusterSaveConfig(); + + if (retval == 0) + addReply(c,shared.ok); + else + addReplyErrorFormat(c,"error saving the cluster node config: %s", + strerror(errno)); } else if (!strcasecmp(c->argv[1]->ptr,"keyslot") && c->argc == 3) { /* CLUSTER KEYSLOT */ sds key = c->argv[2]->ptr; From ee99df2d598b6c0f773a02776ee0674a38509385 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 4 Sep 2013 10:35:53 +0200 Subject: [PATCH 0176/2500] redis-cli: fix big keys search when the key no longer exists. The code freed a reply object that was never created, resulting in a segfault every time randomkey returned a key that was deleted before we queried it for size.
--- src/redis-cli.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/redis-cli.c b/src/redis-cli.c index b4ce644f7..19cace328 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -1324,7 +1324,6 @@ static void findBigKeys(void) { } else if (!strcmp(reply2->str,"none")) { freeReplyObject(reply1); freeReplyObject(reply2); - freeReplyObject(reply3); continue; } else { fprintf(stderr, "Unknown key type '%s' for key '%s'\n", From 79a1deac280439a9db7a0793f650d1ae954adece Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 4 Sep 2013 12:41:21 +0200 Subject: [PATCH 0177/2500] Cluster: free HANDSHAKE nodes after node_timeout. Handshake nodes should turn into normal nodes or be freed in a reasonable amount of time, otherwise they'll keep accumulating if the address they are associated with is not reachable for some reason. --- src/cluster.c | 11 +++++++++++ src/redis.h | 1 + 2 files changed, 12 insertions(+) diff --git a/src/cluster.c b/src/cluster.c index 694c49252..f89e5bad6 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -359,6 +359,7 @@ clusterNode *createClusterNode(char *nodename, int flags) { memcpy(node->name, nodename, REDIS_CLUSTER_NAMELEN); else getRandomHexChars(node->name, REDIS_CLUSTER_NAMELEN); + node->ctime = time(NULL); node->flags = flags; memset(node->slots,0,sizeof(node->slots)); node->numslots = 0; @@ -1588,6 +1589,16 @@ void clusterCron(void) { clusterNode *node = dictGetVal(de); if (node->flags & (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR)) continue; + + /* A Node in HANDSHAKE state has a limited lifespan equal to the + * configured node timeout. 
*/ + if (node->flags & REDIS_NODE_HANDSHAKE && + server.unixtime - node->ctime > server.cluster_node_timeout) + { + freeClusterNode(node); + continue; + } + if (node->link == NULL) { int fd; time_t old_ping_sent; diff --git a/src/redis.h b/src/redis.h index 1a0523218..17ea71477 100644 --- a/src/redis.h +++ b/src/redis.h @@ -614,6 +614,7 @@ struct clusterNodeFailReport { } typedef clusterNodeFailReport; struct clusterNode { + time_t ctime; /* Node object creation time. */ char name[REDIS_CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */ int flags; /* REDIS_NODE_... */ unsigned char slots[REDIS_CLUSTER_SLOTS/8]; /* slots handled by this node */ From c7cb80c8bbf907034e7fdd29e32db9f80613db7e Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 4 Sep 2013 15:52:16 +0200 Subject: [PATCH 0178/2500] Cluster: don't add an handshake node for the same ip:port pair multiple times. --- src/cluster.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index f89e5bad6..186f0ac4e 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -674,6 +674,24 @@ void clearNodeFailureIfNeeded(clusterNode *node) { } } +/* Return true if we already have a node in HANDSHAKE state matching the + * specified ip address and port number. This function is used in order to + * avoid adding a new handshake node for the same address multiple times. */ +int clusterHandshakeInProgress(char *ip, int port) { + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (!(node->flags & REDIS_NODE_HANDSHAKE)) continue; + if (!strcasecmp(node->ip,ip) && node->port == port) break; + } + dictReleaseIterator(di); + return de != NULL; +} + /* Process the gossip section of PING or PONG packets. 
* Note that this function assumes that the packet is already sanity-checked * by the caller, not in the content of the gossip section, but in the @@ -736,7 +754,9 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { * Note that we require that the sender of this gossip message * is a well known node in our cluster, otherwise we risk * joining another cluster. */ - if (sender && !(flags & REDIS_NODE_NOADDR)) { + if (sender && !(flags & REDIS_NODE_NOADDR) && + !clusterHandshakeInProgress(g->ip,ntohs(g->port))) + { clusterNode *newnode; redisLog(REDIS_DEBUG,"Adding the new node"); From 90e1829ec42328f36a873df69cebd6be227eb8a7 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 17 Sep 2013 09:46:01 +0200 Subject: [PATCH 0179/2500] Allow AUTH / PING when disconnected from slave and serve-stale-data is no. --- src/redis.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/redis.c b/src/redis.c index a1ff80758..2792f1393 100644 --- a/src/redis.c +++ b/src/redis.c @@ -210,8 +210,8 @@ struct redisCommand redisCommandTable[] = { {"pexpireat",pexpireatCommand,3,"w",0,NULL,1,1,1,0,0}, {"keys",keysCommand,2,"rS",0,NULL,0,0,0,0,0}, {"dbsize",dbsizeCommand,1,"r",0,NULL,0,0,0,0,0}, - {"auth",authCommand,2,"rsl",0,NULL,0,0,0,0,0}, - {"ping",pingCommand,1,"r",0,NULL,0,0,0,0,0}, + {"auth",authCommand,2,"rslt",0,NULL,0,0,0,0,0}, + {"ping",pingCommand,1,"rt",0,NULL,0,0,0,0,0}, {"echo",echoCommand,2,"r",0,NULL,0,0,0,0,0}, {"save",saveCommand,1,"ars",0,NULL,0,0,0,0,0}, {"bgsave",bgsaveCommand,1,"ar",0,NULL,0,0,0,0,0}, From 3f5034d1d7eb009caa30a915801b8fdfd7f65927 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 20 Sep 2013 09:22:21 +0200 Subject: [PATCH 0180/2500] Cluster: added time field in cluster bus messages. The time is sent in requests, and copied back in reply packets. This way the receiver can compare the time field in a reply with its local clock and check the age of the request associated with this reply. 
This is an easy way to discard delayed replies. Note that only a clock is used here, that is the one of the node sending the packet. The receiver only copies the field back into the reply, so no synchronization is needed between clocks of different hosts. --- src/cluster.c | 18 ++++++++++++------ src/redis.h | 3 +++ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 186f0ac4e..f85b75690 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -42,7 +42,7 @@ void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask); void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask); void clusterSendPing(clusterLink *link, int type); void clusterSendFail(char *nodename); -void clusterSendFailoverAuthIfNeeded(clusterNode *sender); +void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request); void clusterUpdateState(void); int clusterNodeGetSlotBit(clusterNode *n, int slot); sds clusterGenNodesDescription(int filter); @@ -1134,7 +1134,7 @@ int clusterProcessPacket(clusterLink *link) { if (!sender) return 1; /* We don't know that node. */ /* If we are not a master, ignore that message at all. */ if (!(server.cluster->myself->flags & REDIS_NODE_MASTER)) return 0; - clusterSendFailoverAuthIfNeeded(sender); + clusterSendFailoverAuthIfNeeded(sender,hdr); } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) { if (!sender) return 1; /* We don't know that node. */ /* If this is a master, increment the number of acknowledges @@ -1464,11 +1464,14 @@ void clusterRequestFailoverAuth(void) { clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST); totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData); hdr->totlen = htonl(totlen); + hdr->time = mstime(); clusterBroadcastMessage(buf,totlen); } -/* Send a FAILOVER_AUTH_ACK message to the specified node. */ -void clusterSendFailoverAuth(clusterNode *node) { +/* Send a FAILOVER_AUTH_ACK message to the specified node. 
+ * Reqtime is the time field from the original failover auth request packet, + * so that the receiver is able to check the reply age. */ +void clusterSendFailoverAuth(clusterNode *node, uint64_t reqtime) { unsigned char buf[4096]; clusterMsg *hdr = (clusterMsg*) buf; uint32_t totlen; @@ -1477,11 +1480,14 @@ void clusterSendFailoverAuth(clusterNode *node) { clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK); totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData); hdr->totlen = htonl(totlen); + hdr->time = reqtime; clusterSendMessage(node->link,buf,totlen); } /* If we believe 'node' is the "first slave" of it's master, reply with * a FAILOVER_AUTH_GRANTED packet. + * The 'request' field points to the authorization request packet header, we + * need it in order to copy back the 'time' field in our reply. * * To be a first slave the sender must: * 1) Be a slave. @@ -1489,7 +1495,7 @@ void clusterSendFailoverAuth(clusterNode *node) { * 3) Ordering all the slaves IDs for its master by run-id, it should be the * first (the smallest) among the ones not in FAIL / PFAIL state. */ -void clusterSendFailoverAuthIfNeeded(clusterNode *node) { +void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { char first[REDIS_CLUSTER_NAMELEN]; clusterNode *master = node->slaveof; int j; @@ -1514,7 +1520,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node) { if (memcmp(node->name,first,sizeof(first)) != 0) return; /* We can send the packet. */ - clusterSendFailoverAuth(node); + clusterSendFailoverAuth(node,request->time); } /* This function is called if we are a slave node and our master serving diff --git a/src/redis.h b/src/redis.h index 17ea71477..6e19ea3bd 100644 --- a/src/redis.h +++ b/src/redis.h @@ -704,6 +704,9 @@ typedef struct { uint32_t totlen; /* Total length of this message */ uint16_t type; /* Message type */ uint16_t count; /* Only used for some kind of messages. 
*/ + uint64_t time; /* Time at which this request was sent (in milliseconds), + this field is copied in reply messages so that the + original sender knows how old the reply is. */ char sender[REDIS_CLUSTER_NAMELEN]; /* Name of the sender node */ unsigned char myslots[REDIS_CLUSTER_SLOTS/8]; char slaveof[REDIS_CLUSTER_NAMELEN]; From 3a9bf5e6185a311c8c7d688dc8176c5d88d3ddca Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 20 Sep 2013 11:26:44 +0200 Subject: [PATCH 0181/2500] Cluster: PFAIL -> FAIL transition allowed for slaves. First change: now there is no need to be a master in order to detect a failure, however the majority of masters signaling PFAIL or FAIL is needed. This change is important because it allows slaves rejoining the cluster after a partition to sense the FAIL condition so that eventually all the nodes agree on failures. --- src/cluster.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index f85b75690..e562c00c1 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -594,25 +594,36 @@ void clusterRenameNode(clusterNode *node, char *newname) { /* This function checks if a given node should be marked as FAIL. * It happens if the following conditions are met: * - * 1) We are a master node. Only master nodes can mark a node as failing. - * 2) We received enough failure reports from other nodes via gossip. - * Enough means that the majority of the masters believe the node is - * down. - * 3) We believe this node is in PFAIL state. + * 1) We received enough failure reports from other master nodes via gossip. + * Enough means that the majority of the masters signaled the node is + * down recently. + * 2) We believe this node is in PFAIL state. * * If a failure is detected we also inform the whole cluster about this * event trying to force every other node to set the FAIL flag for the node. 
+ * + * Note that the form of agreement used here is weak, as we collect the majority + * of masters state during some time, and even if we force agreement by + * propagating the FAIL message, because of partitions we may not reach every + * node. However: + * + * 1) Either we reach the majority and eventually the FAIL state will propagate + * to all the cluster. + * 2) Or there is no majority so no slave promotion will be authorized and the + * FAIL flag will be cleared after some time. */ void markNodeAsFailingIfNeeded(clusterNode *node) { int failures; int needed_quorum = (server.cluster->size / 2) + 1; - if (!(server.cluster->myself->flags & REDIS_NODE_MASTER)) return; if (!(node->flags & REDIS_NODE_PFAIL)) return; /* We can reach it. */ if (node->flags & REDIS_NODE_FAIL) return; /* Already FAILing. */ - failures = 1 + clusterNodeFailureReportsCount(node); /* +1 is for myself. */ - if (failures < needed_quorum) return; + failures = clusterNodeFailureReportsCount(node); + /* Also count myself as a voter if I'm a master. */ + if (server.cluster->myself->flags & REDIS_NODE_MASTER) + failures += 1; + if (failures < needed_quorum) return; /* No weak agreement from masters. */ redisLog(REDIS_NOTICE, "Marking node %.40s as failing (quorum reached).", node->name); @@ -622,8 +633,10 @@ void markNodeAsFailingIfNeeded(clusterNode *node) { node->flags |= REDIS_NODE_FAIL; node->fail_time = time(NULL); - /* Broadcast the failing node name to everybody */ - clusterSendFail(node->name); + /* Broadcast the failing node name to everybody, forcing all the other + * reachable nodes to flag the node as FAIL. */ + if (server.cluster->myself->flags & REDIS_NODE_MASTER) + clusterSendFail(node->name); clusterUpdateState(); clusterSaveConfigOrDie(); } From 98d1253053f580f53c077597d328dec81e9c6998 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 25 Sep 2013 09:26:36 +0200 Subject: [PATCH 0182/2500] htonu64() and ntohu64 added to endianconv.h. 
--- src/endianconv.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/endianconv.h b/src/endianconv.h index 7afe61c62..7c16e175f 100644 --- a/src/endianconv.h +++ b/src/endianconv.h @@ -61,4 +61,14 @@ uint64_t intrev64(uint64_t v); #define intrev64ifbe(v) intrev64(v) #endif +/* The functions htonu64() and ntohu64() convert the specified value to + * network byte ordering and back. In big endian systems they are no-ops. */ +#if (BYTE_ORDER == BIG_ENDIAN) +#define htonu64(v) (v) +#define ntohu64(v) (v) +#else +#define htonu64(v) intrev64(v) +#define ntohu64(v) intrev64(v) +#endif + #endif From cdf4eede587ab7fada61c8c3abe695fc26b69200 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 25 Sep 2013 11:47:13 +0200 Subject: [PATCH 0183/2500] Cluster: configEpoch added in cluster nodes description. --- src/cluster.c | 28 ++++++++++++++++++++++++++-- src/redis.h | 5 +++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index e562c00c1..43a8133f9 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -59,6 +59,23 @@ int bitmapTestBit(unsigned char *bitmap, int pos); * Initialization * -------------------------------------------------------------------------- */ +/* This function is called at startup in order to set the currentEpoch + * (which is not saved on permanent storage) to the greatest configEpoch found + * in the loaded nodes (configEpoch is stored on permanent storage as soon as + * it changes for some node). 
*/ +void clusterSetStartupEpoch() { + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + if (node->configEpoch > server.cluster->currentEpoch) + server.cluster->currentEpoch = node->configEpoch; + } + dictReleaseIterator(di); +} + int clusterLoadConfig(char *filename) { FILE *fp = fopen(filename,"r"); char *line; @@ -143,8 +160,11 @@ int clusterLoadConfig(char *filename) { if (atoi(argv[4])) n->ping_sent = time(NULL); if (atoi(argv[5])) n->pong_received = time(NULL); + /* Set configEpoch for this node. */ + n->configEpoch = strtoull(argv[6],NULL,10); + /* Populate hash slots served by this instance. */ - for (j = 7; j < argc; j++) { + for (j = 8; j < argc; j++) { int start, stop; if (argv[j][0] == '[') { @@ -189,6 +209,7 @@ int clusterLoadConfig(char *filename) { redisAssert(server.cluster->myself != NULL); redisLog(REDIS_NOTICE,"Node configuration loaded, I'm %.40s", server.cluster->myself->name); + clusterSetStartupEpoch(); clusterUpdateState(); return REDIS_OK; @@ -230,6 +251,7 @@ void clusterInit(void) { server.cluster = zmalloc(sizeof(clusterState)); server.cluster->myself = NULL; + server.cluster->currentEpoch = 0; server.cluster->state = REDIS_CLUSTER_FAIL; server.cluster->size = 1; server.cluster->nodes = dictCreate(&clusterNodesDictType,NULL); @@ -360,6 +382,7 @@ clusterNode *createClusterNode(char *nodename, int flags) { else getRandomHexChars(node->name, REDIS_CLUSTER_NAMELEN); node->ctime = time(NULL); + node->configEpoch = 0; node->flags = flags; memset(node->slots,0,sizeof(node->slots)); node->numslots = 0; @@ -2071,9 +2094,10 @@ sds clusterGenNodesDescription(int filter) { ci = sdscatprintf(ci,"- "); /* Latency from the POV of this node, link status */ - ci = sdscatprintf(ci,"%ld %ld %s", + ci = sdscatprintf(ci,"%ld %ld %llu %s", (long) node->ping_sent, (long) node->pong_received, + (unsigned long long) node->configEpoch, (node->link || 
node->flags & REDIS_NODE_MYSELF) ? "connected" : "disconnected"); diff --git a/src/redis.h b/src/redis.h index 6e19ea3bd..cd2495fd8 100644 --- a/src/redis.h +++ b/src/redis.h @@ -617,6 +617,7 @@ struct clusterNode { time_t ctime; /* Node object creation time. */ char name[REDIS_CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */ int flags; /* REDIS_NODE_... */ + uint64_t configEpoch; /* Last configEpoch observed for this node */ unsigned char slots[REDIS_CLUSTER_SLOTS/8]; /* slots handled by this node */ int numslots; /* Number of slots handled by this node */ int numslaves; /* Number of slave nodes, if this is a master */ @@ -634,6 +635,7 @@ typedef struct clusterNode clusterNode; typedef struct { clusterNode *myself; /* This node */ + uint64_t currentEpoch; int state; /* REDIS_CLUSTER_OK, REDIS_CLUSTER_FAIL, ... */ int size; /* Num of master nodes with at least one slot */ dict *nodes; /* Hash table of name -> clusterNode structures */ @@ -707,6 +709,9 @@ typedef struct { uint64_t time; /* Time at which this request was sent (in milliseconds), this field is copied in reply messages so that the original sender knows how old the reply is. */ + uint64_t currentEpoch; /* The epoch accordingly to the sending node. */ + uint64_t configEpoch; /* The config epoch if it's a master, or the last epoch + advertised by its master if it is a slave. */ char sender[REDIS_CLUSTER_NAMELEN]; /* Name of the sender node */ unsigned char myslots[REDIS_CLUSTER_SLOTS/8]; char slaveof[REDIS_CLUSTER_NAMELEN]; From 1adf457b5bfe59c780b363e7290e590b2943ab39 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 25 Sep 2013 11:53:35 +0200 Subject: [PATCH 0184/2500] Cluster: broadcast currentEpoch and configEpoch in packets header. 
--- src/cluster.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index 43a8133f9..f1cdb915e 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1321,12 +1321,20 @@ void clusterBuildMessageHdr(clusterMsg *hdr, int type) { hdr->flags = htons(server.cluster->myself->flags); hdr->state = server.cluster->state; + /* Set the currentEpoch and configEpochs. Note that configEpoch is + * set to the master configEpoch if this node is a slave. */ + hdr->currentEpoch = htonu64(server.cluster->currentEpoch); + if (server.cluster->myself->flags & REDIS_NODE_SLAVE) + hdr->configEpoch = htonu64(server.cluster->myself->slaveof->configEpoch); + else + hdr->configEpoch = htonu64(server.cluster->myself->configEpoch); + if (type == CLUSTERMSG_TYPE_FAIL) { totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData); totlen += sizeof(clusterMsgDataFail); } hdr->totlen = htonl(totlen); - /* For PING, PONG, and MEET, fixing the totlen field is up to the caller */ + /* For PING, PONG, and MEET, fixing the totlen field is up to the caller. */ } /* Send a PING or PONG packet to the specified node, making sure to add enough From 6dbd939a2451e7e3176227bd3318a8d62a552a65 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 25 Sep 2013 12:36:29 +0200 Subject: [PATCH 0185/2500] Cluster: update our currentEpoch when a greater one is seen. 
--- src/cluster.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index f1cdb915e..c5837647f 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -873,6 +873,7 @@ int clusterProcessPacket(clusterLink *link) { uint32_t totlen = ntohl(hdr->totlen); uint16_t type = ntohs(hdr->type); uint16_t flags = ntohs(hdr->flags); + uint64_t senderCurrentEpoch, senderConfigEpoch; clusterNode *sender; redisLog(REDIS_DEBUG,"--- Processing packet of type %d, %lu bytes", @@ -909,9 +910,17 @@ int clusterProcessPacket(clusterLink *link) { if (totlen != explen) return 1; } - /* Process packets by type. */ + /* Check if the sender is known. + * If it is, update our currentEpoch to its epoch if greater than our. */ sender = clusterLookupNode(hdr->sender); + if (sender && !(sender->flags & REDIS_NODE_HANDSHAKE)) { + senderCurrentEpoch = ntohu64(hdr->currentEpoch); + senderConfigEpoch = ntohu64(hdr->configEpoch); + if (senderCurrentEpoch > server.cluster->currentEpoch) + server.cluster->currentEpoch = senderCurrentEpoch; + } + /* Process packets by type. */ if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_MEET) { int update_config = 0; redisLog(REDIS_DEBUG,"Ping packet received: %p", (void*)link->node); From 24b28941941fb364fb41e12c0a61c1423d6b9003 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 25 Sep 2013 12:38:36 +0200 Subject: [PATCH 0186/2500] Cluster: add currentEpoch to CLUSTER INFO. 
--- src/cluster.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index c5837647f..ede00f793 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -2393,13 +2393,15 @@ void clusterCommand(redisClient *c) { "cluster_slots_fail:%d\r\n" "cluster_known_nodes:%lu\r\n" "cluster_size:%d\r\n" + "cluster_current_epoch:%llu\r\n" , statestr[server.cluster->state], slots_assigned, slots_ok, slots_pfail, slots_fail, dictSize(server.cluster->nodes), - server.cluster->size + server.cluster->size, + (unsigned long long) server.cluster->currentEpoch ); addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n", (unsigned long)sdslen(info))); From 2cac667a8b68238354404f80e5d4a1c2d1b93e3b Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 25 Sep 2013 12:44:47 +0200 Subject: [PATCH 0187/2500] Cluster: fix redis-trib for added configEpoch field in CLUSTER NODES. --- src/redis-trib.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/redis-trib.rb b/src/redis-trib.rb index 96fdddc1c..1dc18e957 100755 --- a/src/redis-trib.rb +++ b/src/redis-trib.rb @@ -118,7 +118,7 @@ class ClusterNode nodes.each{|n| # name addr flags role ping_sent ping_recv link_status slots split = n.split - name,addr,flags,role,ping_sent,ping_recv,link_status = split[0..6] + name,addr,flags,role,ping_sent,ping_recv,config_epoch,link_status = split[0..6] slots = split[7..-1] info = { :name => name, From d392f33abb632abf59db43d6cb4639ec22f02766 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 25 Sep 2013 12:51:01 +0200 Subject: [PATCH 0188/2500] Cluster: fix redis-trib node config fingerprinting for new nodes format. 
--- src/redis-trib.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/redis-trib.rb b/src/redis-trib.rb index 1dc18e957..4b7acea42 100755 --- a/src/redis-trib.rb +++ b/src/redis-trib.rb @@ -119,7 +119,7 @@ class ClusterNode # name addr flags role ping_sent ping_recv link_status slots split = n.split name,addr,flags,role,ping_sent,ping_recv,config_epoch,link_status = split[0..6] - slots = split[7..-1] + slots = split[8..-1] info = { :name => name, :addr => addr, @@ -230,7 +230,7 @@ class ClusterNode config = [] @r.cluster("nodes").each_line{|l| s = l.split - slots = s[7..-1].select {|x| x[0..0] != "["} + slots = s[8..-1].select {|x| x[0..0] != "["} next if slots.length == 0 config << s[0]+":"+(slots.sort.join(",")) } From 17ce9a8da0b4133e137b7abc0c7e850bfa20ed11 Mon Sep 17 00:00:00 2001 From: Michel Martens Date: Sat, 21 Sep 2013 21:36:32 +0200 Subject: [PATCH 0189/2500] Document the redis-cli --csv option. --- src/redis-cli.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/redis-cli.c b/src/redis-cli.c index 19cace328..3b73bb908 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -784,6 +784,7 @@ static void usage() { " -c Enable cluster mode (follow -ASK and -MOVED redirections)\n" " --raw Use raw formatting for replies (default when STDOUT is\n" " not a tty)\n" +" --csv Output in CSV format\n" " --latency Enter a special mode continuously sampling latency\n" " --latency-history Like --latency but tracking latency changes over time.\n" " Default time interval is 15 sec. Change it using -i.\n" From f94165009152a93e95d6bae078850852326e901c Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 26 Sep 2013 11:13:17 +0200 Subject: [PATCH 0190/2500] Cluster: slave node now uses the new protocol to get elected. 
--- src/cluster.c | 52 ++++++++++++++++++++++++++++++++++++++------------ src/redis.h | 11 ++++++++--- src/sentinel.c | 2 -- 3 files changed, 48 insertions(+), 17 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index ede00f793..9ce2905e8 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -257,6 +257,7 @@ void clusterInit(void) { server.cluster->nodes = dictCreate(&clusterNodesDictType,NULL); server.cluster->failover_auth_time = 0; server.cluster->failover_auth_count = 0; + server.cluster->failover_auth_epoch = 0; memset(server.cluster->migrating_slots_to,0, sizeof(server.cluster->migrating_slots_to)); memset(server.cluster->importing_slots_from,0, @@ -1581,16 +1582,22 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { * * The gaol of this function is: * 1) To check if we are able to perform a failover, is our data updated? - * 2) Ask reachable masters the authorization to perform the failover. + * 2) Try to get elected by masters. * 3) Check if there is the majority of masters agreeing we should failover. * 4) Perform the failover informing all the other nodes. */ void clusterHandleSlaveFailover(void) { time_t data_age = server.unixtime - server.repl_down_since; - time_t auth_age = server.unixtime - server.cluster->failover_auth_time; + mstime_t auth_age = mstime() - server.cluster->failover_auth_time; int needed_quorum = (server.cluster->size / 2) + 1; int j; + /* Remove the node timeout from the data age as it is fine that we are + * disconnected from our master at least for the time it was down to be + * flagged as FAIL, that's the baseline. */ + if (data_age > server.cluster_node_timeout) + data_age -= server.cluster_node_timeout; + /* Check if our data is recent enough. For now we just use a fixed * constant of ten times the node timeout since the cluster should * react much faster to a master down. 
*/ @@ -1598,19 +1605,37 @@ void clusterHandleSlaveFailover(void) { server.cluster_node_timeout * REDIS_CLUSTER_SLAVE_VALIDITY_MULT) return; - /* TODO: check if we are the first slave as well? Or just rely on the - * master authorization? */ - - /* Ask masters if we are authorized to perform the failover. If there - * is a pending auth request that's too old, reset it. */ + /* Compute the time at which we can start an election. */ if (server.cluster->failover_auth_time == 0 || auth_age > - server.cluster_node_timeout * REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT) + server.cluster_node_timeout * 1000 * REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT) { - redisLog(REDIS_WARNING,"Asking masters if I can failover..."); - server.cluster->failover_auth_time = time(NULL); + server.cluster->failover_auth_time = mstime() + + 500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */ + data_age * 100 + /* Add 100 milliseconds for every second of age. */ + random() % 500; /* Random delay between 0 and 500 milliseconds. */ server.cluster->failover_auth_count = 0; + server.cluster->failover_auth_sent = 0; + redisLog(REDIS_WARNING,"Start of election delayed for %lld milliseconds.", + server.cluster->failover_auth_time - mstime()); + return; + } + + /* Return ASAP if we can't still start the election. */ + if (mstime() < server.cluster->failover_auth_time) return; + + /* Return ASAP if the election is too old to be valid. */ + if (mstime() - server.cluster->failover_auth_time > server.cluster_node_timeout) + return; + + /* Ask for votes if needed. */ + if (server.cluster->failover_auth_sent == 0) { + server.cluster->currentEpoch++; + server.cluster->failover_auth_epoch = server.cluster->currentEpoch; + redisLog(REDIS_WARNING,"Starting a failover election for epoch %llu.", + server.cluster->currentEpoch); clusterRequestFailoverAuth(); + server.cluster->failover_auth_sent = 1; return; /* Wait for replies. 
*/ } @@ -1619,7 +1644,7 @@ void clusterHandleSlaveFailover(void) { clusterNode *oldmaster = server.cluster->myself->slaveof; redisLog(REDIS_WARNING, - "Masters quorum reached: failing over my (failing) master."); + "Failover election won: failing over my (failing) master."); /* We have the quorum, perform all the steps to correctly promote * this slave to a master. * @@ -1644,7 +1669,10 @@ void clusterHandleSlaveFailover(void) { * accordingly and detect that we switched to master role. */ clusterBroadcastPong(); - /* 4) Update state and save config. */ + /* 4) Update my configEpoch to the epoch of the election. */ + server.cluster->myself->configEpoch = server.cluster->failover_auth_epoch; + + /* 5) Update state and save config. */ clusterUpdateState(); clusterSaveConfigOrDie(); } diff --git a/src/redis.h b/src/redis.h index cd2495fd8..2b7ca7a04 100644 --- a/src/redis.h +++ b/src/redis.h @@ -368,6 +368,8 @@ * Data types *----------------------------------------------------------------------------*/ +typedef long long mstime_t; /* millisecond time type. */ + /* A redis object, that is a type able to hold a string / list / set */ /* The actual Redis Object */ @@ -581,7 +583,7 @@ typedef struct redisOpArray { #define REDIS_CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */ #define REDIS_CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */ #define REDIS_CLUSTER_SLAVE_VALIDITY_MULT 10 /* Slave data validity. */ -#define REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT 1 /* Auth request retry time. */ +#define REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT 4 /* Auth request retry time. */ #define REDIS_CLUSTER_FAILOVER_DELAY 5 /* Seconds */ struct clusterNode; @@ -643,8 +645,11 @@ typedef struct { clusterNode *importing_slots_from[REDIS_CLUSTER_SLOTS]; clusterNode *slots[REDIS_CLUSTER_SLOTS]; zskiplist *slots_to_keys; - int failover_auth_time; /* Time at which we sent the AUTH request. */ - int failover_auth_count; /* Number of authorizations received. 
*/ + /* The following fields are used to take the slave state on elections. */ + mstime_t failover_auth_time;/* Time at which we'll try to get elected in ms. */ + int failover_auth_count; /* Number of votes received so far. */ + int failover_auth_sent; /* True if we already asked for votes. */ + uint64_t failover_auth_epoch; /* Epoch of the current election. */ } clusterState; /* Redis cluster messages header */ diff --git a/src/sentinel.c b/src/sentinel.c index b257ad685..4bea156d6 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -43,8 +43,6 @@ extern char **environ; /* ======================== Sentinel global state =========================== */ -typedef long long mstime_t; /* millisecond time type. */ - /* Address object, used to describe an ip:port pair. */ typedef struct sentinelAddr { char *ip; From 3bd69bcdf11a555c0282d86b2de13d1ac3c12d0b Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 26 Sep 2013 13:00:41 +0200 Subject: [PATCH 0191/2500] Cluster: master node now uses new protocol to vote. 
--- src/cluster.c | 69 +++++++++++++++++++++++++-------------------------- src/redis.h | 3 +++ 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 9ce2905e8..24d3efe33 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -258,6 +258,7 @@ void clusterInit(void) { server.cluster->failover_auth_time = 0; server.cluster->failover_auth_count = 0; server.cluster->failover_auth_epoch = 0; + server.cluster->last_vote_epoch = 0; memset(server.cluster->migrating_slots_to,0, sizeof(server.cluster->migrating_slots_to)); memset(server.cluster->importing_slots_from,0, @@ -396,6 +397,7 @@ clusterNode *createClusterNode(char *nodename, int flags) { memset(node->ip,0,sizeof(node->ip)); node->port = 0; node->fail_reports = listCreate(); + node->voted_time = 0; listSetFreeMethod(node->fail_reports,zfree); return node; } @@ -1178,15 +1180,18 @@ int clusterProcessPacket(clusterLink *link) { } } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST) { if (!sender) return 1; /* We don't know that node. */ - /* If we are not a master, ignore that message at all. */ - if (!(server.cluster->myself->flags & REDIS_NODE_MASTER)) return 0; clusterSendFailoverAuthIfNeeded(sender,hdr); } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) { if (!sender) return 1; /* We don't know that node. */ - /* If this is a master, increment the number of acknowledges - * we received so far. */ - if (sender->flags & REDIS_NODE_MASTER) + /* We consider this vote only if the sender if a master serving + * a non zero number of slots, with the currentEpoch that is equal + * to our currentEpoch. 
*/ + if (sender->flags & REDIS_NODE_MASTER && + sender->numslots > 0 && + senderCurrentEpoch == server.cluster->currentEpoch) + { server.cluster->failover_auth_count++; + } } else { redisLog(REDIS_WARNING,"Received unknown packet type: %d", type); } @@ -1538,43 +1543,38 @@ void clusterSendFailoverAuth(clusterNode *node, uint64_t reqtime) { clusterSendMessage(node->link,buf,totlen); } -/* If we believe 'node' is the "first slave" of it's master, reply with - * a FAILOVER_AUTH_GRANTED packet. - * The 'request' field points to the authorization request packet header, we - * need it in order to copy back the 'time' field in our reply. - * - * To be a first slave the sender must: - * 1) Be a slave. - * 2) Its master should be in FAIL state. - * 3) Ordering all the slaves IDs for its master by run-id, it should be the - * first (the smallest) among the ones not in FAIL / PFAIL state. - */ +/* Vote for the node asking for our vote if there are the conditions. */ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { - char first[REDIS_CLUSTER_NAMELEN]; clusterNode *master = node->slaveof; - int j; + uint64_t requestEpoch = ntohu64(request->currentEpoch); - /* Node is a slave? Its master is down? */ + /* IF we are not a master serving at least 1 slot, we don't have the + * right to vote, as the cluster size in Redis Cluster is the number + * of masters serving at least one slot, and quorum is the cluster size + 1 */ + if (!(server.cluster->myself->flags & REDIS_NODE_MASTER)) return; + if (server.cluster->myself->numslots == 0) return; + + /* Request epoch must be >= our currentEpoch. */ + if (requestEpoch < server.cluster->currentEpoch) return; + + /* I already voted for this epoch? Return ASAP. */ + if (server.cluster->last_vote_epoch == server.cluster->currentEpoch) return; + + /* Node must be a slave and its master down. 
*/ if (!(node->flags & REDIS_NODE_SLAVE) || master == NULL || !(master->flags & REDIS_NODE_FAIL)) return; - /* Iterate all the master slaves to check what's the first one. */ - memset(first,0xff,sizeof(first)); - for (j = 0; j < master->numslaves; j++) { - clusterNode *slave = master->slaves[j]; + /* We did not voted for a slave about this master for two + * times the node timeout. This is not strictly needed for correctness + * of the algorithm but makes the base case more linear. */ + if (server.unixtime - node->slaveof->voted_time < + server.cluster_node_timeout * 2) return; - if (slave->flags & (REDIS_NODE_FAIL|REDIS_NODE_PFAIL)) continue; - if (memcmp(slave->name,first,sizeof(first)) < 0) { - memcpy(first,slave->name,sizeof(first)); - } - } - - /* Is 'node' the first slave? */ - if (memcmp(node->name,first,sizeof(first)) != 0) return; - - /* We can send the packet. */ + /* We can vote for this slave. */ clusterSendFailoverAuth(node,request->time); + server.cluster->last_vote_epoch = server.cluster->currentEpoch; + node->slaveof->voted_time = server.unixtime; } /* This function is called if we are a slave node and our master serving @@ -1583,8 +1583,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { * The gaol of this function is: * 1) To check if we are able to perform a failover, is our data updated? * 2) Try to get elected by masters. - * 3) Check if there is the majority of masters agreeing we should failover. - * 4) Perform the failover informing all the other nodes. + * 3) Perform the failover informing all the other nodes. 
*/ void clusterHandleSlaveFailover(void) { time_t data_age = server.unixtime - server.repl_down_since; diff --git a/src/redis.h b/src/redis.h index 2b7ca7a04..66c751a12 100644 --- a/src/redis.h +++ b/src/redis.h @@ -628,6 +628,7 @@ struct clusterNode { time_t ping_sent; /* Unix time we sent latest ping */ time_t pong_received; /* Unix time we received the pong */ time_t fail_time; /* Unix time when FAIL flag was set */ + time_t voted_time; /* Last time we voted for a slave of this master */ char ip[REDIS_IP_STR_LEN]; /* Latest known IP address of this node */ int port; /* Latest known port of this node */ clusterLink *link; /* TCP/IP link with this node */ @@ -650,6 +651,8 @@ typedef struct { int failover_auth_count; /* Number of votes received so far. */ int failover_auth_sent; /* True if we already asked for votes. */ uint64_t failover_auth_epoch; /* Epoch of the current election. */ + /* The followign fields are uesd by masters to take state on elections. */ + uint64_t last_vote_epoch; /* Epoch of the last vote granted. */ } clusterState; /* Redis cluster messages header */ From 7dfa4c59810014ddef09ff502151dc00234c99b1 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 26 Sep 2013 13:28:19 +0200 Subject: [PATCH 0192/2500] Cluster: removed an old source of delay to start the slave failover. --- src/cluster.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 24d3efe33..d85086069 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1624,7 +1624,7 @@ void clusterHandleSlaveFailover(void) { if (mstime() < server.cluster->failover_auth_time) return; /* Return ASAP if the election is too old to be valid. */ - if (mstime() - server.cluster->failover_auth_time > server.cluster_node_timeout) + if (mstime() - server.cluster->failover_auth_time > server.cluster_node_timeout * 1000) return; /* Ask for votes if needed. 
*/ @@ -1835,16 +1835,10 @@ void clusterCron(void) { } /* If we are a slave and our master is down, but is serving slots, - * call the function that handles the failover. - * This function is called with a small delay in order to let the - * FAIL message to propagate after failure detection, this is not - * strictly required but makes 99.99% of failovers mechanically - * simpler. */ + * call the function that handles the failover. */ if (server.cluster->myself->flags & REDIS_NODE_SLAVE && server.cluster->myself->slaveof && server.cluster->myself->slaveof->flags & REDIS_NODE_FAIL && - (server.unixtime - server.cluster->myself->slaveof->fail_time) > - REDIS_CLUSTER_FAILOVER_DELAY && server.cluster->myself->slaveof->numslots != 0) { clusterHandleSlaveFailover(); From c8d6bc94e4cd074af0679f17ac7cd0ff821eb449 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 26 Sep 2013 16:54:43 +0200 Subject: [PATCH 0193/2500] Cluster: react faster when a slave wins an election. --- src/cluster.c | 55 +++++++++++++++++++++++++++++++++++++-------------- src/redis.c | 3 +++ src/redis.h | 4 +++- 3 files changed, 46 insertions(+), 16 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index d85086069..dd496be4c 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -53,6 +53,7 @@ int clusterDelSlot(int slot); int clusterDelNodeSlots(clusterNode *node); int clusterNodeSetSlotBit(clusterNode *n, int slot); void clusterSetMaster(clusterNode *n); +void clusterHandleSlaveFailover(void); int bitmapTestBit(unsigned char *bitmap, int pos); /* ----------------------------------------------------------------------------- @@ -1191,6 +1192,9 @@ int clusterProcessPacket(clusterLink *link) { senderCurrentEpoch == server.cluster->currentEpoch) { server.cluster->failover_auth_count++; + /* Maybe we reached a quorum here, set a flag to make sure + * we check ASAP. 
*/ + server.cluster->handle_slave_failover_asap++; } } else { redisLog(REDIS_WARNING,"Received unknown packet type: %d", type); @@ -1291,7 +1295,11 @@ void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask) { } } -/* Put stuff into the send buffer. */ +/* Put stuff into the send buffer. + * + * It is guaranteed that this function will never have as a side effect + * the link to be invalidated, so it is safe to call this function + * from event handlers that will do stuff with the same link later. */ void clusterSendMessage(clusterLink *link, unsigned char *msg, size_t msglen) { if (sdslen(link->sndbuf) == 0 && msglen != 0) aeCreateFileEvent(server.el,link->fd,AE_WRITABLE, @@ -1301,7 +1309,11 @@ void clusterSendMessage(clusterLink *link, unsigned char *msg, size_t msglen) { } /* Send a message to all the nodes that are part of the cluster having - * a connected link. */ + * a connected link. + * + * It is guaranteed that this function will never have as a side effect + * some node->link to be invalidated, so it is safe to call this function + * from event handlers that will do stuff with node links later. */ void clusterBroadcastMessage(void *buf, size_t len) { dictIterator *di; dictEntry *de; @@ -1416,10 +1428,11 @@ void clusterSendPing(clusterLink *link, int type) { clusterSendMessage(link,buf,totlen); } -/* Send a PONG packet to every connected node that's not in handshake state. +/* Send a PONG packet to every connected node that's not in handshake state + * and for which we have a valid link. * - * In Redis Cluster pings are not just used for failure detection, but also - * to carry important configuration informations. So broadcasting a pong is + * In Redis Cluster pongs are not used just for failure detection, but also + * to carry important configuration information. So broadcasting a pong is * useful when something changes in the configuration and we want to make * the cluster aware ASAP (for instance after a slave promotion). 
*/ void clusterBroadcastPong(void) { @@ -1430,6 +1443,7 @@ void clusterBroadcastPong(void) { while((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); + if (!node->link) continue; if (node->flags & (REDIS_NODE_MYSELF|REDIS_NODE_HANDSHAKE)) continue; clusterSendPing(node->link,CLUSTERMSG_TYPE_PONG); } @@ -1591,6 +1605,15 @@ void clusterHandleSlaveFailover(void) { int needed_quorum = (server.cluster->size / 2) + 1; int j; + /* Pre conditions to run the function: + * 1) We are a slave. + * 2) Our master is flagged as FAIL. + * 3) It is serving slots. */ + if (!(server.cluster->myself->flags & REDIS_NODE_SLAVE) || + server.cluster->myself->slaveof == NULL || + !(server.cluster->myself->slaveof->flags & REDIS_NODE_FAIL) || + server.cluster->myself->slaveof->numslots == 0) return; + /* Remove the node timeout from the data age as it is fine that we are * disconnected from our master at least for the time it was down to be * flagged as FAIL, that's the baseline. */ @@ -1834,19 +1857,21 @@ void clusterCron(void) { server.cluster->myself->slaveof->port); } - /* If we are a slave and our master is down, but is serving slots, - * call the function that handles the failover. */ - if (server.cluster->myself->flags & REDIS_NODE_SLAVE && - server.cluster->myself->slaveof && - server.cluster->myself->slaveof->flags & REDIS_NODE_FAIL && - server.cluster->myself->slaveof->numslots != 0) - { - clusterHandleSlaveFailover(); - } - + clusterHandleSlaveFailover(); if (update_state) clusterUpdateState(); } +/* This function is called before the event handler returns to sleep for + * events. It is useful to perform operations that must be done ASAP in + * reaction to events fired but that are not safe to perform inside event + * handlers. 
*/ +void clusterBeforeSleep(void) { + if (server.cluster->handle_slave_failover_asap) { + clusterHandleSlaveFailover(); + server.cluster->handle_slave_failover_asap = 0; + } +} + /* ----------------------------------------------------------------------------- * Slots management * -------------------------------------------------------------------------- */ diff --git a/src/redis.c b/src/redis.c index 2792f1393..bd547cd37 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1203,6 +1203,9 @@ void beforeSleep(struct aeEventLoop *eventLoop) { /* Write the AOF buffer on disk */ flushAppendOnlyFile(0); + + /* Call the Redis Cluster before sleep function. */ + if (server.cluster_enabled) clusterBeforeSleep(); } /* =========================== Server initialization ======================== */ diff --git a/src/redis.h b/src/redis.h index 66c751a12..5883bd382 100644 --- a/src/redis.h +++ b/src/redis.h @@ -647,12 +647,13 @@ typedef struct { clusterNode *slots[REDIS_CLUSTER_SLOTS]; zskiplist *slots_to_keys; /* The following fields are used to take the slave state on elections. */ - mstime_t failover_auth_time;/* Time at which we'll try to get elected in ms. */ + mstime_t failover_auth_time;/* Time at which we'll try to get elected in ms*/ int failover_auth_count; /* Number of votes received so far. */ int failover_auth_sent; /* True if we already asked for votes. */ uint64_t failover_auth_epoch; /* Epoch of the current election. */ /* The followign fields are uesd by masters to take state on elections. */ uint64_t last_vote_epoch; /* Epoch of the last vote granted. */ + int handle_slave_failover_asap; /* Call clusterHandleSlaveFailover() ASAP. 
*/ } clusterState; /* Redis cluster messages header */ @@ -1380,6 +1381,7 @@ void clusterCron(void); clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask); void clusterPropagatePublish(robj *channel, robj *message); void migrateCloseTimedoutSockets(void); +void clusterBeforeSleep(void); /* Sentinel */ void initSentinelConfig(void); From 8fa4e7817a4f0d16a83963e07280685ee0722833 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 27 Sep 2013 09:55:41 +0200 Subject: [PATCH 0194/2500] Cluster: update the node configEpoch when newer is detected. --- src/cluster.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index dd496be4c..7aad66542 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -914,14 +914,17 @@ int clusterProcessPacket(clusterLink *link) { if (totlen != explen) return 1; } - /* Check if the sender is known. - * If it is, update our currentEpoch to its epoch if greater than our. */ + /* Check if the sender is a known node. */ sender = clusterLookupNode(hdr->sender); if (sender && !(sender->flags & REDIS_NODE_HANDSHAKE)) { + /* Update our curretEpoch if we see a newer epoch in the cluster. */ senderCurrentEpoch = ntohu64(hdr->currentEpoch); senderConfigEpoch = ntohu64(hdr->configEpoch); if (senderCurrentEpoch > server.cluster->currentEpoch) server.cluster->currentEpoch = senderCurrentEpoch; + /* Update the sender configEpoch if it is publishing a newer one. */ + if (senderConfigEpoch > sender->configEpoch) + sender->configEpoch = senderConfigEpoch; } /* Process packets by type. */ @@ -1999,8 +2002,14 @@ void clusterUpdateState(void) { } /* If we can't reach at least half the masters, change the cluster state - * as FAIL, as we are not even able to mark nodes as FAIL in this side - * of the netsplit because of lack of majority. 
*/ + * to FAIL, as we are not even able to mark nodes as FAIL in this side + * of the netsplit because of lack of majority. + * + * TODO: when this condition is entered, we should not undo it for some + * (small) time after the majority is reachable again, to make sure that + * other nodes have enough time to inform this node of a configuration change. + * Otherwise a client with an old routing table may write to this node + * and later it may turn into a slave losing the write. */ { int needed_quorum = (server.cluster->size / 2) + 1; From 5d393adeac8251ff54dbf4eef4c199dde2862d5b Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 30 Sep 2013 10:13:07 +0200 Subject: [PATCH 0195/2500] Cluster: fsync data when saving the cluster config. --- src/cluster.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cluster.c b/src/cluster.c index 7aad66542..4751c78f7 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -231,6 +231,7 @@ int clusterSaveConfig(void) { if ((fd = open(server.cluster_configfile,O_WRONLY|O_CREAT|O_TRUNC,0644)) == -1) goto err; if (write(fd,ci,sdslen(ci)) != (ssize_t)sdslen(ci)) goto err; + fsync(fd); close(fd); sdsfree(ci); return 0; From 0b63dc2841984afe5a3d913e51aba9cb21c5f486 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 30 Sep 2013 10:13:33 +0200 Subject: [PATCH 0196/2500] Cluster: when updating the configEpoch for a node, save config on disk ASAP. --- src/cluster.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index 4751c78f7..f0a6ddeb4 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -924,8 +924,10 @@ int clusterProcessPacket(clusterLink *link) { if (senderCurrentEpoch > server.cluster->currentEpoch) server.cluster->currentEpoch = senderCurrentEpoch; /* Update the sender configEpoch if it is publishing a newer one. 
*/ - if (senderConfigEpoch > sender->configEpoch) + if (senderConfigEpoch > sender->configEpoch) { sender->configEpoch = senderConfigEpoch; + clusterSaveConfigOrDie(); + } } /* Process packets by type. */ From 2a391b8bac975b0f547064010226b71278c4e22c Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 30 Sep 2013 10:13:58 +0200 Subject: [PATCH 0197/2500] Cluster: re-order failover operations to make it safer. We need to: 1) Increment the configEpoch. 2) Save it to disk and fsync the file. 3) Broadcast the PONG with the new configuration. If other nodes receive the updated configuration, we need to be sure to restart with this new config in the event of a crash. --- src/cluster.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index f0a6ddeb4..e8ee45c40 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1693,16 +1693,16 @@ void clusterHandleSlaveFailover(void) { } } - /* 3) Pong all the other nodes so that they can update the state - * accordingly and detect that we switched to master role. */ - clusterBroadcastPong(); - - /* 4) Update my configEpoch to the epoch of the election. */ + /* 3) Update my configEpoch to the epoch of the election. */ server.cluster->myself->configEpoch = server.cluster->failover_auth_epoch; - /* 5) Update state and save config. */ + /* 4) Update state and save config. */ clusterUpdateState(); clusterSaveConfigOrDie(); + + /* 5) Pong all the other nodes so that they can update the state + * accordingly and detect that we switched to master role. */ + clusterBroadcastPong(); } } From 1239f490659397dd4eaf3c73baa8baf06833af3e Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 30 Sep 2013 11:44:23 +0200 Subject: [PATCH 0198/2500] Cluster: detect cluster reconfiguration when master slots drop to 0. The old algorithm used a PROMOTED flag and explicitly checked for slave->master conversions. 
With the new cluster meta-data propagation algorithm we just look at the configEpoch to check if we need to reconfigure slots, then: 1) If a node is a master but it reaches zero served slots because of reconfiguration. 2) If a node is a slave but the master reaches zero served slots because of a reconfiguration. We switch as a replica of the new slots owner. --- src/cluster.c | 77 ++++++++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 41 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index e8ee45c40..a5faa1b86 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -135,8 +135,6 @@ int clusterLoadConfig(char *filename) { n->flags |= REDIS_NODE_HANDSHAKE; } else if (!strcasecmp(s,"noaddr")) { n->flags |= REDIS_NODE_NOADDR; - } else if (!strcasecmp(s,"promoted")) { - n->flags |= REDIS_NODE_PROMOTED; } else if (!strcasecmp(s,"noflags")) { /* nothing to do */ } else { @@ -755,7 +753,6 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { if (flags & REDIS_NODE_FAIL) ci = sdscat(ci,"fail,"); if (flags & REDIS_NODE_HANDSHAKE) ci = sdscat(ci,"handshake,"); if (flags & REDIS_NODE_NOADDR) ci = sdscat(ci,"noaddr,"); - if (flags & REDIS_NODE_PROMOTED) ci = sdscat(ci,"promoted,"); if (ci[sdslen(ci)-1] == ',') ci[sdslen(ci)-1] = ' '; redisLog(REDIS_DEBUG,"GOSSIP %.40s %s:%d %s", @@ -1051,9 +1048,6 @@ int clusterProcessPacket(clusterLink *link) { { /* Node is a master. */ if (sender->flags & REDIS_NODE_SLAVE) { - /* Slave turned into master! */ - clusterNode *oldmaster = sender->slaveof; - /* Reconfigure node as master. */ if (sender->slaveof) clusterNodeRemoveSlave(sender->slaveof,sender); @@ -1061,29 +1055,6 @@ int clusterProcessPacket(clusterLink *link) { sender->flags |= REDIS_NODE_MASTER; sender->slaveof = NULL; - /* If this node used to be our slave, and now has the - * PROMOTED flag set. We'll turn ourself into a slave - * of the new master. 
*/ - if (flags & REDIS_NODE_PROMOTED && - oldmaster == server.cluster->myself) - { - redisLog(REDIS_WARNING,"One of my slaves took my place. Reconfiguring myself as a replica of %.40s", sender->name); - clusterDelNodeSlots(server.cluster->myself); - clusterSetMaster(sender); - } - - /* If we are a slave, and this node used to be a slave - * of our master, and now has the PROMOTED flag set, we - * need to switch our replication setup over it. */ - if (flags & REDIS_NODE_PROMOTED && - server.cluster->myself->flags & REDIS_NODE_SLAVE && - server.cluster->myself->slaveof == oldmaster) - { - redisLog(REDIS_WARNING,"One of the slaves failed over my master. Reconfiguring myself as a replica of %.40s", sender->name); - clusterDelNodeSlots(server.cluster->myself); - clusterSetMaster(sender); - } - /* Update config and state. */ update_state = 1; update_config = 1; @@ -1125,26 +1096,55 @@ int clusterProcessPacket(clusterLink *link) { changes = memcmp(sender->slots,hdr->myslots,sizeof(hdr->myslots)) != 0; if (changes) { + clusterNode *curmaster, *newmaster = NULL; + + /* Here we set curmaster to this node or the node this node + * replicates to if it's a slave. In the for loop we are + * interested to check if slots are taken away from curmaster. */ + if (server.cluster->myself->flags & REDIS_NODE_MASTER) + curmaster = server.cluster->myself; + else + curmaster = server.cluster->myself->slaveof; + for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) { if (bitmapTestBit(hdr->myslots,j)) { - /* If this slot was not served, or served by a node - * in FAIL state, update the table with the new node - * claiming to serve the slot. */ + /* We rebind the slot to the new node claiming it if: + * 1) The slot was unassigned. + * 2) The new node claims it with a greater configEpoch. 
*/ if (server.cluster->slots[j] == sender) continue; if (server.cluster->slots[j] == NULL || - server.cluster->slots[j]->flags & REDIS_NODE_FAIL) + server.cluster->slots[j]->configEpoch < + senderConfigEpoch) { + if (server.cluster->slots[j] == curmaster) + newmaster = sender; clusterDelSlot(j); clusterAddSlot(sender,j); update_state = update_config = 1; } } else { /* This node claims to no longer handling the slot, - * however we don't change our config as this is likely - * happening because a resharding is in progress, and - * it already knows where to redirect clients. */ + * however we don't change our config as this is likely: + * 1) Rehashing in progress. + * 2) Failover. + * In both cases we'll be informed about who is serving + * the slot eventually. In the meantime it's up to the + * original owner to try to redirect our clients to the + * right node. */ } } + + /* If at least one slot was reassigned from a node to another node + * with a greater configEpoch, it is possible that: + * 1) We are a master is left without slots. This means that we were + * failed over and we should turn into a replica of the new + * master. + * 2) We are a slave and our master is left without slots. We need + * to replicate to the new slots owner. */ + if (newmaster && curmaster->numslots == 0) { + redisLog(REDIS_WARNING,"Configuration change detected. 
Reconfiguring myself as a replica of %.40s", sender->name); + clusterSetMaster(sender); + } } } @@ -1681,7 +1681,6 @@ void clusterHandleSlaveFailover(void) { server.cluster->myself); server.cluster->myself->flags &= ~REDIS_NODE_SLAVE; server.cluster->myself->flags |= REDIS_NODE_MASTER; - server.cluster->myself->flags |= REDIS_NODE_PROMOTED; server.cluster->myself->slaveof = NULL; replicationUnsetMaster(); @@ -2109,9 +2108,6 @@ void clusterSetMaster(clusterNode *n) { myself->flags &= ~REDIS_NODE_MASTER; myself->flags |= REDIS_NODE_SLAVE; } - /* Clear the promoted flag anyway if we are a slave, to ensure it will - * be set only when the node turns into a master because of fail over. */ - myself->flags &= ~REDIS_NODE_PROMOTED; myself->slaveof = n; replicationSetMaster(n->ip, n->port); } @@ -2159,7 +2155,6 @@ sds clusterGenNodesDescription(int filter) { if (node->flags & REDIS_NODE_FAIL) ci = sdscat(ci,"fail,"); if (node->flags & REDIS_NODE_HANDSHAKE) ci =sdscat(ci,"handshake,"); if (node->flags & REDIS_NODE_NOADDR) ci = sdscat(ci,"noaddr,"); - if (node->flags & REDIS_NODE_PROMOTED) ci = sdscat(ci,"promoted,"); if (ci[sdslen(ci)-1] == ',') ci[sdslen(ci)-1] = ' '; /* Slave of... or just "-" */ From ec3bd0695b69bf7f1db43a099342e7281a9594b3 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 30 Sep 2013 11:48:09 +0200 Subject: [PATCH 0199/2500] Make clear that runids are not cluster node IDs. --- src/replication.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/replication.c b/src/replication.c index 81323cb60..0b75f4eeb 100644 --- a/src/replication.c +++ b/src/replication.c @@ -343,7 +343,7 @@ int masterTryPartialResynchronization(redisClient *c) { /* Run id "?" is used by slaves that want to force a full resync. 
*/ if (master_runid[0] != '?') { redisLog(REDIS_NOTICE,"Partial resynchronization not accepted: " - "Runid mismatch (Client asked for '%s', I'm '%s')", + "Runid mismatch (Client asked for runid '%s', my runid is '%s')", master_runid, server.runid); } else { redisLog(REDIS_NOTICE,"Full resync requested by slave."); From 60d4ae49be51a9be88d69162624a996e13e9969f Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 30 Sep 2013 11:51:58 +0200 Subject: [PATCH 0200/2500] Cluster: log message shortened. --- src/cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index a5faa1b86..a174a964b 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1672,7 +1672,7 @@ void clusterHandleSlaveFailover(void) { clusterNode *oldmaster = server.cluster->myself->slaveof; redisLog(REDIS_WARNING, - "Failover election won: failing over my (failing) master."); + "Failover election won: I'm the new master."); /* We have the quorum, perform all the steps to correctly promote * this slave to a master. * From 0b3a8f20727e0b87d7f0a83ba3b147c49626016a Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 30 Sep 2013 11:53:18 +0200 Subject: [PATCH 0201/2500] Add REWRITE to CONFIG subcommands help message. --- src/config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config.c b/src/config.c index 3a14a6e94..2707bfaeb 100644 --- a/src/config.c +++ b/src/config.c @@ -1786,7 +1786,7 @@ void configCommand(redisClient *c) { } } else { addReplyError(c, - "CONFIG subcommand must be one of GET, SET, RESETSTAT"); + "CONFIG subcommand must be one of GET, SET, RESETSTAT, REWRITE"); } return; From 6ed0dee927f8476418f7330ab561dfa8e8753721 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 30 Sep 2013 16:19:44 +0200 Subject: [PATCH 0202/2500] Cluster: time field removed from cluster messages header. 
The new algorithm does not check replies time as checking for the currentEpoch in the reply ensures that the reply is about the current election process. --- src/cluster.c | 10 +++------- src/redis.h | 3 --- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index a174a964b..42c4acb9b 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1543,14 +1543,11 @@ void clusterRequestFailoverAuth(void) { clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST); totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData); hdr->totlen = htonl(totlen); - hdr->time = mstime(); clusterBroadcastMessage(buf,totlen); } -/* Send a FAILOVER_AUTH_ACK message to the specified node. - * Reqtime is the time field from the original failover auth request packet, - * so that the receiver is able to check the reply age. */ -void clusterSendFailoverAuth(clusterNode *node, uint64_t reqtime) { +/* Send a FAILOVER_AUTH_ACK message to the specified node. */ +void clusterSendFailoverAuth(clusterNode *node) { unsigned char buf[4096]; clusterMsg *hdr = (clusterMsg*) buf; uint32_t totlen; @@ -1559,7 +1556,6 @@ void clusterSendFailoverAuth(clusterNode *node, uint64_t reqtime) { clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK); totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData); hdr->totlen = htonl(totlen); - hdr->time = reqtime; clusterSendMessage(node->link,buf,totlen); } @@ -1592,7 +1588,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { server.cluster_node_timeout * 2) return; /* We can vote for this slave. 
*/ - clusterSendFailoverAuth(node,request->time); + clusterSendFailoverAuth(node); server.cluster->last_vote_epoch = server.cluster->currentEpoch; node->slaveof->voted_time = server.unixtime; } diff --git a/src/redis.h b/src/redis.h index 5883bd382..995198f66 100644 --- a/src/redis.h +++ b/src/redis.h @@ -715,9 +715,6 @@ typedef struct { uint32_t totlen; /* Total length of this message */ uint16_t type; /* Message type */ uint16_t count; /* Only used for some kind of messages. */ - uint64_t time; /* Time at which this request was sent (in milliseconds), - this field is copied in reply messages so that the - original sender knows how old the reply is. */ uint64_t currentEpoch; /* The epoch accordingly to the sending node. */ uint64_t configEpoch; /* The config epoch if it's a master, or the last epoch advertised by its master if it is a slave. */ From 0000cfbf3888bf591d7ed49de1848470029bdfa5 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 1 Oct 2013 15:40:20 +0200 Subject: [PATCH 0203/2500] Cluster: fix typo in clusterProcessPacket() comment. --- src/cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index 42c4acb9b..e5aa043fa 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1190,7 +1190,7 @@ int clusterProcessPacket(clusterLink *link) { clusterSendFailoverAuthIfNeeded(sender,hdr); } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) { if (!sender) return 1; /* We don't know that node. */ - /* We consider this vote only if the sender if a master serving + /* We consider this vote only if the sender is a master serving * a non zero number of slots, with the currentEpoch that is equal * to our currentEpoch. */ if (sender->flags & REDIS_NODE_MASTER && From 3be5010adb1abded30ab48b4222f681db9455d72 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 1 Oct 2013 17:21:28 +0200 Subject: [PATCH 0204/2500] Cluster: senderCurrentEpoch == node currentEpoch was too strict. 
We can accept a vote as long as its epoch is >= the epoch at which we started the voting process. There is no need for it to be exactly the same. --- src/cluster.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index e5aa043fa..ab5283933 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1191,11 +1191,11 @@ int clusterProcessPacket(clusterLink *link) { } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) { if (!sender) return 1; /* We don't know that node. */ /* We consider this vote only if the sender is a master serving - * a non zero number of slots, with the currentEpoch that is equal - * to our currentEpoch. */ + * a non zero number of slots, and its currentEpoch is greater or + * equal to epoch where this node started the election. */ if (sender->flags & REDIS_NODE_MASTER && sender->numslots > 0 && - senderCurrentEpoch == server.cluster->currentEpoch) + senderCurrentEpoch >= server.cluster->failover_auth_epoch) { server.cluster->failover_auth_count++; /* Maybe we reached a quorum here, set a flag to make sure From 90b06ab7b5b06cb098be5bd2727dd2f12b63053b Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 2 Oct 2013 09:42:35 +0200 Subject: [PATCH 0205/2500] Cluster: FAIL messages from unknown senders are handled better. Previously the event was not logged but instead the node reported an unknown packet type received. 
--- src/cluster.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index ab5283933..9328ab1be 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1154,20 +1154,26 @@ int clusterProcessPacket(clusterLink *link) { /* Update the cluster state if needed */ if (update_state) clusterUpdateState(); if (update_config) clusterSaveConfigOrDie(); - } else if (type == CLUSTERMSG_TYPE_FAIL && sender) { + } else if (type == CLUSTERMSG_TYPE_FAIL) { clusterNode *failing; - failing = clusterLookupNode(hdr->data.fail.about.nodename); - if (failing && !(failing->flags & (REDIS_NODE_FAIL|REDIS_NODE_MYSELF))) - { + if (sender) { + failing = clusterLookupNode(hdr->data.fail.about.nodename); + if (failing && !(failing->flags & (REDIS_NODE_FAIL|REDIS_NODE_MYSELF))) + { + redisLog(REDIS_NOTICE, + "FAIL message received from %.40s about %.40s", + hdr->sender, hdr->data.fail.about.nodename); + failing->flags |= REDIS_NODE_FAIL; + failing->fail_time = time(NULL); + failing->flags &= ~REDIS_NODE_PFAIL; + clusterUpdateState(); + clusterSaveConfigOrDie(); + } + } else { redisLog(REDIS_NOTICE, - "FAIL message received from %.40s about %.40s", + "Ignoring FAIL message from unknonw node %.40s about %.40s", hdr->sender, hdr->data.fail.about.nodename); - failing->flags |= REDIS_NODE_FAIL; - failing->fail_time = time(NULL); - failing->flags &= ~REDIS_NODE_PFAIL; - clusterUpdateState(); - clusterSaveConfigOrDie(); } } else if (type == CLUSTERMSG_TYPE_PUBLISH) { robj *channel, *message; From 5cbb913994afeb6117559bf21dd83aba46389d4c Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 2 Oct 2013 10:10:08 +0200 Subject: [PATCH 0206/2500] Cluster: bus messages stats in CLUSTER info. 
--- src/cluster.c | 10 +++++++++- src/redis.h | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index 9328ab1be..c90112460 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -259,6 +259,8 @@ void clusterInit(void) { server.cluster->failover_auth_count = 0; server.cluster->failover_auth_epoch = 0; server.cluster->last_vote_epoch = 0; + server.cluster->stats_bus_messages_sent = 0; + server.cluster->stats_bus_messages_received = 0; memset(server.cluster->migrating_slots_to,0, sizeof(server.cluster->migrating_slots_to)); memset(server.cluster->importing_slots_from,0, @@ -878,6 +880,7 @@ int clusterProcessPacket(clusterLink *link) { uint64_t senderCurrentEpoch, senderConfigEpoch; clusterNode *sender; + server.cluster->stats_bus_messages_received++; redisLog(REDIS_DEBUG,"--- Processing packet of type %d, %lu bytes", type, (unsigned long) totlen); @@ -1318,6 +1321,7 @@ void clusterSendMessage(clusterLink *link, unsigned char *msg, size_t msglen) { clusterWriteHandler,link); link->sndbuf = sdscatlen(link->sndbuf, msg, msglen); + server.cluster->stats_bus_messages_sent++; } /* Send a message to all the nodes that are part of the cluster having @@ -2449,6 +2453,8 @@ void clusterCommand(redisClient *c) { "cluster_known_nodes:%lu\r\n" "cluster_size:%d\r\n" "cluster_current_epoch:%llu\r\n" + "cluster_stats_messages_sent:%lld\r\n" + "cluster_stats_messages_received:%lld\r\n" , statestr[server.cluster->state], slots_assigned, slots_ok, @@ -2456,7 +2462,9 @@ void clusterCommand(redisClient *c) { slots_fail, dictSize(server.cluster->nodes), server.cluster->size, - (unsigned long long) server.cluster->currentEpoch + (unsigned long long) server.cluster->currentEpoch, + server.cluster->stats_bus_messages_sent, + server.cluster->stats_bus_messages_received ); addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n", (unsigned long)sdslen(info))); diff --git a/src/redis.h b/src/redis.h index 995198f66..844c4a323 100644 --- a/src/redis.h +++ 
b/src/redis.h @@ -654,6 +654,8 @@ typedef struct { /* The followign fields are uesd by masters to take state on elections. */ uint64_t last_vote_epoch; /* Epoch of the last vote granted. */ int handle_slave_failover_asap; /* Call clusterHandleSlaveFailover() ASAP. */ + long long stats_bus_messages_sent; /* Num of msg sent via cluster bus. */ + long long stats_bus_messages_received; /* Num of msg received via cluster bus. */ } clusterState; /* Redis cluster messages header */ From 43f3df99c897533edf12dbe94ee998487dcea773 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 2 Oct 2013 12:27:12 +0200 Subject: [PATCH 0207/2500] Cluster: update cluster config when slave changes master. --- src/cluster.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/cluster.c b/src/cluster.c index c90112460..35b6a5b45 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1086,6 +1086,9 @@ int clusterProcessPacket(clusterLink *link) { clusterNodeRemoveSlave(sender->slaveof,sender); clusterNodeAddSlave(master,sender); sender->slaveof = master; + + /* Update config. */ + update_config = 1; } } } From dbf6c85d5e569d9342bb2169f520acc550e10623 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 3 Oct 2013 09:55:20 +0200 Subject: [PATCH 0208/2500] Cluster: new clusterDoBeforeSleep() API. The new API is able to remember operations to perform before returning to the event loop, such as checking if there is the failover quorum for a slave, saving and fsyncing the configuration file, and so forth. Because these operations are performed before returning to the event loop we are sure that messages that are sent in the same event loop run will be delivered *after* the configuration is already saved, which is sometimes a requirement. For instance we want to publish a new epoch only when it is already stored in nodes.conf in order to avoid going back in the logical clock when a node is restarted.
This new API provides a big performance advantage compared to saving and possibly fsyncing the configuration file multiple times in the same event loop run, especially in the case of big clusters with tens or hundreds of nodes. --- src/cluster.c | 127 ++++++++++++++++++++++++++------------------------ src/redis.h | 8 +++- 2 files changed, 72 insertions(+), 63 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 35b6a5b45..403ccf387 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -55,6 +55,7 @@ int clusterNodeSetSlotBit(clusterNode *n, int slot); void clusterSetMaster(clusterNode *n); void clusterHandleSlaveFailover(void); int bitmapTestBit(unsigned char *bitmap, int pos); +void clusterDoBeforeSleep(int flags); /* ----------------------------------------------------------------------------- * Initialization @@ -222,14 +223,14 @@ fmterr: * * This function writes the node config and returns 0, on error -1 * is returned. */ -int clusterSaveConfig(void) { +int clusterSaveConfig(int do_fsync) { sds ci = clusterGenNodesDescription(REDIS_NODE_HANDSHAKE); int fd; if ((fd = open(server.cluster_configfile,O_WRONLY|O_CREAT|O_TRUNC,0644)) == -1) goto err; if (write(fd,ci,sdslen(ci)) != (ssize_t)sdslen(ci)) goto err; - fsync(fd); + if (do_fsync) fsync(fd); close(fd); sdsfree(ci); return 0; @@ -239,8 +240,8 @@ err: return -1; } -void clusterSaveConfigOrDie(void) { - if (clusterSaveConfig() == -1) { +void clusterSaveConfigOrDie(int do_fsync) { + if (clusterSaveConfig(do_fsync) == -1) { redisLog(REDIS_WARNING,"Fatal: can't update cluster config file."); exit(1); } @@ -277,7 +278,7 @@ void clusterInit(void) { clusterAddNode(server.cluster->myself); saveconf = 1; } - if (saveconf) clusterSaveConfigOrDie(); + if (saveconf) clusterSaveConfigOrDie(1); /* We need a listening TCP port for our cluster messaging needs. */ server.cfd_count = 0; @@ -665,15 +666,13 @@ void markNodeAsFailingIfNeeded(clusterNode *node) { * reachable nodes to flag the node as FAIL. 
*/ if (server.cluster->myself->flags & REDIS_NODE_MASTER) clusterSendFail(node->name); - clusterUpdateState(); - clusterSaveConfigOrDie(); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); } /* This function is called only if a node is marked as FAIL, but we are able * to reach it again. It checks if there are the conditions to undo the FAIL * state. */ void clearNodeFailureIfNeeded(clusterNode *node) { - int changes = 0; time_t now = time(NULL); redisAssert(node->flags & REDIS_NODE_FAIL); @@ -685,7 +684,7 @@ void clearNodeFailureIfNeeded(clusterNode *node) { "Clear FAIL state for node %.40s: slave is already reachable.", node->name); node->flags &= ~REDIS_NODE_FAIL; - changes++; + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); } /* If it is a master and... @@ -705,13 +704,7 @@ void clearNodeFailureIfNeeded(clusterNode *node) { "Clear FAIL state for node %.40s: is reachable again and nobody is serving its slots after some time.", node->name); node->flags &= ~REDIS_NODE_FAIL; - changes++; - } - - /* Update state and save config. */ - if (changes) { - clusterUpdateState(); - clusterSaveConfigOrDie(); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); } } @@ -926,13 +919,12 @@ int clusterProcessPacket(clusterLink *link) { /* Update the sender configEpoch if it is publishing a newer one. */ if (senderConfigEpoch > sender->configEpoch) { sender->configEpoch = senderConfigEpoch; - clusterSaveConfigOrDie(); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_FSYNC_CONFIG); } } /* Process packets by type. */ if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_MEET) { - int update_config = 0; redisLog(REDIS_DEBUG,"Ping packet received: %p", (void*)link->node); /* Add this node if it is new for us and the msg type is MEET. 
@@ -946,7 +938,7 @@ int clusterProcessPacket(clusterLink *link) { nodeIp2String(node->ip,link); node->port = ntohs(hdr->port); clusterAddNode(node); - update_config = 1; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); } /* Get info from the gossip section */ @@ -954,18 +946,12 @@ int clusterProcessPacket(clusterLink *link) { /* Anyway reply with a PONG */ clusterSendPing(link,CLUSTERMSG_TYPE_PONG); - - /* Update config if needed */ - if (update_config) clusterSaveConfigOrDie(); } /* PING or PONG: process config information. */ if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || type == CLUSTERMSG_TYPE_MEET) { - int update_state = 0; - int update_config = 0; - redisLog(REDIS_DEBUG,"%s packet received: %p", type == CLUSTERMSG_TYPE_PING ? "ping" : "pong", (void*)link->node); @@ -978,8 +964,8 @@ int clusterProcessPacket(clusterLink *link) { "Handshake error: we already know node %.40s, updating the address if needed.", sender->name); if (nodeUpdateAddressIfNeeded(sender,link,ntohs(hdr->port))) { - clusterUpdateState(); - clusterSaveConfigOrDie(); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); } /* Free this node as we alrady have it. This will * cause the link to be freed as well. */ @@ -994,7 +980,7 @@ int clusterProcessPacket(clusterLink *link) { link->node->name); link->node->flags &= ~REDIS_NODE_HANDSHAKE; link->node->flags |= flags&(REDIS_NODE_MASTER|REDIS_NODE_SLAVE); - update_config = 1; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); } else if (memcmp(link->node->name,hdr->sender, REDIS_CLUSTER_NAMELEN) != 0) { @@ -1006,7 +992,7 @@ int clusterProcessPacket(clusterLink *link) { link->node->ip[0] = '\0'; link->node->port = 0; freeClusterLink(link); - update_config = 1; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); /* FIXME: remove this node if we already have it. 
* * If we already have it but the IP is different, use @@ -1021,8 +1007,7 @@ int clusterProcessPacket(clusterLink *link) { !(sender->flags & REDIS_NODE_HANDSHAKE) && nodeUpdateAddressIfNeeded(sender,link,ntohs(hdr->port))) { - update_state = 1; - update_config = 1; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE); } /* Update our info about the node */ @@ -1038,7 +1023,8 @@ int clusterProcessPacket(clusterLink *link) { * conditions detected by clearNodeFailureIfNeeded(). */ if (link->node->flags & REDIS_NODE_PFAIL) { link->node->flags &= ~REDIS_NODE_PFAIL; - update_state = 1; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); } else if (link->node->flags & REDIS_NODE_FAIL) { clearNodeFailureIfNeeded(link->node); } @@ -1059,8 +1045,8 @@ int clusterProcessPacket(clusterLink *link) { sender->slaveof = NULL; /* Update config and state. */ - update_state = 1; - update_config = 1; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); } } else { /* Node is a slave. */ @@ -1076,8 +1062,8 @@ int clusterProcessPacket(clusterLink *link) { if (sender->numslaves) clusterNodeResetSlaves(sender); /* Update config and state. */ - update_state = 1; - update_config = 1; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); } /* Master node changed for this slave? */ @@ -1088,7 +1074,7 @@ int clusterProcessPacket(clusterLink *link) { sender->slaveof = master; /* Update config. 
*/ - update_config = 1; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); } } } @@ -1126,7 +1112,9 @@ int clusterProcessPacket(clusterLink *link) { newmaster = sender; clusterDelSlot(j); clusterAddSlot(sender,j); - update_state = update_config = 1; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_FSYNC_CONFIG); } } else { /* This node claims to no longer handling the slot, @@ -1150,16 +1138,15 @@ int clusterProcessPacket(clusterLink *link) { if (newmaster && curmaster->numslots == 0) { redisLog(REDIS_WARNING,"Configuration change detected. Reconfiguring myself as a replica of %.40s", sender->name); clusterSetMaster(sender); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_FSYNC_CONFIG); } } } /* Get info from the gossip section */ clusterProcessGossipSection(hdr,link); - - /* Update the cluster state if needed */ - if (update_state) clusterUpdateState(); - if (update_config) clusterSaveConfigOrDie(); } else if (type == CLUSTERMSG_TYPE_FAIL) { clusterNode *failing; @@ -1173,8 +1160,7 @@ int clusterProcessPacket(clusterLink *link) { failing->flags |= REDIS_NODE_FAIL; failing->fail_time = time(NULL); failing->flags &= ~REDIS_NODE_PFAIL; - clusterUpdateState(); - clusterSaveConfigOrDie(); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE); } } else { redisLog(REDIS_NOTICE, @@ -1185,7 +1171,8 @@ int clusterProcessPacket(clusterLink *link) { robj *channel, *message; uint32_t channel_len, message_len; - /* Don't bother creating useless objects if there are no Pub/Sub subscribers. */ + /* Don't bother creating useless objects if there are no + * Pub/Sub subscribers. 
*/ if (dictSize(server.pubsub_channels) || listLength(server.pubsub_patterns)) { channel_len = ntohl(hdr->data.publish.msg.channel_len); message_len = ntohl(hdr->data.publish.msg.message_len); @@ -1212,7 +1199,7 @@ int clusterProcessPacket(clusterLink *link) { server.cluster->failover_auth_count++; /* Maybe we reached a quorum here, set a flag to make sure * we check ASAP. */ - server.cluster->handle_slave_failover_asap++; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); } } else { redisLog(REDIS_WARNING,"Received unknown packet type: %d", type); @@ -1673,6 +1660,9 @@ void clusterHandleSlaveFailover(void) { server.cluster->currentEpoch); clusterRequestFailoverAuth(); server.cluster->failover_auth_sent = 1; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_FSYNC_CONFIG); return; /* Wait for replies. */ } @@ -1706,7 +1696,7 @@ void clusterHandleSlaveFailover(void) { /* 4) Update state and save config. */ clusterUpdateState(); - clusterSaveConfigOrDie(); + clusterSaveConfigOrDie(1); /* 5) Pong all the other nodes so that they can update the state * accordingly and detect that we switched to master role. */ @@ -1878,12 +1868,30 @@ void clusterCron(void) { /* This function is called before the event handler returns to sleep for * events. It is useful to perform operations that must be done ASAP in * reaction to events fired but that are not safe to perform inside event - * handlers. */ + * handlers, or to perform potentially expansive tasks that we need to do + * a single time before replying to clients. */ void clusterBeforeSleep(void) { - if (server.cluster->handle_slave_failover_asap) { + /* Handle failover, this is needed when it is likely that there is already + * the quorum from masters in order to react fast. */ + if (server.cluster->todo_before_sleep & CLUSTER_TODO_HANDLE_FAILOVER) clusterHandleSlaveFailover(); - server.cluster->handle_slave_failover_asap = 0; + + /* Update the cluster state. 
*/ + if (server.cluster->todo_before_sleep & CLUSTER_TODO_UPDATE_STATE) + clusterUpdateState(); + + /* Save the config, possibly using fsync. */ + if (server.cluster->todo_before_sleep & CLUSTER_TODO_SAVE_CONFIG) { + int fsync = server.cluster->todo_before_sleep & CLUSTER_TODO_FSYNC_CONFIG; + clusterSaveConfigOrDie(fsync); } + + /* Reset our flags. */ + server.cluster->todo_before_sleep = 0; +} + +void clusterDoBeforeSleep(int flags) { + server.cluster->todo_before_sleep |= flags; } /* ----------------------------------------------------------------------------- @@ -2097,7 +2105,7 @@ int verifyClusterConfigWithData(void) { server.cluster->importing_slots_from[j] = server.cluster->slots[j]; } } - if (update_config) clusterSaveConfigOrDie(); + if (update_config) clusterSaveConfigOrDie(1); return REDIS_OK; } @@ -2296,8 +2304,7 @@ void clusterCommand(redisClient *c) { return; } clusterDelNodeSlots(server.cluster->myself); - clusterUpdateState(); - clusterSaveConfigOrDie(); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); addReply(c,shared.ok); } else if ((!strcasecmp(c->argv[1]->ptr,"addslots") || !strcasecmp(c->argv[1]->ptr,"delslots")) && c->argc >= 3) @@ -2347,8 +2354,7 @@ void clusterCommand(redisClient *c) { } } zfree(slots); - clusterUpdateState(); - clusterSaveConfigOrDie(); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); addReply(c,shared.ok); } else if (!strcasecmp(c->argv[1]->ptr,"setslot") && c->argc >= 4) { /* SETSLOT 10 MIGRATING */ @@ -2424,8 +2430,7 @@ void clusterCommand(redisClient *c) { addReplyError(c,"Invalid CLUSTER SETSLOT action or number of arguments"); return; } - clusterUpdateState(); - clusterSaveConfigOrDie(); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); addReply(c,shared.ok); } else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) { /* CLUSTER INFO */ @@ -2474,7 +2479,7 @@ void clusterCommand(redisClient *c) { addReplySds(c,info); 
addReply(c,shared.crlf); } else if (!strcasecmp(c->argv[1]->ptr,"saveconfig") && c->argc == 2) { - int retval = clusterSaveConfig(); + int retval = clusterSaveConfig(1); if (retval == 0) addReply(c,shared.ok); @@ -2526,8 +2531,7 @@ void clusterCommand(redisClient *c) { return; } clusterDelNode(n); - clusterUpdateState(); - clusterSaveConfigOrDie(); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); addReply(c,shared.ok); } else if (!strcasecmp(c->argv[1]->ptr,"replicate") && c->argc == 3) { /* CLUSTER REPLICATE */ @@ -2562,8 +2566,7 @@ void clusterCommand(redisClient *c) { /* Set the master. */ clusterSetMaster(n); - clusterUpdateState(); - clusterSaveConfigOrDie(); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); addReply(c,shared.ok); } else { addReplyError(c,"Wrong CLUSTER subcommand or number of arguments"); diff --git a/src/redis.h b/src/redis.h index 844c4a323..94decca9d 100644 --- a/src/redis.h +++ b/src/redis.h @@ -653,11 +653,17 @@ typedef struct { uint64_t failover_auth_epoch; /* Epoch of the current election. */ /* The followign fields are uesd by masters to take state on elections. */ uint64_t last_vote_epoch; /* Epoch of the last vote granted. */ - int handle_slave_failover_asap; /* Call clusterHandleSlaveFailover() ASAP. */ + int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ long long stats_bus_messages_sent; /* Num of msg sent via cluster bus. */ long long stats_bus_messages_received; /* Num of msg received via cluster bus. */ } clusterState; +/* clusterState todo_before_sleep flags. 
*/ +#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0) +#define CLUSTER_TODO_UPDATE_STATE (1<<1) +#define CLUSTER_TODO_SAVE_CONFIG (1<<2) +#define CLUSTER_TODO_FSYNC_CONFIG (1<<3) + /* Redis cluster messages header */ /* Note that the PING, PONG and MEET messages are actually the same exact From cd73a69c18c0b0c4c94a91cfb22da1ed2ece8c21 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 4 Oct 2013 12:25:09 +0200 Subject: [PATCH 0209/2500] PSYNC: safer handling of PSYNC requests. There was a bug that overestimated the amount of backlog available; however, this could only happen when a slave was asking for an offset that was in the "future" compared to the master replication backlog. Now this case is handled well and logged as an incident in the master log file.
--- src/config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config.c b/src/config.c index 3a14a6e94..2707bfaeb 100644 --- a/src/config.c +++ b/src/config.c @@ -1786,7 +1786,7 @@ void configCommand(redisClient *c) { } } else { addReplyError(c, - "CONFIG subcommand must be one of GET, SET, RESETSTAT"); + "CONFIG subcommand must be one of GET, SET, RESETSTAT, REWRITE"); } return; From e2e4c81d9d23af6b61178e0cc75e48fc64a944b5 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 4 Oct 2013 12:25:09 +0200 Subject: [PATCH 0211/2500] PSYNC: safer handling of PSYNC requests. There was a bug that overestimated the amount of backlog available; however, this could only happen when a slave was asking for an offset that was in the "future" compared to the master replication backlog. Now this case is handled well and logged as an incident in the master log file. --- src/replication.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/replication.c b/src/replication.c index 81323cb60..f35c43469 100644 --- a/src/replication.c +++ b/src/replication.c @@ -356,10 +356,14 @@ int masterTryPartialResynchronization(redisClient *c) { REDIS_OK) goto need_full_resync; if (!server.repl_backlog || psync_offset < server.repl_backlog_off || - psync_offset >= (server.repl_backlog_off + server.repl_backlog_size)) + psync_offset > (server.repl_backlog_off + server.repl_backlog_histlen)) { redisLog(REDIS_NOTICE, "Unable to partial resync with the slave for lack of backlog (Slave request was: %lld).", psync_offset); + if (psync_offset > server.master_repl_offset) { + redisLog(REDIS_WARNING, + "Warning: slave tried to PSYNC with an offset that is greater than the master replication offset."); + } goto need_full_resync; } From cca9f8c4323c63141afbe8f99dbb657e0a0fea37 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 4 Oct 2013 12:59:24 +0200 Subject: [PATCH 0212/2500] Replication: fix master timeout.
Since we started sending REPLCONF ACK from slaves to masters, the lastinteraction field of the client structure is always refreshed as soon as there is room in the socket output buffer, so masters in timeout are detected with too much delay (the socket buffer takes a lot of time to be filled by small REPLCONF ACK entries). This commit only counts data received as interactions with a master, solving the issue. --- src/networking.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/networking.c b/src/networking.c index 23ef11dc8..1da5a5a58 100644 --- a/src/networking.c +++ b/src/networking.c @@ -829,7 +829,13 @@ void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) { return; } } - if (totwritten > 0) c->lastinteraction = server.unixtime; + if (totwritten > 0) { + /* For clients representing masters we don't count sending data + * as an interaction, since we always send REPLCONF ACK commands + * that take some time to just fill the socket output buffer. + * We just rely on data / pings received for timeout detection. */ + if (!(c->flags & REDIS_MASTER)) c->lastinteraction = server.unixtime; + } if (c->bufpos == 0 && listLength(c->reply) == 0) { c->sentlen = 0; aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); From 4cddbc8ad47d14d716f1cd60ab9cb8f3abd4254e Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 4 Oct 2013 12:59:24 +0200 Subject: [PATCH 0213/2500] Replication: fix master timeout. Since we started sending REPLCONF ACK from slaves to masters, the lastinteraction field of the client structure is always refreshed as soon as there is room in the socket output buffer, so masters in timeout are detected with too much delay (the socket buffer takes a lot of time to be filled by small REPLCONF ACK entries). This commit only counts data received as interactions with a master, solving the issue. 
--- src/networking.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/networking.c b/src/networking.c index 23ef11dc8..1da5a5a58 100644 --- a/src/networking.c +++ b/src/networking.c @@ -829,7 +829,13 @@ void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) { return; } } - if (totwritten > 0) c->lastinteraction = server.unixtime; + if (totwritten > 0) { + /* For clients representing masters we don't count sending data + * as an interaction, since we always send REPLCONF ACK commands + * that take some time to just fill the socket output buffer. + * We just rely on data / pings received for timeout detection. */ + if (!(c->flags & REDIS_MASTER)) c->lastinteraction = server.unixtime; + } if (c->bufpos == 0 && listLength(c->reply) == 0) { c->sentlen = 0; aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); From 8432ddcedbaaa08bc3a92e1aba581bd759ef947b Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 4 Oct 2013 16:12:25 +0200 Subject: [PATCH 0214/2500] Replication: install the write handler when reusing a cached master. Sometimes when we resurrect a cached master after a successful partial resynchronization attempt, there is pending data in the output buffers of the client structure representing the master (likely REPLCONF ACK commands). If we don't reinstall the write handler, it will never be installed again by addReply*() family functions as they'll assume that if there is already data pending, the write handler is already installed. This bug caused some slaves after a successful partial sync to never send REPLCONF ACK, and continuously being detected as timing out by the master, with a disconnection / reconnection loop. 
--- src/replication.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/replication.c b/src/replication.c index bcc80b110..8102fc2db 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1375,6 +1375,16 @@ void replicationResurrectCachedMaster(int newfd) { redisLog(REDIS_WARNING,"Error resurrecting the cached master, impossible to add the readable handler: %s", strerror(errno)); freeClientAsync(server.master); /* Close ASAP. */ } + + /* We may also need to install the write handler as well if there is + * pending data in the write buffers. */ + if (server.master->bufpos || listLength(server.master->reply)) { + if (aeCreateFileEvent(server.el, newfd, AE_WRITABLE, + sendReplyToClient, server.master)) { + redisLog(REDIS_WARNING,"Error resurrecting the cached master, impossible to add the writable handler: %s", strerror(errno)); + freeClientAsync(server.master); /* Close ASAP. */ + } + } } /* ------------------------- MIN-SLAVES-TO-WRITE --------------------------- */ From 8adeb2b2e3a63893da6d8b2d11a60afc43658d5c Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 4 Oct 2013 16:12:25 +0200 Subject: [PATCH 0215/2500] Replication: install the write handler when reusing a cached master. Sometimes when we resurrect a cached master after a successful partial resynchronization attempt, there is pending data in the output buffers of the client structure representing the master (likely REPLCONF ACK commands). If we don't reinstall the write handler, it will never be installed again by addReply*() family functions as they'll assume that if there is already data pending, the write handler is already installed. This bug caused some slaves after a successful partial sync to never send REPLCONF ACK, and continuously being detected as timing out by the master, with a disconnection / reconnection loop. 
--- src/replication.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/replication.c b/src/replication.c index f35c43469..d0d0594a1 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1375,6 +1375,16 @@ void replicationResurrectCachedMaster(int newfd) { redisLog(REDIS_WARNING,"Error resurrecting the cached master, impossible to add the readable handler: %s", strerror(errno)); freeClientAsync(server.master); /* Close ASAP. */ } + + /* We may also need to install the write handler as well if there is + * pending data in the write buffers. */ + if (server.master->bufpos || listLength(server.master->reply)) { + if (aeCreateFileEvent(server.el, newfd, AE_WRITABLE, + sendReplyToClient, server.master)) { + redisLog(REDIS_WARNING,"Error resurrecting the cached master, impossible to add the writable handler: %s", strerror(errno)); + freeClientAsync(server.master); /* Close ASAP. */ + } + } } /* ------------------------- MIN-SLAVES-TO-WRITE --------------------------- */ From e9b8b30c81002be04a69d060520ee9d6a021e68f Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 7 Oct 2013 11:30:58 +0200 Subject: [PATCH 0216/2500] Cluster: slave nodes advertise master slots bitmap and configEpoch. --- src/cluster.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 403ccf387..bee1b7b5c 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1339,12 +1339,21 @@ void clusterBroadcastMessage(void *buf, size_t len) { /* Build the message header */ void clusterBuildMessageHdr(clusterMsg *hdr, int type) { int totlen = 0; + clusterNode *master; + + /* If this node is a master, we send its slots bitmap and configEpoch. + * If this node is a slave we send the master's information instead (the + * node is flagged as slave so the receiver knows that it is NOT really + * in charge for this slots. */ + master = (server.cluster->myself->flags & REDIS_NODE_SLAVE && + server.cluster->myself->slaveof) ? 
+ server.cluster->myself->slaveof : server.cluster->myself; memset(hdr,0,sizeof(*hdr)); hdr->type = htons(type); memcpy(hdr->sender,server.cluster->myself->name,REDIS_CLUSTER_NAMELEN); - memcpy(hdr->myslots,server.cluster->myself->slots, - sizeof(hdr->myslots)); + + memcpy(hdr->myslots,master->slots,sizeof(hdr->myslots)); memset(hdr->slaveof,0,REDIS_CLUSTER_NAMELEN); if (server.cluster->myself->slaveof != NULL) { memcpy(hdr->slaveof,server.cluster->myself->slaveof->name, @@ -1354,13 +1363,9 @@ void clusterBuildMessageHdr(clusterMsg *hdr, int type) { hdr->flags = htons(server.cluster->myself->flags); hdr->state = server.cluster->state; - /* Set the currentEpoch and configEpochs. Note that configEpoch is - * set to the master configEpoch if this node is a slave. */ + /* Set the currentEpoch and configEpochs. */ hdr->currentEpoch = htonu64(server.cluster->currentEpoch); - if (server.cluster->myself->flags & REDIS_NODE_SLAVE) - hdr->configEpoch = htonu64(server.cluster->myself->slaveof->configEpoch); - else - hdr->configEpoch = htonu64(server.cluster->myself->configEpoch); + hdr->configEpoch = htonu64(master->configEpoch); if (type == CLUSTERMSG_TYPE_FAIL) { totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData); From acd9ec222ee9388399ddd2a51232f41e12bef4e7 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 7 Oct 2013 15:44:58 +0200 Subject: [PATCH 0217/2500] Cluster: log message improved when FAIL is cleared from a slave node. --- src/cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index bee1b7b5c..6939bac32 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -681,7 +681,7 @@ void clearNodeFailureIfNeeded(clusterNode *node) { * node again. 
*/ if (node->flags & REDIS_NODE_SLAVE) { redisLog(REDIS_NOTICE, - "Clear FAIL state for node %.40s: slave is already reachable.", + "Clear FAIL state for node %.40s: slave is reachable again.", node->name); node->flags &= ~REDIS_NODE_FAIL; clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); From 26ea55b7f50d68e203f57c9727aaa29bb56a83ed Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 7 Oct 2013 16:07:13 +0200 Subject: [PATCH 0218/2500] Cluster: fix slave data age computation when master is still connected. --- src/cluster.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index 6939bac32..332120720 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1607,11 +1607,18 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { * 3) Perform the failover informing all the other nodes. */ void clusterHandleSlaveFailover(void) { - time_t data_age = server.unixtime - server.repl_down_since; + time_t data_age; mstime_t auth_age = mstime() - server.cluster->failover_auth_time; int needed_quorum = (server.cluster->size / 2) + 1; int j; + /* Set data_age to the number of seconds we are disconnected from the master. */ + if (server.repl_state == REDIS_REPL_CONNECTED) { + data_age = server.unixtime - server.master->lastinteraction; + } else { + data_age = server.unixtime - server.repl_down_since; + } + /* Pre conditions to run the function: * 1) We are a slave. * 2) Our master is flagged as FAIL. From 0f079966c73f2d00ea7c3ec80e81e744bed7793d Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 8 Oct 2013 12:45:35 +0200 Subject: [PATCH 0219/2500] Cluster: masters don't vote for a slave with stale config. When a slave requests our vote, the configEpoch he claims for its master and the set of served slots must be greater or equal to the configEpoch of the nodes serving these slots in the current configuration of the master granting its vote.
In other terms, masters don't vote for slaves having a stale configuration for the slots they want to serve. --- src/cluster.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 332120720..23d4196d2 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1567,7 +1567,10 @@ void clusterSendFailoverAuth(clusterNode *node) { /* Vote for the node asking for our vote if there are the conditions. */ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { clusterNode *master = node->slaveof; - uint64_t requestEpoch = ntohu64(request->currentEpoch); + uint64_t requestCurrentEpoch = ntohu64(request->currentEpoch); + uint64_t requestConfigEpoch = ntohu64(request->configEpoch); + unsigned char *claimed_slots = request->myslots; + int j; /* IF we are not a master serving at least 1 slot, we don't have the * right to vote, as the cluster size in Redis Cluster is the number @@ -1576,7 +1579,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { if (server.cluster->myself->numslots == 0) return; /* Request epoch must be >= our currentEpoch. */ - if (requestEpoch < server.cluster->currentEpoch) return; + if (requestCurrentEpoch < server.cluster->currentEpoch) return; /* I already voted for this epoch? Return ASAP. */ if (server.cluster->last_vote_epoch == server.cluster->currentEpoch) return; @@ -1592,6 +1595,19 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { if (server.unixtime - node->slaveof->voted_time < server.cluster_node_timeout * 2) return; + /* The slave requesting the vote must have a configEpoch for the claimed slots + * that is >= the one of the masters currently serving the same slots in the + * current configuration. 
*/ + for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) { + if (bitmapTestBit(claimed_slots, j) == 0) continue; + if (server.cluster->slots[j] == NULL || + server.cluster->slots[j]->configEpoch <= requestConfigEpoch) continue; + /* If we reached this point we found a slot that in our current slots + * is served by a master with a greater configEpoch than the one claimed + * by the slave requesting our vote. Refuse to vote for this slave. */ + return; + } + /* We can vote for this slave. */ clusterSendFailoverAuth(node); server.cluster->last_vote_epoch = server.cluster->currentEpoch; @@ -1910,7 +1926,7 @@ void clusterDoBeforeSleep(int flags) { * Slots management * -------------------------------------------------------------------------- */ -/* Test bit 'pos' in a generic bitmap. Return 1 if the bit is zet, +/* Test bit 'pos' in a generic bitmap. Return 1 if the bit is set, * otherwise 0. */ int bitmapTestBit(unsigned char *bitmap, int pos) { off_t byte = pos/8; From 1560b708890b025606cfad278f6e9999ecbb43ea Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 9 Oct 2013 15:37:20 +0200 Subject: [PATCH 0220/2500] Cluster: cluster stuff moved from redis.h to cluster.h. 
--- src/Makefile.dep | 134 ++++++++++++++++++----------------- src/cluster.c | 3 + src/cluster.h | 181 +++++++++++++++++++++++++++++++++++++++++++++++ src/config.c | 1 + src/db.c | 1 + src/redis.c | 1 + src/redis.h | 181 +---------------------------------------------- 7 files changed, 258 insertions(+), 244 deletions(-) create mode 100644 src/cluster.h diff --git a/src/Makefile.dep b/src/Makefile.dep index e945efb0c..9ec6d9c91 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -6,112 +6,114 @@ ae_kqueue.o: ae_kqueue.c ae_select.o: ae_select.c anet.o: anet.c fmacros.h anet.h aof.o: aof.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h bio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h bio.h bio.o: bio.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h bio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h bio.h bitops.o: bitops.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h cluster.o: cluster.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h endianconv.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h cluster.h endianconv.h config.o: config.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - 
../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h cluster.h crc16.o: crc16.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h crc64.o: crc64.c db.o: db.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h cluster.h debug.o: debug.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h sha1.h crc64.h bio.h -dict.o: dict.c fmacros.h dict.h zmalloc.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h sha1.h crc64.h bio.h +dict.o: dict.c fmacros.h dict.h zmalloc.h redisassert.h endianconv.o: endianconv.c intset.o: intset.c intset.h zmalloc.h endianconv.h config.h lzf_c.o: lzf_c.c lzfP.h lzf_d.o: lzf_d.c lzfP.h memtest.o: memtest.c config.h multi.o: multi.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h networking.o: networking.c redis.h fmacros.h config.h \ - ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \ - adlist.h zmalloc.h 
anet.h ziplist.h intset.h version.h util.h rdb.h \ - rio.h + ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \ + adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h rdb.h \ + rio.h notify.o: notify.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h object.o: object.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h pqsort.o: pqsort.c pubsub.o: pubsub.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h rand.o: rand.c rdb.o: rdb.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h lzf.h zipmap.h \ - endianconv.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h lzf.h zipmap.h \ + endianconv.h redis-benchmark.o: redis-benchmark.c fmacros.h ae.h \ - ../deps/hiredis/hiredis.h sds.h adlist.h zmalloc.h + ../deps/hiredis/hiredis.h sds.h adlist.h zmalloc.h redis-check-aof.o: redis-check-aof.c fmacros.h config.h redis-check-dump.o: redis-check-dump.c lzf.h crc64.h redis-cli.o: redis-cli.c fmacros.h version.h ../deps/hiredis/hiredis.h \ - sds.h zmalloc.h ../deps/linenoise/linenoise.h help.h anet.h ae.h + sds.h zmalloc.h 
../deps/linenoise/linenoise.h help.h anet.h ae.h redis.o: redis.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h slowlog.h bio.h \ - asciilogo.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h cluster.h slowlog.h \ + bio.h asciilogo.h release.o: release.c release.h version.h crc64.h replication.o: replication.c redis.h fmacros.h config.h \ - ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \ - adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h rdb.h \ - rio.h -rio.o: rio.c fmacros.h rio.h sds.h util.h crc64.h + ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \ + adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h rdb.h \ + rio.h +rio.o: rio.c fmacros.h rio.h sds.h util.h crc64.h config.h redis.h \ + ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h dict.h adlist.h \ + zmalloc.h anet.h ziplist.h intset.h version.h rdb.h scripting.o: scripting.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h sha1.h rand.h \ - ../deps/lua/src/lauxlib.h ../deps/lua/src/lua.h \ - ../deps/lua/src/lualib.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h sha1.h rand.h \ + ../deps/lua/src/lauxlib.h ../deps/lua/src/lua.h ../deps/lua/src/lualib.h sds.o: sds.c sds.h zmalloc.h sentinel.o: sentinel.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h \ - ../deps/hiredis/hiredis.h ../deps/hiredis/async.h \ - ../deps/hiredis/hiredis.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + 
ziplist.h intset.h version.h util.h rdb.h rio.h \ + ../deps/hiredis/hiredis.h ../deps/hiredis/async.h \ + ../deps/hiredis/hiredis.h setproctitle.o: setproctitle.c sha1.o: sha1.c sha1.h config.h slowlog.o: slowlog.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h slowlog.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h slowlog.h sort.o: sort.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h pqsort.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h pqsort.h syncio.o: syncio.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h t_hash.o: t_hash.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h t_list.o: t_list.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h t_set.o: t_set.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h 
rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h t_string.o: t_string.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h t_zset.o: t_zset.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h -util.o: util.c fmacros.h util.h -ziplist.o: ziplist.c zmalloc.h util.h ziplist.h endianconv.h config.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h +util.o: util.c fmacros.h util.h sds.h +ziplist.o: ziplist.c zmalloc.h util.h sds.h ziplist.h endianconv.h \ + config.h redisassert.h zipmap.o: zipmap.c zmalloc.h endianconv.h config.h zmalloc.o: zmalloc.c config.h zmalloc.h diff --git a/src/cluster.c b/src/cluster.c index 23d4196d2..9c0d3e409 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -29,6 +29,7 @@ */ #include "redis.h" +#include "cluster.h" #include "endianconv.h" #include @@ -38,6 +39,8 @@ #include #include +clusterNode *createClusterNode(char *nodename, int flags); +int clusterAddNode(clusterNode *node); void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask); void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask); void clusterSendPing(clusterLink *link, int type); diff --git a/src/cluster.h b/src/cluster.h new file mode 100644 index 000000000..d46b105f8 --- /dev/null +++ b/src/cluster.h @@ -0,0 +1,181 @@ +#ifndef __REDIS_CLUSTER_H +#define __REDIS_CLUSTER_H + +/*----------------------------------------------------------------------------- + * Redis cluster data structures, defines, exported API. 
+ *----------------------------------------------------------------------------*/ + +#define REDIS_CLUSTER_SLOTS 16384 +#define REDIS_CLUSTER_OK 0 /* Everything looks ok */ +#define REDIS_CLUSTER_FAIL 1 /* The cluster can't work */ +#define REDIS_CLUSTER_NAMELEN 40 /* sha1 hex length */ +#define REDIS_CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */ +#define REDIS_CLUSTER_IPLEN INET6_ADDRSTRLEN /* IPv6 address string length */ + +/* The following defines are amunt of time, sometimes expressed as + * multiplicators of the node timeout value (when ending with MULT). */ +#define REDIS_CLUSTER_DEFAULT_NODE_TIMEOUT 15 +#define REDIS_CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. */ +#define REDIS_CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */ +#define REDIS_CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */ +#define REDIS_CLUSTER_SLAVE_VALIDITY_MULT 10 /* Slave data validity. */ +#define REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT 4 /* Auth request retry time. */ +#define REDIS_CLUSTER_FAILOVER_DELAY 5 /* Seconds */ + +struct clusterNode; + +/* clusterLink encapsulates everything needed to talk with a remote node. */ +typedef struct clusterLink { + time_t ctime; /* Link creation time */ + int fd; /* TCP socket file descriptor */ + sds sndbuf; /* Packet send buffer */ + sds rcvbuf; /* Packet reception buffer */ + struct clusterNode *node; /* Node related to this link if any, or NULL */ +} clusterLink; + +/* Node flags */ +#define REDIS_NODE_MASTER 1 /* The node is a master */ +#define REDIS_NODE_SLAVE 2 /* The node is a slave */ +#define REDIS_NODE_PFAIL 4 /* Failure? 
Need acknowledge */ +#define REDIS_NODE_FAIL 8 /* The node is believed to be malfunctioning */ +#define REDIS_NODE_MYSELF 16 /* This node is myself */ +#define REDIS_NODE_HANDSHAKE 32 /* We have still to exchange the first ping */ +#define REDIS_NODE_NOADDR 64 /* We don't know the address of this node */ +#define REDIS_NODE_MEET 128 /* Send a MEET message to this node */ +#define REDIS_NODE_PROMOTED 256 /* Master was a slave propoted by failover */ +#define REDIS_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" + +/* This structure represent elements of node->fail_reports. */ +struct clusterNodeFailReport { + struct clusterNode *node; /* Node reporting the failure condition. */ + time_t time; /* Time of the last report from this node. */ +} typedef clusterNodeFailReport; + +struct clusterNode { + time_t ctime; /* Node object creation time. */ + char name[REDIS_CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */ + int flags; /* REDIS_NODE_... 
*/ + uint64_t configEpoch; /* Last configEpoch observed for this node */ + unsigned char slots[REDIS_CLUSTER_SLOTS/8]; /* slots handled by this node */ + int numslots; /* Number of slots handled by this node */ + int numslaves; /* Number of slave nodes, if this is a master */ + struct clusterNode **slaves; /* pointers to slave nodes */ + struct clusterNode *slaveof; /* pointer to the master node */ + time_t ping_sent; /* Unix time we sent latest ping */ + time_t pong_received; /* Unix time we received the pong */ + time_t fail_time; /* Unix time when FAIL flag was set */ + time_t voted_time; /* Last time we voted for a slave of this master */ + char ip[REDIS_IP_STR_LEN]; /* Latest known IP address of this node */ + int port; /* Latest known port of this node */ + clusterLink *link; /* TCP/IP link with this node */ + list *fail_reports; /* List of nodes signaling this as failing */ +}; +typedef struct clusterNode clusterNode; + +typedef struct clusterState { + clusterNode *myself; /* This node */ + uint64_t currentEpoch; + int state; /* REDIS_CLUSTER_OK, REDIS_CLUSTER_FAIL, ... */ + int size; /* Num of master nodes with at least one slot */ + dict *nodes; /* Hash table of name -> clusterNode structures */ + clusterNode *migrating_slots_to[REDIS_CLUSTER_SLOTS]; + clusterNode *importing_slots_from[REDIS_CLUSTER_SLOTS]; + clusterNode *slots[REDIS_CLUSTER_SLOTS]; + zskiplist *slots_to_keys; + /* The following fields are used to take the slave state on elections. */ + mstime_t failover_auth_time;/* Time at which we'll try to get elected in ms*/ + int failover_auth_count; /* Number of votes received so far. */ + int failover_auth_sent; /* True if we already asked for votes. */ + uint64_t failover_auth_epoch; /* Epoch of the current election. */ + /* The followign fields are uesd by masters to take state on elections. */ + uint64_t last_vote_epoch; /* Epoch of the last vote granted. */ + int todo_before_sleep; /* Things to do in clusterBeforeSleep(). 
*/ + long long stats_bus_messages_sent; /* Num of msg sent via cluster bus. */ + long long stats_bus_messages_received; /* Num of msg received via cluster bus. */ +} clusterState; + +/* clusterState todo_before_sleep flags. */ +#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0) +#define CLUSTER_TODO_UPDATE_STATE (1<<1) +#define CLUSTER_TODO_SAVE_CONFIG (1<<2) +#define CLUSTER_TODO_FSYNC_CONFIG (1<<3) + +/* Redis cluster messages header */ + +/* Note that the PING, PONG and MEET messages are actually the same exact + * kind of packet. PONG is the reply to ping, in the exact format as a PING, + * while MEET is a special PING that forces the receiver to add the sender + * as a node (if it is not already in the list). */ +#define CLUSTERMSG_TYPE_PING 0 /* Ping */ +#define CLUSTERMSG_TYPE_PONG 1 /* Pong (reply to Ping) */ +#define CLUSTERMSG_TYPE_MEET 2 /* Meet "let's join" message */ +#define CLUSTERMSG_TYPE_FAIL 3 /* Mark node xxx as failing */ +#define CLUSTERMSG_TYPE_PUBLISH 4 /* Pub/Sub Publish propagation */ +#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */ +#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you can failover. */ + +/* Initially we don't know our "name", but we'll find it once we connect + * to the first node, using the getsockname() function. Then we'll use this + * address for all the next messages. */ +typedef struct { + char nodename[REDIS_CLUSTER_NAMELEN]; + uint32_t ping_sent; + uint32_t pong_received; + char ip[16]; /* IP address last time it was seen */ + uint16_t port; /* port last time it was seen */ + uint16_t flags; + uint32_t notused; /* for 64 bit alignment */ +} clusterMsgDataGossip; + +typedef struct { + char nodename[REDIS_CLUSTER_NAMELEN]; +} clusterMsgDataFail; + +typedef struct { + uint32_t channel_len; + uint32_t message_len; + unsigned char bulk_data[8]; /* defined as 8 just for alignment concerns. 
*/ +} clusterMsgDataPublish; + +union clusterMsgData { + /* PING, MEET and PONG */ + struct { + /* Array of N clusterMsgDataGossip structures */ + clusterMsgDataGossip gossip[1]; + } ping; + + /* FAIL */ + struct { + clusterMsgDataFail about; + } fail; + + /* PUBLISH */ + struct { + clusterMsgDataPublish msg; + } publish; +}; + +typedef struct { + uint32_t totlen; /* Total length of this message */ + uint16_t type; /* Message type */ + uint16_t count; /* Only used for some kind of messages. */ + uint64_t currentEpoch; /* The epoch accordingly to the sending node. */ + uint64_t configEpoch; /* The config epoch if it's a master, or the last epoch + advertised by its master if it is a slave. */ + char sender[REDIS_CLUSTER_NAMELEN]; /* Name of the sender node */ + unsigned char myslots[REDIS_CLUSTER_SLOTS/8]; + char slaveof[REDIS_CLUSTER_NAMELEN]; + char notused1[32]; /* 32 bytes reserved for future usage. */ + uint16_t port; /* Sender TCP base port */ + uint16_t flags; /* Sender node flags */ + unsigned char state; /* Cluster state from the POV of the sender */ + unsigned char notused2[3]; /* Reserved for future use. For alignment. 
*/ + union clusterMsgData data; +} clusterMsg; + +#define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData)) + +/* ----------------------- API exported outside cluster.c ------------------------- */ +clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask); + +#endif /* __REDIS_CLUSTER_H */ diff --git a/src/config.c b/src/config.c index 2707bfaeb..64f836c7b 100644 --- a/src/config.c +++ b/src/config.c @@ -29,6 +29,7 @@ */ #include "redis.h" +#include "cluster.h" #include #include diff --git a/src/db.c b/src/db.c index 02f8dd3a7..9c0349bd0 100644 --- a/src/db.c +++ b/src/db.c @@ -28,6 +28,7 @@ */ #include "redis.h" +#include "cluster.h" #include #include diff --git a/src/redis.c b/src/redis.c index bd547cd37..bc75b1c99 100644 --- a/src/redis.c +++ b/src/redis.c @@ -28,6 +28,7 @@ */ #include "redis.h" +#include "cluster.h" #include "slowlog.h" #include "bio.h" diff --git a/src/redis.h b/src/redis.h index 94decca9d..88792f771 100644 --- a/src/redis.h +++ b/src/redis.h @@ -565,184 +565,12 @@ typedef struct redisOpArray { int numops; } redisOpArray; -/*----------------------------------------------------------------------------- - * Redis cluster data structures - *----------------------------------------------------------------------------*/ - -#define REDIS_CLUSTER_SLOTS 16384 -#define REDIS_CLUSTER_OK 0 /* Everything looks ok */ -#define REDIS_CLUSTER_FAIL 1 /* The cluster can't work */ -#define REDIS_CLUSTER_NAMELEN 40 /* sha1 hex length */ -#define REDIS_CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */ -#define REDIS_CLUSTER_IPLEN INET6_ADDRSTRLEN /* IPv6 address string length */ - -/* The following defines are amunt of time, sometimes expressed as - * multiplicators of the node timeout value (when ending with MULT). */ -#define REDIS_CLUSTER_DEFAULT_NODE_TIMEOUT 15 -#define REDIS_CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. 
*/ -#define REDIS_CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */ -#define REDIS_CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */ -#define REDIS_CLUSTER_SLAVE_VALIDITY_MULT 10 /* Slave data validity. */ -#define REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT 4 /* Auth request retry time. */ -#define REDIS_CLUSTER_FAILOVER_DELAY 5 /* Seconds */ - -struct clusterNode; - -/* clusterLink encapsulates everything needed to talk with a remote node. */ -typedef struct clusterLink { - time_t ctime; /* Link creation time */ - int fd; /* TCP socket file descriptor */ - sds sndbuf; /* Packet send buffer */ - sds rcvbuf; /* Packet reception buffer */ - struct clusterNode *node; /* Node related to this link if any, or NULL */ -} clusterLink; - -/* Node flags */ -#define REDIS_NODE_MASTER 1 /* The node is a master */ -#define REDIS_NODE_SLAVE 2 /* The node is a slave */ -#define REDIS_NODE_PFAIL 4 /* Failure? Need acknowledge */ -#define REDIS_NODE_FAIL 8 /* The node is believed to be malfunctioning */ -#define REDIS_NODE_MYSELF 16 /* This node is myself */ -#define REDIS_NODE_HANDSHAKE 32 /* We have still to exchange the first ping */ -#define REDIS_NODE_NOADDR 64 /* We don't know the address of this node */ -#define REDIS_NODE_MEET 128 /* Send a MEET message to this node */ -#define REDIS_NODE_PROMOTED 256 /* Master was a slave propoted by failover */ -#define REDIS_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" - -/* This structure represent elements of node->fail_reports. */ -struct clusterNodeFailReport { - struct clusterNode *node; /* Node reporting the failure condition. */ - time_t time; /* Time of the last report from this node. */ -} typedef clusterNodeFailReport; - -struct clusterNode { - time_t ctime; /* Node object creation time. 
*/ - char name[REDIS_CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */ - int flags; /* REDIS_NODE_... */ - uint64_t configEpoch; /* Last configEpoch observed for this node */ - unsigned char slots[REDIS_CLUSTER_SLOTS/8]; /* slots handled by this node */ - int numslots; /* Number of slots handled by this node */ - int numslaves; /* Number of slave nodes, if this is a master */ - struct clusterNode **slaves; /* pointers to slave nodes */ - struct clusterNode *slaveof; /* pointer to the master node */ - time_t ping_sent; /* Unix time we sent latest ping */ - time_t pong_received; /* Unix time we received the pong */ - time_t fail_time; /* Unix time when FAIL flag was set */ - time_t voted_time; /* Last time we voted for a slave of this master */ - char ip[REDIS_IP_STR_LEN]; /* Latest known IP address of this node */ - int port; /* Latest known port of this node */ - clusterLink *link; /* TCP/IP link with this node */ - list *fail_reports; /* List of nodes signaling this as failing */ -}; -typedef struct clusterNode clusterNode; - -typedef struct { - clusterNode *myself; /* This node */ - uint64_t currentEpoch; - int state; /* REDIS_CLUSTER_OK, REDIS_CLUSTER_FAIL, ... */ - int size; /* Num of master nodes with at least one slot */ - dict *nodes; /* Hash table of name -> clusterNode structures */ - clusterNode *migrating_slots_to[REDIS_CLUSTER_SLOTS]; - clusterNode *importing_slots_from[REDIS_CLUSTER_SLOTS]; - clusterNode *slots[REDIS_CLUSTER_SLOTS]; - zskiplist *slots_to_keys; - /* The following fields are used to take the slave state on elections. */ - mstime_t failover_auth_time;/* Time at which we'll try to get elected in ms*/ - int failover_auth_count; /* Number of votes received so far. */ - int failover_auth_sent; /* True if we already asked for votes. */ - uint64_t failover_auth_epoch; /* Epoch of the current election. */ - /* The followign fields are uesd by masters to take state on elections. 
*/ - uint64_t last_vote_epoch; /* Epoch of the last vote granted. */ - int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ - long long stats_bus_messages_sent; /* Num of msg sent via cluster bus. */ - long long stats_bus_messages_received; /* Num of msg received via cluster bus. */ -} clusterState; - -/* clusterState todo_before_sleep flags. */ -#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0) -#define CLUSTER_TODO_UPDATE_STATE (1<<1) -#define CLUSTER_TODO_SAVE_CONFIG (1<<2) -#define CLUSTER_TODO_FSYNC_CONFIG (1<<3) - -/* Redis cluster messages header */ - -/* Note that the PING, PONG and MEET messages are actually the same exact - * kind of packet. PONG is the reply to ping, in the exact format as a PING, - * while MEET is a special PING that forces the receiver to add the sender - * as a node (if it is not already in the list). */ -#define CLUSTERMSG_TYPE_PING 0 /* Ping */ -#define CLUSTERMSG_TYPE_PONG 1 /* Pong (reply to Ping) */ -#define CLUSTERMSG_TYPE_MEET 2 /* Meet "let's join" message */ -#define CLUSTERMSG_TYPE_FAIL 3 /* Mark node xxx as failing */ -#define CLUSTERMSG_TYPE_PUBLISH 4 /* Pub/Sub Publish propagation */ -#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */ -#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you can failover. */ - -/* Initially we don't know our "name", but we'll find it once we connect - * to the first node, using the getsockname() function. Then we'll use this - * address for all the next messages. 
*/ -typedef struct { - char nodename[REDIS_CLUSTER_NAMELEN]; - uint32_t ping_sent; - uint32_t pong_received; - char ip[16]; /* IP address last time it was seen */ - uint16_t port; /* port last time it was seen */ - uint16_t flags; - uint32_t notused; /* for 64 bit alignment */ -} clusterMsgDataGossip; - -typedef struct { - char nodename[REDIS_CLUSTER_NAMELEN]; -} clusterMsgDataFail; - -typedef struct { - uint32_t channel_len; - uint32_t message_len; - unsigned char bulk_data[8]; /* defined as 8 just for alignment concerns. */ -} clusterMsgDataPublish; - -union clusterMsgData { - /* PING, MEET and PONG */ - struct { - /* Array of N clusterMsgDataGossip structures */ - clusterMsgDataGossip gossip[1]; - } ping; - - /* FAIL */ - struct { - clusterMsgDataFail about; - } fail; - - /* PUBLISH */ - struct { - clusterMsgDataPublish msg; - } publish; -}; - -typedef struct { - uint32_t totlen; /* Total length of this message */ - uint16_t type; /* Message type */ - uint16_t count; /* Only used for some kind of messages. */ - uint64_t currentEpoch; /* The epoch accordingly to the sending node. */ - uint64_t configEpoch; /* The config epoch if it's a master, or the last epoch - advertised by its master if it is a slave. */ - char sender[REDIS_CLUSTER_NAMELEN]; /* Name of the sender node */ - unsigned char myslots[REDIS_CLUSTER_SLOTS/8]; - char slaveof[REDIS_CLUSTER_NAMELEN]; - char notused1[32]; /* 32 bytes reserved for future usage. */ - uint16_t port; /* Sender TCP base port */ - uint16_t flags; /* Sender node flags */ - unsigned char state; /* Cluster state from the POV of the sender */ - unsigned char notused2[3]; /* Reserved for future use. For alignment. 
*/ - union clusterMsgData data; -} clusterMsg; - -#define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData)) - /*----------------------------------------------------------------------------- * Global server state *----------------------------------------------------------------------------*/ +struct clusterState; + struct redisServer { /* General */ char *configfile; /* Absolute config file path, or NULL */ @@ -942,7 +770,7 @@ struct redisServer { int cluster_enabled; /* Is cluster enabled? */ int cluster_node_timeout; /* Cluster node timeout. */ char *cluster_configfile; /* Cluster auto-generated config file name. */ - clusterState *cluster; /* State of the cluster */ + struct clusterState *cluster; /* State of the cluster */ /* Scripting */ lua_State *lua; /* The Lua interpreter. We use just one for all clients */ redisClient *lua_client; /* The "fake client" to query Redis from Lua */ @@ -1380,10 +1208,7 @@ int *zunionInterGetKeys(struct redisCommand *cmd,robj **argv, int argc, int *num void clusterInit(void); unsigned short crc16(const char *buf, int len); unsigned int keyHashSlot(char *key, int keylen); -clusterNode *createClusterNode(char *nodename, int flags); -int clusterAddNode(clusterNode *node); void clusterCron(void); -clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask); void clusterPropagatePublish(robj *channel, robj *message); void migrateCloseTimedoutSockets(void); void clusterBeforeSleep(void); From e4b341a335279ce55ba1feb922404a49c12ae0dd Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 9 Oct 2013 16:18:33 +0200 Subject: [PATCH 0221/2500] Cluster: time switched from seconds to milliseconds. All the internal state of cluster involving time is now using mstime_t and mstime() in order to use milliseconds resolution. Also the clusterCron() function is called with a 10 hz frequency instead of 1 hz. 
The cluster node_timeout must also be configured in milliseconds by the user in redis.conf. --- src/cluster.c | 68 ++++++++++++++++++++++++--------------------------- src/cluster.h | 16 ++++++------ src/config.c | 2 +- src/redis.c | 2 +- src/redis.h | 2 +- 5 files changed, 43 insertions(+), 47 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 9c0d3e409..193c654e8 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -134,7 +134,7 @@ int clusterLoadConfig(char *filename) { n->flags |= REDIS_NODE_PFAIL; } else if (!strcasecmp(s,"fail")) { n->flags |= REDIS_NODE_FAIL; - n->fail_time = time(NULL); + n->fail_time = mstime(); } else if (!strcasecmp(s,"handshake")) { n->flags |= REDIS_NODE_HANDSHAKE; } else if (!strcasecmp(s,"noaddr")) { @@ -160,8 +160,8 @@ int clusterLoadConfig(char *filename) { } /* Set ping sent / pong received timestamps */ - if (atoi(argv[4])) n->ping_sent = time(NULL); - if (atoi(argv[5])) n->pong_received = time(NULL); + if (atoi(argv[4])) n->ping_sent = mstime(); + if (atoi(argv[5])) n->pong_received = mstime(); /* Set configEpoch for this node. 
*/ n->configEpoch = strtoull(argv[6],NULL,10); @@ -310,7 +310,7 @@ void clusterInit(void) { clusterLink *createClusterLink(clusterNode *node) { clusterLink *link = zmalloc(sizeof(*link)); - link->ctime = time(NULL); + link->ctime = mstime(); link->sndbuf = sdsempty(); link->rcvbuf = sdsempty(); link->node = node; @@ -389,7 +389,7 @@ clusterNode *createClusterNode(char *nodename, int flags) { memcpy(node->name, nodename, REDIS_CLUSTER_NAMELEN); else getRandomHexChars(node->name, REDIS_CLUSTER_NAMELEN); - node->ctime = time(NULL); + node->ctime = mstime(); node->configEpoch = 0; node->flags = flags; memset(node->slots,0,sizeof(node->slots)); @@ -430,7 +430,7 @@ int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) { while ((ln = listNext(&li)) != NULL) { fr = ln->value; if (fr->node == sender) { - fr->time = time(NULL); + fr->time = mstime(); return 0; } } @@ -438,7 +438,7 @@ int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) { /* Otherwise create a new report. */ fr = zmalloc(sizeof(*fr)); fr->node = sender; - fr->time = time(NULL); + fr->time = mstime(); listAddNodeTail(l,fr); return 1; } @@ -453,9 +453,9 @@ void clusterNodeCleanupFailureReports(clusterNode *node) { listNode *ln; listIter li; clusterNodeFailReport *fr; - time_t maxtime = server.cluster_node_timeout * + mstime_t maxtime = server.cluster_node_timeout * REDIS_CLUSTER_FAIL_REPORT_VALIDITY_MULT; - time_t now = time(NULL); + mstime_t now = mstime(); listRewind(l,&li); while ((ln = listNext(&li)) != NULL) { @@ -663,7 +663,7 @@ void markNodeAsFailingIfNeeded(clusterNode *node) { /* Mark the node as failing. */ node->flags &= ~REDIS_NODE_PFAIL; node->flags |= REDIS_NODE_FAIL; - node->fail_time = time(NULL); + node->fail_time = mstime(); /* Broadcast the failing node name to everybody, forcing all the other * reachable nodes to flag the node as FAIL. */ @@ -676,7 +676,7 @@ void markNodeAsFailingIfNeeded(clusterNode *node) { * to reach it again. 
It checks if there are the conditions to undo the FAIL * state. */ void clearNodeFailureIfNeeded(clusterNode *node) { - time_t now = time(NULL); + time_t now = mstime(); redisAssert(node->flags & REDIS_NODE_FAIL); @@ -691,17 +691,13 @@ void clearNodeFailureIfNeeded(clusterNode *node) { } /* If it is a master and... - * 1) The FAIL state is old enough. We use our node timeout multiplicator - * plus some additional fixed time. The additional time is useful when - * the node timeout is extremely short and the reaction time of - * the cluster may be longer, so wait at least a few seconds always. + * 1) The FAIL state is old enough. * 2) It is yet serving slots from our point of view (not failed over). * Apparently no one is going to fix these slots, clear the FAIL flag. */ if (node->flags & REDIS_NODE_MASTER && node->numslots > 0 && (now - node->fail_time) > - (server.cluster_node_timeout * REDIS_CLUSTER_FAIL_UNDO_TIME_MULT + - REDIS_CLUSTER_FAIL_UNDO_TIME_ADD)) + (server.cluster_node_timeout * REDIS_CLUSTER_FAIL_UNDO_TIME_MULT)) { redisLog(REDIS_NOTICE, "Clear FAIL state for node %.40s: is reachable again and nobody is serving its slots after some time.", @@ -1015,7 +1011,7 @@ int clusterProcessPacket(clusterLink *link) { /* Update our info about the node */ if (link->node && type == CLUSTERMSG_TYPE_PONG) { - link->node->pong_received = time(NULL); + link->node->pong_received = mstime(); link->node->ping_sent = 0; /* The PFAIL condition can be reversed without external @@ -1161,7 +1157,7 @@ int clusterProcessPacket(clusterLink *link) { "FAIL message received from %.40s about %.40s", hdr->sender, hdr->data.fail.about.nodename); failing->flags |= REDIS_NODE_FAIL; - failing->fail_time = time(NULL); + failing->fail_time = mstime(); failing->flags &= ~REDIS_NODE_PFAIL; clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE); } @@ -1393,7 +1389,7 @@ void clusterSendPing(clusterLink *link, int type) { int freshnodes = dictSize(server.cluster->nodes)-2; if 
(link->node && type == CLUSTERMSG_TYPE_PING) - link->node->ping_sent = time(NULL); + link->node->ping_sent = mstime(); clusterBuildMessageHdr(hdr,type); /* Populate the gossip fields */ @@ -1595,8 +1591,8 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* We did not voted for a slave about this master for two * times the node timeout. This is not strictly needed for correctness * of the algorithm but makes the base case more linear. */ - if (server.unixtime - node->slaveof->voted_time < - server.cluster_node_timeout * 2) return; + if (mstime() - node->slaveof->voted_time < server.cluster_node_timeout * 2) + return; /* The slave requesting the vote must have a configEpoch for the claimed slots * that is >= the one of the masters currently serving the same slots in the @@ -1614,7 +1610,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* We can vote for this slave. */ clusterSendFailoverAuth(node); server.cluster->last_vote_epoch = server.cluster->currentEpoch; - node->slaveof->voted_time = server.unixtime; + node->slaveof->voted_time = mstime(); } /* This function is called if we are a slave node and our master serving @@ -1626,16 +1622,16 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { * 3) Perform the failover informing all the other nodes. */ void clusterHandleSlaveFailover(void) { - time_t data_age; + mstime_t data_age; mstime_t auth_age = mstime() - server.cluster->failover_auth_time; int needed_quorum = (server.cluster->size / 2) + 1; int j; /* Set data_age to the number of seconds we are disconnected from the master. 
*/ if (server.repl_state == REDIS_REPL_CONNECTED) { - data_age = server.unixtime - server.master->lastinteraction; + data_age = server.unixtime - server.master->lastinteraction * 1000; } else { - data_age = server.unixtime - server.repl_down_since; + data_age = server.unixtime - server.repl_down_since * 1000; } /* Pre conditions to run the function: @@ -1663,11 +1659,11 @@ void clusterHandleSlaveFailover(void) { /* Compute the time at which we can start an election. */ if (server.cluster->failover_auth_time == 0 || auth_age > - server.cluster_node_timeout * 1000 * REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT) + server.cluster_node_timeout * REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT) { server.cluster->failover_auth_time = mstime() + 500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */ - data_age * 100 + /* Add 100 milliseconds for every second of age. */ + data_age / 10 + /* Add 100 milliseconds for every second of age. */ random() % 500; /* Random delay between 0 and 500 milliseconds. */ server.cluster->failover_auth_count = 0; server.cluster->failover_auth_sent = 0; @@ -1680,7 +1676,7 @@ void clusterHandleSlaveFailover(void) { if (mstime() < server.cluster->failover_auth_time) return; /* Return ASAP if the election is too old to be valid. */ - if (mstime() - server.cluster->failover_auth_time > server.cluster_node_timeout * 1000) + if (mstime() - server.cluster->failover_auth_time > server.cluster_node_timeout) return; /* Ask for votes if needed. */ @@ -1739,12 +1735,12 @@ void clusterHandleSlaveFailover(void) { * CLUSTER cron job * -------------------------------------------------------------------------- */ -/* This is executed 1 time every second */ +/* This is executed 10 times every second */ void clusterCron(void) { dictIterator *di; dictEntry *de; int j, update_state = 0; - time_t min_pong = 0; + mstime_t min_pong = 0, now = mstime(); clusterNode *min_pong_node = NULL; /* Check if we have disconnected nodes and re-establish the connection. 
*/ @@ -1757,7 +1753,7 @@ void clusterCron(void) { /* A Node in HANDSHAKE state has a limited lifespan equal to the * configured node timeout. */ if (node->flags & REDIS_NODE_HANDSHAKE && - server.unixtime - node->ctime > server.cluster_node_timeout) + now - node->ctime > server.cluster_node_timeout) { freeClusterNode(node); continue; @@ -1765,7 +1761,7 @@ void clusterCron(void) { if (node->link == NULL) { int fd; - time_t old_ping_sent; + mstime_t old_ping_sent; clusterLink *link; fd = anetTcpNonBlockConnect(server.neterr, node->ip, @@ -1804,7 +1800,7 @@ void clusterCron(void) { /* Ping some random node. Check a few random nodes and ping the one with * the oldest pong_received time */ - for (j = 0; j < 5; j++) { + for (j = 0; j < 2; j++) { de = dictGetRandomKey(server.cluster->nodes); clusterNode *this = dictGetVal(de); @@ -1825,7 +1821,7 @@ void clusterCron(void) { di = dictGetSafeIterator(server.cluster->nodes); while((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); - time_t now = time(NULL); + now = mstime(); /* Use an updated time at every iteration. */ int delay; if (node->flags & @@ -1836,7 +1832,7 @@ void clusterCron(void) { * timeout, reconnect the link: maybe there is a connection * issue even if the node is alive. */ if (node->link && /* is connected */ - time(NULL) - node->link->ctime > + now - node->link->ctime > server.cluster_node_timeout && /* was not already reconnected */ node->ping_sent && /* we already sent a ping */ node->pong_received < node->ping_sent && /* still waiting pong */ diff --git a/src/cluster.h b/src/cluster.h index d46b105f8..9c598be01 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -14,7 +14,7 @@ /* The following defines are amunt of time, sometimes expressed as * multiplicators of the node timeout value (when ending with MULT). */ -#define REDIS_CLUSTER_DEFAULT_NODE_TIMEOUT 15 +#define REDIS_CLUSTER_DEFAULT_NODE_TIMEOUT 15000 #define REDIS_CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. 
*/ #define REDIS_CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */ #define REDIS_CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */ @@ -26,7 +26,7 @@ struct clusterNode; /* clusterLink encapsulates everything needed to talk with a remote node. */ typedef struct clusterLink { - time_t ctime; /* Link creation time */ + mstime_t ctime; /* Link creation time */ int fd; /* TCP socket file descriptor */ sds sndbuf; /* Packet send buffer */ sds rcvbuf; /* Packet reception buffer */ @@ -48,11 +48,11 @@ typedef struct clusterLink { /* This structure represent elements of node->fail_reports. */ struct clusterNodeFailReport { struct clusterNode *node; /* Node reporting the failure condition. */ - time_t time; /* Time of the last report from this node. */ + mstime_t time; /* Time of the last report from this node. */ } typedef clusterNodeFailReport; struct clusterNode { - time_t ctime; /* Node object creation time. */ + mstime_t ctime; /* Node object creation time. */ char name[REDIS_CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */ int flags; /* REDIS_NODE_... 
*/ uint64_t configEpoch; /* Last configEpoch observed for this node */ @@ -61,10 +61,10 @@ struct clusterNode { int numslaves; /* Number of slave nodes, if this is a master */ struct clusterNode **slaves; /* pointers to slave nodes */ struct clusterNode *slaveof; /* pointer to the master node */ - time_t ping_sent; /* Unix time we sent latest ping */ - time_t pong_received; /* Unix time we received the pong */ - time_t fail_time; /* Unix time when FAIL flag was set */ - time_t voted_time; /* Last time we voted for a slave of this master */ + mstime_t ping_sent; /* Unix time we sent latest ping */ + mstime_t pong_received; /* Unix time we received the pong */ + mstime_t fail_time; /* Unix time when FAIL flag was set */ + mstime_t voted_time; /* Last time we voted for a slave of this master */ char ip[REDIS_IP_STR_LEN]; /* Latest known IP address of this node */ int port; /* Latest known port of this node */ clusterLink *link; /* TCP/IP link with this node */ diff --git a/src/config.c b/src/config.c index 64f836c7b..8bfb208f9 100644 --- a/src/config.c +++ b/src/config.c @@ -416,7 +416,7 @@ void loadServerConfigFromString(char *config) { zfree(server.cluster_configfile); server.cluster_configfile = zstrdup(argv[1]); } else if (!strcasecmp(argv[0],"cluster-node-timeout") && argc == 2) { - server.cluster_node_timeout = atoi(argv[1]); + server.cluster_node_timeout = strtoll(argv[1],NULL,10); if (server.cluster_node_timeout <= 0) { err = "cluster node timeout must be 1 or greater"; goto loaderr; } diff --git a/src/redis.c b/src/redis.c index bc75b1c99..30348a674 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1155,7 +1155,7 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { run_with_period(1000) replicationCron(); /* Run the Redis Cluster cron. 
*/ - run_with_period(1000) { + run_with_period(100) { if (server.cluster_enabled) clusterCron(); } diff --git a/src/redis.h b/src/redis.h index 88792f771..a8a68bced 100644 --- a/src/redis.h +++ b/src/redis.h @@ -768,7 +768,7 @@ struct redisServer { xor of REDIS_NOTIFY... flags. */ /* Cluster */ int cluster_enabled; /* Is cluster enabled? */ - int cluster_node_timeout; /* Cluster node timeout. */ + mstime_t cluster_node_timeout; /* Cluster node timeout. */ char *cluster_configfile; /* Cluster auto-generated config file name. */ struct clusterState *cluster; /* State of the cluster */ /* Scripting */ From 8d541008581559558164549af9409fcbeb27feaa Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 9 Oct 2013 16:21:27 +0200 Subject: [PATCH 0222/2500] Cluster: example redis.conf updated from sec to ms for cluster-node-timeout. --- redis.conf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/redis.conf b/redis.conf index a31d014c4..abe74bfc4 100644 --- a/redis.conf +++ b/redis.conf @@ -523,11 +523,11 @@ lua-time-limit 5000 # # cluster-config-file nodes-6379.conf -# Cluster node timeout is the amount of seconds a node must be unreachable +# Cluster node timeout is the amount of milliseconds a node must be unreachable # for it to be considered in failure state. -# Most other internal time limits are multiplicators of the node timeout. +# Most other internal time limits are multiple of the node timeout. # -# cluster-node-timeout 15 +# cluster-node-timeout 15000 # In order to setup your cluster make sure to read the documentation # available at http://redis.io web site. From aa0e7dbcf3aaafda2c5576ad66b06996334ac149 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 9 Oct 2013 16:29:14 +0200 Subject: [PATCH 0223/2500] Cluster: clusterCron() freq is now 10 hz. Still ping 1 node every sec. After the change in clusterCron() frequency of call, we still want to ping just one random node every second. 
--- src/cluster.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 193c654e8..aec0176f2 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1742,6 +1742,9 @@ void clusterCron(void) { int j, update_state = 0; mstime_t min_pong = 0, now = mstime(); clusterNode *min_pong_node = NULL; + static unsigned long long iteration = 0; + + iteration++; /* Number of times this function was called so far. */ /* Check if we have disconnected nodes and re-establish the connection. */ di = dictGetSafeIterator(server.cluster->nodes); @@ -1798,23 +1801,27 @@ void clusterCron(void) { } dictReleaseIterator(di); - /* Ping some random node. Check a few random nodes and ping the one with - * the oldest pong_received time */ - for (j = 0; j < 2; j++) { - de = dictGetRandomKey(server.cluster->nodes); - clusterNode *this = dictGetVal(de); + /* Ping some random node 1 time every 10 iterations, so that we usually ping + * one random node every second. */ + if (!(iteration % 10)) { + /* Check a few random nodes and ping the one with the oldest + * pong_received time. */ + for (j = 0; j < 5; j++) { + de = dictGetRandomKey(server.cluster->nodes); + clusterNode *this = dictGetVal(de); - /* Don't ping nodes disconnected or with a ping currently active. */ - if (this->link == NULL || this->ping_sent != 0) continue; - if (this->flags & (REDIS_NODE_MYSELF|REDIS_NODE_HANDSHAKE)) continue; - if (min_pong_node == NULL || min_pong > this->pong_received) { - min_pong_node = this; - min_pong = this->pong_received; + /* Don't ping nodes disconnected or with a ping currently active. 
*/ + if (this->link == NULL || this->ping_sent != 0) continue; + if (this->flags & (REDIS_NODE_MYSELF|REDIS_NODE_HANDSHAKE)) continue; + if (min_pong_node == NULL || min_pong > this->pong_received) { + min_pong_node = this; + min_pong = this->pong_received; + } + } + if (min_pong_node) { + redisLog(REDIS_DEBUG,"Pinging node %.40s", min_pong_node->name); + clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING); } - } - if (min_pong_node) { - redisLog(REDIS_DEBUG,"Pinging node %.40s", min_pong_node->name); - clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING); } /* Iterate nodes to check if we need to flag something as failing */ From 39c90945e0cc9251b83dd07fcf84f6103ec2bcae Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 9 Oct 2013 16:36:00 +0200 Subject: [PATCH 0224/2500] Cluster: data_age conversion to milliseconds fixed. --- src/cluster.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index aec0176f2..26b1f40b1 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1629,9 +1629,9 @@ void clusterHandleSlaveFailover(void) { /* Set data_age to the number of seconds we are disconnected from the master. */ if (server.repl_state == REDIS_REPL_CONNECTED) { - data_age = server.unixtime - server.master->lastinteraction * 1000; + data_age = (server.unixtime - server.master->lastinteraction) * 1000; } else { - data_age = server.unixtime - server.repl_down_since * 1000; + data_age = (server.unixtime - server.repl_down_since) * 1000; } /* Pre conditions to run the function: From e45d9420e0ad0622f6eb4d702e3b060b2184e21b Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 11 Oct 2013 10:34:32 +0200 Subject: [PATCH 0225/2500] Cluster: there is a lower limit for the handshake timeout. 
--- src/cluster.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index 26b1f40b1..d4f0f9300 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1743,9 +1743,17 @@ void clusterCron(void) { mstime_t min_pong = 0, now = mstime(); clusterNode *min_pong_node = NULL; static unsigned long long iteration = 0; + mstime_t handshake_timeout; iteration++; /* Number of times this function was called so far. */ + /* The handshake timeout is the time after which an handshake node that was + * not turned into a normal node is removed from the nodes. Usually it is + * just the NODE_TIMEOUT value, but when NODE_TIMEOUT is too small we use + * the value of 1 second. */ + handshake_timeout = server.cluster_node_timeout; + if (handshake_timeout < 1000) handshake_timeout = 1000; + /* Check if we have disconnected nodes and re-establish the connection. */ di = dictGetSafeIterator(server.cluster->nodes); while((de = dictNext(di)) != NULL) { @@ -1756,7 +1764,7 @@ void clusterCron(void) { /* A Node in HANDSHAKE state has a limited lifespan equal to the * configured node timeout. */ if (node->flags & REDIS_NODE_HANDSHAKE && - now - node->ctime > server.cluster_node_timeout) + now - node->ctime > handshake_timeout) { freeClusterNode(node); continue; From 0dbe09bfeccca70f144908ca35509e3910dc8fa7 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 11 Oct 2013 17:33:19 +0200 Subject: [PATCH 0226/2500] Cluster: rough support for sub-command options in redis-trib. 
--- src/redis-trib.rb | 66 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 15 deletions(-) diff --git a/src/redis-trib.rb b/src/redis-trib.rb index 4b7acea42..76eb6f436 100755 --- a/src/redis-trib.rb +++ b/src/redis-trib.rb @@ -589,19 +589,19 @@ class RedisTrib # redis-trib subcommands implementations - def check_cluster_cmd - load_cluster_info_from_node(ARGV[1]) + def check_cluster_cmd(argv,opt) + load_cluster_info_from_node(argv[0]) check_cluster end - def fix_cluster_cmd + def fix_cluster_cmd(argv,opt) @fix = true - load_cluster_info_from_node(ARGV[1]) + load_cluster_info_from_node(argv[0]) check_cluster end - def reshard_cluster_cmd - load_cluster_info_from_node(ARGV[1]) + def reshard_cluster_cmd(argv,opt) + load_cluster_info_from_node(argv[0]) check_cluster if @errors.length != 0 puts "*** Please fix your cluster problems before resharding" @@ -667,9 +667,9 @@ class RedisTrib } end - def create_cluster_cmd + def create_cluster_cmd(argv,opt) xputs ">>> Creating cluster" - ARGV[1..-1].each{|n| + argv[0..-1].each{|n| node = ClusterNode.new(n) node.connect(:abort => true) node.assert_cluster @@ -693,15 +693,15 @@ class RedisTrib check_cluster end - def addnode_cluster_cmd - xputs ">>> Adding node #{ARGV[1]} to cluster #{ARGV[2]}" + def addnode_cluster_cmd(argv,opt) + xputs ">>> Adding node #{argv[0]} to cluster #{argv[1]}" # Check the existing cluster - load_cluster_info_from_node(ARGV[2]) + load_cluster_info_from_node(argv[1]) check_cluster # Add the new node - new = ClusterNode.new(ARGV[1]) + new = ClusterNode.new(argv[0]) new.connect(:abort => true) new.assert_cluster new.load_info @@ -713,10 +713,39 @@ class RedisTrib new.r.cluster("meet",first[:host],first[:port]) end - def help_cluster_cmd + def help_cluster_cmd(opt) show_help exit 0 end + + # Parse the options for the specific command "cmd". + # Returns an hash populate with option => value pairs, and the index of + # the first non-option argument in ARGV. 
+ def parse_options(cmd) + idx = 1 ; # Current index into ARGV + options={} + while idx < ARGV.length && ARGV[idx][0..1] == '--' + if ARGV[idx][0..1] == "--" + option = ARGV[idx][2..-1] + idx += 1 + if ALLOWED_OPTIONS[cmd] == nil || ALLOWED_OPTIONS[cmd][option] == nil + puts "Unknown option '#{option}' for command '#{cmd}'" + exit 1 + end + if ALLOWED_OPTIONS[cmd][option] + value = ARGV[idx] + idx += 1 + else + value = true + end + options[option] = value + else + # Remaining arguments are not options. + break + end + end + return options,idx + end end COMMANDS={ @@ -728,6 +757,10 @@ COMMANDS={ "help" => ["help_cluster_cmd", 1, "(show this help)"] } +ALLOWED_OPTIONS={ + "create" => {"slaves" => false} +} + def show_help puts "Usage: redis-trib " puts @@ -749,7 +782,10 @@ if !cmd_spec puts "Unknown redis-trib subcommand '#{ARGV[0]}'" exit 1 end -rt.check_arity(cmd_spec[1],ARGV.length) + +# Parse options +cmd_options,first_non_option = rt.parse_options(ARGV[0].downcase) +rt.check_arity(cmd_spec[1],ARGV.length-(first_non_option-1)) # Dispatch -rt.send(cmd_spec[0]) +rt.send(cmd_spec[0],ARGV[first_non_option..-1],cmd_options) From 956c0ed9275abe07c34a24801a3a18659e8c7121 Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Mon, 9 Jul 2012 01:00:26 -0700 Subject: [PATCH 0227/2500] Add SCAN command --- src/db.c | 100 +++++++++++++++++++++++++++++++++++++++++++ src/dict.c | 92 +++++++++++++++++++++++++++++++++++++++ src/dict.h | 3 ++ src/redis.c | 1 + src/redis.h | 1 + tests/unit/basic.tcl | 54 +++++++++++++++++++++++ 6 files changed, 251 insertions(+) diff --git a/src/db.c b/src/db.c index 9c0349bd0..32d539f6b 100644 --- a/src/db.c +++ b/src/db.c @@ -319,6 +319,106 @@ void keysCommand(redisClient *c) { setDeferredMultiBulkLength(c,replylen,numkeys); } +void scanCallback(void *privdata, const dictEntry *de) { + list *keys = (list *)privdata; + sds key = dictGetKey(de); + robj *kobj = createStringObject(key, sdslen(key)); + listAddNodeTail(keys, kobj); +} + +void 
scanCommand(redisClient *c) { + int rv; + int i, j; + char buf[32]; + list *keys = listCreate(); + listNode *ln, *ln_; + unsigned long cursor = 0; + long count = 1; + sds pat; + int patlen, patnoop = 1; + + /* Use sscanf because we need an *unsigned* long */ + rv = sscanf(c->argv[1]->ptr, "%lu", &cursor); + if (rv != 1) { + addReplyError(c, "invalid cursor"); + goto cleanup; + } + + i = 2; + while (i < c->argc) { + j = c->argc - i; + if (!strcasecmp(c->argv[i]->ptr, "count") && j >= 2) { + if (getLongFromObjectOrReply(c, c->argv[i+1], &count, NULL) != REDIS_OK) { + goto cleanup; + } + + if (count < 1) { + addReply(c,shared.syntaxerr); + goto cleanup; + } + + i += 2; + } else if (!strcasecmp(c->argv[i]->ptr, "pattern") && j >= 2) { + pat = c->argv[i+1]->ptr; + patlen = sdslen(pat); + + /* The pattern is a no-op iff == "*" */ + patnoop = (pat[0] == '*' && patlen == 1); + + i += 2; + } else { + addReply(c,shared.syntaxerr); + goto cleanup; + } + } + + do { + cursor = dictScan(c->db->dict, cursor, scanCallback, keys); + } while (cursor && listLength(keys) < count); + + /* Filter keys */ + ln = listFirst(keys); + while (ln) { + robj *kobj = listNodeValue(ln); + ln_ = listNextNode(ln); + + /* Keep key iff pattern matches and it hasn't expired */ + if ((patnoop || stringmatchlen(pat, patlen, kobj->ptr, sdslen(kobj->ptr), 0)) && + (expireIfNeeded(c->db, kobj) == 0)) + { + /* Keep */ + } else { + decrRefCount(kobj); + listDelNode(keys, ln); + } + + ln = ln_; + } + + addReplyMultiBulkLen(c, 2); + + rv = snprintf(buf, sizeof(buf), "%lu", cursor); + redisAssert(rv < sizeof(buf)); + addReplyBulkCBuffer(c, buf, rv); + + addReplyMultiBulkLen(c, listLength(keys)); + while ((ln = listFirst(keys)) != NULL) { + robj *kobj = listNodeValue(ln); + addReplyBulk(c, kobj); + decrRefCount(kobj); + listDelNode(keys, ln); + } + +cleanup: + while ((ln = listFirst(keys)) != NULL) { + robj *kobj = listNodeValue(ln); + decrRefCount(kobj); + listDelNode(keys, ln); + } + + listRelease(keys); +} + 
void dbsizeCommand(redisClient *c) { addReplyLongLong(c,dictSize(c->db->dict)); } diff --git a/src/dict.c b/src/dict.c index 97a2bca43..f4a44cf2e 100644 --- a/src/dict.c +++ b/src/dict.c @@ -648,6 +648,98 @@ dictEntry *dictGetRandomKey(dict *d) return he; } +/* Function to reverse bits. Algorithm from: + * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel */ +static unsigned long rev(unsigned long v) { + unsigned long s = 8 * sizeof(v); // bit size; must be power of 2 + unsigned long mask = ~0; + while ((s >>= 1) > 0) { + mask ^= (mask << s); + v = ((v >> s) & mask) | ((v << s) & ~mask); + } + return v; +} + +unsigned long dictScan(dict *d, + unsigned long v, + dictScanFunction *fn, + void *privdata) +{ + dictht *t0, *t1; + const dictEntry *de; + unsigned long s0, s1; + unsigned long m0, m1; + + if (!dictIsRehashing(d)) { + t0 = &(d->ht[0]); + m0 = t0->sizemask; + + /* Emit entries at cursor */ + de = t0->table[v & m0]; + while (de) { + fn(privdata, de); + de = de->next; + } + + } else { + t0 = &d->ht[0]; + t1 = &d->ht[1]; + + /* Make sure t0 is the smaller and t1 is the bigger table */ + if (t0->size > t1->size) { + t0 = &d->ht[1]; + t1 = &d->ht[0]; + } + + s0 = t0->size; + s1 = t1->size; + m0 = t0->sizemask; + m1 = t1->sizemask; + + /* Emit entries at cursor */ + de = t0->table[v & m0]; + while (de) { + fn(privdata, de); + de = de->next; + } + + /* Iterate over indices in larger table that are the expansion + * of the index pointed to by the cursor in the smaller table */ + do { + /* Emit entries at cursor */ + de = t1->table[v & m1]; + while (de) { + fn(privdata, de); + de = de->next; + } + + /* Increment bits not covered by the smaller mask */ + v = (((v | m0) + 1) & ~m0) | (v & m0); + + /* Continue while bits covered by mask difference is non-zero */ + } while (v & (m0 ^ m1)); + } + + /* Set unmasked bits so incrementing the reversed cursor + * operates on the masked bits of the smaller table */ + v |= ~m0; + + /* Increment the reverse cursor 
*/ + v = rev(v); + v++; + v = rev(v); + + /* Only preprare cursor for the next iteration when it is non-zero, + * so that 0 can be used as end-of-scan sentinel. */ + if (v) { + /* Set unmasked bits so the cursor will keep its position + * regardless of the mask in the next iterations */ + v |= ~m0; + } + + return v; +} + /* ------------------------- private functions ------------------------------ */ /* Expand the hash table if needed */ diff --git a/src/dict.h b/src/dict.h index 4d750ae85..11e1b97ee 100644 --- a/src/dict.h +++ b/src/dict.h @@ -91,6 +91,8 @@ typedef struct dictIterator { long long fingerprint; /* unsafe iterator fingerprint for misuse detection */ } dictIterator; +typedef void (dictScanFunction)(void *privdata, const dictEntry *de); + /* This is the initial size of every hash table */ #define DICT_HT_INITIAL_SIZE 4 @@ -165,6 +167,7 @@ int dictRehash(dict *d, int n); int dictRehashMilliseconds(dict *d, int ms); void dictSetHashFunctionSeed(unsigned int initval); unsigned int dictGetHashFunctionSeed(void); +unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, void *privdata); /* Hash table types */ extern dictType dictTypeHeapStringCopyKey; diff --git a/src/redis.c b/src/redis.c index 30348a674..6e5181e1f 100644 --- a/src/redis.c +++ b/src/redis.c @@ -210,6 +210,7 @@ struct redisCommand redisCommandTable[] = { {"pexpire",pexpireCommand,3,"w",0,NULL,1,1,1,0,0}, {"pexpireat",pexpireatCommand,3,"w",0,NULL,1,1,1,0,0}, {"keys",keysCommand,2,"rS",0,NULL,0,0,0,0,0}, + {"scan",scanCommand,-1,"RS",0,NULL,0,0,0,0,0}, {"dbsize",dbsizeCommand,1,"r",0,NULL,0,0,0,0,0}, {"auth",authCommand,2,"rslt",0,NULL,0,0,0,0,0}, {"ping",pingCommand,1,"rt",0,NULL,0,0,0,0,0}, diff --git a/src/redis.h b/src/redis.h index a8a68bced..7b643017a 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1250,6 +1250,7 @@ void incrbyfloatCommand(redisClient *c); void selectCommand(redisClient *c); void randomkeyCommand(redisClient *c); void keysCommand(redisClient *c); +void 
scanCommand(redisClient *c); void dbsizeCommand(redisClient *c); void lastsaveCommand(redisClient *c); void saveCommand(redisClient *c); diff --git a/tests/unit/basic.tcl b/tests/unit/basic.tcl index 1f46ba666..a4a0e791a 100644 --- a/tests/unit/basic.tcl +++ b/tests/unit/basic.tcl @@ -761,4 +761,58 @@ start_server {tags {"basic"}} { r keys * r keys * } {dlskeriewrioeuwqoirueioqwrueoqwrueqw} + + test "SCAN basic" { + r flushdb + r debug populate 1000 + + set cur 0 + set keys {} + while 1 { + set res [r scan $cur] + set cur [lindex $res 0] + set k [lindex $res 1] + lappend keys $k + if {$cur == 0} break + } + + set keys [lsort -unique [concat {*}$keys]] + assert_equal 1000 [llength $keys] + } + + test "SCAN COUNT" { + r flushdb + r debug populate 1000 + + set cur 0 + set keys {} + while 1 { + set res [r scan $cur count 5] + set cur [lindex $res 0] + set k [lindex $res 1] + lappend keys $k + if {$cur == 0} break + } + + set keys [lsort -unique [concat {*}$keys]] + assert_equal 1000 [llength $keys] + } + + test "SCAN PATTERN" { + r flushdb + r debug populate 1000 + + set cur 0 + set keys {} + while 1 { + set res [r scan $cur pattern "key:1??"] + set cur [lindex $res 0] + set k [lindex $res 1] + lappend keys $k + if {$cur == 0} break + } + + set keys [lsort -unique [concat {*}$keys]] + assert_equal 100 [llength $keys] + } } From 25ae316f652fa62fa85e16546a66c047fc7edc73 Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Tue, 10 Jul 2012 15:51:43 -0700 Subject: [PATCH 0228/2500] SCAN requires at least 1 argument --- src/db.c | 2 ++ src/redis.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/db.c b/src/db.c index 32d539f6b..ce69d58b6 100644 --- a/src/db.c +++ b/src/db.c @@ -337,6 +337,8 @@ void scanCommand(redisClient *c) { sds pat; int patlen, patnoop = 1; + redisAssert(c->argc >= 2); + /* Use sscanf because we need an *unsigned* long */ rv = sscanf(c->argv[1]->ptr, "%lu", &cursor); if (rv != 1) { diff --git a/src/redis.c b/src/redis.c index 
6e5181e1f..04313f2f4 100644 --- a/src/redis.c +++ b/src/redis.c @@ -210,7 +210,7 @@ struct redisCommand redisCommandTable[] = { {"pexpire",pexpireCommand,3,"w",0,NULL,1,1,1,0,0}, {"pexpireat",pexpireatCommand,3,"w",0,NULL,1,1,1,0,0}, {"keys",keysCommand,2,"rS",0,NULL,0,0,0,0,0}, - {"scan",scanCommand,-1,"RS",0,NULL,0,0,0,0,0}, + {"scan",scanCommand,-2,"RS",0,NULL,0,0,0,0,0}, {"dbsize",dbsizeCommand,1,"r",0,NULL,0,0,0,0,0}, {"auth",authCommand,2,"rslt",0,NULL,0,0,0,0,0}, {"ping",pingCommand,1,"rt",0,NULL,0,0,0,0,0}, From f18269d1ef6cc794be1a420cfeaae7ad15612233 Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Tue, 23 Apr 2013 10:02:36 -0700 Subject: [PATCH 0229/2500] Fix error in scan algorithm The irrelevant bits shouldn't be masked to 1. This can result in slots being skipped when the hash table is resized between calls to the iterator. --- src/dict.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/dict.c b/src/dict.c index f4a44cf2e..946e23c4a 100644 --- a/src/dict.c +++ b/src/dict.c @@ -729,14 +729,6 @@ unsigned long dictScan(dict *d, v++; v = rev(v); - /* Only preprare cursor for the next iteration when it is non-zero, - * so that 0 can be used as end-of-scan sentinel. */ - if (v) { - /* Set unmasked bits so the cursor will keep its position - * regardless of the mask in the next iterations */ - v |= ~m0; - } - return v; } From 5227a1f66f6606563ae4951751bb7979a0404f2c Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 25 Oct 2013 11:45:32 +0200 Subject: [PATCH 0230/2500] SCAN option name changed: pattern -> match. 
--- src/db.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/db.c b/src/db.c index ce69d58b6..9f6313f81 100644 --- a/src/db.c +++ b/src/db.c @@ -360,7 +360,7 @@ void scanCommand(redisClient *c) { } i += 2; - } else if (!strcasecmp(c->argv[i]->ptr, "pattern") && j >= 2) { + } else if (!strcasecmp(c->argv[i]->ptr, "match") && j >= 2) { pat = c->argv[i+1]->ptr; patlen = sdslen(pat); From 0a970b93e911c0ece1fa2551ade71e122d8c01e1 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 25 Oct 2013 11:45:56 +0200 Subject: [PATCH 0231/2500] Fixed typo in SCAN comment. iff -> if. --- src/db.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/db.c b/src/db.c index 9f6313f81..5719b164e 100644 --- a/src/db.c +++ b/src/db.c @@ -364,7 +364,7 @@ void scanCommand(redisClient *c) { pat = c->argv[i+1]->ptr; patlen = sdslen(pat); - /* The pattern is a no-op iff == "*" */ + /* The pattern is a no-op if == "*" */ patnoop = (pat[0] == '*' && patlen == 1); i += 2; From 6874fcc216baeedfe7cfe6279cc81cdce911b6be Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 25 Oct 2013 11:48:18 +0200 Subject: [PATCH 0232/2500] SCAN: use define REDIS_LONGSTR_SIZE instead of fixed len. --- src/db.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/db.c b/src/db.c index 5719b164e..671683a1d 100644 --- a/src/db.c +++ b/src/db.c @@ -329,7 +329,7 @@ void scanCallback(void *privdata, const dictEntry *de) { void scanCommand(redisClient *c) { int rv; int i, j; - char buf[32]; + char buf[REDIS_LONGSTR_SIZE]; list *keys = listCreate(); listNode *ln, *ln_; unsigned long cursor = 0; From 32b555c5fb8709500e5a01b7e7cdb56ab144dc4a Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 25 Oct 2013 11:49:08 +0200 Subject: [PATCH 0233/2500] SCAN: remove useless assertion, already enforced by command table. 
--- src/db.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/db.c b/src/db.c index 671683a1d..8bb7b8417 100644 --- a/src/db.c +++ b/src/db.c @@ -337,8 +337,6 @@ void scanCommand(redisClient *c) { sds pat; int patlen, patnoop = 1; - redisAssert(c->argc >= 2); - /* Use sscanf because we need an *unsigned* long */ rv = sscanf(c->argv[1]->ptr, "%lu", &cursor); if (rv != 1) { From 6bff0f3cb6037d72ea3fe97834551a1bf8834bcb Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 25 Oct 2013 11:51:08 +0200 Subject: [PATCH 0234/2500] SCAN: remove additional newlines to conform to Redis code base. --- src/db.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/db.c b/src/db.c index 8bb7b8417..d8bc1943e 100644 --- a/src/db.c +++ b/src/db.c @@ -391,12 +391,10 @@ void scanCommand(redisClient *c) { decrRefCount(kobj); listDelNode(keys, ln); } - ln = ln_; } addReplyMultiBulkLen(c, 2); - rv = snprintf(buf, sizeof(buf), "%lu", cursor); redisAssert(rv < sizeof(buf)); addReplyBulkCBuffer(c, buf, rv); @@ -415,7 +413,6 @@ cleanup: decrRefCount(kobj); listDelNode(keys, ln); } - listRelease(keys); } From 6f69128751171c81851b356d5b14a398f5780010 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 25 Oct 2013 11:54:37 +0200 Subject: [PATCH 0235/2500] SCAN: improve variable names for readability. 
--- src/db.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/db.c b/src/db.c index d8bc1943e..9bf135613 100644 --- a/src/db.c +++ b/src/db.c @@ -331,7 +331,7 @@ void scanCommand(redisClient *c) { int i, j; char buf[REDIS_LONGSTR_SIZE]; list *keys = listCreate(); - listNode *ln, *ln_; + listNode *node, *nextnode; unsigned long cursor = 0; long count = 1; sds pat; @@ -377,10 +377,10 @@ void scanCommand(redisClient *c) { } while (cursor && listLength(keys) < count); /* Filter keys */ - ln = listFirst(keys); - while (ln) { - robj *kobj = listNodeValue(ln); - ln_ = listNextNode(ln); + node = listFirst(keys); + while (node) { + robj *kobj = listNodeValue(node); + nextnode = listNextNode(node); /* Keep key iff pattern matches and it hasn't expired */ if ((patnoop || stringmatchlen(pat, patlen, kobj->ptr, sdslen(kobj->ptr), 0)) && @@ -389,9 +389,9 @@ void scanCommand(redisClient *c) { /* Keep */ } else { decrRefCount(kobj); - listDelNode(keys, ln); + listDelNode(keys, node); } - ln = ln_; + node = nextnode; } addReplyMultiBulkLen(c, 2); @@ -400,18 +400,18 @@ void scanCommand(redisClient *c) { addReplyBulkCBuffer(c, buf, rv); addReplyMultiBulkLen(c, listLength(keys)); - while ((ln = listFirst(keys)) != NULL) { - robj *kobj = listNodeValue(ln); + while ((node = listFirst(keys)) != NULL) { + robj *kobj = listNodeValue(node); addReplyBulk(c, kobj); decrRefCount(kobj); - listDelNode(keys, ln); + listDelNode(keys, node); } cleanup: - while ((ln = listFirst(keys)) != NULL) { - robj *kobj = listNodeValue(ln); + while ((node = listFirst(keys)) != NULL) { + robj *kobj = listNodeValue(node); decrRefCount(kobj); - listDelNode(keys, ln); + listDelNode(keys, node); } listRelease(keys); } From b18ac5f5742a54f702fff515580fdb4fc3c2a1c4 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 25 Oct 2013 11:55:28 +0200 Subject: [PATCH 0236/2500] SCAN: Fix test after option renamed from PATTERN to MATCH. 
--- tests/unit/basic.tcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/basic.tcl b/tests/unit/basic.tcl index a4a0e791a..67651666e 100644 --- a/tests/unit/basic.tcl +++ b/tests/unit/basic.tcl @@ -798,14 +798,14 @@ start_server {tags {"basic"}} { assert_equal 1000 [llength $keys] } - test "SCAN PATTERN" { + test "SCAN MATCH" { r flushdb r debug populate 1000 set cur 0 set keys {} while 1 { - set res [r scan $cur pattern "key:1??"] + set res [r scan $cur match "key:1??"] set cur [lindex $res 0] set k [lindex $res 1] lappend keys $k From 7bd87839c13fb5fc310ac02190a29de2c6698785 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 25 Oct 2013 11:58:03 +0200 Subject: [PATCH 0237/2500] SCAN: simplify keys list cleanup using listSetFreeMethod(). --- src/db.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/db.c b/src/db.c index 9bf135613..be3839bb6 100644 --- a/src/db.c +++ b/src/db.c @@ -408,11 +408,7 @@ void scanCommand(redisClient *c) { } cleanup: - while ((node = listFirst(keys)) != NULL) { - robj *kobj = listNodeValue(node); - decrRefCount(kobj); - listDelNode(keys, node); - } + listSetFreeMethod(keys,decrRefCountVoid); listRelease(keys); } From fe0ffe6a1cdd017debfcf32ee140e9af31e7c8e0 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 25 Oct 2013 12:00:13 +0200 Subject: [PATCH 0238/2500] Revert "Fixed typo in SCAN comment. iff -> if." Probably here Pieter means "if and only if". This reverts commit 0a970b93e911c0ece1fa2551ade71e122d8c01e1. 
--- src/db.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/db.c b/src/db.c index be3839bb6..9f5949534 100644 --- a/src/db.c +++ b/src/db.c @@ -362,7 +362,7 @@ void scanCommand(redisClient *c) { pat = c->argv[i+1]->ptr; patlen = sdslen(pat); - /* The pattern is a no-op if == "*" */ + /* The pattern is a no-op iff == "*" */ patnoop = (pat[0] == '*' && patlen == 1); i += 2; From eda292a7fa362c4dd7309499c33c335706061ff5 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 25 Oct 2013 12:01:49 +0200 Subject: [PATCH 0239/2500] SCAN: stay inside 80 cols. --- src/db.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/db.c b/src/db.c index 9f5949534..74ce4b9ed 100644 --- a/src/db.c +++ b/src/db.c @@ -348,7 +348,9 @@ void scanCommand(redisClient *c) { while (i < c->argc) { j = c->argc - i; if (!strcasecmp(c->argv[i]->ptr, "count") && j >= 2) { - if (getLongFromObjectOrReply(c, c->argv[i+1], &count, NULL) != REDIS_OK) { + if (getLongFromObjectOrReply(c, c->argv[i+1], &count, NULL) + != REDIS_OK) + { goto cleanup; } @@ -383,7 +385,8 @@ void scanCommand(redisClient *c) { nextnode = listNextNode(node); /* Keep key iff pattern matches and it hasn't expired */ - if ((patnoop || stringmatchlen(pat, patlen, kobj->ptr, sdslen(kobj->ptr), 0)) && + if ((patnoop || + stringmatchlen(pat, patlen, kobj->ptr, sdslen(kobj->ptr), 0)) && (expireIfNeeded(c->db, kobj) == 0)) { /* Keep */ From 817e6766aa07eba98cc63c9aad232434ebf67fa8 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 25 Oct 2013 17:01:30 +0200 Subject: [PATCH 0240/2500] dictScan() algorithm documented. --- src/dict.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/src/dict.c b/src/dict.c index 946e23c4a..a735c499d 100644 --- a/src/dict.c +++ b/src/dict.c @@ -660,6 +660,90 @@ static unsigned long rev(unsigned long v) { return v; } +/* dictScan() is used to iterate over the elements of a dictionary. 
+ * + * Iterating works in the following way: + * + * 1) Initially you call the function using a cursor (v) value of 0. + * 2) The function performs one step of the iteration, and returns the + * new cursor value that you must use in the next call. + * 3) When the returned cursor is 0, the iteration is complete. + * + * The function guarantees that all the elements that are present in the + * dictionary from the start to the end of the iteration are returned. + * However it is possible that some element is returned multiple time. + * + * For every element returned, the callback 'fn' passed as argument is + * called, with 'privdata' as first argument and the dictionar entry + * 'de' as second argument. + * + * HOW IT WORKS. + * + * The algorithm used in the iteration was designed by Pieter Noordhuis. + * The main idea is to increment a cursor starting from the higher order + * bits, that is, instead of incrementing the cursor normally, the bits + * of the cursor are reversed, then the cursor is incremented, and finally + * the bits are reversed again. + * + * This strategy is needed because the hash table may be resized from one + * call to the other call of the same iteration. + * + * dict.c hash tables are always power of two in size, and they + * use chaining, so the position of an element in a given table is given + * always by computing the bitwise AND between Hash(key) and SIZE-1 + * (where SIZE-1 is always the mask that is equivalent to taking the rest + * of the division between the Hash of the key and SIZE). + * + * For example if the current hash table size is 64, the mask is + * (in binary) 1111. The position of a key in the hash table will be always + * the last four bits of the hash output, and so forth. + * + * WHAT HAPPENS DURING REHASHING, WHEN YOU HAVE TWO TABLES? 
+ * + * If the hash table grows, elements can go anyway in one multiple of + * the old bucket: for example let's say that we already iterated with + * a 4 bit cursor 1100, since the mask is 1111 (hash table size = 16). + * + * If the hash table will be resized to 64 elements, and the new mask will + * be 111111, the new buckets that you obtain substituting in ??1100 + * either 0 or 1, can be targeted only by keys that we already visited + * when scanning the bucket 1100 in the smaller hash table. + * + * By iterating the higher bits first, because of the inverted counter, the + * cursor does not need to restart if the table size gets bigger, and will + * just continue iterating with cursors that don't have '1100' at the end, + * nor any other combination of final 4 bits already explored. + * + * Similarly when the table size shrinks over time, for example going from + * 16 to 8, If a combination of the lower three bits (the mask for size 8 + * is 111) was already completely explored, it will not be visited again + * as we are sure that, we tried for example, both 0111 and 1111 (all the + * variations of the higher bit) so we don't need to test it again. + * + * But wait... You have *two* tables in a given moment during rehashing! + * + * Yes, this is true, but we always iterate the smaller one of the tables, + * testing also all the expansions of the current cursor into the larger + * table. So for example if the current cursor is 101 and we also have a + * larger table of size 16, we also test (0)101 and (1)101 inside the larger + * table. This reduces the problem back to having only one table, where + * the larger one, if exists, is just an expansion of the smaller one. + * + * LIMITATIONS + * + * This iterator is completely stateless, and this is a huge advantage, + * including no additional memory used. + * + * The disadvantages resulting from this design are: + * + * 1) It is possible that we return duplicated elements. 
However this is usually + * easy to deal with in the application level. + * 2) The iterator must return multiple elements per call, as it needs to always + * return all the keys chained in a given bucket, and all the expansions, so + * we are sure we don't miss keys moving. + * 3) The reverse cursor is somewhat hard to understand at first, but this + * comment is supposed to help. + */ unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, From f86c07df30560908cd9e2e40a4d763b363dd710e Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 25 Oct 2013 17:05:55 +0200 Subject: [PATCH 0241/2500] Fixed typos in dictScan() comment. --- src/dict.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dict.c b/src/dict.c index a735c499d..814d5f65b 100644 --- a/src/dict.c +++ b/src/dict.c @@ -698,7 +698,7 @@ static unsigned long rev(unsigned long v) { * (in binary) 1111. The position of a key in the hash table will be always * the last four bits of the hash output, and so forth. * - * WHAT HAPPENS DURING REHASHING, WHEN YOU HAVE TWO TABLES? + * WHAT HAPPENS IF THE TABLE CHANGES IN SIZE? * * If the hash table grows, elements can go anyway in one multiple of * the old bucket: for example let's say that we already iterated with @@ -720,7 +720,7 @@ static unsigned long rev(unsigned long v) { * as we are sure that, we tried for example, both 0111 and 1111 (all the * variations of the higher bit) so we don't need to test it again. * - * But wait... You have *two* tables in a given moment during rehashing! + * WAIT... YOU HAVE *TWO* TABLES DURING REHASHING! * * Yes, this is true, but we always iterate the smaller one of the tables, * testing also all the expansions of the current cursor into the larger From 0685c1ca13ed117f58a5930b325512296adbddb5 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 28 Oct 2013 11:11:34 +0100 Subject: [PATCH 0242/2500] SCAN: refactored into scanGenericCommand. 
The new implementation is capable of iterating the keyspace but also sets, hashes, and sorted sets, and can be used to implement SSCAN, ZSCAN and HSCAN. --- src/db.c | 140 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 126 insertions(+), 14 deletions(-) diff --git a/src/db.c b/src/db.c index 74ce4b9ed..65144e5ac 100644 --- a/src/db.c +++ b/src/db.c @@ -319,32 +319,76 @@ void keysCommand(redisClient *c) { setDeferredMultiBulkLength(c,replylen,numkeys); } +/* This callback is used by scanGenericCommand in order to collect elements + * returned by the dictionary iterator into a list. */ void scanCallback(void *privdata, const dictEntry *de) { - list *keys = (list *)privdata; - sds key = dictGetKey(de); - robj *kobj = createStringObject(key, sdslen(key)); - listAddNodeTail(keys, kobj); + void **pd = (void**) privdata; + list *keys = pd[0]; + robj *o = pd[1]; + robj *key, *val = NULL; + + if (o == NULL) { + sds sdskey = dictGetKey(de); + key = createStringObject(sdskey, sdslen(sdskey)); + } else if (o->type == REDIS_SET) { + key = dictGetKey(de); + incrRefCount(key); + } else if (o->type == REDIS_HASH) { + key = dictGetKey(de); + incrRefCount(key); + val = dictGetVal(de); + incrRefCount(val); + } else if (o->type == REDIS_ZSET) { + key = dictGetKey(de); + incrRefCount(key); + val = createStringObjectFromLongDouble(*(double*)dictGetVal(de)); + } else { + redisPanic("Type not handled in SCAN callback."); + } + + listAddNodeTail(keys, key); + if (val) listAddNodeTail(keys, val); } -void scanCommand(redisClient *c) { +/* This command implements SCAN, HSCAN and SSCAN commands. + * If object 'o' is passed, then it must be an Hash or Set object, otherwise + * if 'o' is NULL the command will operate on the dictionary associated with + * the current database. + * + * When 'o' is not NULL the function assumes that the first argument in + * the client arguments vector is a key so it skips it before iterating + * in order to parse options. 
+ * + * In the case of an Hash object the function returns both the field and value + * of every element on the Hash. */ +void scanGenericCommand(redisClient *c, robj *o) { int rv; int i, j; char buf[REDIS_LONGSTR_SIZE]; list *keys = listCreate(); listNode *node, *nextnode; unsigned long cursor = 0; - long count = 1; + long count = 10; sds pat; int patlen, patnoop = 1; + dict *ht; + + /* Object must be NULL (to iterate keys names), or the type of the object + * must be Set, Sorted Set, or Hash. */ + redisAssert(o == NULL || o->type == REDIS_SET || o->type == REDIS_HASH || + o->type == REDIS_ZSET); + + /* Set i to the first option argument. The previous one is the cursor. */ + i = (o == NULL) ? 2 : 3; /* Skip the key argument if needed. */ /* Use sscanf because we need an *unsigned* long */ - rv = sscanf(c->argv[1]->ptr, "%lu", &cursor); + rv = sscanf(c->argv[i-1]->ptr, "%lu", &cursor); if (rv != 1) { addReplyError(c, "invalid cursor"); goto cleanup; } - i = 2; + /* Step 1: Parse options. */ while (i < c->argc) { j = c->argc - i; if (!strcasecmp(c->argv[i]->ptr, "count") && j >= 2) { @@ -374,29 +418,92 @@ void scanCommand(redisClient *c) { } } - do { - cursor = dictScan(c->db->dict, cursor, scanCallback, keys); - } while (cursor && listLength(keys) < count); + /* Step 2: Iterate the collection. + * + * Note that if the object is encoded with a ziplist, intset, or any other + * representation that is not an hash table, we are sure that it is also + * composed of a small number of elements. So to avoid taking state we + * just return everything inside the object in a single call, setting the + * cursor to zero to signal the end of the iteration. */ - /* Filter keys */ + /* Handle the case of an hash table. 
*/ + ht = NULL; + if (o == NULL) { + ht = c->db->dict; + } else if (o->type == REDIS_SET && o->encoding == REDIS_ENCODING_HT) { + ht = o->ptr; + } else if (o->type == REDIS_HASH && o->encoding == REDIS_ENCODING_HT) { + ht = o->ptr; + count *= 2; /* We return key / value for this type. */ + } else if (o->type == REDIS_ZSET && o->encoding == REDIS_ENCODING_SKIPLIST) { + zset *zs = o->ptr; + ht = zs->dict; + count *= 2; /* We return key / value for this type. */ + } + + if (ht) { + void *privdata[2]; + + /* We pass two pointers to the callback: the list to which it will + * add new elements, and the object containing the dictionary so that + * it is possible to fetch more data in a type-dependent way. */ + privdata[0] = keys; + privdata[1] = o; + do { + cursor = dictScan(ht, cursor, scanCallback, privdata); + } while (cursor && listLength(keys) < count); + } else if (o->type == REDIS_SET) { + int pos = 0; + long long ll; + + while(intsetGet(o->ptr,pos++,&ll)) + listAddNodeTail(keys,createStringObjectFromLongLong(ll)); + } else if (o->type == REDIS_HASH || o->type == REDIS_ZSET) { + unsigned char *p = ziplistIndex(o->ptr,0); + unsigned char *vstr; + unsigned int vlen; + long long vll; + + while(p) { + ziplistGet(p,&vstr,&vlen,&vll); + listAddNodeTail(keys, + (vstr != NULL) ? createStringObject((char*)vstr,vlen) : + createStringObjectFromLongLong(vll)); + ziplistNext(o->ptr,p); + } + } else { + redisPanic("Not handled encoding in SCAN."); + } + + /* Step 3: Filter elements. */ node = listFirst(keys); while (node) { robj *kobj = listNodeValue(node); nextnode = listNextNode(node); - /* Keep key iff pattern matches and it hasn't expired */ + /* Keep key iff pattern matches and, if we are iterating the key + * space, check that the key hasn't expired. 
*/ if ((patnoop || stringmatchlen(pat, patlen, kobj->ptr, sdslen(kobj->ptr), 0)) && - (expireIfNeeded(c->db, kobj) == 0)) + (o != NULL || expireIfNeeded(c->db, kobj) == 0)) { /* Keep */ } else { decrRefCount(kobj); listDelNode(keys, node); + /* Also remove the value for hashes and sorted sets. */ + if (o && (o->type == REDIS_ZSET || o->type == REDIS_HASH)) { + node = nextnode; + kobj = listNodeValue(node); + nextnode = listNextNode(node); + decrRefCount(kobj); + listDelNode(keys, node); + } } node = nextnode; } + /* Step 4: Reply to the client. */ addReplyMultiBulkLen(c, 2); rv = snprintf(buf, sizeof(buf), "%lu", cursor); redisAssert(rv < sizeof(buf)); @@ -415,6 +522,11 @@ cleanup: listRelease(keys); } +/* The SCAN command completely relies on scanGenericCommand. */ +void scanCommand(redisClient *c) { + scanGenericCommand(c,NULL); +} + void dbsizeCommand(redisClient *c) { addReplyLongLong(c,dictSize(c->db->dict)); } From e74c235567e0be8bc4b0b6ca63610909acd1f405 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 28 Oct 2013 11:13:43 +0100 Subject: [PATCH 0243/2500] SCAN is a random command and does not require output sorting. Sorting the output helps when we want to turn a non-deterministic into a deterministic command, in that case this is not possible. 
--- src/redis.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/redis.c b/src/redis.c index 04313f2f4..ccbf6a5c7 100644 --- a/src/redis.c +++ b/src/redis.c @@ -210,7 +210,7 @@ struct redisCommand redisCommandTable[] = { {"pexpire",pexpireCommand,3,"w",0,NULL,1,1,1,0,0}, {"pexpireat",pexpireatCommand,3,"w",0,NULL,1,1,1,0,0}, {"keys",keysCommand,2,"rS",0,NULL,0,0,0,0,0}, - {"scan",scanCommand,-2,"RS",0,NULL,0,0,0,0,0}, + {"scan",scanCommand,-2,"rR",0,NULL,0,0,0,0,0}, {"dbsize",dbsizeCommand,1,"r",0,NULL,0,0,0,0,0}, {"auth",authCommand,2,"rslt",0,NULL,0,0,0,0,0}, {"ping",pingCommand,1,"rt",0,NULL,0,0,0,0,0}, From 99efa37a6b3ae74524565b073b16d64ff0981ada Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 28 Oct 2013 11:17:18 +0100 Subject: [PATCH 0244/2500] dictScan(): empty hash table requires special handling. --- src/dict.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/dict.c b/src/dict.c index 814d5f65b..72eecae38 100644 --- a/src/dict.c +++ b/src/dict.c @@ -754,6 +754,8 @@ unsigned long dictScan(dict *d, unsigned long s0, s1; unsigned long m0, m1; + if (dictSize(d) == 0) return 0; + if (!dictIsRehashing(d)) { t0 = &(d->ht[0]); m0 = t0->sizemask; From e96ffac5636967f994b8b6b75071f52b644fa190 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 28 Oct 2013 11:17:32 +0100 Subject: [PATCH 0245/2500] SSCAN implemented. 
--- src/redis.c | 2 ++ src/redis.h | 4 +++- src/t_set.c | 8 ++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/redis.c b/src/redis.c index ccbf6a5c7..6cfa0ec51 100644 --- a/src/redis.c +++ b/src/redis.c @@ -165,6 +165,7 @@ struct redisCommand redisCommandTable[] = { {"sdiff",sdiffCommand,-2,"rS",0,NULL,1,-1,1,0,0}, {"sdiffstore",sdiffstoreCommand,-3,"wm",0,NULL,1,-1,1,0,0}, {"smembers",sinterCommand,2,"rS",0,NULL,1,1,1,0,0}, + {"sscan",sscanCommand,-3,"rR",0,NULL,1,1,1,0,0}, {"zadd",zaddCommand,-4,"wm",0,NULL,1,1,1,0,0}, {"zincrby",zincrbyCommand,4,"wm",0,NULL,1,1,1,0,0}, {"zrem",zremCommand,-3,"w",0,NULL,1,1,1,0,0}, @@ -1227,6 +1228,7 @@ void createSharedObjects(void) { shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n")); shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n")); shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n")); + shared.emptyscan = createObject(REDIS_STRING,sdsnew("*2\r\n$1\r\n0\r\n*0\r\n")); shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew( "-WRONGTYPE Operation against a key holding the wrong kind of value\r\n")); shared.nokeyerr = createObject(REDIS_STRING,sdsnew( diff --git a/src/redis.h b/src/redis.h index 7b643017a..7d0a733e0 100644 --- a/src/redis.h +++ b/src/redis.h @@ -504,7 +504,7 @@ struct sharedObjectsStruct { *masterdownerr, *roslaveerr, *execaborterr, *noautherr, *noreplicaserr, *oomerr, *plus, *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *psubscribebulk, *punsubscribebulk, *del, *rpop, *lpop, - *lpush, + *lpush, *emptyscan, *select[REDIS_SHARED_SELECT_CMDS], *integers[REDIS_SHARED_INTEGERS], *mbulkhdr[REDIS_SHARED_BULKHDR_LEN], /* "*\r\n" */ @@ -1194,6 +1194,7 @@ void signalFlushedDb(int dbid); unsigned int getKeysInSlot(unsigned int hashslot, robj **keys, unsigned int count); unsigned int countKeysInSlot(unsigned int hashslot); int verifyClusterConfigWithData(void); +void scanGenericCommand(redisClient *c, robj *o); /* API to get key arguments from 
commands */ #define REDIS_GETKEYS_ALL 0 @@ -1286,6 +1287,7 @@ void sunionCommand(redisClient *c); void sunionstoreCommand(redisClient *c); void sdiffCommand(redisClient *c); void sdiffstoreCommand(redisClient *c); +void sscanCommand(redisClient *c); void syncCommand(redisClient *c); void flushdbCommand(redisClient *c); void flushallCommand(redisClient *c); diff --git a/src/t_set.c b/src/t_set.c index ab65e23f3..ebffc5dc0 100644 --- a/src/t_set.c +++ b/src/t_set.c @@ -906,3 +906,11 @@ void sdiffCommand(redisClient *c) { void sdiffstoreCommand(redisClient *c) { sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF); } + +void sscanCommand(redisClient *c) { + robj *set; + + if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.emptyscan)) == NULL || + checkType(c,set,REDIS_SET)) return; + scanGenericCommand(c,set); +} From 6618167a9fd19671f932d7d06c741777887de817 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 28 Oct 2013 11:32:34 +0100 Subject: [PATCH 0246/2500] HSCAN implemented. --- src/db.c | 2 +- src/redis.c | 1 + src/redis.h | 1 + src/t_hash.c | 8 ++++++++ 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/db.c b/src/db.c index 65144e5ac..83806e713 100644 --- a/src/db.c +++ b/src/db.c @@ -469,7 +469,7 @@ void scanGenericCommand(redisClient *c, robj *o) { listAddNodeTail(keys, (vstr != NULL) ? 
createStringObject((char*)vstr,vlen) : createStringObjectFromLongLong(vll)); - ziplistNext(o->ptr,p); + p = ziplistNext(o->ptr,p); } } else { redisPanic("Not handled encoding in SCAN."); diff --git a/src/redis.c b/src/redis.c index 6cfa0ec51..f44c9908d 100644 --- a/src/redis.c +++ b/src/redis.c @@ -195,6 +195,7 @@ struct redisCommand redisCommandTable[] = { {"hvals",hvalsCommand,2,"rS",0,NULL,1,1,1,0,0}, {"hgetall",hgetallCommand,2,"r",0,NULL,1,1,1,0,0}, {"hexists",hexistsCommand,3,"r",0,NULL,1,1,1,0,0}, + {"hscan",hscanCommand,-3,"rR",0,NULL,1,1,1,0,0}, {"incrby",incrbyCommand,3,"wm",0,NULL,1,1,1,0,0}, {"decrby",decrbyCommand,3,"wm",0,NULL,1,1,1,0,0}, {"incrbyfloat",incrbyfloatCommand,3,"wm",0,NULL,1,1,1,0,0}, diff --git a/src/redis.h b/src/redis.h index 7d0a733e0..55c3ea77c 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1344,6 +1344,7 @@ void hkeysCommand(redisClient *c); void hvalsCommand(redisClient *c); void hgetallCommand(redisClient *c); void hexistsCommand(redisClient *c); +void hscanCommand(redisClient *c); void configCommand(redisClient *c); void hincrbyCommand(redisClient *c); void hincrbyfloatCommand(redisClient *c); diff --git a/src/t_hash.c b/src/t_hash.c index 3b87b92ca..3ccacdc12 100644 --- a/src/t_hash.c +++ b/src/t_hash.c @@ -759,3 +759,11 @@ void hexistsCommand(redisClient *c) { addReply(c, hashTypeExists(o,c->argv[2]) ? shared.cone : shared.czero); } + +void hscanCommand(redisClient *c) { + robj *o; + + if ((o= lookupKeyReadOrReply(c,c->argv[1],shared.emptyscan)) == NULL || + checkType(c,o,REDIS_HASH)) return; + scanGenericCommand(c,o); +} From b2618c6cdb508903be73833b15974414688d918d Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 28 Oct 2013 11:36:42 +0100 Subject: [PATCH 0247/2500] ZSCAN implemented. 
--- src/redis.c | 1 + src/redis.h | 1 + src/t_zset.c | 8 ++++++++ 3 files changed, 10 insertions(+) diff --git a/src/redis.c b/src/redis.c index f44c9908d..7932824e7 100644 --- a/src/redis.c +++ b/src/redis.c @@ -182,6 +182,7 @@ struct redisCommand redisCommandTable[] = { {"zscore",zscoreCommand,3,"r",0,NULL,1,1,1,0,0}, {"zrank",zrankCommand,3,"r",0,NULL,1,1,1,0,0}, {"zrevrank",zrevrankCommand,3,"r",0,NULL,1,1,1,0,0}, + {"zscan",zscanCommand,-3,"rR",0,NULL,1,1,1,0,0}, {"hset",hsetCommand,4,"wm",0,NULL,1,1,1,0,0}, {"hsetnx",hsetnxCommand,4,"wm",0,NULL,1,1,1,0,0}, {"hget",hgetCommand,3,"r",0,NULL,1,1,1,0,0}, diff --git a/src/redis.h b/src/redis.h index 55c3ea77c..4312b3421 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1340,6 +1340,7 @@ void hlenCommand(redisClient *c); void zremrangebyrankCommand(redisClient *c); void zunionstoreCommand(redisClient *c); void zinterstoreCommand(redisClient *c); +void zscanCommand(redisClient *c); void hkeysCommand(redisClient *c); void hvalsCommand(redisClient *c); void hgetallCommand(redisClient *c); diff --git a/src/t_zset.c b/src/t_zset.c index 1fcfd6bb6..9d3a79026 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -2207,3 +2207,11 @@ void zrankCommand(redisClient *c) { void zrevrankCommand(redisClient *c) { zrankGenericCommand(c, 1); } + +void zscanCommand(redisClient *c) { + robj *o; + + if ((o= lookupKeyReadOrReply(c,c->argv[1],shared.emptyscan)) == NULL || + checkType(c,o,REDIS_ZSET)) return; + scanGenericCommand(c,o); +} From e23fa0ec995d5df8684d847a2210668223e93627 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 28 Oct 2013 13:20:03 +0100 Subject: [PATCH 0248/2500] Aesthetic fix (missing space) into HSCAN and ZSCAN implementations. Thanks to @badboy for reporting. 
--- src/t_hash.c | 2 +- src/t_zset.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/t_hash.c b/src/t_hash.c index 3ccacdc12..298414f85 100644 --- a/src/t_hash.c +++ b/src/t_hash.c @@ -763,7 +763,7 @@ void hexistsCommand(redisClient *c) { void hscanCommand(redisClient *c) { robj *o; - if ((o= lookupKeyReadOrReply(c,c->argv[1],shared.emptyscan)) == NULL || + if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptyscan)) == NULL || checkType(c,o,REDIS_HASH)) return; scanGenericCommand(c,o); } diff --git a/src/t_zset.c b/src/t_zset.c index 9d3a79026..1ff289ce1 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -2211,7 +2211,7 @@ void zrevrankCommand(redisClient *c) { void zscanCommand(redisClient *c) { robj *o; - if ((o= lookupKeyReadOrReply(c,c->argv[1],shared.emptyscan)) == NULL || + if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptyscan)) == NULL || checkType(c,o,REDIS_ZSET)) return; scanGenericCommand(c,o); } From af411f4c9253a21b7d70325e9b7836ec5467753e Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 28 Oct 2013 18:13:39 +0100 Subject: [PATCH 0249/2500] redis-benchmark: update help for new __rand_int__ form. --- src/redis-benchmark.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c index 59b906ed7..bf18d2ab3 100644 --- a/src/redis-benchmark.c +++ b/src/redis-benchmark.c @@ -556,12 +556,11 @@ usage: " -dbnum SELECT the specified db number (default 0)\n" " -k 1=keep alive 0=reconnect (default 1)\n" " -r Use random keys for SET/GET/INCR, random values for SADD\n" -" Using this option the benchmark will get/set keys\n" -" in the form mykey_rand:000000012456 instead of constant\n" -" keys, the argument determines the max\n" -" number of values for the random number. 
For instance\n" -" if set to 10 only rand:000000000000 - rand:000000000009\n" -" range will be allowed.\n" +" Using this option the benchmark will expand the string __rand_int__\n" +" inside an argument with a 12 digits number in the specified range\n" +" from 0 to keyspacelen-1. The substitution changes every time a command\n" +" is executed. Default tests use this to hit random keys in the\n" +" specified range.\n" " -P Pipeline requests. Default 1 (no pipeline).\n" " -q Quiet. Just show query/sec values\n" " --csv Output in CSV format\n" From d1bdb17b423d9ddaf3b0410b7de7624dbe6ea174 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 30 Oct 2013 11:34:01 +0100 Subject: [PATCH 0250/2500] SCAN: tests moved to unit/scan.tcl. --- tests/test_helper.tcl | 1 + tests/unit/basic.tcl | 54 ------------------------------------------ tests/unit/scan.tcl | 55 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 54 deletions(-) create mode 100644 tests/unit/scan.tcl diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index d8de34e18..058ea0c09 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -17,6 +17,7 @@ set ::all_tests { unit/auth unit/protocol unit/basic + unit/scan unit/type/list unit/type/list-2 unit/type/list-3 diff --git a/tests/unit/basic.tcl b/tests/unit/basic.tcl index 67651666e..1f46ba666 100644 --- a/tests/unit/basic.tcl +++ b/tests/unit/basic.tcl @@ -761,58 +761,4 @@ start_server {tags {"basic"}} { r keys * r keys * } {dlskeriewrioeuwqoirueioqwrueoqwrueqw} - - test "SCAN basic" { - r flushdb - r debug populate 1000 - - set cur 0 - set keys {} - while 1 { - set res [r scan $cur] - set cur [lindex $res 0] - set k [lindex $res 1] - lappend keys $k - if {$cur == 0} break - } - - set keys [lsort -unique [concat {*}$keys]] - assert_equal 1000 [llength $keys] - } - - test "SCAN COUNT" { - r flushdb - r debug populate 1000 - - set cur 0 - set keys {} - while 1 { - set res [r scan $cur count 5] - set cur [lindex $res 0] - 
set k [lindex $res 1] - lappend keys $k - if {$cur == 0} break - } - - set keys [lsort -unique [concat {*}$keys]] - assert_equal 1000 [llength $keys] - } - - test "SCAN MATCH" { - r flushdb - r debug populate 1000 - - set cur 0 - set keys {} - while 1 { - set res [r scan $cur match "key:1??"] - set cur [lindex $res 0] - set k [lindex $res 1] - lappend keys $k - if {$cur == 0} break - } - - set keys [lsort -unique [concat {*}$keys]] - assert_equal 100 [llength $keys] - } } diff --git a/tests/unit/scan.tcl b/tests/unit/scan.tcl new file mode 100644 index 000000000..275a4d656 --- /dev/null +++ b/tests/unit/scan.tcl @@ -0,0 +1,55 @@ +start_server {tags {"scan"}} { + test "SCAN basic" { + r flushdb + r debug populate 1000 + + set cur 0 + set keys {} + while 1 { + set res [r scan $cur] + set cur [lindex $res 0] + set k [lindex $res 1] + lappend keys $k + if {$cur == 0} break + } + + set keys [lsort -unique [concat {*}$keys]] + assert_equal 1000 [llength $keys] + } + + test "SCAN COUNT" { + r flushdb + r debug populate 1000 + + set cur 0 + set keys {} + while 1 { + set res [r scan $cur count 5] + set cur [lindex $res 0] + set k [lindex $res 1] + lappend keys $k + if {$cur == 0} break + } + + set keys [lsort -unique [concat {*}$keys]] + assert_equal 1000 [llength $keys] + } + + test "SCAN MATCH" { + r flushdb + r debug populate 1000 + + set cur 0 + set keys {} + while 1 { + set res [r scan $cur match "key:1??"] + set cur [lindex $res 0] + set k [lindex $res 1] + lappend keys $k + if {$cur == 0} break + } + + set keys [lsort -unique [concat {*}$keys]] + assert_equal 100 [llength $keys] + } +} From a23bf277180b903c723569faf234fd47294aa228 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 30 Oct 2013 11:36:12 +0100 Subject: [PATCH 0251/2500] SCAN test keys sorting turned into more idiomatic Tcl. 
--- tests/unit/scan.tcl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/unit/scan.tcl b/tests/unit/scan.tcl index 275a4d656..af4bb7667 100644 --- a/tests/unit/scan.tcl +++ b/tests/unit/scan.tcl @@ -9,11 +9,11 @@ start_server {tags {"scan"}} { set res [r scan $cur] set cur [lindex $res 0] set k [lindex $res 1] - lappend keys $k + lappend keys {*}$k if {$cur == 0} break } - set keys [lsort -unique [concat {*}$keys]] + set keys [lsort -unique $keys] assert_equal 1000 [llength $keys] } @@ -27,11 +27,11 @@ start_server {tags {"scan"}} { set res [r scan $cur count 5] set cur [lindex $res 0] set k [lindex $res 1] - lappend keys $k + lappend keys {*}$k if {$cur == 0} break } - set keys [lsort -unique [concat {*}$keys]] + set keys [lsort -unique $keys] assert_equal 1000 [llength $keys] } @@ -45,11 +45,11 @@ start_server {tags {"scan"}} { set res [r scan $cur match "key:1??"] set cur [lindex $res 0] set k [lindex $res 1] - lappend keys $k + lappend keys {*}$k if {$cur == 0} break } - set keys [lsort -unique [concat {*}$keys]] + set keys [lsort -unique $keys] assert_equal 100 [llength $keys] } } From 35250fa9df04f99020def647abace0814cb35297 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 30 Oct 2013 11:58:04 +0100 Subject: [PATCH 0252/2500] Test: added SSCAN test. 
--- tests/unit/scan.tcl | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/unit/scan.tcl b/tests/unit/scan.tcl index af4bb7667..280d31411 100644 --- a/tests/unit/scan.tcl +++ b/tests/unit/scan.tcl @@ -52,4 +52,38 @@ start_server {tags {"scan"}} { set keys [lsort -unique $keys] assert_equal 100 [llength $keys] } + + foreach enc {intset hashtable} { + test "SSCAN with encoding $enc" { + # Create the Set + r del set + if {$enc eq {intset}} { + set prefix "" + } else { + set prefix "ele:" + } + set elements {} + for {set j 0} {$j < 100} {incr j} { + lappend elements ${prefix}${j} + } + r sadd set {*}$elements + + # Verify that the encoding matches. + assert {[r object encoding set] eq $enc} + + # Test SSCAN + set cur 0 + set keys {} + while 1 { + set res [r sscan set $cur] + set cur [lindex $res 0] + set k [lindex $res 1] + lappend keys {*}$k + if {$cur == 0} break + } + + set keys [lsort -unique $keys] + assert_equal 100 [llength $keys] + } + } } From 82dcd8550382c211107deb0ec0b66cdbd0b5d8d4 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 30 Oct 2013 16:24:39 +0100 Subject: [PATCH 0253/2500] Test: added HSCAN test. --- tests/unit/scan.tcl | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/unit/scan.tcl b/tests/unit/scan.tcl index 280d31411..02519ff7c 100644 --- a/tests/unit/scan.tcl +++ b/tests/unit/scan.tcl @@ -86,4 +86,44 @@ start_server {tags {"scan"}} { assert_equal 100 [llength $keys] } } + + foreach enc {ziplist hashtable} { + test "HSCAN with encoding $enc" { + # Create the Hash + r del hash + if {$enc eq {ziplist}} { + set count 30 + } else { + set count 1000 + } + set elements {} + for {set j 0} {$j < $count} {incr j} { + lappend elements key:$j $j + } + r hmset hash {*}$elements + + # Verify that the encoding matches. 
+ assert {[r object encoding hash] eq $enc} + + # Test HSCAN + set cur 0 + set keys {} + while 1 { + set res [r hscan hash $cur] + set cur [lindex $res 0] + set k [lindex $res 1] + lappend keys {*}$k + if {$cur == 0} break + } + + set keys2 {} + foreach {k v} $keys { + assert {$k eq "key:$v"} + lappend keys2 $k + } + + set keys2 [lsort -unique $keys2] + assert_equal $count [llength $keys2] + } + } } From cde7c072ba10bdd7c3691db47c5868840f8738fc Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 30 Oct 2013 16:25:47 +0100 Subject: [PATCH 0254/2500] Test: added ZSCAN test. --- tests/unit/scan.tcl | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/unit/scan.tcl b/tests/unit/scan.tcl index 02519ff7c..8951bcdee 100644 --- a/tests/unit/scan.tcl +++ b/tests/unit/scan.tcl @@ -126,4 +126,44 @@ start_server {tags {"scan"}} { assert_equal $count [llength $keys2] } } + + foreach enc {ziplist skiplist} { + test "ZSCAN with encoding $enc" { + # Create the Sorted Set + r del zset + if {$enc eq {ziplist}} { + set count 30 + } else { + set count 1000 + } + set elements {} + for {set j 0} {$j < $count} {incr j} { + lappend elements $j key:$j + } + r zadd zset {*}$elements + + # Verify that the encoding matches. + assert {[r object encoding zset] eq $enc} + + # Test ZSCAN + set cur 0 + set keys {} + while 1 { + set res [r zscan zset $cur] + set cur [lindex $res 0] + set k [lindex $res 1] + lappend keys {*}$k + if {$cur == 0} break + } + + set keys2 {} + foreach {k v} $keys { + assert {$k eq "key:$v"} + lappend keys2 $k + } + + set keys2 [lsort -unique $keys2] + assert_equal $count [llength $keys2] + } + } } From 996bffbfb6c97589a7d9de28c76186832b856658 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 30 Oct 2013 16:50:25 +0100 Subject: [PATCH 0255/2500] Test: added a SCAN test trying to trigger HT resize. 
--- tests/unit/scan.tcl | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/unit/scan.tcl b/tests/unit/scan.tcl index 8951bcdee..71c4e3655 100644 --- a/tests/unit/scan.tcl +++ b/tests/unit/scan.tcl @@ -166,4 +166,33 @@ start_server {tags {"scan"}} { assert_equal $count [llength $keys2] } } + + test "SCAN guarantees check under write load" { + r flushdb + r debug populate 100 + + # We start scanning here, so keys from 0 to 99 should all be + # reported at the end of the iteration. + set keys {} + while 1 { + set res [r scan $cur] + set cur [lindex $res 0] + set k [lindex $res 1] + lappend keys {*}$k + if {$cur == 0} break + # Write 10 random keys at every SCAN iteration. + for {set j 0} {$j < 10} {incr j} { + r set addedkey:[randomInt 1000] foo + } + } + + set keys2 {} + foreach k $keys { + if {[string length $k] > 6} continue + lappend keys2 $k + } + + set keys2 [lsort -unique $keys2] + assert_equal 100 [llength $keys2] + } } From 4f3d9c05956c01559c7544a4e2a224afd75641f2 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 31 Oct 2013 09:43:21 +0100 Subject: [PATCH 0256/2500] Regression test added for [SHZ]SCAN issue #1354. --- tests/unit/scan.tcl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/unit/scan.tcl b/tests/unit/scan.tcl index 71c4e3655..511dd1fc7 100644 --- a/tests/unit/scan.tcl +++ b/tests/unit/scan.tcl @@ -195,4 +195,12 @@ start_server {tags {"scan"}} { set keys2 [lsort -unique $keys2] assert_equal 100 [llength $keys2] } + + test "SSCAN with integer encoded object (issue #1345)" { + set objects {1 a} + r del set + r sadd set {*}$objects + set res [r sscan set 0 MATCH *a* COUNT 100] + assert_equal [lsort -unique [lindex $res 1]] {a} + } } From 34e471e748ca76c03210c100577a46cb234c49be Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 31 Oct 2013 10:32:33 +0100 Subject: [PATCH 0257/2500] scanGenericCommand() refactoring and handling of integer encoded elements. This commit fixes issue #1354. 
--- src/db.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/db.c b/src/db.c index 83806e713..234f80f53 100644 --- a/src/db.c +++ b/src/db.c @@ -480,15 +480,28 @@ void scanGenericCommand(redisClient *c, robj *o) { while (node) { robj *kobj = listNodeValue(node); nextnode = listNextNode(node); + int filter = 0; - /* Keep key iff pattern matches and, if we are iterating the key - * space, check that the key hasn't expired. */ - if ((patnoop || - stringmatchlen(pat, patlen, kobj->ptr, sdslen(kobj->ptr), 0)) && - (o != NULL || expireIfNeeded(c->db, kobj) == 0)) - { - /* Keep */ - } else { + /* Filter element if it does not match the pattern. */ + if (!filter && !patnoop) { + if (sdsEncodedObject(kobj)) { + if (!stringmatchlen(pat, patlen, kobj->ptr, sdslen(kobj->ptr), 0)) + filter = 1; + } else { + char buf[REDIS_LONGSTR_SIZE]; + int len; + + redisAssert(kobj->encoding == REDIS_ENCODING_INT); + len = ll2string(buf,sizeof(buf),(long)kobj->ptr); + if (!stringmatchlen(pat, patlen, buf, len, 0)) filter = 1; + } + } + + /* Filter element if it is an expired key. */ + if (!filter && o == NULL && expireIfNeeded(c->db, kobj)) filter = 1; + + /* Remove the element and its associted value if needed. */ + if (filter) { decrRefCount(kobj); listDelNode(keys, node); /* Also remove the value for hashes and sorted sets. */ From 653347d2538139ad121301ecef6311bad8cfa60e Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 31 Oct 2013 10:35:56 +0100 Subject: [PATCH 0258/2500] Inverted variable boolean value and name after scanGenericCommand() refactoring. 
--- src/db.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/db.c b/src/db.c index 234f80f53..d490e2aa2 100644 --- a/src/db.c +++ b/src/db.c @@ -370,7 +370,7 @@ void scanGenericCommand(redisClient *c, robj *o) { unsigned long cursor = 0; long count = 10; sds pat; - int patlen, patnoop = 1; + int patlen, use_pattern = 0; dict *ht; /* Object must be NULL (to iterate keys names), or the type of the object @@ -408,8 +408,9 @@ void scanGenericCommand(redisClient *c, robj *o) { pat = c->argv[i+1]->ptr; patlen = sdslen(pat); - /* The pattern is a no-op iff == "*" */ - patnoop = (pat[0] == '*' && patlen == 1); + /* The pattern always matches if it is exactly "*", so it is + * equivalent to disabling it. */ + use_pattern = !(pat[0] == '*' && patlen == 1); i += 2; } else { @@ -483,7 +484,7 @@ void scanGenericCommand(redisClient *c, robj *o) { int filter = 0; /* Filter element if it does not match the pattern. */ - if (!filter && !patnoop) { + if (!filter && use_pattern) { if (sdsEncodedObject(kobj)) { if (!stringmatchlen(pat, patlen, kobj->ptr, sdslen(kobj->ptr), 0)) filter = 1; From adcc1fc04db00e3c2a087bb4813dc293e364f2b0 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 31 Oct 2013 10:37:27 +0100 Subject: [PATCH 0259/2500] SSCAN with integer encoded object test improved. --- tests/unit/scan.tcl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/scan.tcl b/tests/unit/scan.tcl index 511dd1fc7..45498a514 100644 --- a/tests/unit/scan.tcl +++ b/tests/unit/scan.tcl @@ -202,5 +202,7 @@ start_server {tags {"scan"}} { r sadd set {*}$objects set res [r sscan set 0 MATCH *a* COUNT 100] assert_equal [lsort -unique [lindex $res 1]] {a} + set res [r sscan set 0 MATCH *1* COUNT 100] + assert_equal [lsort -unique [lindex $res 1]] {1} } } From e1a469d3b0e26a42e383eb488794755cb18bf2da Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 5 Nov 2013 11:24:24 +0100 Subject: [PATCH 0260/2500] Initial support for --replicas in redis-trib. 
--- src/redis-trib.rb | 123 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 109 insertions(+), 14 deletions(-) diff --git a/src/redis-trib.rb b/src/redis-trib.rb index 76eb6f436..b6bf2980c 100755 --- a/src/redis-trib.rb +++ b/src/redis-trib.rb @@ -61,6 +61,7 @@ class ClusterNode @info[:slots] = {} @info[:migrating] = {} @info[:importing] = {} + @info[:replicate] = false @dirty = false # True if we need to flush slots info into node. @friends = [] end @@ -172,16 +173,33 @@ class ClusterNode @dirty = true end + def set_as_replica(node_id) + @info[:replicate] = node_id + @dirty = true + end + def flush_node_config return if !@dirty - new = [] - @info[:slots].each{|s,val| - if val == :new - new << s - @info[:slots][s] = true + if @info[:replicate] + begin + @r.cluster("replicate",@info[:replicate]) + rescue + # If the cluster did not already joined it is possible that + # the slave does not know the master node yet. So on errors + # we return ASAP leaving the dirty flag set, to flush the + # config later. + return end - } - @r.cluster("addslots",*new) + else + new = [] + @info[:slots].each{|s,val| + if val == :new + new << s + @info[:slots][s] = true + end + } + @r.cluster("addslots",*new) + end @dirty = false end @@ -218,9 +236,14 @@ class ClusterNode }.join(",") role = self.has_flag?("master") ? "M" : "S" - "#{role}: #{self.info[:name]} #{self.to_s}\n"+ - " slots:#{slots} (#{self.slots.length} slots) "+ - "#{(self.info[:flags]-["myself"]).join(",")}" + + if self.info[:replicate] and @dirty + "S: #{self.info[:name]} #{self.to_s}" + else + "#{role}: #{self.info[:name]} #{self.to_s}\n"+ + " slots:#{slots} (#{self.slots.length} slots) "+ + "#{(self.info[:flags]-["myself"]).join(",")}" + end end # Return a single string representing nodes and associated slots. 
@@ -460,15 +483,68 @@ class RedisTrib end def alloc_slots - slots_per_node = ClusterHashSlots/@nodes.length - i = 0 + nodes_count = @nodes.length + masters_count = @nodes.length / (@replicas+1) + slots_per_node = ClusterHashSlots / masters_count + masters = [] + slaves = [] + + # The first step is to split instances by IP. This is useful as + # we'll try to allocate master nodes in different physical machines + # (as much as possible) and to allocate slaves of a given master in + # different physical machines as well. + # + # This code assumes just that if the IP is different, than it is more + # likely that the instance is running in a different physical host + # or at least a different virtual machine. + ips = {} @nodes.each{|n| + ips[n.info[:host]] = [] if !ips[n.info[:host]] + ips[n.info[:host]] << n + } + + # Select master instances + puts "Using #{masters_count} masters:" + while masters.length < masters_count + ips.each{|ip,nodes_list| + next if nodes_list.length == 0 + masters << nodes_list.shift + puts masters[-1] + nodes_count -= 1 + break if masters.length == masters_count + } + end + + # Alloc slots on masters + i = 0 + masters.each{|n| first = i*slots_per_node last = first+slots_per_node-1 last = ClusterHashSlots-1 if i == @nodes.length-1 n.add_slots first..last i += 1 } + + # Select N replicas for every master. + # We try to split the replicas among all the IPs with spare nodes + # trying to avoid the host where the master is running, if possible. + masters.each{|m| + i = 0 + while i < @replicas + ips.each{|ip,nodes_list| + next if nodes_list.length == 0 + # Skip instances with the same IP as the master if we + # have some more IPs available. 
+ next if ip == m.info[:host] && nodes_count > nodes_list.length + slave = nodes_list.shift + slave.set_as_replica(m.info[:name]) + nodes_count -= 1 + i += 1 + puts "#{m} replica ##{i} is #{slave}" + break if masters.length == masters_count + } + end + } end def flush_nodes_config @@ -667,7 +743,24 @@ class RedisTrib } end + # This is an helper function for create_cluster_cmd that verifies if + # the number of nodes and the specified replicas have a valid configuration + # where there are at least three master nodes and enough replicas per node. + def check_create_parameters + masters = @nodes.length/(@replicas+1) + if masters < 3 + puts "*** ERROR: Invalid configuration for cluster creation." + puts "*** Redis Cluster requires at least 3 master nodes." + puts "*** This is not possible with #{@nodes.length} nodes and #{@replicas} replicas per node." + puts "*** At least #{3*(@replicas+1)} nodes are required." + exit 1 + end + end + def create_cluster_cmd(argv,opt) + opt = {'replicas' => 0}.merge(opt) + @replicas = opt['replicas'].to_i + xputs ">>> Creating cluster" argv[0..-1].each{|n| node = ClusterNode.new(n) @@ -677,6 +770,7 @@ class RedisTrib node.assert_empty add_node(node) } + check_create_parameters xputs ">>> Performing hash slots allocation on #{@nodes.length} nodes..." alloc_slots show_nodes @@ -690,6 +784,7 @@ class RedisTrib # they are still empty with unassigned slots. sleep 1 wait_cluster_join + flush_nodes_config # Useful for the replicas check_cluster end @@ -758,11 +853,11 @@ COMMANDS={ } ALLOWED_OPTIONS={ - "create" => {"slaves" => false} + "create" => {"replicas" => true} } def show_help - puts "Usage: redis-trib " + puts "Usage: redis-trib " puts COMMANDS.each{|k,v| puts " #{k.ljust(10)} #{v[2]}" From 865d3b0f3305b1f8a8bc7e16ea84620514446c27 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 5 Nov 2013 11:56:11 +0100 Subject: [PATCH 0261/2500] removed not used vars in dictScan(). 
--- src/dict.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/dict.c b/src/dict.c index 72eecae38..bd54a5022 100644 --- a/src/dict.c +++ b/src/dict.c @@ -751,7 +751,6 @@ unsigned long dictScan(dict *d, { dictht *t0, *t1; const dictEntry *de; - unsigned long s0, s1; unsigned long m0, m1; if (dictSize(d) == 0) return 0; @@ -777,8 +776,6 @@ unsigned long dictScan(dict *d, t1 = &d->ht[0]; } - s0 = t0->size; - s1 = t1->size; m0 = t0->sizemask; m1 = t1->sizemask; From 522549729ffeb07b9166f39f344c0c7d82f9852d Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 5 Nov 2013 11:57:30 +0100 Subject: [PATCH 0262/2500] Pass int64_t to intsetGet() instead of long long. --- src/db.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/db.c b/src/db.c index d490e2aa2..2115515ce 100644 --- a/src/db.c +++ b/src/db.c @@ -455,7 +455,7 @@ void scanGenericCommand(redisClient *c, robj *o) { } while (cursor && listLength(keys) < count); } else if (o->type == REDIS_SET) { int pos = 0; - long long ll; + int64_t ll; while(intsetGet(o->ptr,pos++,&ll)) listAddNodeTail(keys,createStringObjectFromLongLong(ll)); From f6738923a66d5a816fa961b5ca469a2e97c4163c Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 5 Nov 2013 12:01:07 +0100 Subject: [PATCH 0263/2500] Cluster: initialize senderConfigEpoch and senderCurrentEpoch for warnings suppression. 
--- src/cluster.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index d4f0f9300..4a36eb3dc 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -869,7 +869,7 @@ int clusterProcessPacket(clusterLink *link) { uint32_t totlen = ntohl(hdr->totlen); uint16_t type = ntohs(hdr->type); uint16_t flags = ntohs(hdr->flags); - uint64_t senderCurrentEpoch, senderConfigEpoch; + uint64_t senderCurrentEpoch = 0, senderConfigEpoch = 0; clusterNode *sender; server.cluster->stats_bus_messages_received++; @@ -1684,7 +1684,7 @@ void clusterHandleSlaveFailover(void) { server.cluster->currentEpoch++; server.cluster->failover_auth_epoch = server.cluster->currentEpoch; redisLog(REDIS_WARNING,"Starting a failover election for epoch %llu.", - server.cluster->currentEpoch); + (unsigned long long) server.cluster->currentEpoch); clusterRequestFailoverAuth(); server.cluster->failover_auth_sent = 1; clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| From 56f53f7b3cc18ab3c685f0ca1508c26e911e5ab5 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 5 Nov 2013 12:16:29 +0100 Subject: [PATCH 0264/2500] HSCAN/ZSCAN: skip value when matching. This fixes issue #1360 and #1362. --- src/db.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/db.c b/src/db.c index 2115515ce..71b639a5a 100644 --- a/src/db.c +++ b/src/db.c @@ -505,11 +505,16 @@ void scanGenericCommand(redisClient *c, robj *o) { if (filter) { decrRefCount(kobj); listDelNode(keys, node); - /* Also remove the value for hashes and sorted sets. */ - if (o && (o->type == REDIS_ZSET || o->type == REDIS_HASH)) { - node = nextnode; + } + + /* If this is an hash or a sorted set, we have a flat list of + * key-value elements, so if this element was filtered, remove the + * value, or skip it if it was not filtered: we only match keys. 
*/ + if (o && (o->type == REDIS_ZSET || o->type == REDIS_HASH)) { + node = nextnode; + nextnode = listNextNode(node); + if (filter) { kobj = listNodeValue(node); - nextnode = listNextNode(node); decrRefCount(kobj); listDelNode(keys, node); } From 7dfcfe47b66c078d5cda6297bec943c73578e661 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 5 Nov 2013 15:19:44 +0100 Subject: [PATCH 0265/2500] Added tests for [SHZ]SCAN with MATCH. --- tests/unit/scan.tcl | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/unit/scan.tcl b/tests/unit/scan.tcl index 45498a514..2b1033e39 100644 --- a/tests/unit/scan.tcl +++ b/tests/unit/scan.tcl @@ -205,4 +205,25 @@ start_server {tags {"scan"}} { set res [r sscan set 0 MATCH *1* COUNT 100] assert_equal [lsort -unique [lindex $res 1]] {1} } + + test "SSCAN with PATTERN" { + r del mykey + r sadd mykey foo fab fiz foobar 1 2 3 4 + set res [r sscan mykey 0 MATCH foo* COUNT 10000] + lsort -unique [lindex $res 1] + } {foo foobar} + + test "HSCAN with PATTERN" { + r del mykey + r hmset mykey foo 1 fab 2 fiz 3 foobar 10 1 a 2 b 3 c 4 d + set res [r hscan mykey 0 MATCH foo* COUNT 10000] + lsort -unique [lindex $res 1] + } {1 10 foo foobar} + + test "ZSCAN with PATTERN" { + r del mykey + r zadd mykey 1 foo 2 fab 3 fiz 10 foobar + set res [r zscan mykey 0 MATCH foo* COUNT 10000] + lsort -unique [lindex $res 1] + } } From b97cff8b63ae1cf51d8b40db556abe08cb9222bf Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 5 Nov 2013 15:30:21 +0100 Subject: [PATCH 0266/2500] Use strtoul() instead of sscanf() in SCAN implementation. 
--- src/db.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/db.c b/src/db.c index 71b639a5a..c7de00496 100644 --- a/src/db.c +++ b/src/db.c @@ -364,7 +364,7 @@ void scanCallback(void *privdata, const dictEntry *de) { void scanGenericCommand(redisClient *c, robj *o) { int rv; int i, j; - char buf[REDIS_LONGSTR_SIZE]; + char buf[REDIS_LONGSTR_SIZE], *eptr; list *keys = listCreate(); listNode *node, *nextnode; unsigned long cursor = 0; @@ -381,9 +381,12 @@ void scanGenericCommand(redisClient *c, robj *o) { /* Set i to the first option argument. The previous one is the cursor. */ i = (o == NULL) ? 2 : 3; /* Skip the key argument if needed. */ - /* Use sscanf because we need an *unsigned* long */ - rv = sscanf(c->argv[i-1]->ptr, "%lu", &cursor); - if (rv != 1) { + /* Use strtoul() because we need an *unsigned* long, so + * getLongLongFromObject() does not cover the whole cursor space. */ + errno = 0; + cursor = strtoul(c->argv[i-1]->ptr, &eptr, 10); + if (isspace(((char*)c->argv[i-1])[0]) || eptr[0] != '\0' || errno == ERANGE) + { addReplyError(c, "invalid cursor"); goto cleanup; } From f31cf249b97d116dc9c0f8c1af3d33f6e3687050 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 5 Nov 2013 15:32:21 +0100 Subject: [PATCH 0267/2500] SCAN: when iterating ziplists or intsets always return cursor of 0. The previous implementation assumed that the first call always happens with cursor set to 0, this may not be the case, and we want to return 0 anyway otherwise the (broken) client code will loop forever. 
--- src/db.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/db.c b/src/db.c index c7de00496..1ed0bb2af 100644 --- a/src/db.c +++ b/src/db.c @@ -462,6 +462,7 @@ void scanGenericCommand(redisClient *c, robj *o) { while(intsetGet(o->ptr,pos++,&ll)) listAddNodeTail(keys,createStringObjectFromLongLong(ll)); + cursor = 0; } else if (o->type == REDIS_HASH || o->type == REDIS_ZSET) { unsigned char *p = ziplistIndex(o->ptr,0); unsigned char *vstr; @@ -475,6 +476,7 @@ void scanGenericCommand(redisClient *c, robj *o) { createStringObjectFromLongLong(vll)); p = ziplistNext(o->ptr,p); } + cursor = 0; } else { redisPanic("Not handled encoding in SCAN."); } From 1a1eb8bc8d93897a73ba9eb440373abf0db1ecf1 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 5 Nov 2013 15:47:50 +0100 Subject: [PATCH 0268/2500] SCAN code refactored to parse cursor first. The previous implementation of SCAN parsed the cursor in the generic function implementing SCAN, SSCAN, HSCAN and ZSCAN. The actual higher-level command implementation only checked for empty keys and return ASAP in that case. The result was that inverting the arguments of, for instance, SSCAN for example and write: SSCAN 0 key Instead of SSCAN key 0 Resulted into no error, since 0 is a non-existing key name very likely. Just the iterator returned no elements at all. In order to fix this issue the code was refactored to extract the function to parse the cursor and return the error. Every higher level command implementation now parses the cursor and later checks if the key exist or not. 
--- src/db.c | 38 ++++++++++++++++++++++++-------------- src/redis.h | 3 ++- src/t_hash.c | 4 +++- src/t_set.c | 4 +++- src/t_zset.c | 4 +++- 5 files changed, 35 insertions(+), 18 deletions(-) diff --git a/src/db.c b/src/db.c index 1ed0bb2af..8d538662e 100644 --- a/src/db.c +++ b/src/db.c @@ -350,6 +350,25 @@ void scanCallback(void *privdata, const dictEntry *de) { if (val) listAddNodeTail(keys, val); } +/* Try to parse a SCAN cursor stored ad object 'o': + * if the cursor is valid, store it as unsigned integer into *cursor and + * returns REDIS_OK. Otherwise return REDIS_ERR and send an error to the + * client. */ +int parseScanCursorOrReply(redisClient *c, robj *o, unsigned long *cursor) { + char *eptr; + + /* Use strtoul() because we need an *unsigned* long, so + * getLongLongFromObject() does not cover the whole cursor space. */ + errno = 0; + *cursor = strtoul(o->ptr, &eptr, 10); + if (isspace(((char*)o->ptr)[0]) || eptr[0] != '\0' || errno == ERANGE) + { + addReplyError(c, "invalid cursor"); + return REDIS_ERR; + } + return REDIS_OK; +} + /* This command implements SCAN, HSCAN and SSCAN commands. * If object 'o' is passed, then it must be an Hash or Set object, otherwise * if 'o' is NULL the command will operate on the dictionary associated with @@ -361,13 +380,12 @@ void scanCallback(void *privdata, const dictEntry *de) { * * In the case of an Hash object the function returns both the field and value * of every element on the Hash. */ -void scanGenericCommand(redisClient *c, robj *o) { +void scanGenericCommand(redisClient *c, robj *o, unsigned long cursor) { int rv; int i, j; - char buf[REDIS_LONGSTR_SIZE], *eptr; + char buf[REDIS_LONGSTR_SIZE]; list *keys = listCreate(); listNode *node, *nextnode; - unsigned long cursor = 0; long count = 10; sds pat; int patlen, use_pattern = 0; @@ -381,16 +399,6 @@ void scanGenericCommand(redisClient *c, robj *o) { /* Set i to the first option argument. The previous one is the cursor. */ i = (o == NULL) ? 
2 : 3; /* Skip the key argument if needed. */ - /* Use strtoul() because we need an *unsigned* long, so - * getLongLongFromObject() does not cover the whole cursor space. */ - errno = 0; - cursor = strtoul(c->argv[i-1]->ptr, &eptr, 10); - if (isspace(((char*)c->argv[i-1])[0]) || eptr[0] != '\0' || errno == ERANGE) - { - addReplyError(c, "invalid cursor"); - goto cleanup; - } - /* Step 1: Parse options. */ while (i < c->argc) { j = c->argc - i; @@ -548,7 +556,9 @@ cleanup: /* The SCAN command completely relies on scanGenericCommand. */ void scanCommand(redisClient *c) { - scanGenericCommand(c,NULL); + unsigned long cursor; + if (parseScanCursorOrReply(c,c->argv[1],&cursor) == REDIS_ERR) return; + scanGenericCommand(c,NULL,cursor); } void dbsizeCommand(redisClient *c) { diff --git a/src/redis.h b/src/redis.h index 4312b3421..f0b5aa862 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1194,7 +1194,8 @@ void signalFlushedDb(int dbid); unsigned int getKeysInSlot(unsigned int hashslot, robj **keys, unsigned int count); unsigned int countKeysInSlot(unsigned int hashslot); int verifyClusterConfigWithData(void); -void scanGenericCommand(redisClient *c, robj *o); +void scanGenericCommand(redisClient *c, robj *o, unsigned long cursor); +int parseScanCursorOrReply(redisClient *c, robj *o, unsigned long *cursor); /* API to get key arguments from commands */ #define REDIS_GETKEYS_ALL 0 diff --git a/src/t_hash.c b/src/t_hash.c index 298414f85..f5ceb36e9 100644 --- a/src/t_hash.c +++ b/src/t_hash.c @@ -762,8 +762,10 @@ void hexistsCommand(redisClient *c) { void hscanCommand(redisClient *c) { robj *o; + unsigned long cursor; + if (parseScanCursorOrReply(c,c->argv[2],&cursor) == REDIS_ERR) return; if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptyscan)) == NULL || checkType(c,o,REDIS_HASH)) return; - scanGenericCommand(c,o); + scanGenericCommand(c,o,cursor); } diff --git a/src/t_set.c b/src/t_set.c index ebffc5dc0..0ba8335aa 100644 --- a/src/t_set.c +++ b/src/t_set.c @@ -909,8 
+909,10 @@ void sdiffstoreCommand(redisClient *c) { void sscanCommand(redisClient *c) { robj *set; + unsigned long cursor; + if (parseScanCursorOrReply(c,c->argv[2],&cursor) == REDIS_ERR) return; if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.emptyscan)) == NULL || checkType(c,set,REDIS_SET)) return; - scanGenericCommand(c,set); + scanGenericCommand(c,set,cursor); } diff --git a/src/t_zset.c b/src/t_zset.c index 1ff289ce1..c3bcbb88f 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -2210,8 +2210,10 @@ void zrevrankCommand(redisClient *c) { void zscanCommand(redisClient *c) { robj *o; + unsigned long cursor; + if (parseScanCursorOrReply(c,c->argv[2],&cursor) == REDIS_ERR) return; if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptyscan)) == NULL || checkType(c,o,REDIS_ZSET)) return; - scanGenericCommand(c,o); + scanGenericCommand(c,o,cursor); } From 6e98e93c5fa1166b8b8b3940bf5ae704e13073c3 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 5 Nov 2013 17:23:11 +0100 Subject: [PATCH 0269/2500] Fixed typo in parseScanCursorOrReply(): ad -> at. Thanks to @badboy for reporting it. --- src/db.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/db.c b/src/db.c index 8d538662e..1bdcf9e83 100644 --- a/src/db.c +++ b/src/db.c @@ -350,7 +350,7 @@ void scanCallback(void *privdata, const dictEntry *de) { if (val) listAddNodeTail(keys, val); } -/* Try to parse a SCAN cursor stored ad object 'o': +/* Try to parse a SCAN cursor stored at object 'o': * if the cursor is valid, store it as unsigned integer into *cursor and * returns REDIS_OK. Otherwise return REDIS_ERR and send an error to the * client. 
*/ From 58efb0120411cda53e8e7c8c46b74d6f6f1ad1f0 Mon Sep 17 00:00:00 2001 From: Ryan Biesemeyer Date: Wed, 6 Nov 2013 08:31:57 +0000 Subject: [PATCH 0270/2500] Deprecate utils/redis-copy.rb in favor of redis-copy gem --- utils/redis-copy.rb | 67 ++++++++------------------------------------- 1 file changed, 12 insertions(+), 55 deletions(-) diff --git a/utils/redis-copy.rb b/utils/redis-copy.rb index d892e377f..7c5c52dd6 100644 --- a/utils/redis-copy.rb +++ b/utils/redis-copy.rb @@ -3,66 +3,23 @@ # # Copy the whole dataset from one Redis instance to another one # -# WARNING: currently hashes and sorted sets are not supported! This -# program should be updated. +# WARNING: this utility is deprecated and serves as a legacy adapter +# for the more-robust redis-copy gem. -require 'rubygems' -require 'redis' -require 'digest/sha1' +require 'shellwords' def redisCopy(opts={}) - sha1="" - src = Redis.new(:host => opts[:srchost], :port => opts[:srcport]) - dst = Redis.new(:host => opts[:dsthost], :port => opts[:dstport]) - puts "Loading key names..." - keys = src.keys('*') - puts "Copying #{keys.length} keys..." - c = 0 - keys.each{|k| - vtype = src.type?(k) - ttl = src.ttl(k).to_i if vtype != "none" - - if vtype == "string" - dst[k] = src[k] - elsif vtype == "list" - list = src.lrange(k,0,-1) - if list.length == 0 - # Empty list special case - dst.lpush(k,"") - dst.lpop(k) - else - list.each{|ele| - dst.rpush(k,ele) - } - end - elsif vtype == "set" - set = src.smembers(k) - if set.length == 0 - # Empty set special case - dst.sadd(k,"") - dst.srem(k,"") - else - set.each{|ele| - dst.sadd(k,ele) - } - end - elsif vtype == "none" - puts "WARNING: key '#{k}' was removed in the meanwhile." - end - - # Handle keys with an expire time set - if ttl != -1 and vtype != "none" - dst.expire(k,ttl) - end - - c = c+1 - if (c % 1000) == 0 - puts "#{c}/#{keys.length} completed" - end - } - puts "DONE!" 
+ src = "#{opts[:srchost]}:#{opts[:srcport]}" + dst = "#{opts[:dsthost]}:#{opts[:dstport]}" + `redis-copy #{src.shellescape} #{dst.shellescape}` +rescue Errno::ENOENT + $stderr.puts 'This utility requires the redis-copy executable', + 'from the redis-copy gem on https://rubygems.org', + 'To install it, run `gem install redis-copy`.' + exit 1 end +$stderr.puts "This utility is deprecated. Use the redis-copy gem instead." if ARGV.length != 4 puts "Usage: redis-copy.rb " exit 1 From 1071abc4a429ff6c04488edec48159cc200f02a8 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 5 Nov 2013 17:23:11 +0100 Subject: [PATCH 0271/2500] Sentinel: always send CONFIG REWRITE when changing instance role. This change makes Sentinel less fragile about a number of failure modes. This commit also fixes a different bug as a side effect, SLAVEOF command was sent multiple times without incrementing the pending commands count. --- src/db.c | 2 +- src/sentinel.c | 64 +++++++++++++++++++++++++++++++------------------- 2 files changed, 41 insertions(+), 25 deletions(-) diff --git a/src/db.c b/src/db.c index 8d538662e..1bdcf9e83 100644 --- a/src/db.c +++ b/src/db.c @@ -350,7 +350,7 @@ void scanCallback(void *privdata, const dictEntry *de) { if (val) listAddNodeTail(keys, val); } -/* Try to parse a SCAN cursor stored ad object 'o': +/* Try to parse a SCAN cursor stored at object 'o': * if the cursor is valid, store it as unsigned integer into *cursor and * returns REDIS_OK. Otherwise return REDIS_ERR and send an error to the * client. 
*/ diff --git a/src/sentinel.c b/src/sentinel.c index 4bea156d6..307a6071b 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1518,8 +1518,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { (mstime() - ri->master->info_refresh) < SENTINEL_INFO_PERIOD*2) { int retval; - retval = redisAsyncCommand(ri->cc, - sentinelDiscardReplyCallback, NULL, "SLAVEOF %s %d", + retval = sentinelSendSlaveOf(ri, ri->master->addr->ip, ri->master->addr->port); if (retval == REDIS_OK) @@ -2521,6 +2520,39 @@ char *sentinelGetObjectiveLeader(sentinelRedisInstance *master) { return winner; } +/* Send SLAVEOF to the specified instance, always followed by a + * CONFIG REWRITE command in order to store the new configuration on disk + * when possible (that is, if the Redis instance is recent enough to support + * config rewriting, and if the server was started with a configuration file). + * + * If Host is NULL the function sends "SLAVEOF NO ONE". + * + * The command returns REDIS_OK if the SLAVEOF command was accepted for + * (later) delivery otherwise REDIS_ERR. The command replies are just + * discarded. */ +int sentinelSendSlaveOf(sentinelRedisInstance *ri, char *host, int port) { + char portstr[32]; + + ll2string(portstr,sizeof(portstr),port); + + if (host == NULL) { + host = "NO"; + memcpy(portstr,"ONE",4); + } + + retval = redisAsyncCommand(ri->cc, + sentinelDiscardReplyCallback, NULL, "SLAVEOF %s %s", host, portstr); + if (retval == REDIS_ERR) return retval; + + ri->pending_commands++; + if (redisAsyncCommand(ri->cc, + sentinelDiscardReplyCallback, NULL, "CONFIG REWRITE") == REDIS_OK) + { + ri->pending_commands++; + } + return REDIS_OK; +} + /* Setup the master state to start a failover as a leader. * * State can be either: @@ -2750,10 +2782,8 @@ void sentinelFailoverSendSlaveOfNoOne(sentinelRedisInstance *ri) { * We actually register a generic callback for this command as we don't * really care about the reply. 
We check if it worked indirectly observing * if INFO returns a different role (master instead of slave). */ - retval = redisAsyncCommand(ri->promoted_slave->cc, - sentinelDiscardReplyCallback, NULL, "SLAVEOF NO ONE"); + retval = sentinelSendSlaveOf(ri->promoted_slave,NULL,0); if (retval != REDIS_OK) return; - ri->promoted_slave->pending_commands++; sentinelEvent(REDIS_NOTICE, "+failover-state-wait-promotion", ri->promoted_slave,"%@"); ri->failover_state = SENTINEL_FAILOVER_STATE_WAIT_PROMOTION; @@ -2823,10 +2853,6 @@ void sentinelFailoverDetectEnd(sentinelRedisInstance *master) { if (timeout && (master->flags & SRI_I_AM_THE_LEADER)) { dictIterator *di; dictEntry *de; - char master_port[32]; - - ll2string(master_port,sizeof(master_port), - master->promoted_slave->addr->port); di = dictGetIterator(master->slaves); while((de = dictNext(di)) != NULL) { @@ -2836,10 +2862,9 @@ void sentinelFailoverDetectEnd(sentinelRedisInstance *master) { if (slave->flags & (SRI_RECONF_DONE|SRI_RECONF_SENT|SRI_DISCONNECTED)) continue; - retval = redisAsyncCommand(slave->cc, - sentinelDiscardReplyCallback, NULL, "SLAVEOF %s %s", + retval = sentinelSendSlaveOf(slave, master->promoted_slave->addr->ip, - master_port); + master->promoted_slave->addr->port); if (retval == REDIS_OK) { sentinelEvent(REDIS_NOTICE,"+slave-reconf-sent-be",slave,"%@"); slave->flags |= SRI_RECONF_SENT; @@ -2892,15 +2917,11 @@ void sentinelFailoverReconfNextSlave(sentinelRedisInstance *master) { continue; /* Send SLAVEOF . 
*/ - ll2string(master_port,sizeof(master_port), - master->promoted_slave->addr->port); - retval = redisAsyncCommand(slave->cc, - sentinelDiscardReplyCallback, NULL, "SLAVEOF %s %s", + retval = sentinelSendSlaveOf(slave, master->promoted_slave->addr->ip, - master_port); + master->promoted_slave->addr->port); if (retval == REDIS_OK) { slave->flags |= SRI_RECONF_SENT; - slave->pending_commands++; slave->slave_reconf_sent_time = mstime(); sentinelEvent(REDIS_NOTICE,"+slave-reconf-sent",slave,"%@"); in_progress++; @@ -2982,13 +3003,11 @@ void sentinelFailoverStateMachine(sentinelRedisInstance *ri) { * back to the master as well, sending a best effort SLAVEOF command. */ void sentinelAbortFailover(sentinelRedisInstance *ri) { - char master_port[32]; dictIterator *di; dictEntry *de; int sentinel_role; redisAssert(ri->flags & SRI_FAILOVER_IN_PROGRESS); - ll2string(master_port,sizeof(master_port),ri->addr->port); /* Clear failover related flags from slaves. * Also if we are the leader make sure to send SLAVEOF commands to all the @@ -3004,10 +3023,7 @@ void sentinelAbortFailover(sentinelRedisInstance *ri) { { int retval; - retval = redisAsyncCommand(slave->cc, - sentinelDiscardReplyCallback, NULL, "SLAVEOF %s %s", - ri->addr->ip, - master_port); + retval = sentinelSendSlaveOf(slave,ri->addr->ip,ri->addr->port); if (retval == REDIS_OK) sentinelEvent(REDIS_NOTICE,"-slave-reconf-undo",slave,"%@"); } From 3be2b18d9fb42e27ea0b444a5a1dd7b6e1d83330 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 6 Nov 2013 11:21:44 +0100 Subject: [PATCH 0272/2500] Sentinel: increment pending_commands counter in two more places. AUTH and SCRIPT KILL were sent without incrementing the pending commands counter. Clearly this needs some kind of wrapper doing it for the caller in order to be less bug prone. 
--- src/sentinel.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 307a6071b..6bb407623 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1298,9 +1298,10 @@ void sentinelSendAuthIfNeeded(sentinelRedisInstance *ri, redisAsyncContext *c) { char *auth_pass = (ri->flags & SRI_MASTER) ? ri->auth_pass : ri->master->auth_pass; - if (auth_pass) - redisAsyncCommand(c, sentinelDiscardReplyCallback, NULL, "AUTH %s", - auth_pass); + if (auth_pass) { + if (redisAsyncCommand(c, sentinelDiscardReplyCallback, NULL, "AUTH %s", + auth_pass) == REDIS_OK) ri->pending_commands++; + } } /* Create the async connections for the specified instance if the instance @@ -1689,8 +1690,10 @@ void sentinelPingReplyCallback(redisAsyncContext *c, void *reply, void *privdata (ri->flags & SRI_S_DOWN) && !(ri->flags & SRI_SCRIPT_KILL_SENT)) { - redisAsyncCommand(ri->cc, - sentinelDiscardReplyCallback, NULL, "SCRIPT KILL"); + if (redisAsyncCommand(ri->cc, + sentinelDiscardReplyCallback, NULL, + "SCRIPT KILL") == REDIS_OK) + ri->pending_commands++; ri->flags |= SRI_SCRIPT_KILL_SENT; } } From b64dbf7387b3433fc6275d102a73d38a44e4ba37 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 6 Nov 2013 11:23:49 +0100 Subject: [PATCH 0273/2500] Sentinel: sentinelSendSlaveOf() was missing a var and the prototype. 
--- src/sentinel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index 6bb407623..09052e81e 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -327,6 +327,7 @@ sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master); void sentinelScheduleScriptExecution(char *path, ...); void sentinelStartFailover(sentinelRedisInstance *master, int state); void sentinelDiscardReplyCallback(redisAsyncContext *c, void *reply, void *privdata); +int sentinelSendSlaveOf(sentinelRedisInstance *ri, char *host, int port); /* ========================= Dictionary types =============================== */ @@ -2535,6 +2536,7 @@ char *sentinelGetObjectiveLeader(sentinelRedisInstance *master) { * discarded. */ int sentinelSendSlaveOf(sentinelRedisInstance *ri, char *host, int port) { char portstr[32]; + int retval; ll2string(portstr,sizeof(portstr),port); @@ -2899,7 +2901,6 @@ void sentinelFailoverReconfNextSlave(sentinelRedisInstance *master) { { sentinelRedisInstance *slave = dictGetVal(de); int retval; - char master_port[32]; /* Skip the promoted slave, and already configured slaves. */ if (slave->flags & (SRI_PROMOTED|SRI_RECONF_DONE)) continue; From 605ba6ad38c4dd7454dc43b52e66f61d00d5ca4a Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 7 Nov 2013 16:12:06 +0100 Subject: [PATCH 0274/2500] redis-trib: fixed slot allocation when --replicas is used. 
--- src/redis-trib.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/redis-trib.rb b/src/redis-trib.rb index b6bf2980c..b9a9ee606 100755 --- a/src/redis-trib.rb +++ b/src/redis-trib.rb @@ -517,10 +517,10 @@ class RedisTrib # Alloc slots on masters i = 0 - masters.each{|n| + masters.each_with_index{|n,masternum| first = i*slots_per_node last = first+slots_per_node-1 - last = ClusterHashSlots-1 if i == @nodes.length-1 + last = ClusterHashSlots-1 if masternum == masters.length-1 n.add_slots first..last i += 1 } From 8c2127f9c9f94b5094ede5f961a886048b88ced5 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 7 Nov 2013 23:53:18 +0100 Subject: [PATCH 0275/2500] Fix broken rdbWriteRaw() return value check in rdb.c. Thanks to @PhoneLi for reporting. --- src/rdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rdb.c b/src/rdb.c index d1804d745..f04f6defd 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -99,7 +99,7 @@ int rdbSaveLen(rio *rdb, uint32_t len) { buf[0] = (REDIS_RDB_32BITLEN<<6); if (rdbWriteRaw(rdb,buf,1) == -1) return -1; len = htonl(len); - if (rdbWriteRaw(rdb,&len,4) == -4) return -1; + if (rdbWriteRaw(rdb,&len,4) == -1) return -1; nwritten = 1+4; } return nwritten; From 4666966c9f51eab888f393b12165d2a5b1c43bd1 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 8 Nov 2013 10:32:16 +0100 Subject: [PATCH 0276/2500] Cluster: refactoring of slots update code and more. The commit also introduces detection of nodes publishing not updated configuration. More work in progress to send an UPDATE packet to inform of the config change. 
--- src/cluster.c | 152 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 98 insertions(+), 54 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 4a36eb3dc..998f2eca9 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1029,7 +1029,7 @@ int clusterProcessPacket(clusterLink *link) { } } - /* Update master/slave state */ + /* Check for role switch: slave -> master or master -> slave. */ if (sender) { if (!memcmp(hdr->slaveof,REDIS_NODE_NULL_NAME, sizeof(hdr->slaveof))) @@ -1079,67 +1079,111 @@ int clusterProcessPacket(clusterLink *link) { } /* Update our info about served slots. + * * Note: this MUST happen after we update the master/slave state * so that REDIS_NODE_MASTER flag will be set. */ - if (sender && sender->flags & REDIS_NODE_MASTER) { - int changes, j; - changes = - memcmp(sender->slots,hdr->myslots,sizeof(hdr->myslots)) != 0; - if (changes) { - clusterNode *curmaster, *newmaster = NULL; + /* Many checks are only needed if the set of served slots this + * instance claims is different compared to the set of slots we have for + * it. Check this ASAP to avoid other computational expansive checks later. */ + clusterNode *sender_master = NULL; /* Sender or its master if it is a slave. */ + int dirty_slots = 0; /* Sender claimed slots don't match my view? */ - /* Here we set curmaster to this node or the node this node - * replicates to if it's a slave. In the for loop we are - * interested to check if slots are taken away from curmaster. */ - if (server.cluster->myself->flags & REDIS_NODE_MASTER) - curmaster = server.cluster->myself; - else - curmaster = server.cluster->myself->slaveof; + if (sender) { + sender_master = (sender->flags & REDIS_NODE_MASTER) ? 
sender : + sender->slaveof; + if (sender_master) { + dirty_slots = memcmp(sender_master->slots, + hdr->myslots,sizeof(hdr->myslots)) != 0; + } + } - for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) { - if (bitmapTestBit(hdr->myslots,j)) { - /* We rebind the slot to the new node claiming it if: - * 1) The slot was unassigned. - * 2) The new node claims it with a greater configEpoch. */ - if (server.cluster->slots[j] == sender) continue; - if (server.cluster->slots[j] == NULL || - server.cluster->slots[j]->configEpoch < - senderConfigEpoch) - { - if (server.cluster->slots[j] == curmaster) - newmaster = sender; - clusterDelSlot(j); - clusterAddSlot(sender,j); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); - } - } else { - /* This node claims to no longer handling the slot, - * however we don't change our config as this is likely: - * 1) Rehashing in progress. - * 2) Failover. - * In both cases we'll be informed about who is serving - * the slot eventually. In the meantime it's up to the - * original owner to try to redirect our clients to the - * right node. */ + /* 1) We check if the sender (master) is claiming slots that we belive to + * serve as a master, or replicate as a slave, but with a configEpoch + * that is newer: in that case we rebind the slots to the claiming node + * in our routing table. + * + * Moreover if we are left with 0 slots to serve, we reconfigure as + * a replica of the sender. */ + if (sender && sender->flags & REDIS_NODE_MASTER && dirty_slots) { + int j; + clusterNode *curmaster, *newmaster = NULL; + + /* Here we set curmaster to this node or the node this node + * replicates to if it's a slave. In the for loop we are + * interested to check if slots are taken away from curmaster. 
*/ + if (server.cluster->myself->flags & REDIS_NODE_MASTER) + curmaster = server.cluster->myself; + else + curmaster = server.cluster->myself->slaveof; + + for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) { + if (bitmapTestBit(hdr->myslots,j)) { + /* We rebind the slot to the new node claiming it if: + * 1) The slot was unassigned. + * 2) The new node claims it with a greater configEpoch. */ + if (server.cluster->slots[j] == sender) continue; + if (server.cluster->slots[j] == NULL || + server.cluster->slots[j]->configEpoch < + senderConfigEpoch) + { + if (server.cluster->slots[j] == curmaster) + newmaster = sender; + clusterDelSlot(j); + clusterAddSlot(sender,j); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_FSYNC_CONFIG); } } + } - /* If at least one slot was reassigned from a node to another node - * with a greater configEpoch, it is possible that: - * 1) We are a master is left without slots. This means that we were - * failed over and we should turn into a replica of the new - * master. - * 2) We are a slave and our master is left without slots. We need - * to replicate to the new slots owner. */ - if (newmaster && curmaster->numslots == 0) { - redisLog(REDIS_WARNING,"Configuration change detected. Reconfiguring myself as a replica of %.40s", sender->name); - clusterSetMaster(sender); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); + /* If at least one slot was reassigned from a node to another node + * with a greater configEpoch, it is possible that: + * 1) We are a master is left without slots. This means that we were + * failed over and we should turn into a replica of the new + * master. + * 2) We are a slave and our master is left without slots. We need + * to replicate to the new slots owner. */ + if (newmaster && curmaster->numslots == 0) { + redisLog(REDIS_WARNING,"Configuration change detected. 
Reconfiguring myself as a replica of %.40s", sender->name); + clusterSetMaster(sender); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_FSYNC_CONFIG); + } + } + + /* 2) We also check for the reverse condition, that is, the sender claims + * to serve slots we know are served by a master with a greater + * configEpoch. If this happens we inform the sender. + * + * This is useful because sometimes after a partition heals, a reappearing + * master may be the last one to claim a given set of hash slots, but with + * a configuration that other instances know to be deprecated. Example: + * + * A and B are master and slave for slots 1,2,3. + * A is partitioned away, B gets promoted. + * B is partitioned away, and A returns available. + * + * Usually B would PING A publishing its set of served slots and its + * configEpoch, but because of the partition B can't inform A of the new + * configuration, so other nodes that have an updated table must do it. + * In this way A will stop to act as a master (or can try to failover if + * there are the conditions to win the election). */ + if (sender && dirty_slots) { + int j; + + for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) { + if (bitmapTestBit(hdr->myslots,j)) { + if (server.cluster->slots[j] == sender || + server.cluster->slots[j] == NULL) continue; + if (server.cluster->slots[j]->configEpoch > + senderConfigEpoch) + { + printf("MASTER or SLAVE have old config\n"); + break; + } } } } From a19147c2facb94d61a850913af70c97071669584 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 8 Nov 2013 16:26:50 +0100 Subject: [PATCH 0277/2500] Cluster: UPDATE msg data structure and sending function. 
--- src/cluster.c | 25 +++++++++++++++++++++++-- src/cluster.h | 14 +++++++++++++- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 998f2eca9..ceafc5c3b 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -59,6 +59,7 @@ void clusterSetMaster(clusterNode *n); void clusterHandleSlaveFailover(void); int bitmapTestBit(unsigned char *bitmap, int pos); void clusterDoBeforeSleep(int flags); +void clusterSendUpdate(clusterLink *link, clusterNode *node); /* ----------------------------------------------------------------------------- * Initialization @@ -1181,8 +1182,11 @@ int clusterProcessPacket(clusterLink *link) { if (server.cluster->slots[j]->configEpoch > senderConfigEpoch) { - printf("MASTER or SLAVE have old config\n"); - break; + redisLog(REDIS_WARNING, + "Node %.40s has old slots configuration, sending " + "an UPDATE message about %.40s\n", + sender->name, server.cluster->slots[j]->name); + clusterSendUpdate(sender->link,server.cluster->slots[j]); } } } @@ -1413,6 +1417,9 @@ void clusterBuildMessageHdr(clusterMsg *hdr, int type) { if (type == CLUSTERMSG_TYPE_FAIL) { totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData); totlen += sizeof(clusterMsgDataFail); + } else if (type == CLUSTERMSG_TYPE_UPDATE) { + totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + totlen += sizeof(clusterMsgDataUpdate); } hdr->totlen = htonl(totlen); /* For PING, PONG, and MEET, fixing the totlen field is up to the caller. */ @@ -1562,6 +1569,20 @@ void clusterSendFail(char *nodename) { clusterBroadcastMessage(buf,ntohl(hdr->totlen)); } +/* Send an UPDATE message to the specified link carrying the specified 'node' + * slots configuration. The node name, slots bitmap, and configEpoch info + * are included. 
*/ +void clusterSendUpdate(clusterLink *link, clusterNode *node) { + unsigned char buf[4096]; + clusterMsg *hdr = (clusterMsg*) buf; + + clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_UPDATE); + memcpy(hdr->data.update.nodecfg.nodename,node->name,REDIS_CLUSTER_NAMELEN); + hdr->data.update.nodecfg.configEpoch = htonu64(node->configEpoch); + memcpy(hdr->data.update.nodecfg.slots,node->slots,sizeof(node->slots)); + clusterSendMessage(link,buf,ntohl(hdr->totlen)); +} + /* ----------------------------------------------------------------------------- * CLUSTER Pub/Sub support * diff --git a/src/cluster.h b/src/cluster.h index 9c598be01..658a364bd 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -112,7 +112,8 @@ typedef struct clusterState { #define CLUSTERMSG_TYPE_FAIL 3 /* Mark node xxx as failing */ #define CLUSTERMSG_TYPE_PUBLISH 4 /* Pub/Sub Publish propagation */ #define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */ -#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you can failover. */ +#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you have my vote */ +#define CLUSTERMSG_TYPE_UPDATE 7 /* Another node slots configuration */ /* Initially we don't know our "name", but we'll find it once we connect * to the first node, using the getsockname() function. Then we'll use this @@ -137,6 +138,12 @@ typedef struct { unsigned char bulk_data[8]; /* defined as 8 just for alignment concerns. */ } clusterMsgDataPublish; +typedef struct { + uint64_t configEpoch; /* Config epoch of the specified instance. */ + char nodename[REDIS_CLUSTER_NAMELEN]; /* Name of the slots owner. */ + unsigned char slots[REDIS_CLUSTER_SLOTS/8]; /* Slots bitmap. 
*/ +} clusterMsgDataUpdate; + union clusterMsgData { /* PING, MEET and PONG */ struct { @@ -153,6 +160,11 @@ union clusterMsgData { struct { clusterMsgDataPublish msg; } publish; + + /* UPDATE */ + struct { + clusterMsgDataUpdate nodecfg; + } update; }; typedef struct { From 36db83ac50dcb0947cf470c843cd22b718dac91c Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 8 Nov 2013 17:02:10 +0100 Subject: [PATCH 0278/2500] Cluster: slots update refactored + UPDATE msg processing. Now there is a function that handles the update of the local slot configuration every time we have some new info about a node and its set of served slots and configEpoch. Moreover the UPDATE packets are now processed when received (it was a work in progress in the previous commit). --- src/cluster.c | 168 +++++++++++++++++++++++++++++++------------------- 1 file changed, 103 insertions(+), 65 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index ceafc5c3b..c786acf67 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -856,6 +856,84 @@ int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link, int port) { return 1; } +/* Reconfigure the specified node 'n' as a master. This function is called when + * a node that we believed to be a slave is now acting as master in order to + * update the state of the node. */ +void clusterSetNodeAsMaster(clusterNode *n) { + if (n->flags & REDIS_NODE_MASTER) return; + + if (n->slaveof) clusterNodeRemoveSlave(n->slaveof,n); + n->flags &= ~REDIS_NODE_SLAVE; + n->flags |= REDIS_NODE_MASTER; + n->slaveof = NULL; + + /* Update config and state. */ + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); +} + +/* This function is called when we receive a master configuration via a + * PING, PONG or UPDATE packet. What we receive is a node, a configEpoch of the + * node, and the set of slots claimed under this configEpoch.
+ * + * What we do is to rebind the slots with newer configuration compared to our + * local configuration, and if needed, we turn ourself into a replica of the + * node (see the function comments for more info). + * + * The 'sender' is the node for which we received a configuration update. + * Sometimes it is not actaully the "Sender" of the information, like in the case + * we receive the info via an UPDATE packet. */ +void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoch, + unsigned char *slots) +{ + int j; + clusterNode *curmaster, *newmaster = NULL; + + /* Here we set curmaster to this node or the node this node + * replicates to if it's a slave. In the for loop we are + * interested to check if slots are taken away from curmaster. */ + if (server.cluster->myself->flags & REDIS_NODE_MASTER) + curmaster = server.cluster->myself; + else + curmaster = server.cluster->myself->slaveof; + + for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) { + if (bitmapTestBit(slots,j)) { + /* We rebind the slot to the new node claiming it if: + * 1) The slot was unassigned. + * 2) The new node claims it with a greater configEpoch. */ + if (server.cluster->slots[j] == sender) continue; + if (server.cluster->slots[j] == NULL || + server.cluster->slots[j]->configEpoch < + senderConfigEpoch) + { + if (server.cluster->slots[j] == curmaster) + newmaster = sender; + clusterDelSlot(j); + clusterAddSlot(sender,j); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_FSYNC_CONFIG); + } + } + } + + /* If at least one slot was reassigned from a node to another node + * with a greater configEpoch, it is possible that: + * 1) We are a master left without slots. This means that we were + * failed over and we should turn into a replica of the new + * master. + * 2) We are a slave and our master is left without slots. We need + * to replicate to the new slots owner. 
*/ + if (newmaster && curmaster->numslots == 0) { + redisLog(REDIS_WARNING,"Configuration change detected. Reconfiguring myself as a replica of %.40s", sender->name); + clusterSetMaster(sender); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_FSYNC_CONFIG); + } +} + /* When this function is called, there is a packet to process starting * at node->rcvbuf. Releasing the buffer is up to the caller, so this * function should just handle the higher level stuff of processing the @@ -905,6 +983,11 @@ int clusterProcessPacket(clusterLink *link) { type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) { uint32_t explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + if (totlen != explen) return 1; + } else if (type == CLUSTERMSG_TYPE_UPDATE) { + uint32_t explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + + explen += sizeof(clusterMsgDataUpdate); if (totlen != explen) return 1; } @@ -1036,18 +1119,7 @@ int clusterProcessPacket(clusterLink *link) { sizeof(hdr->slaveof))) { /* Node is a master. */ - if (sender->flags & REDIS_NODE_SLAVE) { - /* Reconfigure node as master. */ - if (sender->slaveof) - clusterNodeRemoveSlave(sender->slaveof,sender); - sender->flags &= ~REDIS_NODE_SLAVE; - sender->flags |= REDIS_NODE_MASTER; - sender->slaveof = NULL; - - /* Update config and state. */ - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } + clusterSetNodeAsMaster(sender); } else { /* Node is a slave. */ clusterNode *master = clusterLookupNode(hdr->slaveof); @@ -1099,60 +1171,11 @@ int clusterProcessPacket(clusterLink *link) { } } - /* 1) We check if the sender (master) is claiming slots that we belive to - * serve as a master, or replicate as a slave, but with a configEpoch - * that is newer: in that case we rebind the slots to the claiming node - * in our routing table. - * - * Moreover if we are left with 0 slots to serve, we reconfigure as - * a replica of the sender. 
*/ + /* 1) If the sender of the message is a master, and we detected that the + * set of slots it claims changed, scan the slots to see if we need + * to update our configuration. */ if (sender && sender->flags & REDIS_NODE_MASTER && dirty_slots) { - int j; - clusterNode *curmaster, *newmaster = NULL; - - /* Here we set curmaster to this node or the node this node - * replicates to if it's a slave. In the for loop we are - * interested to check if slots are taken away from curmaster. */ - if (server.cluster->myself->flags & REDIS_NODE_MASTER) - curmaster = server.cluster->myself; - else - curmaster = server.cluster->myself->slaveof; - - for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) { - if (bitmapTestBit(hdr->myslots,j)) { - /* We rebind the slot to the new node claiming it if: - * 1) The slot was unassigned. - * 2) The new node claims it with a greater configEpoch. */ - if (server.cluster->slots[j] == sender) continue; - if (server.cluster->slots[j] == NULL || - server.cluster->slots[j]->configEpoch < - senderConfigEpoch) - { - if (server.cluster->slots[j] == curmaster) - newmaster = sender; - clusterDelSlot(j); - clusterAddSlot(sender,j); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); - } - } - } - - /* If at least one slot was reassigned from a node to another node - * with a greater configEpoch, it is possible that: - * 1) We are a master is left without slots. This means that we were - * failed over and we should turn into a replica of the new - * master. - * 2) We are a slave and our master is left without slots. We need - * to replicate to the new slots owner. */ - if (newmaster && curmaster->numslots == 0) { - redisLog(REDIS_WARNING,"Configuration change detected. 
Reconfiguring myself as a replica of %.40s", sender->name); - clusterSetMaster(sender); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); - } + clusterUpdateSlotsConfigWith(sender,senderConfigEpoch,hdr->myslots); } /* 2) We also check for the reverse condition, that is, the sender claims @@ -1248,6 +1271,21 @@ int clusterProcessPacket(clusterLink *link) { * we check ASAP. */ clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); } + } else if (type == CLUSTERMSG_TYPE_UPDATE) { + clusterNode *n; /* The node the update is about. */ + uint64_t reportedConfigEpoch = ntohu64(hdr->data.update.nodecfg.configEpoch); + + if (!sender) return 1; /* We don't know the sender. */ + n = clusterLookupNode(hdr->data.update.nodecfg.nodename); + if (!n) return 1; /* We don't know the reported node. */ + if (n->configEpoch >= reportedConfigEpoch) return 1; /* Nothing new. */ + + /* If in our current config the node is a slave, set it as a master. */ + if (n->flags & REDIS_NODE_SLAVE) clusterSetNodeAsMaster(n); + + /* Check the bitmap of served slots and udpate our config accordingly. */ + clusterUpdateSlotsConfigWith(n,reportedConfigEpoch, + hdr->data.update.nodecfg.slots); } else { redisLog(REDIS_WARNING,"Received unknown packet type: %d", type); } From a146482c83fbf41f6e8b0db56d1e09e38270e538 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 8 Nov 2013 17:19:19 +0100 Subject: [PATCH 0279/2500] Cluster: replace hardcoded 4096 for bus msg len with sizeof(). --- src/cluster.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index c786acf67..8806ac42f 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1327,7 +1327,7 @@ void clusterWriteHandler(aeEventLoop *el, int fd, void *privdata, int mask) { * full length of the packet. When a whole packet is in memory this function * will call the function to process the packet. And so forth. 
*/ void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask) { - char buf[4096]; + char buf[sizeof(clusterMsg)]; ssize_t nread; clusterMsg *hdr; clusterLink *link = (clusterLink*) privdata; @@ -1466,7 +1466,7 @@ void clusterBuildMessageHdr(clusterMsg *hdr, int type) { /* Send a PING or PONG packet to the specified node, making sure to add enough * gossip informations. */ void clusterSendPing(clusterLink *link, int type) { - unsigned char buf[4096]; + unsigned char buf[sizeof(clusterMsg)]; clusterMsg *hdr = (clusterMsg*) buf; int gossipcount = 0, totlen; /* freshnodes is the number of nodes we can still use to populate the @@ -1553,7 +1553,7 @@ void clusterBroadcastPong(void) { * * If link is NULL, then the message is broadcasted to the whole cluster. */ void clusterSendPublish(clusterLink *link, robj *channel, robj *message) { - unsigned char buf[4096], *payload; + unsigned char buf[sizeof(clusterMsg)], *payload; clusterMsg *hdr = (clusterMsg*) buf; uint32_t totlen; uint32_t channel_len, message_len; @@ -1599,7 +1599,7 @@ void clusterSendPublish(clusterLink *link, robj *channel, robj *message) { * we switch the node state to REDIS_NODE_FAIL and ask all the other * nodes to do the same ASAP. */ void clusterSendFail(char *nodename) { - unsigned char buf[4096]; + unsigned char buf[sizeof(clusterMsg)]; clusterMsg *hdr = (clusterMsg*) buf; clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAIL); @@ -1611,7 +1611,7 @@ void clusterSendFail(char *nodename) { * slots configuration. The node name, slots bitmap, and configEpoch info * are included. 
*/ void clusterSendUpdate(clusterLink *link, clusterNode *node) { - unsigned char buf[4096]; + unsigned char buf[sizeof(clusterMsg)]; clusterMsg *hdr = (clusterMsg*) buf; clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_UPDATE); @@ -1643,7 +1643,7 @@ void clusterPropagatePublish(robj *channel, robj *message) { * Note that we send the failover request to everybody, master and slave nodes, * but only the masters are supposed to reply to our query. */ void clusterRequestFailoverAuth(void) { - unsigned char buf[4096]; + unsigned char buf[sizeof(clusterMsg)]; clusterMsg *hdr = (clusterMsg*) buf; uint32_t totlen; @@ -1655,7 +1655,7 @@ void clusterRequestFailoverAuth(void) { /* Send a FAILOVER_AUTH_ACK message to the specified node. */ void clusterSendFailoverAuth(clusterNode *node) { - unsigned char buf[4096]; + unsigned char buf[sizeof(clusterMsg)]; clusterMsg *hdr = (clusterMsg*) buf; uint32_t totlen; From a67935e5e3037d9ed7030e4415027e9f87b65c60 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 8 Nov 2013 17:25:49 +0100 Subject: [PATCH 0280/2500] Cluster: send a single UPDATE packet for now. --- src/cluster.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/cluster.c b/src/cluster.c index 8806ac42f..fd63b7087 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1210,6 +1210,11 @@ int clusterProcessPacket(clusterLink *link) { "an UPDATE message about %.40s\n", sender->name, server.cluster->slots[j]->name); clusterSendUpdate(sender->link,server.cluster->slots[j]); + + /* TODO: instead of exiting the loop send every other + * UPDATE packet for other nodes that are the new owner + * of sender's slots. */ + break; } } } From e159239f9c2dedf6e78340aeea8d1d0ac0982df1 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 8 Nov 2013 17:27:59 +0100 Subject: [PATCH 0281/2500] Cluster: removed not needed newline at end of redisLog() msg. 
--- src/cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index fd63b7087..2f26100e7 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1207,7 +1207,7 @@ int clusterProcessPacket(clusterLink *link) { { redisLog(REDIS_WARNING, "Node %.40s has old slots configuration, sending " - "an UPDATE message about %.40s\n", + "an UPDATE message about %.40s", sender->name, server.cluster->slots[j]->name); clusterSendUpdate(sender->link,server.cluster->slots[j]); From c46f655c9001fbf30bc3e646bc43211fefd6478c Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 11 Nov 2013 09:25:36 +0100 Subject: [PATCH 0282/2500] Log to what master a slave is going to connect to. --- src/replication.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/replication.c b/src/replication.c index 8102fc2db..7357ae5ec 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1527,7 +1527,8 @@ void replicationCron(void) { /* Check if we should connect to a MASTER */ if (server.repl_state == REDIS_REPL_CONNECT) { - redisLog(REDIS_NOTICE,"Connecting to MASTER..."); + redisLog(REDIS_NOTICE,"Connecting to MASTER %s:%d", + server.masterhost, server.masterport); if (connectWithMaster() == REDIS_OK) { redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync started"); } From 49170f6c12b2116dc5d251041aad8d10ca4ad83b Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 11 Nov 2013 11:05:58 +0100 Subject: [PATCH 0283/2500] Sentinel: epoch introduced. Sentinel state now includes the idea of current epoch and config epoch. In the Hello message, that is now published both on masters and slaves, a Sentinel no longer just advertises itself but also broadcasts its current view of the configuration: the master name / ip / port and its current epoch. Sentinels receiving such information switch to the new master if the configuration epoch received is newer and the ip / port of the master are indeed different compared to the previous ones.
--- src/sentinel.c | 87 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 25 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 09052e81e..76ae9d313 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -133,6 +133,7 @@ typedef struct sentinelRedisInstance { int flags; /* See SRI_... defines */ char *name; /* Master name from the point of view of this sentinel. */ char *runid; /* run ID of this instance. */ + uint64_t config_epoch; /* Configuration epoch. */ sentinelAddr *addr; /* Master host. */ redisAsyncContext *cc; /* Hiredis context for commands. */ redisAsyncContext *pc; /* Hiredis context for Pub / Sub. */ @@ -191,13 +192,14 @@ typedef struct sentinelRedisInstance { /* Main state. */ struct sentinelState { + uint64_t current_epoch; /* Current epoch. */ dict *masters; /* Dictionary of master sentinelRedisInstances. Key is the instance name, value is the sentinelRedisInstance structure pointer. */ int tilt; /* Are we in TILT mode? */ int running_scripts; /* Number of scripts in execution right now. */ mstime_t tilt_start_time; /* When TITL started. */ - mstime_t previous_time; /* Time last time we ran the time handler. */ + mstime_t previous_time; /* Last time we ran the time handler. */ list *scripts_queue; /* Queue of user scripts to execute. */ } sentinel; @@ -402,6 +404,7 @@ void initSentinel(void) { } /* Initialize various data structures. 
*/ + sentinel.current_epoch = 0; sentinel.masters = dictCreate(&instancesDictType,NULL); sentinel.tilt = 0; sentinel.tilt_start_time = 0; @@ -861,6 +864,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * ri->flags = flags | SRI_DISCONNECTED; ri->name = sdsname; ri->runid = NULL; + ri->config_epoch = 0; ri->addr = addr; ri->cc = NULL; ri->pc = NULL; @@ -1745,24 +1749,28 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd if (strstr(r->element[2]->str,server.runid) != NULL) return; { - int numtokens, port, removed, canfailover; - /* Separator changed from ":" to "," in recent versions in order to - * play well with IPv6 addresses. For now we make sure to parse both - * correctly detecting if there is "," inside the string. */ - char *sep = strchr(r->element[2]->str,',') ? "," : ":"; + /* Format is composed of 9 tokens: + * 0=ip,1=port,2=runid,3=can_failover,4=current_epoch, + * 5=master_name,6=master_ip,7=master_port,8=master_config_epoch. */ + int numtokens, port, removed, canfailover, master_port; + uint64_t current_epoch, master_config_epoch; char **token = sdssplitlen(r->element[2]->str, r->element[2]->len, - sep,1,&numtokens); - sentinelRedisInstance *sentinel; + ",",1,&numtokens); + sentinelRedisInstance *si; - if (numtokens == 4) { + if (numtokens == 9) { /* First, try to see if we already have this sentinel. */ port = atoi(token[1]); + master_port = atoi(token[7]); canfailover = atoi(token[3]); - sentinel = getSentinelRedisInstanceByAddrAndRunID( + si = getSentinelRedisInstanceByAddrAndRunID( ri->sentinels,token[0],port,token[2]); + current_epoch = strtoull(token[4],NULL,10); + master_config_epoch = strtoull(token[8],NULL,10); + sentinelRedisInstance *master; - if (!sentinel) { + if (!si) { /* If not, remove all the sentinels that have the same runid * OR the same ip/port, because it's either a restart or a * network topology change. 
*/ @@ -1775,24 +1783,45 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd } /* Add the new sentinel. */ - sentinel = createSentinelRedisInstance(NULL,SRI_SENTINEL, + si = createSentinelRedisInstance(NULL,SRI_SENTINEL, token[0],port,ri->quorum,ri); - if (sentinel) { - sentinelEvent(REDIS_NOTICE,"+sentinel",sentinel,"%@"); + if (si) { + sentinelEvent(REDIS_NOTICE,"+sentinel",si,"%@"); /* The runid is NULL after a new instance creation and * for Sentinels we don't have a later chance to fill it, * so do it now. */ - sentinel->runid = sdsnew(token[2]); + si->runid = sdsnew(token[2]); + } + } + + /* Update local current_epoch if received current_epoch is greater. */ + if (current_epoch > sentinel.current_epoch) + sentinel.current_epoch = current_epoch; + + /* Update master info if received configuration is newer. */ + if ((master = sentinelGetMasterByName(token[5])) != NULL) { + if (master->config_epoch < master_config_epoch) { + master->config_epoch = master_config_epoch; + if (master_port != master->addr->port || + !strcmp(master->addr->ip, token[6])) + { + sentinelEvent(REDIS_WARNING,"+switch-master", + master,"%s %s %d %s %d", + master->name, master->addr->ip, master->addr->port, + token[6], master_port); + sentinelResetMasterAndChangeAddress(ri, + token[6], master_port); + } } } /* Update the state of the Sentinel. 
*/ - if (sentinel) { - sentinel->last_hello_time = mstime(); + if (si) { + si->last_hello_time = mstime(); if (canfailover) - sentinel->flags |= SRI_CAN_FAILOVER; + si->flags |= SRI_CAN_FAILOVER; else - sentinel->flags &= ~SRI_CAN_FAILOVER; + si->flags &= ~SRI_CAN_FAILOVER; } } sdsfreesplitres(token,numtokens); @@ -1842,20 +1871,28 @@ void sentinelPingInstance(sentinelRedisInstance *ri) { sentinelPingReplyCallback, NULL, "PING"); if (retval != REDIS_OK) return; ri->pending_commands++; - } else if ((ri->flags & SRI_MASTER) && + } else if ((ri->flags & SRI_SENTINEL) == 0 && (now - ri->last_pub_time) > SENTINEL_PUBLISH_PERIOD) { - /* PUBLISH hello messages only to masters. */ + /* PUBLISH hello messages to masters and slaves. */ char ip[REDIS_IP_STR_LEN]; if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) != -1) { - char myaddr[REDIS_IP_STR_LEN+128]; + char payload[REDIS_IP_STR_LEN+1024]; + sentinelRedisInstance *master = (ri->flags & SRI_MASTER) ? + NULL : ri->master; - snprintf(myaddr,sizeof(myaddr),"%s,%d,%s,%d", + snprintf(payload,sizeof(payload), + "%s,%d,%s,%d,%llu," /* Info about this sentinel. */ + "%s,%s,%d,%lld", /* Info about current master. */ ip, server.port, server.runid, - (ri->flags & SRI_CAN_FAILOVER) != 0); + (ri->flags & SRI_CAN_FAILOVER) != 0, + (unsigned long long) sentinel.current_epoch, + /* --- */ + master->name,master->addr->ip,master->addr->port, + master->config_epoch); retval = redisAsyncCommand(ri->cc, sentinelPublishReplyCallback, NULL, "PUBLISH %s %s", - SENTINEL_HELLO_CHANNEL,myaddr); + SENTINEL_HELLO_CHANNEL,payload); if (retval != REDIS_OK) return; ri->pending_commands++; } From 3d7000f2812f4aa48cf56b97deb72ae15afd6275 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 11 Nov 2013 12:05:16 +0100 Subject: [PATCH 0284/2500] Sentinel: remove code not useful in the new design. 
--- src/sentinel.c | 171 ++++++++++--------------------------------------- 1 file changed, 33 insertions(+), 138 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 76ae9d313..9c73a279f 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -72,8 +72,6 @@ typedef struct sentinelAddr { #define SRI_RECONF_DONE (1<<13) /* Slave synchronized with new master. */ #define SRI_FORCE_FAILOVER (1<<14) /* Force failover with master up. */ #define SRI_SCRIPT_KILL_SENT (1<<15) /* SCRIPT KILL already sent on -BUSY */ -#define SRI_DEMOTE (1<<16) /* If the instance claims to be a master, demote - it into a slave sending SLAVEOF. */ #define SENTINEL_INFO_PERIOD 10000 #define SENTINEL_PING_PERIOD 1000 @@ -90,7 +88,6 @@ typedef struct sentinelAddr { #define SENTINEL_MIN_LINK_RECONNECT_PERIOD 15000 #define SENTINEL_DEFAULT_FAILOVER_TIMEOUT (60*15*1000) #define SENTINEL_MAX_PENDING_COMMANDS 100 -#define SENTINEL_EXTENDED_SDOWN_MULTIPLIER 10 /* How many milliseconds is an information valid? This applies for instance * to the reply to SENTINEL IS-MASTER-DOWN-BY-ADDR replies. */ @@ -1502,108 +1499,44 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { /* Handle slave -> master role switch. */ if ((ri->flags & SRI_SLAVE) && role == SRI_MASTER) { - if (!sentinel.tilt && ri->flags & SRI_DEMOTE) { - /* If this sentinel was partitioned from the slave's master, - * or tilted recently, wait some time before to act, - * so that DOWN and roles INFO will be refreshed. */ - mstime_t wait_time = SENTINEL_INFO_PERIOD*2 + - ri->master->down_after_period*2; + /* If this is a promoted slave we can change state to the + * failover state machine. 
*/ + if (!sentinel.tilt && + (ri->master->flags & SRI_FAILOVER_IN_PROGRESS) && + (ri->master->flags & SRI_I_AM_THE_LEADER) && + (ri->master->failover_state == + SENTINEL_FAILOVER_STATE_WAIT_PROMOTION)) + { + ri->master->failover_state = SENTINEL_FAILOVER_STATE_RECONF_SLAVES; + ri->master->failover_state_change_time = mstime(); + sentinelEvent(REDIS_WARNING,"+promoted-slave",ri,"%@"); + sentinelEvent(REDIS_WARNING,"+failover-state-reconf-slaves", + ri->master,"%@"); + sentinelCallClientReconfScript(ri->master,SENTINEL_LEADER, + "start",ri->master->addr,ri->addr); + } else if (!sentinel.tilt) { + /* A slave turned into a master. We want to force our view and + * reconfigure as slave, but make sure to wait some time before + * doing this in order to make sure to receive an updated + * configuratio via Pub/Sub if any. */ + mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4; if (!sentinelRedisInstanceNoDownFor(ri->master,wait_time) || (mstime()-sentinel.tilt_start_time) < wait_time) return; - /* Old master returned back? Turn it into a slave ASAP if - * we can reach what we believe is the new master now, and - * have a recent role information for it. - * - * Note: we'll clear the DEMOTE flag only when we have the - * acknowledge that it's a slave again. */ + /* Make sure the master is sane before reconfiguring this instance + * into a slave. */ if (ri->master->flags & SRI_MASTER && (ri->master->flags & (SRI_S_DOWN|SRI_O_DOWN)) == 0 && (mstime() - ri->master->info_refresh) < SENTINEL_INFO_PERIOD*2) { - int retval; - retval = sentinelSendSlaveOf(ri, + int retval = sentinelSendSlaveOf(ri, ri->master->addr->ip, ri->master->addr->port); if (retval == REDIS_OK) sentinelEvent(REDIS_NOTICE,"+demote-old-slave",ri,"%@"); - } else { - /* Otherwise if there are not the conditions to demote, we - * no longer trust the DEMOTE flag and remove it. 
*/ - ri->flags &= ~SRI_DEMOTE; - sentinelEvent(REDIS_NOTICE,"-demote-flag-cleared",ri,"%@"); } - } else if (!(ri->master->flags & SRI_FAILOVER_IN_PROGRESS) && - (runid_changed || first_runid)) - { - /* If a slave turned into master but: - * - * 1) Failover not in progress. - * 2) RunID has changed or its the first time we see an INFO output. - * - * We assume this is a reboot with a wrong configuration. - * Log the event and remove the slave. Note that this is processed - * in tilt mode as well, otherwise we lose the information that the - * runid changed (reboot?) and when the tilt mode ends a fake - * failover will be detected. */ - int retval; - - sentinelEvent(REDIS_WARNING,"-slave-restart-as-master",ri,"%@ #removing it from the attached slaves"); - retval = dictDelete(ri->master->slaves,ri->name); - redisAssert(retval == REDIS_OK); - return; - } else if (!sentinel.tilt && ri->flags & SRI_PROMOTED) { - /* If this is a promoted slave we can change state to the - * failover state machine. */ - if ((ri->master->flags & SRI_FAILOVER_IN_PROGRESS) && - (ri->master->flags & SRI_I_AM_THE_LEADER) && - (ri->master->failover_state == - SENTINEL_FAILOVER_STATE_WAIT_PROMOTION)) - { - ri->master->failover_state = SENTINEL_FAILOVER_STATE_RECONF_SLAVES; - ri->master->failover_state_change_time = mstime(); - sentinelEvent(REDIS_WARNING,"+promoted-slave",ri,"%@"); - sentinelEvent(REDIS_WARNING,"+failover-state-reconf-slaves", - ri->master,"%@"); - sentinelCallClientReconfScript(ri->master,SENTINEL_LEADER, - "start",ri->master->addr,ri->addr); - } - } else if (!sentinel.tilt && ( - !(ri->master->flags & SRI_FAILOVER_IN_PROGRESS) || - ((ri->master->flags & SRI_FAILOVER_IN_PROGRESS) && - (ri->master->flags & SRI_I_AM_THE_LEADER) && - ri->master->failover_state == - SENTINEL_FAILOVER_STATE_WAIT_START))) - { - /* No failover in progress? Then it is the start of a failover - * and we are an observer. 
- * - * We also do that if we are a leader doing a failover, in wait - * start, but well, somebody else started before us. */ - - if (ri->master->flags & SRI_FAILOVER_IN_PROGRESS) { - sentinelEvent(REDIS_WARNING,"-failover-abort-race", - ri->master, "%@"); - sentinelAbortFailover(ri->master); - } - - ri->master->flags |= SRI_FAILOVER_IN_PROGRESS; - sentinelEvent(REDIS_WARNING,"+failover-detected",ri->master,"%@"); - ri->master->failover_state = SENTINEL_FAILOVER_STATE_DETECT_END; - ri->master->failover_state_change_time = mstime(); - ri->master->promoted_slave = ri; - ri->flags |= SRI_PROMOTED; - ri->flags &= ~SRI_DEMOTE; - sentinelCallClientReconfScript(ri->master,SENTINEL_OBSERVER, - "start", ri->master->addr,ri->addr); - /* We are an observer, so we can only assume that the leader - * is reconfiguring the slave instances. For this reason we - * set all the instances as RECONF_SENT waiting for progresses - * on this side. */ - sentinelAddFlagsToDictOfRedisInstances(ri->master->slaves, - SRI_RECONF_SENT); } } @@ -1641,13 +1574,6 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { ri->failover_state_change_time = mstime(); } } - - /* Detect if the old master was demoted as slave and generate the - * +slave event. 
*/ - if (role == SRI_SLAVE && ri->flags & SRI_DEMOTE) { - sentinelEvent(REDIS_NOTICE,"+slave",ri,"%@"); - ri->flags &= ~SRI_DEMOTE; - } } void sentinelInfoReplyCallback(redisAsyncContext *c, void *reply, void *privdata) { @@ -1956,7 +1882,6 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) { if (ri->flags & SRI_RECONF_SENT) flags = sdscat(flags,"reconf_sent,"); if (ri->flags & SRI_RECONF_INPROG) flags = sdscat(flags,"reconf_inprog,"); if (ri->flags & SRI_RECONF_DONE) flags = sdscat(flags,"reconf_done,"); - if (ri->flags & SRI_DEMOTE) flags = sdscat(flags,"demote,"); if (sdslen(flags) != 0) sdsrange(flags,0,-2); /* remove last "," */ addReplyBulkCString(c,flags); @@ -2748,7 +2673,6 @@ sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) { mstime_t info_validity_time = mstime()-SENTINEL_INFO_VALIDITY_TIME; if (slave->flags & (SRI_S_DOWN|SRI_O_DOWN|SRI_DISCONNECTED)) continue; - if (slave->flags & SRI_DEMOTE) continue; /* Old master not yet ready. */ if (slave->last_avail_time < info_validity_time) continue; if (slave->slave_priority == 0) continue; @@ -2992,16 +2916,14 @@ void sentinelFailoverSwitchToPromotedSlave(sentinelRedisInstance *master) { old_master_ip = sdsdup(master->addr->ip); old_master_port = master->addr->port; sentinelResetMasterAndChangeAddress(master,ref->addr->ip,ref->addr->port); - /* If this is a real switch, that is, we have master->promoted_slave not - * NULL, then we want to add the old master as a slave of the new master, - * but flagging it with SRI_DEMOTE so that we know we'll need to send - * SLAVEOF once the old master is reachable again. */ + /* If this is a real switch and not just a user requested reset, we want + * to add all the known instances as slaves, and also all the sentinels + * back to this master. 
*/ if (master != ref) { - /* Add the new slave, but don't generate a Sentinel event as it will - * happen later when finally the instance will claim to be a slave - * in the INFO output. */ - createSentinelRedisInstance(NULL,SRI_SLAVE|SRI_DEMOTE, + /* TODO: + createSentinelRedisInstance(NULL,SRI_SLAVE old_master_ip, old_master_port, master->quorum, master); + */ } sdsfree(old_master_ip); } @@ -3034,15 +2956,9 @@ void sentinelFailoverStateMachine(sentinelRedisInstance *ri) { } /* Abort a failover in progress with the following steps: - * 1) If this instance is the leaer send a SLAVEOF command to all the already - * reconfigured slaves if any to configure them to replicate with the - * original master. - * 2) For both leaders and observers: clear the failover flags and state in - * the master instance. - * 3) If there is already a promoted slave and we are the leader, and this - * slave is not DISCONNECTED, try to reconfigure it to replicate - * back to the master as well, sending a best effort SLAVEOF command. - */ + * 1) Set the master back to the original one, increment the config epoch. + * 2) Reconfig slaves to replicate to the old master. + * 3) Reconfig the promoted slave as a slave as well. */ void sentinelAbortFailover(sentinelRedisInstance *ri) { dictIterator *di; dictEntry *de; @@ -3085,26 +3001,6 @@ void sentinelAbortFailover(sentinelRedisInstance *ri) { } } -/* The following is called only for master instances and will abort the - * failover process if: - * - * 1) The failover is in progress. - * 2) We already promoted a slave. - * 3) The promoted slave is in extended SDOWN condition. - */ -void sentinelAbortFailoverIfNeeded(sentinelRedisInstance *ri) { - /* Failover is in progress? Do we have a promoted slave? */ - if (!(ri->flags & SRI_FAILOVER_IN_PROGRESS) || !ri->promoted_slave) return; - - /* Is the promoted slave into an extended SDOWN state? 
*/ - if (!(ri->promoted_slave->flags & SRI_S_DOWN) || - (mstime() - ri->promoted_slave->s_down_since_time) < - (ri->down_after_period * SENTINEL_EXTENDED_SDOWN_MULTIPLIER)) return; - - sentinelEvent(REDIS_WARNING,"-failover-abort-x-sdown",ri->promoted_slave,"%@"); - sentinelAbortFailover(ri); -} - /* ======================== SENTINEL timer handler ========================== * This is the "main" our Sentinel, being sentinel completely non blocking * in design. The function is called every second. @@ -3150,7 +3046,6 @@ void sentinelHandleRedisInstance(sentinelRedisInstance *ri) { sentinelCheckObjectivelyDown(ri); sentinelStartFailoverIfNeeded(ri); sentinelFailoverStateMachine(ri); - sentinelAbortFailoverIfNeeded(ri); } } From e46942e4b047e8af4da6e6ad7a0a55eb725c5d8b Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 11 Nov 2013 17:10:28 +0100 Subject: [PATCH 0285/2500] Sentinel: handle Hello messages received via slaves correctly. Even when messages are received via the slave, we should perform operations (like adding a new Sentinel) in the context of the master. --- src/sentinel.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 9c73a279f..af5363a78 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1651,12 +1651,14 @@ void sentinelPublishReplyCallback(redisAsyncContext *c, void *reply, void *privd /* This is our Pub/Sub callback for the Hello channel. It's useful in order * to discover other sentinels attached at the same master. */ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privdata) { - sentinelRedisInstance *ri = c->data; + sentinelRedisInstance *ri = c->data, *master; redisReply *r; if (!reply || !ri) return; r = reply; + master = (ri->flags & SRI_MASTER) ? ri : ri->master; + /* Update the last activity in the pubsub channel. 
Note that since we * receive our messages as well this timestamp can be used to detect * if the link is probably disconnected even if it seems otherwise. */ @@ -1691,26 +1693,26 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd master_port = atoi(token[7]); canfailover = atoi(token[3]); si = getSentinelRedisInstanceByAddrAndRunID( - ri->sentinels,token[0],port,token[2]); + master->sentinels,token[0],port,token[2]); current_epoch = strtoull(token[4],NULL,10); master_config_epoch = strtoull(token[8],NULL,10); - sentinelRedisInstance *master; + sentinelRedisInstance *msgmaster; if (!si) { /* If not, remove all the sentinels that have the same runid * OR the same ip/port, because it's either a restart or a * network topology change. */ - removed = removeMatchingSentinelsFromMaster(ri,token[0],port, + removed = removeMatchingSentinelsFromMaster(master,token[0],port, token[2]); if (removed) { - sentinelEvent(REDIS_NOTICE,"-dup-sentinel",ri, + sentinelEvent(REDIS_NOTICE,"-dup-sentinel",master, "%@ #duplicate of %s:%d or %s", token[0],port,token[2]); } /* Add the new sentinel. */ si = createSentinelRedisInstance(NULL,SRI_SENTINEL, - token[0],port,ri->quorum,ri); + token[0],port,master->quorum,master); if (si) { sentinelEvent(REDIS_NOTICE,"+sentinel",si,"%@"); /* The runid is NULL after a new instance creation and @@ -1725,17 +1727,18 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd sentinel.current_epoch = current_epoch; /* Update master info if received configuration is newer. 
*/ - if ((master = sentinelGetMasterByName(token[5])) != NULL) { - if (master->config_epoch < master_config_epoch) { - master->config_epoch = master_config_epoch; - if (master_port != master->addr->port || - !strcmp(master->addr->ip, token[6])) + if ((msgmaster = sentinelGetMasterByName(token[5])) != NULL) { + if (msgmaster->config_epoch < master_config_epoch) { + msgmaster->config_epoch = master_config_epoch; + if (master_port != msgmaster->addr->port || + !strcmp(msgmaster->addr->ip, token[6])) { sentinelEvent(REDIS_WARNING,"+switch-master", - master,"%s %s %d %s %d", - master->name, master->addr->ip, master->addr->port, + msgmaster,"%s %s %d %s %d", + msgmaster->name, + msgmaster->addr->ip, msgmaster->addr->port, token[6], master_port); - sentinelResetMasterAndChangeAddress(ri, + sentinelResetMasterAndChangeAddress(msgmaster, token[6], master_port); } } From 6273e87b4a281bc9e3d83ed066e281995b1761b9 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 11 Nov 2013 18:30:11 +0100 Subject: [PATCH 0286/2500] Sentinel: leadership handling changes WIP. Changes to leadership handling. Now the leader gets selected by every Sentinel, for a specified epoch, when the SENTINEL is-master-down-by-addr is sent. This command now includes the runid and the currentEpoch of the instance seeking for a vote. The Sentinel only votes a single time in a given epoch. Still a work in progress, does not even compile at this stage. --- src/sentinel.c | 139 ++++++++++++++++++++++++++++--------------------- 1 file changed, 81 insertions(+), 58 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index af5363a78..7bb924f04 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -173,9 +173,8 @@ typedef struct sentinelRedisInstance { char *leader; /* If this is a master instance, this is the runid of the Sentinel that should perform the failover. If this is a Sentinel, this is the runid of the Sentinel - that this other Sentinel is voting as leader. 
- This field is valid only if SRI_MASTER_DOWN is - set on the Sentinel instance. */ + that this Sentinel voted as leader. */ + uint64_t leader_epoch; /* Epoch of the 'leader' field. */ int failover_state; /* See SENTINEL_FAILOVER_STATE_* defines. */ mstime_t failover_state_change_time; mstime_t failover_start_time; /* When to start to failover if leader. */ @@ -327,6 +326,7 @@ void sentinelScheduleScriptExecution(char *path, ...); void sentinelStartFailover(sentinelRedisInstance *master, int state); void sentinelDiscardReplyCallback(redisAsyncContext *c, void *reply, void *privdata); int sentinelSendSlaveOf(sentinelRedisInstance *ri, char *host, int port); +char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch); /* ========================= Dictionary types =============================== */ @@ -894,6 +894,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * /* Failover state. */ ri->leader = NULL; + ri->leader_epoch = 0; ri->failover_state = SENTINEL_FAILOVER_STATE_NONE; ri->failover_state_change_time = 0; ri->failover_start_time = 0; @@ -1031,7 +1032,7 @@ sentinelRedisInstance *getSentinelRedisInstanceByAddrAndRunID(dict *instances, c return instance; } -/* Simple master lookup by name */ +/* Master lookup by name */ sentinelRedisInstance *sentinelGetMasterByName(char *name) { sentinelRedisInstance *ri; sds sdsname = sdsnew(name); @@ -1041,6 +1042,24 @@ sentinelRedisInstance *sentinelGetMasterByName(char *name) { return ri; } +/* Senitnel lookup by runid */ +sentinelRedisInstance *sentinelGetSentinelByRunid(sentinelRedisInstance *master, char *runid) { + sentinelRedisInstance *retval = NULL; + dictIterator *di; + dictEntry *de; + + di = dictGetIterator(master->sentinels); + while((de = dictNext(di)) != NULL) { + sentinelRedisInstance *ri = dictGetVal(de); + if (!strcmp(ri->runid,runid)) { + retval = ri; + break; + } + } + dictReleaseIterator(di); + return retval; +} + 
/* Add the specified flags to all the instances in the specified dictionary. */ void sentinelAddFlagsToDictOfRedisInstances(dict *instances, int flags) { dictIterator *di; @@ -1979,11 +1998,13 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) { addReplyBulkLongLong(c,(ri->flags & SRI_CAN_FAILOVER) != 0); fields++; - if (ri->flags & SRI_MASTER_DOWN) { - addReplyBulkCString(c,"subjective-leader"); - addReplyBulkCString(c,ri->leader ? ri->leader : "?"); - fields++; - } + addReplyBulkCString(c,"voted-leader"); + addReplyBulkCString(c,ri->leader ? ri->leader : "?"); + fields++; + + addReplyBulkCString(c,"voted-leader-epoch"); + addReplyBulkLongLong(c,ri->leader_epoch); + fields++; } setDeferredMultiBulkLength(c,mbl,fields*2); @@ -2044,14 +2065,18 @@ void sentinelCommand(redisClient *c) { return; addReplyDictOfRedisInstances(c,ri->sentinels); } else if (!strcasecmp(c->argv[1]->ptr,"is-master-down-by-addr")) { - /* SENTINEL IS-MASTER-DOWN-BY-ADDR */ + /* SENTINEL IS-MASTER-DOWN-BY-ADDR */ sentinelRedisInstance *ri; + long long req_epoch; + uint64_t leader_epoch = 0; char *leader = NULL; long port; int isdown = 0; - if (c->argc != 4) goto numargserr; - if (getLongFromObjectOrReply(c,c->argv[3],&port,NULL) != REDIS_OK) + if (c->argc != 6) goto numargserr; + if (getLongFromObjectOrReply(c,c->argv[3],&port,NULL) != REDIS_OK || + getLongLongFromObjectOrReply(c,c->argv[4],&req_epoch,NULL) + != REDIS_OK) return; ri = getSentinelRedisInstanceByAddrAndRunID(sentinel.masters, c->argv[2]->ptr,port,NULL); @@ -2061,12 +2086,20 @@ void sentinelCommand(redisClient *c) { if (!sentinel.tilt && ri && (ri->flags & SRI_S_DOWN) && (ri->flags & SRI_MASTER)) isdown = 1; - if (ri) leader = sentinelGetSubjectiveLeader(ri); - /* Reply with a two-elements multi-bulk reply: down state, leader. 
*/ - addReplyMultiBulkLen(c,2); + /* Vote for the master (or fetch the previous vote) */ + if (ri && ri->flags & SRI_MASTER) { + leader = sentinelVoteLeader(ri,(uint64_t)req_epoch, + c->argv[5]->ptr, + &leader_epoch); + } + + /* Reply with a three-elements multi-bulk reply: + * down state, leader, vote epoch. */ + addReplyMultiBulkLen(c,3); addReply(c, isdown ? shared.cone : shared.czero); addReplyBulkCString(c, leader ? leader : "?"); + addReplyLongLong(c, (long long)leader_epoch); if (leader) sdsfree(leader); } else if (!strcasecmp(c->argv[1]->ptr,"reset")) { /* SENTINEL RESET */ @@ -2289,9 +2322,10 @@ void sentinelReceiveIsMasterDownReply(redisAsyncContext *c, void *reply, void *p /* Ignore every error or unexpected reply. * Note that if the command returns an error for any reason we'll * end clearing the SRI_MASTER_DOWN flag for timeout anyway. */ - if (r->type == REDIS_REPLY_ARRAY && r->elements == 2 && + if (r->type == REDIS_REPLY_ARRAY && r->elements == 3 && r->element[0]->type == REDIS_REPLY_INTEGER && - r->element[1]->type == REDIS_REPLY_STRING) + r->element[1]->type == REDIS_REPLY_STRING && + r->element[2]->type == REDIS_REPLY_INTEGER) { ri->last_master_down_reply_time = mstime(); if (r->element[0]->integer == 1) { @@ -2301,6 +2335,7 @@ void sentinelReceiveIsMasterDownReply(redisAsyncContext *c, void *reply, void *p } sdsfree(ri->leader); ri->leader = sdsnew(r->element[1]->str); + ri->leader_epoch = r->element[2]->integer; } } @@ -2341,8 +2376,8 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master) { ll2string(port,sizeof(port),master->addr->port); retval = redisAsyncCommand(ri->cc, sentinelReceiveIsMasterDownReply, NULL, - "SENTINEL is-master-down-by-addr %s %s", - master->addr->ip, port); + "SENTINEL is-master-down-by-addr %s %s %llu %s", + master->addr->ip, port, sentinel.current_epoch, server.runid); if (retval == REDIS_OK) ri->pending_commands++; } dictReleaseIterator(di); @@ -2369,41 +2404,25 @@ int compareRunID(const void 
*a, const void *b) { return strcasecmp(*aptrptr, *bptrptr); } -char *sentinelGetSubjectiveLeader(sentinelRedisInstance *master) { - dictIterator *di; - dictEntry *de; - char **instance = - zmalloc(sizeof(char*)*(dictSize(master->sentinels)+1)); - int instances = 0; - char *leader = NULL; +/* Vote for the sentinel with 'req_runid' or return the old vote if already + * voted for the specifed 'req_epoch' or one greater. + * + * If a vote is not available returns NULL, otherwise return the Sentinel + * runid and populate the leader_epoch with the epoch of the last vote. */ +char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch) { + sentinelRedisInstance *si = sentinelGetSentinelByRunid(master,req_runid); - if (master->flags & SRI_CAN_FAILOVER) { - /* Add myself if I'm a Sentinel that can failover this master. */ - instance[instances++] = server.runid; + if (req_epoch > sentinel.current_epoch) + sentinel.current_epoch = req_epoch; + + if (si && master->leader_epoch < req_epoch) { + sdsfree(master->leader); + master->leader = sdsnew(req_runid); + master->leader_epoch = sentinel.current_epoch; } - di = dictGetIterator(master->sentinels); - while((de = dictNext(di)) != NULL) { - sentinelRedisInstance *ri = dictGetVal(de); - mstime_t lag = mstime() - ri->last_avail_time; - - if (lag > SENTINEL_INFO_VALIDITY_TIME || - !(ri->flags & SRI_CAN_FAILOVER) || - (ri->flags & SRI_DISCONNECTED) || - ri->runid == NULL) - continue; - instance[instances++] = ri->runid; - } - dictReleaseIterator(di); - - /* If we have at least one instance passing our checks, order the array - * by runid. 
*/ - if (instances) { - qsort(instance,instances,sizeof(char*),compareRunID); - leader = sdsnew(instance[0]); - } - zfree(instance); - return leader; + *leader_epoch = master->leader_epoch; + return master->leader; } struct sentinelLeader { @@ -2411,9 +2430,9 @@ struct sentinelLeader { unsigned long votes; }; -/* Helper function for sentinelGetObjectiveLeader, increment the counter +/* Helper function for sentinelGetLeader, increment the counter * relative to the specified runid. */ -void sentinelObjectiveLeaderIncr(dict *counters, char *runid) { +void sentinelLeaderIncr(dict *counters, char *runid) { dictEntry *de = dictFind(counters,runid); uint64_t oldval; @@ -2427,9 +2446,13 @@ void sentinelObjectiveLeaderIncr(dict *counters, char *runid) { } } -/* Scan all the Sentinels attached to this master to check what is the - * most voted leader among Sentinels. */ -char *sentinelGetObjectiveLeader(sentinelRedisInstance *master) { +/* Scan all the Sentinels attached to this master to check if there + * is a leader for a given term, and return it if any. + * + * To be a leader for a given epoch, we should have the majorify of + * the Sentinels we know about that reported the same instance as + * leader for the same epoch. */ +char *sentinelGetLeader(sentinelRedisInstance *master, uint64_t term) { dict *counters; dictIterator *di; dictEntry *de; @@ -2443,7 +2466,7 @@ char *sentinelGetObjectiveLeader(sentinelRedisInstance *master) { /* Count my vote. */ myvote = sentinelGetSubjectiveLeader(master); if (myvote) { - sentinelObjectiveLeaderIncr(counters,myvote); + sentinelLeaderIncr(counters,myvote); voters++; } @@ -2458,7 +2481,7 @@ char *sentinelGetObjectiveLeader(sentinelRedisInstance *master) { * leader fails. In that case we consider all the voters. 
*/ if (!(master->flags & SRI_FAILOVER_IN_PROGRESS) && !(ri->flags & SRI_MASTER_DOWN)) continue; - sentinelObjectiveLeaderIncr(counters,ri->leader); + sentinelLeaderIncr(counters,ri->leader); voters++; } dictReleaseIterator(di); From 0e359cc68e0fb8dd62360b05c14a71b6a4a9caba Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 11:09:35 +0100 Subject: [PATCH 0287/2500] Sentinel: epoch introduced in leader vote. --- src/sentinel.c | 213 ++++++++++++++++--------------------------------- 1 file changed, 67 insertions(+), 146 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 7bb924f04..6530b2b60 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -65,13 +65,12 @@ typedef struct sentinelAddr { #define SRI_CAN_FAILOVER (1<<7) #define SRI_FAILOVER_IN_PROGRESS (1<<8) /* Failover is in progress for this master. */ -#define SRI_I_AM_THE_LEADER (1<<9) /* We are the leader for this master. */ -#define SRI_PROMOTED (1<<10) /* Slave selected for promotion. */ -#define SRI_RECONF_SENT (1<<11) /* SLAVEOF sent. */ -#define SRI_RECONF_INPROG (1<<12) /* Slave synchronization in progress. */ -#define SRI_RECONF_DONE (1<<13) /* Slave synchronized with new master. */ -#define SRI_FORCE_FAILOVER (1<<14) /* Force failover with master up. */ -#define SRI_SCRIPT_KILL_SENT (1<<15) /* SCRIPT KILL already sent on -BUSY */ +#define SRI_PROMOTED (1<<9) /* Slave selected for promotion. */ +#define SRI_RECONF_SENT (1<<10) /* SLAVEOF sent. */ +#define SRI_RECONF_INPROG (1<<11) /* Slave synchronization in progress. */ +#define SRI_RECONF_DONE (1<<12) /* Slave synchronized with new master. */ +#define SRI_FORCE_FAILOVER (1<<13) /* Force failover with master up. */ +#define SRI_SCRIPT_KILL_SENT (1<<14) /* SCRIPT KILL already sent on -BUSY */ #define SENTINEL_INFO_PERIOD 10000 #define SENTINEL_PING_PERIOD 1000 @@ -175,6 +174,7 @@ typedef struct sentinelRedisInstance { this is a Sentinel, this is the runid of the Sentinel that this Sentinel voted as leader. 
*/ uint64_t leader_epoch; /* Epoch of the 'leader' field. */ + uint64_t failover_epoch; /* Epoch of the currently started failover. */ int failover_state; /* See SENTINEL_FAILOVER_STATE_* defines. */ mstime_t failover_state_change_time; mstime_t failover_start_time; /* When to start to failover if leader. */ @@ -323,7 +323,7 @@ void sentinelAbortFailover(sentinelRedisInstance *ri); void sentinelEvent(int level, char *type, sentinelRedisInstance *ri, const char *fmt, ...); sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master); void sentinelScheduleScriptExecution(char *path, ...); -void sentinelStartFailover(sentinelRedisInstance *master, int state); +void sentinelStartFailover(sentinelRedisInstance *master); void sentinelDiscardReplyCallback(redisAsyncContext *c, void *reply, void *privdata); int sentinelSendSlaveOf(sentinelRedisInstance *ri, char *host, int port); char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch); @@ -895,6 +895,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * /* Failover state. */ ri->leader = NULL; ri->leader_epoch = 0; + ri->failover_epoch = 0; ri->failover_state = SENTINEL_FAILOVER_STATE_NONE; ri->failover_state_change_time = 0; ri->failover_start_time = 0; @@ -1522,7 +1523,6 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { * failover state machine. 
*/ if (!sentinel.tilt && (ri->master->flags & SRI_FAILOVER_IN_PROGRESS) && - (ri->master->flags & SRI_I_AM_THE_LEADER) && (ri->master->failover_state == SENTINEL_FAILOVER_STATE_WAIT_PROMOTION)) { @@ -1898,8 +1898,6 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) { if (ri->flags & SRI_MASTER_DOWN) flags = sdscat(flags,"master_down,"); if (ri->flags & SRI_FAILOVER_IN_PROGRESS) flags = sdscat(flags,"failover_in_progress,"); - if (ri->flags & SRI_I_AM_THE_LEADER) - flags = sdscat(flags,"i_am_the_leader,"); if (ri->flags & SRI_PROMOTED) flags = sdscat(flags,"promoted,"); if (ri->flags & SRI_RECONF_SENT) flags = sdscat(flags,"reconf_sent,"); if (ri->flags & SRI_RECONF_INPROG) flags = sdscat(flags,"reconf_inprog,"); @@ -2147,7 +2145,7 @@ void sentinelCommand(redisClient *c) { addReplySds(c,sdsnew("-NOGOODSLAVE No suitable slave to promote\r\n")); return; } - sentinelStartFailover(ri,SENTINEL_FAILOVER_STATE_WAIT_START); + sentinelStartFailover(ri); ri->flags |= SRI_FORCE_FAILOVER; addReply(c,shared.ok); } else if (!strcasecmp(c->argv[1]->ptr,"pending-scripts")) { @@ -2347,6 +2345,14 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master) { dictIterator *di; dictEntry *de; + /* Vote for myself if I see the master is already in ODOWN state. */ + if (master->flags & SRI_O_DOWN) { + uint64_t leader_epoch; + + sentinelVoteLeader(master,sentinel.current_epoch,server.runid, + &leader_epoch); + } + di = dictGetIterator(master->sentinels); while((de = dictNext(di)) != NULL) { sentinelRedisInstance *ri = dictGetVal(de); @@ -2366,8 +2372,7 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master) { * 1) We believe it is down, or there is a failover in progress. * 2) Sentinel is connected. * 3) We did not received the info within SENTINEL_ASK_PERIOD ms. 
*/ - if ((master->flags & (SRI_S_DOWN|SRI_FAILOVER_IN_PROGRESS)) == 0) - continue; + if ((master->flags & SRI_S_DOWN) == 0) continue; if (ri->flags & SRI_DISCONNECTED) continue; if (mstime() - ri->last_master_down_reply_time < SENTINEL_ASK_PERIOD) continue; @@ -2377,7 +2382,9 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master) { retval = redisAsyncCommand(ri->cc, sentinelReceiveIsMasterDownReply, NULL, "SENTINEL is-master-down-by-addr %s %s %llu %s", - master->addr->ip, port, sentinel.current_epoch, server.runid); + master->addr->ip, port, + sentinel.current_epoch, + server.runid); if (retval == REDIS_OK) ri->pending_commands++; } dictReleaseIterator(di); @@ -2415,7 +2422,9 @@ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char if (req_epoch > sentinel.current_epoch) sentinel.current_epoch = req_epoch; - if (si && master->leader_epoch < req_epoch) { + if (si && master->leader_epoch < req_epoch && + sentinel.current_epoch <= req_epoch) + { sdsfree(master->leader); master->leader = sdsnew(req_runid); master->leader_epoch = sentinel.current_epoch; @@ -2447,25 +2456,27 @@ void sentinelLeaderIncr(dict *counters, char *runid) { } /* Scan all the Sentinels attached to this master to check if there - * is a leader for a given term, and return it if any. + * is a leader for the specified epoch. * * To be a leader for a given epoch, we should have the majorify of - * the Sentinels we know about that reported the same instance as + * the Sentinels we know that reported the same instance as * leader for the same epoch. 
*/ -char *sentinelGetLeader(sentinelRedisInstance *master, uint64_t term) { +char *sentinelGetLeader(sentinelRedisInstance *master, uint64_t epoch) { dict *counters; dictIterator *di; dictEntry *de; unsigned int voters = 0, voters_quorum; char *myvote; char *winner = NULL; + uint64_t leader_epoch; redisAssert(master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS)); counters = dictCreate(&leaderVotesDictType,NULL); - /* Count my vote. */ - myvote = sentinelGetSubjectiveLeader(master); - if (myvote) { + /* Count my vote (and vote for myself if I still did not voted for + * the currnet epoch). */ + myvote = sentinelVoteLeader(master,epoch,server.runid,&leader_epoch); + if (myvote && leader_epoch == epoch) { sentinelLeaderIncr(counters,myvote); voters++; } @@ -2474,13 +2485,8 @@ char *sentinelGetLeader(sentinelRedisInstance *master, uint64_t term) { di = dictGetIterator(master->sentinels); while((de = dictNext(di)) != NULL) { sentinelRedisInstance *ri = dictGetVal(de); - if (ri->leader == NULL) continue; - /* If the failover is not already in progress we are only interested - * in Sentinels that believe the master is down. Otherwise the leader - * selection is useful for the "failover-takedown" when the original - * leader fails. In that case we consider all the voters. */ - if (!(master->flags & SRI_FAILOVER_IN_PROGRESS) && - !(ri->flags & SRI_MASTER_DOWN)) continue; + if (ri->leader == NULL || ri->leader_epoch != sentinel.current_epoch) + continue; sentinelLeaderIncr(counters,ri->leader); voters++; } @@ -2546,32 +2552,14 @@ int sentinelSendSlaveOf(sentinelRedisInstance *ri, char *host, int port) { return REDIS_OK; } -/* Setup the master state to start a failover as a leader. - * - * State can be either: - * - * SENTINEL_FAILOVER_STATE_WAIT_START: starts a failover from scratch. - * SENTINEL_FAILOVER_STATE_RECONF_SLAVES: takedown a failed failover. 
- */ -void sentinelStartFailover(sentinelRedisInstance *master, int state) { +/* Setup the master state to start a failover. */ +void sentinelStartFailover(sentinelRedisInstance *master) { redisAssert(master->flags & SRI_MASTER); - redisAssert(state == SENTINEL_FAILOVER_STATE_WAIT_START || - state == SENTINEL_FAILOVER_STATE_RECONF_SLAVES); - master->failover_state = state; - master->flags |= SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER; + master->failover_state = SENTINEL_FAILOVER_STATE_WAIT_START; + master->flags |= SRI_FAILOVER_IN_PROGRESS; + master->failover_epoch = ++sentinel.current_epoch; sentinelEvent(REDIS_WARNING,"+failover-triggered",master,"%@"); - - /* Pick a random delay if it's a fresh failover (WAIT_START), and not - * a recovery of a failover started by another sentinel. */ - if (master->failover_state == SENTINEL_FAILOVER_STATE_WAIT_START) { - master->failover_start_time = mstime() + - SENTINEL_FAILOVER_FIXED_DELAY + - (rand() % SENTINEL_FAILOVER_MAX_RANDOM_DELAY); - sentinelEvent(REDIS_WARNING,"+failover-state-wait-start",master, - "%@ #starting in %lld milliseconds", - master->failover_start_time-mstime()); - } master->failover_state_change_time = mstime(); } @@ -2580,66 +2568,18 @@ void sentinelStartFailover(sentinelRedisInstance *master, int state) { * * 1) Enough time has passed since O_DOWN. * 2) The master is marked as SRI_CAN_FAILOVER, so we can failover it. - * 3) We are the objectively leader for this master. - * - * If the conditions are met we flag the master as SRI_FAILOVER_IN_PROGRESS - * and SRI_I_AM_THE_LEADER. - */ + * + * We still don't know if we'll win the election so it is possible that we + * start the failover but that we'll not be able to act. 
*/ void sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) { - char *leader; - int isleader; - - /* We can't failover if the master is not in O_DOWN state or if - * there is not already a failover in progress (to perform the - * takedown if the leader died) or if this Sentinel is not allowed - * to start a failover. */ + /* We can't failover if the master is not in O_DOWN state. */ if (!(master->flags & SRI_CAN_FAILOVER) || - !(master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS))) return; + !(master->flags & SRI_O_DOWN)) return; - leader = sentinelGetObjectiveLeader(master); - isleader = leader && strcasecmp(leader,server.runid) == 0; - sdsfree(leader); + /* Failover already in progress? */ + if (master->flags & SRI_FAILOVER_IN_PROGRESS) return; - /* If I'm not the leader, I can't failover for sure. */ - if (!isleader) return; - - /* If the failover is already in progress there are two options... */ - if (master->flags & SRI_FAILOVER_IN_PROGRESS) { - if (master->flags & SRI_I_AM_THE_LEADER) { - /* 1) I'm flagged as leader so I already started the failover. - * Just return. */ - return; - } else { - mstime_t elapsed = mstime() - master->failover_state_change_time; - - /* 2) I'm the new leader, but I'm not flagged as leader in the - * master: I did not started the failover, but the original - * leader has no longer the leadership. - * - * In this case if the failover appears to be lagging - * for at least 25% of the configured failover timeout, - * I can assume I can take control. Otherwise - * it's better to return and wait more. */ - if (elapsed < (master->failover_timeout/4)) return; - sentinelEvent(REDIS_WARNING,"+failover-takedown",master,"%@"); - /* We have already an elected slave if we are in - * FAILOVER_IN_PROGRESS state, that is, the slave that we - * observed turning into a master. 
*/ - sentinelStartFailover(master,SENTINEL_FAILOVER_STATE_RECONF_SLAVES); - /* As an observer we flagged all the slaves as RECONF_SENT but - * now we are in charge of actually sending the reconfiguration - * command so let's clear this flag for all the instances. */ - sentinelDelFlagsToDictOfRedisInstances(master->slaves, - SRI_RECONF_SENT); - } - } else { - /* Brand new failover as SRI_FAILOVER_IN_PROGRESS was not set. - * - * Do we have a slave to promote? Otherwise don't start a failover - * at all. */ - if (sentinelSelectSlave(master) == NULL) return; - sentinelStartFailover(master,SENTINEL_FAILOVER_STATE_WAIT_START); - } + sentinelStartFailover(master); } /* Select a suitable slave to promote. The current algorithm only uses @@ -2723,29 +2663,22 @@ sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) { /* ---------------- Failover state machine implementation ------------------- */ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) { - /* If we in "wait start" but the master is no longer in ODOWN nor in - * SDOWN condition we abort the failover. This is important as it - * prevents a useless failover in a a notable case of netsplit, where - * the sentinels are split from the redis instances. In this case - * the failover will not start while there is the split because no - * good slave can be reached. However when the split is resolved, we - * can go to waitstart if the slave is back reachable a few milliseconds - * before the master is. In that case when the master is back online - * we cancel the failover. */ - if ((ri->flags & (SRI_S_DOWN|SRI_O_DOWN|SRI_FORCE_FAILOVER)) == 0) { - sentinelEvent(REDIS_WARNING,"-failover-abort-master-is-back", - ri,"%@"); - sentinelAbortFailover(ri); - return; - } + char *leader; + int isleader; + + /* Check if we are the leader for the failover epoch. 
*/ + leader = sentinelGetLeader(ri, ri->failover_epoch); + isleader = leader && strcasecmp(leader,server.runid) == 0; + sdsfree(leader); + + /* If I'm not the leader, I can't continue with the failover. */ + if (!isleader) return; /* Start the failover going to the next state if enough time has * elapsed. */ - if (mstime() >= ri->failover_start_time) { - ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE; - ri->failover_state_change_time = mstime(); - sentinelEvent(REDIS_WARNING,"+failover-state-select-slave",ri,"%@"); - } + ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE; + ri->failover_state_change_time = mstime(); + sentinelEvent(REDIS_WARNING,"+failover-state-select-slave",ri,"%@"); } void sentinelFailoverSelectSlave(sentinelRedisInstance *ri) { @@ -2829,8 +2762,7 @@ void sentinelFailoverDetectEnd(sentinelRedisInstance *master) { } if (not_reconfigured == 0) { - int role = (master->flags & SRI_I_AM_THE_LEADER) ? SENTINEL_LEADER : - SENTINEL_OBSERVER; + int role = SENTINEL_LEADER; sentinelEvent(REDIS_WARNING,"+failover-end",master,"%@"); master->failover_state = SENTINEL_FAILOVER_STATE_UPDATE_CONFIG; @@ -2842,7 +2774,7 @@ void sentinelFailoverDetectEnd(sentinelRedisInstance *master) { /* If I'm the leader it is a good idea to send a best effort SLAVEOF * command to all the slaves still not reconfigured to replicate with * the new master. 
*/ - if (timeout && (master->flags & SRI_I_AM_THE_LEADER)) { + if (timeout) { dictIterator *di; dictEntry *de; @@ -2999,8 +2931,7 @@ void sentinelAbortFailover(sentinelRedisInstance *ri) { di = dictGetIterator(ri->slaves); while((de = dictNext(di)) != NULL) { sentinelRedisInstance *slave = dictGetVal(de); - if ((ri->flags & SRI_I_AM_THE_LEADER) && - !(slave->flags & SRI_DISCONNECTED) && + if (!(slave->flags & SRI_DISCONNECTED) && (slave->flags & (SRI_PROMOTED|SRI_RECONF_SENT|SRI_RECONF_INPROG| SRI_RECONF_DONE))) { @@ -3014,9 +2945,8 @@ void sentinelAbortFailover(sentinelRedisInstance *ri) { } dictReleaseIterator(di); - sentinel_role = (ri->flags & SRI_I_AM_THE_LEADER) ? SENTINEL_LEADER : - SENTINEL_OBSERVER; - ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER|SRI_FORCE_FAILOVER); + sentinel_role = SENTINEL_LEADER; + ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_FORCE_FAILOVER); ri->failover_state = SENTINEL_FAILOVER_STATE_NONE; ri->failover_state_change_time = mstime(); if (ri->promoted_slave) { @@ -3039,16 +2969,6 @@ void sentinelHandleRedisInstance(sentinelRedisInstance *ri) { sentinelReconnectInstance(ri); sentinelPingInstance(ri); - /* Masters and slaves */ - if (ri->flags & (SRI_MASTER|SRI_SLAVE)) { - /* Nothing so far. */ - } - - /* Only masters */ - if (ri->flags & SRI_MASTER) { - sentinelAskMasterStateToOtherSentinels(ri); - } - /* ============== ACTING HALF ============= */ /* We don't proceed with the acting half if we are in TILT mode. * TILT happens when we find something odd with the time, like a @@ -3072,6 +2992,7 @@ void sentinelHandleRedisInstance(sentinelRedisInstance *ri) { sentinelCheckObjectivelyDown(ri); sentinelStartFailoverIfNeeded(ri); sentinelFailoverStateMachine(ri); + sentinelAskMasterStateToOtherSentinels(ri); } } From c53ced91ed6a761566fda9fd8a2392e87afeabc8 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 11:10:10 +0100 Subject: [PATCH 0288/2500] Sentinel: fix PUBLISH to masters and slaves. 
--- src/sentinel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 6530b2b60..be99147d7 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1827,13 +1827,13 @@ void sentinelPingInstance(sentinelRedisInstance *ri) { if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) != -1) { char payload[REDIS_IP_STR_LEN+1024]; sentinelRedisInstance *master = (ri->flags & SRI_MASTER) ? - NULL : ri->master; + ri : ri->master; snprintf(payload,sizeof(payload), "%s,%d,%s,%d,%llu," /* Info about this sentinel. */ "%s,%s,%d,%lld", /* Info about current master. */ ip, server.port, server.runid, - (ri->flags & SRI_CAN_FAILOVER) != 0, + (master->flags & SRI_CAN_FAILOVER) != 0, (unsigned long long) sentinel.current_epoch, /* --- */ master->name,master->addr->ip,master->addr->port, From 8246e06ee9ccc725c19fd4fcb341453ba42aeaa6 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 11:32:40 +0100 Subject: [PATCH 0289/2500] Sentinel: allow to vote for myself. --- src/sentinel.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index be99147d7..0a69107ec 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1043,24 +1043,6 @@ sentinelRedisInstance *sentinelGetMasterByName(char *name) { return ri; } -/* Senitnel lookup by runid */ -sentinelRedisInstance *sentinelGetSentinelByRunid(sentinelRedisInstance *master, char *runid) { - sentinelRedisInstance *retval = NULL; - dictIterator *di; - dictEntry *de; - - di = dictGetIterator(master->sentinels); - while((de = dictNext(di)) != NULL) { - sentinelRedisInstance *ri = dictGetVal(de); - if (!strcmp(ri->runid,runid)) { - retval = ri; - break; - } - } - dictReleaseIterator(di); - return retval; -} - /* Add the specified flags to all the instances in the specified dictionary. 
*/ void sentinelAddFlagsToDictOfRedisInstances(dict *instances, int flags) { dictIterator *di; @@ -2417,17 +2399,15 @@ int compareRunID(const void *a, const void *b) { * If a vote is not available returns NULL, otherwise return the Sentinel * runid and populate the leader_epoch with the epoch of the last vote. */ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch) { - sentinelRedisInstance *si = sentinelGetSentinelByRunid(master,req_runid); - if (req_epoch > sentinel.current_epoch) sentinel.current_epoch = req_epoch; - if (si && master->leader_epoch < req_epoch && - sentinel.current_epoch <= req_epoch) - { + if (master->leader_epoch < req_epoch && sentinel.current_epoch <= req_epoch) { sdsfree(master->leader); master->leader = sdsnew(req_runid); master->leader_epoch = sentinel.current_epoch; + printf("Selected leader %s for epoch %llu\n", master->leader, + (unsigned long long) master->leader_epoch); } *leader_epoch = master->leader_epoch; From ed01b141e8aa1548f4c2c91861116deff42b3c36 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 13:30:31 +0100 Subject: [PATCH 0290/2500] Sentinel: wait some time between failover attempts. --- src/sentinel.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 0a69107ec..6d7dbc525 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -177,7 +177,7 @@ typedef struct sentinelRedisInstance { uint64_t failover_epoch; /* Epoch of the currently started failover. */ int failover_state; /* See SENTINEL_FAILOVER_STATE_* defines. */ mstime_t failover_state_change_time; - mstime_t failover_start_time; /* When to start to failover if leader. */ + mstime_t failover_start_time; /* Last failover attempt start time. */ mstime_t failover_timeout; /* Max time to refresh failover state. */ struct sentinelRedisInstance *promoted_slave; /* Promoted slave instance. 
*/ /* Scripts executed to notify admin or reconfigure clients: when they @@ -2411,7 +2411,7 @@ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char } *leader_epoch = master->leader_epoch; - return master->leader; + return master->leader ? sdsnew(master->leader) : NULL; } struct sentinelLeader { @@ -2540,6 +2540,7 @@ void sentinelStartFailover(sentinelRedisInstance *master) { master->flags |= SRI_FAILOVER_IN_PROGRESS; master->failover_epoch = ++sentinel.current_epoch; sentinelEvent(REDIS_WARNING,"+failover-triggered",master,"%@"); + master->failover_start_time = mstime(); master->failover_state_change_time = mstime(); } @@ -2559,6 +2560,10 @@ void sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) { /* Failover already in progress? */ if (master->flags & SRI_FAILOVER_IN_PROGRESS) return; + /* Last failover attempt started too little time ago? */ + if (mstime() - master->failover_start_time < + SENTINEL_PUBLISH_PERIOD*4) return; + sentinelStartFailover(master); } From d63c15923ff98cb8c1643929f3ee5c7fa9e1b2f3 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 13:35:25 +0100 Subject: [PATCH 0291/2500] Sentinel: +new-epoch events. --- src/sentinel.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 6d7dbc525..c0bce3451 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1724,8 +1724,11 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd } /* Update local current_epoch if received current_epoch is greater. */ - if (current_epoch > sentinel.current_epoch) + if (current_epoch > sentinel.current_epoch) { sentinel.current_epoch = current_epoch; + sentinelEvent(REDIS_WARNING,"+new-epoch",ri,"%llu", + (unsigned long long) sentinel.current_epoch); + } /* Update master info if received configuration is newer. 
*/ if ((msgmaster = sentinelGetMasterByName(token[5])) != NULL) { @@ -2399,15 +2402,18 @@ int compareRunID(const void *a, const void *b) { * If a vote is not available returns NULL, otherwise return the Sentinel * runid and populate the leader_epoch with the epoch of the last vote. */ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch) { - if (req_epoch > sentinel.current_epoch) + if (req_epoch > sentinel.current_epoch) { sentinel.current_epoch = req_epoch; + sentinelEvent(REDIS_WARNING,"+new-epoch",master,"%llu", + (unsigned long long) sentinel.current_epoch); + } if (master->leader_epoch < req_epoch && sentinel.current_epoch <= req_epoch) { sdsfree(master->leader); master->leader = sdsnew(req_runid); master->leader_epoch = sentinel.current_epoch; - printf("Selected leader %s for epoch %llu\n", master->leader, - (unsigned long long) master->leader_epoch); + sentinelEvent(REDIS_WARNING,"+vote-for-leader",master,"%s %llu", + master->leader, (unsigned long long) master->leader_epoch); } *leader_epoch = master->leader_epoch; @@ -2539,7 +2545,9 @@ void sentinelStartFailover(sentinelRedisInstance *master) { master->failover_state = SENTINEL_FAILOVER_STATE_WAIT_START; master->flags |= SRI_FAILOVER_IN_PROGRESS; master->failover_epoch = ++sentinel.current_epoch; - sentinelEvent(REDIS_WARNING,"+failover-triggered",master,"%@"); + sentinelEvent(REDIS_WARNING,"+new-epoch",master,"%llu", + (unsigned long long) sentinel.current_epoch); + sentinelEvent(REDIS_WARNING,"+try-failover",master,"%@"); master->failover_start_time = mstime(); master->failover_state_change_time = mstime(); } @@ -2658,6 +2666,7 @@ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) { /* If I'm not the leader, I can't continue with the failover. */ if (!isleader) return; + sentinelEvent(REDIS_WARNING,"+elected-leader",ri,"%@"); /* Start the failover going to the next state if enough time has * elapsed. 
*/ From 61794132b2f4f47672eaa2abcf051bf3a4cd23f9 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 16:38:02 +0100 Subject: [PATCH 0292/2500] Sentinel: when starting failover seek for votes ASAP. --- src/sentinel.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index c0bce3451..7cf32d7e0 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -72,6 +72,7 @@ typedef struct sentinelAddr { #define SRI_FORCE_FAILOVER (1<<13) /* Force failover with master up. */ #define SRI_SCRIPT_KILL_SENT (1<<14) /* SCRIPT KILL already sent on -BUSY */ +#define SENTINEL_NO_FLAGS 0 /* Generic no flags define. */ #define SENTINEL_INFO_PERIOD 10000 #define SENTINEL_PING_PERIOD 1000 #define SENTINEL_ASK_PERIOD 1000 @@ -2326,7 +2327,8 @@ void sentinelReceiveIsMasterDownReply(redisAsyncContext *c, void *reply, void *p * SENTINEL IS-MASTER-DOWN-BY-ADDR requests to other sentinels * in order to get the replies that allow to reach the quorum and * possibly also mark the master as objectively down. */ -void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master) { +#define SENTINEL_ASK_FORCED (1<<0) +void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master, int flags) { dictIterator *di; dictEntry *de; @@ -2359,7 +2361,8 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master) { * 3) We did not received the info within SENTINEL_ASK_PERIOD ms. */ if ((master->flags & SRI_S_DOWN) == 0) continue; if (ri->flags & SRI_DISCONNECTED) continue; - if (mstime() - ri->last_master_down_reply_time < SENTINEL_ASK_PERIOD) + if (!(flags & SENTINEL_ASK_FORCED) && + mstime() - ri->last_master_down_reply_time < SENTINEL_ASK_PERIOD) continue; /* Ask */ @@ -2559,20 +2562,23 @@ void sentinelStartFailover(sentinelRedisInstance *master) { * 2) The master is marked as SRI_CAN_FAILOVER, so we can failover it. 
* * We still don't know if we'll win the election so it is possible that we - * start the failover but that we'll not be able to act. */ -void sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) { + * start the failover but that we'll not be able to act. + * + * Return non-zero if a failover was started. */ +int sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) { /* We can't failover if the master is not in O_DOWN state. */ if (!(master->flags & SRI_CAN_FAILOVER) || - !(master->flags & SRI_O_DOWN)) return; + !(master->flags & SRI_O_DOWN)) return 0; /* Failover already in progress? */ - if (master->flags & SRI_FAILOVER_IN_PROGRESS) return; + if (master->flags & SRI_FAILOVER_IN_PROGRESS) return 0; /* Last failover attempt started too little time ago? */ if (mstime() - master->failover_start_time < - SENTINEL_PUBLISH_PERIOD*4) return; + SENTINEL_PUBLISH_PERIOD*4) return 0; sentinelStartFailover(master); + return 1; } /* Select a suitable slave to promote. The current algorithm only uses @@ -2984,9 +2990,10 @@ void sentinelHandleRedisInstance(sentinelRedisInstance *ri) { /* Only masters */ if (ri->flags & SRI_MASTER) { sentinelCheckObjectivelyDown(ri); - sentinelStartFailoverIfNeeded(ri); + if (sentinelStartFailoverIfNeeded(ri)) + sentinelAskMasterStateToOtherSentinels(ri,SENTINEL_ASK_FORCED); sentinelFailoverStateMachine(ri); - sentinelAskMasterStateToOtherSentinels(ri); + sentinelAskMasterStateToOtherSentinels(ri,SENTINEL_NO_FLAGS); } } From 1e7462344617c54f31fb36060b9cef5c2db25dd0 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 17:07:31 +0100 Subject: [PATCH 0293/2500] Sentinel: new failover algo, desync slaves and update config epoch. 
--- src/sentinel.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index 7cf32d7e0..5ad7bee3f 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1509,6 +1509,12 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { (ri->master->failover_state == SENTINEL_FAILOVER_STATE_WAIT_PROMOTION)) { + /* Now that we are sure the slave was reconfigured as a master + * set the master configuration epoch to the epoch we won the + * election to perform this failover. This will force the other + * Sentinels to update their config (assuming there is not + * a newer one already available). */ + ri->master->config_epoch = ri->master->failover_epoch; ri->master->failover_state = SENTINEL_FAILOVER_STATE_RECONF_SLAVES; ri->master->failover_state_change_time = mstime(); sentinelEvent(REDIS_WARNING,"+promoted-slave",ri,"%@"); @@ -2417,6 +2423,13 @@ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char master->leader_epoch = sentinel.current_epoch; sentinelEvent(REDIS_WARNING,"+vote-for-leader",master,"%s %llu", master->leader, (unsigned long long) master->leader_epoch); + /* If we did not voted for ourselves, set the master failover start + * time to now, in order to force a delay before we can start a + * failover for the same master. + * + * The random addition is useful to desynchronize a bit the slaves + * and reduce the chance that no slave gets majority. */ + master->failover_start_time = mstime() + rand() % 2000; } *leader_epoch = master->leader_epoch; @@ -2671,7 +2684,14 @@ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) { sdsfree(leader); /* If I'm not the leader, I can't continue with the failover. */ - if (!isleader) return; + if (!isleader) { + /* Abort the failover if I'm not the leader after some time. 
*/ + if (mstime() - ri->failover_start_time > 10000) { + sentinelEvent(REDIS_WARNING,"-failover-abort-not-elected",ri,"%@"); + sentinelAbortFailover(ri); + } + return; + } sentinelEvent(REDIS_WARNING,"+elected-leader",ri,"%@"); /* Start the failover going to the next state if enough time has From 4d365131b0db3f173f25ba6fa8512fee2aecc2e3 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 17:21:48 +0100 Subject: [PATCH 0294/2500] Sentinel: added config-epoch to SENTINEL masters output. --- src/sentinel.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/sentinel.c b/src/sentinel.c index 5ad7bee3f..479df17ef 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1939,6 +1939,10 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) { /* Only masters */ if (ri->flags & SRI_MASTER) { + addReplyBulkCString(c,"config-epoch"); + addReplyBulkLongLong(c,ri->config_epoch); + fields++; + addReplyBulkCString(c,"num-slaves"); addReplyBulkLongLong(c,dictSize(ri->slaves)); fields++; From 37203602f9ff484fc45c387b688cd0e35ac5b98e Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 23:00:14 +0100 Subject: [PATCH 0295/2500] Sentinel: change event name when converting master to slave. --- src/sentinel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index 479df17ef..af3c264d8 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1543,7 +1543,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { ri->master->addr->ip, ri->master->addr->port); if (retval == REDIS_OK) - sentinelEvent(REDIS_NOTICE,"+demote-old-slave",ri,"%@"); + sentinelEvent(REDIS_NOTICE,"+convert-to-slave",ri,"%@"); } } } From 9896fd51e0c77f73931646bdcf117b8c3b379c7a Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 23:07:33 +0100 Subject: [PATCH 0296/2500] Sentinel: receive Pub/Sub messages from slaves. 
--- src/sentinel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index af3c264d8..e66b8b84d 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1334,7 +1334,7 @@ void sentinelReconnectInstance(sentinelRedisInstance *ri) { } } /* Pub / Sub */ - if ((ri->flags & SRI_MASTER) && ri->pc == NULL) { + if ((ri->flags & (SRI_MASTER|SRI_SLAVE)) && ri->pc == NULL) { ri->pc = redisAsyncConnect(ri->addr->ip,ri->addr->port); if (ri->pc->err) { sentinelEvent(REDIS_DEBUG,"-pubsub-link-reconnection",ri,"%@ #%s", From f5b5a8eb9f00a8bc7961bf626d5deff28d0aff84 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 10:30:45 +0100 Subject: [PATCH 0297/2500] Sentinel: sentinelResetMaster() new flag to avoid removing set of sentinels. This commit also removes some dead code and cleanup generic flags. --- src/sentinel.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index e66b8b84d..648e1ac04 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -72,7 +72,6 @@ typedef struct sentinelAddr { #define SRI_FORCE_FAILOVER (1<<13) /* Force failover with master up. */ #define SRI_SCRIPT_KILL_SENT (1<<14) /* SCRIPT KILL already sent on -BUSY */ -#define SENTINEL_NO_FLAGS 0 /* Generic no flags define. */ #define SENTINEL_INFO_PERIOD 10000 #define SENTINEL_PING_PERIOD 1000 #define SENTINEL_ASK_PERIOD 1000 @@ -111,11 +110,13 @@ typedef struct sentinelAddr { #define SENTINEL_MASTER_LINK_STATUS_UP 0 #define SENTINEL_MASTER_LINK_STATUS_DOWN 1 -/* Generic flags that can be used with different functions. */ +/* Generic flags that can be used with different functions. + * They use higher bits to avoid colliding with the function specific + * flags. 
*/ #define SENTINEL_NO_FLAGS 0 -#define SENTINEL_GENERATE_EVENT 1 -#define SENTINEL_LEADER 2 -#define SENTINEL_OBSERVER 4 +#define SENTINEL_GENERATE_EVENT (1<<16) +#define SENTINEL_LEADER (1<<17) +#define SENTINEL_OBSERVER (1<<18) /* Script execution flags and limits. */ #define SENTINEL_SCRIPT_NONE 0 @@ -1079,12 +1080,16 @@ void sentinelDelFlagsToDictOfRedisInstances(dict *instances, int flags) { * 5) In the process of doing this undo the failover if in progress. * 6) Disconnect the connections with the master (will reconnect automatically). */ + +#define SENTINEL_RESET_NO_SENTINELS (1<<0) void sentinelResetMaster(sentinelRedisInstance *ri, int flags) { redisAssert(ri->flags & SRI_MASTER); dictRelease(ri->slaves); - dictRelease(ri->sentinels); ri->slaves = dictCreate(&instancesDictType,NULL); - ri->sentinels = dictCreate(&instancesDictType,NULL); + if (!(flags & SENTINEL_RESET_NO_SENTINELS)) { + dictRelease(ri->sentinels); + ri->sentinels = dictCreate(&instancesDictType,NULL); + } if (ri->cc) sentinelKillLink(ri,ri->cc); if (ri->pc) sentinelKillLink(ri,ri->pc); ri->flags &= SRI_MASTER|SRI_CAN_FAILOVER|SRI_DISCONNECTED; @@ -1134,17 +1139,13 @@ int sentinelResetMastersByPattern(char *pattern, int flags) { * This is used to handle the +switch-master and +redirect-to-master events. * * The function returns REDIS_ERR if the address can't be resolved for some - * reason. Otherwise REDIS_OK is returned. - * - * TODO: make this reset so that original sentinels are re-added with - * same ip / port / runid. - */ + * reason. Otherwise REDIS_OK is returned. 
*/ int sentinelResetMasterAndChangeAddress(sentinelRedisInstance *master, char *ip, int port) { sentinelAddr *oldaddr, *newaddr; newaddr = createSentinelAddr(ip,port); if (newaddr == NULL) return REDIS_ERR; - sentinelResetMaster(master,SENTINEL_NO_FLAGS); + sentinelResetMaster(master,SENTINEL_RESET_NO_SENTINELS); oldaddr = master->addr; master->addr = newaddr; master->o_down_since_time = 0; @@ -2898,15 +2899,6 @@ void sentinelFailoverSwitchToPromotedSlave(sentinelRedisInstance *master) { old_master_ip = sdsdup(master->addr->ip); old_master_port = master->addr->port; sentinelResetMasterAndChangeAddress(master,ref->addr->ip,ref->addr->port); - /* If this is a real switch and not just a user requested reset, we want - * to add all the known instances as slaves, and also all the sentinels - * back to this master. */ - if (master != ref) { - /* TODO: - createSentinelRedisInstance(NULL,SRI_SLAVE - old_master_ip, old_master_port, master->quorum, master); - */ - } sdsfree(old_master_ip); } From 47598f4a88b61118628ca56f686ee52fb2a5d69d Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 13:01:11 +0100 Subject: [PATCH 0298/2500] Sentinel: readd slaves back after a master reset. --- src/sentinel.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/sentinel.c b/src/sentinel.c index 648e1ac04..72ef3587b 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -443,6 +443,11 @@ void releaseSentinelAddr(sentinelAddr *sa) { zfree(sa); } +/* Return non-zero if two addresses are equal. */ +int sentinelAddrIsEqual(sentinelAddr *a, sentinelAddr *b) { + return a->port == b->port && !strcasecmp(a->ip,b->ip); +} + /* =========================== Events notification ========================== */ /* Send an event to log, pub/sub, user notification script. @@ -1142,15 +1147,54 @@ int sentinelResetMastersByPattern(char *pattern, int flags) { * reason. Otherwise REDIS_OK is returned. 
*/ int sentinelResetMasterAndChangeAddress(sentinelRedisInstance *master, char *ip, int port) { sentinelAddr *oldaddr, *newaddr; + sentinelAddr **slaves = NULL; + int numslaves = 0, j; + dictIterator *di; + dictEntry *de; newaddr = createSentinelAddr(ip,port); if (newaddr == NULL) return REDIS_ERR; + + /* Make a list of slaves to add back after the reset. + * Don't include the one having the address we are switching to. */ + di = dictGetIterator(master->slaves); + while((de = dictNext(di)) != NULL) { + sentinelRedisInstance *slave = dictGetVal(de); + + if (sentinelAddrIsEqual(slave->addr,newaddr)) continue; + slaves = zrealloc(slaves,sizeof(sentinelAddr*)*(numslaves+1)); + slaves[numslaves++] = createSentinelAddr(slave->addr->ip, + slave->addr->port); + } + dictReleaseIterator(di); + + /* If we are switching to a different address, include the old address + * as a slave as well, so that we'll be able to sense / reconfigure + * the old master. */ + if (!sentinelAddrIsEqual(newaddr,master->addr)) { + slaves = zrealloc(slaves,sizeof(sentinelAddr*)*(numslaves+1)); + slaves[numslaves++] = createSentinelAddr(master->addr->ip, + master->addr->port); + } + + /* Reset and switch address. */ sentinelResetMaster(master,SENTINEL_RESET_NO_SENTINELS); oldaddr = master->addr; master->addr = newaddr; master->o_down_since_time = 0; master->s_down_since_time = 0; + /* Add slaves back. */ + for (j = 0; j < numslaves; j++) { + sentinelRedisInstance *slave; + + slave = createSentinelRedisInstance(NULL,SRI_SLAVE,slaves[j]->ip, + slaves[j]->port, master->quorum, master); + releaseSentinelAddr(slaves[j]); + if (slave) sentinelEvent(REDIS_NOTICE,"+slave",slave,"%@"); + } + zfree(slaves); + /* Release the old address at the end so we are safe even if the function * gets the master->addr->ip and master->addr->port as arguments. 
*/ releaseSentinelAddr(oldaddr); From e1d06fdeb996dd751fa99fb464db0e54da05c9b0 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 13:43:59 +0100 Subject: [PATCH 0299/2500] Sentinel: fix no-down check in master->slave conversion code. --- src/sentinel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index 72ef3587b..fb5896eb1 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1574,7 +1574,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { * configuratio via Pub/Sub if any. */ mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4; - if (!sentinelRedisInstanceNoDownFor(ri->master,wait_time) || + if (!sentinelRedisInstanceNoDownFor(ri,wait_time) || (mstime()-sentinel.tilt_start_time) < wait_time) return; From 72629ad358a1d45f52edc28a5f6c3f7b30da15ba Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 16:18:23 +0100 Subject: [PATCH 0300/2500] Sentinel: track role change time. Wait before reconfigurations. --- src/sentinel.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index fb5896eb1..442eebc3e 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -155,6 +155,14 @@ typedef struct sentinelRedisInstance { mstime_t down_after_period; /* Consider it down after that period. */ mstime_t info_refresh; /* Time at which we received INFO output from it. */ + /* Role and the first time we observed it. + * This is useful in order to delay replacing what the instance reports + * with our own configuration. We need to always wait some time in order + * to give a chance to the leader to report the new configuration before + * we do silly things. */ + int role_reported; + mstime_t role_reported_time; + /* Master specific. */ dict *sentinels; /* Other sentinels monitoring the same master. */ dict *slaves; /* Slaves for this master instance. 
*/ @@ -911,6 +919,10 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * ri->notification_script = NULL; ri->client_reconfig_script = NULL; + /* Role */ + ri->role_reported = ri->flags & (SRI_MASTER|SRI_SLAVE); + ri->role_reported_time = mstime(); + /* Add into the right table. */ dictAdd(table, ri->name, ri); return ri; @@ -1536,6 +1548,11 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { * master, always. */ if ((ri->flags & SRI_MASTER) && role == SRI_SLAVE && ri->slave_master_host) { + if (ri->role_reported != SRI_MASTER) { + ri->role_reported_time = mstime(); + ri->role_reported = SRI_MASTER; + } + sentinelEvent(REDIS_WARNING,"+redirect-to-master",ri, "%s %s %d %s %d", ri->name, ri->addr->ip, ri->addr->port, @@ -1547,6 +1564,11 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { /* Handle slave -> master role switch. */ if ((ri->flags & SRI_SLAVE) && role == SRI_MASTER) { + if (ri->role_reported != SRI_SLAVE) { + ri->role_reported_time = mstime(); + ri->role_reported = SRI_SLAVE; + } + /* If this is a promoted slave we can change state to the * failover state machine. */ if (!sentinel.tilt && @@ -1575,7 +1597,8 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4; if (!sentinelRedisInstanceNoDownFor(ri,wait_time) || - (mstime()-sentinel.tilt_start_time) < wait_time) + mstime() - ri->role_reported_time < wait_time || + mstime() - sentinel.tilt_start_time < wait_time) return; /* Make sure the master is sane before reconfiguring this instance From b27dd8b9d302b80ae3b9de2e5f53f6faae96c1a3 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 16:21:58 +0100 Subject: [PATCH 0301/2500] Sentinel: make sure role_reported is always updated. 
--- src/sentinel.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 442eebc3e..a62e64954 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1546,20 +1546,21 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { /* When what we believe is our master, turned into a slave, the wiser * thing we can do is to follow the events and redirect to the new * master, always. */ - if ((ri->flags & SRI_MASTER) && role == SRI_SLAVE && ri->slave_master_host) - { + if ((ri->flags & SRI_MASTER) && role == SRI_SLAVE) { if (ri->role_reported != SRI_MASTER) { ri->role_reported_time = mstime(); ri->role_reported = SRI_MASTER; } - sentinelEvent(REDIS_WARNING,"+redirect-to-master",ri, - "%s %s %d %s %d", - ri->name, ri->addr->ip, ri->addr->port, - ri->slave_master_host, ri->slave_master_port); - sentinelResetMasterAndChangeAddress(ri,ri->slave_master_host, - ri->slave_master_port); - return; /* Don't process anything after this event. */ + if (ri->slave_master_host) { + sentinelEvent(REDIS_WARNING,"+redirect-to-master",ri, + "%s %s %d %s %d", + ri->name, ri->addr->ip, ri->addr->port, + ri->slave_master_host, ri->slave_master_port); + sentinelResetMasterAndChangeAddress(ri,ri->slave_master_host, + ri->slave_master_port); + return; /* Don't process anything after this event. */ + } } /* Handle slave -> master role switch. */ From cc721419ac7cec8e2b4d27c2120d835056f0d98d Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 16:28:52 +0100 Subject: [PATCH 0302/2500] Sentinel: being a master and reporting as slave is considered SDOWN. 
--- src/sentinel.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index a62e64954..b115de870 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2319,8 +2319,18 @@ void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) { sentinelKillLink(ri,ri->pc); } - /* Update the subjectively down flag. */ - if (elapsed > ri->down_after_period) { + /* Update the subjectively down flag. We believe the instance is in SDOWN + * state if: + * 1) It is not replying. + * 2) We believe it is a master, it reports to be a slave for enough time + * to meet the down_after_period, plus enough time to get two times + * INFO report from the instance. */ + if (elapsed > ri->down_after_period || + (ri->flags & SRI_MASTER && + ri->role_reported == SRI_SLAVE && + mstime() - ri->role_reported_time > + (ri->down_after_period+SENTINEL_INFO_PERIOD*2))) + { /* Is subjectively down */ if ((ri->flags & SRI_S_DOWN) == 0) { sentinelEvent(REDIS_WARNING,"+sdown",ri,"%@"); From 406fee011d23dbe51a297be990e9ee30da5f9bfa Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 16:36:40 +0100 Subject: [PATCH 0303/2500] Sentinel: role reporting fixed and added in SENTINEL output. --- src/sentinel.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index b115de870..b60c375de 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1547,9 +1547,9 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { * thing we can do is to follow the events and redirect to the new * master, always. 
*/ if ((ri->flags & SRI_MASTER) && role == SRI_SLAVE) { - if (ri->role_reported != SRI_MASTER) { + if (ri->role_reported != SRI_SLAVE) { ri->role_reported_time = mstime(); - ri->role_reported = SRI_MASTER; + ri->role_reported = SRI_SLAVE; } if (ri->slave_master_host) { @@ -1565,9 +1565,9 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { /* Handle slave -> master role switch. */ if ((ri->flags & SRI_SLAVE) && role == SRI_MASTER) { - if (ri->role_reported != SRI_SLAVE) { + if (ri->role_reported != SRI_MASTER) { ri->role_reported_time = mstime(); - ri->role_reported = SRI_SLAVE; + ri->role_reported = SRI_MASTER; } /* If this is a promoted slave we can change state to the @@ -2004,6 +2004,15 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) { addReplyBulkCString(c,"info-refresh"); addReplyBulkLongLong(c,mstime() - ri->info_refresh); fields++; + + addReplyBulkCString(c,"role-reported"); + addReplyBulkCString(c, (ri->role_reported == SRI_MASTER) ? "master" : + "slave"); + fields++; + + addReplyBulkCString(c,"role-reported-time"); + addReplyBulkLongLong(c,mstime() - ri->role_reported_time); + fields++; } /* Only masters */ From f81306af33428671201970c7adf34fb013438283 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 17:02:09 +0100 Subject: [PATCH 0304/2500] Sentinel: safer slave reconfig, master reported role should match. --- src/sentinel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sentinel.c b/src/sentinel.c index b60c375de..d6b394beb 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1605,6 +1605,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { /* Make sure the master is sane before reconfiguring this instance * into a slave. 
*/ if (ri->master->flags & SRI_MASTER && + ri->master->role_reported == SRI_MASTER && (ri->master->flags & (SRI_S_DOWN|SRI_O_DOWN)) == 0 && (mstime() - ri->master->info_refresh) < SENTINEL_INFO_PERIOD*2) { From 4608f5da94b7076a74475a75d4f22b8c97649312 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 17:03:42 +0100 Subject: [PATCH 0305/2500] Sentinel: redirect-to-master is not ok with new algorithm. Now Sentinel believe the current configuration is always the winner and should be applied by Sentinels instead of trying to adapt our view of the cluster based on what we observe. So the only way to modify what a Sentinel believe to be the truth is to win an election and advertise the new configuration via Pub / Sub with a greater configuration epoch. --- src/sentinel.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index d6b394beb..89cb631cb 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1543,24 +1543,12 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { * Some things will not happen if sentinel.tilt is true, but some will * still be processed. */ - /* When what we believe is our master, turned into a slave, the wiser - * thing we can do is to follow the events and redirect to the new - * master, always. */ + /* Handle master -> slave role switch. */ if ((ri->flags & SRI_MASTER) && role == SRI_SLAVE) { if (ri->role_reported != SRI_SLAVE) { ri->role_reported_time = mstime(); ri->role_reported = SRI_SLAVE; } - - if (ri->slave_master_host) { - sentinelEvent(REDIS_WARNING,"+redirect-to-master",ri, - "%s %s %d %s %d", - ri->name, ri->addr->ip, ri->addr->port, - ri->slave_master_host, ri->slave_master_port); - sentinelResetMasterAndChangeAddress(ri,ri->slave_master_host, - ri->slave_master_port); - return; /* Don't process anything after this event. */ - } } /* Handle slave -> master role switch. 
*/ From e7fb6c5697b381c8c276f7f461c05755ac1670b9 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 14 Nov 2013 00:08:13 +0100 Subject: [PATCH 0306/2500] Sentinel: remember last time slave changed master. --- src/sentinel.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 89cb631cb..a6e0c3099 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -162,6 +162,7 @@ typedef struct sentinelRedisInstance { * we do silly things. */ int role_reported; mstime_t role_reported_time; + mstime_t slave_conf_change_time; /* Last time slave master addr changed. */ /* Master specific. */ dict *sentinels; /* Other sentinels monitoring the same master. */ @@ -922,6 +923,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * /* Role */ ri->role_reported = ri->flags & (SRI_MASTER|SRI_SLAVE); ri->role_reported_time = mstime(); + ri->slave_conf_change_time = mstime(); /* Add into the right table. */ dictAdd(table, ri->name, ri); @@ -1515,13 +1517,24 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { if (role == SRI_SLAVE) { /* master_host: */ if (sdslen(l) >= 12 && !memcmp(l,"master_host:",12)) { - sdsfree(ri->slave_master_host); - ri->slave_master_host = sdsnew(l+12); + if (ri->slave_master_host == NULL || + strcasecmp(l+12,ri->slave_master_host)) + { + sdsfree(ri->slave_master_host); + ri->slave_master_host = sdsnew(l+12); + ri->slave_conf_change_time = mstime(); + } } /* master_port: */ - if (sdslen(l) >= 12 && !memcmp(l,"master_port:",12)) - ri->slave_master_port = atoi(l+12); + if (sdslen(l) >= 12 && !memcmp(l,"master_port:",12)) { + int slave_master_port = atoi(l+12); + + if (ri->slave_master_port != slave_master_port) { + ri->slave_master_port = slave_master_port; + ri->slave_conf_change_time = mstime(); + } + } /* master_link_status: */ if (sdslen(l) >= 19 && !memcmp(l,"master_link_status:",19)) { @@ -1548,6 +1561,7 @@ void 
sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { if (ri->role_reported != SRI_SLAVE) { ri->role_reported_time = mstime(); ri->role_reported = SRI_SLAVE; + ri->slave_conf_change_time = mstime(); } } From 5ceb9dc93ee77af81e4ffa5c5e788c12631953a9 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 14 Nov 2013 00:29:38 +0100 Subject: [PATCH 0307/2500] Sentinel: reconfigure slaves to right master. --- src/sentinel.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index a6e0c3099..4e42f2b17 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1596,7 +1596,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { /* A slave turned into a master. We want to force our view and * reconfigure as slave, but make sure to wait some time before * doing this in order to make sure to receive an updated - * configuratio via Pub/Sub if any. */ + * configuration via Pub/Sub if any. */ mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4; if (!sentinelRedisInstanceNoDownFor(ri,wait_time) || @@ -1620,6 +1620,32 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { } } + /* Handle slaves replicating to a different master address. */ + if ((ri->flags & SRI_SLAVE) && !sentinel.tilt && + (ri->slave_master_port != ri->master->addr->port || + strcasecmp(ri->slave_master_host,ri->master->addr->ip))) + { + mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4; + + if (!sentinelRedisInstanceNoDownFor(ri,wait_time) || + mstime() - ri->slave_conf_change_time < wait_time) + return; + + /* Make sure the master is sane before reconfiguring this instance + * into a slave. 
*/ + if (ri->master->flags & SRI_MASTER && + ri->master->role_reported == SRI_MASTER && + (ri->master->flags & (SRI_S_DOWN|SRI_O_DOWN)) == 0 && + (mstime() - ri->master->info_refresh) < SENTINEL_INFO_PERIOD*2) + { + int retval = sentinelSendSlaveOf(ri, + ri->master->addr->ip, + ri->master->addr->port); + if (retval == REDIS_OK) + sentinelEvent(REDIS_NOTICE,"+fix-slave-config",ri,"%@"); + } + } + /* None of the following conditions are processed when in tilt mode, so * return asap. */ if (sentinel.tilt) return; From f4216da4f104cdef43db3475e738e1047f5f6064 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 14 Nov 2013 00:36:43 +0100 Subject: [PATCH 0308/2500] Sentinel: simplify and refactor slave reconfig code. --- src/sentinel.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 4e42f2b17..a01ae8fdf 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1431,6 +1431,19 @@ void sentinelReconnectInstance(sentinelRedisInstance *ri) { /* ======================== Redis instances pinging ======================== */ +/* Return true if master looks "sane", that is: + * 1) It is actually a master in the current configuration. + * 2) It reports itself as a master. + * 3) It is not SDOWN or ODOWN. + * 4) We obtained last INFO no more than two times the INFO period of time ago. */ +int sentinelMasterLooksSane(sentinelRedisInstance *master) { + return + master->flags & SRI_MASTER && + master->role_reported == SRI_MASTER && + (master->flags & (SRI_S_DOWN|SRI_O_DOWN)) == 0 && + (mstime() - master->info_refresh) < SENTINEL_INFO_PERIOD*2; +} + /* Process the INFO output from masters. */ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { sds *lines; @@ -1594,22 +1607,13 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { "start",ri->master->addr,ri->addr); } else if (!sentinel.tilt) { /* A slave turned into a master. 
We want to force our view and - * reconfigure as slave, but make sure to wait some time before - * doing this in order to make sure to receive an updated - * configuration via Pub/Sub if any. */ + * reconfigure as slave. Wait some time after the change before + * going forward, to receive new configs if any. */ mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4; - if (!sentinelRedisInstanceNoDownFor(ri,wait_time) || - mstime() - ri->role_reported_time < wait_time || - mstime() - sentinel.tilt_start_time < wait_time) - return; - - /* Make sure the master is sane before reconfiguring this instance - * into a slave. */ - if (ri->master->flags & SRI_MASTER && - ri->master->role_reported == SRI_MASTER && - (ri->master->flags & (SRI_S_DOWN|SRI_O_DOWN)) == 0 && - (mstime() - ri->master->info_refresh) < SENTINEL_INFO_PERIOD*2) + if (sentinelMasterLooksSane(ri->master) && + sentinelRedisInstanceNoDownFor(ri,wait_time) && + mstime() - ri->role_reported_time > wait_time) { int retval = sentinelSendSlaveOf(ri, ri->master->addr->ip, @@ -1627,16 +1631,11 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { { mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4; - if (!sentinelRedisInstanceNoDownFor(ri,wait_time) || - mstime() - ri->slave_conf_change_time < wait_time) - return; - /* Make sure the master is sane before reconfiguring this instance * into a slave. 
*/ - if (ri->master->flags & SRI_MASTER && - ri->master->role_reported == SRI_MASTER && - (ri->master->flags & (SRI_S_DOWN|SRI_O_DOWN)) == 0 && - (mstime() - ri->master->info_refresh) < SENTINEL_INFO_PERIOD*2) + if (sentinelMasterLooksSane(ri->master) && + sentinelRedisInstanceNoDownFor(ri,wait_time) && + mstime() - ri->slave_conf_change_time > wait_time) { int retval = sentinelSendSlaveOf(ri, ri->master->addr->ip, From d1864486fc85426ee9b6a165d65022179b10fbe7 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 14 Nov 2013 10:23:05 +0100 Subject: [PATCH 0309/2500] Sentinel: fix conditional to only affect slaves with wrong master. --- src/sentinel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sentinel.c b/src/sentinel.c index a01ae8fdf..31f583388 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1626,6 +1626,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { /* Handle slaves replicating to a different master address. */ if ((ri->flags & SRI_SLAVE) && !sentinel.tilt && + role == SRI_SLAVE && (ri->slave_master_port != ri->master->addr->port || strcasecmp(ri->slave_master_host,ri->master->addr->ip))) { From a031dc6efebb7d20ce5205235377b793759dc397 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 14 Nov 2013 10:23:54 +0100 Subject: [PATCH 0310/2500] Sentinel: master address selection in get-master-address refactored. --- src/sentinel.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 31f583388..b59f2dc8d 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1226,6 +1226,24 @@ int sentinelRedisInstanceNoDownFor(sentinelRedisInstance *ri, mstime_t ms) { return most_recent == 0 || (mstime() - most_recent) > ms; } +/* Return the current master address, that is, its address or the address + * of the promoted slave if already operational. 
*/ +sentinelAddr *sentinelGetCurrentMasterAddress(sentinelRedisInstance *master) { + /* If we are failing over the master, and the state is already + * SENTINEL_FAILOVER_STATE_RECONF_SLAVES or greater, it means that we + * already have the new configuration epoch in the master, and the + * slave acknowledged the configuration switch. Advertise the new + * address. */ + if ((master->flags & SRI_FAILOVER_IN_PROGRESS) && + master->promoted_slave && + master->failover_state >= SENTINEL_FAILOVER_STATE_RECONF_SLAVES) + { + return master->promoted_slave->addr; + } else { + return master->addr; + } +} + /* ============================ Config handling ============================= */ char *sentinelHandleConfiguration(char **argv, int argc) { sentinelRedisInstance *ri; @@ -2217,18 +2235,8 @@ void sentinelCommand(redisClient *c) { } else if (ri->info_refresh == 0) { addReplySds(c,sdsnew("-IDONTKNOW I have not enough information to reply. Please ask another Sentinel.\r\n")); } else { - sentinelAddr *addr = ri->addr; + sentinelAddr *addr = sentinelGetCurrentMasterAddress(ri); - /* If we are in the middle of a failover, and the slave was - * already successfully switched to master role, we can advertise - * the new address as slave in order to allow clients to talk - * with the new master ASAP. */ - if ((ri->flags & SRI_FAILOVER_IN_PROGRESS) && - ri->promoted_slave && - ri->failover_state >= SENTINEL_FAILOVER_STATE_RECONF_SLAVES) - { - addr = ri->promoted_slave->addr; - } addReplyMultiBulkLen(c,2); addReplyBulkCString(c,addr->ip); addReplyBulkLongLong(c,addr->port); From 297df0e910947f3591a111af00cb3d539977e6d9 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 14 Nov 2013 10:25:55 +0100 Subject: [PATCH 0311/2500] Sentinel: fix address of master in Hello messages. Once we switched configuration during a failover, we should advertise the new address. 
This was a serious race condition as the Sentinel performing the failover for a moment advertised the old address with the new configuration epoch: once transmitted to the other Sentinels the broken configuration would remain there forever, until the next failover (because a greater configuration epoch is required to overwrite an older one). --- src/sentinel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index b59f2dc8d..e6c39dd53 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1936,6 +1936,7 @@ void sentinelPingInstance(sentinelRedisInstance *ri) { char payload[REDIS_IP_STR_LEN+1024]; sentinelRedisInstance *master = (ri->flags & SRI_MASTER) ? ri : ri->master; + sentinelAddr *master_addr = sentinelGetCurrentMasterAddress(master); snprintf(payload,sizeof(payload), "%s,%d,%s,%d,%llu," /* Info about this sentinel. */ @@ -1944,7 +1945,7 @@ void sentinelPingInstance(sentinelRedisInstance *ri) { (master->flags & SRI_CAN_FAILOVER) != 0, (unsigned long long) sentinel.current_epoch, /* --- */ - master->name,master->addr->ip,master->addr->port, + master->name,master_addr->ip,master_addr->port, master->config_epoch); retval = redisAsyncCommand(ri->cc, sentinelPublishReplyCallback, NULL, "PUBLISH %s %s", From 00cad98228bb0f6d75f6e7106c26aa884b9fa312 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 18 Nov 2013 10:08:06 +0100 Subject: [PATCH 0312/2500] Sentinel: election timeout define. --- src/sentinel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index e6c39dd53..88d057e1c 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -87,6 +87,7 @@ typedef struct sentinelAddr { #define SENTINEL_MIN_LINK_RECONNECT_PERIOD 15000 #define SENTINEL_DEFAULT_FAILOVER_TIMEOUT (60*15*1000) #define SENTINEL_MAX_PENDING_COMMANDS 100 +#define SENTINEL_ELECTION_TIMEOUT 10000 /* How many milliseconds is an information valid? 
This applies for instance * to the reply to SENTINEL IS-MASTER-DOWN-BY-ADDR replies. */ @@ -2816,7 +2817,7 @@ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) { /* If I'm not the leader, I can't continue with the failover. */ if (!isleader) { /* Abort the failover if I'm not the leader after some time. */ - if (mstime() - ri->failover_start_time > 10000) { + if (mstime() - ri->failover_start_time > SENTINEL_ELECTION_TIMEOUT) { sentinelEvent(REDIS_WARNING,"-failover-abort-not-elected",ri,"%@"); sentinelAbortFailover(ri); } From 7b7763ff3ec68c8c196e4afcfee0ce5b8c0a3e97 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 18 Nov 2013 11:12:58 +0100 Subject: [PATCH 0313/2500] Sentinel: state machine and timeouts simplified. --- src/sentinel.c | 95 +++++++++++++++++++++++++------------------------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 88d057e1c..f6b7c019e 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -85,7 +85,7 @@ typedef struct sentinelAddr { #define SENTINEL_SLAVE_RECONF_RETRY_PERIOD 10000 #define SENTINEL_DEFAULT_PARALLEL_SYNCS 1 #define SENTINEL_MIN_LINK_RECONNECT_PERIOD 15000 -#define SENTINEL_DEFAULT_FAILOVER_TIMEOUT (60*15*1000) +#define SENTINEL_DEFAULT_FAILOVER_TIMEOUT (60*5*1000) #define SENTINEL_MAX_PENDING_COMMANDS 100 #define SENTINEL_ELECTION_TIMEOUT 10000 @@ -105,8 +105,7 @@ typedef struct sentinelAddr { #define SENTINEL_FAILOVER_STATE_WAIT_NEXT_SLAVE 6 /* wait replication */ #define SENTINEL_FAILOVER_STATE_ALERT_CLIENTS 7 /* Run user script. */ #define SENTINEL_FAILOVER_STATE_WAIT_ALERT_SCRIPT 8 /* Wait script exec. */ -#define SENTINEL_FAILOVER_STATE_DETECT_END 9 /* Check for failover end. */ -#define SENTINEL_FAILOVER_STATE_UPDATE_CONFIG 10 /* Monitor promoted slave. */ +#define SENTINEL_FAILOVER_STATE_UPDATE_CONFIG 9 /* Monitor promoted slave. 
*/ #define SENTINEL_MASTER_LINK_STATUS_UP 0 #define SENTINEL_MASTER_LINK_STATUS_DOWN 1 @@ -1693,10 +1692,6 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { ri->flags &= ~SRI_RECONF_INPROG; ri->flags |= SRI_RECONF_DONE; sentinelEvent(REDIS_NOTICE,"+slave-reconf-done",ri,"%@"); - /* If we are moving forward (a new slave is now configured) - * we update the change_time as we are conceptually passing - * to the next slave. */ - ri->failover_state_change_time = mstime(); } } } @@ -1968,7 +1963,6 @@ const char *sentinelFailoverStateStr(int state) { case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION: return "wait_promotion"; case SENTINEL_FAILOVER_STATE_RECONF_SLAVES: return "reconf_slaves"; case SENTINEL_FAILOVER_STATE_ALERT_CLIENTS: return "alert_clients"; - case SENTINEL_FAILOVER_STATE_DETECT_END: return "detect_end"; case SENTINEL_FAILOVER_STATE_UPDATE_CONFIG: return "update_config"; default: return "unknown"; } @@ -2816,17 +2810,20 @@ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) { /* If I'm not the leader, I can't continue with the failover. */ if (!isleader) { + int election_timeout = SENTINEL_ELECTION_TIMEOUT; + + /* The election timeout is the MIN between SENTINEL_ELECTION_TIMEOUT + * and the configured failover timeout. */ + if (election_timeout > ri->failover_timeout) + election_timeout = ri->failover_timeout; /* Abort the failover if I'm not the leader after some time. */ - if (mstime() - ri->failover_start_time > SENTINEL_ELECTION_TIMEOUT) { + if (mstime() - ri->failover_start_time > election_timeout) { sentinelEvent(REDIS_WARNING,"-failover-abort-not-elected",ri,"%@"); sentinelAbortFailover(ri); } return; } sentinelEvent(REDIS_WARNING,"+elected-leader",ri,"%@"); - - /* Start the failover going to the next state if enough time has - * elapsed. 
*/ ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE; ri->failover_state_change_time = mstime(); sentinelEvent(REDIS_WARNING,"+failover-state-select-slave",ri,"%@"); @@ -2835,6 +2832,8 @@ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) { void sentinelFailoverSelectSlave(sentinelRedisInstance *ri) { sentinelRedisInstance *slave = sentinelSelectSlave(ri); + /* We don't handle the timeout in this state as the function aborts + * the failover or go forward in the next state. */ if (slave == NULL) { sentinelEvent(REDIS_WARNING,"-failover-abort-no-good-slave",ri,"%@"); sentinelAbortFailover(ri); @@ -2852,7 +2851,16 @@ void sentinelFailoverSelectSlave(sentinelRedisInstance *ri) { void sentinelFailoverSendSlaveOfNoOne(sentinelRedisInstance *ri) { int retval; - if (ri->promoted_slave->flags & SRI_DISCONNECTED) return; + /* We can't send the command to the promoted slave if it is now + * disconnected. Retry again and again with this state until the timeout + * is reached, then abort the failover. */ + if (ri->promoted_slave->flags & SRI_DISCONNECTED) { + if (mstime() - ri->failover_state_change_time > ri->failover_timeout) { + sentinelEvent(REDIS_WARNING,"-failover-abort-slave-timeout",ri,"%@"); + sentinelAbortFailover(ri); + } + return; + } /* Send SLAVEOF NO ONE command to turn the slave into a master. * We actually register a generic callback for this command as we don't @@ -2869,16 +2877,11 @@ void sentinelFailoverSendSlaveOfNoOne(sentinelRedisInstance *ri) { /* We actually wait for promotion indirectly checking with INFO when the * slave turns into a master. 
*/ void sentinelFailoverWaitPromotion(sentinelRedisInstance *ri) { - mstime_t elapsed = mstime() - ri->failover_state_change_time; - - if (elapsed >= SENTINEL_PROMOTION_RETRY_PERIOD) { - sentinelEvent(REDIS_WARNING,"-promotion-timeout",ri->promoted_slave, - "%@"); - sentinelEvent(REDIS_WARNING,"+failover-state-select-slave",ri,"%@"); - ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE; - ri->failover_state_change_time = mstime(); - ri->promoted_slave->flags &= ~SRI_PROMOTED; - ri->promoted_slave = NULL; + /* Just handle the timeout. Switching to the next state is handled + * by the function parsing the INFO command of the promoted slave. */ + if (mstime() - ri->failover_state_change_time > ri->failover_timeout) { + sentinelEvent(REDIS_WARNING,"-failover-abort-slave-timeout",ri,"%@"); + sentinelAbortFailover(ri); } } @@ -3002,6 +3005,8 @@ void sentinelFailoverReconfNextSlave(sentinelRedisInstance *master) { } } dictReleaseIterator(di); + + /* Check if all the slaves are reconfigured and handle timeout. */ sentinelFailoverDetectEnd(master); } @@ -3049,50 +3054,46 @@ void sentinelFailoverStateMachine(sentinelRedisInstance *ri) { case SENTINEL_FAILOVER_STATE_RECONF_SLAVES: sentinelFailoverReconfNextSlave(ri); break; - case SENTINEL_FAILOVER_STATE_DETECT_END: - sentinelFailoverDetectEnd(ri); - break; } } -/* Abort a failover in progress with the following steps: - * 1) Set the master back to the original one, increment the config epoch. - * 2) Reconfig slaves to replicate to the old master. - * 3) Reconfig the promoted slave as a slave as well. */ +/* Abort a failover in progress: + * + * This function can only be called before the promoted slave acknowledged + * the slave -> master switch. Otherwise the failover can't be aborted and + * will reach its end. + * + * If there is a promoted slave and we already got acknowledge of the + * slave -> master switch, we clear our flags and redirect to the + * new master. 
Eventually the config will be propagated if it is the one + * with the greater config epoch for this master. + * + * Otherwise if we still did not received the acknowledgement from the + * promoted slave, or there is no promoted slave at all, we just clear the + * failover-in-progress state as there is nothing to do (if the promoted + * slave for some reason actually received our "SLAVEOF NO ONE" command + * even if we did not received the ACK, it will be reverted to slave again + * by one of the Sentinels). */ void sentinelAbortFailover(sentinelRedisInstance *ri) { dictIterator *di; dictEntry *de; - int sentinel_role; redisAssert(ri->flags & SRI_FAILOVER_IN_PROGRESS); + redisAssert(ri->failover_state <= SENTINEL_FAILOVER_STATE_WAIT_PROMOTION); - /* Clear failover related flags from slaves. - * Also if we are the leader make sure to send SLAVEOF commands to all the - * already reconfigured slaves in order to turn them back into slaves of - * the original master. */ + /* Clear failover related flags from slaves. 
*/ di = dictGetIterator(ri->slaves); while((de = dictNext(di)) != NULL) { sentinelRedisInstance *slave = dictGetVal(de); - if (!(slave->flags & SRI_DISCONNECTED) && - (slave->flags & (SRI_PROMOTED|SRI_RECONF_SENT|SRI_RECONF_INPROG| - SRI_RECONF_DONE))) - { - int retval; - - retval = sentinelSendSlaveOf(slave,ri->addr->ip,ri->addr->port); - if (retval == REDIS_OK) - sentinelEvent(REDIS_NOTICE,"-slave-reconf-undo",slave,"%@"); - } slave->flags &= ~(SRI_RECONF_SENT|SRI_RECONF_INPROG|SRI_RECONF_DONE); } dictReleaseIterator(di); - sentinel_role = SENTINEL_LEADER; ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_FORCE_FAILOVER); ri->failover_state = SENTINEL_FAILOVER_STATE_NONE; ri->failover_state_change_time = mstime(); if (ri->promoted_slave) { - sentinelCallClientReconfScript(ri,sentinel_role,"abort", + sentinelCallClientReconfScript(ri,SENTINEL_LEADER,"abort", ri->promoted_slave->addr,ri->addr); ri->promoted_slave->flags &= ~SRI_PROMOTED; ri->promoted_slave = NULL; From 16bc1ae5f4685ddb770fdd5bfdba9a8f5e2fe3ba Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 18 Nov 2013 11:30:08 +0100 Subject: [PATCH 0314/2500] Sentinel: failover restart time is now multiple of failover timeout. Also default failover timeout changed to 3 minutes as the failover is a fairly fast procedure most of the times, unless there are a very big number of slaves and the user picked to configure them sequentially (in that case the user should change the failover timeout accordingly). 
--- src/sentinel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index f6b7c019e..b46ab3064 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -85,7 +85,7 @@ typedef struct sentinelAddr { #define SENTINEL_SLAVE_RECONF_RETRY_PERIOD 10000 #define SENTINEL_DEFAULT_PARALLEL_SYNCS 1 #define SENTINEL_MIN_LINK_RECONNECT_PERIOD 15000 -#define SENTINEL_DEFAULT_FAILOVER_TIMEOUT (60*5*1000) +#define SENTINEL_DEFAULT_FAILOVER_TIMEOUT (60*3*1000) #define SENTINEL_MAX_PENDING_COMMANDS 100 #define SENTINEL_ELECTION_TIMEOUT 10000 @@ -2713,7 +2713,7 @@ int sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) { /* Last failover attempt started too little time ago? */ if (mstime() - master->failover_start_time < - SENTINEL_PUBLISH_PERIOD*4) return 0; + master->failover_timeout*2) return 0; sentinelStartFailover(master); return 1; From 5196131fb99df2008d50e4c4947e1652a8ba5dd3 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 18 Nov 2013 11:37:24 +0100 Subject: [PATCH 0315/2500] Sentinel: slaves reconfig delay modified. The time Sentinel waits since the slave is detected to be configured to the wrong master, before reconfiguring it, is now the failover_timeout time as this makes more sense in order to give the Sentinel performing the failover enough time to reconfigure the slaves slowly (if required by the configuration). Also we now PUBLISH more frequently the new configuration as this allows to switch the reappearing master back to slave faster. 
--- src/sentinel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index b46ab3064..2c4e83438 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -75,7 +75,7 @@ typedef struct sentinelAddr { #define SENTINEL_INFO_PERIOD 10000 #define SENTINEL_PING_PERIOD 1000 #define SENTINEL_ASK_PERIOD 1000 -#define SENTINEL_PUBLISH_PERIOD 5000 +#define SENTINEL_PUBLISH_PERIOD 2000 #define SENTINEL_DOWN_AFTER_PERIOD 30000 #define SENTINEL_HELLO_CHANNEL "__sentinel__:hello" #define SENTINEL_TILT_TRIGGER 2000 @@ -1648,7 +1648,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { (ri->slave_master_port != ri->master->addr->port || strcasecmp(ri->slave_master_host,ri->master->addr->ip))) { - mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4; + mstime_t wait_time = ri->master->failover_timeout; /* Make sure the master is sane before reconfiguring this instance * into a slave. */ From 9cc4330b0683e0afb27f6ade3feaba14978f3cc6 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 18 Nov 2013 11:43:35 +0100 Subject: [PATCH 0316/2500] Sentinel: failover abort function simplified. --- src/sentinel.c | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 2c4e83438..18a7058e3 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1155,7 +1155,7 @@ int sentinelResetMastersByPattern(char *pattern, int flags) { /* Reset the specified master with sentinelResetMaster(), and also change * the ip:port address, but take the name of the instance unmodified. * - * This is used to handle the +switch-master and +redirect-to-master events. + * This is used to handle the +switch-master event. * * The function returns REDIS_ERR if the address can't be resolved for some * reason. Otherwise REDIS_OK is returned. 
*/ @@ -3061,40 +3061,15 @@ void sentinelFailoverStateMachine(sentinelRedisInstance *ri) { * * This function can only be called before the promoted slave acknowledged * the slave -> master switch. Otherwise the failover can't be aborted and - * will reach its end. - * - * If there is a promoted slave and we already got acknowledge of the - * slave -> master switch, we clear our flags and redirect to the - * new master. Eventually the config will be propagated if it is the one - * with the greater config epoch for this master. - * - * Otherwise if we still did not received the acknowledgement from the - * promoted slave, or there is no promoted slave at all, we just clear the - * failover-in-progress state as there is nothing to do (if the promoted - * slave for some reason actually received our "SLAVEOF NO ONE" command - * even if we did not received the ACK, it will be reverted to slave again - * by one of the Sentinels). */ + * will reach its end (possibly by timeout). */ void sentinelAbortFailover(sentinelRedisInstance *ri) { - dictIterator *di; - dictEntry *de; - redisAssert(ri->flags & SRI_FAILOVER_IN_PROGRESS); redisAssert(ri->failover_state <= SENTINEL_FAILOVER_STATE_WAIT_PROMOTION); - /* Clear failover related flags from slaves. 
*/ - di = dictGetIterator(ri->slaves); - while((de = dictNext(di)) != NULL) { - sentinelRedisInstance *slave = dictGetVal(de); - slave->flags &= ~(SRI_RECONF_SENT|SRI_RECONF_INPROG|SRI_RECONF_DONE); - } - dictReleaseIterator(di); - ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_FORCE_FAILOVER); ri->failover_state = SENTINEL_FAILOVER_STATE_NONE; ri->failover_state_change_time = mstime(); if (ri->promoted_slave) { - sentinelCallClientReconfScript(ri,SENTINEL_LEADER,"abort", - ri->promoted_slave->addr,ri->addr); ri->promoted_slave->flags &= ~SRI_PROMOTED; ri->promoted_slave = NULL; } From a6c9d2d79641c9aa0348689123862c80a64c90b5 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 18 Nov 2013 16:02:58 +0100 Subject: [PATCH 0317/2500] Sentinel: added config options useful to take state on config rewrite. We'll use CONFIG REWRITE (internally) in order to store the new configuration of a Sentinel after the internal state changes. In order to do so, we need configuration options (that usually the user will not touch at all) about config epoch of the master, Sentinels and Slaves known for this master, and so forth. 
--- src/sentinel.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/sentinel.c b/src/sentinel.c index 18a7058e3..492614702 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1312,6 +1312,35 @@ char *sentinelHandleConfiguration(char **argv, int argc) { ri = sentinelGetMasterByName(argv[1]); if (!ri) return "No such master with specified name."; ri->auth_pass = sdsnew(argv[2]); + } else if (!strcasecmp(argv[0],"config-epoch") && argc == 3) { + /* config-epoch */ + ri = sentinelGetMasterByName(argv[1]); + if (!ri) return "No such master with specified name."; + ri->config_epoch = strtoull(argv[2],NULL,10); + if (ri->config_epoch > sentinel.current_epoch) + sentinel.current_epoch = ri->config_epoch; + } else if (!strcasecmp(argv[0],"slave") && argc == 3) { + sentinelRedisInstance *slave; + + /* slave */ + ri = sentinelGetMasterByName(argv[1]); + if (!ri) return "No such master with specified name."; + if ((slave = createSentinelRedisInstance(NULL,SRI_SLAVE,argv[2], + atoi(argv[3]), ri->quorum, ri)) == NULL) + { + return "Wrong hostname or port for slave."; + } + } else if (!strcasecmp(argv[0],"sentinel") && argc == 3) { + sentinelRedisInstance *si; + + /* sentinel */ + ri = sentinelGetMasterByName(argv[1]); + if (!ri) return "No such master with specified name."; + if ((si = createSentinelRedisInstance(NULL,SRI_SENTINEL,argv[2], + atoi(argv[3]), ri->quorum, ri)) == NULL) + { + return "Wrong hostname or port for sentinel."; + } } else { return "Unrecognized sentinel configuration statement."; } From 33ccfbb62428e0b7a3824ac8d093486541b2a297 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 18 Nov 2013 18:18:04 +0100 Subject: [PATCH 0318/2500] Fix typo 'configuraiton' in rewriteConfigRewriteLine() comment. 
--- src/config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config.c b/src/config.c index 8bfb208f9..c0e780ebb 100644 --- a/src/config.c +++ b/src/config.c @@ -1258,7 +1258,7 @@ struct rewriteConfigState *rewriteConfigReadOldFile(char *path) { /* Rewrite the specified configuration option with the new "line". * It progressively uses lines of the file that were already used for the same - * configuraiton option in the old version of the file, removing that line from + * configuration option in the old version of the file, removing that line from * the map of options -> line numbers. * * If there are lines associated with a given configuration option and From 2c5afa88c7af8f9c6db9f52e7488ed1e1752904f Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 09:28:47 +0100 Subject: [PATCH 0319/2500] Sentinel: can-failover option removed, many comments fixed. --- src/sentinel.c | 105 ++++++++++++++----------------------------------- 1 file changed, 29 insertions(+), 76 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 492614702..05c066f56 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -58,25 +58,20 @@ typedef struct sentinelAddr { #define SRI_O_DOWN (1<<5) /* Objectively down (quorum reached). */ #define SRI_MASTER_DOWN (1<<6) /* A Sentinel with this flag set thinks that its master is down. */ -/* SRI_CAN_FAILOVER when set in an SRI_MASTER instance means that we are - * allowed to perform the failover for this master. - * When set in a SRI_SENTINEL instance means that sentinel is allowed to - * perform the failover on its master. */ -#define SRI_CAN_FAILOVER (1<<7) -#define SRI_FAILOVER_IN_PROGRESS (1<<8) /* Failover is in progress for +#define SRI_FAILOVER_IN_PROGRESS (1<<7) /* Failover is in progress for this master. */ -#define SRI_PROMOTED (1<<9) /* Slave selected for promotion. */ -#define SRI_RECONF_SENT (1<<10) /* SLAVEOF sent. */ -#define SRI_RECONF_INPROG (1<<11) /* Slave synchronization in progress. 
*/ -#define SRI_RECONF_DONE (1<<12) /* Slave synchronized with new master. */ -#define SRI_FORCE_FAILOVER (1<<13) /* Force failover with master up. */ -#define SRI_SCRIPT_KILL_SENT (1<<14) /* SCRIPT KILL already sent on -BUSY */ +#define SRI_PROMOTED (1<<8) /* Slave selected for promotion. */ +#define SRI_RECONF_SENT (1<<9) /* SLAVEOF sent. */ +#define SRI_RECONF_INPROG (1<<10) /* Slave synchronization in progress. */ +#define SRI_RECONF_DONE (1<<11) /* Slave synchronized with new master. */ +#define SRI_FORCE_FAILOVER (1<<12) /* Force failover with master up. */ +#define SRI_SCRIPT_KILL_SENT (1<<13) /* SCRIPT KILL already sent on -BUSY */ #define SENTINEL_INFO_PERIOD 10000 #define SENTINEL_PING_PERIOD 1000 #define SENTINEL_ASK_PERIOD 1000 #define SENTINEL_PUBLISH_PERIOD 2000 -#define SENTINEL_DOWN_AFTER_PERIOD 30000 +#define SENTINEL_DEFAULT_DOWN_AFTER 30000 #define SENTINEL_HELLO_CHANNEL "__sentinel__:hello" #define SENTINEL_TILT_TRIGGER 2000 #define SENTINEL_TILT_PERIOD (SENTINEL_PING_PERIOD*30) @@ -893,7 +888,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * ri->s_down_since_time = 0; ri->o_down_since_time = 0; ri->down_after_period = master ? 
master->down_after_period : - SENTINEL_DOWN_AFTER_PERIOD; + SENTINEL_DEFAULT_DOWN_AFTER; ri->master_link_down_time = 0; ri->auth_pass = NULL; ri->slave_priority = SENTINEL_DEFAULT_SLAVE_PRIORITY; @@ -1111,7 +1106,7 @@ void sentinelResetMaster(sentinelRedisInstance *ri, int flags) { } if (ri->cc) sentinelKillLink(ri,ri->cc); if (ri->pc) sentinelKillLink(ri,ri->pc); - ri->flags &= SRI_MASTER|SRI_CAN_FAILOVER|SRI_DISCONNECTED; + ri->flags &= SRI_MASTER|SRI_DISCONNECTED; if (ri->leader) { sdsfree(ri->leader); ri->leader = NULL; @@ -1276,17 +1271,6 @@ char *sentinelHandleConfiguration(char **argv, int argc) { ri->failover_timeout = atoi(argv[2]); if (ri->failover_timeout <= 0) return "negative or zero time parameter."; - } else if (!strcasecmp(argv[0],"can-failover") && argc == 3) { - /* can-failover */ - int yesno = yesnotoi(argv[2]); - - ri = sentinelGetMasterByName(argv[1]); - if (!ri) return "No such master with specified name."; - if (yesno == -1) return "Argument must be either yes or no."; - if (yesno) - ri->flags |= SRI_CAN_FAILOVER; - else - ri->flags &= ~SRI_CAN_FAILOVER; } else if (!strcasecmp(argv[0],"parallel-syncs") && argc == 3) { /* parallel-syncs */ ri = sentinelGetMasterByName(argv[1]); @@ -1826,25 +1810,24 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd if (strstr(r->element[2]->str,server.runid) != NULL) return; { - /* Format is composed of 9 tokens: - * 0=ip,1=port,2=runid,3=can_failover,4=current_epoch, - * 5=master_name,6=master_ip,7=master_port,8=master_config_epoch. */ - int numtokens, port, removed, canfailover, master_port; + /* Format is composed of 8 tokens: + * 0=ip,1=port,2=runid,3=current_epoch,4=master_name, + * 5=master_ip,6=master_port,7=master_config_epoch. 
*/ + int numtokens, port, removed, master_port; uint64_t current_epoch, master_config_epoch; char **token = sdssplitlen(r->element[2]->str, r->element[2]->len, ",",1,&numtokens); sentinelRedisInstance *si; - if (numtokens == 9) { + if (numtokens == 8) { /* First, try to see if we already have this sentinel. */ port = atoi(token[1]); - master_port = atoi(token[7]); - canfailover = atoi(token[3]); + master_port = atoi(token[6]); si = getSentinelRedisInstanceByAddrAndRunID( master->sentinels,token[0],port,token[2]); - current_epoch = strtoull(token[4],NULL,10); - master_config_epoch = strtoull(token[8],NULL,10); + current_epoch = strtoull(token[3],NULL,10); + master_config_epoch = strtoull(token[7],NULL,10); sentinelRedisInstance *msgmaster; if (!si) { @@ -1871,7 +1854,7 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd } } - /* Update local current_epoch if received current_epoch is greater. */ + /* Update local current_epoch if received current_epoch is greater.*/ if (current_epoch > sentinel.current_epoch) { sentinel.current_epoch = current_epoch; sentinelEvent(REDIS_WARNING,"+new-epoch",ri,"%llu", @@ -1879,31 +1862,25 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd } /* Update master info if received configuration is newer. 
*/ - if ((msgmaster = sentinelGetMasterByName(token[5])) != NULL) { + if ((msgmaster = sentinelGetMasterByName(token[4])) != NULL) { if (msgmaster->config_epoch < master_config_epoch) { msgmaster->config_epoch = master_config_epoch; if (master_port != msgmaster->addr->port || - !strcmp(msgmaster->addr->ip, token[6])) + !strcmp(msgmaster->addr->ip, token[5])) { sentinelEvent(REDIS_WARNING,"+switch-master", msgmaster,"%s %s %d %s %d", msgmaster->name, msgmaster->addr->ip, msgmaster->addr->port, - token[6], master_port); + token[5], master_port); sentinelResetMasterAndChangeAddress(msgmaster, - token[6], master_port); + token[5], master_port); } } } /* Update the state of the Sentinel. */ - if (si) { - si->last_hello_time = mstime(); - if (canfailover) - si->flags |= SRI_CAN_FAILOVER; - else - si->flags &= ~SRI_CAN_FAILOVER; - } + if (si) si->last_hello_time = mstime(); } sdsfreesplitres(token,numtokens); } @@ -1964,10 +1941,9 @@ void sentinelPingInstance(sentinelRedisInstance *ri) { sentinelAddr *master_addr = sentinelGetCurrentMasterAddress(master); snprintf(payload,sizeof(payload), - "%s,%d,%s,%d,%llu," /* Info about this sentinel. */ + "%s,%d,%s,%llu," /* Info about this sentinel. */ "%s,%s,%d,%lld", /* Info about current master. */ ip, server.port, server.runid, - (master->flags & SRI_CAN_FAILOVER) != 0, (unsigned long long) sentinel.current_epoch, /* --- */ master->name,master_addr->ip,master_addr->port, @@ -2138,10 +2114,6 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) { addReplyBulkLongLong(c,mstime() - ri->last_hello_time); fields++; - addReplyBulkCString(c,"can-failover-its-master"); - addReplyBulkLongLong(c,(ri->flags & SRI_CAN_FAILOVER) != 0); - fields++; - addReplyBulkCString(c,"voted-leader"); addReplyBulkCString(c,ri->leader ? 
ri->leader : "?"); fields++; @@ -2540,25 +2512,6 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master, int f /* =============================== FAILOVER ================================= */ -/* Given a master get the "subjective leader", that is, among all the sentinels - * with given characteristics, the one with the lexicographically smaller - * runid. The characteristics required are: - * - * 1) Has SRI_CAN_FAILOVER flag. - * 2) Is not disconnected. - * 3) Recently answered to our ping (no longer than - * SENTINEL_INFO_VALIDITY_TIME milliseconds ago). - * - * The function returns a pointer to an sds string representing the runid of the - * leader sentinel instance (from our point of view). Otherwise NULL is - * returned if there are no suitable sentinels. - */ - -int compareRunID(const void *a, const void *b) { - char **aptrptr = (char**)a, **bptrptr = (char**)b; - return strcasecmp(*aptrptr, *bptrptr); -} - /* Vote for the sentinel with 'req_runid' or return the old vote if already * voted for the specifed 'req_epoch' or one greater. * @@ -2725,8 +2678,9 @@ void sentinelStartFailover(sentinelRedisInstance *master) { /* This function checks if there are the conditions to start the failover, * that is: * - * 1) Enough time has passed since O_DOWN. - * 2) The master is marked as SRI_CAN_FAILOVER, so we can failover it. + * 1) Master must be in ODOWN condition. + * 2) No failover already in progress. + * 3) No failover already attempted recently. * * We still don't know if we'll win the election so it is possible that we * start the failover but that we'll not be able to act. @@ -2734,8 +2688,7 @@ void sentinelStartFailover(sentinelRedisInstance *master) { * Return non-zero if a failover was started. */ int sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) { /* We can't failover if the master is not in O_DOWN state. 
*/ - if (!(master->flags & SRI_CAN_FAILOVER) || - !(master->flags & SRI_O_DOWN)) return 0; + if (!(master->flags & SRI_O_DOWN)) return 0; /* Failover already in progress? */ if (master->flags & SRI_FAILOVER_IN_PROGRESS) return 0; From 45666c4c22a5da533ba62fe2ef40a93c8f5f2f00 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 09:48:12 +0100 Subject: [PATCH 0320/2500] Sentinel: CONFIG REWRITE support for Sentinel config. --- src/config.c | 5 +++ src/redis.h | 2 + src/sentinel.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 112 insertions(+), 4 deletions(-) diff --git a/src/config.c b/src/config.c index 8bfb208f9..d39546e13 100644 --- a/src/config.c +++ b/src/config.c @@ -1162,6 +1162,10 @@ int dictSdsKeyCompare(void *privdata, const void *key1, const void *key2); void dictSdsDestructor(void *privdata, void *val); void dictListDestructor(void *privdata, void *val); +/* Sentinel config rewriting is implemented inside sentinel.c by + * rewriteConfigSentinelOption(). 
*/ +void rewriteConfigSentinelOption(struct rewriteConfigState *state); + dictType optionToLineDictType = { dictSdsHash, /* hash function */ NULL, /* key dup */ @@ -1735,6 +1739,7 @@ int rewriteConfig(char *path) { rewriteConfigClientoutputbufferlimitOption(state); rewriteConfigNumericalOption(state,"hz",server.hz,REDIS_DEFAULT_HZ); rewriteConfigYesNoOption(state,"aof-rewrite-incremental-fsync",server.aof_rewrite_incremental_fsync,REDIS_DEFAULT_AOF_REWRITE_INCREMENTAL_FSYNC); + if (server.sentinel_mode) rewriteConfigSentinelOption(state); /* Step 3: remove all the orphaned lines in the old file, that is, lines * that were used by a config option and are no longer used, like in case diff --git a/src/redis.h b/src/redis.h index f0b5aa862..2361e03d6 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1169,6 +1169,8 @@ sds keyspaceEventsFlagsToString(int flags); void loadServerConfig(char *filename, char *options); void appendServerSaveParams(time_t seconds, int changes); void resetServerSaveParams(); +struct rewriteConfigState; /* Forward declaration to export API. 
*/ +void rewriteConfigRewriteLine(struct rewriteConfigState *state, char *option, sds line, int force); /* db.c -- Keyspace access API */ int removeExpire(redisDb *db, robj *key); diff --git a/src/sentinel.c b/src/sentinel.c index 05c066f56..31005c9d1 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1303,10 +1303,10 @@ char *sentinelHandleConfiguration(char **argv, int argc) { ri->config_epoch = strtoull(argv[2],NULL,10); if (ri->config_epoch > sentinel.current_epoch) sentinel.current_epoch = ri->config_epoch; - } else if (!strcasecmp(argv[0],"slave") && argc == 3) { + } else if (!strcasecmp(argv[0],"known-slave") && argc == 3) { sentinelRedisInstance *slave; - /* slave */ + /* known-slave */ ri = sentinelGetMasterByName(argv[1]); if (!ri) return "No such master with specified name."; if ((slave = createSentinelRedisInstance(NULL,SRI_SLAVE,argv[2], @@ -1314,10 +1314,10 @@ char *sentinelHandleConfiguration(char **argv, int argc) { { return "Wrong hostname or port for slave."; } - } else if (!strcasecmp(argv[0],"sentinel") && argc == 3) { + } else if (!strcasecmp(argv[0],"known-sentinel") && argc == 3) { sentinelRedisInstance *si; - /* sentinel */ + /* known-sentinel */ ri = sentinelGetMasterByName(argv[1]); if (!ri) return "No such master with specified name."; if ((si = createSentinelRedisInstance(NULL,SRI_SENTINEL,argv[2], @@ -1331,6 +1331,107 @@ char *sentinelHandleConfiguration(char **argv, int argc) { return NULL; } +/* Implements CONFIG REWRITE for "sentinel" option. + * This is used not just to rewrite the configuration given by the user + * (the configured masters) but also in order to retain the state of + * Sentinel across restarts: config epoch of masters, associated slaves + * and sentinel instances, and so forth. */ +void rewriteConfigSentinelOption(struct rewriteConfigState *state) { + dictIterator *di, *di2; + dictEntry *de; + + /* For every master emit a "sentinel monitor" config entry. 
*/ + di = dictGetIterator(sentinel.masters); + while((de = dictNext(di)) != NULL) { + sentinelRedisInstance *master, *ri; + sds line; + + /* sentinel monitor */ + master = dictGetVal(de); + line = sdscatprintf(sdsempty(),"sentinel monitor %s %s %d %d", + master->name, master->addr->ip, master->addr->port, + master->quorum); + rewriteConfigRewriteLine(state,"sentinel",line,1); + + /* sentinel down-after-milliseconds */ + if (master->down_after_period != SENTINEL_DEFAULT_DOWN_AFTER) { + line = sdscatprintf(sdsempty(), + "sentinel down-after-milliseconds %s %ld", + master->name, (long) master->down_after_period); + rewriteConfigRewriteLine(state,"sentinel",line,1); + } + + /* sentinel failover-timeout */ + if (master->failover_timeout != SENTINEL_DEFAULT_FAILOVER_TIMEOUT) { + line = sdscatprintf(sdsempty(), + "sentinel failover-timeout %s %ld", + master->name, (long) master->failover_timeout); + rewriteConfigRewriteLine(state,"sentinel",line,1); + } + + /* sentinel parallel-syncs */ + if (master->parallel_syncs != SENTINEL_DEFAULT_PARALLEL_SYNCS) { + line = sdscatprintf(sdsempty(), + "sentinel parallel-syncs %s %d", + master->name, master->parallel_syncs); + rewriteConfigRewriteLine(state,"sentinel",line,1); + } + + /* sentinel notification-script */ + if (master->notification_script) { + line = sdscatprintf(sdsempty(), + "sentinel notification-script %s %s", + master->name, master->notification_script); + rewriteConfigRewriteLine(state,"sentinel",line,1); + } + + /* sentinel client-reconfig-script */ + if (master->client_reconfig_script) { + line = sdscatprintf(sdsempty(), + "sentinel client-reconfig-script %s %s", + master->name, master->client_reconfig_script); + rewriteConfigRewriteLine(state,"sentinel",line,1); + } + + /* sentinel auth-pass */ + if (master->auth_pass) { + line = sdscatprintf(sdsempty(), + "sentinel auth-pass %s %s", + master->name, master->auth_pass); + rewriteConfigRewriteLine(state,"sentinel",line,1); + } + + /* sentinel config-epoch */ + line 
= sdscatprintf(sdsempty(), + "sentinel config-epoch %s %llu", + master->name, (unsigned long long) master->config_epoch); + rewriteConfigRewriteLine(state,"sentinel",line,1); + + /* sentinel known-slave */ + di2 = dictGetIterator(master->slaves); + while((de = dictNext(di)) != NULL) { + ri = dictGetVal(de); + line = sdscatprintf(sdsempty(), + "sentinel known-slave %s %s %d", + master->name, ri->addr->ip, ri->addr->port); + rewriteConfigRewriteLine(state,"sentinel",line,1); + } + dictReleaseIterator(di2); + + /* sentinel known-sentinel */ + di2 = dictGetIterator(master->sentinels); + while((de = dictNext(di)) != NULL) { + ri = dictGetVal(de); + line = sdscatprintf(sdsempty(), + "sentinel known-sentinel %s %s %d", + master->name, ri->addr->ip, ri->addr->port); + rewriteConfigRewriteLine(state,"sentinel",line,1); + } + dictReleaseIterator(di2); + } + dictReleaseIterator(di); +} + /* ====================== hiredis connection handling ======================= */ /* Completely disconnect an hiredis link from an instance. */ From d345a59943928972c4aee8f2b73da6f5e08384d5 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 10:13:04 +0100 Subject: [PATCH 0321/2500] Sentinel: sentinelFlushConfig() to CONFIG REWRITE + fsync. --- src/redis.h | 1 + src/sentinel.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/src/redis.h b/src/redis.h index 2361e03d6..bdc695228 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1171,6 +1171,7 @@ void appendServerSaveParams(time_t seconds, int changes); void resetServerSaveParams(); struct rewriteConfigState; /* Forward declaration to export API. 
*/ void rewriteConfigRewriteLine(struct rewriteConfigState *state, char *option, sds line, int force); +int rewriteConfig(char *path); /* db.c -- Keyspace access API */ int removeExpire(redisDb *db, robj *key); diff --git a/src/sentinel.c b/src/sentinel.c index 31005c9d1..bd951d8c0 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -36,6 +36,7 @@ #include #include #include +#include extern char **environ; @@ -1432,6 +1433,27 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) { dictReleaseIterator(di); } +/* This function uses the config rewriting Redis engine in order to persist + * the state of the Sentinel in the current configuration file. + * + * Before returning the function calls fsync() against the generated + * configuration file to make sure changes are committed to disk. + * + * On failure the function logs a warning on the Redis log. */ +void sentinelFlushConfig(void) { + int fd; + + if (rewriteConfig(server.configfile) == -1) { + redisLog(REDIS_WARNING,"WARNING: Senitnel was not able to save the new configuration on disk!!!: %s", strerror(errno)); + return; + } + if ((fd = open(server.configfile,O_RDONLY)) != -1) { + fsync(fd); + close(fd); + } + return; +} + /* ====================== hiredis connection handling ======================= */ /* Completely disconnect an hiredis link from an instance. */ From 88b2f6525e55d1eb078b60f09a5517b7c0ee1cf1 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 10:55:43 +0100 Subject: [PATCH 0322/2500] Sentinel: call sentinelFlushConfig() to persist state when needed. Also the sentinel configuration rewriting was modified in order to account for failover in progress, where we need to provide the promoted slave address as master address, and the old master address as one of the slaves address. 
--- src/sentinel.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index bd951d8c0..99a5bb503 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -334,6 +334,7 @@ void sentinelStartFailover(sentinelRedisInstance *master); void sentinelDiscardReplyCallback(redisAsyncContext *c, void *reply, void *privdata); int sentinelSendSlaveOf(sentinelRedisInstance *ri, char *host, int port); char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch); +void sentinelFlushConfig(void); /* ========================= Dictionary types =============================== */ @@ -1201,13 +1202,17 @@ int sentinelResetMasterAndChangeAddress(sentinelRedisInstance *master, char *ip, slave = createSentinelRedisInstance(NULL,SRI_SLAVE,slaves[j]->ip, slaves[j]->port, master->quorum, master); releaseSentinelAddr(slaves[j]); - if (slave) sentinelEvent(REDIS_NOTICE,"+slave",slave,"%@"); + if (slave) { + sentinelEvent(REDIS_NOTICE,"+slave",slave,"%@"); + sentinelFlushConfig(); + } } zfree(slaves); /* Release the old address at the end so we are safe even if the function * gets the master->addr->ip and master->addr->port as arguments. 
*/ releaseSentinelAddr(oldaddr); + sentinelFlushConfig(); return REDIS_OK; } @@ -1345,12 +1350,14 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) { di = dictGetIterator(sentinel.masters); while((de = dictNext(di)) != NULL) { sentinelRedisInstance *master, *ri; + sentinelAddr *master_addr; sds line; /* sentinel monitor */ master = dictGetVal(de); + master_addr = sentinelGetCurrentMasterAddress(master); line = sdscatprintf(sdsempty(),"sentinel monitor %s %s %d %d", - master->name, master->addr->ip, master->addr->port, + master->name, master_addr->ip, master_addr->port, master->quorum); rewriteConfigRewriteLine(state,"sentinel",line,1); @@ -1411,7 +1418,18 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) { /* sentinel known-slave */ di2 = dictGetIterator(master->slaves); while((de = dictNext(di)) != NULL) { + sentinelAddr *slave_addr; + ri = dictGetVal(de); + slave_addr = ri->addr; + + /* If master_addr (obtained using sentinelGetCurrentMasterAddress() + * so it may be the address of the promoted slave) is equal to this + * slave's address, a failover is in progress and the slave was + * already successfully promoted. So as the address of this slave + * we use the old master address instead. 
*/ + if (sentinelAddrIsEqual(slave_addr,master_addr)) + slave_addr = master->addr; line = sdscatprintf(sdsempty(), "sentinel known-slave %s %s %d", master->name, ri->addr->ip, ri->addr->port); @@ -1754,6 +1772,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { ri->master->config_epoch = ri->master->failover_epoch; ri->master->failover_state = SENTINEL_FAILOVER_STATE_RECONF_SLAVES; ri->master->failover_state_change_time = mstime(); + sentinelFlushConfig(); sentinelEvent(REDIS_WARNING,"+promoted-slave",ri,"%@"); sentinelEvent(REDIS_WARNING,"+failover-state-reconf-slaves", ri->master,"%@"); @@ -1974,6 +1993,7 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd * for Sentinels we don't have a later chance to fill it, * so do it now. */ si->runid = sdsnew(token[2]); + sentinelFlushConfig(); } } From 8b7b01058048d65556fc14453ed95d442a462594 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 10:59:47 +0100 Subject: [PATCH 0323/2500] Sentinel: rewriteConfigSentinelOption() sub-iterators var typo fixed. 
--- src/sentinel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 99a5bb503..340dd6465 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1417,7 +1417,7 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) { /* sentinel known-slave */ di2 = dictGetIterator(master->slaves); - while((de = dictNext(di)) != NULL) { + while((de = dictNext(di2)) != NULL) { sentinelAddr *slave_addr; ri = dictGetVal(de); @@ -1439,7 +1439,7 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) { /* sentinel known-sentinel */ di2 = dictGetIterator(master->sentinels); - while((de = dictNext(di)) != NULL) { + while((de = dictNext(di2)) != NULL) { ri = dictGetVal(de); line = sdscatprintf(sdsempty(), "sentinel known-sentinel %s %s %d", From 8ca008692f51f416002dc6a56b64c11f43c75021 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 11:03:47 +0100 Subject: [PATCH 0324/2500] Sentinel: arity of known-sentinel/slave is 4 not 3. 
--- src/sentinel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 340dd6465..dd2b681f0 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1309,7 +1309,7 @@ char *sentinelHandleConfiguration(char **argv, int argc) { ri->config_epoch = strtoull(argv[2],NULL,10); if (ri->config_epoch > sentinel.current_epoch) sentinel.current_epoch = ri->config_epoch; - } else if (!strcasecmp(argv[0],"known-slave") && argc == 3) { + } else if (!strcasecmp(argv[0],"known-slave") && argc == 4) { sentinelRedisInstance *slave; /* known-slave */ @@ -1320,7 +1320,7 @@ char *sentinelHandleConfiguration(char **argv, int argc) { { return "Wrong hostname or port for slave."; } - } else if (!strcasecmp(argv[0],"known-sentinel") && argc == 3) { + } else if (!strcasecmp(argv[0],"known-sentinel") && argc == 4) { sentinelRedisInstance *si; /* known-sentinel */ From 934e4d103f181c60ffb6173beaae6d0e4fbc3c64 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 11:11:43 +0100 Subject: [PATCH 0325/2500] Sentinel: when writing config on disk, remember sentinels runid. 
--- src/sentinel.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index dd2b681f0..e383b58f7 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1320,10 +1320,11 @@ char *sentinelHandleConfiguration(char **argv, int argc) { { return "Wrong hostname or port for slave."; } - } else if (!strcasecmp(argv[0],"known-sentinel") && argc == 4) { + } else if (!strcasecmp(argv[0],"known-sentinel") && + (argc == 4 || argc == 5)) { sentinelRedisInstance *si; - /* known-sentinel */ + /* known-sentinel [runid] */ ri = sentinelGetMasterByName(argv[1]); if (!ri) return "No such master with specified name."; if ((si = createSentinelRedisInstance(NULL,SRI_SENTINEL,argv[2], @@ -1331,6 +1332,7 @@ char *sentinelHandleConfiguration(char **argv, int argc) { { return "Wrong hostname or port for sentinel."; } + if (argc == 5) si->runid = sdsnew(argv[4]); } else { return "Unrecognized sentinel configuration statement."; } @@ -1442,8 +1444,10 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) { while((de = dictNext(di2)) != NULL) { ri = dictGetVal(de); line = sdscatprintf(sdsempty(), - "sentinel known-sentinel %s %s %d", - master->name, ri->addr->ip, ri->addr->port); + "sentinel known-sentinel %s %s %d%s%s", + master->name, ri->addr->ip, ri->addr->port, + ri->runid ? " " : "", + ri->runid ? ri->runid : ""); rewriteConfigRewriteLine(state,"sentinel",line,1); } dictReleaseIterator(di2); From 02b42dc7c7133affd9a76a08aa7e55fe013c8df3 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 11:24:36 +0100 Subject: [PATCH 0326/2500] Sentinel: no longer used defines removed. 
--- src/sentinel.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index e383b58f7..7319e731e 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -77,7 +77,6 @@ typedef struct sentinelAddr { #define SENTINEL_TILT_TRIGGER 2000 #define SENTINEL_TILT_PERIOD (SENTINEL_PING_PERIOD*30) #define SENTINEL_DEFAULT_SLAVE_PRIORITY 100 -#define SENTINEL_PROMOTION_RETRY_PERIOD 30000 #define SENTINEL_SLAVE_RECONF_RETRY_PERIOD 10000 #define SENTINEL_DEFAULT_PARALLEL_SYNCS 1 #define SENTINEL_MIN_LINK_RECONNECT_PERIOD 15000 @@ -88,8 +87,6 @@ typedef struct sentinelAddr { /* How many milliseconds is an information valid? This applies for instance * to the reply to SENTINEL IS-MASTER-DOWN-BY-ADDR replies. */ #define SENTINEL_INFO_VALIDITY_TIME 5000 -#define SENTINEL_FAILOVER_FIXED_DELAY 5000 -#define SENTINEL_FAILOVER_MAX_RANDOM_DELAY 10000 /* Failover machine different states. */ #define SENTINEL_FAILOVER_STATE_NONE 0 /* No failover in progress. */ @@ -98,10 +95,7 @@ typedef struct sentinelAddr { #define SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE 3 /* Slave -> Master */ #define SENTINEL_FAILOVER_STATE_WAIT_PROMOTION 4 /* Wait slave to change role */ #define SENTINEL_FAILOVER_STATE_RECONF_SLAVES 5 /* SLAVEOF newmaster */ -#define SENTINEL_FAILOVER_STATE_WAIT_NEXT_SLAVE 6 /* wait replication */ -#define SENTINEL_FAILOVER_STATE_ALERT_CLIENTS 7 /* Run user script. */ -#define SENTINEL_FAILOVER_STATE_WAIT_ALERT_SCRIPT 8 /* Wait script exec. */ -#define SENTINEL_FAILOVER_STATE_UPDATE_CONFIG 9 /* Monitor promoted slave. */ +#define SENTINEL_FAILOVER_STATE_UPDATE_CONFIG 6 /* Monitor promoted slave. 
*/ #define SENTINEL_MASTER_LINK_STATUS_UP 0 #define SENTINEL_MASTER_LINK_STATUS_DOWN 1 @@ -2114,7 +2108,6 @@ const char *sentinelFailoverStateStr(int state) { case SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE: return "send_slaveof_noone"; case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION: return "wait_promotion"; case SENTINEL_FAILOVER_STATE_RECONF_SLAVES: return "reconf_slaves"; - case SENTINEL_FAILOVER_STATE_ALERT_CLIENTS: return "alert_clients"; case SENTINEL_FAILOVER_STATE_UPDATE_CONFIG: return "update_config"; default: return "unknown"; } From 101f583689f66876d861d79759a00dc4c78a2756 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 12:34:39 +0100 Subject: [PATCH 0327/2500] Sentinel: failover script execution fixed. --- src/sentinel.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 7319e731e..4b40e7a20 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -437,6 +437,16 @@ sentinelAddr *createSentinelAddr(char *hostname, int port) { return sa; } +/* Return a duplicate of the source address. */ +sentinelAddr *dupSentinelAddr(sentinelAddr *src) { + sentinelAddr *sa; + + sa = zmalloc(sizeof(*sa)); + sa->ip = sdsnew(src->ip); + sa->port = src->port; + return sa; +} + /* Free a Sentinel address. Can't fail. */ void releaseSentinelAddr(sentinelAddr *sa) { sdsfree(sa->ip); @@ -783,15 +793,13 @@ void sentinelPendingScriptsCommand(redisClient *c) { * * * - * It is called every time a failover starts, ends, or is aborted. + * It is called every time a failover is performed. * - * is "start", "end" or "abort". + * is currently always "failover". * is either "leader" or "observer". * * from/to fields are respectively master -> promoted slave addresses for - * "start" and "end", or the reverse (promoted slave -> master) in case of - * "abort". - */ + * "start" and "end". 
*/ void sentinelCallClientReconfScript(sentinelRedisInstance *master, int role, char *state, sentinelAddr *from, sentinelAddr *to) { char fromport[32], toport[32]; @@ -2009,13 +2017,21 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd if (master_port != msgmaster->addr->port || !strcmp(msgmaster->addr->ip, token[5])) { + sentinelAddr *old_addr; + sentinelEvent(REDIS_WARNING,"+switch-master", msgmaster,"%s %s %d %s %d", msgmaster->name, msgmaster->addr->ip, msgmaster->addr->port, token[5], master_port); + + old_addr = dupSentinelAddr(msgmaster->addr); sentinelResetMasterAndChangeAddress(msgmaster, token[5], master_port); + sentinelCallClientReconfScript(msgmaster, + SENTINEL_OBSERVER,"start", + old_addr,msgmaster->addr); + releaseSentinelAddr(old_addr); } } } @@ -3038,13 +3054,9 @@ void sentinelFailoverDetectEnd(sentinelRedisInstance *master) { } if (not_reconfigured == 0) { - int role = SENTINEL_LEADER; - sentinelEvent(REDIS_WARNING,"+failover-end",master,"%@"); master->failover_state = SENTINEL_FAILOVER_STATE_UPDATE_CONFIG; master->failover_state_change_time = mstime(); - sentinelCallClientReconfScript(master,role,"end",master->addr, - master->promoted_slave->addr); } /* If I'm the leader it is a good idea to send a best effort SLAVEOF From 9bae762af3421510937d649edbbc61ff0665c1af Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 16:20:42 +0100 Subject: [PATCH 0328/2500] Sentinel: various fixes to leader election implementation. --- src/sentinel.c | 70 +++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 4b40e7a20..7abd1cfbe 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2672,7 +2672,7 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master, int f * voted for the specifed 'req_epoch' or one greater. 
* * If a vote is not available returns NULL, otherwise return the Sentinel - * runid and populate the leader_epoch with the epoch of the last vote. */ + * runid and populate the leader_epoch with the epoch of the vote. */ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch) { if (req_epoch > sentinel.current_epoch) { sentinel.current_epoch = req_epoch; @@ -2680,7 +2680,8 @@ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char (unsigned long long) sentinel.current_epoch); } - if (master->leader_epoch < req_epoch && sentinel.current_epoch <= req_epoch) { + if (master->leader_epoch < req_epoch && sentinel.current_epoch <= req_epoch) + { sdsfree(master->leader); master->leader = sdsnew(req_runid); master->leader_epoch = sentinel.current_epoch; @@ -2692,7 +2693,8 @@ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char * * The random addition is useful to desynchronize a bit the slaves * and reduce the chance that no slave gets majority. */ - master->failover_start_time = mstime() + rand() % 2000; + if (strcasecmp(master->leader,server.runid)) + master->failover_start_time = mstime() + rand() % 2000; } *leader_epoch = master->leader_epoch; @@ -2706,17 +2708,19 @@ struct sentinelLeader { /* Helper function for sentinelGetLeader, increment the counter * relative to the specified runid. 
*/ -void sentinelLeaderIncr(dict *counters, char *runid) { +int sentinelLeaderIncr(dict *counters, char *runid) { dictEntry *de = dictFind(counters,runid); uint64_t oldval; if (de) { oldval = dictGetUnsignedIntegerVal(de); dictSetUnsignedIntegerVal(de,oldval+1); + return oldval+1; } else { de = dictAddRaw(counters,runid); redisAssert(de != NULL); dictSetUnsignedIntegerVal(de,1); + return 1; } } @@ -2734,49 +2738,57 @@ char *sentinelGetLeader(sentinelRedisInstance *master, uint64_t epoch) { char *myvote; char *winner = NULL; uint64_t leader_epoch; + uint64_t max_votes = 0; redisAssert(master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS)); counters = dictCreate(&leaderVotesDictType,NULL); - /* Count my vote (and vote for myself if I still did not voted for - * the currnet epoch). */ - myvote = sentinelVoteLeader(master,epoch,server.runid,&leader_epoch); - if (myvote && leader_epoch == epoch) { - sentinelLeaderIncr(counters,myvote); - voters++; - } - /* Count other sentinels votes */ di = dictGetIterator(master->sentinels); while((de = dictNext(di)) != NULL) { sentinelRedisInstance *ri = dictGetVal(de); - if (ri->leader == NULL || ri->leader_epoch != sentinel.current_epoch) - continue; - sentinelLeaderIncr(counters,ri->leader); + if (ri->leader != NULL && ri->leader_epoch == sentinel.current_epoch) + sentinelLeaderIncr(counters,ri->leader); voters++; } dictReleaseIterator(di); - voters_quorum = voters/2+1; /* Check what's the winner. For the winner to win, it needs two conditions: * 1) Absolute majority between voters (50% + 1). * 2) And anyway at least master->quorum votes. */ - { - uint64_t max_votes = 0; /* Max votes so far. 
*/ + di = dictGetIterator(counters); + while((de = dictNext(di)) != NULL) { + uint64_t votes = dictGetUnsignedIntegerVal(de); - di = dictGetIterator(counters); - while((de = dictNext(di)) != NULL) { - uint64_t votes = dictGetUnsignedIntegerVal(de); - - if (max_votes < votes) { - max_votes = votes; - winner = dictGetKey(de); - } + if (votes > max_votes) { + max_votes = votes; + winner = dictGetKey(de); } - dictReleaseIterator(di); - if (winner && (max_votes < voters_quorum || max_votes < master->quorum)) - winner = NULL; } + dictReleaseIterator(di); + + /* Count this Sentinel vote: + * if this Sentinel did not voted yet, either vote for the most + * common voted sentinel, or for itself if no vote exists at all. */ + if (winner) + myvote = sentinelVoteLeader(master,epoch,winner,&leader_epoch); + else + myvote = sentinelVoteLeader(master,epoch,server.runid,&leader_epoch); + + if (myvote && leader_epoch == epoch) { + uint64_t votes = sentinelLeaderIncr(counters,myvote); + + if (votes > max_votes) { + max_votes = votes; + winner = myvote; + } + } + voters++; /* Anyway, count me as one of the voters. */ + + voters_quorum = voters/2+1; + if (winner && (max_votes < voters_quorum || max_votes < master->quorum)) + winner = NULL; + winner = winner ? sdsnew(winner) : NULL; sdsfree(myvote); dictRelease(counters); From 5d77fe69c72b8ce4f651d9ed1a8bd3b0e6edeb0f Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 16:50:04 +0100 Subject: [PATCH 0329/2500] Sentinel: distinguish between is-master-down-by-addr requests. Some are just to know if the master is down, and in this case the runid in the request is set to "*", others are actually in order to seek for a vote and get elected. In the latter case the runid is set to the runid of the instance seeking for the vote. 
--- src/redis.c | 2 ++ src/sentinel.c | 20 +++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/redis.c b/src/redis.c index 7932824e7..af24505b6 100644 --- a/src/redis.c +++ b/src/redis.c @@ -3108,6 +3108,8 @@ int main(int argc, char **argv) { redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port); if (server.sofd > 0) redisLog(REDIS_NOTICE,"The server is now ready to accept connections at %s", server.unixsocket); + } else { + redisLog(REDIS_WARNING,"Sentinel runid is %s", server.runid); } /* Warning the user about suspicious maxmemory setting. */ diff --git a/src/sentinel.c b/src/sentinel.c index 7abd1cfbe..972f8921c 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2359,8 +2359,9 @@ void sentinelCommand(redisClient *c) { (ri->flags & SRI_MASTER)) isdown = 1; - /* Vote for the master (or fetch the previous vote) */ - if (ri && ri->flags & SRI_MASTER) { + /* Vote for the master (or fetch the previous vote) if the request + * includes a runid, otherwise the sender is not seeking for a vote. */ + if (ri && ri->flags & SRI_MASTER && strcasecmp(c->argv[5]->ptr,"*")) { leader = sentinelVoteLeader(ri,(uint64_t)req_epoch, c->argv[5]->ptr, &leader_epoch); @@ -2370,7 +2371,7 @@ void sentinelCommand(redisClient *c) { * down state, leader, vote epoch. */ addReplyMultiBulkLen(c,3); addReply(c, isdown ? shared.cone : shared.czero); - addReplyBulkCString(c, leader ? leader : "?"); + addReplyBulkCString(c, leader ? 
leader : "*"); addReplyLongLong(c, (long long)leader_epoch); if (leader) sdsfree(leader); } else if (!strcasecmp(c->argv[1]->ptr,"reset")) { @@ -2605,9 +2606,13 @@ void sentinelReceiveIsMasterDownReply(redisAsyncContext *c, void *reply, void *p } else { ri->flags &= ~SRI_MASTER_DOWN; } - sdsfree(ri->leader); - ri->leader = sdsnew(r->element[1]->str); - ri->leader_epoch = r->element[2]->integer; + if (strcmp(r->element[1]->str,"*")) { + /* If the runid in the reply is not "*" the Sentinel actually + * replied with a vote. */ + sdsfree(ri->leader); + ri->leader = sdsnew(r->element[1]->str); + ri->leader_epoch = r->element[2]->integer; + } } } @@ -2660,7 +2665,8 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master, int f "SENTINEL is-master-down-by-addr %s %s %llu %s", master->addr->ip, port, sentinel.current_epoch, - server.runid); + (master->failover_state > SENTINEL_FAILOVER_STATE_NONE) ? + server.runid : "*"); if (retval == REDIS_OK) ri->pending_commands++; } dictReleaseIterator(di); From 19f625ed5c6aa020a19c52eb7c6635cbd4aafbe2 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 17:58:11 +0100 Subject: [PATCH 0330/2500] CONFIG REWRITE: don't add the signature if it already exists. At the end of the file, CONFIG REWRITE adds a comment line that: # Generated by CONFIG REWRITE Followed by the additional config options required. However this was added again and again at every rewrite in particular conditions (when a given set of options change in a given time during the time). Now if it was already encountered, it is not added a second time. This is especially important for Sentinel that rewrites the config at every state change.
--- src/config.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/config.c b/src/config.c index d39546e13..4f7ddda98 100644 --- a/src/config.c +++ b/src/config.c @@ -1154,6 +1154,8 @@ void configGetCommand(redisClient *c) { * */ +#define REDIS_CONFIG_REWRITE_SIGNATURE "# Generated by CONFIG REWRITE" + /* We use the following dictionary type to store where a configuration * option is mentioned in the old configuration file, so it's * like "maxmemory" -> list of line numbers (first line is zero). */ @@ -1230,6 +1232,8 @@ struct rewriteConfigState *rewriteConfigReadOldFile(char *path) { /* Handle comments and empty lines. */ if (line[0] == '#' || line[0] == '\0') { + if (!state->has_tail && !strcmp(line,REDIS_CONFIG_REWRITE_SIGNATURE)) + state->has_tail = 1; rewriteConfigAppendLine(state,line); continue; } @@ -1301,7 +1305,7 @@ void rewriteConfigRewriteLine(struct rewriteConfigState *state, char *option, sd /* Append a new line. */ if (!state->has_tail) { rewriteConfigAppendLine(state, - sdsnew("# Generated by CONFIG REWRITE")); + sdsnew(REDIS_CONFIG_REWRITE_SIGNATURE)); state->has_tail = 1; } rewriteConfigAppendLine(state,line); From 3ea52291d9c9d4fce2829847907c20ed87791d6a Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 20 Nov 2013 15:52:44 +0100 Subject: [PATCH 0331/2500] Sentinel: take the replication offset in slaves state. --- src/sentinel.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index 972f8921c..1fef26df4 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -165,10 +165,11 @@ typedef struct sentinelRedisInstance { mstime_t master_link_down_time; /* Slave replication link down time. */ int slave_priority; /* Slave priority according to its INFO output. */ mstime_t slave_reconf_sent_time; /* Time at which we sent SLAVE OF */ - struct sentinelRedisInstance *master; /* Master instance if SRI_SLAVE is set. 
*/ + struct sentinelRedisInstance *master; /* Master instance if it's slave. */ char *slave_master_host; /* Master host as reported by INFO */ int slave_master_port; /* Master port as reported by INFO */ int slave_master_link_status; /* Master link status as reported by INFO */ + unsigned long long slave_repl_offset; /* Slave replication offset. */ /* Failover */ char *leader; /* If this is a master instance, this is the runid of the Sentinel that should perform the failover. If @@ -900,6 +901,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * ri->slave_master_host = NULL; ri->slave_master_port = 0; ri->slave_master_link_status = SENTINEL_MASTER_LINK_STATUS_DOWN; + ri->slave_repl_offset = 0; ri->sentinels = dictCreate(&instancesDictType,NULL); ri->quorum = quorum; ri->parallel_syncs = SENTINEL_DEFAULT_PARALLEL_SYNCS; @@ -1738,6 +1740,10 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { /* slave_priority: */ if (sdslen(l) >= 15 && !memcmp(l,"slave_priority:",15)) ri->slave_priority = atoi(l+15); + + /* slave_repl_offset: */ + if (sdslen(l) >= 18 && !memcmp(l,"slave_repl_offset:",18)) + ri->slave_repl_offset = strtoull(l+18,NULL,10); } } ri->info_refresh = mstime(); @@ -2262,6 +2268,10 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) { addReplyBulkCString(c,"slave-priority"); addReplyBulkLongLong(c,ri->slave_priority); fields++; + + addReplyBulkCString(c,"slave-repl-offset"); + addReplyBulkLongLong(c,ri->slave_repl_offset); + fields++; } /* Only sentinels */ From 3f92ee09ae37cd67d36f7e42f545ba165fa8b27f Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 20 Nov 2013 16:05:36 +0100 Subject: [PATCH 0332/2500] Sentinel: select slave with best (greater) replication offset. 
--- src/sentinel.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 1fef26df4..516141902 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2903,6 +2903,9 @@ int sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) { * NULL if no suitable slave was found. */ +/* Helper for sentinelSelectSlave(). This is used by qsort() in order to + * sort suitable slaves in a "better first" order, to take the first of + * the list. */ int compareSlavesForPromotion(const void *a, const void *b) { sentinelRedisInstance **sa = (sentinelRedisInstance **)a, **sb = (sentinelRedisInstance **)b; @@ -2911,8 +2914,16 @@ int compareSlavesForPromotion(const void *a, const void *b) { if ((*sa)->slave_priority != (*sb)->slave_priority) return (*sa)->slave_priority - (*sb)->slave_priority; - /* If priority is the same, select the slave with that has the - * lexicographically smaller runid. Note that we try to handle runid + /* If priority is the same, select the slave with greater replication + * offset (processed more data frmo the master). */ + if ((*sa)->slave_repl_offset > (*sb)->slave_repl_offset) { + return -1; /* a < b */ + } else if ((*sa)->slave_repl_offset < (*sb)->slave_repl_offset) { + return 1; /* b > a */ + } + + /* If the replication offset is the same select the slave with that has + * the lexicographically smaller runid. Note that we try to handle runid * == NULL as there are old Redis versions that don't publish runid in * INFO. A NULL runid is considered bigger than any other runid. */ sa_runid = (*sa)->runid; From 221d4d48f4ac3013fffccb21dd943db5eea2f84b Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 21 Nov 2013 11:31:06 +0100 Subject: [PATCH 0333/2500] Sentinel: Hello message sending code refactored. 
--- src/sentinel.c | 62 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 516141902..9ab91d0b8 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2049,6 +2049,46 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd } } +/* Send an "Hello" message via Pub/Sub to the specified 'ri' Redis + * instance in order to broadcast the current configuraiton for this + * master, and to advertise the existence of this Sentinel at the same time. + * + * The message has the following format: + * + * sentinel_ip,sentinel_port,sentinel_runid,current_epoch, + * master_name,master_ip,master_port,master_config_epoch. + * + * Returns REDIS_OK if the PUBLISH was queued correctly, otherwise + * REDIS_ERR is returned. */ +int sentinelSendHello(sentinelRedisInstance *ri) { + char ip[REDIS_IP_STR_LEN]; + char payload[REDIS_IP_STR_LEN+1024]; + int retval; + sentinelRedisInstance *master = (ri->flags & SRI_MASTER) ? ri : ri->master; + sentinelAddr *master_addr = sentinelGetCurrentMasterAddress(master); + + /* Try to obtain our own IP address. */ + if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) == -1) return REDIS_ERR; + + /* Format and send the Hello message. */ + snprintf(payload,sizeof(payload), + "%s,%d,%s,%llu," /* Info about this sentinel. */ + "%s,%s,%d,%lld", /* Info about current master. */ + ip, server.port, server.runid, + (unsigned long long) sentinel.current_epoch, + /* --- */ + master->name,master_addr->ip,master_addr->port, + master->config_epoch); + retval = redisAsyncCommand(ri->cc, + sentinelPublishReplyCallback, NULL, "PUBLISH %s %s", + SENTINEL_HELLO_CHANNEL,payload); + if (retval != REDIS_OK) return REDIS_ERR; + ri->pending_commands++; + return REDIS_OK; +} + +/* Send periodic PING, INFO, and PUBLISH to the Hello channel to + * the specified master or slave instance. 
*/ void sentinelPingInstance(sentinelRedisInstance *ri) { mstime_t now = mstime(); mstime_t info_period; @@ -2096,27 +2136,7 @@ void sentinelPingInstance(sentinelRedisInstance *ri) { (now - ri->last_pub_time) > SENTINEL_PUBLISH_PERIOD) { /* PUBLISH hello messages to masters and slaves. */ - char ip[REDIS_IP_STR_LEN]; - if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) != -1) { - char payload[REDIS_IP_STR_LEN+1024]; - sentinelRedisInstance *master = (ri->flags & SRI_MASTER) ? - ri : ri->master; - sentinelAddr *master_addr = sentinelGetCurrentMasterAddress(master); - - snprintf(payload,sizeof(payload), - "%s,%d,%s,%llu," /* Info about this sentinel. */ - "%s,%s,%d,%lld", /* Info about current master. */ - ip, server.port, server.runid, - (unsigned long long) sentinel.current_epoch, - /* --- */ - master->name,master_addr->ip,master_addr->port, - master->config_epoch); - retval = redisAsyncCommand(ri->cc, - sentinelPublishReplyCallback, NULL, "PUBLISH %s %s", - SENTINEL_HELLO_CHANNEL,payload); - if (retval != REDIS_OK) return; - ri->pending_commands++; - } + sentinelSendHello(ri); } } From 98f08fa3abb7aeae7b0ca9d47b4bb36738b3031e Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 21 Nov 2013 11:35:50 +0100 Subject: [PATCH 0334/2500] Sentinel: check for disconnected links in sentinelSendHello(). Does not fix any bug as the test is performed by the caller, but better to have the check. --- src/sentinel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sentinel.c b/src/sentinel.c index 9ab91d0b8..b963da10a 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2069,6 +2069,7 @@ int sentinelSendHello(sentinelRedisInstance *ri) { /* Try to obtain our own IP address. */ if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) == -1) return REDIS_ERR; + if (ri->flags & SRI_DISCONNECTED) return; /* Format and send the Hello message. 
*/ snprintf(payload,sizeof(payload), From adbba45d5d17c20ca28b2f36af09d4106eb90953 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 21 Nov 2013 12:27:14 +0100 Subject: [PATCH 0335/2500] Sentinel: test for writable config file. This commit introduces a function called when Sentinel is ready for normal operations to avoid putting Sentinel specific stuff in redis.c. --- src/redis.c | 2 +- src/redis.h | 1 + src/sentinel.c | 13 ++++++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/redis.c b/src/redis.c index af24505b6..76ca2a218 100644 --- a/src/redis.c +++ b/src/redis.c @@ -3109,7 +3109,7 @@ int main(int argc, char **argv) { if (server.sofd > 0) redisLog(REDIS_NOTICE,"The server is now ready to accept connections at %s", server.unixsocket); } else { - redisLog(REDIS_WARNING,"Sentinel runid is %s", server.runid); + sentinelIsRunning(); } /* Warning the user about suspicious maxmemory setting. */ diff --git a/src/redis.h b/src/redis.h index bdc695228..76f7dd3b3 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1223,6 +1223,7 @@ void initSentinelConfig(void); void initSentinel(void); void sentinelTimer(void); char *sentinelHandleConfiguration(char **argv, int argc); +void sentinelIsRunning(void); /* Scripting */ void scriptingInit(void); diff --git a/src/sentinel.c b/src/sentinel.c index b963da10a..ca6d0eb5e 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -413,6 +413,17 @@ void initSentinel(void) { sentinel.scripts_queue = listCreate(); } +/* This function gets called when the server is in Sentinel mode, started, + * loaded the configuration, and is ready for normal operations. */ +void sentinelIsRunning(void) { + redisLog(REDIS_WARNING,"Sentinel runid is %s", server.runid); + + if (server.configfile == NULL || access(server.configfile,W_OK) == -1) { + redisLog(REDIS_WARNING,"Sentinel started without a config file, or config file not writable.
Exiting..."); + exit(1); + } +} + /* ============================== sentinelAddr ============================== */ /* Create a sentinelAddr object and return it on success. @@ -2069,7 +2080,7 @@ int sentinelSendHello(sentinelRedisInstance *ri) { /* Try to obtain our own IP address. */ if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) == -1) return REDIS_ERR; - if (ri->flags & SRI_DISCONNECTED) return; + if (ri->flags & SRI_DISCONNECTED) return REDIS_ERR; /* Format and send the Hello message. */ snprintf(payload,sizeof(payload), From 166b380011f23ccabfbfe3f637048588d612c892 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 21 Nov 2013 12:39:47 +0100 Subject: [PATCH 0336/2500] Sentinel: manual failover works again. --- src/sentinel.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index ca6d0eb5e..134e8ee92 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2452,6 +2452,8 @@ void sentinelCommand(redisClient *c) { addReplySds(c,sdsnew("-NOGOODSLAVE No suitable slave to promote\r\n")); return; } + redisLog(REDIS_WARNING,"Executing user requested FAILOVER of '%s'", + ri->name); sentinelStartFailover(ri); ri->flags |= SRI_FORCE_FAILOVER; addReply(c,shared.ok); @@ -3017,8 +3019,9 @@ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) { isleader = leader && strcasecmp(leader,server.runid) == 0; sdsfree(leader); - /* If I'm not the leader, I can't continue with the failover. */ - if (!isleader) { + /* If I'm not the leader, and it is not a forced failover via + * SENTINEL FAILOVER, then I can't continue with the failover. */ + if (!isleader && !(ri->flags & SRI_FORCE_FAILOVER)) { int election_timeout = SENTINEL_ELECTION_TIMEOUT; /* The election timeout is the MIN between SENTINEL_ELECTION_TIMEOUT From 0fa5d0e537d8a27ca34c419ecbbe3dd2d0082610 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 21 Nov 2013 15:43:48 +0100 Subject: [PATCH 0337/2500] Sentinel: removed mem leak and useless code. 
--- src/sentinel.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 134e8ee92..ee9e6e454 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2669,14 +2669,6 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master, int f dictIterator *di; dictEntry *de; - /* Vote for myself if I see the master is already in ODOWN state. */ - if (master->flags & SRI_O_DOWN) { - uint64_t leader_epoch; - - sentinelVoteLeader(master,sentinel.current_epoch,server.runid, - &leader_epoch); - } - di = dictGetIterator(master->sentinels); while((de = dictNext(di)) != NULL) { sentinelRedisInstance *ri = dictGetVal(de); From 6feb6cfdf88c9da3098fb6b4abefaa12402db5a5 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 21 Nov 2013 16:05:41 +0100 Subject: [PATCH 0338/2500] Sentinel: cleanup around SENTINEL_INFO_VALIDITY_TIME. --- src/sentinel.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index ee9e6e454..e6aa83d4c 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -56,7 +56,7 @@ typedef struct sentinelAddr { #define SRI_SENTINEL (1<<2) #define SRI_DISCONNECTED (1<<3) #define SRI_S_DOWN (1<<4) /* Subjectively down (no quorum). */ -#define SRI_O_DOWN (1<<5) /* Objectively down (quorum reached). */ +#define SRI_O_DOWN (1<<5) /* Objectively down (confirmed by others). */ #define SRI_MASTER_DOWN (1<<6) /* A Sentinel with this flag set thinks that its master is down. */ #define SRI_FAILOVER_IN_PROGRESS (1<<7) /* Failover is in progress for @@ -68,6 +68,7 @@ typedef struct sentinelAddr { #define SRI_FORCE_FAILOVER (1<<12) /* Force failover with master up. */ #define SRI_SCRIPT_KILL_SENT (1<<13) /* SCRIPT KILL already sent on -BUSY */ +/* Note: times are in milliseconds. 
*/ #define SENTINEL_INFO_PERIOD 10000 #define SENTINEL_PING_PERIOD 1000 #define SENTINEL_ASK_PERIOD 1000 @@ -84,10 +85,6 @@ typedef struct sentinelAddr { #define SENTINEL_MAX_PENDING_COMMANDS 100 #define SENTINEL_ELECTION_TIMEOUT 10000 -/* How many milliseconds is an information valid? This applies for instance - * to the reply to SENTINEL IS-MASTER-DOWN-BY-ADDR replies. */ -#define SENTINEL_INFO_VALIDITY_TIME 5000 - /* Failover machine different states. */ #define SENTINEL_FAILOVER_STATE_NONE 0 /* No failover in progress. */ #define SENTINEL_FAILOVER_STATE_WAIT_START 1 /* Wait for failover_start_time*/ @@ -2677,7 +2674,7 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master, int f int retval; /* If the master state from other sentinel is too old, we clear it. */ - if (elapsed > SENTINEL_INFO_VALIDITY_TIME) { + if (elapsed > SENTINEL_ASK_PERIOD*5) { ri->flags &= ~SRI_MASTER_DOWN; sdsfree(ri->leader); ri->leader = NULL; @@ -2915,15 +2912,26 @@ int sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) { * the following parameters: * * 1) None of the following conditions: S_DOWN, O_DOWN, DISCONNECTED. - * 2) last_avail_time more recent than SENTINEL_INFO_VALIDITY_TIME. - * 3) info_refresh more recent than SENTINEL_INFO_VALIDITY_TIME. + * 2) Last time the slave replied to ping no more than 5 times the PING period. + * 3) info_refresh not older than 3 times the INFO refresh period. * 4) master_link_down_time no more than: * (now - master->s_down_since_time) + (master->down_after_period * 10). + * Basically since the master is down from our POV, the slave reports + * to be disconnected no more than 10 times the configured down-after-period. + * This is pretty much black magic but the idea is, the master was not + * available so the slave may be lagging, but not over a certain time. + * Anyway we'll select the best slave according to replication offset. * 5) Slave priority can't be zero, otherwise the slave is discarded. 
* * Among all the slaves matching the above conditions we select the slave - * with lower slave_priority. If priority is the same we select the slave - * with lexicographically smaller runid. + * with, in order of sorting key: + * + * - lower slave_priority. + * - bigger processed replication offset. + * - lexicographically smaller runid. + * + * Basically if runid is the same, the slave that processed more commands + * from the master is selected. * * The function returns the pointer to the selected slave, otherwise * NULL if no suitable slave was found. @@ -2976,18 +2984,20 @@ sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) { di = dictGetIterator(master->slaves); while((de = dictNext(di)) != NULL) { sentinelRedisInstance *slave = dictGetVal(de); - mstime_t info_validity_time = mstime()-SENTINEL_INFO_VALIDITY_TIME; + mstime_t info_validity_time; if (slave->flags & (SRI_S_DOWN|SRI_O_DOWN|SRI_DISCONNECTED)) continue; - if (slave->last_avail_time < info_validity_time) continue; + if (mstime() - slave->last_avail_time > SENTINEL_PING_PERIOD*5) continue; if (slave->slave_priority == 0) continue; /* If the master is in SDOWN state we get INFO for slaves every second. * Otherwise we get it with the usual period so we need to account for * a larger delay. */ - if ((master->flags & SRI_S_DOWN) == 0) - info_validity_time -= SENTINEL_INFO_PERIOD; - if (slave->info_refresh < info_validity_time) continue; + if (master->flags & SRI_S_DOWN) + info_validity_time = SENTINEL_PING_PERIOD*5; + else + info_validity_time = SENTINEL_INFO_PERIOD*3; + if (mstime() - slave->info_refresh > info_validity_time) continue; if (slave->master_link_down_time > max_master_down_time) continue; instance[instances++] = slave; } From e8b13dc679409308e0f10a6b52185cd159ab4fd5 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 21 Nov 2013 16:22:59 +0100 Subject: [PATCH 0339/2500] Sentinel: different comments updated to new implementation. 
--- src/sentinel.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index e6aa83d4c..e8d21ae98 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -993,8 +993,8 @@ const char *sentinelRedisInstanceTypeStr(sentinelRedisInstance *ri) { else return "unknown"; } -/* This function removes all the instances found in the dictionary of instances - * 'd', having either: +/* This function removes all the instances found in the dictionary of + * sentinels in the specified 'master', having either: * * 1) The same ip/port as specified. * 2) The same runid. @@ -1005,13 +1005,9 @@ const char *sentinelRedisInstanceTypeStr(sentinelRedisInstance *ri) { * * This function is useful because every time we add a new Sentinel into * a master's Sentinels dictionary, we want to be very sure about not - * having duplicated instances for any reason. This is so important because - * we use those other sentinels in order to run our quorum protocol to - * understand if it's time to proceed with the fail over. - * - * Making sure no duplication is possible we greatly improve the robustness - * of the quorum (otherwise we may end counting the same instance multiple - * times for some reason). + * having duplicated instances for any reason. This is important because + * other sentinels are needed to reach ODOWN quorum, and later to get + * voted for a given configuration epoch in order to perform the failover. * * The function returns the number of Sentinels removed. */ int removeMatchingSentinelsFromMaster(sentinelRedisInstance *master, char *ip, int port, char *runid) { @@ -1623,7 +1619,7 @@ void sentinelReconnectInstance(sentinelRedisInstance *ri) { * 1) It is actually a master in the current configuration. * 2) It reports itself as a master. * 3) It is not SDOWN or ODOWN. - * 4) We obtained last INFO no more than two times the INFO period of time ago. 
*/ + * 4) We obtained last INFO no more than two times the INFO period time ago. */ int sentinelMasterLooksSane(sentinelRedisInstance *master) { return master->flags & SRI_MASTER && @@ -2560,8 +2556,8 @@ void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) { sentinelKillLink(ri,ri->pc); } - /* Update the subjectively down flag. We believe the instance is in SDOWN - * state if: + /* Update the SDOWN flag. We believe the instance is SDOWN if: + * * 1) It is not replying. * 2) We believe it is a master, it reports to be a slave for enough time * to meet the down_after_period, plus enough time to get two times @@ -2587,7 +2583,12 @@ void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) { } } -/* Is this instance down accordingly to the configured quorum? */ +/* Is this instance down according to the configured quorum? + * + * Note that ODOWN is a weak quorum, it only means that enough Sentinels + * reported in a given time range that the instance was not reachable. + * However messages can be delayed so there are no strong guarantees about + * N instances agreeing at the same time about the down state. */ void sentinelCheckObjectivelyDown(sentinelRedisInstance *master) { dictIterator *di; dictEntry *de; @@ -2657,10 +2658,10 @@ void sentinelReceiveIsMasterDownReply(redisAsyncContext *c, void *reply, void *p } } -/* If we think (subjectively) the master is down, we start sending +/* If we think the master is down, we start sending * SENTINEL IS-MASTER-DOWN-BY-ADDR requests to other sentinels - * in order to get the replies that allow to reach the quorum and - * possibly also mark the master as objectively down. */ + * in order to get the replies that allow to reach the quorum + * needed to mark the master in ODOWN state and trigger a failover. 
*/ #define SENTINEL_ASK_FORCED (1<<0) void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master, int flags) { dictIterator *di; @@ -3222,11 +3223,7 @@ void sentinelFailoverReconfNextSlave(sentinelRedisInstance *master) { /* This function is called when the slave is in * SENTINEL_FAILOVER_STATE_UPDATE_CONFIG state. In this state we need - * to remove it from the master table and add the promoted slave instead. - * - * If there are no promoted slaves as this instance is unique, we remove - * and re-add it with the same address to trigger a complete state - * refresh. */ + * to remove it from the master table and add the promoted slave instead. */ void sentinelFailoverSwitchToPromotedSlave(sentinelRedisInstance *master) { sentinelRedisInstance *ref = master->promoted_slave ? master->promoted_slave : master; From 573c416e00ec848c23883c98478dd0069a543185 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 21 Nov 2013 17:07:00 +0100 Subject: [PATCH 0340/2500] Sentinel: example sentinel.conf updated. --- sentinel.conf | 58 ++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/sentinel.conf b/sentinel.conf index ac687b535..248e76c06 100644 --- a/sentinel.conf +++ b/sentinel.conf @@ -9,6 +9,10 @@ port 26379 # Tells Sentinel to monitor this slave, and to consider it in O_DOWN # (Objectively Down) state only if at least sentinels agree. # +# Note that whatever is the ODOWN quorum, a Sentinel will require to +# be elected by the majority of the known Sentinels in order to +# start a failover, so no failover can be performed in minority. +# # Note: master name should not include special characters or spaces. # The valid charset is A-z 0-9 and the three characters ".-_". sentinel monitor mymaster 127.0.0.1 6379 2 @@ -42,11 +46,6 @@ sentinel monitor mymaster 127.0.0.1 6379 2 # Default is 30 seconds. 
sentinel down-after-milliseconds mymaster 30000 -# sentinel can-failover -# -# Specify if this Sentinel can start the failover for this master. -sentinel can-failover mymaster yes - # sentinel parallel-syncs # # How many slaves we can reconfigure to point to the new slave simultaneously @@ -57,19 +56,28 @@ sentinel parallel-syncs mymaster 1 # sentinel failover-timeout # -# Specifies the failover timeout in milliseconds. When this time has elapsed -# without any progress in the failover process, it is considered concluded by -# the sentinel even if not all the attached slaves were correctly configured -# to replicate with the new master (however a "best effort" SLAVEOF command -# is sent to all the slaves before). +# Specifies the failover timeout in milliseconds. It is used in many ways: # -# Also when 25% of this time has elapsed without any advancement, and there -# is a leader switch (the sentinel did not started the failover but is now -# elected as leader), the sentinel will continue the failover doing a -# "takeover". +# - The time needed to re-start a failover after a previous failover was +# already tried against the same master by a given Sentinel, is two +# times the failover timeout. # -# Default is 15 minutes. -sentinel failover-timeout mymaster 900000 +# - The time needed for a slave replicating to a wrong master according +# to a Sentinel currnet configuration, to be forced to replicate +# with the right master, is exactly the failover timeout (counting since +# the moment a Sentinel detected the misconfiguration). +# +# - The time needed to cancel a failover that is already in progress but +# did not produced any configuration change (SLAVEOF NO ONE yet not +# acknowledged by the promoted slave). +# +# - The maximum time a failover in progress waits for all the slaves to be +# reconfigured as slaves of the new master. 
However even after this time +# the slaves will be reconfigured by the Sentinels anyway, but not with +# the exact parallel-syncs progression as specified. +# +# Default is 3 minutes. +sentinel failover-timeout mymaster 180000 # SCRIPTS EXECUTION # @@ -114,32 +122,20 @@ sentinel failover-timeout mymaster 900000 # # sentinel client-reconfig-script # -# When the failover starts, ends, or is aborted, a script can be called in +# When the master changed because of a failover a script can be called in # order to perform application-specific tasks to notify the clients that the # configuration has changed and the master is at a different address. # -# The script is called in the following cases: -# -# Failover started (a slave is already promoted) -# Failover finished (all the additional slaves already reconfigured) -# Failover aborted (in that case the script was previously called when the -# failover started, and now gets called again with swapped -# addresses). -# # The following arguments are passed to the script: # # # -# is "start", "end" or "abort" +# is currently always "failover" # is either "leader" or "observer" # # The arguments from-ip, from-port, to-ip, to-port are used to communicate # the old address of the master and the new address of the elected slave -# (now a master) in the case state is "start" or "end". -# -# For abort instead the "from" is the address of the promoted slave and -# "to" is the address of the original master address, since the failover -# was aborted. +# (now a master). # # This script should be resistant to multiple invocations. # From 8bc3e626a5801f87065e42df4a51d3e698922d09 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 25 Nov 2013 10:21:18 +0100 Subject: [PATCH 0341/2500] Fix false positive in memory efficiency test. Fixes issue #1298. 
--- tests/unit/memefficiency.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index 3612f06e5..14e135ced 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -22,7 +22,7 @@ start_server {tags {"memefficiency"}} { 64 0.25 128 0.35 1024 0.75 - 16384 0.90 + 16384 0.82 } { test "Memory efficiency with values in range $size_range" { set efficiency [test_memory_efficiency $size_range] From 90bacd032e83670bbbbcec39c63b0c577d0600fb Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 25 Nov 2013 10:24:34 +0100 Subject: [PATCH 0342/2500] Sentinel: fix type specifier for Hello msg generation. This fixes issue #1395. --- src/sentinel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index e8d21ae98..d61920050 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2078,12 +2078,12 @@ int sentinelSendHello(sentinelRedisInstance *ri) { /* Format and send the Hello message. */ snprintf(payload,sizeof(payload), "%s,%d,%s,%llu," /* Info about this sentinel. */ - "%s,%s,%d,%lld", /* Info about current master. */ + "%s,%s,%d,%llu", /* Info about current master. */ ip, server.port, server.runid, (unsigned long long) sentinel.current_epoch, /* --- */ master->name,master_addr->ip,master_addr->port, - master->config_epoch); + (unsigned long long) master->config_epoch); retval = redisAsyncCommand(ri->cc, sentinelPublishReplyCallback, NULL, "PUBLISH %s %s", SENTINEL_HELLO_CHANNEL,payload); From 299530216522bd3dd596abae6c2c6898634189fb Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 25 Nov 2013 10:57:20 +0100 Subject: [PATCH 0343/2500] Sentinel: fixes inverted strcmp() test preventing config updates. The result of this one-char bug was pretty serious, if the new master had the same port of the previous master, but just a different IP address, non-leader Sentinels would not be able to recognize the configuration change. 
This commit fixes issue #1394. Many thanks to @shanemadden who reported the bug and helped investigate it. --- src/sentinel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index d61920050..ed2f56d9d 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2025,7 +2025,7 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd if (msgmaster->config_epoch < master_config_epoch) { msgmaster->config_epoch = master_config_epoch; if (master_port != msgmaster->addr->port || - !strcmp(msgmaster->addr->ip, token[5])) + strcmp(msgmaster->addr->ip, token[5])) { sentinelAddr *old_addr; From a1979d9d5594b58d8df23584e12f827341b56117 Mon Sep 17 00:00:00 2001 From: huangz1990 Date: Tue, 26 Nov 2013 19:55:51 +0800 Subject: [PATCH 0344/2500] fix a bug in sentinel.c about pub/sub link --- src/sentinel.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index ed2f56d9d..e846b0fcb 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1607,9 +1607,8 @@ void sentinelReconnectInstance(sentinelRedisInstance *ri) { } } /* Clear the DISCONNECTED flags only if we have both the connections - * (or just the commands connection if this is a slave or a - * sentinel instance). */ - if (ri->cc && (ri->flags & (SRI_SLAVE|SRI_SENTINEL) || ri->pc)) + * (or just the commands connection if this is a sentinel instance). */ + if (ri->cc && (ri->flags & SRI_SENTINEL || ri->pc)) ri->flags &= ~SRI_DISCONNECTED; } From 394bccd1377927e35dba7d9df59ee4f64a8c4091 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 28 Nov 2013 15:23:46 +0100 Subject: [PATCH 0345/2500] Sentinel: log vote received from other Sentinels.
--- src/sentinel.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/sentinel.c b/src/sentinel.c index e846b0fcb..3d7fbf553 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2651,6 +2651,11 @@ void sentinelReceiveIsMasterDownReply(redisAsyncContext *c, void *reply, void *p /* If the runid in the reply is not "*" the Sentinel actually * replied with a vote. */ sdsfree(ri->leader); + if (ri->leader_epoch != r->element[2]->integer) + redisLog(REDIS_WARNING, + "%s voted for %s %llu", ri->name, + r->element[1]->str, + (unsigned long long) r->element[2]->integer); ri->leader = sdsnew(r->element[1]->str); ri->leader_epoch = r->element[2]->integer; } From d0a7a5a39f863b0cff04410ccab93efa2837d544 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 28 Nov 2013 16:16:58 +0100 Subject: [PATCH 0346/2500] Reply to PING with error when there is a MISCONF state. --- src/redis.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/redis.c b/src/redis.c index 76ca2a218..99ec8f60e 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1996,7 +1996,8 @@ int processCommand(redisClient *c) { if (server.stop_writes_on_bgsave_err && server.saveparamslen > 0 && server.lastbgsave_status == REDIS_ERR && - c->cmd->flags & REDIS_CMD_WRITE) + (c->cmd->flags & REDIS_CMD_WRITE || + c->cmd->proc == pingCommand)) { flagTransaction(c); addReply(c, shared.bgsaveerr); From 7cd6b48963641b560f69bcf426eb2f0e6be078ec Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 28 Nov 2013 16:25:49 +0100 Subject: [PATCH 0347/2500] Stop writes on MISCONF only if instance is a master. From the point of view of the slave not accepting writes from the master can only create a bigger consistency issue. 
--- src/redis.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/redis.c b/src/redis.c index 99ec8f60e..9d7f9164b 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1992,10 +1992,12 @@ int processCommand(redisClient *c) { } } - /* Don't accept write commands if there are problems persisting on disk. */ + /* Don't accept write commands if there are problems persisting on disk + * and if this is a master instance. */ if (server.stop_writes_on_bgsave_err && server.saveparamslen > 0 && server.lastbgsave_status == REDIS_ERR && + server.masterhost != NULL && (c->cmd->flags & REDIS_CMD_WRITE || c->cmd->proc == pingCommand)) { @@ -2005,7 +2007,7 @@ int processCommand(redisClient *c) { } /* Don't accept write commands if there are not enough good slaves and - * used configured the min-slaves-to-write option. */ + * user configured the min-slaves-to-write option. */ if (server.repl_min_slaves_to_write && server.repl_min_slaves_max_lag && c->cmd->flags & REDIS_CMD_WRITE && From a829c85988dbc6b40f08a7bf30dc13beab2c73cd Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 29 Nov 2013 16:17:05 +0100 Subject: [PATCH 0348/2500] Cluster: some code about clusterHandleSlaveFailover() marginally improved. 80 cols friendly, some minor change to the code to make it simpler. 
--- src/cluster.c | 24 +++++++++++++----------- src/cluster.h | 4 ++-- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 2f26100e7..55d0e8970 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1681,7 +1681,8 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* IF we are not a master serving at least 1 slot, we don't have the * right to vote, as the cluster size in Redis Cluster is the number - * of masters serving at least one slot, and quorum is the cluster size + 1 */ + * of masters serving at least one slot, and quorum is the cluster + * size + 1 */ if (!(server.cluster->myself->flags & REDIS_NODE_MASTER)) return; if (server.cluster->myself->numslots == 0) return; @@ -1702,9 +1703,9 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { if (mstime() - node->slaveof->voted_time < server.cluster_node_timeout * 2) return; - /* The slave requesting the vote must have a configEpoch for the claimed slots - * that is >= the one of the masters currently serving the same slots in the - * current configuration. */ + /* The slave requesting the vote must have a configEpoch for the claimed + * slots that is >= the one of the masters currently serving the same + * slots in the current configuration. */ for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) { if (bitmapTestBit(claimed_slots, j) == 0) continue; if (server.cluster->slots[j] == NULL || @@ -1735,7 +1736,8 @@ void clusterHandleSlaveFailover(void) { int needed_quorum = (server.cluster->size / 2) + 1; int j; - /* Set data_age to the number of seconds we are disconnected from the master. */ + /* Set data_age to the number of seconds we are disconnected from + * the master. 
*/ if (server.repl_state == REDIS_REPL_CONNECTED) { data_age = (server.unixtime - server.master->lastinteraction) * 1000; } else { @@ -1765,8 +1767,7 @@ void clusterHandleSlaveFailover(void) { return; /* Compute the time at which we can start an election. */ - if (server.cluster->failover_auth_time == 0 || - auth_age > + if (auth_age > server.cluster_node_timeout * REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT) { server.cluster->failover_auth_time = mstime() + @@ -1775,7 +1776,8 @@ void clusterHandleSlaveFailover(void) { random() % 500; /* Random delay between 0 and 500 milliseconds. */ server.cluster->failover_auth_count = 0; server.cluster->failover_auth_sent = 0; - redisLog(REDIS_WARNING,"Start of election delayed for %lld milliseconds.", + redisLog(REDIS_WARNING, + "Start of election delayed for %lld milliseconds.", server.cluster->failover_auth_time - mstime()); return; } @@ -1784,8 +1786,7 @@ void clusterHandleSlaveFailover(void) { if (mstime() < server.cluster->failover_auth_time) return; /* Return ASAP if the election is too old to be valid. */ - if (mstime() - server.cluster->failover_auth_time > server.cluster_node_timeout) - return; + if (auth_age > server.cluster_node_timeout) return; /* Ask for votes if needed. */ if (server.cluster->failover_auth_sent == 0) { @@ -1827,7 +1828,8 @@ void clusterHandleSlaveFailover(void) { } /* 3) Update my configEpoch to the epoch of the election. */ - server.cluster->myself->configEpoch = server.cluster->failover_auth_epoch; + server.cluster->myself->configEpoch = + server.cluster->failover_auth_epoch; /* 4) Update state and save config. */ clusterUpdateState(); diff --git a/src/cluster.h b/src/cluster.h index 658a364bd..28d9af806 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -83,7 +83,7 @@ typedef struct clusterState { clusterNode *slots[REDIS_CLUSTER_SLOTS]; zskiplist *slots_to_keys; /* The following fields are used to take the slave state on elections. 
*/ - mstime_t failover_auth_time;/* Time at which we'll try to get elected in ms*/ + mstime_t failover_auth_time; /* Time of previous or next election. */ int failover_auth_count; /* Number of votes received so far. */ int failover_auth_sent; /* True if we already asked for votes. */ uint64_t failover_auth_epoch; /* Epoch of the current election. */ @@ -91,7 +91,7 @@ typedef struct clusterState { uint64_t last_vote_epoch; /* Epoch of the last vote granted. */ int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ long long stats_bus_messages_sent; /* Num of msg sent via cluster bus. */ - long long stats_bus_messages_received; /* Num of msg received via cluster bus. */ + long long stats_bus_messages_received; /* Num of msg rcvd via cluster bus.*/ } clusterState; /* clusterState todo_before_sleep flags. */ From 5502face592f207f5c8288633fee9ec8532d00cf Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 29 Nov 2013 17:37:06 +0100 Subject: [PATCH 0349/2500] Cluster: basic data structures for nodes black list. --- src/cluster.c | 2 ++ src/cluster.h | 1 + src/redis.c | 12 ++++++++++++ src/redis.h | 1 + 4 files changed, 16 insertions(+) diff --git a/src/cluster.c b/src/cluster.c index 55d0e8970..d907a993d 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -260,6 +260,8 @@ void clusterInit(void) { server.cluster->state = REDIS_CLUSTER_FAIL; server.cluster->size = 1; server.cluster->nodes = dictCreate(&clusterNodesDictType,NULL); + server.cluster->nodes_black_list = + dictCreate(&clusterNodesBlackListDictType,NULL); server.cluster->failover_auth_time = 0; server.cluster->failover_auth_count = 0; server.cluster->failover_auth_epoch = 0; diff --git a/src/cluster.h b/src/cluster.h index 28d9af806..faba13477 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -78,6 +78,7 @@ typedef struct clusterState { int state; /* REDIS_CLUSTER_OK, REDIS_CLUSTER_FAIL, ... 
*/ int size; /* Num of master nodes with at least one slot */ dict *nodes; /* Hash table of name -> clusterNode structures */ + dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */ clusterNode *migrating_slots_to[REDIS_CLUSTER_SLOTS]; clusterNode *importing_slots_from[REDIS_CLUSTER_SLOTS]; clusterNode *slots[REDIS_CLUSTER_SLOTS]; diff --git a/src/redis.c b/src/redis.c index 9d7f9164b..e112f1b56 100644 --- a/src/redis.c +++ b/src/redis.c @@ -583,6 +583,18 @@ dictType clusterNodesDictType = { NULL /* val destructor */ }; +/* Cluster re-addition blacklist. This maps node IDs to the time + * we can re-add this node. The goal is to avoid readding a removed + * node for some time. */ +dictType clusterNodesBlackListDictType = { + dictSdsCaseHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictSdsKeyCaseCompare, /* key compare */ + dictSdsDestructor, /* key destructor */ + NULL /* val destructor */ +}; + /* Migrate cache dict type. */ dictType migrateCacheDictType = { dictSdsHash, /* hash function */ diff --git a/src/redis.h b/src/redis.h index 76f7dd3b3..9886c6476 100644 --- a/src/redis.h +++ b/src/redis.h @@ -884,6 +884,7 @@ extern struct sharedObjectsStruct shared; extern dictType setDictType; extern dictType zsetDictType; extern dictType clusterNodesDictType; +extern dictType clusterNodesBlackListDictType; extern dictType dbDictType; extern dictType shaScriptObjectDictType; extern double R_Zero, R_PosInf, R_NegInf, R_Nan; From b7c955046dd4a94dd066431e7985ddff9d7bdc14 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 2 Dec 2013 11:12:23 +0100 Subject: [PATCH 0350/2500] Cluster: nodes re-addition blacklist API. 
--- src/cluster.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/src/cluster.c b/src/cluster.c index d907a993d..248b527d5 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -622,6 +622,75 @@ void clusterRenameNode(clusterNode *node, char *newname) { clusterAddNode(node); } +/* ----------------------------------------------------------------------------- + * CLUSTER nodes blacklist + * + * The nodes blacklist is just a way to ensure that a given node with a given + * Node ID is not readded before some time elapsed (this time is specified + * in seconds in REDIS_CLUSTER_BLACKLIST_TTL). + * + * This is useful when we want to remove a node from the cluster completely: + * when CLUSTER FORGET is called, it also puts the node into the blacklist so + * that even if we receive gossip messages from other nodes that still remember + * about the node we want to remove, we don't re-add it before some time. + * + * Currently the REDIS_CLUSTER_BLACKLIST_TTL is set to 1 minute, this means + * that redis-trib has 60 seconds to send CLUSTER FORGET messages to nodes + * in the cluster without dealing with the problem if other nodes re-adding + * back the node to nodes we already sent the FORGET command to. + * + * The data structure used is an hash table with an sds string representing + * the node ID as key, and the time when it is ok to re-add the node as + * value. + * -------------------------------------------------------------------------- */ + +#define REDIS_CLUSTER_BLACKLIST_TTL 60 /* 1 minute. */ + + +/* Before of the addNode() or Exists() operations we always remove expired + * entries from the black list. This is an O(N) operation but it is not a + * problem since add / exists operations are called very infrequently and + * the hash table is supposed to contain very little elements at max. 
+ * However without the cleanup during long uptimes and with some automated + * node add/removal procedures, entries could accumulate. */ +void clusterBlacklistCleanup(void) { + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes_black_list); + while((de = dictNext(di)) != NULL) { + int64_t expire = dictGetUnsignedIntegerVal(de); + + if (expire < server.unixtime) + dictDelete(server.cluster->nodes_black_list,dictGetKey(de)); + } + dictReleaseIterator(di); +} + +/* Cleanup the blacklist and add a new node ID to the black list. */ +void clusterBlacklistAddNode(clusterNode *node) { + dictEntry *de; + sds id = sdsnewlen(node->name,REDIS_CLUSTER_NAMELEN); + + clusterBlacklistCleanup(); + if (dictAdd(server.cluster->nodes_black_list,id,NULL) == DICT_ERR) + sdsfree(id); /* Key was already there. */ + de = dictFind(server.cluster->nodes_black_list,node->name); + dictSetUnsignedIntegerVal(de,time(NULL)); +} + +/* Return non-zero if the specified node ID exists in the blacklist. + * You don't need to pass an sds string here, any pointer to 40 bytes + * will work. */ +int clusterBlacklistExists(char *nodeid) { + sds id = sdsnewlen(nodeid,REDIS_CLUSTER_NAMELEN); + int retval; + + retval = dictFind(server.cluster->nodes_black_list,id) != NULL; + sdsfree(id); + return retval; +} + /* ----------------------------------------------------------------------------- * CLUSTER messages exchange - PING/PONG and gossip * -------------------------------------------------------------------------- */ From 4df452caf643f6670090a42bb8ec6edd3ad27e7a Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 2 Dec 2013 12:22:17 +0100 Subject: [PATCH 0351/2500] Sentinel: better time desynchronization. Sentinels are now desynchronized in a better way changing the time handler frequency between 10 and 20 HZ. 
This way on average a desynchronization of 25 milliseconds is produced that should be large enough compared to network latency, avoiding most split-brain conditions during the vote. Now that the clocks are desynchronized, to have larger random delays when performing operations can be easily achieved in the following way. Take as example the function that starts the failover, that is called with a frequency between 10 and 20 HZ and will start the failover every time the conditions are met. By just adding as an additional condition something like rand()%4 == 0, we can amplify the desynchronization between Sentinel instances easily. See issue #1419. --- src/sentinel.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 3d7fbf553..bb4cfbfdc 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2733,12 +2733,9 @@ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char master->leader, (unsigned long long) master->leader_epoch); /* If we did not voted for ourselves, set the master failover start * time to now, in order to force a delay before we can start a - * failover for the same master. - * - * The random addition is useful to desynchronize a bit the slaves - * and reduce the chance that no slave gets majority. */ + * failover for the same master. */ if (strcasecmp(master->leader,server.runid)) - master->failover_start_time = mstime() + rand() % 2000; + master->failover_start_time = mstime(); } *leader_epoch = master->leader_epoch; @@ -3389,5 +3386,13 @@ void sentinelTimer(void) { sentinelRunPendingScripts(); sentinelCollectTerminatedScripts(); sentinelKillTimedoutScripts(); + + /* We continuously change the frequency of the Redis "timer interrupt" + * in order to desynchronize every Sentinel from every other.
+ * This non-determinism avoids that Sentinels started at the same time + * exactly continue to stay synchronized asking to be voted at the + * same time again and again (resulting in nobody likely winning the + * election because of split brain voting). */ + server.hz = REDIS_DEFAULT_HZ + rand() % REDIS_DEFAULT_HZ; } From 6fc6c6bda979b1a701b57996555a08ebbe0b57fe Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 2 Dec 2013 15:55:19 +0100 Subject: [PATCH 0352/2500] Sentinel: don't write HZ when flushing config. See issue #1419. --- src/sentinel.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index bb4cfbfdc..a936e6851 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1472,15 +1472,19 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) { * On failure the function logs a warning on the Redis log. */ void sentinelFlushConfig(void) { int fd; + int saved_hz = server.hz; - if (rewriteConfig(server.configfile) == -1) { + server.hz = REDIS_DEFAULT_HZ; + if (rewriteConfig(server.configfile) != -1) { + /* Rewrite succeded, fsync it. */ + if ((fd = open(server.configfile,O_RDONLY)) != -1) { + fsync(fd); + close(fd); + } + } else { redisLog(REDIS_WARNING,"WARNING: Senitnel was not able to save the new configuration on disk!!!: %s", strerror(errno)); - return; - } - if ((fd = open(server.configfile,O_RDONLY)) != -1) { - fsync(fd); - close(fd); } + server.hz = saved_hz; return; } From 1ea9a283cb316193ae0a63cfd79b3c84cb2ff227 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 3 Dec 2013 13:40:41 +0100 Subject: [PATCH 0353/2500] Grammar fix in freeClient(). 
--- src/networking.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/networking.c b/src/networking.c index 1da5a5a58..ee0e0b93d 100644 --- a/src/networking.c +++ b/src/networking.c @@ -698,7 +698,7 @@ void freeClient(redisClient *c) { listDelNode(server.clients,ln); } /* When client was just unblocked because of a blocking operation, - * remove it from the list with unblocked clients. */ + * remove it from the list of unblocked clients. */ if (c->flags & REDIS_UNBLOCKED) { ln = listSearchKey(server.unblocked_clients,c); redisAssert(ln != NULL); From a6ed453b33ac26bf10e8cafd45a5d1add3c2e8c3 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 3 Dec 2013 13:54:06 +0100 Subject: [PATCH 0354/2500] Removed old comments and dead code from freeClient(). --- src/networking.c | 26 ++++++++++++++------------ src/redis.h | 2 -- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/networking.c b/src/networking.c index ee0e0b93d..0582789ff 100644 --- a/src/networking.c +++ b/src/networking.c @@ -111,9 +111,7 @@ redisClient *createClient(int fd) { c->bpop.keys = dictCreate(&setDictType,NULL); c->bpop.timeout = 0; c->bpop.target = NULL; - c->io_keys = listCreate(); c->watched_keys = listCreate(); - listSetFreeMethod(c->io_keys,decrRefCountVoid); c->pubsub_channels = dictCreate(&setDictType,NULL); c->pubsub_patterns = listCreate(); listSetFreeMethod(c->pubsub_patterns,decrRefCountVoid); @@ -663,13 +661,11 @@ void freeClient(redisClient *c) { return; } - /* Note that if the client we are freeing is blocked into a blocking - * call, we have to set querybuf to NULL *before* to call - * unblockClientWaitingData() to avoid processInputBuffer() will get - * called. Also it is important to remove the file events after - * this, because this call adds the READABLE event. */ + /* Free the query buffer */ sdsfree(c->querybuf); c->querybuf = NULL; + + /* Deallocate structures used to block on blocking ops. 
*/ if (c->flags & REDIS_BLOCKED) unblockClientWaitingData(c); dictRelease(c->bpop.keys); @@ -677,11 +673,13 @@ void freeClient(redisClient *c) { /* UNWATCH all the keys */ unwatchAllKeys(c); listRelease(c->watched_keys); + /* Unsubscribe from all the pubsub channels */ pubsubUnsubscribeAllChannels(c,0); pubsubUnsubscribeAllPatterns(c,0); dictRelease(c->pubsub_channels); listRelease(c->pubsub_patterns); + /* Close socket, unregister events, and remove list of replies and * accumulated arguments. */ if (c->fd != -1) { @@ -691,12 +689,14 @@ void freeClient(redisClient *c) { } listRelease(c->reply); freeClientArgv(c); + /* Remove from the list of clients */ if (c->fd != -1) { ln = listSearchKey(server.clients,c); redisAssert(ln != NULL); listDelNode(server.clients,ln); } + /* When client was just unblocked because of a blocking operation, * remove it from the list of unblocked clients. */ if (c->flags & REDIS_UNBLOCKED) { @@ -704,9 +704,9 @@ void freeClient(redisClient *c) { redisAssert(ln != NULL); listDelNode(server.unblocked_clients,ln); } - listRelease(c->io_keys); - /* Master/slave cleanup. - * Case 1: we lost the connection with a slave. */ + + /* Master/slave cleanup Case 1: + * we lost the connection with a slave. */ if (c->flags & REDIS_SLAVE) { if (c->replstate == REDIS_REPL_SEND_BULK) { if (c->repldbfd != -1) close(c->repldbfd); @@ -724,7 +724,8 @@ void freeClient(redisClient *c) { refreshGoodSlavesCount(); } - /* Case 2: we lost the connection with the master. */ + /* Master/slave cleanup Case 2: + * we lost the connection with the master. */ if (c->flags & REDIS_MASTER) replicationHandleMasterDisconnection(); /* If this client was scheduled for async freeing we need to remove it @@ -735,7 +736,8 @@ void freeClient(redisClient *c) { listDelNode(server.clients_to_close,ln); } - /* Release memory */ + /* Release other dynamically allocated client structure fields, + * and finally release the client structure itself. 
*/ if (c->name) decrRefCount(c->name); zfree(c->argv); freeClientMultiState(c); diff --git a/src/redis.h b/src/redis.h index 9886c6476..e4413d28b 100644 --- a/src/redis.h +++ b/src/redis.h @@ -480,8 +480,6 @@ typedef struct redisClient { int slave_listening_port; /* As configured with: SLAVECONF listening-port */ multiState mstate; /* MULTI/EXEC state */ blockingState bpop; /* blocking state */ - list *io_keys; /* Keys this client is waiting to be loaded from the - * swap file in order to continue. */ list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */ dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */ list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */ From 83e363d3e67c27865d7679c27f466c5e12b3d4ee Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 3 Dec 2013 17:43:53 +0100 Subject: [PATCH 0355/2500] BLPOP blocking code refactored to be generic & reusable. --- src/Makefile | 2 +- src/aof.c | 1 + src/blocked.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++ src/networking.c | 8 ++-- src/redis.c | 23 ++------- src/redis.h | 27 ++++++++++- src/t_list.c | 48 +++++-------------- 7 files changed, 169 insertions(+), 61 deletions(-) create mode 100644 src/blocked.c diff --git a/src/Makefile b/src/Makefile index d4551d924..e0592710f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -103,7 +103,7 @@ endif REDIS_SERVER_NAME=redis-server REDIS_SENTINEL_NAME=redis-sentinel -REDIS_SERVER_OBJ=adlist.o ae.o anet.o dict.o redis.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o +REDIS_SERVER_OBJ=adlist.o ae.o anet.o dict.o redis.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o 
release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o REDIS_CLI_NAME=redis-cli REDIS_CLI_OBJ=anet.o sds.o adlist.o redis-cli.o zmalloc.o release.o anet.o ae.o crc64.o REDIS_BENCHMARK_NAME=redis-benchmark diff --git a/src/aof.c b/src/aof.c index dcbcc62e1..8cbdec889 100644 --- a/src/aof.c +++ b/src/aof.c @@ -449,6 +449,7 @@ struct redisClient *createFakeClient(void) { c->argv = NULL; c->bufpos = 0; c->flags = 0; + c->btype = REDIS_BLOCKED_NONE; /* We set the fake client as a slave waiting for the synchronization * so that Redis will not try to send replies to this client. */ c->replstate = REDIS_REPL_WAIT_BGSAVE_START; diff --git a/src/blocked.c b/src/blocked.c new file mode 100644 index 000000000..3f4dd6e8d --- /dev/null +++ b/src/blocked.c @@ -0,0 +1,121 @@ +/* blocked.c - generic support for blocking operations like BLPOP & WAIT. + * + * Copyright (c) 2009-2012, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "redis.h" + +/* Get a timeout value from an object and store it into 'timeout'. + * The final timeout is always stored as milliseconds as a time where the + * timeout will expire, however the parsing is performed according to + * the 'unit' that can be seconds or milliseconds. + * + * Note that if the timeout is zero (usually from the point of view of + * commands API this means no timeout) the value stored into 'timeout' + * is zero. */ +int getTimeoutFromObjectOrReply(redisClient *c, robj *object, mstime_t *timeout, int unit) { + long long tval; + + if (getLongLongFromObjectOrReply(c,object,&tval, + "timeout is not an integer or out of range") != REDIS_OK) + return REDIS_ERR; + + if (tval < 0) { + addReplyError(c,"timeout is negative"); + return REDIS_ERR; + } + + if (tval > 0) { + if (unit == UNIT_SECONDS) tval *= 1000; + tval += mstime(); + } + *timeout = tval; + + return REDIS_OK; +} + +/* Block a client for the specific operation type. Once the REDIS_BLOCKED + * flag is set client query buffer is no longer processed, but accumulated, + * and will be processed when the client is unblocked.
*/ +void blockClient(redisClient *c, int btype) { + c->flags |= REDIS_BLOCKED; + c->btype = btype; + server.bpop_blocked_clients++; +} + +/* This function is called in the beforeSleep() function of the event loop + * in order to process the pending input buffer of clients that were + * unblocked after a blocking operation. */ +void processUnblockedClients(void) { + listNode *ln; + redisClient *c; + + while (listLength(server.unblocked_clients)) { + ln = listFirst(server.unblocked_clients); + redisAssert(ln != NULL); + c = ln->value; + listDelNode(server.unblocked_clients,ln); + c->flags &= ~REDIS_UNBLOCKED; + c->btype = REDIS_BLOCKED_NONE; + + /* Process remaining data in the input buffer. */ + if (c->querybuf && sdslen(c->querybuf) > 0) { + server.current_client = c; + processInputBuffer(c); + server.current_client = NULL; + } + } +} + +/* Unblock a client calling the right function depending on the kind + * of operation the client is blocking for. */ +void unblockClient(redisClient *c) { + if (c->btype == REDIS_BLOCKED_LIST) { + unblockClientWaitingData(c); + } else { + redisPanic("Unknown btype in unblockClient()."); + } + /* Clear the flags, and put the client in the unblocked list so that + * we'll process new commands in its query buffer ASAP. */ + c->flags &= ~REDIS_BLOCKED; + c->flags |= REDIS_UNBLOCKED; + c->btype = REDIS_BLOCKED_NONE; + server.bpop_blocked_clients--; + listAddNodeTail(server.unblocked_clients,c); +} + +/* This function gets called when a blocked client timed out in order to + * send it a reply of some kind. 
*/ +void replyToBlockedClientTimedOut(redisClient *c) { + if (c->btype == REDIS_BLOCKED_LIST) { + addReply(c,shared.nullmultibulk); + } else { + redisPanic("Unknown btype in replyToBlockedClientTimedOut()."); + } +} + diff --git a/src/networking.c b/src/networking.c index 0582789ff..7ed8d2c09 100644 --- a/src/networking.c +++ b/src/networking.c @@ -108,9 +108,12 @@ redisClient *createClient(int fd) { c->obuf_soft_limit_reached_time = 0; listSetFreeMethod(c->reply,decrRefCountVoid); listSetDupMethod(c->reply,dupClientReplyValue); - c->bpop.keys = dictCreate(&setDictType,NULL); + c->btype = REDIS_BLOCKED_NONE; c->bpop.timeout = 0; + c->bpop.keys = dictCreate(&setDictType,NULL); c->bpop.target = NULL; + c->bpop.numreplicas = 0; + c->bpop.reploffset = 0; c->watched_keys = listCreate(); c->pubsub_channels = dictCreate(&setDictType,NULL); c->pubsub_patterns = listCreate(); @@ -666,8 +669,7 @@ void freeClient(redisClient *c) { c->querybuf = NULL; /* Deallocate structures used to block on blocking ops. */ - if (c->flags & REDIS_BLOCKED) - unblockClientWaitingData(c); + if (c->flags & REDIS_BLOCKED) unblockClient(c); dictRelease(c->bpop.keys); /* UNWATCH all the keys */ diff --git a/src/redis.c b/src/redis.c index e112f1b56..f29397769 100644 --- a/src/redis.c +++ b/src/redis.c @@ -871,7 +871,7 @@ long long getOperationsPerSecond(void) { /* Check for timeouts. 
Returns non-zero if the client was terminated */ int clientsCronHandleTimeout(redisClient *c) { - time_t now = server.unixtime; + mstime_t now = mstime(); if (server.maxidletime && !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */ @@ -886,8 +886,8 @@ int clientsCronHandleTimeout(redisClient *c) { return 1; } else if (c->flags & REDIS_BLOCKED) { if (c->bpop.timeout != 0 && c->bpop.timeout < now) { - addReply(c,shared.nullmultibulk); - unblockClientWaitingData(c); + replyToBlockedClientTimedOut(c); + unblockClient(c); } } return 0; @@ -1194,8 +1194,6 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { * for ready file descriptors. */ void beforeSleep(struct aeEventLoop *eventLoop) { REDIS_NOTUSED(eventLoop); - listNode *ln; - redisClient *c; /* Run a fast expire cycle (the called function will return * ASAP if a fast cycle is not needed). */ @@ -1203,20 +1201,7 @@ void beforeSleep(struct aeEventLoop *eventLoop) { activeExpireCycle(ACTIVE_EXPIRE_CYCLE_FAST); /* Try to process pending commands for clients that were just unblocked. */ - while (listLength(server.unblocked_clients)) { - ln = listFirst(server.unblocked_clients); - redisAssert(ln != NULL); - c = ln->value; - listDelNode(server.unblocked_clients,ln); - c->flags &= ~REDIS_UNBLOCKED; - - /* Process remaining data in the input buffer. */ - if (c->querybuf && sdslen(c->querybuf) > 0) { - server.current_client = c; - processInputBuffer(c); - server.current_client = NULL; - } - } + processUnblockedClients(); /* Write the AOF buffer on disk */ flushAppendOnlyFile(0); diff --git a/src/redis.h b/src/redis.h index e4413d28b..bf968e3c7 100644 --- a/src/redis.h +++ b/src/redis.h @@ -232,6 +232,12 @@ #define REDIS_FORCE_REPL (1<<15) /* Force replication of current cmd. */ #define REDIS_PRE_PSYNC_SLAVE (1<<16) /* Slave don't understand PSYNC. */ +/* Client block type (btype field in client structure) + * if REDIS_BLOCKED flag is set. 
*/ +#define REDIS_BLOCKED_NONE 0 /* Not blocked, no REDIS_BLOCKED flag set. */ +#define REDIS_BLOCKED_LIST 1 /* BLPOP & co. */ +#define REDIS_BLOCKED_WAIT 2 /* WAIT for synchronous replication. */ + /* Client request types */ #define REDIS_REQ_INLINE 1 #define REDIS_REQ_MULTIBULK 2 @@ -419,13 +425,22 @@ typedef struct multiState { time_t minreplicas_timeout; /* MINREPLICAS timeout as unixtime. */ } multiState; +/* This structure holds the blocking operation state for a client. + * The fields used depend on client->btype. */ typedef struct blockingState { + /* Generic fields. */ + mstime_t timeout; /* Blocking operation timeout. If UNIX current time + * is > timeout then the operation timed out. */ + + /* REDIS_BLOCK_LIST */ dict *keys; /* The keys we are waiting to terminate a blocking * operation such as BLPOP. Otherwise NULL. */ - time_t timeout; /* Blocking operation timeout. If UNIX current time - * is > timeout then the operation timed out. */ robj *target; /* The key that should receive the element, * for BRPOPLPUSH. */ + + /* REDIS_BLOCK_WAIT */ + int numreplicas; /* Number of replicas we are waiting for ACK. */ + long long reploffset; /* Replication offset to reach. */ } blockingState; /* The following structure represents a node in the server.ready_keys list, @@ -479,6 +494,7 @@ typedef struct redisClient { char replrunid[REDIS_RUN_ID_SIZE+1]; /* master run id if this is a master */ int slave_listening_port; /* As configured with: SLAVECONF listening-port */ multiState mstate; /* MULTI/EXEC state */ + int btype; /* Type of blocking op if REDIS_BLOCKED. 
*/ blockingState bpop; /* blocking state */ list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */ dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */ @@ -1227,6 +1243,13 @@ void sentinelIsRunning(void); /* Scripting */ void scriptingInit(void); +/* Blocked clients */ +void processUnblockedClients(void); +void blockClient(redisClient *c, int btype); +void unblockClient(redisClient *c); +void replyToBlockedClientTimedOut(redisClient *c); +int getTimeoutFromObjectOrReply(redisClient *c, robj *object, mstime_t *timeout, int unit); + /* Git SHA1 */ char *redisGitSHA1(void); char *redisGitDirty(void); diff --git a/src/t_list.c b/src/t_list.c index a8ce9b976..555cb31e9 100644 --- a/src/t_list.c +++ b/src/t_list.c @@ -778,7 +778,7 @@ void rpoplpushCommand(redisClient *c) { /* Set a client in blocking mode for the specified key, with the specified * timeout */ -void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout, robj *target) { +void blockForKeys(redisClient *c, robj **keys, int numkeys, mstime_t timeout, robj *target) { dictEntry *de; list *l; int j; @@ -808,13 +808,11 @@ void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout, robj } listAddNodeTail(l,c); } - - /* Mark the client as a blocked client */ - c->flags |= REDIS_BLOCKED; - server.bpop_blocked_clients++; + blockClient(c,REDIS_BLOCKED_LIST); } -/* Unblock a client that's waiting in a blocking operation such as BLPOP */ +/* Unblock a client that's waiting in a blocking operation such as BLPOP. + * You should never call this function directly, but unblockClient() instead. 
*/ void unblockClientWaitingData(redisClient *c) { dictEntry *de; dictIterator *di; @@ -842,10 +840,6 @@ void unblockClientWaitingData(redisClient *c) { decrRefCount(c->bpop.target); c->bpop.target = NULL; } - c->flags &= ~REDIS_BLOCKED; - c->flags |= REDIS_UNBLOCKED; - server.bpop_blocked_clients--; - listAddNodeTail(server.unblocked_clients,c); } /* If the specified key has clients blocked waiting for list pushes, this @@ -1000,10 +994,10 @@ void handleClientsBlockedOnLists(void) { if (value) { /* Protect receiver->bpop.target, that will be - * freed by the next unblockClientWaitingData() + * freed by the next unblockClient() * call. */ if (dstkey) incrRefCount(dstkey); - unblockClientWaitingData(receiver); + unblockClient(receiver); if (serveClientBlockedOnList(receiver, rl->key,dstkey,rl->db,value, @@ -1036,32 +1030,14 @@ void handleClientsBlockedOnLists(void) { } } -int getTimeoutFromObjectOrReply(redisClient *c, robj *object, time_t *timeout) { - long tval; - - if (getLongFromObjectOrReply(c,object,&tval, - "timeout is not an integer or out of range") != REDIS_OK) - return REDIS_ERR; - - if (tval < 0) { - addReplyError(c,"timeout is negative"); - return REDIS_ERR; - } - - if (tval > 0) tval += server.unixtime; - *timeout = tval; - - return REDIS_OK; -} - /* Blocking RPOP/LPOP */ void blockingPopGenericCommand(redisClient *c, int where) { robj *o; - time_t timeout; + mstime_t timeout; int j; - if (getTimeoutFromObjectOrReply(c,c->argv[c->argc-1],&timeout) != REDIS_OK) - return; + if (getTimeoutFromObjectOrReply(c,c->argv[c->argc-1],&timeout,UNIT_SECONDS) + != REDIS_OK) return; for (j = 1; j < c->argc-1; j++) { o = lookupKeyWrite(c->db,c->argv[j]); @@ -1120,10 +1096,10 @@ void brpopCommand(redisClient *c) { } void brpoplpushCommand(redisClient *c) { - time_t timeout; + mstime_t timeout; - if (getTimeoutFromObjectOrReply(c,c->argv[3],&timeout) != REDIS_OK) - return; + if (getTimeoutFromObjectOrReply(c,c->argv[3],&timeout,UNIT_SECONDS) + != REDIS_OK) return; robj 
*key = lookupKeyWrite(c->db, c->argv[1]); From 5f743cc4f8e7cbacf6c3a6a8be749fad5bdaa17f Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 3 Dec 2013 18:03:15 +0100 Subject: [PATCH 0356/2500] blocked.c API commented. --- src/blocked.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/blocked.c b/src/blocked.c index 3f4dd6e8d..667b75a62 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -26,6 +26,39 @@ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. + * + * --------------------------------------------------------------------------- + * + * API: + * + * getTimeoutFromObjectOrReply() is just a utility function to parse a + * timeout argument since blocking operations usually require a timeout. + * + * blockClient() sets the REDIS_BLOCKED flag in the client, and sets the + * specified block type 'btype' field to one of REDIS_BLOCKED_* macros. + * + * unblockClient() unblocks the client doing the following: + * 1) It calls the btype-specific function to cleanup the state. + * 2) It unblocks the client by unsetting the REDIS_BLOCKED flag. + * 3) It puts the client into a list of just unblocked clients that are + * processed ASAP in the beforeSleep() event loop callback, so that + * if there is some query buffer to process, we do it. This is also + * required because otherwise there is no 'readable' event fired, we + * already read the pending commands. We also set the REDIS_UNBLOCKED + * flag to remember the client is in the unblocked_clients list. + * + * processUnblockedClients() is called inside the beforeSleep() function + * to process the query buffer from unblocked clients and remove the clients + * from the unblocked_clients list.
+ * + * replyToBlockedClientTimedOut() is called by the cron function when + * a blocked client reaches the specified timeout (if the timeout is set + * to 0, no timeout is processed). + * It usually just needs to send a reply to the client. + * + * When implementing a new type of blocking operation, the implementation + * should modify unblockClient() and replyToBlockedClientTimedOut() in order + * to handle the btype-specific behavior of these two functions. */ #include "redis.h" From a7ebb0c7bf401e89e6cc7b46ea3106e397df7291 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 4 Dec 2013 15:52:20 +0100 Subject: [PATCH 0357/2500] WAIT command: synchronous replication for Redis. --- src/blocked.c | 4 ++ src/networking.c | 1 + src/redis.c | 29 +++++++++- src/redis.h | 10 +++- src/replication.c | 137 +++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 177 insertions(+), 4 deletions(-) diff --git a/src/blocked.c b/src/blocked.c index 667b75a62..4cd632bd3 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -130,6 +130,8 @@ void processUnblockedClients(void) { void unblockClient(redisClient *c) { if (c->btype == REDIS_BLOCKED_LIST) { unblockClientWaitingData(c); + } else if (c->btype == REDIS_BLOCKED_WAIT) { + unblockClientWaitingReplicas(c); } else { redisPanic("Unknown btype in unblockClient()."); } @@ -147,6 +149,8 @@ void unblockClient(redisClient *c) { void replyToBlockedClientTimedOut(redisClient *c) { if (c->btype == REDIS_BLOCKED_LIST) { addReply(c,shared.nullmultibulk); + } else if (c->btype == REDIS_BLOCKED_WAIT) { + addReplyLongLong(c,replicationCountAcksByOffset(c->bpop.reploffset)); } else { redisPanic("Unknown btype in replyToBlockedClientTimedOut()."); } diff --git a/src/networking.c b/src/networking.c index 7ed8d2c09..fd8ab3c16 100644 --- a/src/networking.c +++ b/src/networking.c @@ -114,6 +114,7 @@ redisClient *createClient(int fd) { c->bpop.target = NULL; c->bpop.numreplicas = 0; c->bpop.reploffset = 0; + c->woff = 0; c->watched_keys =
listCreate(); c->pubsub_channels = dictCreate(&setDictType,NULL); c->pubsub_patterns = listCreate(); diff --git a/src/redis.c b/src/redis.c index f29397769..72c9f661f 100644 --- a/src/redis.c +++ b/src/redis.c @@ -263,7 +263,8 @@ struct redisCommand redisCommandTable[] = { {"script",scriptCommand,-2,"ras",0,NULL,0,0,0,0,0}, {"time",timeCommand,1,"rR",0,NULL,0,0,0,0,0}, {"bitop",bitopCommand,-4,"wm",0,NULL,2,-1,1,0,0}, - {"bitcount",bitcountCommand,-2,"r",0,NULL,1,1,1,0,0} + {"bitcount",bitcountCommand,-2,"r",0,NULL,1,1,1,0,0}, + {"wait",waitCommand,3,"rs",0,NULL,0,0,0,0,0} }; /*============================ Utility functions ============================ */ @@ -1200,8 +1201,29 @@ void beforeSleep(struct aeEventLoop *eventLoop) { if (server.active_expire_enabled && server.masterhost == NULL) activeExpireCycle(ACTIVE_EXPIRE_CYCLE_FAST); + /* Send all the slaves an ACK request if at least one client blocked + * during the previous event loop iteration. */ + if (server.get_ack_from_slaves) { + robj *argv[3]; + + argv[0] = createStringObject("REPLCONF",8); + argv[1] = createStringObject("GETACK",6); + argv[2] = createStringObject("*",1); /* Not used argument. */ + replicationFeedSlaves(server.slaves, server.slaveseldb, argv, 3); + decrRefCount(argv[0]); + decrRefCount(argv[1]); + decrRefCount(argv[2]); + server.get_ack_from_slaves = 0; + } + + /* Unblock all the clients blocked for synchronous replication + * in WAIT. */ + if (listLength(server.clients_waiting_acks)) + processClientsWaitingReplicas(); + /* Try to process pending commands for clients that were just unblocked. */ - processUnblockedClients(); + if (listLength(server.unblocked_clients)) + processUnblockedClients(); /* Write the AOF buffer on disk */ flushAppendOnlyFile(0); @@ -1557,6 +1579,8 @@ void initServer() { server.slaveseldb = -1; /* Force to emit the first SELECT command. 
*/ server.unblocked_clients = listCreate(); server.ready_keys = listCreate(); + server.clients_waiting_acks = listCreate(); + server.get_ack_from_slaves = 0; createSharedObjects(); adjustOpenFilesLimit(); @@ -2079,6 +2103,7 @@ int processCommand(redisClient *c) { addReply(c,shared.queued); } else { call(c,REDIS_CALL_FULL); + c->woff = server.master_repl_offset; if (listLength(server.ready_keys)) handleClientsBlockedOnLists(); } diff --git a/src/redis.h b/src/redis.h index bf968e3c7..a9b06838b 100644 --- a/src/redis.h +++ b/src/redis.h @@ -495,7 +495,8 @@ typedef struct redisClient { int slave_listening_port; /* As configured with: SLAVECONF listening-port */ multiState mstate; /* MULTI/EXEC state */ int btype; /* Type of blocking op if REDIS_BLOCKED. */ - blockingState bpop; /* blocking state */ + blockingState bpop; /* blocking state */ + long long woff; /* Last write global replication offset. */ list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */ dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */ list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */ @@ -750,6 +751,9 @@ struct redisServer { dict *repl_scriptcache_dict; /* SHA1 all slaves are aware of. */ list *repl_scriptcache_fifo; /* First in, first out LRU eviction. */ int repl_scriptcache_size; /* Max number of elements. */ + /* Synchronous replication. */ + list *clients_waiting_acks; /* Clients waiting in WAIT command. */ + int get_ack_from_slaves; /* If true we send REPLCONF GETACK. 
*/ /* Limits */ unsigned int maxclients; /* Max number of simultaneous clients */ unsigned long long maxmemory; /* Max number of memory bytes to use */ @@ -1063,6 +1067,9 @@ void replicationScriptCacheInit(void); void replicationScriptCacheFlush(void); void replicationScriptCacheAdd(sds sha1); int replicationScriptCacheExists(sds sha1); +void processClientsWaitingReplicas(void); +void unblockClientWaitingReplicas(redisClient *c); +int replicationCountAcksByOffset(long long offset); /* Generic persistence functions */ void startLoading(FILE *fp); @@ -1398,6 +1405,7 @@ void timeCommand(redisClient *c); void bitopCommand(redisClient *c); void bitcountCommand(redisClient *c); void replconfCommand(redisClient *c); +void waitCommand(redisClient *c); #if defined(__GNUC__) void *calloc(size_t count, size_t size) __attribute__ ((deprecated)); diff --git a/src/replication.c b/src/replication.c index 7357ae5ec..c555d1d3a 100644 --- a/src/replication.c +++ b/src/replication.c @@ -39,6 +39,7 @@ void replicationDiscardCachedMaster(void); void replicationResurrectCachedMaster(int newfd); +void replicationSendAck(void); /* ---------------------------------- MASTER -------------------------------- */ @@ -560,6 +561,11 @@ void replconfCommand(redisClient *c) { c->repl_ack_time = server.unixtime; /* Note: this command does not reply anything! */ return; + } else if (!strcasecmp(c->argv[j]->ptr,"getack")) { + /* REPLCONF GETACK is used in order to request an ACK ASAP + * to the slave. */ + if (server.masterhost && server.master) replicationSendAck(); + /* Note: this command does not reply anything! 
*/ } else { addReplyErrorFormat(c,"Unrecognized REPLCONF option: %s", (char*)c->argv[j]->ptr); @@ -1495,7 +1501,136 @@ int replicationScriptCacheExists(sds sha1) { return dictFind(server.repl_scriptcache_dict,sha1) != NULL; } -/* --------------------------- REPLICATION CRON ----------------------------- */ +/* ----------------------- SYNCHRONOUS REPLICATION -------------------------- + * Redis synchronous replication design can be summarized in points: + * + * - Redis masters have a global replication offset, used by PSYNC. + * - Masters increment the offset every time new commands are sent to slaves. + * - Slaves ping back masters with the offset processed so far. + * + * So synchronous replication adds a new WAIT command in the form: + * + * WAIT <num_replicas> <milliseconds> + * + * That returns the number of replicas that processed the query when + * we finally have at least num_replicas, or when the timeout was + * reached. + * + * The command is implemented in this way: + * + * - Every time a client processes a command, we remember the replication + * offset after sending that command to the slaves. + * - When WAIT is called, we ask slaves to send an acknowledgement ASAP. + * The client is blocked at the same time (see blocked.c). + * - Once we receive enough ACKs for a given offset or when the timeout + * is reached, the WAIT command is unblocked and the reply sent to the + * client. + */ + +/* This just sets a flag so that we broadcast a REPLCONF GETACK command + * to all the slaves in the beforeSleep() function. Note that this way + * we "group" all the clients that want to wait for synchronous replication + * in a given event loop iteration, and send a single GETACK for them all. */ +void replicationRequestAckFromSlaves(void) { + server.get_ack_from_slaves = 1; +} + +/* Return the number of slaves that already acknowledged the specified + * replication offset.
*/ +int replicationCountAcksByOffset(long long offset) { + listIter li; + listNode *ln; + int count = 0; + + listRewind(server.slaves,&li); + while((ln = listNext(&li))) { + redisClient *slave = ln->value; + + if (slave->replstate != REDIS_REPL_ONLINE) continue; + if (slave->repl_ack_off >= offset) count++; + } + return count; +} + +/* WAIT for N replicas to acknowledge the processing of our latest + * write command (and all the previous commands). */ +void waitCommand(redisClient *c) { + mstime_t timeout; + long numreplicas, ackreplicas; + long long offset = c->woff; + + /* Argument parsing. */ + if (getLongFromObjectOrReply(c,c->argv[1],&numreplicas,NULL) != REDIS_OK) + return; + if (getTimeoutFromObjectOrReply(c,c->argv[2],&timeout,UNIT_MILLISECONDS) + != REDIS_OK) return; + + /* First try without blocking at all. */ + ackreplicas = replicationCountAcksByOffset(c->woff); + if (ackreplicas >= numreplicas || c->flags & REDIS_MULTI) { + addReplyLongLong(c,ackreplicas); + return; + } + + /* Otherwise block the client and put it into our list of clients + * waiting for ack from slaves. */ + c->bpop.timeout = timeout; + c->bpop.reploffset = offset; + c->bpop.numreplicas = numreplicas; + listAddNodeTail(server.clients_waiting_acks,c); + blockClient(c,REDIS_BLOCKED_WAIT); + + /* Make sure that the server will send an ACK request to all the slaves + * before returning to the event loop. */ + replicationRequestAckFromSlaves(); +} + +/* This is called by unblockClient() to perform the blocking op type + * specific cleanup. We just remove the client from the list of clients + * waiting for replica acks. Never call it directly, call unblockClient() + * instead. */ +void unblockClientWaitingReplicas(redisClient *c) { + listNode *ln = listSearchKey(server.clients_waiting_acks,c); + redisAssert(ln != NULL); + listDelNode(server.clients_waiting_acks,ln); +} + +/* Check if there are clients blocked in WAIT that can be unblocked since + * we received enough ACKs from slaves. 
*/ +void processClientsWaitingReplicas(void) { + long long last_offset = 0; + int last_numreplicas = 0; + + listIter li; + listNode *ln; + + listRewind(server.clients_waiting_acks,&li); + while((ln = listNext(&li))) { + redisClient *c = ln->value; + + /* Every time we find a client that is satisfied for a given + * offset and number of replicas, we remember it so the next client + * may be unblocked without calling replicationCountAcksByOffset() + * if the requested offset / replicas were equal or less. */ + if (last_offset && last_offset > c->bpop.reploffset && + last_numreplicas > c->bpop.numreplicas) + { + unblockClient(c); + addReplyLongLong(c,last_numreplicas); + } else { + int numreplicas = replicationCountAcksByOffset(c->bpop.reploffset); + + if (numreplicas >= c->bpop.numreplicas) { + last_offset = c->bpop.reploffset; + last_numreplicas = numreplicas; + unblockClient(c); + addReplyLongLong(c,numreplicas); + } + } + } +} + +/* --------------------------- REPLICATION CRON ---------------------------- */ /* Replication cron funciton, called 1 time per second. */ void replicationCron(void) { From faa5495eea4237cbbdafbaada3bb49b276787963 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 5 Dec 2013 14:55:07 +0100 Subject: [PATCH 0358/2500] Fix clients timeout handling. During the refactoring of blocking operations, commit 83e363d3e67c27865d7679c27f466c5e12b3d4ee, a bug was introduced where a milliseconds time is compared to a seconds time, so all the clients always appear to timeout if timeout is set to non-zero value. Thanks to Jonathan Leibiusky for finding the bug and helping verifying the cause and fix. --- src/redis.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/redis.c b/src/redis.c index 72c9f661f..9c92aaf33 100644 --- a/src/redis.c +++ b/src/redis.c @@ -872,7 +872,7 @@ long long getOperationsPerSecond(void) { /* Check for timeouts. 
Returns non-zero if the client was terminated */ int clientsCronHandleTimeout(redisClient *c) { - mstime_t now = mstime(); + time_t now = server.unixtime; if (server.maxidletime && !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */ @@ -886,7 +886,12 @@ int clientsCronHandleTimeout(redisClient *c) { freeClient(c); return 1; } else if (c->flags & REDIS_BLOCKED) { - if (c->bpop.timeout != 0 && c->bpop.timeout < now) { + /* Blocked OPS timeout is handled with milliseconds resolution. + * However note that the actual resolution is limited by + * server.hz. */ + mstime_t now_ms = mstime(); + + if (c->bpop.timeout != 0 && c->bpop.timeout < now_ms) { replyToBlockedClientTimedOut(c); unblockClient(c); } From 6763faef5812c151e28a19a231e2bb37971e6c79 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 5 Dec 2013 16:28:35 +0100 Subject: [PATCH 0359/2500] Fixed typos in redis.conf file. --- redis.conf | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/redis.conf b/redis.conf index abe74bfc4..bd2d6feeb 100644 --- a/redis.conf +++ b/redis.conf @@ -34,7 +34,7 @@ port 6379 # bind 192.168.1.100 10.0.0.1 # bind 127.0.0.1 -# Specify the path for the unix socket that will be used to listen for +# Specify the path for the Unix socket that will be used to listen for # incoming connections. There is no default, so Redis will not listen # on a unix socket when not specified. # @@ -68,7 +68,7 @@ tcp-keepalive 0 # warning (only very important / critical messages are logged) loglevel notice -# Specify the log file name. Also the emptry string can be used to force +# Specify the log file name. Also the empty string can be used to force # Redis to log on the standard output. Note that if you use standard # output for logging but daemonize, logs will be sent to /dev/null logfile "" @@ -116,9 +116,9 @@ save 60 10000 # By default Redis will stop accepting writes if RDB snapshots are enabled # (at least one save point) and the latest background save failed. 
-# This will make the user aware (in an hard way) that data is not persisting +# This will make the user aware (in a hard way) that data is not persisting # on disk properly, otherwise chances are that no one will notice and some -# distater will happen. +# disaster will happen. # # If the background saving process will start working again Redis will # automatically allow writes again. @@ -263,7 +263,7 @@ repl-disable-tcp-nodelay no # # A slave with a low priority number is considered better for promotion, so # for instance if there are three slaves with priority 10, 100, 25 Sentinel will -# pick the one wtih priority 10, that is the lowest. +# pick the one with priority 10, that is the lowest. # # However a special priority of 0 marks the slave as not able to perform the # role of master, so a slave with priority of 0 will never be selected by @@ -351,7 +351,7 @@ slave-priority 100 # to reply to read-only commands like GET. # # This option is usually useful when using Redis as an LRU cache, or to set -# an hard memory limit for an instance (using the 'noeviction' policy). +# a hard memory limit for an instance (using the 'noeviction' policy). # # WARNING: If you have slaves attached to an instance with maxmemory on, # the size of the output buffers needed to feed the slaves are subtracted @@ -632,7 +632,7 @@ zset-max-ziplist-value 64 # Active rehashing uses 1 millisecond every 100 milliseconds of CPU time in # order to help rehashing the main Redis hash table (the one mapping top-level # keys to values). The hash table implementation Redis uses (see dict.c) -# performs a lazy rehashing: the more operation you run into an hash table +# performs a lazy rehashing: the more operation you run into a hash table # that is rehashing, the more rehashing "steps" are performed, so if the # server is idle the rehashing is never complete and some more memory is used # by the hash table. 
@@ -687,7 +687,7 @@ client-output-buffer-limit slave 256mb 64mb 60 client-output-buffer-limit pubsub 32mb 8mb 60 # Redis calls an internal function to perform many background tasks, like -# closing connections of clients in timeot, purging expired keys that are +# closing connections of clients in timeout, purging expired keys that are # never requested, and so forth. # # Not all tasks are performed with the same frequency, but Redis checks for From 7a5a646df9db18190c209c4797f38216c77c5f71 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 5 Dec 2013 16:35:32 +0100 Subject: [PATCH 0360/2500] Fixed grammar: before H the article is a, not an. --- src/ae_epoll.c | 2 +- src/aof.c | 2 +- src/cluster.c | 8 ++++---- src/db.c | 10 +++++----- src/dict.c | 4 ++-- src/networking.c | 2 +- src/object.c | 2 +- src/rdb.c | 2 +- src/redis-check-dump.c | 2 +- src/redis.c | 2 +- src/replication.c | 2 +- src/scripting.c | 4 ++-- src/sds.c | 2 +- src/sentinel.c | 6 +++--- src/sort.c | 4 ++-- src/t_list.c | 4 ++-- src/t_zset.c | 2 +- src/zipmap.c | 2 +- src/zmalloc.c | 2 +- 19 files changed, 32 insertions(+), 32 deletions(-) diff --git a/src/ae_epoll.c b/src/ae_epoll.c index 41af3e874..da9c7b906 100644 --- a/src/ae_epoll.c +++ b/src/ae_epoll.c @@ -45,7 +45,7 @@ static int aeApiCreate(aeEventLoop *eventLoop) { zfree(state); return -1; } - state->epfd = epoll_create(1024); /* 1024 is just an hint for the kernel */ + state->epfd = epoll_create(1024); /* 1024 is just a hint for the kernel */ if (state->epfd == -1) { zfree(state->events); zfree(state); diff --git a/src/aof.c b/src/aof.c index 8cbdec889..c32b32e4b 100644 --- a/src/aof.c +++ b/src/aof.c @@ -772,7 +772,7 @@ int rewriteSortedSetObject(rio *r, robj *key, robj *o) { return 1; } -/* Write either the key or the value of the currently selected item of an hash. +/* Write either the key or the value of the currently selected item of a hash. * The 'hi' argument passes a valid Redis hash iterator. 
* The 'what' filed specifies if to write a key or a value and can be * either REDIS_HASH_KEY or REDIS_HASH_VALUE. diff --git a/src/cluster.c b/src/cluster.c index 248b527d5..59a72065a 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -639,7 +639,7 @@ void clusterRenameNode(clusterNode *node, char *newname) { * in the cluster without dealing with the problem if other nodes re-adding * back the node to nodes we already sent the FORGET command to. * - * The data structure used is an hash table with an sds string representing + * The data structure used is a hash table with an sds string representing * the node ID as key, and the time when it is ok to re-add the node as * value. * -------------------------------------------------------------------------- */ @@ -853,7 +853,7 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { } } else { /* If it's not in NOADDR state and we don't have it, we - * start an handshake process against this IP/PORT pairs. + * start a handshake process against this IP/PORT pairs. * * Note that we require that the sender of this gossip message * is a well known node in our cluster, otherwise we risk @@ -1128,7 +1128,7 @@ int clusterProcessPacket(clusterLink *link) { } /* First thing to do is replacing the random name with the - * right node name if this was an handshake stage. */ + * right node name if this was a handshake stage. */ clusterRenameNode(link->node, hdr->sender); redisLog(REDIS_DEBUG,"Handshake with node %.40s completed.", link->node->name); @@ -1928,7 +1928,7 @@ void clusterCron(void) { iteration++; /* Number of times this function was called so far. */ - /* The handshake timeout is the time after which an handshake node that was + /* The handshake timeout is the time after which a handshake node that was * not turned into a normal node is removed from the nodes. Usually it is * just the NODE_TIMEOUT value, but when NODE_TIMEOUT is too small we use * the value of 1 second. 
*/ diff --git a/src/db.c b/src/db.c index 1bdcf9e83..be71d2592 100644 --- a/src/db.c +++ b/src/db.c @@ -370,7 +370,7 @@ int parseScanCursorOrReply(redisClient *c, robj *o, unsigned long *cursor) { } /* This command implements SCAN, HSCAN and SSCAN commands. - * If object 'o' is passed, then it must be an Hash or Set object, otherwise + * If object 'o' is passed, then it must be a Hash or Set object, otherwise * if 'o' is NULL the command will operate on the dictionary associated with * the current database. * @@ -378,7 +378,7 @@ int parseScanCursorOrReply(redisClient *c, robj *o, unsigned long *cursor) { * the client arguments vector is a key so it skips it before iterating * in order to parse options. * - * In the case of an Hash object the function returns both the field and value + * In the case of a Hash object the function returns both the field and value * of every element on the Hash. */ void scanGenericCommand(redisClient *c, robj *o, unsigned long cursor) { int rv; @@ -433,12 +433,12 @@ void scanGenericCommand(redisClient *c, robj *o, unsigned long cursor) { /* Step 2: Iterate the collection. * * Note that if the object is encoded with a ziplist, intset, or any other - * representation that is not an hash table, we are sure that it is also + * representation that is not a hash table, we are sure that it is also * composed of a small number of elements. So to avoid taking state we * just return everything inside the object in a single call, setting the * cursor to zero to signal the end of the iteration. */ - /* Handle the case of an hash table. */ + /* Handle the case of a hash table. 
*/ ht = NULL; if (o == NULL) { ht = c->db->dict; @@ -520,7 +520,7 @@ void scanGenericCommand(redisClient *c, robj *o, unsigned long cursor) { listDelNode(keys, node); } - /* If this is an hash or a sorted set, we have a flat list of + /* If this is a hash or a sorted set, we have a flat list of * key-value elements, so if this element was filtered, remove the * value, or skip it if it was not filtered: we only match keys. */ if (o && (o->type == REDIS_ZSET || o->type == REDIS_HASH)) { diff --git a/src/dict.c b/src/dict.c index bd54a5022..c6ba2b745 100644 --- a/src/dict.c +++ b/src/dict.c @@ -53,7 +53,7 @@ * around when there is a child performing saving operations. * * Note that even when dict_can_resize is set to 0, not all resizes are - * prevented: an hash table is still allowed to grow if the ratio between + * prevented: a hash table is still allowed to grow if the ratio between * the number of elements and the buckets > dict_force_resize_ratio. */ static int dict_can_resize = 1; static unsigned int dict_force_resize_ratio = 5; @@ -853,7 +853,7 @@ static unsigned long _dictNextPower(unsigned long size) } /* Returns the index of a free slot that can be populated with - * an hash entry for the given 'key'. + * a hash entry for the given 'key'. * If the key already exists, -1 is returned. * * Note that if we are in the process of rehashing the hash table, the diff --git a/src/networking.c b/src/networking.c index fd8ab3c16..2af8915a2 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1173,7 +1173,7 @@ void getClientsMaxBuffers(unsigned long *longest_output_list, *biggest_input_buffer = bib; } -/* This is an helper function for getClientPeerId(). +/* This is a helper function for getClientPeerId(). * It writes the specified ip/port to "peerid" as a null termiated string * in the form ip:port if ip does not contain ":" itself, otherwise * [ip]:port format is used (for IPv6 addresses basically). 
*/ diff --git a/src/object.c b/src/object.c index bfbd3ba1a..d6efcd8c2 100644 --- a/src/object.c +++ b/src/object.c @@ -659,7 +659,7 @@ unsigned long estimateObjectIdleTime(robj *o) { } } -/* This is an helper function for the DEBUG command. We need to lookup keys +/* This is a helper function for the DEBUG command. We need to lookup keys * without any modification of LRU or other parameters. */ robj *objectCommandLookup(redisClient *c, robj *key) { dictEntry *de; diff --git a/src/rdb.c b/src/rdb.c index f04f6defd..bc0957c2f 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -891,7 +891,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { o = createHashObject(); - /* Too many entries? Use an hash table. */ + /* Too many entries? Use a hash table. */ if (len > server.hash_max_ziplist_entries) hashTypeConvert(o, REDIS_ENCODING_HT); diff --git a/src/redis-check-dump.c b/src/redis-check-dump.c index d09527781..4b0c1300f 100644 --- a/src/redis-check-dump.c +++ b/src/redis-check-dump.c @@ -60,7 +60,7 @@ #define REDIS_ENCODING_RAW 0 /* Raw representation */ #define REDIS_ENCODING_INT 1 /* Encoded as integer */ #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */ -#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */ +#define REDIS_ENCODING_HT 3 /* Encoded as a hash table */ /* Object types only used for dumping to disk */ #define REDIS_EXPIRETIME_MS 252 diff --git a/src/redis.c b/src/redis.c index 9c92aaf33..47ec4ff26 100644 --- a/src/redis.c +++ b/src/redis.c @@ -377,7 +377,7 @@ void exitFromChild(int retcode) { /*====================== Hash table type implementation ==================== */ -/* This is an hash table type that uses the SDS dynamic strings library as +/* This is a hash table type that uses the SDS dynamic strings library as * keys and radis objects as values (objects can hold SDS strings, * lists, sets). 
*/ diff --git a/src/replication.c b/src/replication.c index c555d1d3a..1e87eb185 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1422,7 +1422,7 @@ void refreshGoodSlavesCount(void) { * connected slave, in order to be able to replicate EVALSHA as it is without * translating it to EVAL every time it is possible. * - * We use a capped collection implemented by an hash table for fast lookup + * We use a capped collection implemented by a hash table for fast lookup * of scripts we can send as EVALSHA, plus a linked list that is used for * eviction of the oldest entry when the max number of items is reached. * diff --git a/src/scripting.c b/src/scripting.c index ac1a913f0..e2ce60765 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -895,7 +895,7 @@ void evalGenericCommand(redisClient *c, int evalsha) { /* Select the right DB in the context of the Lua client */ selectDb(server.lua_client,c->db->id); - /* Set an hook in order to be able to stop the script execution if it + /* Set a hook in order to be able to stop the script execution if it * is running for too much time. * We set the hook only if the time limit is enabled as the hook will * make the Lua script execution slower. */ @@ -1059,7 +1059,7 @@ void scriptCommand(redisClient *c) { if (server.lua_caller == NULL) { addReplySds(c,sdsnew("-NOTBUSY No scripts in execution right now.\r\n")); } else if (server.lua_write_dirty) { - addReplySds(c,sdsnew("-UNKILLABLE Sorry the script already executed write commands against the dataset. You can either wait the script termination or kill the server in an hard way using the SHUTDOWN NOSAVE command.\r\n")); + addReplySds(c,sdsnew("-UNKILLABLE Sorry the script already executed write commands against the dataset. 
You can either wait the script termination or kill the server in a hard way using the SHUTDOWN NOSAVE command.\r\n")); } else { server.lua_kill = 1; addReply(c,shared.ok); diff --git a/src/sds.c b/src/sds.c index d66c1d730..338b4262a 100644 --- a/src/sds.c +++ b/src/sds.c @@ -582,7 +582,7 @@ int is_hex_digit(char c) { (c >= 'A' && c <= 'F'); } -/* Helper function for sdssplitargs() that converts an hex digit into an +/* Helper function for sdssplitargs() that converts a hex digit into an * integer from 0 to 15 */ int hex_digit_to_int(char c) { switch(c) { diff --git a/src/sentinel.c b/src/sentinel.c index a936e6851..1d61ac320 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -133,7 +133,7 @@ typedef struct sentinelRedisInstance { if the link is idle and must be reconnected. */ mstime_t last_pub_time; /* Last time we sent hello via Pub/Sub. */ mstime_t last_hello_time; /* Only used if SRI_SENTINEL is set. Last time - we received an hello from this Sentinel + we received a hello from this Sentinel via Pub/Sub. */ mstime_t last_master_down_reply_time; /* Time of last reply to SENTINEL is-master-down command. */ @@ -1490,7 +1490,7 @@ void sentinelFlushConfig(void) { /* ====================== hiredis connection handling ======================= */ -/* Completely disconnect an hiredis link from an instance. */ +/* Completely disconnect a hiredis link from an instance. */ void sentinelKillLink(sentinelRedisInstance *ri, redisAsyncContext *c) { if (ri->cc == c) { ri->cc = NULL; @@ -1502,7 +1502,7 @@ void sentinelKillLink(sentinelRedisInstance *ri, redisAsyncContext *c) { redisAsyncFree(c); } -/* This function takes an hiredis context that is in an error condition +/* This function takes a hiredis context that is in an error condition * and make sure to mark the instance as disconnected performing the * cleanup needed. 
* diff --git a/src/sort.c b/src/sort.c index ebdf5469c..d90cf2c2d 100644 --- a/src/sort.c +++ b/src/sort.c @@ -48,8 +48,8 @@ redisSortOperation *createSortOperation(int type, robj *pattern) { * 1) The first occurrence of '*' in 'pattern' is substituted with 'subst'. * * 2) If 'pattern' matches the "->" string, everything on the left of - * the arrow is treated as the name of an hash field, and the part on the - * left as the key name containing an hash. The value of the specified + * the arrow is treated as the name of a hash field, and the part on the + * left as the key name containing a hash. The value of the specified * field is returned. * * 3) If 'pattern' equals "#", the function simply returns 'subst' itself so diff --git a/src/t_list.c b/src/t_list.c index 555cb31e9..216e071ba 100644 --- a/src/t_list.c +++ b/src/t_list.c @@ -844,7 +844,7 @@ void unblockClientWaitingData(redisClient *c) { /* If the specified key has clients blocked waiting for list pushes, this * function will put the key reference into the server.ready_keys list. - * Note that db->ready_keys is an hash table that allows us to avoid putting + * Note that db->ready_keys is a hash table that allows us to avoid putting * the same key again and again in the list in case of multiple pushes * made by a script or in the context of MULTI/EXEC. * @@ -872,7 +872,7 @@ void signalListAsReady(redisClient *c, robj *key) { redisAssert(dictAdd(c->db->ready_keys,key,NULL) == DICT_OK); } -/* This is an helper function for handleClientsBlockedOnLists(). It's work +/* This is a helper function for handleClientsBlockedOnLists(). It's work * is to serve a specific client (receiver) that is blocked on 'key' * in the context of the specified 'db', doing the following: * diff --git a/src/t_zset.c b/src/t_zset.c index c3bcbb88f..d5f6023a2 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -36,7 +36,7 @@ * in order to get O(log(N)) INSERT and REMOVE operations into a sorted * data structure. 
* - * The elements are added to an hash table mapping Redis objects to scores. + * The elements are added to a hash table mapping Redis objects to scores. * At the same time the elements are added to a skip list mapping scores * to Redis objects (so objects are sorted by scores in this "view"). */ diff --git a/src/zipmap.c b/src/zipmap.c index 3cd56e5ff..140126a71 100644 --- a/src/zipmap.c +++ b/src/zipmap.c @@ -4,7 +4,7 @@ * efficient. * * The Redis Hash type uses this data structure for hashes composed of a small - * number of elements, to switch to an hash table once a given number of + * number of elements, to switch to a hash table once a given number of * elements is reached. * * Given that many times Redis Hashes are used to represent objects composed diff --git a/src/zmalloc.c b/src/zmalloc.c index 210425828..e7e97aa67 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -176,7 +176,7 @@ void *zrealloc(void *ptr, size_t size) { } /* Provide zmalloc_size() for systems where this function is not provided by - * malloc itself, given that in that case we store an header with this + * malloc itself, given that in that case we store a header with this * information as the first bytes of every allocation. */ #ifndef HAVE_MALLOC_SIZE size_t zmalloc_size(void *ptr) { From 6232143904de3b6c6b6762a8dcac274b03392329 Mon Sep 17 00:00:00 2001 From: Anurag Ramdasan Date: Thu, 5 Dec 2013 21:09:31 +0530 Subject: [PATCH 0361/2500] Fixed grammar: 'usually' to 'usual' --- redis.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/redis.conf b/redis.conf index bd2d6feeb..0363bd7d9 100644 --- a/redis.conf +++ b/redis.conf @@ -125,7 +125,7 @@ save 60 10000 # # However if you have setup your proper monitoring of the Redis server # and persistence, you may want to disable this feature so that Redis will -# continue to work as usually even if there are problems with disk, +# continue to work as usual even if there are problems with disk, # permissions, and so forth. 
stop-writes-on-bgsave-error yes From f416ddc327555c235c2afafe03d3ec51d8bdbf71 Mon Sep 17 00:00:00 2001 From: Anurag Ramdasan Date: Thu, 5 Dec 2013 21:47:17 +0530 Subject: [PATCH 0362/2500] fixed typo --- redis.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/redis.conf b/redis.conf index 0363bd7d9..adfdfedd5 100644 --- a/redis.conf +++ b/redis.conf @@ -197,7 +197,7 @@ slave-serve-stale-data yes # Note: read only slaves are not designed to be exposed to untrusted clients # on the internet. It's just a protection layer against misuse of the instance. # Still a read only slave exports by default all the administrative commands -# such as CONFIG, DEBUG, and so forth. To a limited extend you can improve +# such as CONFIG, DEBUG, and so forth. To a limited extent you can improve # security of read only slaves using 'rename-command' to shadow all the # administrative / dangerous commands. slave-read-only yes From f775787f2e0533077d9fb06d9ffe039ffc83b927 Mon Sep 17 00:00:00 2001 From: Anurag Ramdasan Date: Thu, 5 Dec 2013 23:15:47 +0530 Subject: [PATCH 0363/2500] Grammar fix. --- CONTRIBUTING | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING b/CONTRIBUTING index 06e1e9660..f7b6836f7 100644 --- a/CONTRIBUTING +++ b/CONTRIBUTING @@ -22,7 +22,7 @@ each source file that you contribute. 1. Drop a message to the Redis Google Group with a proposal of semantics/API. -2. If in steps 1 you get an acknowledge from the project leaders, use the +2. If in step 1 you get an acknowledge from the project leaders, use the following procedure to submit a patch: a. Fork Redis on github ( http://help.github.com/fork-a-repo/ ) From 87d6939e79ea2d2d514f723a5aa4a780e99b2182 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 6 Dec 2013 10:48:40 +0100 Subject: [PATCH 0364/2500] Fixed typo in redis.conf. 
--- redis.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/redis.conf b/redis.conf index adfdfedd5..76461d6b1 100644 --- a/redis.conf +++ b/redis.conf @@ -343,7 +343,7 @@ slave-priority 100 # Don't use more memory than the specified amount of bytes. # When the memory limit is reached Redis will try to remove keys -# accordingly to the eviction policy selected (see maxmemmory-policy). +# according to the eviction policy selected (see maxmemory-policy). # # If Redis can't remove keys according to the policy, or if the policy is # set to 'noeviction', Redis will start to reply with errors to commands From 33ea91332929f43fd9205bd700d152c5ae599d11 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 6 Dec 2013 11:37:46 +0100 Subject: [PATCH 0365/2500] Sentinel: fix reported role fields when master is reset. When there is a master address switch, the reported role must be set to master so that we have a chance to re-sample the INFO output to check if the new address is reporting the right role. Otherwise if the role was wrong, it will be sensed as wrong even after the address switch, and for enough time according to the role change time, for Sentinel to consider the master SDOWN. This fixes issue #1446, that describes the effects of this bug in practice. --- src/sentinel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/sentinel.c b/src/sentinel.c index 1d61ac320..05649b249 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1131,6 +1131,8 @@ void sentinelResetMaster(sentinelRedisInstance *ri, int flags) { ri->slave_master_host = NULL; ri->last_avail_time = mstime(); ri->last_pong_time = mstime(); + ri->role_reported_time = mstime(); + ri->role_reported = SRI_MASTER; if (flags & SENTINEL_GENERATE_EVENT) sentinelEvent(REDIS_WARNING,"+reset-master",ri,"%@"); } From b6d79f34e8768ea8eccb8c3d752718d9d62a6d02 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 6 Dec 2013 12:46:56 +0100 Subject: [PATCH 0366/2500] Sentinel: fix reported role info sampling.
The way the role change was recorded was not sane and too convoluted, causing the role information to not always be updated. This commit fixes issue #1445. --- src/sentinel.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 05649b249..3bd1ee8c8 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1762,22 +1762,22 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { * Some things will not happen if sentinel.tilt is true, but some will * still be processed. */ + /* Remember when the role changed. */ + if (role != ri->role_reported) { + ri->role_reported_time = mstime(); + ri->role_reported = role; + if (role == SRI_SLAVE) ri->slave_conf_change_time = mstime(); + } + /* Handle master -> slave role switch. */ if ((ri->flags & SRI_MASTER) && role == SRI_SLAVE) { - if (ri->role_reported != SRI_SLAVE) { - ri->role_reported_time = mstime(); - ri->role_reported = SRI_SLAVE; - ri->slave_conf_change_time = mstime(); - } + /* Nothing to do, but masters claiming to be slaves are + * considered to be unreachable by Sentinel, so eventually + * a failover will be triggered. */ } /* Handle slave -> master role switch. */ if ((ri->flags & SRI_SLAVE) && role == SRI_MASTER) { - if (ri->role_reported != SRI_MASTER) { - ri->role_reported_time = mstime(); - ri->role_reported = SRI_MASTER; - } - /* If this is a promoted slave we can change state to the * failover state machine. */ if (!sentinel.tilt && From 834a5f530dffb748c8405c7cd5cc91e1c37c13f7 Mon Sep 17 00:00:00 2001 From: Yossi Gottlieb Date: Sun, 8 Dec 2013 12:57:03 +0200 Subject: [PATCH 0367/2500] Return proper error on requests with an unbalanced number of quotes.
--- src/networking.c | 5 +++++ tests/unit/protocol.tcl | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/src/networking.c b/src/networking.c index 2af8915a2..7ec5a5b9c 100644 --- a/src/networking.c +++ b/src/networking.c @@ -884,6 +884,11 @@ int processInlineBuffer(redisClient *c) { aux = sdsnewlen(c->querybuf,querylen); argv = sdssplitargs(aux,&argc); sdsfree(aux); + if (argv == NULL) { + addReplyError(c,"Protocol error: unbalanced quotes in request"); + setProtocolError(c,0); + return REDIS_ERR; + } /* Leave data after the first line of the query in the buffer */ sdsrange(c->querybuf,querylen+2,-1); diff --git a/tests/unit/protocol.tcl b/tests/unit/protocol.tcl index 1700e4892..ac99c3abb 100644 --- a/tests/unit/protocol.tcl +++ b/tests/unit/protocol.tcl @@ -60,6 +60,14 @@ start_server {tags {"protocol"}} { assert_error "*wrong*arguments*ping*" {r ping x y z} } + test "Unbalanced number of quotes" { + reconnect + r write "set \"\"\"test-key\"\"\" test-value\r\n" + r write "ping\r\n" + r flush + assert_error "*unbalanced*" {r read} + } + set c 0 foreach seq [list "\x00" "*\x00" "$\x00"] { incr c From ae81525d3527eda21d0272922c0f83109f01878e Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 9 Dec 2013 13:28:39 +0100 Subject: [PATCH 0368/2500] Handle inline requested terminated with just \n. 
--- src/networking.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/networking.c b/src/networking.c index 2af8915a2..e8283c272 100644 --- a/src/networking.c +++ b/src/networking.c @@ -865,11 +865,14 @@ void resetClient(redisClient *c) { } int processInlineBuffer(redisClient *c) { - char *newline = strstr(c->querybuf,"\r\n"); + char *newline; int argc, j; sds *argv, aux; size_t querylen; + /* Search for end of line */ + newline = strchr(c->querybuf,'\n'); + /* Nothing to do without a \r\n */ if (newline == NULL) { if (sdslen(c->querybuf) > REDIS_INLINE_MAX_SIZE) { @@ -879,6 +882,10 @@ int processInlineBuffer(redisClient *c) { return REDIS_ERR; } + /* Handle the \r\n case. */ + if (newline && newline != c->querybuf && *(newline-1) == '\r') + newline--; + /* Split the input buffer up to the \r\n */ querylen = newline-(c->querybuf); aux = sdsnewlen(c->querybuf,querylen); From 54a526687d1470b84efb85702b53207c2746b664 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 9 Dec 2013 13:32:44 +0100 Subject: [PATCH 0369/2500] Slaves heartbeat while loading RDB files. Starting with Redis 2.8 masters are able to detect timed out slaves, while before 2.8 only slaves were able to detect a timed out master. Now that timeout detection is bi-directional the following problem happens as described "in the field" by issue #1449: 1) Master and slave setup with big dataset. 2) Slave performs the first synchronization, or a full sync after a failed partial resync. 3) Master sends the RDB payload to the slave. 4) Slave loads this payload. 5) Master detects the slave as timed out since does not receive back the REPLCONF ACK acknowledges. Here the problem is that the master has no way to know how much the slave will take to load the RDB file in memory. The obvious solution is to use a greater replication timeout setting, but this is a shame since for the 0.1% of operation time we are forced to use a timeout that is not what is suited for 99.9% of operation time. 
This commit tries to fix this problem with a solution that is a bit of a hack, but that modifies little of the replication internals, in order to be back ported to 2.8 safely. During the RDB loading time, we send the master newlines to avoid being sensed as timed out. This is the same thing the master already does while saving the RDB file to still signal its presence to the slave. The single newline is used because: 1) It can't desync the protocol, as it is only transmitted all or nothing. 2) It can be safely sent while we don't have a client structure for the master or in similar situations just with write(2). --- src/networking.c | 6 ++++++ src/rdb.c | 13 ++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/networking.c b/src/networking.c index e8283c272..6ae4da5a2 100644 --- a/src/networking.c +++ b/src/networking.c @@ -892,6 +892,12 @@ int processInlineBuffer(redisClient *c) { argv = sdssplitargs(aux,&argc); sdsfree(aux); + /* Newline from slaves can be used to refresh the last ACK time. + * This is useful for a slave to ping back while loading a big + * RDB file.
*/ + if (querylen == 0 && c->flags & REDIS_SLAVE) + c->repl_ack_time = server.unixtime; + /* Leave data after the first line of the query in the buffer */ sdsrange(c->querybuf,querylen+2,-1); diff --git a/src/rdb.c b/src/rdb.c index bc0957c2f..ab24c6427 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1063,7 +1063,18 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { if (server.rdb_checksum) rioGenericUpdateChecksum(r, buf, len); if (server.loading_process_events_interval_bytes && - (r->processed_bytes + len)/server.loading_process_events_interval_bytes > r->processed_bytes/server.loading_process_events_interval_bytes) { + (r->processed_bytes + len)/server.loading_process_events_interval_bytes > r->processed_bytes/server.loading_process_events_interval_bytes) + { + if (server.masterhost && server.repl_state == REDIS_REPL_TRANSFER) { + /* Avoid the master to detect the slave is timing out while + * loading the RDB file in initial synchronization. We send + * a single newline character that is valid protocol but is + * guaranteed to either be sent entierly or not, since the byte + * is indivisible. */ + if (write(server.repl_transfer_s,"\n",1) == -1) { + /* Pinging back in this stage is best-effort. */ + } + } loadingProgress(r->processed_bytes); aeProcessEvents(server.el, AE_FILE_EVENTS|AE_DONT_WAIT); } From 18d92c0836a836e03c52dd28737dbf5b4bd0bb6a Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 10 Dec 2013 17:49:45 +0100 Subject: [PATCH 0370/2500] Don't send more than 1 newline/sec while loading RDB. 
--- src/rdb.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/rdb.c b/src/rdb.c index ab24c6427..0fcf426f8 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1066,13 +1066,17 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { (r->processed_bytes + len)/server.loading_process_events_interval_bytes > r->processed_bytes/server.loading_process_events_interval_bytes) { if (server.masterhost && server.repl_state == REDIS_REPL_TRANSFER) { + static time_t newline_sent; /* Avoid the master to detect the slave is timing out while * loading the RDB file in initial synchronization. We send * a single newline character that is valid protocol but is * guaranteed to either be sent entierly or not, since the byte * is indivisible. */ - if (write(server.repl_transfer_s,"\n",1) == -1) { - /* Pinging back in this stage is best-effort. */ + if (time(NULL) != newline_sent) { + newline_sent = time(NULL); + if (write(server.repl_transfer_s,"\n",1) == -1) { + /* Pinging back in this stage is best-effort. */ + } } } loadingProgress(r->processed_bytes); From 2860b5e23468023843af347a6d34a4696c49c03a Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 10 Dec 2013 17:51:14 +0100 Subject: [PATCH 0371/2500] Log empty DB + Loading data into two separated messages. 
--- src/replication.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/replication.c b/src/replication.c index 1e87eb185..0b4ac16d9 100644 --- a/src/replication.c +++ b/src/replication.c @@ -794,7 +794,7 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { replicationAbortSyncTransfer(); return; } - redisLog(REDIS_NOTICE, "MASTER <-> SLAVE sync: Loading DB in memory"); + redisLog(REDIS_NOTICE, "MASTER <-> SLAVE sync: Flushing old data"); signalFlushedDb(-1); emptyDb(); /* Before loading the DB into memory we need to delete the readable @@ -802,6 +802,7 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { * rdbLoad() will call the event loop to process events from time to * time for non blocking loading. */ aeDeleteFileEvent(server.el,server.repl_transfer_s,AE_READABLE); + redisLog(REDIS_NOTICE, "MASTER <-> SLAVE sync: Loading DB in memory"); if (rdbLoad(server.rdb_filename) != REDIS_OK) { redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk"); replicationAbortSyncTransfer(); From 247a311317cc87e6ca5e275ccc1063fb91964f2f Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 10 Dec 2013 18:18:24 +0100 Subject: [PATCH 0372/2500] dict.c: added optional callback to dictEmpty(). Redis hash table implementation has many non-blocking features like incremental rehashing, however while deleting a large hash table there was no way to have a callback called to do some incremental work. This commit adds this support, as an optional callback argument to dictEmpty() that is currently called at a fixed interval (one time every 65k deletions).
--- src/db.c | 12 ++++++------ src/debug.c | 4 ++-- src/dict.c | 15 ++++++++------- src/dict.h | 2 +- src/redis.h | 2 +- src/replication.c | 4 ++-- src/sentinel.c | 2 +- src/t_list.c | 2 +- 8 files changed, 22 insertions(+), 21 deletions(-) diff --git a/src/db.c b/src/db.c index be71d2592..5bea8db8e 100644 --- a/src/db.c +++ b/src/db.c @@ -170,14 +170,14 @@ int dbDelete(redisDb *db, robj *key) { } } -long long emptyDb() { +long long emptyDb(void(callback)(void*)) { int j; long long removed = 0; for (j = 0; j < server.dbnum; j++) { removed += dictSize(server.db[j].dict); - dictEmpty(server.db[j].dict); - dictEmpty(server.db[j].expires); + dictEmpty(server.db[j].dict,callback); + dictEmpty(server.db[j].expires,callback); } if (server.cluster_enabled) slotToKeyFlush(); return removed; @@ -214,15 +214,15 @@ void signalFlushedDb(int dbid) { void flushdbCommand(redisClient *c) { server.dirty += dictSize(c->db->dict); signalFlushedDb(c->db->id); - dictEmpty(c->db->dict); - dictEmpty(c->db->expires); + dictEmpty(c->db->dict,NULL); + dictEmpty(c->db->expires,NULL); if (server.cluster_enabled) slotToKeyFlush(); addReply(c,shared.ok); } void flushallCommand(redisClient *c) { signalFlushedDb(-1); - server.dirty += emptyDb(); + server.dirty += emptyDb(NULL); addReply(c,shared.ok); if (server.rdb_child_pid != -1) { kill(server.rdb_child_pid,SIGUSR1); diff --git a/src/debug.c b/src/debug.c index e8e16cc8b..7d9a8bfe5 100644 --- a/src/debug.c +++ b/src/debug.c @@ -261,7 +261,7 @@ void debugCommand(redisClient *c) { addReply(c,shared.err); return; } - emptyDb(); + emptyDb(NULL); if (rdbLoad(server.rdb_filename) != REDIS_OK) { addReplyError(c,"Error trying to load the RDB dump"); return; @@ -269,7 +269,7 @@ void debugCommand(redisClient *c) { redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD"); addReply(c,shared.ok); } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) { - emptyDb(); + emptyDb(NULL); if (loadAppendOnlyFile(server.aof_filename) != REDIS_OK) { addReply(c,shared.err); 
return; diff --git a/src/dict.c b/src/dict.c index c6ba2b745..17a322870 100644 --- a/src/dict.c +++ b/src/dict.c @@ -444,14 +444,15 @@ int dictDeleteNoFree(dict *ht, const void *key) { } /* Destroy an entire dictionary */ -int _dictClear(dict *d, dictht *ht) -{ +int _dictClear(dict *d, dictht *ht, void(callback)(void *)) { unsigned long i; /* Free all the elements */ for (i = 0; i < ht->size && ht->used > 0; i++) { dictEntry *he, *nextHe; + if (callback && (i & 65535) == 0) callback(d->privdata); + if ((he = ht->table[i]) == NULL) continue; while(he) { nextHe = he->next; @@ -472,8 +473,8 @@ int _dictClear(dict *d, dictht *ht) /* Clear & Release the hash table */ void dictRelease(dict *d) { - _dictClear(d,&d->ht[0]); - _dictClear(d,&d->ht[1]); + _dictClear(d,&d->ht[0],NULL); + _dictClear(d,&d->ht[1],NULL); zfree(d); } @@ -882,9 +883,9 @@ static int _dictKeyIndex(dict *d, const void *key) return idx; } -void dictEmpty(dict *d) { - _dictClear(d,&d->ht[0]); - _dictClear(d,&d->ht[1]); +void dictEmpty(dict *d, void(callback)(void*)) { + _dictClear(d,&d->ht[0],callback); + _dictClear(d,&d->ht[1],callback); d->rehashidx = -1; d->iterators = 0; } diff --git a/src/dict.h b/src/dict.h index 11e1b97ee..3385e9f06 100644 --- a/src/dict.h +++ b/src/dict.h @@ -160,7 +160,7 @@ dictEntry *dictGetRandomKey(dict *d); void dictPrintStats(dict *d); unsigned int dictGenHashFunction(const void *key, int len); unsigned int dictGenCaseHashFunction(const unsigned char *buf, int len); -void dictEmpty(dict *d); +void dictEmpty(dict *d, void(callback)(void*)); void dictEnableResize(void); void dictDisableResize(void); int dictRehash(dict *d, int n); diff --git a/src/redis.h b/src/redis.h index a9b06838b..16403a8b5 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1212,7 +1212,7 @@ void setKey(redisDb *db, robj *key, robj *val); int dbExists(redisDb *db, robj *key); robj *dbRandomKey(redisDb *db); int dbDelete(redisDb *db, robj *key); -long long emptyDb(); +long long emptyDb(void(callback)(void*)); 
int selectDb(redisClient *c, int id); void signalModifiedKey(redisDb *db, robj *key); void signalFlushedDb(int dbid); diff --git a/src/replication.c b/src/replication.c index 0b4ac16d9..5044ca331 100644 --- a/src/replication.c +++ b/src/replication.c @@ -796,7 +796,7 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { } redisLog(REDIS_NOTICE, "MASTER <-> SLAVE sync: Flushing old data"); signalFlushedDb(-1); - emptyDb(); + emptyDb(NULL); /* Before loading the DB into memory we need to delete the readable * handler, otherwise it will get called recursively since * rdbLoad() will call the event loop to process events from time to @@ -1468,7 +1468,7 @@ void replicationScriptCacheInit(void) { * to reclaim otherwise unused memory. */ void replicationScriptCacheFlush(void) { - dictEmpty(server.repl_scriptcache_dict); + dictEmpty(server.repl_scriptcache_dict,NULL); listRelease(server.repl_scriptcache_fifo); server.repl_scriptcache_fifo = listCreate(); } diff --git a/src/sentinel.c b/src/sentinel.c index 3bd1ee8c8..9a5012df3 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -391,7 +391,7 @@ void initSentinel(void) { /* Remove usual Redis commands from the command table, then just add * the SENTINEL command. 
*/ - dictEmpty(server.commands); + dictEmpty(server.commands,NULL); for (j = 0; j < sizeof(sentinelcmds)/sizeof(sentinelcmds[0]); j++) { int retval; struct redisCommand *cmd = sentinelcmds+j; diff --git a/src/t_list.c b/src/t_list.c index 216e071ba..70f5cf164 100644 --- a/src/t_list.c +++ b/src/t_list.c @@ -835,7 +835,7 @@ void unblockClientWaitingData(redisClient *c) { dictReleaseIterator(di); /* Cleanup the client structure */ - dictEmpty(c->bpop.keys); + dictEmpty(c->bpop.keys,NULL); if (c->bpop.target) { decrRefCount(c->bpop.target); c->bpop.target = NULL; From ccd6ccc7ddefafaf0591d32d7ade3f218d5e8ec2 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 10 Dec 2013 18:38:26 +0100 Subject: [PATCH 0373/2500] Slaves heartbeats during sync improved. The previous fix for false positive timeout detected by master was not complete. There is another blocking stage while loading data for the first synchronization with the master, that is, flushing away the current data from the DB memory. This commit uses the newly introduced dict.c callback in order to make some incremental work (to send "\n" heartbeats to the master) while flushing the old data from memory. It is hard to write a regression test for this issue unfortunately. More support for debugging in the Redis core would be needed in terms of functionalities to simulate a slow DB loading / deletion. 
--- src/rdb.c | 16 ++-------------- src/redis.h | 1 + src/replication.c | 27 ++++++++++++++++++++++++++- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/src/rdb.c b/src/rdb.c index 0fcf426f8..60dd7113e 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1065,20 +1065,8 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { if (server.loading_process_events_interval_bytes && (r->processed_bytes + len)/server.loading_process_events_interval_bytes > r->processed_bytes/server.loading_process_events_interval_bytes) { - if (server.masterhost && server.repl_state == REDIS_REPL_TRANSFER) { - static time_t newline_sent; - /* Avoid the master to detect the slave is timing out while - * loading the RDB file in initial synchronization. We send - * a single newline character that is valid protocol but is - * guaranteed to either be sent entierly or not, since the byte - * is indivisible. */ - if (time(NULL) != newline_sent) { - newline_sent = time(NULL); - if (write(server.repl_transfer_s,"\n",1) == -1) { - /* Pinging back in this stage is best-effort. 
*/ - } - } - } + if (server.masterhost && server.repl_state == REDIS_REPL_TRANSFER) + replicationSendNewlineToMaster(); loadingProgress(r->processed_bytes); aeProcessEvents(server.el, AE_FILE_EVENTS|AE_DONT_WAIT); } diff --git a/src/redis.h b/src/redis.h index 16403a8b5..42f531ea7 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1070,6 +1070,7 @@ int replicationScriptCacheExists(sds sha1); void processClientsWaitingReplicas(void); void unblockClientWaitingReplicas(redisClient *c); int replicationCountAcksByOffset(long long offset); +void replicationSendNewlineToMaster(void); /* Generic persistence functions */ void startLoading(FILE *fp); diff --git a/src/replication.c b/src/replication.c index 5044ca331..a6c5d0d42 100644 --- a/src/replication.c +++ b/src/replication.c @@ -715,6 +715,31 @@ void replicationAbortSyncTransfer(void) { server.repl_state = REDIS_REPL_CONNECT; } +/* Avoid the master to detect the slave is timing out while loading the + * RDB file in initial synchronization. We send a single newline character + * that is valid protocol but is guaranteed to either be sent entierly or + * not, since the byte is indivisible. + * + * The function is called in two contexts: while we flush the current + * data with emptyDb(), and while we load the new data received as an + * RDB file from the master. */ +void replicationSendNewlineToMaster(void) { + static time_t newline_sent; + if (time(NULL) != newline_sent) { + newline_sent = time(NULL); + if (write(server.repl_transfer_s,"\n",1) == -1) { + /* Pinging back in this stage is best-effort. */ + } + } +} + +/* Callback used by emptyDb() while flushing away old data to load + * the new dataset received by the master. 
*/ +void replicationEmptyDbCallback(void *privdata) { + REDIS_NOTUSED(privdata); + replicationSendNewlineToMaster(); +} + /* Asynchronously read the SYNC payload we receive from a master */ #define REPL_MAX_WRITTEN_BEFORE_FSYNC (1024*1024*8) /* 8 MB */ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { @@ -796,7 +821,7 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { } redisLog(REDIS_NOTICE, "MASTER <-> SLAVE sync: Flushing old data"); signalFlushedDb(-1); - emptyDb(NULL); + emptyDb(replicationEmptyDbCallback); /* Before loading the DB into memory we need to delete the readable * handler, otherwise it will get called recursively since * rdbLoad() will call the event loop to process events from time to From c7ae8e7d1f3a2d7588ffaed9de1f1ee832aac707 Mon Sep 17 00:00:00 2001 From: Yubao Liu Date: Sat, 30 Nov 2013 14:14:28 +0800 Subject: [PATCH 0374/2500] fix typo in redis.conf and sentinel.conf --- redis.conf | 2 +- sentinel.conf | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/redis.conf b/redis.conf index 76461d6b1..0a79591b5 100644 --- a/redis.conf +++ b/redis.conf @@ -658,7 +658,7 @@ activerehashing yes # # normal -> normal clients # slave -> slave clients and MONITOR clients -# pubsub -> clients subcribed to at least one pubsub channel or pattern +# pubsub -> clients subscribed to at least one pubsub channel or pattern # # The syntax of every client-output-buffer-limit directive is the following: # diff --git a/sentinel.conf b/sentinel.conf index 248e76c06..e44342221 100644 --- a/sentinel.conf +++ b/sentinel.conf @@ -6,7 +6,7 @@ port 26379 # sentinel monitor # -# Tells Sentinel to monitor this slave, and to consider it in O_DOWN +# Tells Sentinel to monitor this master, and to consider it in O_DOWN # (Objectively Down) state only if at least sentinels agree. 
# # Note that whatever is the ODOWN quorum, a Sentinel will require to @@ -63,7 +63,7 @@ sentinel parallel-syncs mymaster 1 # times the failover timeout. # # - The time needed for a slave replicating to a wrong master according -# to a Sentinel currnet configuration, to be forced to replicate +# to a Sentinel current configuration, to be forced to replicate # with the right master, is exactly the failover timeout (counting since # the moment a Sentinel detected the misconfiguration). # @@ -102,7 +102,7 @@ sentinel failover-timeout mymaster 180000 # # sentinel notification-script # -# Call the specified notification script for any sentienl event that is +# Call the specified notification script for any sentinel event that is # generated in the WARNING level (for instance -sdown, -odown, and so forth). # This script should notify the system administrator via email, SMS, or any # other messaging system, that there is something wrong with the monitored From 74d9f048faaa40f071c03f91709dc96e08e70cc6 Mon Sep 17 00:00:00 2001 From: Yossi Gottlieb Date: Mon, 24 Dec 2012 23:10:41 +0200 Subject: [PATCH 0375/2500] Fix wrong repldboff type which causes dropped replication in rare cases. --- src/redis.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/redis.h b/src/redis.h index 42f531ea7..d9f8cc084 100644 --- a/src/redis.h +++ b/src/redis.h @@ -485,7 +485,7 @@ typedef struct redisClient { int authenticated; /* when requirepass is non-NULL */ int replstate; /* replication state if this is a slave */ int repldbfd; /* replication DB file descriptor */ - long repldboff; /* replication DB file offset */ + off_t repldboff; /* replication DB file offset */ off_t repldbsize; /* replication DB file size */ sds replpreamble; /* replication DB preamble. 
*/ long long reploff; /* replication offset if this is our master */ From 982e2855b84bf4c34c1ef57069e6e4654c8a56bf Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 11 Dec 2013 15:23:10 +0100 Subject: [PATCH 0376/2500] Replication: publish the slave_repl_offset when disconnected from master. When a slave was disconnected from its master the replication offset was reported as -1. Now it is reported as the replication offset of the previous master, so that failover can be performed using this value in order to try to select a slave with more processed data from a set of slaves of the old master. --- src/redis.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/redis.c b/src/redis.c index 47ec4ff26..0a1cdd0bb 100644 --- a/src/redis.c +++ b/src/redis.c @@ -2522,6 +2522,13 @@ sds genRedisInfoString(char *section) { "role:%s\r\n", server.masterhost == NULL ? "master" : "slave"); if (server.masterhost) { + long long slave_repl_offset = 1; + + if (server.master) + slave_repl_offset = server.master->reploff; + else if (server.cached_master) + slave_repl_offset = server.cached_master->reploff; + info = sdscatprintf(info, "master_host:%s\r\n" "master_port:%d\r\n" @@ -2536,7 +2543,7 @@ sds genRedisInfoString(char *section) { server.master ? ((int)(server.unixtime-server.master->lastinteraction)) : -1, server.repl_state == REDIS_REPL_TRANSFER, - server.master ? 
server.master->reploff : -1 + slave_repl_offset ); if (server.repl_state == REDIS_REPL_TRANSFER) { From 45637a3a8206fbf8077dab1805200e2189e0f897 Mon Sep 17 00:00:00 2001 From: codeeply Date: Thu, 12 Dec 2013 16:33:29 +0800 Subject: [PATCH 0377/2500] comment mistake fixed --- src/sds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sds.c b/src/sds.c index 338b4262a..64c9a9615 100644 --- a/src/sds.c +++ b/src/sds.c @@ -383,7 +383,7 @@ sds sdstrim(sds s, const char *cset) { * Example: * * s = sdsnew("Hello World"); - * sdstrim(s,1,-1); => "ello Worl" + * sdsrange(s,1,-1); => "ello World" */ void sdsrange(sds s, int start, int end) { struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); From 34a9b7d65601cbf4444eba111a91f273e0a6bf7c Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 12 Dec 2013 15:19:08 +0100 Subject: [PATCH 0378/2500] Makefile: remove odd syntax not compatible with some make versions. See issue #1448. --- src/Makefile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/Makefile b/src/Makefile index e0592710f..0b4cff7a1 100644 --- a/src/Makefile +++ b/src/Makefile @@ -55,15 +55,19 @@ FINAL_LIBS=-lm DEBUG=-g -ggdb ifeq ($(uname_S),SunOS) + # SunOS INSTALL=cp -pf FINAL_CFLAGS+= -D__EXTENSIONS__ -D_XPG6 FINAL_LIBS+= -ldl -lnsl -lsocket -lpthread -else ifeq ($(uname_S),Darwin) - else +ifeq ($(uname_S),Darwin) + # Darwin (nothing to do) +else + # All the other OSes (notably Linux) FINAL_LDFLAGS+= -rdynamic FINAL_LIBS+= -pthread endif +endif # Include paths to dependencies FINAL_CFLAGS+= -I../deps/hiredis -I../deps/linenoise -I../deps/lua/src From 134b4e97e7f99831f6371931b2962655609ee716 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 13 Dec 2013 11:01:13 +0100 Subject: [PATCH 0379/2500] Sentinel: dead code removed. 
--- src/sentinel.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 9a5012df3..44510a23e 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1638,8 +1638,6 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { sds *lines; int numlines, j; int role = 0; - int runid_changed = 0; /* true if runid changed. */ - int first_runid = 0; /* true if this is the first runid we receive. */ /* The following fields must be reset to a given value in the case they * are not found at all in the INFO output. */ @@ -1655,10 +1653,8 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { if (sdslen(l) >= 47 && !memcmp(l,"run_id:",7)) { if (ri->runid == NULL) { ri->runid = sdsnewlen(l+7,40); - first_runid = 1; } else { if (strncmp(ri->runid,l+7,40) != 0) { - runid_changed = 1; sentinelEvent(REDIS_NOTICE,"+reboot",ri,"%@"); sdsfree(ri->runid); ri->runid = sdsnewlen(l+7,40); @@ -3234,17 +3230,12 @@ void sentinelFailoverReconfNextSlave(sentinelRedisInstance *master) { void sentinelFailoverSwitchToPromotedSlave(sentinelRedisInstance *master) { sentinelRedisInstance *ref = master->promoted_slave ? master->promoted_slave : master; - sds old_master_ip; - int old_master_port; sentinelEvent(REDIS_WARNING,"+switch-master",master,"%s %s %d %s %d", master->name, master->addr->ip, master->addr->port, ref->addr->ip, ref->addr->port); - old_master_ip = sdsdup(master->addr->ip); - old_master_port = master->addr->port; sentinelResetMasterAndChangeAddress(master,ref->addr->ip,ref->addr->port); - sdsfree(old_master_ip); } void sentinelFailoverStateMachine(sentinelRedisInstance *ri) { From 69a3303c18f5f8880c38249ef7c63cd057bb2f5e Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 13 Dec 2013 11:29:59 +0100 Subject: [PATCH 0380/2500] SDIFF iterator misuse fixed in diff algorithm #1. 
The bug could be easily triggered by: SADD foo a b c 1 2 3 4 5 6 SDIFF foo foo When the key was the same in two sets, an unsafe iterator was used to check existence of elements in the same set we were iterating. Usually this would just result in a wrong output; however, with the dict.c API misuse protection we have in place, the result was actually a failed assertion that was triggered by the CI test while creating random datasets for the "MASTER and SLAVE consistency" test. --- src/t_set.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/t_set.c b/src/t_set.c index 0ba8335aa..6e8e1f4e0 100644 --- a/src/t_set.c +++ b/src/t_set.c @@ -820,6 +820,7 @@ void sunionDiffGenericCommand(redisClient *c, robj **setkeys, int setnum, robj * while((ele = setTypeNextObject(si)) != NULL) { for (j = 1; j < setnum; j++) { if (!sets[j]) continue; /* no key is an empty set. */ + if (sets[j] == sets[0]) break; /* same set! */ if (setTypeIsMember(sets[j],ele)) break; } if (j == setnum) { From f71de5ab4e44fa4931ae598ab6ddf0196aebfe0b Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 13 Dec 2013 11:37:03 +0100 Subject: [PATCH 0381/2500] SDIFF iterator misuse bug regression test added. See commit 69a3303 for more info about the bug. --- tests/unit/type/set.tcl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/unit/type/set.tcl b/tests/unit/type/set.tcl index a77759e5d..162de0af7 100644 --- a/tests/unit/type/set.tcl +++ b/tests/unit/type/set.tcl @@ -214,6 +214,12 @@ start_server { r sdiff set1 set2 set3 } {} + test "SDIFF with same set two times" { + r del set1 + r sadd set1 a b c 1 2 3 4 5 6 + r sdiff set1 set1 + } {} + test "SDIFF fuzzing" { for {set j 0} {$j < 100} {incr j} { unset -nocomplain s From 229267abd13b558b5bebec0cb789dcf820179d9a Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 13 Dec 2013 13:10:05 +0100 Subject: [PATCH 0382/2500] Makefile.dep updated.
--- src/Makefile.dep | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Makefile.dep b/src/Makefile.dep index 9ec6d9c91..b66e00df4 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -1,5 +1,5 @@ adlist.o: adlist.c adlist.h zmalloc.h -ae.o: ae.c ae.h zmalloc.h config.h ae_kqueue.c +ae.o: ae.c ae.h zmalloc.h config.h ae_kqueue.c ae_epoll.c ae_select.c ae_evport.c ae_epoll.o: ae_epoll.c ae_evport.o: ae_evport.c ae_kqueue.o: ae_kqueue.c @@ -117,3 +117,6 @@ ziplist.o: ziplist.c zmalloc.h util.h sds.h ziplist.h endianconv.h \ config.h redisassert.h zipmap.o: zipmap.c zmalloc.h endianconv.h config.h zmalloc.o: zmalloc.c config.h zmalloc.h +blocked.o: blocked.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h From 9180fb79313e6155a5337faef77905a14013a4db Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 17 Dec 2013 09:38:11 +0100 Subject: [PATCH 0383/2500] Cluster: use long long for timestamps in clusterGenNodesDescription(). Ping sent and pong received fields need to be cast to long long to be printed correctly on 32 bit systems. --- src/cluster.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 59a72065a..7ae34ebe3 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -2404,9 +2404,9 @@ sds clusterGenNodesDescription(int filter) { ci = sdscatprintf(ci,"- "); /* Latency from the POV of this node, link status */ - ci = sdscatprintf(ci,"%ld %ld %llu %s", - (long) node->ping_sent, - (long) node->pong_received, + ci = sdscatprintf(ci,"%lld %lld %llu %s", + (long long) node->ping_sent, + (long long) node->pong_received, (unsigned long long) node->configEpoch, (node->link || node->flags & REDIS_NODE_MYSELF) ?
"connected" : "disconnected"); From 118d0fb5334ce30829d4c961f6282eb8fd1295da Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 17 Dec 2013 09:45:42 +0100 Subject: [PATCH 0384/2500] Fixed clearNodeFailureIfNeeded() time type to mstime_t. This prevented 32bit cluster instances from clearing the FAIL flag when needed. --- src/cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index 7ae34ebe3..55863ad89 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -748,7 +748,7 @@ void markNodeAsFailingIfNeeded(clusterNode *node) { * to reach it again. It checks if there are the conditions to undo the FAIL * state. */ void clearNodeFailureIfNeeded(clusterNode *node) { - time_t now = mstime(); + mstime_t now = mstime(); redisAssert(node->flags & REDIS_NODE_FAIL); From b1d3dd657d36c74a16c54ab64bcdb440a4344c5c Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 17 Dec 2013 10:00:33 +0100 Subject: [PATCH 0385/2500] Cluster: use an hardcoded 60 sec timeout in redis-trib connections. Later this should be configurable from the command line but at least now we use something more appropriate for our use case compared to the redis-rb default timeout. --- src/redis-trib.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/redis-trib.rb b/src/redis-trib.rb index b9a9ee606..9c06c22ab 100755 --- a/src/redis-trib.rb +++ b/src/redis-trib.rb @@ -87,7 +87,7 @@ class ClusterNode print "Connecting to node #{self}: " STDOUT.flush begin - @r = Redis.new(:host => @info[:host], :port => @info[:port]) + @r = Redis.new(:host => @info[:host], :port => @info[:port], :timeout => 60) @r.ping rescue xputs "[ERR] Sorry, can't connect to node #{self}" From 195aab33451255d5d198db37b7f7742df63d5b02 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 17 Dec 2013 10:27:12 +0100 Subject: [PATCH 0386/2500] Cluster: use proper type mstime_t for ping delay var. 
--- src/cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index 55863ad89..7ae433bfd 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -2018,7 +2018,7 @@ void clusterCron(void) { while((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); now = mstime(); /* Use an updated time at every iteration. */ - int delay; + mstime_t delay; if (node->flags & (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR|REDIS_NODE_HANDSHAKE)) From c17be18035ac6a3ef54294e9712a9a1772d2e0e4 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 17 Dec 2013 12:22:02 +0100 Subject: [PATCH 0387/2500] Cluster: initialize todo_before_sleep flags to 0. --- src/cluster.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cluster.c b/src/cluster.c index 7ae433bfd..e88eb8c3c 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -259,6 +259,7 @@ void clusterInit(void) { server.cluster->currentEpoch = 0; server.cluster->state = REDIS_CLUSTER_FAIL; server.cluster->size = 1; + server.cluster->todo_before_sleep = 0; server.cluster->nodes = dictCreate(&clusterNodesDictType,NULL); server.cluster->nodes_black_list = dictCreate(&clusterNodesBlackListDictType,NULL); From 7f51cf8b5610ce6328d654b25e0557507d28d254 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 17 Dec 2013 12:28:37 +0100 Subject: [PATCH 0388/2500] Cluster: check link is valid before sending UPDATE. 
--- src/cluster.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cluster.c b/src/cluster.c index e88eb8c3c..be709322f 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1691,6 +1691,7 @@ void clusterSendUpdate(clusterLink *link, clusterNode *node) { unsigned char buf[sizeof(clusterMsg)]; clusterMsg *hdr = (clusterMsg*) buf; + if (link == NULL) return; clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_UPDATE); memcpy(hdr->data.update.nodecfg.nodename,node->name,REDIS_CLUSTER_NAMELEN); hdr->data.update.nodecfg.configEpoch = htonu64(node->configEpoch); From e48365e2c2507e2b3eac0b5fda07c5f8c46e3ff7 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 17 Dec 2013 14:50:24 +0100 Subject: [PATCH 0389/2500] Cluster: set n->slaves to NULL in clusterNodeResetSlaves(). The value was otherwise undefined, so next time the node was promoted again from slave to master, adding a slave to the list of slaves would likely crash the server or result in undefined behavior. --- src/cluster.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cluster.c b/src/cluster.c index be709322f..75a4e59d2 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -537,6 +537,7 @@ int clusterNodeAddSlave(clusterNode *master, clusterNode *slave) { void clusterNodeResetSlaves(clusterNode *n) { zfree(n->slaves); n->numslaves = 0; + n->slaves = NULL; } void freeClusterNode(clusterNode *n) { From 4b44b03cb950c20bd0a6bbb10c737a544f4d82e0 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 19 Dec 2013 10:18:45 +0100 Subject: [PATCH 0390/2500] Example redis.conf formatted to better show appendfilename option.
--- redis.conf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/redis.conf b/redis.conf index 0a79591b5..86ff1dac6 100644 --- a/redis.conf +++ b/redis.conf @@ -420,7 +420,8 @@ slave-priority 100 appendonly no # The name of the append only file (default: "appendonly.aof") -# appendfilename appendonly.aof + +appendfilename "appendonly.aof" # The fsync() call tells the Operating System to actually write data on disk # instead to wait for more data in the output buffer. Some OS will really flush @@ -467,6 +468,7 @@ appendfsync everysec # # If you have latency problems turn this to "yes". Otherwise leave it as # "no" that is the safest pick from the point of view of durability. + no-appendfsync-on-rewrite no # Automatic rewrite of the append only file. From f075607239fb81e52c2bdf90bb6b3cf0cafc4c96 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 19 Dec 2013 15:25:45 +0100 Subject: [PATCH 0391/2500] CONFIG REWRITE: don't wipe unknown options. With this commit options not explicitly rewritten by CONFIG REWRITE are not touched at all. These include new options that may not have support for REWRITE, and other special cases like rename-command and include. 
--- src/config.c | 59 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/src/config.c b/src/config.c index 71b16d8ba..b455c079e 100644 --- a/src/config.c +++ b/src/config.c @@ -1169,17 +1169,27 @@ void dictListDestructor(void *privdata, void *val); void rewriteConfigSentinelOption(struct rewriteConfigState *state); dictType optionToLineDictType = { - dictSdsHash, /* hash function */ + dictSdsCaseHash, /* hash function */ NULL, /* key dup */ NULL, /* val dup */ - dictSdsKeyCompare, /* key compare */ + dictSdsKeyCaseCompare, /* key compare */ dictSdsDestructor, /* key destructor */ dictListDestructor /* val destructor */ }; +dictType optionSetDictType = { + dictSdsCaseHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictSdsKeyCaseCompare, /* key compare */ + dictSdsDestructor, /* key destructor */ + NULL /* val destructor */ +}; + /* The config rewrite state. */ struct rewriteConfigState { dict *option_to_line; /* Option -> list of config file lines map */ + dict *rewritten; /* Dictionary of already processed options */ int numlines; /* Number of lines in current config */ sds *lines; /* Current lines as an array of sds strings */ int has_tail; /* True if we already added directives that were @@ -1203,6 +1213,16 @@ void rewriteConfigAddLineNumberToOption(struct rewriteConfigState *state, sds op listAddNodeTail(l,(void*)(long)linenum); } +/* Add the specified option to the set of processed options. + * This is useful as only unused lines of processed options will be blanked + * in the config file, while options the rewrite process does not understand + * remain untouched. */ +void rewriteConfigMarkAsProcessed(struct rewriteConfigState *state, char *option) { + sds opt = sdsnew(option); + + if (dictAdd(state->rewritten,opt) != DICT_OK) sdsfree(opt); +} + /* Read the old file, split it into lines to populate a newly created * config rewrite state, and return it to the caller. 
* @@ -1217,6 +1237,7 @@ struct rewriteConfigState *rewriteConfigReadOldFile(char *path) { if (fp == NULL && errno != ENOENT) return NULL; state->option_to_line = dictCreate(&optionToLineDictType,NULL); + state->rewritten = dictCreate(&optionSetDictType,NULL); state->numlines = 0; state->lines = NULL; state->has_tail = 0; @@ -1284,6 +1305,8 @@ void rewriteConfigRewriteLine(struct rewriteConfigState *state, char *option, sd sds o = sdsnew(option); list *l = dictFetchValue(state->option_to_line,o); + rewriteConfigMarkAsProcessed(state,option); + if (!l && !force) { /* Option not used previously, and we are not forced to use it. */ sdsfree(line); @@ -1340,7 +1363,6 @@ void rewriteConfigBytesOption(struct rewriteConfigState *state, char *option, lo rewriteConfigFormatMemory(buf,sizeof(buf),value); line = sdscatprintf(sdsempty(),"%s %s",option,buf); rewriteConfigRewriteLine(state,option,line,force); - } /* Rewrite a yes/no option. */ @@ -1359,7 +1381,10 @@ void rewriteConfigStringOption(struct rewriteConfigState *state, char *option, c /* String options set to NULL need to be not present at all in the * configuration file to be set to NULL again at the next reboot. */ - if (value == NULL) return; + if (value == NULL) { + rewriteConfigMarkAsProcessed(state,option); + return; + } /* Compare the strings as sds strings to have a binary safe comparison. */ if (defvalue && strcmp(value,defvalue) == 0) force = 0; @@ -1449,7 +1474,10 @@ void rewriteConfigSaveOption(struct rewriteConfigState *state) { void rewriteConfigDirOption(struct rewriteConfigState *state) { char cwd[1024]; - if (getcwd(cwd,sizeof(cwd)) == NULL) return; /* no rewrite on error. */ + if (getcwd(cwd,sizeof(cwd)) == NULL) { + rewriteConfigMarkAsProcessed(state,"dir"); + return; /* no rewrite on error. 
*/ + } rewriteConfigStringOption(state,"dir",cwd,NULL); } @@ -1460,7 +1488,10 @@ void rewriteConfigSlaveofOption(struct rewriteConfigState *state) { /* If this is a master, we want all the slaveof config options * in the file to be removed. */ - if (server.masterhost == NULL) return; + if (server.masterhost == NULL) { + rewriteConfigMarkAsProcessed(state,"slaveof"); + return; + } line = sdscatprintf(sdsempty(),"%s %s %d", option, server.masterhost, server.masterport); rewriteConfigRewriteLine(state,option,line,1); @@ -1525,7 +1556,10 @@ void rewriteConfigBindOption(struct rewriteConfigState *state) { char *option = "bind"; /* Nothing to rewrite if we don't have bind addresses. */ - if (server.bindaddr_count == 0) return; + if (server.bindaddr_count == 0) { + rewriteConfigMarkAsProcessed(state,option); + return; + } /* Rewrite as bind ... */ addresses = sdsjoin(server.bindaddr,server.bindaddr_count," "); @@ -1561,6 +1595,7 @@ sds rewriteConfigGetContentFromState(struct rewriteConfigState *state) { void rewriteConfigReleaseState(struct rewriteConfigState *state) { sdsfreesplitres(state->lines,state->numlines); dictRelease(state->option_to_line); + dictRelease(state->rewritten); zfree(state); } @@ -1578,6 +1613,14 @@ void rewriteConfigRemoveOrphaned(struct rewriteConfigState *state) { while((de = dictNext(di)) != NULL) { list *l = dictGetVal(de); + sds option = dictGetKey(de); + + /* Don't blank lines about options the rewrite process + * don't understand. */ + if (dictFetch(state->rewritten,option) == NULL) { + redisLog(REDIS_DEBUG,"Not rewritten option: %s", option); + continue; + } while(listLength(l)) { listNode *ln = listFirst(l); @@ -1667,8 +1710,6 @@ int rewriteConfig(char *path) { /* Step 2: rewrite every single option, replacing or appending it inside * the rewrite state. */ - /* TODO: Turn every default into a define, use it also in - * initServerConfig(). 
*/ rewriteConfigYesNoOption(state,"daemonize",server.daemonize,0); rewriteConfigStringOption(state,"pidfile",server.pidfile,REDIS_DEFAULT_PID_FILE); rewriteConfigNumericalOption(state,"port",server.port,REDIS_SERVERPORT); From 5131d7da7467f0db9f112e2bb8b26f225ed49ca9 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 19 Dec 2013 15:30:06 +0100 Subject: [PATCH 0392/2500] CONFIG REWRITE: old development comments removed. --- src/config.c | 36 ++++-------------------------------- 1 file changed, 4 insertions(+), 32 deletions(-) diff --git a/src/config.c b/src/config.c index b455c079e..e7d2b78c9 100644 --- a/src/config.c +++ b/src/config.c @@ -1126,41 +1126,13 @@ void configGetCommand(redisClient *c) { * CONFIG REWRITE implementation *----------------------------------------------------------------------------*/ -/* IGNORE: - * - * rename-command - * include - * - * Special handling: - * - * notify-keyspace-events - * client-output-buffer-limit - * save - * appendonly - * appendfsync - * dir - * maxmemory-policy - * loglevel - * unixsocketperm - * slaveof - * - * Type of config directives: - * - * CUSTOM - * VERBATIM - * YESNO - * L - * LL - * - */ - #define REDIS_CONFIG_REWRITE_SIGNATURE "# Generated by CONFIG REWRITE" /* We use the following dictionary type to store where a configuration * option is mentioned in the old configuration file, so it's * like "maxmemory" -> list of line numbers (first line is zero). 
*/ -unsigned int dictSdsHash(const void *key); -int dictSdsKeyCompare(void *privdata, const void *key1, const void *key2); +unsigned int dictSdsCaseHash(const void *key); +int dictSdsKeyCaseCompare(void *privdata, const void *key1, const void *key2); void dictSdsDestructor(void *privdata, void *val); void dictListDestructor(void *privdata, void *val); @@ -1220,7 +1192,7 @@ void rewriteConfigAddLineNumberToOption(struct rewriteConfigState *state, sds op void rewriteConfigMarkAsProcessed(struct rewriteConfigState *state, char *option) { sds opt = sdsnew(option); - if (dictAdd(state->rewritten,opt) != DICT_OK) sdsfree(opt); + if (dictAdd(state->rewritten,opt,NULL) != DICT_OK) sdsfree(opt); } /* Read the old file, split it into lines to populate a newly created @@ -1617,7 +1589,7 @@ void rewriteConfigRemoveOrphaned(struct rewriteConfigState *state) { /* Don't blank lines about options the rewrite process * don't understand. */ - if (dictFetch(state->rewritten,option) == NULL) { + if (dictFetchValue(state->rewritten,option) == NULL) { redisLog(REDIS_DEBUG,"Not rewritten option: %s", option); continue; } From 9846af124d5a5ab3e4ff6ae5a9d9b7f17f34e369 Mon Sep 17 00:00:00 2001 From: Yubao Liu Date: Sat, 30 Nov 2013 23:57:48 +0800 Subject: [PATCH 0393/2500] CONFIG REWRITE: don't throw some options on config rewrite Those options will be thrown without this patch: include, rename-command, min-slaves-to-write, min-slaves-max-lag, appendfilename. --- redis.conf | 27 +++++++++++++++++---------- src/config.c | 29 ++++++++++++++++------------- src/redis.c | 2 +- src/redis.h | 1 + 4 files changed, 35 insertions(+), 24 deletions(-) diff --git a/redis.conf b/redis.conf index 86ff1dac6..4c18120e1 100644 --- a/redis.conf +++ b/redis.conf @@ -12,6 +12,22 @@ # # units are case insensitive so 1GB 1Gb 1gB are all the same. +################################## INCLUDES ################################### + +# Include one or more other config files here. 
This is useful if you +# have a standard template that goes to all Redis server but also need +# to customize a few per-server settings. Include files can include +# other files, so use this wisely. +# +# Notice option "include" won't be rewritten by command "CONFIG REWRITE" +# from admin or Redis sentinel, you'd better put this option at the +# beginning of this file to avoid overwriting config change at runtime. +# +# include /path/to/local.conf +# include /path/to/other.conf + +################################ GENERAL ##################################### + # By default Redis does not run as a daemon. Use 'yes' if you need it. # Note that Redis will write a pid file in /var/run/redis.pid when daemonized. daemonize no @@ -88,7 +104,7 @@ logfile "" # dbid is a number between 0 and 'databases'-1 databases 16 -################################ SNAPSHOTTING ################################# +################################ SNAPSHOTTING ################################ # # Save the DB on disk: # @@ -711,12 +727,3 @@ hz 10 # big latency spikes. aof-rewrite-incremental-fsync yes -################################## INCLUDES ################################### - -# Include one or more other config files here. This is useful if you -# have a standard template that goes to all Redis server but also need -# to customize a few per-server settings. Include files can include -# other files, so use this wisely. -# -# include /path/to/local.conf -# include /path/to/other.conf diff --git a/src/config.c b/src/config.c index e7d2b78c9..c5baab6a0 100644 --- a/src/config.c +++ b/src/config.c @@ -1469,17 +1469,6 @@ void rewriteConfigSlaveofOption(struct rewriteConfigState *state) { rewriteConfigRewriteLine(state,option,line,1); } -/* Rewrite the appendonly option. 
*/ -void rewriteConfigAppendonlyOption(struct rewriteConfigState *state) { - int force = server.aof_state != REDIS_AOF_OFF; - char *option = "appendonly"; - sds line; - - line = sdscatprintf(sdsempty(),"%s %s", option, - (server.aof_state == REDIS_AOF_OFF) ? "no" : "yes"); - rewriteConfigRewriteLine(state,option,line,force); -} - /* Rewrite the notify-keyspace-events option. */ void rewriteConfigNotifykeyspaceeventsOption(struct rewriteConfigState *state) { int force = server.notify_keyspace_events != 0; @@ -1578,12 +1567,23 @@ void rewriteConfigReleaseState(struct rewriteConfigState *state) { * should be replaced by empty lines. * * This function does just this, iterating all the option names and - * blanking all the lines still associated. */ + * blanking all the lines still associated. + * + * Two options "include" and "rename-command" are special, they are + * just kept because struct RedisServer doesn't record them. Notice + * this also means the included config file isn't rewritten, you'd + * better put "include" at the beginning of Redis main config file + * so that runtime config change won't be canceled by conflicted + * options in the included config file. 
*/ void rewriteConfigRemoveOrphaned(struct rewriteConfigState *state) { dictIterator *di = dictGetIterator(state->option_to_line); dictEntry *de; while((de = dictNext(di)) != NULL) { + sds option = dictGetKey(de); + if (!strcmp(option, "include") || !strcmp(option, "rename-command")) + continue; + list *l = dictGetVal(de); sds option = dictGetKey(de); @@ -1717,6 +1717,8 @@ int rewriteConfig(char *path) { rewriteConfigBytesOption(state,"repl-backlog-ttl",server.repl_backlog_time_limit,REDIS_DEFAULT_REPL_BACKLOG_TIME_LIMIT); rewriteConfigYesNoOption(state,"repl-disable-tcp-nodelay",server.repl_disable_tcp_nodelay,REDIS_DEFAULT_REPL_DISABLE_TCP_NODELAY); rewriteConfigNumericalOption(state,"slave-priority",server.slave_priority,REDIS_DEFAULT_SLAVE_PRIORITY); + rewriteConfigNumericalOption(state,"min-slaves-to-write",server.repl_min_slaves_to_write,REDIS_DEFAULT_MIN_SLAVES_TO_WRITE); + rewriteConfigNumericalOption(state,"min-slaves-max-lag",server.repl_min_slaves_max_lag,REDIS_DEFAULT_MIN_SLAVES_MAX_LAG); rewriteConfigStringOption(state,"requirepass",server.requirepass,NULL); rewriteConfigNumericalOption(state,"maxclients",server.maxclients,REDIS_MAX_CLIENTS); rewriteConfigBytesOption(state,"maxmemory",server.maxmemory,REDIS_DEFAULT_MAXMEMORY); @@ -1729,7 +1731,8 @@ int rewriteConfig(char *path) { "noeviction", REDIS_MAXMEMORY_NO_EVICTION, NULL, REDIS_DEFAULT_MAXMEMORY_POLICY); rewriteConfigNumericalOption(state,"maxmemory-samples",server.maxmemory_samples,REDIS_DEFAULT_MAXMEMORY_SAMPLES); - rewriteConfigAppendonlyOption(state); + rewriteConfigYesNoOption(state,"appendonly",server.aof_state != REDIS_AOF_OFF,0); + rewriteConfigStringOption(state,"appendfilename",server.aof_filename,REDIS_DEFAULT_AOF_FILENAME); rewriteConfigEnumOption(state,"appendfsync",server.aof_fsync, "everysec", AOF_FSYNC_EVERYSEC, "always", AOF_FSYNC_ALWAYS, diff --git a/src/redis.c b/src/redis.c index 0a1cdd0bb..98a6f67a3 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1366,7 +1366,7 @@ void 
initServerConfig() { server.aof_rewrite_incremental_fsync = REDIS_DEFAULT_AOF_REWRITE_INCREMENTAL_FSYNC; server.pidfile = zstrdup(REDIS_DEFAULT_PID_FILE); server.rdb_filename = zstrdup(REDIS_DEFAULT_RDB_FILENAME); - server.aof_filename = zstrdup("appendonly.aof"); + server.aof_filename = zstrdup(REDIS_DEFAULT_AOF_FILENAME); server.requirepass = NULL; server.rdb_compression = REDIS_DEFAULT_RDB_COMPRESSION; server.rdb_checksum = REDIS_DEFAULT_RDB_CHECKSUM; diff --git a/src/redis.h b/src/redis.h index d9f8cc084..2db1eb3ea 100644 --- a/src/redis.h +++ b/src/redis.h @@ -113,6 +113,7 @@ #define REDIS_DEFAULT_REPL_DISABLE_TCP_NODELAY 0 #define REDIS_DEFAULT_MAXMEMORY 0 #define REDIS_DEFAULT_MAXMEMORY_SAMPLES 3 +#define REDIS_DEFAULT_AOF_FILENAME "appendonly.aof" #define REDIS_DEFAULT_AOF_NO_FSYNC_ON_REWRITE 0 #define REDIS_DEFAULT_ACTIVE_REHASHING 1 #define REDIS_DEFAULT_AOF_REWRITE_INCREMENTAL_FSYNC 1 From 9dc5817de7365b9b7ca08d05444dd9a08bf52f8a Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 19 Dec 2013 15:55:25 +0100 Subject: [PATCH 0394/2500] CONFIG REWRITE: no special handling or include and rename-command. CONFIG REWRITE is now wiser and does not touch what it does not understand inside redis.conf. --- src/config.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/config.c b/src/config.c index c5baab6a0..3bd1d8592 100644 --- a/src/config.c +++ b/src/config.c @@ -1567,23 +1567,12 @@ void rewriteConfigReleaseState(struct rewriteConfigState *state) { * should be replaced by empty lines. * * This function does just this, iterating all the option names and - * blanking all the lines still associated. - * - * Two options "include" and "rename-command" are special, they are - * just kept because struct RedisServer doesn't record them. 
Notice - * this also means the included config file isn't rewritten, you'd - * better put "include" at the beginning of Redis main config file - * so that runtime config change won't be canceled by conflicted - * options in the included config file. */ + * blanking all the lines still associated. */ void rewriteConfigRemoveOrphaned(struct rewriteConfigState *state) { dictIterator *di = dictGetIterator(state->option_to_line); dictEntry *de; while((de = dictNext(di)) != NULL) { - sds option = dictGetKey(de); - if (!strcmp(option, "include") || !strcmp(option, "rename-command")) - continue; - list *l = dictGetVal(de); sds option = dictGetKey(de); From e76443455fef3e40366af261de10406d40bd64e0 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 19 Dec 2013 16:02:24 +0100 Subject: [PATCH 0395/2500] Clarify include directive behavior in example redis.conf. --- redis.conf | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/redis.conf b/redis.conf index 4c18120e1..7fb4e4953 100644 --- a/redis.conf +++ b/redis.conf @@ -20,8 +20,12 @@ # other files, so use this wisely. # # Notice option "include" won't be rewritten by command "CONFIG REWRITE" -# from admin or Redis sentinel, you'd better put this option at the -# beginning of this file to avoid overwriting config change at runtime. +# from admin or Redis Sentinel. Since Redis always uses the last processed +# line as value of a configuration directive, you'd better put includes +# at the beginning of this file to avoid overwriting config change at runtime. +# +# If instead you are interested in using includes to override configuration +# options, it is better to use include as the last line. # # include /path/to/local.conf # include /path/to/other.conf From f42e0277abc6eb35d56fa18786c73f1c09d05d32 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 20 Dec 2013 09:56:18 +0100 Subject: [PATCH 0396/2500] Redis Cluster: delay state change when in the majority again. 
As specified in the Redis Cluster specification, when a node can reach the majority again after a period in which it was partitioned away with the minority of masters, wait some time before accepting queries, to provide a reasonable amount of time for other nodes to upgrade its configuration. This lowers the probabilities of both a client and a master with not updated configuration to rejoin the cluster at the same time, with a stale master accepting writes. --- src/cluster.c | 47 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 75a4e59d2..090dbf985 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -2205,20 +2205,24 @@ int clusterDelNodeSlots(clusterNode *node) { /* ----------------------------------------------------------------------------- * Cluster state evaluation function * -------------------------------------------------------------------------- */ + +#define REDIS_CLUSTER_MAX_REJOIN_DELAY 5000 + void clusterUpdateState(void) { - int j, initial_state = server.cluster->state; + int j, new_state; int unreachable_masters = 0; + static mstime_t among_minority_time; /* Start assuming the state is OK. We'll turn it into FAIL if there * are the right conditions. */ - server.cluster->state = REDIS_CLUSTER_OK; + new_state = REDIS_CLUSTER_OK; /* Check if all the slots are covered. */ for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) { if (server.cluster->slots[j] == NULL || server.cluster->slots[j]->flags & (REDIS_NODE_FAIL)) { - server.cluster->state = REDIS_CLUSTER_FAIL; + new_state = REDIS_CLUSTER_FAIL; break; } } @@ -2248,24 +2252,39 @@ void clusterUpdateState(void) { /* If we can't reach at least half the masters, change the cluster state * to FAIL, as we are not even able to mark nodes as FAIL in this side - * of the netsplit because of lack of majority.
- * - * TODO: when this condition is entered, we should not undo it for some - * (small) time after the majority is reachable again, to make sure that - * other nodes have enough time to inform this node of a configuration change. - * Otherwise a client with an old routing table may write to this node - * and later it may turn into a slave losing the write. */ + * of the netsplit because of lack of majority. */ { int needed_quorum = (server.cluster->size / 2) + 1; - if (unreachable_masters >= needed_quorum) - server.cluster->state = REDIS_CLUSTER_FAIL; + if (unreachable_masters >= needed_quorum) { + new_state = REDIS_CLUSTER_FAIL; + among_minority_time = mstime(); + } } /* Log a state change */ - if (initial_state != server.cluster->state) + if (new_state != server.cluster->state) { + mstime_t rejoin_delay = server.cluster_node_timeout; + + /* If the instance is a master and was partitioned away with the + * minority, don't let it accept queries for some time after the + * partition heals, to make sure there is enough time to receive + * a configuration update. */ + if (rejoin_delay > REDIS_CLUSTER_MAX_REJOIN_DELAY) + rejoin_delay = REDIS_CLUSTER_MAX_REJOIN_DELAY; + + if (new_state == REDIS_CLUSTER_OK && + server.cluster->myself->flags & REDIS_NODE_MASTER && + mstime() - among_minority_time < rejoin_delay) + { + return; + } + + /* Change the state and log the event. */ redisLog(REDIS_WARNING,"Cluster state changed: %s", - server.cluster->state == REDIS_CLUSTER_OK ? "ok" : "fail"); + new_state == REDIS_CLUSTER_OK ? "ok" : "fail"); + server.cluster->state = new_state; + } } /* This function is called after the node startup in order to verify that data From 4d11d4c86c336df2b6aaf86a4745eea8cc472216 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 20 Dec 2013 12:37:18 +0100 Subject: [PATCH 0397/2500] Redis Cluster: handshake code refactoring + Gossip IP switch detection. 
This commit makes it simple to start an handshake with a specific node address, and uses this in order to detect a node IP change and start a new handshake in order to fix the IP if possible. --- src/cluster.c | 128 +++++++++++++++++++++++++++++++------------------- 1 file changed, 80 insertions(+), 48 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 090dbf985..019678d7b 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -799,6 +799,64 @@ int clusterHandshakeInProgress(char *ip, int port) { return de != NULL; } +/* Start an handshake with the specified address if there is not one + * already in progress. Returns non-zero if the handshake was actually + * started. On error zero is returned and errno is set to one of the + * following values: + * + * EAGAIN - There is already an handshake in progress for this address. + * EINVAL - IP or port are not valid. */ +int clusterStartHandshake(char *ip, int port) { + clusterNode *n; + char norm_ip[REDIS_IP_STR_LEN]; + struct sockaddr_storage sa; + + /* IP sanity check */ + if (inet_pton(AF_INET,ip, + &(((struct sockaddr_in *)&sa)->sin_addr))) + { + sa.ss_family = AF_INET; + } else if (inet_pton(AF_INET6,ip, + &(((struct sockaddr_in6 *)&sa)->sin6_addr))) + { + sa.ss_family = AF_INET6; + } else { + errno = EINVAL; + return 0; + } + + /* Port sanity check */ + if (port <= 0 || port > (65535-REDIS_CLUSTER_PORT_INCR)) { + errno = EINVAL; + return 0; + } + + /* Set norm_ip as the normalized string representation of the node + * IP address. */ + if (sa.ss_family == AF_INET) + inet_ntop(AF_INET, + (void*)&(((struct sockaddr_in *)&sa)->sin_addr), + norm_ip,REDIS_CLUSTER_IPLEN); + else + inet_ntop(AF_INET6, + (void*)&(((struct sockaddr_in6 *)&sa)->sin6_addr), + norm_ip,REDIS_CLUSTER_IPLEN); + + if (clusterHandshakeInProgress(norm_ip,port)) { + errno = EAGAIN; + return 0; + } + + /* Add the node with a random address (NULL as first argument to + * createClusterNode()). Everything will be fixed during the + * handskake. 
*/ + n = createClusterNode(NULL,REDIS_NODE_HANDSHAKE|REDIS_NODE_MEET); + memcpy(n->ip,norm_ip,sizeof(n->ip)); + n->port = port; + clusterAddNode(n); + return 1; +} + /* Process the gossip section of PING or PONG packets. * Note that this function assumes that the packet is already sanity-checked * by the caller, not in the content of the gossip section, but in the @@ -832,7 +890,7 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { /* Update our state accordingly to the gossip sections */ node = clusterLookupNode(g->nodename); - if (node != NULL) { + if (node) { /* We already know this node. Handle failure reports, only when the sender is a master. */ if (sender && sender->flags & REDIS_NODE_MASTER && @@ -853,6 +911,17 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { } } } + + /* If we already know this node, but it is not reachable, and + * we see a different address in the gossip section, start an + * handshake with the (possibly) new address: this will result + * into a node address update if the handshake will be + * successful. */ + if (node->flags & (REDIS_NODE_FAIL|REDIS_NODE_PFAIL) && + (strcasecmp(node->ip,g->ip) || node->port != ntohs(g->port))) + { + clusterStartHandshake(g->ip,ntohs(g->port)); + } } else { /* If it's not in NOADDR state and we don't have it, we * start a handshake process against this IP/PORT pairs. @@ -860,17 +929,8 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { * Note that we require that the sender of this gossip message * is a well known node in our cluster, otherwise we risk * joining another cluster. 
*/ - if (sender && !(flags & REDIS_NODE_NOADDR) && - !clusterHandshakeInProgress(g->ip,ntohs(g->port))) - { - clusterNode *newnode; - - redisLog(REDIS_DEBUG,"Adding the new node"); - newnode = createClusterNode(NULL,REDIS_NODE_HANDSHAKE); - memcpy(newnode->ip,g->ip,sizeof(g->ip)); - newnode->port = ntohs(g->port); - clusterAddNode(newnode); - } + if (sender && !(flags & REDIS_NODE_NOADDR)) + clusterStartHandshake(g->ip,ntohs(g->port)); } /* Next node */ @@ -2492,48 +2552,20 @@ void clusterCommand(redisClient *c) { } if (!strcasecmp(c->argv[1]->ptr,"meet") && c->argc == 4) { - /* CLUSTER MEET */ - clusterNode *n; - struct sockaddr_storage sa; long port; - /* Perform sanity checks on IP/port */ - if (inet_pton(AF_INET,c->argv[2]->ptr, - &(((struct sockaddr_in *)&sa)->sin_addr))) - { - sa.ss_family = AF_INET; - } else if (inet_pton(AF_INET6,c->argv[2]->ptr, - &(((struct sockaddr_in6 *)&sa)->sin6_addr))) - { - sa.ss_family = AF_INET6; - } else { - addReplyError(c,"Invalid IP address in MEET"); - return; - } - if (getLongFromObjectOrReply(c, c->argv[3], &port, NULL) != REDIS_OK || - port < 0 || port > (65535-REDIS_CLUSTER_PORT_INCR)) - { + if (getLongFromObjectOrReply(c, c->argv[3], &port, NULL) != REDIS_OK) { addReplyError(c,"Invalid TCP port specified"); return; } - /* Finally add the node to the cluster with a random name, this - * will get fixed in the first handshake (ping/pong). */ - n = createClusterNode(NULL,REDIS_NODE_HANDSHAKE|REDIS_NODE_MEET); - - /* Set node->ip as the normalized string representation of the node - * IP address. 
*/ - if (sa.ss_family == AF_INET) - inet_ntop(AF_INET, - (void*)&(((struct sockaddr_in *)&sa)->sin_addr), - n->ip,REDIS_CLUSTER_IPLEN); - else - inet_ntop(AF_INET6, - (void*)&(((struct sockaddr_in6 *)&sa)->sin6_addr), - n->ip,REDIS_CLUSTER_IPLEN); - n->port = port; - clusterAddNode(n); - addReply(c,shared.ok); + if (clusterStartHandshake(c->argv[2]->ptr,port) == 0 && + errno == EINVAL) + { + addReplyError(c,"Invalid node address specified"); + } else { + addReply(c,shared.ok); + } } else if (!strcasecmp(c->argv[1]->ptr,"nodes") && c->argc == 2) { /* CLUSTER NODES */ robj *o; From dd10efb31a90f91b787f7d83a9960a4766ca4fee Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 20 Dec 2013 12:47:13 +0100 Subject: [PATCH 0398/2500] Redis Cluster: reconfigure replication when master changes address. --- src/cluster.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/cluster.c b/src/cluster.c index 019678d7b..85638d783 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -986,6 +986,14 @@ int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link, int port) { if (node->link) freeClusterLink(node->link); redisLog(REDIS_WARNING,"Address updated for node %.40s, now %s:%d", node->name, node->ip, node->port); + + /* Check if this is our master and we have to change the + * replication target as well. */ + if (server.cluster->myself->flags & REDIS_NODE_SLAVE && + server.cluster->myself->slaveof == node) + { + replicationSetMaster(node->ip, node->port); + } return 1; } From 8527ba1eeaa3aa40821bb554fa3a3e9cfaa2726a Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 20 Dec 2013 14:40:11 +0100 Subject: [PATCH 0399/2500] Redis Cluster: remove no longer relevant comment. 
--- src/cluster.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 85638d783..de2bda1c6 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1217,11 +1217,6 @@ int clusterProcessPacket(clusterLink *link) { link->node->port = 0; freeClusterLink(link); clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - /* FIXME: remove this node if we already have it. - * - * If we already have it but the IP is different, use - * the new one if the old node is in FAIL, PFAIL, or NOADDR - * status... */ return 0; } } From db016acb7ff0039072e73cfa2072638aaa9a1062 Mon Sep 17 00:00:00 2001 From: antirez Date: Sat, 21 Dec 2013 00:04:53 +0100 Subject: [PATCH 0400/2500] Redis Cluster: move node failure reports logging from VERBOSE to NOTICE level. --- src/cluster.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index de2bda1c6..6b0b59acd 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -898,14 +898,14 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { { if (flags & (REDIS_NODE_FAIL|REDIS_NODE_PFAIL)) { if (clusterNodeAddFailureReport(node,sender)) { - redisLog(REDIS_NOTICE, + redisLog(REDIS_VERBOSE, "Node %.40s reported node %.40s as not reachable.", sender->name, node->name); } markNodeAsFailingIfNeeded(node); } else { if (clusterNodeDelFailureReport(node,sender)) { - redisLog(REDIS_NOTICE, + redisLog(REDIS_VERBOSE, "Node %.40s reported node %.40s is back online.", sender->name, node->name); } From 74da5ee594edd462b7193513b3d964588d0d4ef8 Mon Sep 17 00:00:00 2001 From: antirez Date: Sat, 21 Dec 2013 00:23:37 +0100 Subject: [PATCH 0401/2500] Log when a slave lose the connection with its master. 
--- src/networking.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/networking.c b/src/networking.c index aee54b757..0661137d7 100644 --- a/src/networking.c +++ b/src/networking.c @@ -654,15 +654,16 @@ void freeClient(redisClient *c) { * * Note that before doing this we make sure that the client is not in * some unexpected state, by checking its flags. */ - if (server.master && - (c->flags & REDIS_MASTER) && - !(c->flags & (REDIS_CLOSE_AFTER_REPLY| - REDIS_CLOSE_ASAP| - REDIS_BLOCKED| - REDIS_UNBLOCKED))) - { - replicationCacheMaster(c); - return; + if (server.master && c->flags & REDIS_MASTER) { + redisLog(REDIS_WARNING,"Connection with master lost."); + if (!(c->flags & (REDIS_CLOSE_AFTER_REPLY| + REDIS_CLOSE_ASAP| + REDIS_BLOCKED| + REDIS_UNBLOCKED))) + { + replicationCacheMaster(c); + return; + } } /* Free the query buffer */ From a75b334bdf71128b6d636a2c59e55586bb33cdd8 Mon Sep 17 00:00:00 2001 From: antirez Date: Sun, 22 Dec 2013 10:05:16 +0100 Subject: [PATCH 0402/2500] Redis Cluster: add repl_ping_slave_period to slave data validity time. When the configured node timeout is very small, the data validity time (maximum data age for a slave to try a failover) is too little (ten times the configured node timeout) when the replication link with the master is mostly idle. In this case we'll receive some data from the master only every server.repl_ping_slave_period to refresh the last interaction with the master. This commit adds to the max data validity time the slave ping period to avoid this problem of slaves sensing too old data without a good reason. However this max data validity time is likely a setting that should be configurable by the Redis Cluster user in a way completely independent from the node timeout. 
--- src/cluster.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index 6b0b59acd..5831fb5c8 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1900,7 +1900,8 @@ void clusterHandleSlaveFailover(void) { * constant of ten times the node timeout since the cluster should * react much faster to a master down. */ if (data_age > - server.cluster_node_timeout * REDIS_CLUSTER_SLAVE_VALIDITY_MULT) + (server.repl_ping_slave_period * 1000) + + (server.cluster_node_timeout * REDIS_CLUSTER_SLAVE_VALIDITY_MULT)) return; /* Compute the time at which we can start an election. */ From 0bfe6badf5a575ea17849daf1a86eb11b7935910 Mon Sep 17 00:00:00 2001 From: antirez Date: Sun, 22 Dec 2013 10:15:35 +0100 Subject: [PATCH 0403/2500] Slave disconnection is an event worth logging. --- src/networking.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/networking.c b/src/networking.c index 0661137d7..f2913cf0c 100644 --- a/src/networking.c +++ b/src/networking.c @@ -666,6 +666,16 @@ void freeClient(redisClient *c) { } } + /* Log link disconnection with slave */ + if (c->flags & REDIS_SLAVE) { + char ip[REDIS_IP_STR_LEN]; + + if (anetPeerToString(c->fd,ip,sizeof(ip),NULL) == -1) + strncpy(ip,"?",REDIS_IP_STR_LEN); + redisLog(REDIS_WARNING,"Connection with slave %s:%d lost.", + ip, c->slave_listening_port); + } + /* Free the query buffer */ sdsfree(c->querybuf); c->querybuf = NULL; From c123005f8cc4a286686afe36db2fea5b283c8b4d Mon Sep 17 00:00:00 2001 From: antirez Date: Sun, 22 Dec 2013 11:43:25 +0100 Subject: [PATCH 0404/2500] Make new masters inherit replication offsets. Currently replication offsets could be used in a limited way in order to understand, out of a set of slaves, what is the one with the most updated data. For example this comparison is possible if N slaves were replicating all with the same master.
However the replication offset was not transferred from master to slaves (that are later promoted as masters) in any way, so for instance if there were three instances A, B, C, with A master and B and C replication from A, the following could happen: C disconnects from A. B is turned into master. A is switched to master of B. B receives some write. In this context there was no way to compare the offset of A and C, because B would use its own local master replication offset as replication offset to initialize the replication with A. With this commit what happens is that when B is turned into master it inherits the replication offset from A, making A and C comparable. In the above case assuming no inconsistencies are created during the disconnection and failover process, A will show to have a replication offset greater than C. Note that this does not mean offsets are always comparable to understand what is, in a set of instances, since in more complex examples the replica with the higher replication offset could be partitioned away when picking the instance to elect as new master. However this in general improves the ability of a system to try to pick a good replica to promote to master. --- src/replication.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/replication.c b/src/replication.c index a6c5d0d42..5725613be 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1256,6 +1256,7 @@ void replicationSetMaster(char *ip, int port) { freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */ cancelReplicationHandshake(); server.repl_state = REDIS_REPL_CONNECT; + server.master_repl_offset = 0; } /* Cancel replication, setting the instance as a master itself. */ @@ -1263,7 +1264,17 @@ void replicationUnsetMaster(void) { if (server.masterhost == NULL) return; /* Nothing to do. 
*/ sdsfree(server.masterhost); server.masterhost = NULL; - if (server.master) freeClient(server.master); + if (server.master) { + if (listLength(server.slaves) == 0) { + /* If this instance is turned into a master and there are no + * slaves, it inherits the replication offset from the master. + * Under certain conditions this makes replicas comparable by + * replication offset to understand what is the most updated. */ + server.master_repl_offset = server.master->reploff; + freeReplicationBacklog(); + } + freeClient(server.master); + } replicationDiscardCachedMaster(); cancelReplicationHandshake(); server.repl_state = REDIS_REPL_NONE; From ab003665047f1610be66835d17b9d2d2230083ea Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 23 Dec 2013 11:31:35 +0100 Subject: [PATCH 0405/2500] Configuring port to 0 disables IP socket as specified. This was no longer the case with 2.8 because of a bug introduced with the IPv6 support. Now it is fixed. This fixes issues #1287 and #1477. --- src/redis.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/redis.c b/src/redis.c index 98a6f67a3..e926e9c8d 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1593,7 +1593,8 @@ void initServer() { server.db = zmalloc(sizeof(redisDb)*server.dbnum); /* Open the TCP listening socket for the user commands. */ - if (listenToPort(server.port,server.ipfd,&server.ipfd_count) == REDIS_ERR) + if (server.port != 0 && + listenToPort(server.port,server.ipfd,&server.ipfd_count) == REDIS_ERR) exit(1); /* Open the listening Unix domain socket. */ From c571290943ed8a472dc129ae91db55091c158525 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 23 Dec 2013 12:48:39 +0100 Subject: [PATCH 0406/2500] Fix CONFIG REWRITE handling of unknown options. There were two problems with the implementation. 1) "save" was not correctly processed when no save point was configured, as reported in issue #1416.
2) The way the code checked if an option existed in the "processed" dictionary was wrong, as we add the element as a key associated with a NULL value, so dictFetchValue() can't be used to check for existence, but dictFind() must be used, which returns NULL only if the entry does not exist at all. --- src/config.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/config.c b/src/config.c index 3bd1d8592..d01b3e816 100644 --- a/src/config.c +++ b/src/config.c @@ -1440,6 +1440,8 @@ void rewriteConfigSaveOption(struct rewriteConfigState *state) { server.saveparams[j].seconds, server.saveparams[j].changes); rewriteConfigRewriteLine(state,"save",line,1); } + /* Mark "save" as processed in case server.saveparamslen is zero. */ + rewriteConfigMarkAsProcessed(state,"save"); } /* Rewrite the dir option, always using absolute paths.*/ @@ -1578,7 +1580,7 @@ void rewriteConfigRemoveOrphaned(struct rewriteConfigState *state) { /* Don't blank lines about options the rewrite process * don't understand. */ - if (dictFetchValue(state->rewritten,option) == NULL) { + if (dictFind(state->rewritten,option) == NULL) { redisLog(REDIS_DEBUG,"Not rewritten option: %s", option); continue; } From 98901950f99b962caa952a81a7102527bdd96f69 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 25 Dec 2013 17:57:36 +0100 Subject: [PATCH 0407/2500] Cluster: clusterProcessPacket() was not 80 cols friendly. The function actually needs to be split into sub-functions at some point in the future. --- src/cluster.c | 72 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 25 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 5831fb5c8..af4096f84 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1293,44 +1293,46 @@ int clusterProcessPacket(clusterLink *link) { * so that REDIS_NODE_MASTER flag will be set.
*/ /* Many checks are only needed if the set of served slots this - * instance claims is different compared to the set of slots we have for - * it. Check this ASAP to avoid other computational expansive checks later. */ - clusterNode *sender_master = NULL; /* Sender or its master if it is a slave. */ + * instance claims is different compared to the set of slots we have + * for it. Check this ASAP to avoid other computational expansive + * checks later. */ + clusterNode *sender_master = NULL; /* Sender or its master if slave. */ int dirty_slots = 0; /* Sender claimed slots don't match my view? */ if (sender) { - sender_master = (sender->flags & REDIS_NODE_MASTER) ? sender : - sender->slaveof; + sender_master = (sender->flags & REDIS_NODE_MASTER) ? + sender : sender->slaveof; if (sender_master) { dirty_slots = memcmp(sender_master->slots, hdr->myslots,sizeof(hdr->myslots)) != 0; } } - /* 1) If the sender of the message is a master, and we detected that the - * set of slots it claims changed, scan the slots to see if we need - * to update our configuration. */ + /* 1) If the sender of the message is a master, and we detected that + * the set of slots it claims changed, scan the slots to see if we + * need to update our configuration. */ if (sender && sender->flags & REDIS_NODE_MASTER && dirty_slots) { clusterUpdateSlotsConfigWith(sender,senderConfigEpoch,hdr->myslots); } - /* 2) We also check for the reverse condition, that is, the sender claims - * to serve slots we know are served by a master with a greater - * configEpoch. If this happens we inform the sender. + /* 2) We also check for the reverse condition, that is, the sender + * claims to serve slots we know are served by a master with a + * greater configEpoch. If this happens we inform the sender. 
* - * This is useful because sometimes after a partition heals, a reappearing - * master may be the last one to claim a given set of hash slots, but with - * a configuration that other instances know to be deprecated. Example: + * This is useful because sometimes after a partition heals, a + * reappearing master may be the last one to claim a given set of + * hash slots, but with a configuration that other instances know to + * be deprecated. Example: * * A and B are master and slave for slots 1,2,3. * A is partitioned away, B gets promoted. * B is partitioned away, and A returns available. * * Usually B would PING A publishing its set of served slots and its - * configEpoch, but because of the partition B can't inform A of the new - * configuration, so other nodes that have an updated table must do it. - * In this way A will stop to act as a master (or can try to failover if - * there are the conditions to win the election). */ + * configEpoch, but because of the partition B can't inform A of the + * new configuration, so other nodes that have an updated table must + * do it. In this way A will stop to act as a master (or can try to + * failover if there are the conditions to win the election). */ if (sender && dirty_slots) { int j; @@ -1363,7 +1365,8 @@ int clusterProcessPacket(clusterLink *link) { if (sender) { failing = clusterLookupNode(hdr->data.fail.about.nodename); - if (failing && !(failing->flags & (REDIS_NODE_FAIL|REDIS_NODE_MYSELF))) + if (failing && + !(failing->flags & (REDIS_NODE_FAIL|REDIS_NODE_MYSELF))) { redisLog(REDIS_NOTICE, "FAIL message received from %.40s about %.40s", @@ -1384,13 +1387,16 @@ int clusterProcessPacket(clusterLink *link) { /* Don't bother creating useless objects if there are no * Pub/Sub subscribers. 
*/ - if (dictSize(server.pubsub_channels) || listLength(server.pubsub_patterns)) { + if (dictSize(server.pubsub_channels) || + listLength(server.pubsub_patterns)) + { channel_len = ntohl(hdr->data.publish.msg.channel_len); message_len = ntohl(hdr->data.publish.msg.message_len); channel = createStringObject( (char*)hdr->data.publish.msg.bulk_data,channel_len); message = createStringObject( - (char*)hdr->data.publish.msg.bulk_data+channel_len, message_len); + (char*)hdr->data.publish.msg.bulk_data+channel_len, + message_len); pubsubPublishMessage(channel,message); decrRefCount(channel); decrRefCount(message); @@ -1424,7 +1430,8 @@ int clusterProcessPacket(clusterLink *link) { /* If in our current config the node is a slave, set it as a master. */ if (n->flags & REDIS_NODE_SLAVE) clusterSetNodeAsMaster(n); - /* Check the bitmap of served slots and udpate our config accordingly. */ + /* Check the bitmap of served slots and udpate our + * config accordingly. */ clusterUpdateSlotsConfigWith(n,reportedConfigEpoch, hdr->data.update.nodecfg.slots); } else { @@ -1824,21 +1831,34 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { if (server.cluster->myself->numslots == 0) return; /* Request epoch must be >= our currentEpoch. */ - if (requestCurrentEpoch < server.cluster->currentEpoch) return; + if (requestCurrentEpoch < server.cluster->currentEpoch) { + printf("REFUSED BECAUSE OF EPOCH\n"); + return; + } /* I already voted for this epoch? Return ASAP. */ - if (server.cluster->last_vote_epoch == server.cluster->currentEpoch) return; + if (server.cluster->last_vote_epoch == server.cluster->currentEpoch) { + printf("REFUSED BECAUSE ALREADY VOTED FOR EPOCH\n"); + return; + } /* Node must be a slave and its master down. 
*/ if (!(node->flags & REDIS_NODE_SLAVE) || master == NULL || - !(master->flags & REDIS_NODE_FAIL)) return; + !(master->flags & REDIS_NODE_FAIL)) + { + printf("REFUSED BECAUSE NOT A SLAVE OR MASTER NOT FAIL.\n"); + return; + } /* We did not voted for a slave about this master for two * times the node timeout. This is not strictly needed for correctness * of the algorithm but makes the base case more linear. */ if (mstime() - node->slaveof->voted_time < server.cluster_node_timeout * 2) + { + printf("REFUSED BECAUSE ALREADY VOTED WITHIN NODE_TIMEOUT*2.\n"); return; + } /* The slave requesting the vote must have a configEpoch for the claimed * slots that is >= the one of the masters currently serving the same @@ -1850,10 +1870,12 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* If we reached this point we found a slot that in our current slots * is served by a master with a greater configEpoch than the one claimed * by the slave requesting our vote. Refuse to vote for this slave. */ + printf("REFUSED BECAUSE SLAVE CONFIG EPOCH FOR SLOTS IS STALE.\n"); return; } /* We can vote for this slave. */ + printf("I VOTED.\n"); clusterSendFailoverAuth(node); server.cluster->last_vote_epoch = server.cluster->currentEpoch; node->slaveof->voted_time = mstime(); From b4bee62561ab1395e6d25622f9ed1f533a7db7b5 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 25 Dec 2013 18:13:27 +0100 Subject: [PATCH 0408/2500] anetTcpGenericConnect() code improved + 1 bug fix. Now the socket is closed if anetNonBlock() fails, and in general the code structure makes it harder to introduce this kind of bugs in the future. Reference: pull request #1059. 
--- src/anet.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/anet.c b/src/anet.c index 257b491e9..45b31e785 100644 --- a/src/anet.c +++ b/src/anet.c @@ -219,11 +219,11 @@ static int anetCreateSocket(char *err, int domain) { #define ANET_CONNECT_NONBLOCK 1 static int anetTcpGenericConnect(char *err, char *addr, int port, int flags) { - int s, rv; - char _port[6]; /* strlen("65535"); */ + int s = ANET_ERR, rv; + char _port[6]; /* strlen("65535") + 1; */ struct addrinfo hints, *servinfo, *p; - snprintf(_port,6,"%d",port); + snprintf(_port,sizeof(port),"%d",port); memset(&hints,0,sizeof(hints)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; @@ -233,20 +233,26 @@ static int anetTcpGenericConnect(char *err, char *addr, int port, int flags) return ANET_ERR; } for (p = servinfo; p != NULL; p = p->ai_next) { + /* Try to create the socket and to connect it. + * If we fail in the socket() call, or on connect(), we retry with + * the next entry in servinfo. */ if ((s = socket(p->ai_family,p->ai_socktype,p->ai_protocol)) == -1) continue; - - /* if we set err then goto cleanup, otherwise next */ if (anetSetReuseAddr(err,s) == ANET_ERR) goto error; if (flags & ANET_CONNECT_NONBLOCK && anetNonBlock(err,s) != ANET_OK) goto error; if (connect(s,p->ai_addr,p->ai_addrlen) == -1) { - if (errno == EINPROGRESS && flags & ANET_CONNECT_NONBLOCK) goto end; + /* If the socket is non-blocking, it is ok for connect() to + * return an EINPROGRESS error here. */ + if (errno == EINPROGRESS && flags & ANET_CONNECT_NONBLOCK) + goto end; close(s); + s = ANET_ERR; continue; } - /* break with the socket */ + /* If we ended an iteration of the for loop without errors, we + * have a connected socket. Let's return to the caller. 
*/ goto end; } if (p == NULL) { @@ -255,7 +261,10 @@ static int anetTcpGenericConnect(char *err, char *addr, int port, int flags) } error: - s = ANET_ERR; + if (s != ANET_ERR) { + close(s); + s = ANET_ERR; + } end: freeaddrinfo(servinfo); return s; From cf71d130a1b4bdeb5d7fd03c27eec2e10249d325 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 25 Dec 2013 18:24:04 +0100 Subject: [PATCH 0409/2500] Remove useless goto from anetTcpGenericConnect(). --- src/anet.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/anet.c b/src/anet.c index 45b31e785..75bebf7b1 100644 --- a/src/anet.c +++ b/src/anet.c @@ -255,10 +255,8 @@ static int anetTcpGenericConnect(char *err, char *addr, int port, int flags) * have a connected socket. Let's return to the caller. */ goto end; } - if (p == NULL) { + if (p == NULL) anetSetError(err, "creating socket: %s", strerror(errno)); - goto error; - } error: if (s != ANET_ERR) { From 9a1cfab59b755a5e1fd68acf033d61bb64421c06 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 25 Dec 2013 18:28:30 +0100 Subject: [PATCH 0410/2500] anetTcpGenericConnect() bug introduced in 9d19977 fixed. During a refactoring I misspelled _port for port. This is one of the reasons I never used _varname myself. 
--- src/anet.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/anet.c b/src/anet.c index 75bebf7b1..e37ae8064 100644 --- a/src/anet.c +++ b/src/anet.c @@ -220,15 +220,15 @@ static int anetCreateSocket(char *err, int domain) { static int anetTcpGenericConnect(char *err, char *addr, int port, int flags) { int s = ANET_ERR, rv; - char _port[6]; /* strlen("65535") + 1; */ + char portstr[6]; /* strlen("65535") + 1; */ struct addrinfo hints, *servinfo, *p; - snprintf(_port,sizeof(port),"%d",port); + snprintf(portstr,sizeof(portstr),"%d",port); memset(&hints,0,sizeof(hints)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; - if ((rv = getaddrinfo(addr,_port,&hints,&servinfo)) != 0) { + if ((rv = getaddrinfo(addr,portstr,&hints,&servinfo)) != 0) { anetSetError(err, "%s", gai_strerror(rv)); return ANET_ERR; } From 87b56174b9a17d3a391dfe5c57b98ccacc0f01d2 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 25 Dec 2013 18:39:49 +0100 Subject: [PATCH 0411/2500] anetPeerToString / SockName: port can be NULL on errors too. --- src/anet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/anet.c b/src/anet.c index e37ae8064..ef52d955d 100644 --- a/src/anet.c +++ b/src/anet.c @@ -488,7 +488,7 @@ int anetPeerToString(int fd, char *ip, size_t ip_len, int *port) { socklen_t salen = sizeof(sa); if (getpeername(fd,(struct sockaddr*)&sa,&salen) == -1) { - *port = 0; + if (port) *port = 0; ip[0] = '?'; ip[1] = '\0'; return -1; @@ -510,7 +510,7 @@ int anetSockName(int fd, char *ip, size_t ip_len, int *port) { socklen_t salen = sizeof(sa); if (getsockname(fd,(struct sockaddr*)&sa,&salen) == -1) { - *port = 0; + if (port) *port = 0; ip[0] = '?'; ip[1] = '\0'; return -1; From c0b9515805ad14d73a8d31e50438993fb533b572 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 25 Dec 2013 18:41:10 +0100 Subject: [PATCH 0412/2500] Log disconnection with slave only when ip:port is available. 
--- src/networking.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/networking.c b/src/networking.c index f2913cf0c..ff3681f52 100644 --- a/src/networking.c +++ b/src/networking.c @@ -670,10 +670,10 @@ void freeClient(redisClient *c) { if (c->flags & REDIS_SLAVE) { char ip[REDIS_IP_STR_LEN]; - if (anetPeerToString(c->fd,ip,sizeof(ip),NULL) == -1) - strncpy(ip,"?",REDIS_IP_STR_LEN); - redisLog(REDIS_WARNING,"Connection with slave %s:%d lost.", - ip, c->slave_listening_port); + if (anetPeerToString(c->fd,ip,sizeof(ip),NULL) != -1) { + redisLog(REDIS_WARNING,"Connection with slave %s:%d lost.", + ip, c->slave_listening_port); + } } /* Free the query buffer */ From c1a042fda9123e4f5aabdc2e642cf360f1cf0117 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 8 Jan 2014 14:11:02 +0100 Subject: [PATCH 0413/2500] Clarify a comment in slaveTryPartialResynchronization(). --- src/replication.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/replication.c b/src/replication.c index 5725613be..84bf7400a 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1005,7 +1005,7 @@ int slaveTryPartialResynchronization(int fd) { /* If we reach this point we receied either an error since the master does * not understand PSYNC, or an unexpected reply from the master. - * Reply with PSYNC_NOT_SUPPORTED in both cases. */ + * Return PSYNC_NOT_SUPPORTED to the caller in both cases. */ if (strncmp(reply,"-ERR",4)) { /* If it's not an error, log the unexpected event. */ From c0cdcaf3739c255afeb9e8406e4a72c77d9818b1 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 8 Jan 2014 14:25:06 +0100 Subject: [PATCH 0414/2500] Don't send REPLCONF ACK to old masters. Masters not understanding REPLCONF ACK will reply with errors to our requests causing a number of possible issues. This commit detects a global replication offest set to -1 at the end of the replication, and marks the client representing the master with the REDIS_PRE_PSYNC flag. 
Note that this flag was called REDIS_PRE_PSYNC_SLAVE but now it is just REDIS_PRE_PSYNC as it is used for both slaves and masters starting with this commit. This commit fixes issue #1488. --- src/redis.h | 2 +- src/replication.c | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/redis.h b/src/redis.h index 2db1eb3ea..3e928180e 100644 --- a/src/redis.h +++ b/src/redis.h @@ -231,7 +231,7 @@ #define REDIS_MASTER_FORCE_REPLY (1<<13) /* Queue replies even if is master */ #define REDIS_FORCE_AOF (1<<14) /* Force AOF propagation of current cmd. */ #define REDIS_FORCE_REPL (1<<15) /* Force replication of current cmd. */ -#define REDIS_PRE_PSYNC_SLAVE (1<<16) /* Slave don't understand PSYNC. */ +#define REDIS_PRE_PSYNC (1<<16) /* Instance don't understand PSYNC. */ /* Client block type (btype field in client structure) * if REDIS_BLOCKED flag is set. */ diff --git a/src/replication.c b/src/replication.c index 84bf7400a..f00cb0334 100644 --- a/src/replication.c +++ b/src/replication.c @@ -459,7 +459,7 @@ void syncCommand(redisClient *c) { /* If a slave uses SYNC, we are dealing with an old implementation * of the replication protocol (like redis-cli --slave). Flag the client * so that we don't expect to receive REPLCONF ACK feedbacks. */ - c->flags |= REDIS_PRE_PSYNC_SLAVE; + c->flags |= REDIS_PRE_PSYNC; } /* Full resynchronization. */ @@ -843,6 +843,10 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { server.master->reploff = server.repl_master_initial_offset; memcpy(server.master->replrunid, server.repl_master_runid, sizeof(server.repl_master_runid)); + /* If master offset is set to -1, this master is old and is not + * PSYNC capable, so we flag it accordingly. */ + if (server.master->reploff == -1) + server.master->flags |= REDIS_PRE_PSYNC; redisLog(REDIS_NOTICE, "MASTER <-> SLAVE sync: Finished with success"); /* Restart the AOF subsystem now that we finished the sync. 
This * will trigger an AOF rewrite, and when done will start appending @@ -1706,8 +1710,11 @@ void replicationCron(void) { } } - /* Send ACK to master from time to time. */ - if (server.masterhost && server.master) + /* Send ACK to master from time to time. + * Note that we do not send periodic acks to masters that don't + * support PSYNC and replication offsets. */ + if (server.masterhost && server.master && + !(server.master->flags & REDIS_PRE_PSYNC)) replicationSendAck(); /* If we have attached slaves, PING them from time to time. @@ -1751,7 +1758,7 @@ void replicationCron(void) { redisClient *slave = ln->value; if (slave->replstate != REDIS_REPL_ONLINE) continue; - if (slave->flags & REDIS_PRE_PSYNC_SLAVE) continue; + if (slave->flags & REDIS_PRE_PSYNC) continue; if ((server.unixtime - slave->repl_ack_time) > server.repl_timeout) { char ip[REDIS_IP_STR_LEN]; From 088a617c61d2c869cee35de452fca7b6bb8b2ba6 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 8 Jan 2014 17:16:04 +0100 Subject: [PATCH 0415/2500] Test: stress events flags to/from string conversion. 
--- tests/unit/pubsub.tcl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/unit/pubsub.tcl b/tests/unit/pubsub.tcl index 901668851..e2420830e 100644 --- a/tests/unit/pubsub.tcl +++ b/tests/unit/pubsub.tcl @@ -356,4 +356,15 @@ start_server {tags {"pubsub"}} { r config set maxmemory 0 $rd1 close } + + test "Keyspace notifications: test CONFIG GET/SET of event flags" { + r config set notify-keyspace-events gKE + assert_equal {gKE} [lindex [r config get notify-keyspace-events] 1] + r config set notify-keyspace-events {$lshzxeKE} + assert_equal {$lshzxeKE} [lindex [r config get notify-keyspace-events] 1] + r config set notify-keyspace-events KA + assert_equal {AK} [lindex [r config get notify-keyspace-events] 1] + r config set notify-keyspace-events EA + assert_equal {AE} [lindex [r config get notify-keyspace-events] 1] + } } From 937732d50a0c0e0f6f000c7fff6b0021b5e3832a Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 8 Jan 2014 17:18:00 +0100 Subject: [PATCH 0416/2500] Fix keyspace events flags-to-string conversion. Fixes issue #1491 on Github. 
--- src/notify.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/notify.c b/src/notify.c index bb598c4f2..f77239ecf 100644 --- a/src/notify.c +++ b/src/notify.c @@ -67,17 +67,19 @@ int keyspaceEventsStringToFlags(char *classes) { sds keyspaceEventsFlagsToString(int flags) { sds res; - if ((flags & REDIS_NOTIFY_ALL) == REDIS_NOTIFY_ALL) - return sdsnew("A"); res = sdsempty(); - if (flags & REDIS_NOTIFY_GENERIC) res = sdscatlen(res,"g",1); - if (flags & REDIS_NOTIFY_STRING) res = sdscatlen(res,"$",1); - if (flags & REDIS_NOTIFY_LIST) res = sdscatlen(res,"l",1); - if (flags & REDIS_NOTIFY_SET) res = sdscatlen(res,"s",1); - if (flags & REDIS_NOTIFY_HASH) res = sdscatlen(res,"h",1); - if (flags & REDIS_NOTIFY_ZSET) res = sdscatlen(res,"z",1); - if (flags & REDIS_NOTIFY_EXPIRED) res = sdscatlen(res,"x",1); - if (flags & REDIS_NOTIFY_EVICTED) res = sdscatlen(res,"e",1); + if ((flags & REDIS_NOTIFY_ALL) == REDIS_NOTIFY_ALL) { + res = sdscatlen(res,"A",1); + } else { + if (flags & REDIS_NOTIFY_GENERIC) res = sdscatlen(res,"g",1); + if (flags & REDIS_NOTIFY_STRING) res = sdscatlen(res,"$",1); + if (flags & REDIS_NOTIFY_LIST) res = sdscatlen(res,"l",1); + if (flags & REDIS_NOTIFY_SET) res = sdscatlen(res,"s",1); + if (flags & REDIS_NOTIFY_HASH) res = sdscatlen(res,"h",1); + if (flags & REDIS_NOTIFY_ZSET) res = sdscatlen(res,"z",1); + if (flags & REDIS_NOTIFY_EXPIRED) res = sdscatlen(res,"x",1); + if (flags & REDIS_NOTIFY_EVICTED) res = sdscatlen(res,"e",1); + } if (flags & REDIS_NOTIFY_KEYSPACE) res = sdscatlen(res,"K",1); if (flags & REDIS_NOTIFY_KEYEVENT) res = sdscatlen(res,"E",1); return res; From ed3c6c012466d96fd08f502fa08f832e230db90e Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 9 Jan 2014 11:09:23 +0100 Subject: [PATCH 0417/2500] Fix RESTORE ttl handling in 32 bit archs. long was used instead of long long in order to handle a 64 bit resolution millisecond timestamp. This fixes issue #1483. 
--- src/cluster.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index af4096f84..66cfe4ff6 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -2959,7 +2959,7 @@ void dumpCommand(redisClient *c) { /* RESTORE key ttl serialized-value [REPLACE] */ void restoreCommand(redisClient *c) { - long ttl; + long long ttl; rio payload; int j, type, replace = 0; robj *obj; @@ -2981,7 +2981,7 @@ void restoreCommand(redisClient *c) { } /* Check if the TTL value makes sense */ - if (getLongFromObjectOrReply(c,c->argv[2],&ttl,NULL) != REDIS_OK) { + if (getLongLongFromObjectOrReply(c,c->argv[2],&ttl,NULL) != REDIS_OK) { return; } else if (ttl < 0) { addReplyError(c,"Invalid TTL value, must be >= 0"); @@ -2989,7 +2989,8 @@ void restoreCommand(redisClient *c) { } /* Verify RDB version and data checksum. */ - if (verifyDumpPayload(c->argv[3]->ptr,sdslen(c->argv[3]->ptr)) == REDIS_ERR) { + if (verifyDumpPayload(c->argv[3]->ptr,sdslen(c->argv[3]->ptr)) == REDIS_ERR) + { addReplyError(c,"DUMP payload version or checksum are wrong"); return; } From e9786a3255991d2247a71f4701b93de11782229a Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 9 Jan 2014 11:19:03 +0100 Subject: [PATCH 0418/2500] Test: regression for issues #1483. 
--- tests/unit/dump.tcl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/unit/dump.tcl b/tests/unit/dump.tcl index 5c7291361..dbe4e1dbf 100644 --- a/tests/unit/dump.tcl +++ b/tests/unit/dump.tcl @@ -16,6 +16,16 @@ start_server {tags {"dump"}} { r get foo } {bar} + test {RESTORE can set an expire that overflows a 32 bit integer} { + r set foo bar + set encoded [r dump foo] + r del foo + r restore foo 2569591501 $encoded + set ttl [r pttl foo] + assert {$ttl >= (2569591501-3000) && $ttl <= 2569591501} + r get foo + } {bar} + test {RESTORE returns an error of the key already exists} { r set foo bar set e {} From 1f73921d24e49e1300f4951bce472caf46d776fd Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 10 Jan 2014 14:13:04 +0100 Subject: [PATCH 0419/2500] Trip comment to 80 cols in SentinelCommand(). --- src/sentinel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index 44510a23e..d4877e19d 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2373,7 +2373,7 @@ void sentinelCommand(redisClient *c) { return; addReplyDictOfRedisInstances(c,ri->sentinels); } else if (!strcasecmp(c->argv[1]->ptr,"is-master-down-by-addr")) { - /* SENTINEL IS-MASTER-DOWN-BY-ADDR */ + /* SENTINEL IS-MASTER-DOWN-BY-ADDR */ sentinelRedisInstance *ri; long long req_epoch; uint64_t leader_epoch = 0; From 46429f36a75b502e84cd0cc71ebc3756bceba18f Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 10 Jan 2014 14:31:41 +0100 Subject: [PATCH 0420/2500] Add all the configurable fields to addReplySentinelRedisInstance(). Note: the auth password with the master is voluntarily not exposed. 
--- src/sentinel.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/sentinel.c b/src/sentinel.c index d4877e19d..8bc262ce4 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2268,6 +2268,30 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) { addReplyBulkCString(c,"quorum"); addReplyBulkLongLong(c,ri->quorum); fields++; + + addReplyBulkCString(c,"down-after-milliseconds"); + addReplyBulkLongLong(c,ri->down_after_period); + fields++; + + addReplyBulkCString(c,"failover-timeout"); + addReplyBulkLongLong(c,ri->failover_timeout); + fields++; + + addReplyBulkCString(c,"parallel-syncs"); + addReplyBulkLongLong(c,ri->parallel_syncs); + fields++; + + if (ri->notification_script) { + addReplyBulkCString(c,"notification-script"); + addReplyBulkCString(c,ri->notification_script); + fields++; + } + + if (ri->client_reconfig_script) { + addReplyBulkCString(c,"client-reconfig-script"); + addReplyBulkCString(c,ri->client_reconfig_script); + fields++; + } } /* Only slaves */ From 7d7e3f00e02f1a86e11a412a4db99ee25e0c5c66 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 10 Jan 2014 14:41:44 +0100 Subject: [PATCH 0421/2500] Sentinel: added SENTINEL MASTER command. With SENTINEL MASTERS it was already possible to list all the configured masters, but not a specific one. 
--- src/sentinel.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index 8bc262ce4..4966a1dbd 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2378,8 +2378,14 @@ void sentinelCommand(redisClient *c) { if (!strcasecmp(c->argv[1]->ptr,"masters")) { /* SENTINEL MASTERS */ if (c->argc != 2) goto numargserr; - addReplyDictOfRedisInstances(c,sentinel.masters); + } else if (!strcasecmp(c->argv[1]->ptr,"master")) { + /* SENTINEL MASTER */ + sentinelRedisInstance *ri; + + if ((ri = sentinelGetMasterByNameOrReplyError(c,c->argv[2])) + == NULL) return; + addReplySentinelRedisInstance(c,ri); } else if (!strcasecmp(c->argv[1]->ptr,"slaves")) { /* SENTINEL SLAVES */ sentinelRedisInstance *ri; From 057392f8765c7b3f11d8f3a7e330d1380b582f4a Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 10 Jan 2014 15:02:39 +0100 Subject: [PATCH 0422/2500] anetResolveIP() added to anet.c. The new function is used when we want to normalize an IP address without performing a DNS lookup if the string to resolve is not a valid IP. This is useful every time only IPs are valid inputs or when we want to skip DNS resolution that is slow during runtime operations if we are required to block. --- src/anet.c | 19 ++++++++++++++++++- src/anet.h | 4 ++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/anet.c b/src/anet.c index ef52d955d..fc585c8d1 100644 --- a/src/anet.c +++ b/src/anet.c @@ -163,12 +163,21 @@ int anetTcpKeepAlive(char *err, int fd) return ANET_OK; } -int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len) +/* anetGenericResolve() is called by anetResolve() and anetResolveIP() to + * do the actual work. It resolves the hostname "host" and set the string + * representation of the IP address into the buffer pointed by "ipbuf". + * + * If flags is set to ANET_IP_ONLY the function only resolves hostnames + * that are actually already IPv4 or IPv6 addresses. 
This turns the function + * into a validating / normalizing function. */ +int anetGenericResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len, + int flags) { struct addrinfo hints, *info; int rv; memset(&hints,0,sizeof(hints)); + if (flags & ANET_IP_ONLY) hints.ai_flags = AI_NUMERICHOST; hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; /* specify socktype to avoid dups */ @@ -188,6 +197,14 @@ int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len) return ANET_OK; } +int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len) { + return anetGenericResolve(err,host,ipbuf,ipbuf_len,ANET_NONE); +} + +int anetResolveIP(char *err, char *host, char *ipbuf, size_t ipbuf_len) { + return anetGenericResolve(err,host,ipbuf,ipbuf_len,ANET_IP_ONLY); +} + static int anetSetReuseAddr(char *err, int fd) { int yes = 1; /* Make sure connection-intensive things like the redis benckmark diff --git a/src/anet.h b/src/anet.h index b23411cbb..f0ab63ab7 100644 --- a/src/anet.h +++ b/src/anet.h @@ -35,6 +35,10 @@ #define ANET_ERR -1 #define ANET_ERR_LEN 256 +/* Flags used with certain functions. */ +#define ANET_NONE 0 +#define ANET_IP_ONLY (1<<0) + #if defined(__sun) #define AF_LOCAL AF_UNIX #endif From 61302ba560c35d5d93a128d9f242662d581e181f Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 10 Jan 2014 15:16:55 +0100 Subject: [PATCH 0423/2500] Sentinel: SENTINEL MONITOR command implemented. It allows to add new masters to monitor at runtime. 
--- src/sentinel.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/sentinel.c b/src/sentinel.c index 4966a1dbd..6c328356c 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2486,6 +2486,40 @@ void sentinelCommand(redisClient *c) { if (c->argc != 2) goto numargserr; sentinelPendingScriptsCommand(c); + } else if (!strcasecmp(c->argv[1]->ptr,"monitor")) { + /* SENTINEL MONITOR */ + long quorum, port; + char buf[32]; + + if (c->argc != 6) goto numargserr; + if (getLongFromObjectOrReply(c,c->argv[5],&quorum,"Invalid quorum") + != REDIS_OK) return; + if (getLongFromObjectOrReply(c,c->argv[4],&port,"Invalid port") + != REDIS_OK) return; + /* Make sure the IP field is actually a valid IP before passing it + * to createSentinelRedisInstance(), otherwise we may trigger a + * DNS lookup at runtime. */ + if (anetResolveIP(NULL,c->argv[3]->ptr,buf,sizeof(buf)) == ANET_ERR) { + addReplyError(c,"Invalid IP address specified"); + return; + } + if (createSentinelRedisInstance(c->argv[2]->ptr,SRI_MASTER, + c->argv[3]->ptr,port,quorum,NULL) == NULL) + { + switch(errno) { + case EBUSY: + addReplyError(c,"Duplicated master name"); + break; + case EINVAL: + addReplyError(c,"Invalid port number"); + break; + default: + addReplyError(c,"Unspecified error adding the instance"); + break; + } + } else { + addReply(c,shared.ok); + } } else { addReplyErrorFormat(c,"Unknown sentinel subcommand '%s'", (char*)c->argv[1]->ptr); From 7dae2c368177626e87e3e109cf8dac7ccd2e685d Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 10 Jan 2014 15:18:41 +0100 Subject: [PATCH 0424/2500] anetResolveIP() prototype added to anet.h. 
--- src/anet.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/anet.h b/src/anet.h index f0ab63ab7..2ab9398ad 100644 --- a/src/anet.h +++ b/src/anet.h @@ -49,6 +49,7 @@ int anetUnixConnect(char *err, char *path); int anetUnixNonBlockConnect(char *err, char *path); int anetRead(int fd, char *buf, int count); int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len); +int anetResolveIP(char *err, char *host, char *ipbuf, size_t ipbuf_len); int anetTcpServer(char *err, int port, char *bindaddr); int anetTcp6Server(char *err, int port, char *bindaddr); int anetUnixServer(char *err, char *path, mode_t perm); From 282b2b4660d9bbcea58983e825f292e5560a747d Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 10 Jan 2014 15:22:06 +0100 Subject: [PATCH 0425/2500] Sentinel: flush config on disk when new master is added. --- src/sentinel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sentinel.c b/src/sentinel.c index 6c328356c..eaf15daa1 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2518,6 +2518,7 @@ void sentinelCommand(redisClient *c) { break; } } else { + sentinelFlushConfig(); addReply(c,shared.ok); } } else { From 23066081673b10a68e22ade2fa0d676519e934a8 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 10 Jan 2014 15:33:42 +0100 Subject: [PATCH 0426/2500] Sentinel: releaseSentinelRedisInstance() top comment fixed. The claim about unlinking the instance from the connected hash tables was the opposite of the reality. Also the current actual behavior is safer in most cases, so it is better to manually unlink when needed. --- src/sentinel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index eaf15daa1..278e1ef4c 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -940,7 +940,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * } /* Release this instance and all its slaves, sentinels, hiredis connections. 
- * This function also takes care of unlinking the instance from the main + * This function does not take care of unlinking the instance from the main * masters table (if it is a master) or from its master sentinels/slaves table * if it is a slave or sentinel. */ void releaseSentinelRedisInstance(sentinelRedisInstance *ri) { From 2c6c1b12716ee0cd16b854fd549c8f675e4f605c Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 10 Jan 2014 15:39:10 +0100 Subject: [PATCH 0427/2500] Sentinel: SENTINEL REMOVE command added. The command totally removes a monitored master. --- src/sentinel.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/sentinel.c b/src/sentinel.c index 278e1ef4c..174a45769 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2521,6 +2521,15 @@ void sentinelCommand(redisClient *c) { sentinelFlushConfig(); addReply(c,shared.ok); } + } else if (!strcasecmp(c->argv[1]->ptr,"remove")) { + /* SENTINEL REMOVE */ + sentinelRedisInstance *ri; + + if ((ri = sentinelGetMasterByNameOrReplyError(c,c->argv[2])) + == NULL) return; + dictDelete(sentinel.masters,c->argv[2]->ptr); + sentinelFlushConfig(); + addReply(c,shared.ok); } else { addReplyErrorFormat(c,"Unknown sentinel subcommand '%s'", (char*)c->argv[1]->ptr); From 1642da19bb04e07a036d028fb4d458f576c4b02f Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 13 Jan 2014 11:05:13 +0100 Subject: [PATCH 0428/2500] Sentinel: fix wrong arity error message. 
--- src/sentinel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index 174a45769..3df4d3a08 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2537,7 +2537,7 @@ void sentinelCommand(redisClient *c) { return; numargserr: - addReplyErrorFormat(c,"Wrong number of commands for 'sentinel %s'", + addReplyErrorFormat(c,"Wrong number of arguments for 'sentinel %s'", (char*)c->argv[1]->ptr); } From 74f84e3a3ddf7aa55d99dbf63e9619d6887710f0 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 13 Jan 2014 11:50:38 +0100 Subject: [PATCH 0429/2500] SENTINEL SET implemented. The new command allows to change master-specific configurations at runtime. All the settable parameters can be retrieved via the SENTINEL MASTER command, so there is no equivalent "GET" command. --- src/sentinel.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/src/sentinel.c b/src/sentinel.c index 3df4d3a08..f2fe1540c 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -368,6 +368,7 @@ dictType leaderVotesDictType = { void sentinelCommand(redisClient *c); void sentinelInfoCommand(redisClient *c); +void sentinelSetCommand(redisClient *c); struct redisCommand sentinelcmds[] = { {"ping",pingCommand,1,"",0,NULL,0,0,0,0,0}, @@ -2530,6 +2531,9 @@ void sentinelCommand(redisClient *c) { dictDelete(sentinel.masters,c->argv[2]->ptr); sentinelFlushConfig(); addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"set")) { + if (c->argc < 3 || c->argc % 2 == 0) goto numargserr; + sentinelSetCommand(c); } else { addReplyErrorFormat(c,"Unknown sentinel subcommand '%s'", (char*)c->argv[1]->ptr); @@ -2541,6 +2545,7 @@ numargserr: (char*)c->argv[1]->ptr); } +/* SENTINEL INFO [section] */ void sentinelInfoCommand(redisClient *c) { char *section = c->argc == 2 ? 
c->argv[1]->ptr : "default"; sds info = sdsempty(); @@ -2600,6 +2605,79 @@ void sentinelInfoCommand(redisClient *c) { addReply(c,shared.crlf); } +/* SENTINEL SET [