From 90b9f08e5d1657e7bfffe43f31f6663bf469ee75 Mon Sep 17 00:00:00 2001 From: filipe oliveira Date: Thu, 31 Dec 2020 14:53:43 +0000 Subject: [PATCH] Add errorstats info section, Add failed_calls and rejected_calls to commandstats (#8217) This Commit pushes forward the observability on overall error statistics and command statistics within redis-server: It extends INFO COMMANDSTATS to have - failed_calls in - so we can keep track of errors that happen from the command itself, broken by command. - rejected_calls - so we can keep track of errors that were triggered outside the commmand processing per se Adds a new section to INFO, named ERRORSTATS that enables keeping track of the different errors that occur within redis ( within processCommand and call ) based on the reply Error Prefix ( The first word after the "-", up to the first space ). This commit also fixes RM_ReplyWithError so that it can be correctly identified as an error reply. --- src/config.c | 1 + src/module.c | 24 +++-- src/networking.c | 18 ++++ src/server.c | 63 ++++++++++++- src/server.h | 10 ++- tests/cluster/tests/18-info.tcl | 36 ++++++++ tests/support/util.tcl | 6 ++ tests/test_helper.tcl | 1 + tests/unit/info.tcl | 153 ++++++++++++++++++++++++++++++++ 9 files changed, 294 insertions(+), 18 deletions(-) create mode 100644 tests/cluster/tests/18-info.tcl create mode 100644 tests/unit/info.tcl diff --git a/src/config.c b/src/config.c index a92b41624..9d078bfe3 100644 --- a/src/config.c +++ b/src/config.c @@ -2560,6 +2560,7 @@ NULL } else if (!strcasecmp(c->argv[1]->ptr,"resetstat") && c->argc == 2) { resetServerStats(); resetCommandTableStats(); + resetErrorTableStats(); addReply(c,shared.ok); } else if (!strcasecmp(c->argv[1]->ptr,"rewrite") && c->argc == 2) { if (server.configfile == NULL) { diff --git a/src/module.c b/src/module.c index 11f5f4489..913b4de5d 100644 --- a/src/module.c +++ b/src/module.c @@ -1368,18 +1368,6 @@ int RM_ReplyWithLongLong(RedisModuleCtx *ctx, long long ll) { return REDISMODULE_OK; } -/* Reply with an error or simple string (status message). Used to implement - * ReplyWithSimpleString() and ReplyWithError(). - * The function always returns REDISMODULE_OK. */ -int replyWithStatus(RedisModuleCtx *ctx, const char *msg, char *prefix) { - client *c = moduleGetReplyClient(ctx); - if (c == NULL) return REDISMODULE_OK; - addReplyProto(c,prefix,strlen(prefix)); - addReplyProto(c,msg,strlen(msg)); - addReplyProto(c,"\r\n",2); - return REDISMODULE_OK; -} - /* Reply with the error 'err'. * * Note that 'err' must contain all the error, including @@ -1395,7 +1383,10 @@ int replyWithStatus(RedisModuleCtx *ctx, const char *msg, char *prefix) { * The function always returns REDISMODULE_OK. */ int RM_ReplyWithError(RedisModuleCtx *ctx, const char *err) { - return replyWithStatus(ctx,err,"-"); + client *c = moduleGetReplyClient(ctx); + if (c == NULL) return REDISMODULE_OK; + addReplyErrorFormat(c,"-%s",err); + return REDISMODULE_OK; } /* Reply with a simple string (+... \r\n in RESP protocol). This replies @@ -1404,7 +1395,12 @@ int RM_ReplyWithError(RedisModuleCtx *ctx, const char *err) { * * The function always returns REDISMODULE_OK. */ int RM_ReplyWithSimpleString(RedisModuleCtx *ctx, const char *msg) { - return replyWithStatus(ctx,msg,"+"); + client *c = moduleGetReplyClient(ctx); + if (c == NULL) return REDISMODULE_OK; + addReplyProto(c,"+",1); + addReplyProto(c,msg,strlen(msg)); + addReplyProto(c,"\r\n",2); + return REDISMODULE_OK; } /* Reply with an array type of 'len' elements. However 'len' other calls diff --git a/src/networking.c b/src/networking.c index 34e8b8481..7957cc82f 100644 --- a/src/networking.c +++ b/src/networking.c @@ -405,6 +405,24 @@ void addReplyErrorLength(client *c, const char *s, size_t len) { /* Do some actions after an error reply was sent (Log if needed, updates stats, etc.) */ void afterErrorReply(client *c, const char *s, size_t len) { + /* Increment the global error counter */ + server.stat_total_error_replies++; + /* Increment the error stats + * If the string already starts with "-..." then the error prefix + * is provided by the caller ( we limit the search to 32 chars). Otherwise we use "-ERR". */ + if (s[0] != '-') { + incrementErrorCount("ERR", 3); + } else { + char *spaceloc = memchr(s, ' ', len < 32 ? len : 32); + if (spaceloc) { + const size_t errEndPos = (size_t)(spaceloc - s); + incrementErrorCount(s+1, errEndPos-1); + } else { + /* Fallback to ERR if we can't retrieve the error prefix */ + incrementErrorCount("ERR", 3); + } + } + /* Sometimes it could be normal that a slave replies to a master with * an error and this function gets called. Actually the error will never * be sent because addReply*() against master clients has no effect... diff --git a/src/server.c b/src/server.c index 257a39f71..dbcaa767d 100644 --- a/src/server.c +++ b/src/server.c @@ -2952,6 +2952,7 @@ void resetServerStats(void) { atomicSet(server.stat_net_input_bytes, 0); atomicSet(server.stat_net_output_bytes, 0); server.stat_unexpected_error_replies = 0; + server.stat_total_error_replies = 0; server.stat_dump_payload_sanitizations = 0; server.aof_delayed_fsync = 0; server.blocked_last_cron = 0; @@ -2985,6 +2986,7 @@ void initServer(void) { server.in_fork_child = CHILD_TYPE_NONE; server.main_thread_id = pthread_self(); server.current_client = NULL; + server.errors = raxNew(); server.fixed_time_expire = 0; server.clients = listCreate(); server.clients_index = raxNew(); @@ -3291,11 +3293,18 @@ void resetCommandTableStats(void) { c = (struct redisCommand *) dictGetVal(de); c->microseconds = 0; c->calls = 0; + c->rejected_calls = 0; + c->failed_calls = 0; } dictReleaseIterator(di); } +void resetErrorTableStats(void) { + raxFree(server.errors); + server.errors = raxNew(); +} + /* ========================== Redis OP Array API ============================ */ void redisOpArrayInit(redisOpArray *oa) { @@ -3490,6 +3499,7 @@ void call(client *c, int flags) { ustime_t start, duration; int client_old_flags = c->flags; struct redisCommand *real_cmd = c->cmd; + static long long prev_err_count; server.fixed_time_expire++; @@ -3510,6 +3520,7 @@ void call(client *c, int flags) { /* Call the command. */ dirty = server.dirty; + prev_err_count = server.stat_total_error_replies; updateCachedTime(0); start = server.ustime; c->cmd->proc(c); @@ -3517,6 +3528,14 @@ void call(client *c, int flags) { dirty = server.dirty-dirty; if (dirty < 0) dirty = 0; + /* Update failed command calls if required. + * We leverage a static variable (prev_err_count) to retain + * the counter across nested function calls and avoid logging + * the same error twice. */ + if ((server.stat_total_error_replies - prev_err_count) > 0) { + real_cmd->failed_calls++; + } + /* After executing command, we will close the client after writing entire * reply if it is set 'CLIENT_CLOSE_AFTER_COMMAND' flag. */ if (c->flags & CLIENT_CLOSE_AFTER_COMMAND) { @@ -3655,6 +3674,7 @@ void call(client *c, int flags) { server.fixed_time_expire--; server.stat_numcommands++; + prev_err_count = server.stat_total_error_replies; /* Record peak memory after each command and before the eviction that runs * before the next command. */ @@ -3670,6 +3690,7 @@ void call(client *c, int flags) { * Note: 'reply' is expected to end with \r\n */ void rejectCommand(client *c, robj *reply) { flagTransaction(c); + if (c->cmd) c->cmd->rejected_calls++; if (c->cmd && c->cmd->proc == execCommand) { execCommandAbort(c, reply->ptr); } else { @@ -3679,6 +3700,7 @@ void rejectCommand(client *c, robj *reply) { } void rejectCommandFormat(client *c, const char *fmt, ...) { + if (c->cmd) c->cmd->rejected_calls++; flagTransaction(c); va_list ap; va_start(ap,fmt); @@ -3805,6 +3827,7 @@ int processCommand(client *c) { flagTransaction(c); } clusterRedirectClient(c,n,hashslot,error_code); + c->cmd->rejected_calls++; return C_OK; } } @@ -3962,9 +3985,22 @@ int processCommand(client *c) { if (listLength(server.ready_keys)) handleClientsBlockedOnKeys(); } + return C_OK; } +/* ====================== Error lookup and execution ===================== */ + +void incrementErrorCount(const char *fullerr, size_t namelen) { + struct redisError *error = raxFind(server.errors,(unsigned char*)fullerr,namelen); + if (error == raxNotFound) { + error = zmalloc(sizeof(*error)); + error->count = 0; + raxInsert(server.errors,(unsigned char*)fullerr,namelen,error,NULL); + } + error->count++; +} + /*================================== Shutdown =============================== */ /* Close listening sockets. Also unlink the unix domain socket if @@ -4681,6 +4717,7 @@ sds genRedisInfoString(const char *section) { "tracking_total_items:%lld\r\n" "tracking_total_prefixes:%lld\r\n" "unexpected_error_replies:%lld\r\n" + "total_error_replies:%lld\r\n" "dump_payload_sanitizations:%lld\r\n" "total_reads_processed:%lld\r\n" "total_writes_processed:%lld\r\n" @@ -4718,6 +4755,7 @@ sds genRedisInfoString(const char *section) { (unsigned long long) trackingGetTotalItems(), (unsigned long long) trackingGetTotalPrefixes(), server.stat_unexpected_error_replies, + server.stat_total_error_replies, server.stat_dump_payload_sanitizations, stat_total_reads_processed, stat_total_writes_processed, @@ -4908,14 +4946,33 @@ sds genRedisInfoString(const char *section) { di = dictGetSafeIterator(server.commands); while((de = dictNext(di)) != NULL) { c = (struct redisCommand *) dictGetVal(de); - if (!c->calls) continue; + if (!c->calls && !c->failed_calls && !c->rejected_calls) + continue; info = sdscatprintf(info, - "cmdstat_%s:calls=%lld,usec=%lld,usec_per_call=%.2f\r\n", + "cmdstat_%s:calls=%lld,usec=%lld,usec_per_call=%.2f" + ",rejected_calls=%lld,failed_calls=%lld\r\n", c->name, c->calls, c->microseconds, - (c->calls == 0) ? 0 : ((float)c->microseconds/c->calls)); + (c->calls == 0) ? 0 : ((float)c->microseconds/c->calls), + c->rejected_calls, c->failed_calls); } dictReleaseIterator(di); } + /* Error statistics */ + if (allsections || defsections || !strcasecmp(section,"errorstats")) { + if (sections++) info = sdscat(info,"\r\n"); + info = sdscat(info, "# Errorstats\r\n"); + raxIterator ri; + raxStart(&ri,server.errors); + raxSeek(&ri,"^",NULL,0); + struct redisError *e; + while(raxNext(&ri)) { + e = (struct redisError *) ri.data; + info = sdscatprintf(info, + "errorstat_%.*s:count=%lld\r\n", + (int)ri.key_len, ri.key, e->count); + } + raxStop(&ri); + } /* Cluster */ if (allsections || defsections || !strcasecmp(section,"cluster")) { diff --git a/src/server.h b/src/server.h index 1dc761959..5f6b21ec4 100644 --- a/src/server.h +++ b/src/server.h @@ -1122,6 +1122,7 @@ struct redisServer { dict *commands; /* Command table */ dict *orig_commands; /* Command table before command renaming. */ aeEventLoop *el; + rax *errors; /* Errors table */ redisAtomic unsigned int lruclock; /* Clock for LRU eviction */ volatile sig_atomic_t shutdown_asap; /* SHUTDOWN needed ASAP */ int activerehashing; /* Incremental rehash in serverCron() */ @@ -1231,6 +1232,7 @@ struct redisServer { size_t stat_module_cow_bytes; /* Copy on write bytes during module fork. */ uint64_t stat_clients_type_memory[CLIENT_TYPE_COUNT];/* Mem usage by type */ long long stat_unexpected_error_replies; /* Number of unexpected (aof-loading, replica to master, etc.) error replies */ + long long stat_total_error_replies; /* Total number of issued error replies ( command + rejected errors ) */ long long stat_dump_payload_sanitizations; /* Number deep dump payloads integrity validations. */ long long stat_io_reads_processed; /* Number of read events processed by IO / Main threads */ long long stat_io_writes_processed; /* Number of write events processed by IO / Main threads */ @@ -1579,7 +1581,7 @@ struct redisCommand { int firstkey; /* The first argument that's a key (0 = no keys) */ int lastkey; /* The last argument that's a key */ int keystep; /* The step between first and last key */ - long long microseconds, calls; + long long microseconds, calls, rejected_calls, failed_calls; int id; /* Command ID. This is a progressive ID starting from 0 that is assigned at runtime, and is used in order to check ACLs. A connection is able to execute a given command if @@ -1587,6 +1589,10 @@ struct redisCommand { bit set in the bitmap of allowed commands. */ }; +struct redisError { + long long count; +}; + struct redisFunctionSym { char *name; unsigned long pointer; @@ -2132,7 +2138,9 @@ void updateDictResizePolicy(void); int htNeedsResize(dict *dict); void populateCommandTable(void); void resetCommandTableStats(void); +void resetErrorTableStats(void); void adjustOpenFilesLimit(void); +void incrementErrorCount(const char *fullerr, size_t namelen); void closeListeningSockets(int unlink_unix_socket); void updateCachedTime(int update_daylight_info); void resetServerStats(void); diff --git a/tests/cluster/tests/18-info.tcl b/tests/cluster/tests/18-info.tcl new file mode 100644 index 000000000..83dbf833f --- /dev/null +++ b/tests/cluster/tests/18-info.tcl @@ -0,0 +1,36 @@ +# Check cluster info stats + +source "../tests/includes/init-tests.tcl" + +test "Create a primary with a replica" { + create_cluster 2 0 +} + +test "Cluster should start ok" { + assert_cluster_state ok +} + +set primary1 [Rn 0] +set primary2 [Rn 1] + +proc cmdstat {instace cmd} { + return [cmdrstat $cmd $instace] +} + +proc errorstat {instace cmd} { + return [errorrstat $cmd $instace] +} + +test "errorstats: rejected call due to MOVED Redirection" { + $primary1 config resetstat + $primary2 config resetstat + assert_match {} [errorstat $primary1 MOVED] + assert_match {} [errorstat $primary2 MOVED] + # we know that the primary 2 will have a MOVED reply + catch {$primary1 set key{0x0000} b} replyP1 + catch {$primary2 set key{0x0000} b} replyP2 + assert_match {OK} $replyP1 + assert_match {} [errorstat $primary1 MOVED] + assert_match {*count=1*} [errorstat $primary2 MOVED] + assert_match {*calls=0,*,rejected_calls=1,failed_calls=0} [cmdstat $primary2 set] +} diff --git a/tests/support/util.tcl b/tests/support/util.tcl index e4b70ed20..86f2753c2 100644 --- a/tests/support/util.tcl +++ b/tests/support/util.tcl @@ -561,6 +561,12 @@ proc cmdrstat {cmd r} { } } +proc errorrstat {cmd r} { + if {[regexp "\r\nerrorstat_$cmd:(.*?)\r\n" [$r info errorstats] _ value]} { + set _ $value + } +} + proc generate_fuzzy_traffic_on_key {key duration} { # Commands per type, blocking commands removed # TODO: extract these from help.h or elsewhere, and improve to include other types diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 3b8dc16da..4bef921ff 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -18,6 +18,7 @@ set ::all_tests { unit/protocol unit/keyspace unit/scan + unit/info unit/type/string unit/type/incr unit/type/list diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl new file mode 100644 index 000000000..5a44c0647 --- /dev/null +++ b/tests/unit/info.tcl @@ -0,0 +1,153 @@ +proc cmdstat {cmd} { + return [cmdrstat $cmd r] +} + +proc errorstat {cmd} { + return [errorrstat $cmd r] +} + +start_server {tags {"info"}} { + start_server {} { + + test {errorstats: failed call authentication error} { + r config resetstat + assert_match {} [errorstat ERR] + assert_equal [s total_error_replies] 0 + catch {r auth k} e + assert_match {ERR AUTH*} $e + assert_match {*count=1*} [errorstat ERR] + assert_match {*calls=1,*,rejected_calls=0,failed_calls=1} [cmdstat auth] + assert_equal [s total_error_replies] 1 + r config resetstat + assert_match {} [errorstat ERR] + } + + test {errorstats: failed call within MULTI/EXEC} { + r config resetstat + assert_match {} [errorstat ERR] + assert_equal [s total_error_replies] 0 + r multi + r set a b + r auth a + catch {r exec} e + assert_match {ERR AUTH*} $e + assert_match {*count=1*} [errorstat ERR] + assert_match {*calls=1,*,rejected_calls=0,failed_calls=0} [cmdstat set] + assert_match {*calls=1,*,rejected_calls=0,failed_calls=1} [cmdstat auth] + assert_match {*calls=1,*,rejected_calls=0,failed_calls=0} [cmdstat exec] + assert_equal [s total_error_replies] 1 + + # MULTI/EXEC command errors should still be pinpointed to him + catch {r exec} e + assert_match {ERR EXEC without MULTI} $e + assert_match {*calls=2,*,rejected_calls=0,failed_calls=1} [cmdstat exec] + assert_match {*count=2*} [errorstat ERR] + assert_equal [s total_error_replies] 2 + } + + test {errorstats: failed call within LUA} { + r config resetstat + assert_match {} [errorstat ERR] + assert_equal [s total_error_replies] 0 + catch {r eval {redis.pcall('XGROUP', 'CREATECONSUMER', 's1', 'mygroup', 'consumer') return } 0} e + assert_match {*count=1*} [errorstat ERR] + assert_match {*calls=1,*,rejected_calls=0,failed_calls=1} [cmdstat xgroup] + assert_match {*calls=1,*,rejected_calls=0,failed_calls=0} [cmdstat eval] + + # EVAL command errors should still be pinpointed to him + catch {r eval a} e + assert_match {ERR wrong*} $e + assert_match {*calls=1,*,rejected_calls=1,failed_calls=0} [cmdstat eval] + assert_match {*count=2*} [errorstat ERR] + assert_equal [s total_error_replies] 2 + } + + test {errorstats: failed call NOGROUP error} { + r config resetstat + assert_match {} [errorstat NOGROUP] + r del mystream + r XADD mystream * f v + catch {r XGROUP CREATECONSUMER mystream mygroup consumer} e + assert_match {NOGROUP*} $e + assert_match {*count=1*} [errorstat NOGROUP] + assert_match {*calls=1,*,rejected_calls=0,failed_calls=1} [cmdstat xgroup] + r config resetstat + assert_match {} [errorstat NOGROUP] + } + + test {errorstats: rejected call unknown command} { + r config resetstat + assert_equal [s total_error_replies] 0 + assert_match {} [errorstat ERR] + catch {r asdf} e + assert_match {ERR unknown*} $e + assert_match {*count=1*} [errorstat ERR] + assert_equal [s total_error_replies] 1 + r config resetstat + assert_match {} [errorstat ERR] + } + + test {errorstats: rejected call within MULTI/EXEC} { + r config resetstat + assert_equal [s total_error_replies] 0 + assert_match {} [errorstat ERR] + r multi + catch {r set} e + assert_match {ERR wrong number of arguments*} $e + catch {r exec} e + assert_match {EXECABORT*} $e + assert_match {*count=1*} [errorstat ERR] + assert_equal [s total_error_replies] 1 + assert_match {*calls=0,*,rejected_calls=1,failed_calls=0} [cmdstat set] + assert_match {*calls=1,*,rejected_calls=0,failed_calls=0} [cmdstat multi] + assert_match {*calls=1,*,rejected_calls=0,failed_calls=0} [cmdstat exec] + assert_equal [s total_error_replies] 1 + r config resetstat + assert_match {} [errorstat ERR] + } + + test {errorstats: rejected call due to wrong arity} { + r config resetstat + assert_equal [s total_error_replies] 0 + assert_match {} [errorstat ERR] + catch {r set k} e + assert_match {ERR wrong number of arguments*} $e + assert_match {*count=1*} [errorstat ERR] + assert_match {*calls=0,*,rejected_calls=1,failed_calls=0} [cmdstat set] + # ensure that after a rejected command, valid ones are counted properly + r set k1 v1 + r set k2 v2 + assert_match {calls=2,*,rejected_calls=1,failed_calls=0} [cmdstat set] + assert_equal [s total_error_replies] 1 + } + + test {errorstats: rejected call by OOM error} { + r config resetstat + assert_equal [s total_error_replies] 0 + assert_match {} [errorstat OOM] + r config set maxmemory 1 + catch {r set a b} e + assert_match {OOM*} $e + assert_match {*count=1*} [errorstat OOM] + assert_match {*calls=0,*,rejected_calls=1,failed_calls=0} [cmdstat set] + assert_equal [s total_error_replies] 1 + r config resetstat + assert_match {} [errorstat OOM] + } + + test {errorstats: rejected call by authorization error} { + r config resetstat + assert_equal [s total_error_replies] 0 + assert_match {} [errorstat NOPERM] + r ACL SETUSER alice on >p1pp0 ~cached:* +get +info +config + r auth alice p1pp0 + catch {r set a b} e + assert_match {NOPERM*} $e + assert_match {*count=1*} [errorstat NOPERM] + assert_match {*calls=0,*,rejected_calls=1,failed_calls=0} [cmdstat set] + assert_equal [s total_error_replies] 1 + r config resetstat + assert_match {} [errorstat NOPERM] + } + } +}