From 90b9f08e5d1657e7bfffe43f31f6663bf469ee75 Mon Sep 17 00:00:00 2001
From: filipe oliveira <filipecosta.90@gmail.com>
Date: Thu, 31 Dec 2020 14:53:43 +0000
Subject: [PATCH] Add errorstats info section, Add failed_calls and
 rejected_calls to commandstats (#8217)

This Commit pushes forward the observability on overall error statistics and command statistics within redis-server:

It extends INFO COMMANDSTATS to have
- failed_calls in - so we can keep track of errors that happen from the command itself, broken by command.
- rejected_calls - so we can keep track of errors that were triggered outside the commmand processing per se

Adds a new section to INFO, named ERRORSTATS that enables keeping track of the different errors that
occur within redis ( within processCommand and call ) based on the reply Error Prefix ( The first word
after the "-", up to the first space ).

This commit also fixes RM_ReplyWithError so that it can be correctly identified as an error reply.
---
 src/config.c                    |   1 +
 src/module.c                    |  24 +++--
 src/networking.c                |  18 ++++
 src/server.c                    |  63 ++++++++++++-
 src/server.h                    |  10 ++-
 tests/cluster/tests/18-info.tcl |  36 ++++++++
 tests/support/util.tcl          |   6 ++
 tests/test_helper.tcl           |   1 +
 tests/unit/info.tcl             | 153 ++++++++++++++++++++++++++++++++
 9 files changed, 294 insertions(+), 18 deletions(-)
 create mode 100644 tests/cluster/tests/18-info.tcl
 create mode 100644 tests/unit/info.tcl

diff --git a/src/config.c b/src/config.c
index a92b41624..9d078bfe3 100644
--- a/src/config.c
+++ b/src/config.c
@@ -2560,6 +2560,7 @@ NULL
     } else if (!strcasecmp(c->argv[1]->ptr,"resetstat") && c->argc == 2) {
         resetServerStats();
         resetCommandTableStats();
+        resetErrorTableStats();
         addReply(c,shared.ok);
     } else if (!strcasecmp(c->argv[1]->ptr,"rewrite") && c->argc == 2) {
         if (server.configfile == NULL) {
diff --git a/src/module.c b/src/module.c
index 11f5f4489..913b4de5d 100644
--- a/src/module.c
+++ b/src/module.c
@@ -1368,18 +1368,6 @@ int RM_ReplyWithLongLong(RedisModuleCtx *ctx, long long ll) {
     return REDISMODULE_OK;
 }
 
-/* Reply with an error or simple string (status message). Used to implement
- * ReplyWithSimpleString() and ReplyWithError().
- * The function always returns REDISMODULE_OK. */
-int replyWithStatus(RedisModuleCtx *ctx, const char *msg, char *prefix) {
-    client *c = moduleGetReplyClient(ctx);
-    if (c == NULL) return REDISMODULE_OK;
-    addReplyProto(c,prefix,strlen(prefix));
-    addReplyProto(c,msg,strlen(msg));
-    addReplyProto(c,"\r\n",2);
-    return REDISMODULE_OK;
-}
-
 /* Reply with the error 'err'.
  *
  * Note that 'err' must contain all the error, including
@@ -1395,7 +1383,10 @@ int replyWithStatus(RedisModuleCtx *ctx, const char *msg, char *prefix) {
  * The function always returns REDISMODULE_OK.
  */
 int RM_ReplyWithError(RedisModuleCtx *ctx, const char *err) {
-    return replyWithStatus(ctx,err,"-");
+    client *c = moduleGetReplyClient(ctx);
+    if (c == NULL) return REDISMODULE_OK;
+    addReplyErrorFormat(c,"-%s",err);
+    return REDISMODULE_OK;
 }
 
 /* Reply with a simple string (+... \r\n in RESP protocol). This replies
@@ -1404,7 +1395,12 @@ int RM_ReplyWithError(RedisModuleCtx *ctx, const char *err) {
  *
  * The function always returns REDISMODULE_OK. */
 int RM_ReplyWithSimpleString(RedisModuleCtx *ctx, const char *msg) {
-    return replyWithStatus(ctx,msg,"+");
+    client *c = moduleGetReplyClient(ctx);
+    if (c == NULL) return REDISMODULE_OK;
+    addReplyProto(c,"+",1);
+    addReplyProto(c,msg,strlen(msg));
+    addReplyProto(c,"\r\n",2);
+    return REDISMODULE_OK;
 }
 
 /* Reply with an array type of 'len' elements. However 'len' other calls
diff --git a/src/networking.c b/src/networking.c
index 34e8b8481..7957cc82f 100644
--- a/src/networking.c
+++ b/src/networking.c
@@ -405,6 +405,24 @@ void addReplyErrorLength(client *c, const char *s, size_t len) {
 
 /* Do some actions after an error reply was sent (Log if needed, updates stats, etc.) */
 void afterErrorReply(client *c, const char *s, size_t len) {
+    /* Increment the global error counter */
+    server.stat_total_error_replies++;
+    /* Increment the error stats
+     * If the string already starts with "-..." then the error prefix
+     * is provided by the caller ( we limit the search to 32 chars). Otherwise we use "-ERR". */
+    if (s[0] != '-') {
+        incrementErrorCount("ERR", 3);
+    } else {
+        char *spaceloc = memchr(s, ' ', len < 32 ? len : 32);
+        if (spaceloc) {
+            const size_t errEndPos = (size_t)(spaceloc - s);
+            incrementErrorCount(s+1, errEndPos-1);
+        } else {
+            /* Fallback to ERR if we can't retrieve the error prefix */
+            incrementErrorCount("ERR", 3);
+        }
+    }
+
     /* Sometimes it could be normal that a slave replies to a master with
      * an error and this function gets called. Actually the error will never
      * be sent because addReply*() against master clients has no effect...
diff --git a/src/server.c b/src/server.c
index 257a39f71..dbcaa767d 100644
--- a/src/server.c
+++ b/src/server.c
@@ -2952,6 +2952,7 @@ void resetServerStats(void) {
     atomicSet(server.stat_net_input_bytes, 0);
     atomicSet(server.stat_net_output_bytes, 0);
     server.stat_unexpected_error_replies = 0;
+    server.stat_total_error_replies = 0;
     server.stat_dump_payload_sanitizations = 0;
     server.aof_delayed_fsync = 0;
     server.blocked_last_cron = 0;
@@ -2985,6 +2986,7 @@ void initServer(void) {
     server.in_fork_child = CHILD_TYPE_NONE;
     server.main_thread_id = pthread_self();
     server.current_client = NULL;
+    server.errors = raxNew();
     server.fixed_time_expire = 0;
     server.clients = listCreate();
     server.clients_index = raxNew();
@@ -3291,11 +3293,18 @@ void resetCommandTableStats(void) {
         c = (struct redisCommand *) dictGetVal(de);
         c->microseconds = 0;
         c->calls = 0;
+        c->rejected_calls = 0;
+        c->failed_calls = 0;
     }
     dictReleaseIterator(di);
 
 }
 
+void resetErrorTableStats(void) {
+    raxFree(server.errors);
+    server.errors = raxNew();
+}
+
 /* ========================== Redis OP Array API ============================ */
 
 void redisOpArrayInit(redisOpArray *oa) {
@@ -3490,6 +3499,7 @@ void call(client *c, int flags) {
     ustime_t start, duration;
     int client_old_flags = c->flags;
     struct redisCommand *real_cmd = c->cmd;
+    static long long prev_err_count;
 
     server.fixed_time_expire++;
 
@@ -3510,6 +3520,7 @@ void call(client *c, int flags) {
 
     /* Call the command. */
     dirty = server.dirty;
+    prev_err_count = server.stat_total_error_replies;
     updateCachedTime(0);
     start = server.ustime;
     c->cmd->proc(c);
@@ -3517,6 +3528,14 @@ void call(client *c, int flags) {
     dirty = server.dirty-dirty;
     if (dirty < 0) dirty = 0;
 
+    /* Update failed command calls if required.
+     * We leverage a static variable (prev_err_count) to retain
+     * the counter across nested function calls and avoid logging
+     * the same error twice. */
+    if ((server.stat_total_error_replies - prev_err_count) > 0) {
+        real_cmd->failed_calls++;
+    }
+
     /* After executing command, we will close the client after writing entire
      * reply if it is set 'CLIENT_CLOSE_AFTER_COMMAND' flag. */
     if (c->flags & CLIENT_CLOSE_AFTER_COMMAND) {
@@ -3655,6 +3674,7 @@ void call(client *c, int flags) {
 
     server.fixed_time_expire--;
     server.stat_numcommands++;
+    prev_err_count = server.stat_total_error_replies;
 
     /* Record peak memory after each command and before the eviction that runs
      * before the next command. */
@@ -3670,6 +3690,7 @@ void call(client *c, int flags) {
  * Note: 'reply' is expected to end with \r\n */
 void rejectCommand(client *c, robj *reply) {
     flagTransaction(c);
+    if (c->cmd) c->cmd->rejected_calls++;
     if (c->cmd && c->cmd->proc == execCommand) {
         execCommandAbort(c, reply->ptr);
     } else {
@@ -3679,6 +3700,7 @@ void rejectCommand(client *c, robj *reply) {
 }
 
 void rejectCommandFormat(client *c, const char *fmt, ...) {
+    if (c->cmd) c->cmd->rejected_calls++;
     flagTransaction(c);
     va_list ap;
     va_start(ap,fmt);
@@ -3805,6 +3827,7 @@ int processCommand(client *c) {
                 flagTransaction(c);
             }
             clusterRedirectClient(c,n,hashslot,error_code);
+            c->cmd->rejected_calls++;
             return C_OK;
         }
     }
@@ -3962,9 +3985,22 @@ int processCommand(client *c) {
         if (listLength(server.ready_keys))
             handleClientsBlockedOnKeys();
     }
+
     return C_OK;
 }
 
+/* ====================== Error lookup and execution ===================== */
+
+void incrementErrorCount(const char *fullerr, size_t namelen) {
+    struct redisError *error = raxFind(server.errors,(unsigned char*)fullerr,namelen);
+    if (error == raxNotFound) {
+        error = zmalloc(sizeof(*error));
+        error->count = 0;
+        raxInsert(server.errors,(unsigned char*)fullerr,namelen,error,NULL);
+    }
+    error->count++;
+}
+
 /*================================== Shutdown =============================== */
 
 /* Close listening sockets. Also unlink the unix domain socket if
@@ -4681,6 +4717,7 @@ sds genRedisInfoString(const char *section) {
             "tracking_total_items:%lld\r\n"
             "tracking_total_prefixes:%lld\r\n"
             "unexpected_error_replies:%lld\r\n"
+            "total_error_replies:%lld\r\n"
             "dump_payload_sanitizations:%lld\r\n"
             "total_reads_processed:%lld\r\n"
             "total_writes_processed:%lld\r\n"
@@ -4718,6 +4755,7 @@ sds genRedisInfoString(const char *section) {
             (unsigned long long) trackingGetTotalItems(),
             (unsigned long long) trackingGetTotalPrefixes(),
             server.stat_unexpected_error_replies,
+            server.stat_total_error_replies,
             server.stat_dump_payload_sanitizations,
             stat_total_reads_processed,
             stat_total_writes_processed,
@@ -4908,14 +4946,33 @@ sds genRedisInfoString(const char *section) {
         di = dictGetSafeIterator(server.commands);
         while((de = dictNext(di)) != NULL) {
             c = (struct redisCommand *) dictGetVal(de);
-            if (!c->calls) continue;
+            if (!c->calls && !c->failed_calls && !c->rejected_calls)
+                continue;
             info = sdscatprintf(info,
-                "cmdstat_%s:calls=%lld,usec=%lld,usec_per_call=%.2f\r\n",
+                "cmdstat_%s:calls=%lld,usec=%lld,usec_per_call=%.2f"
+                ",rejected_calls=%lld,failed_calls=%lld\r\n",
                 c->name, c->calls, c->microseconds,
-                (c->calls == 0) ? 0 : ((float)c->microseconds/c->calls));
+                (c->calls == 0) ? 0 : ((float)c->microseconds/c->calls),
+                c->rejected_calls, c->failed_calls);
         }
         dictReleaseIterator(di);
     }
+    /* Error statistics */
+    if (allsections || defsections || !strcasecmp(section,"errorstats")) {
+        if (sections++) info = sdscat(info,"\r\n");
+        info = sdscat(info, "# Errorstats\r\n");
+        raxIterator ri;
+        raxStart(&ri,server.errors);
+        raxSeek(&ri,"^",NULL,0);
+        struct redisError *e;
+        while(raxNext(&ri)) {
+            e = (struct redisError *) ri.data;
+            info = sdscatprintf(info,
+                "errorstat_%.*s:count=%lld\r\n",
+                (int)ri.key_len, ri.key, e->count);
+        }
+        raxStop(&ri);
+    }
 
     /* Cluster */
     if (allsections || defsections || !strcasecmp(section,"cluster")) {
diff --git a/src/server.h b/src/server.h
index 1dc761959..5f6b21ec4 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1122,6 +1122,7 @@ struct redisServer {
     dict *commands;             /* Command table */
     dict *orig_commands;        /* Command table before command renaming. */
     aeEventLoop *el;
+    rax *errors;                /* Errors table */
     redisAtomic unsigned int lruclock; /* Clock for LRU eviction */
     volatile sig_atomic_t shutdown_asap; /* SHUTDOWN needed ASAP */
     int activerehashing;        /* Incremental rehash in serverCron() */
@@ -1231,6 +1232,7 @@ struct redisServer {
     size_t stat_module_cow_bytes;   /* Copy on write bytes during module fork. */
     uint64_t stat_clients_type_memory[CLIENT_TYPE_COUNT];/* Mem usage by type */
     long long stat_unexpected_error_replies; /* Number of unexpected (aof-loading, replica to master, etc.) error replies */
+    long long stat_total_error_replies; /* Total number of issued error replies ( command + rejected errors ) */
     long long stat_dump_payload_sanitizations; /* Number deep dump payloads integrity validations. */
     long long stat_io_reads_processed; /* Number of read events processed by IO / Main threads */
     long long stat_io_writes_processed; /* Number of write events processed by IO / Main threads */
@@ -1579,7 +1581,7 @@ struct redisCommand {
     int firstkey; /* The first argument that's a key (0 = no keys) */
     int lastkey;  /* The last argument that's a key */
     int keystep;  /* The step between first and last key */
-    long long microseconds, calls;
+    long long microseconds, calls, rejected_calls, failed_calls;
     int id;     /* Command ID. This is a progressive ID starting from 0 that
                    is assigned at runtime, and is used in order to check
                    ACLs. A connection is able to execute a given command if
@@ -1587,6 +1589,10 @@ struct redisCommand {
                    bit set in the bitmap of allowed commands. */
 };
 
+struct redisError {
+    long long count;
+};
+
 struct redisFunctionSym {
     char *name;
     unsigned long pointer;
@@ -2132,7 +2138,9 @@ void updateDictResizePolicy(void);
 int htNeedsResize(dict *dict);
 void populateCommandTable(void);
 void resetCommandTableStats(void);
+void resetErrorTableStats(void);
 void adjustOpenFilesLimit(void);
+void incrementErrorCount(const char *fullerr, size_t namelen);
 void closeListeningSockets(int unlink_unix_socket);
 void updateCachedTime(int update_daylight_info);
 void resetServerStats(void);
diff --git a/tests/cluster/tests/18-info.tcl b/tests/cluster/tests/18-info.tcl
new file mode 100644
index 000000000..83dbf833f
--- /dev/null
+++ b/tests/cluster/tests/18-info.tcl
@@ -0,0 +1,36 @@
+# Check cluster info stats
+
+source "../tests/includes/init-tests.tcl"
+
+test "Create a primary with a replica" {
+    create_cluster 2 0
+}
+
+test "Cluster should start ok" {
+    assert_cluster_state ok
+}
+
+set primary1 [Rn 0]
+set primary2 [Rn 1]
+
+proc cmdstat {instace cmd} {
+    return [cmdrstat $cmd $instace]
+}
+
+proc errorstat {instace cmd} {
+    return [errorrstat $cmd $instace]
+}
+
+test "errorstats: rejected call due to MOVED Redirection" {
+    $primary1 config resetstat
+    $primary2 config resetstat
+    assert_match {} [errorstat $primary1 MOVED]
+    assert_match {} [errorstat $primary2 MOVED]
+    # we know that the primary 2 will have a MOVED reply
+    catch {$primary1 set key{0x0000} b} replyP1
+    catch {$primary2 set key{0x0000} b} replyP2
+    assert_match {OK} $replyP1
+    assert_match {} [errorstat $primary1 MOVED]
+    assert_match {*count=1*} [errorstat $primary2 MOVED]
+    assert_match {*calls=0,*,rejected_calls=1,failed_calls=0} [cmdstat $primary2 set]
+}
diff --git a/tests/support/util.tcl b/tests/support/util.tcl
index e4b70ed20..86f2753c2 100644
--- a/tests/support/util.tcl
+++ b/tests/support/util.tcl
@@ -561,6 +561,12 @@ proc cmdrstat {cmd r} {
     }
 }
 
+proc errorrstat {cmd r} {
+    if {[regexp "\r\nerrorstat_$cmd:(.*?)\r\n" [$r info errorstats] _ value]} {
+        set _ $value
+    }
+}
+
 proc generate_fuzzy_traffic_on_key {key duration} {
     # Commands per type, blocking commands removed
     # TODO: extract these from help.h or elsewhere, and improve to include other types
diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl
index 3b8dc16da..4bef921ff 100644
--- a/tests/test_helper.tcl
+++ b/tests/test_helper.tcl
@@ -18,6 +18,7 @@ set ::all_tests {
     unit/protocol
     unit/keyspace
     unit/scan
+    unit/info
     unit/type/string
     unit/type/incr
     unit/type/list
diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl
new file mode 100644
index 000000000..5a44c0647
--- /dev/null
+++ b/tests/unit/info.tcl
@@ -0,0 +1,153 @@
+proc cmdstat {cmd} {
+    return [cmdrstat $cmd r]
+}
+
+proc errorstat {cmd} {
+    return [errorrstat $cmd r]
+}
+
+start_server {tags {"info"}} {
+    start_server {} {
+
+        test {errorstats: failed call authentication error} {
+            r config resetstat
+            assert_match {} [errorstat ERR]
+            assert_equal [s total_error_replies] 0
+            catch {r auth k} e
+            assert_match {ERR AUTH*} $e
+            assert_match {*count=1*} [errorstat ERR]
+            assert_match {*calls=1,*,rejected_calls=0,failed_calls=1} [cmdstat auth]
+            assert_equal [s total_error_replies] 1
+            r config resetstat
+            assert_match {} [errorstat ERR]
+        }
+
+        test {errorstats: failed call within MULTI/EXEC} {
+            r config resetstat
+            assert_match {} [errorstat ERR]
+            assert_equal [s total_error_replies] 0
+            r multi
+            r set a b
+            r auth a
+            catch {r exec} e
+            assert_match {ERR AUTH*} $e
+            assert_match {*count=1*} [errorstat ERR]
+            assert_match {*calls=1,*,rejected_calls=0,failed_calls=0} [cmdstat set]
+            assert_match {*calls=1,*,rejected_calls=0,failed_calls=1} [cmdstat auth]
+            assert_match {*calls=1,*,rejected_calls=0,failed_calls=0} [cmdstat exec]
+            assert_equal [s total_error_replies] 1
+
+            # MULTI/EXEC command errors should still be pinpointed to him
+            catch {r exec} e
+            assert_match {ERR EXEC without MULTI} $e
+            assert_match {*calls=2,*,rejected_calls=0,failed_calls=1} [cmdstat exec]
+            assert_match {*count=2*} [errorstat ERR]
+            assert_equal [s total_error_replies] 2
+        }
+
+        test {errorstats: failed call within LUA} {
+            r config resetstat
+            assert_match {} [errorstat ERR]
+            assert_equal [s total_error_replies] 0
+            catch {r eval {redis.pcall('XGROUP', 'CREATECONSUMER', 's1', 'mygroup', 'consumer') return } 0} e
+            assert_match {*count=1*} [errorstat ERR]
+            assert_match {*calls=1,*,rejected_calls=0,failed_calls=1} [cmdstat xgroup]
+            assert_match {*calls=1,*,rejected_calls=0,failed_calls=0} [cmdstat eval]
+
+            # EVAL command errors should still be pinpointed to him
+            catch {r eval a} e
+            assert_match {ERR wrong*} $e
+            assert_match {*calls=1,*,rejected_calls=1,failed_calls=0} [cmdstat eval]
+            assert_match {*count=2*} [errorstat ERR]
+            assert_equal [s total_error_replies] 2
+        }
+
+        test {errorstats: failed call NOGROUP error} {
+            r config resetstat
+            assert_match {} [errorstat NOGROUP]
+            r del mystream
+            r XADD mystream * f v
+            catch {r XGROUP CREATECONSUMER mystream mygroup consumer} e
+            assert_match {NOGROUP*} $e
+            assert_match {*count=1*} [errorstat NOGROUP]
+            assert_match {*calls=1,*,rejected_calls=0,failed_calls=1} [cmdstat xgroup]
+            r config resetstat
+            assert_match {} [errorstat NOGROUP]
+        }
+
+        test {errorstats: rejected call unknown command} {
+            r config resetstat
+            assert_equal [s total_error_replies] 0
+            assert_match {} [errorstat ERR]
+            catch {r asdf} e
+            assert_match {ERR unknown*} $e
+            assert_match {*count=1*} [errorstat ERR]
+            assert_equal [s total_error_replies] 1
+            r config resetstat
+            assert_match {} [errorstat ERR]
+        }
+
+        test {errorstats: rejected call within MULTI/EXEC} {
+            r config resetstat
+            assert_equal [s total_error_replies] 0
+            assert_match {} [errorstat ERR]
+            r multi
+            catch {r set} e
+            assert_match {ERR wrong number of arguments*} $e
+            catch {r exec} e
+            assert_match {EXECABORT*} $e
+            assert_match {*count=1*} [errorstat ERR]
+            assert_equal [s total_error_replies] 1
+            assert_match {*calls=0,*,rejected_calls=1,failed_calls=0} [cmdstat set]
+            assert_match {*calls=1,*,rejected_calls=0,failed_calls=0} [cmdstat multi]
+            assert_match {*calls=1,*,rejected_calls=0,failed_calls=0} [cmdstat exec]
+            assert_equal [s total_error_replies] 1
+            r config resetstat
+            assert_match {} [errorstat ERR]
+        }
+
+        test {errorstats: rejected call due to wrong arity} {
+            r config resetstat
+            assert_equal [s total_error_replies] 0
+            assert_match {} [errorstat ERR]
+            catch {r set k} e
+            assert_match {ERR wrong number of arguments*} $e
+            assert_match {*count=1*} [errorstat ERR]
+            assert_match {*calls=0,*,rejected_calls=1,failed_calls=0} [cmdstat set]
+            # ensure that after a rejected command, valid ones are counted properly
+            r set k1 v1
+            r set k2 v2
+            assert_match {calls=2,*,rejected_calls=1,failed_calls=0} [cmdstat set]
+            assert_equal [s total_error_replies] 1
+        }
+
+        test {errorstats: rejected call by OOM error} {
+            r config resetstat
+            assert_equal [s total_error_replies] 0
+            assert_match {} [errorstat OOM]
+            r config set maxmemory 1
+            catch {r set a b} e
+            assert_match {OOM*} $e
+            assert_match {*count=1*} [errorstat OOM]
+            assert_match {*calls=0,*,rejected_calls=1,failed_calls=0} [cmdstat set]
+            assert_equal [s total_error_replies] 1
+            r config resetstat
+            assert_match {} [errorstat OOM]
+        }
+
+        test {errorstats: rejected call by authorization error} {
+            r config resetstat
+            assert_equal [s total_error_replies] 0
+            assert_match {} [errorstat NOPERM]
+            r ACL SETUSER alice on >p1pp0 ~cached:* +get +info +config
+            r auth alice p1pp0
+            catch {r set a b} e
+            assert_match {NOPERM*} $e
+            assert_match {*count=1*} [errorstat NOPERM]
+            assert_match {*calls=0,*,rejected_calls=1,failed_calls=0} [cmdstat set]
+            assert_equal [s total_error_replies] 1
+            r config resetstat
+            assert_match {} [errorstat NOPERM]
+        }
+    }
+}