From 248059e50c59c9c9b896f81bf6f82d0b85e45c53 Mon Sep 17 00:00:00 2001
From: christianEQ <christian@eqalpha.com>
Date: Fri, 18 Jun 2021 19:01:51 +0000
Subject: [PATCH] detect stagnating server load before 100% (diagnostic tool)

Former-commit-id: 534b70643b8f39303331048d3e86475caa08b864
---
 src/keydb-diagnostic-tool.cpp | 57 ++++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 15 deletions(-)
diff --git a/src/keydb-diagnostic-tool.cpp b/src/keydb-diagnostic-tool.cpp
index b5986127a..506cc3ebe 100644
--- a/src/keydb-diagnostic-tool.cpp
+++ b/src/keydb-diagnostic-tool.cpp
@@ -42,6 +42,7 @@
 #include <assert.h>
 #include <math.h>
 #include <pthread.h>
+#include <deque>
 extern "C" {
 #include <sds.h> /* Use hiredis sds. */
 #include "hiredis.h"
@@ -891,23 +892,31 @@ double getServerCpuTime(redisContext *ctx) {
     redisReply *reply = (redisReply*)redisCommand(ctx, "INFO CPU");
     if (reply->type != REDIS_REPLY_STRING) {
         freeReplyObject(reply);
-        printf("Error executing INFO command. Exiting.\r\n");
+        printf("Error executing INFO command. Exiting.\n");
         return -1;
     }
 
     double used_cpu_user, used_cpu_sys;
     if (extractPropertyFromInfo(reply->str, "used_cpu_user", used_cpu_user)) {
-        printf("Error reading user CPU usage from INFO command. Exiting.\r\n");
+        printf("Error reading user CPU usage from INFO command. Exiting.\n");
         return -1;
     }
     if (extractPropertyFromInfo(reply->str, "used_cpu_sys", used_cpu_sys)) {
-        printf("Error reading system CPU usage from INFO command. Exiting.\r\n");
+        printf("Error reading system CPU usage from INFO command. Exiting.\n");
         return -1;
     }
     freeReplyObject(reply);
     return used_cpu_user + used_cpu_sys;
 }
 
+/* Mean of the samples in *q; returns 0 for a null or empty queue instead of 0/0 (NaN). */
+double getMean(std::deque<double> *q) {
+    if (q == nullptr || q->empty()) return 0;
+    double sum = 0;
+    for (double sample : *q) sum += sample;
+    return sum / q->size();
+}
+
 bool isAtFullLoad(double cpuPercent, unsigned int threads) {
     return cpuPercent / threads >= 96;
 }
@@ -954,7 +963,9 @@ int main(int argc, const char **argv) {
     double server_cpu_time, last_server_cpu_time = getServerCpuTime(ctx);
     struct rusage self_ru;
     double self_cpu_time, last_self_cpu_time = getSelfCpuTime(&self_ru);
-    double server_cpu_load, last_server_cpu_load, self_cpu_load, server_cpu_gain, last_server_cpu_gain;
+    double server_cpu_load, last_server_cpu_load = 0, self_cpu_load, server_cpu_gain;
+    std::deque<double> load_gain_history = {};
+    double current_gain_avg, peak_gain_avg = 0;
 
     redisReply *reply = (redisReply*)redisCommand(ctx, "INFO CPU");
     if (reply->type != REDIS_REPLY_STRING) {
@@ -971,19 +982,15 @@ int main(int argc, const char **argv) {
 
     printf("Server has %d threads.\n", server_threads);
 
-
     while (self_threads < config.max_threads) {
-        printf("Creating %d clients for thread %d...\n", config.numclients, self_threads);
         for (int i = 0; i < config.numclients; i++) {
             sprintf(command, "SET %d %s\r\n", self_threads * config.numclients + i, set_value);
             createClient(command, strlen(command), NULL,self_threads);
         }
 
-        printf("Starting thread %d\n", self_threads);
-
         benchmarkThread *t = config.threads[self_threads];
         if (pthread_create(&(t->thread), NULL, execBenchmarkThread, t)){
-            fprintf(stderr, "FATAL: Failed to start thread %d.\n", self_threads);
+            fprintf(stderr, "FATAL: Failed to start thread %d. Exiting.\n", self_threads);
             exit(1);
         }
         self_threads++;
@@ -997,23 +1004,42 @@ int main(int argc, const char **argv) {
         if (server_cpu_time < 0) {
             break;
         }
-        printf("CPU Usage Self: %.1f%%, Server: %.1f%%\r\n", self_cpu_load, server_cpu_load);
+        printf("%d threads, %d total clients. CPU Usage Self: %.1f%% (%.1f%% per thread), Server: %.1f%% (%.1f%% per thread)\r",
+                self_threads,
+                self_threads * config.numclients,
+                self_cpu_load,
+                self_cpu_load / self_threads,
+                server_cpu_load,
+                server_cpu_load / server_threads);
+        fflush(stdout);
         server_cpu_gain = server_cpu_load - last_server_cpu_load;
+        load_gain_history.push_back(server_cpu_gain);
+        if (load_gain_history.size() > 5) {
+            load_gain_history.pop_front();
+        }
+        current_gain_avg = getMean(&load_gain_history);
+        if (current_gain_avg > peak_gain_avg) {
+            peak_gain_avg = current_gain_avg;
+        }
         last_server_cpu_time = server_cpu_time;
         last_self_cpu_time = self_cpu_time;
         last_server_cpu_load = server_cpu_load;
 
-
-
         if (isAtFullLoad(server_cpu_load, server_threads)) {
-            printf("Server is at full CPU load.\n");
+            printf("\nServer is at full CPU load. If higher performance is expected, check server configuration.\n");
             break;
         }
 
-        if (isAtFullLoad(self_cpu_load, self_threads)) {
-            printf("Diagnostic tool is at full CPU load.\n");
+        if (current_gain_avg <= 0.05 * peak_gain_avg) {
+            printf("\nServer CPU load appears to have stagnated with increasing clients.\n"
+                   "Server does not appear to be at full load. Check network for throughput.\n");
             break;
         }
+
+        if (self_threads * config.numclients > 2000) {
+            printf("\nClient limit of 2000 reached. Server is not at full load and appears to be increasing.\n"
+                   "2000 clients should be more than enough to reach a bottleneck. Check all configuration.\n");
+        }
     }
 
     printf("Done.\n");
@@ -1023,3 +1049,4 @@ int main(int argc, const char **argv) {
 
     return 0;
 }
+ 
\ No newline at end of file