Port load shedding and availability-zone (#202)

2023-06-27 15:37:28 -04:00 · 2023-06-27 15:37:28 -04:00 · f53e0337ef
commit f53e0337ef
parent dcd623b079
4 changed files with 59 additions and 2 deletions
--- a/keydb.conf
+++ b/keydb.conf
@ -2088,3 +2088,16 @@ active-client-balancing yes
 # disk space or any other I/O error KeyDB will instead use memory.
 #
 # blob-support false
+
+# Begin load shedding (closing client connections) when CPU usage exceeds X% of the
+# server threads' capacity. E.g. if overload-protect-percent is 80 and there are 8
+# server-threads, shedding begins above 8 * 100 * 0.80 = 640% CPU usage.
+#
+# Set to 0 to disable
+# overload-protect-percent 0
+
+# Inform KeyDB of the availability zone if running in a cloud environment.  Currently
+# this is only exposed via the info command for clients to use, but in the future we
+# may also use this when making decisions for replication.
+#
+# availability-zone "us-east-1a"
--- a/src/config.cpp
+++ b/src/config.cpp
@ -2953,6 +2953,8 @@ standardConfig configs[] = {
    createBoolConfig("soft-shutdown", NULL, MODIFIABLE_CONFIG, g_pserver->config_soft_shutdown, 0, NULL, NULL),
    createBoolConfig("flash-disable-key-cache", NULL, MODIFIABLE_CONFIG, g_pserver->flash_disable_key_cache, 0, NULL, NULL),
    createSizeTConfig("semi-ordered-set-bucket-size", NULL, MODIFIABLE_CONFIG, 0, 1024, g_semiOrderedSetTargetBucketSize, 0, INTEGER_CONFIG, NULL, NULL),
+    createSDSConfig("availability-zone", NULL, MODIFIABLE_CONFIG, 0, g_pserver->sdsAvailabilityZone, "", NULL, NULL),
+    createIntConfig("overload-protect-percent", NULL, MODIFIABLE_CONFIG, 0, 200, g_pserver->overload_protect_threshold, 0, INTEGER_CONFIG, NULL, NULL),

 #ifdef USE_OPENSSL
    createIntConfig("tls-port", NULL, MODIFIABLE_CONFIG, 0, 65535, g_pserver->tls_port, 0, INTEGER_CONFIG, NULL, updateTLSPort), /* TCP port. */
--- a/src/server.cpp
+++ b/src/server.cpp
@ -1958,6 +1958,16 @@ void getExpansiveClientsInfo(size_t *in_usage, size_t *out_usage) {
    *out_usage = o;
 }

+/* Load shedding: calls freeClient(c) and returns true if the server is flagged overloaded and fewer than MAX_CLIENTS_SHED_PER_PERIOD clients were shed this period. */
+int closeClientOnOverload(client *c) {
+    if (!g_pserver->is_overloaded || g_pserver->overload_closed_clients >= MAX_CLIENTS_SHED_PER_PERIOD) return false;
+    // Don't close masters, replicas, pub/sub or blocked clients, or clients with pending writes
+    if (c->flags & (CLIENT_MASTER | CLIENT_SLAVE | CLIENT_PENDING_WRITE | CLIENT_PUBSUB | CLIENT_BLOCKED)) return false;
+    freeClient(c);
+    ++g_pserver->overload_closed_clients;
+    return true;
+}
+
 /* This function is called by serverCron() and is used in order to perform
 * operations on clients that are important to perform constantly. For instance
 * we use this function in order to disconnect clients after a timeout, including
@ -2028,6 +2038,7 @@ void clientsCron(int iel) {
            if (clientsCronTrackExpansiveClients(c, curr_peak_mem_usage_slot)) goto LContinue;
            if (clientsCronTrackClientsMemUsage(c)) goto LContinue;
            if (closeClientOnOutputBufferLimitReached(c, 0)) continue; // Client also free'd
+            if (closeClientOnOverload(c)) continue;
        LContinue:
            fastlock_unlock(&c->lock);
        }        
@ -2581,6 +2592,26 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
        migrateCloseTimedoutSockets();
    }

+    /* Check for CPU Overload */
+    run_with_period(10'000) {
+        g_pserver->is_overloaded = false;
+        g_pserver->overload_closed_clients = 0;
+        static clock_t last = 0;
+        if (g_pserver->overload_protect_threshold > 0) {
+            clock_t cur = clock();
+            double perc = static_cast<double>(cur - last) / (CLOCKS_PER_SEC*10);
+            perc /= cserver.cthreads;
+            perc *= 100.0;
+            serverLog(LL_WARNING, "CPU Used: %.2f", perc);
+            if (perc > g_pserver->overload_protect_threshold) {
+                serverLog(LL_WARNING, "\tWARNING: CPU overload detected.");
+                g_pserver->is_overloaded = true;
+            }
+            last = cur;
+        }
+    }
+
+    /* Tune the fastlock to CPU load */
    run_with_period(30000) {
        /* Tune the fastlock to CPU load */
        fastlock_auto_adjust_waits();
@ -5602,7 +5633,8 @@ sds genRedisInfoString(const char *section) {
            "configured_hz:%i\r\n"
            "lru_clock:%u\r\n"
            "executable:%s\r\n"
-            "config_file:%s\r\n",
+            "config_file:%s\r\n"
+            "availability_zone:%s\r\n",
            KEYDB_SET_VERSION,
            redisGitSHA1(),
            strtol(redisGitDirty(),NULL,10) > 0,
@ -5628,7 +5660,8 @@ sds genRedisInfoString(const char *section) {
            g_pserver->config_hz,
            lruclock,
            cserver.executable ? cserver.executable : "",
-            cserver.configfile ? cserver.configfile : "");
+            cserver.configfile ? cserver.configfile : "",
+            g_pserver->sdsAvailabilityZone);
    }

    /* Clients */
--- a/src/server.h
+++ b/src/server.h
@ -122,6 +122,9 @@ typedef long long ustime_t; /* microsecond time type. */
 #define LOADING_BOOT 1
 #define LOADING_REPLICATION 2

+#define OVERLOAD_PROTECT_PERIOD_MS 10'000 // 10 seconds. NOTE(review): serverCron's overload check hard-codes run_with_period(10'000) instead of using this
+#define MAX_CLIENTS_SHED_PER_PERIOD (OVERLOAD_PROTECT_PERIOD_MS / 10)  // Quota checked by closeClientOnOverload: averages one client per 10ms over a period
+
 extern int g_fTestMode;
 extern struct redisServer *g_pserver;

@ -2744,6 +2747,12 @@ struct redisServer {
    uint16_t rglockSamples[s_lockContentionSamples];
    unsigned ilockRingHead = 0;

+
+    sds sdsAvailabilityZone;
+    int overload_protect_threshold = 0;
+    int is_overloaded = 0;
+    int overload_closed_clients = 0;
+
        int module_blocked_pipe[2]; /* Pipe used to awake the event loop if a
                            client blocked on a module command needs
                            to be processed. */