From f53e0337ef42d5ab6e02e12a4b31f097b427e02c Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 27 Jun 2023 15:37:28 -0400 Subject: [PATCH] Port load shedding and availability-zone (#202) --- keydb.conf | 13 +++++++++++++ src/config.cpp | 2 ++ src/server.cpp | 37 +++++++++++++++++++++++++++++++++++-- src/server.h | 9 +++++++++ 4 files changed, 59 insertions(+), 2 deletions(-) diff --git a/keydb.conf b/keydb.conf index 800657138..59ddf5abb 100644 --- a/keydb.conf +++ b/keydb.conf @@ -2088,3 +2088,16 @@ active-client-balancing yes # disk space or any other I/O error KeyDB will instead use memory. # # blob-support false + +# Begin load shedding if we use more than X% CPU relative to the number of server threads +# E.g. if overload-protect-percent is set to 80 and there are 8 server-threads, then the +# actual CPU protection threshold will be 8 * 100 * 0.80 = 640% CPU usage. +# +# Set to 0 to disable +# overload-protect-percent 0 + +# Inform KeyDB of the availability zone if running in a cloud environment. Currently +# this is only exposed via the info command for clients to use, but in the future +# we may also use this when making decisions for replication. 
+# +# availability-zone "us-east-1a" \ No newline at end of file diff --git a/src/config.cpp b/src/config.cpp index 78cdb3877..ef901a5f8 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -2953,6 +2953,8 @@ standardConfig configs[] = { createBoolConfig("soft-shutdown", NULL, MODIFIABLE_CONFIG, g_pserver->config_soft_shutdown, 0, NULL, NULL), createBoolConfig("flash-disable-key-cache", NULL, MODIFIABLE_CONFIG, g_pserver->flash_disable_key_cache, 0, NULL, NULL), createSizeTConfig("semi-ordered-set-bucket-size", NULL, MODIFIABLE_CONFIG, 0, 1024, g_semiOrderedSetTargetBucketSize, 0, INTEGER_CONFIG, NULL, NULL), + createSDSConfig("availability-zone", NULL, MODIFIABLE_CONFIG, 0, g_pserver->sdsAvailabilityZone, "", NULL, NULL), + createIntConfig("overload-protect-percent", NULL, MODIFIABLE_CONFIG, 0, 200, g_pserver->overload_protect_threshold, 0, INTEGER_CONFIG, NULL, NULL), #ifdef USE_OPENSSL createIntConfig("tls-port", NULL, MODIFIABLE_CONFIG, 0, 65535, g_pserver->tls_port, 0, INTEGER_CONFIG, NULL, updateTLSPort), /* TCP port. */ diff --git a/src/server.cpp b/src/server.cpp index 95d19459b..fe1259aa2 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1958,6 +1958,16 @@ void getExpansiveClientsInfo(size_t *in_usage, size_t *out_usage) { *out_usage = o; } +int closeClientOnOverload(client *c) { /* Load shedding: returns true after calling freeClient(c), false when c is left alone */ + if (g_pserver->overload_closed_clients >= MAX_CLIENTS_SHED_PER_PERIOD) return false; /* this period's shed budget is used up */ + if (!g_pserver->is_overloaded) return false; + // Don't close masters, replicas, pub/sub clients, blocked clients, or clients with a pending write + if (c->flags & (CLIENT_MASTER | CLIENT_SLAVE | CLIENT_PENDING_WRITE | CLIENT_PUBSUB | CLIENT_BLOCKED)) return false; + freeClient(c); + ++g_pserver->overload_closed_clients; + return true; +} + /* This function is called by serverCron() and is used in order to perform * operations on clients that are important to perform constantly. 
For instance * we use this function in order to disconnect clients after a timeout, including @@ -2028,6 +2038,7 @@ void clientsCron(int iel) { if (clientsCronTrackExpansiveClients(c, curr_peak_mem_usage_slot)) goto LContinue; if (clientsCronTrackClientsMemUsage(c)) goto LContinue; if (closeClientOnOutputBufferLimitReached(c, 0)) continue; // Client also free'd + if (closeClientOnOverload(c)) continue; LContinue: fastlock_unlock(&c->lock); } @@ -2581,6 +2592,26 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { migrateCloseTimedoutSockets(); } + /* Check for CPU overload: every OVERLOAD_PROTECT_PERIOD_MS compare the CPU time used since the previous sample, as a percentage of one core per server thread, against overload-protect-percent */ + run_with_period(OVERLOAD_PROTECT_PERIOD_MS) { + g_pserver->is_overloaded = false; + g_pserver->overload_closed_clients = 0; + static clock_t last = 0; + clock_t cur = clock(); + if (g_pserver->overload_protect_threshold > 0) { + double perc = static_cast<double>(cur - last) / (CLOCKS_PER_SEC * (OVERLOAD_PROTECT_PERIOD_MS / 1000)); + perc /= cserver.cthreads; + perc *= 100.0; + serverLog(LL_WARNING, "CPU Used: %.2f", perc); + if (perc > g_pserver->overload_protect_threshold) { + serverLog(LL_WARNING, "\tWARNING: CPU overload detected."); + g_pserver->is_overloaded = true; + } + } + last = cur; /* advance the baseline even while the check is disabled, so enabling it at runtime never counts CPU time from earlier periods */ + } + + + /* Tune the fastlock to CPU load */ run_with_period(30000) { /* Tune the fastlock to CPU load */ fastlock_auto_adjust_waits(); @@ -5602,7 +5633,8 @@ sds genRedisInfoString(const char *section) { "configured_hz:%i\r\n" "lru_clock:%u\r\n" "executable:%s\r\n" - "config_file:%s\r\n", + "config_file:%s\r\n" + "availability_zone:%s\r\n", KEYDB_SET_VERSION, redisGitSHA1(), strtol(redisGitDirty(),NULL,10) > 0, @@ -5628,7 +5660,8 @@ sds genRedisInfoString(const char *section) { g_pserver->config_hz, lruclock, cserver.executable ? cserver.executable : "", - cserver.configfile ? cserver.configfile : ""); + cserver.configfile ? 
cserver.configfile : "", + g_pserver->sdsAvailabilityZone); } /* Clients */ diff --git a/src/server.h b/src/server.h index 022cca2dd..a99529244 100644 --- a/src/server.h +++ b/src/server.h @@ -122,6 +122,9 @@ typedef long long ustime_t; /* microsecond time type. */ #define LOADING_BOOT 1 #define LOADING_REPLICATION 2 +#define OVERLOAD_PROTECT_PERIOD_MS 10'000 // 10 seconds +#define MAX_CLIENTS_SHED_PER_PERIOD (OVERLOAD_PROTECT_PERIOD_MS / 10) // Restrict to one client per 10ms + extern int g_fTestMode; extern struct redisServer *g_pserver; @@ -2744,6 +2747,12 @@ struct redisServer { uint16_t rglockSamples[s_lockContentionSamples]; unsigned ilockRingHead = 0; + + sds sdsAvailabilityZone; /* "availability-zone" config; reported as availability_zone in INFO */ + int overload_protect_threshold = 0; /* "overload-protect-percent" config; 0 disables load shedding */ + int is_overloaded = 0; /* set by serverCron when CPU use exceeded the threshold in the last period */ + int overload_closed_clients = 0; /* clients shed in the current period; reset by serverCron */ + int module_blocked_pipe[2]; /* Pipe used to awake the event loop if a client blocked on a module command needs to be processed. */