From 2530dc0ebd8be8d792f4673073401377cd5bdc42 Mon Sep 17 00:00:00 2001 From: Yossi Gottlieb Date: Wed, 12 Aug 2020 17:58:56 +0300 Subject: [PATCH] Add oom-score-adj configuration option to control Linux OOM killer. (#1690) Add Linux kernel OOM killer control option. This adds the ability to control the Linux OOM killer oom_score_adj parameter for all Redis processes, depending on the process role (i.e. master, replica, background child). An oom-score-adj global boolean flag controls this feature. In addition, specific values can be configured using oom-score-adj-values if additional tuning is required. --- redis.conf | 26 ++++++++ src/config.c | 119 +++++++++++++++++++++++++++++++++++ src/config.h | 1 + src/replication.c | 6 ++ src/server.c | 60 ++++++++++++++++++ src/server.h | 12 ++++ tests/test_helper.tcl | 1 + tests/unit/oom-score-adj.tcl | 81 ++++++++++++++++++++++++ 8 files changed, 306 insertions(+) create mode 100644 tests/unit/oom-score-adj.tcl diff --git a/redis.conf b/redis.conf index b8134e8c4..35ee752e1 100644 --- a/redis.conf +++ b/redis.conf @@ -1049,6 +1049,32 @@ lazyfree-lazy-user-del no # --threads option to match the number of Redis theads, otherwise you'll not # be able to notice the improvements. +############################ KERNEL OOM CONTROL ############################## + +# On Linux, it is possible to hint the kernel OOM killer on what processes +# should be killed first when out of memory. +# +# Enabling this feature makes Redis actively control the oom_score_adj value +# for all its processes, depending on their role. The default scores will +# attempt to have background child processes killed before all others, and +# replicas killed before masters. + +oom-score-adj no + +# When oom-score-adj is used, this directive controls the specific values used +# for master, replica and background child processes. Values range -1000 to +# 1000 (higher means more likely to be killed). 
+#
+# Unprivileged processes (not root, and without CAP_SYS_RESOURCE capabilities)
+# can freely increase their value, but not decrease it below its initial
+# settings.
+#
+# Values are used relative to the initial value of oom_score_adj when the server
+# starts. Because typically the initial value is 0, they will often match the
+# absolute values.
+
+oom-score-adj-values 0 200 800
+
 ############################## APPEND ONLY MODE ###############################
 
 # By default Redis asynchronously dumps the dataset on disk. This mode is
diff --git a/src/config.c b/src/config.c
index 8a1b6d934..866665a68 100644
--- a/src/config.c
+++ b/src/config.c
@@ -111,6 +111,9 @@ clientBufferLimitsConfig clientBufferLimitsDefaults[CLIENT_TYPE_OBUF_COUNT] = {
     {1024*1024*32, 1024*1024*8, 60} /* pubsub */
 };
 
+/* OOM Score defaults */
+int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT] = { 0, 200, 800 };
+
 /* Generic config infrastructure function pointers
  * int is_valid_fn(val, err)
  *     Return 1 when val is valid, and 0 when invalid.
@@ -286,6 +289,59 @@ void queueLoadModule(sds path, sds *argv, int argc) {
     listAddNodeTail(server.loadmodule_queue,loadmod);
 }
 
+/* Parse an array of CONFIG_OOM_COUNT sds strings, validate them and store
+ * them in server.oom_score_adj_values. If 'apply' is non-zero the new score is
+ * written right away and the old values are restored on failure; the config
+ * file loader passes 0 and leaves it to main(), which reads the base first. */
+static int updateOOMScoreAdjValues(sds *args, char **err, int apply) {
+    int i;
+    int values[CONFIG_OOM_COUNT];
+
+    for (i = 0; i < CONFIG_OOM_COUNT; i++) {
+        char *eptr;
+        long long val = strtoll(args[i], &eptr, 10);
+
+        if (eptr == args[i] || *eptr != '\0' || val < -1000 || val > 1000) {
+            if (err) *err = "Invalid oom-score-adj-values, elements must be between -1000 and 1000.";
+            return C_ERR;
+        }
+
+        values[i] = val;
+    }
+
+    /* Verify that the values make sense. If they don't, emit a warning but
+     * keep the configuration, which may still be valid for privileged processes.
+     */
+
+    if (values[CONFIG_OOM_REPLICA] < values[CONFIG_OOM_MASTER] ||
+        values[CONFIG_OOM_BGCHILD] < values[CONFIG_OOM_REPLICA]) {
+        serverLog(LL_WARNING,
+                "The oom-score-adj-values configuration may not work for non-privileged processes! "
+                "Please consult the documentation.");
+    }
+
+    /* Store values, retain previous config for rollback in case we fail. */
+    int old_values[CONFIG_OOM_COUNT];
+    for (i = 0; i < CONFIG_OOM_COUNT; i++) {
+        old_values[i] = server.oom_score_adj_values[i];
+        server.oom_score_adj_values[i] = values[i];
+    }
+
+    /* Apply now only if asked to; otherwise the values are just stored. */
+    if (apply && setOOMScoreAdj(-1) == C_ERR) {
+        /* Roll back */
+        for (i = 0; i < CONFIG_OOM_COUNT; i++)
+            server.oom_score_adj_values[i] = old_values[i];
+
+        if (err)
+            *err = "Failed to apply oom-score-adj-values configuration, check server logs.";
+
+        return C_ERR;
+    }
+
+    return C_OK;
+}
+
 void initConfigValues() {
     for (standardConfig *config = configs; config->name != NULL; config++) {
         config->interface.init(config->data);
@@ -479,6 +535,8 @@ void loadServerConfigFromString(char *config) {
             server.client_obuf_limits[class].hard_limit_bytes = hard;
             server.client_obuf_limits[class].soft_limit_bytes = soft;
             server.client_obuf_limits[class].soft_limit_seconds = soft_seconds;
+        } else if (!strcasecmp(argv[0],"oom-score-adj-values") && argc == 1 + CONFIG_OOM_COUNT) {
+            if (updateOOMScoreAdjValues(&argv[1], &err, 0) == C_ERR) goto loaderr;
         } else if (!strcasecmp(argv[0],"notify-keyspace-events") && argc == 2) {
             int flags = keyspaceEventsStringToFlags(argv[1]);
 
@@ -728,6 +786,17 @@ void configSetCommand(client *c) {
             server.client_obuf_limits[class].soft_limit_seconds = soft_seconds;
         }
         sdsfreesplitres(v,vlen);
+    } config_set_special_field("oom-score-adj-values") {
+        int vlen;
+        int success = 1;
+
+        sds *v = sdssplitlen(o->ptr, sdslen(o->ptr), " ", 1, &vlen);
+        if (vlen != CONFIG_OOM_COUNT || updateOOMScoreAdjValues(v, &errstr, 1) == C_ERR)
+            success = 0;
+
+        sdsfreesplitres(v, vlen);
+        if (!success)
+            goto badfmt;
     }
config_set_special_field("notify-keyspace-events") {
         int flags = keyspaceEventsStringToFlags(o->ptr);
 
@@ -923,6 +992,22 @@ void configGetCommand(client *c) {
         matches++;
     }
 
+    if (stringmatch(pattern,"oom-score-adj-values",0)) {
+        sds buf = sdsempty();
+        int j;
+
+        for (j = 0; j < CONFIG_OOM_COUNT; j++) {
+            buf = sdscatprintf(buf,"%d", server.oom_score_adj_values[j]);
+            if (j != CONFIG_OOM_COUNT-1)
+                buf = sdscatlen(buf," ",1);
+        }
+
+        addReplyBulkCString(c,"oom-score-adj-values");
+        addReplyBulkCString(c,buf);
+        sdsfree(buf);
+        matches++;
+    }
+
     setDeferredMapLen(c,replylen,matches);
 }
 
@@ -1330,6 +1415,25 @@ void rewriteConfigClientoutputbufferlimitOption(struct rewriteConfigState *state
     }
 }
 
+/* Rewrite oom-score-adj-values as "<option> <master> <replica> <bgchild>". */
+void rewriteConfigOOMScoreAdjValuesOption(struct rewriteConfigState *state) {
+    int force = 0;
+    int j;
+    char *option = "oom-score-adj-values";
+    sds line;
+    /* The rewritten line must carry the option name, not just the values. */
+    line = sdscatprintf(sdsempty(), "%s ", option);
+    for (j = 0; j < CONFIG_OOM_COUNT; j++) {
+        if (server.oom_score_adj_values[j] != configOOMScoreAdjValuesDefaults[j])
+            force = 1;
+
+        line = sdscatprintf(line, "%d", server.oom_score_adj_values[j]);
+        if (j+1 != CONFIG_OOM_COUNT)
+            line = sdscatlen(line, " ", 1);
+    }
+    rewriteConfigRewriteLine(state,option,line,force);
+}
+
 /* Rewrite the bind option. */
 void rewriteConfigBindOption(struct rewriteConfigState *state) {
     int force = 1;
@@ -1528,6 +1632,7 @@ int rewriteConfig(char *path) {
     rewriteConfigStringOption(state,"cluster-config-file",server.cluster_configfile,CONFIG_DEFAULT_CLUSTER_CONFIG_FILE);
     rewriteConfigNotifykeyspaceeventsOption(state);
     rewriteConfigClientoutputbufferlimitOption(state);
+    rewriteConfigOOMScoreAdjValuesOption(state);
 
     /* Rewrite Sentinel config if in Sentinel mode.
*/
     if (server.sentinel_mode) rewriteConfigSentinelOption(state);
@@ -2082,6 +2187,19 @@ static int updateMaxclients(long long val, long long prev, char **err) {
     return 1;
 }
 
+/* Update callback for the oom-score-adj boolean: when the new value is true,
+ * apply the score for the current role right away. Returns 1 on success and
+ * 0, with *err set, if the score cannot be applied. */
+static int updateOOMScoreAdj(int val, int prev, char **err) {
+    UNUSED(prev);
+    if (val && setOOMScoreAdj(-1) == C_ERR) {
+        *err = "Failed to set current oom_score_adj. Check server logs.";
+        return 0;
+    }
+
+    return 1;
+}
+
 #ifdef USE_OPENSSL
 static int updateTlsCfg(char *val, char *prev, char **err) {
     UNUSED(val);
@@ -2146,6 +2264,7 @@ standardConfig configs[] = {
     createBoolConfig("crash-log-enabled", NULL, MODIFIABLE_CONFIG, server.crashlog_enabled, 1, NULL, updateSighandlerEnabled),
     createBoolConfig("crash-memcheck-enabled", NULL, MODIFIABLE_CONFIG, server.memcheck_enabled, 1, NULL, NULL),
     createBoolConfig("use-exit-on-panic", NULL, MODIFIABLE_CONFIG, server.use_exit_on_panic, 0, NULL, NULL),
+    createBoolConfig("oom-score-adj", NULL, MODIFIABLE_CONFIG, server.oom_score_adj, 0, NULL, updateOOMScoreAdj),
 
     /* String Configs */
     createStringConfig("aclfile", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, server.acl_filename, "", NULL, NULL),
diff --git a/src/config.h b/src/config.h
index 0fcc42972..e807b9330 100644
--- a/src/config.h
+++ b/src/config.h
@@ -54,6 +54,7 @@
 #define HAVE_PROC_MAPS 1
 #define HAVE_PROC_SMAPS 1
 #define HAVE_PROC_SOMAXCONN 1
+#define HAVE_PROC_OOM_SCORE_ADJ 1
 #endif
 
 /* Test for task_info() */
diff --git a/src/replication.c b/src/replication.c
index 05d450558..846f3a26b 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -2497,6 +2497,9 @@ void replicationSetMaster(char *ip, int port) {
     server.masterhost = sdsnew(ip);
     server.masterport = port;
 
+    /* Update oom_score_adj */
+    setOOMScoreAdj(-1);
+
     /* Force our slaves to resync with us as well. They may hopefully be able
      * to partially resync with us, but we can notify the replid change. */
     disconnectSlaves();
@@ -2564,6 +2567,9 @@ void replicationUnsetMaster(void) {
      * master switch.
*/
     server.slaveseldb = -1;
 
+    /* Update oom_score_adj */
+    setOOMScoreAdj(-1);
+
     /* Once we turn from slave to master, we consider the starting time without
      * slaves (that is used to count the replication backlog time to live) as
      * starting from now. Otherwise the backlog will be freed after a
diff --git a/src/server.c b/src/server.c
index 5a9d55c0c..caf8d302d 100644
--- a/src/server.c
+++ b/src/server.c
@@ -2422,6 +2422,10 @@ void initServerConfig(void) {
     for (j = 0; j < CLIENT_TYPE_OBUF_COUNT; j++)
         server.client_obuf_limits[j] = clientBufferLimitsDefaults[j];
 
+    /* Linux OOM Score config */
+    for (j = 0; j < CONFIG_OOM_COUNT; j++)
+        server.oom_score_adj_values[j] = configOOMScoreAdjValuesDefaults[j];
+
     /* Double constants initialization */
     R_Zero = 0.0;
     R_PosInf = 1.0/R_Zero;
@@ -2527,6 +2531,58 @@ int restartServer(int flags, mstime_t delay) {
     return C_ERR; /* Never reached. */
 }
 
+/* Record the startup oom_score_adj; configured values are relative to it. */
+static void readOOMScoreAdj(void) {
+#ifdef HAVE_PROC_OOM_SCORE_ADJ
+    char buf[64] = {0}; /* read() does not terminate; last byte stays zero. */
+    int fd = open("/proc/self/oom_score_adj", O_RDONLY);
+    if (fd < 0) return;
+    if (read(fd, buf, sizeof(buf) - 1) > 0)
+        server.oom_score_adj_base = atoi(buf);
+    close(fd);
+#endif
+}
+
+/* Configure the current process's oom_score_adj from the user configuration.
+ * Only implemented on Linux. Returns C_OK on success or when oom-score-adj is
+ * disabled, C_ERR if the value cannot be written or the OS is unsupported.
+ *
+ * A process_class value of -1 implies CONFIG_OOM_MASTER or CONFIG_OOM_REPLICA,
+ * depending on current role.
+ */
+int setOOMScoreAdj(int process_class) {
+    if (!server.oom_score_adj) return C_OK;
+    if (process_class == -1)
+        process_class = (server.masterhost ? CONFIG_OOM_REPLICA : CONFIG_OOM_MASTER);
+
+    serverAssert(process_class >= 0 && process_class < CONFIG_OOM_COUNT);
+
+#ifdef HAVE_PROC_OOM_SCORE_ADJ
+    int fd;
+    int val;
+    char buf[64];
+
+    val = server.oom_score_adj_base + server.oom_score_adj_values[process_class];
+    if (val > 1000) val = 1000;
+    if (val < -1000) val = -1000;
+
+    snprintf(buf, sizeof(buf), "%d\n", val);
+
+    fd = open("/proc/self/oom_score_adj", O_WRONLY);
+    if (fd < 0 || write(fd, buf, strlen(buf)) < 0) {
+        serverLog(LL_WARNING, "Unable to write oom_score_adj: %s", strerror(errno));
+        if (fd != -1) close(fd);
+        return C_ERR;
+    }
+
+    close(fd);
+    return C_OK;
+#else
+    /* Unsupported */
+    return C_ERR;
+#endif
+}
+
 /* This function will try to raise the max number of open files accordingly to
  * the configured max number of clients. It also reserves a number of file
  * descriptors (CONFIG_MIN_RESERVED_FDS) for extra operations of
@@ -4866,6 +4922,7 @@ int redisFork() {
     long long start = ustime();
     if ((childpid = fork()) == 0) {
         /* Child */
+        setOOMScoreAdj(CONFIG_OOM_BGCHILD);
         closeListeningSockets(0);
         setupChildSignalHandlers();
     } else {
@@ -5197,6 +5254,7 @@ int main(int argc, char **argv) {
     server.supervised = redisIsSupervised(server.supervised_mode);
     int background = server.daemonize && !server.supervised;
     if (background) daemonize();
+    readOOMScoreAdj();
 
     initServer();
     if (background || server.pidfile) createPidFile();
@@ -5250,6 +5308,8 @@ int main(int argc, char **argv) {
     }
 
     redisSetCpuAffinity(server.server_cpulist);
+    setOOMScoreAdj(-1);
+
     aeMain(server.el);
     aeDeleteEventLoop(server.el);
     return 0;
diff --git a/src/server.h b/src/server.h
index 7f1e7ea7b..4940b0c5b 100644
--- a/src/server.h
+++ b/src/server.h
@@ -150,6 +150,14 @@ typedef long long ustime_t; /* microsecond time type. */
  * in order to make sure of not over provisioning more than 128 fds. */
 #define CONFIG_FDSET_INCR (CONFIG_MIN_RESERVED_FDS+96)
 
+/* OOM Score Adjustment classes.
*/ +#define CONFIG_OOM_MASTER 0 +#define CONFIG_OOM_REPLICA 1 +#define CONFIG_OOM_BGCHILD 2 +#define CONFIG_OOM_COUNT 3 + +extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; + /* Hash table parameters */ #define HASHTABLE_MIN_FILL 10 /* Minimal hash table fill 10% */ @@ -1350,6 +1358,9 @@ struct redisServer { int lfu_log_factor; /* LFU logarithmic counter factor. */ int lfu_decay_time; /* LFU counter decay factor. */ long long proto_max_bulk_len; /* Protocol bulk length maximum size. */ + int oom_score_adj_base; /* Base oom_score_adj value, as observed on startup */ + int oom_score_adj_values[CONFIG_OOM_COUNT]; /* Linux oom_score_adj configuration */ + int oom_score_adj; /* If true, oom_score_adj is managed */ /* Blocked clients */ unsigned int blocked_clients; /* # of clients executing a blocking cmd.*/ unsigned int blocked_clients_by_type[BLOCKED_NUM]; @@ -2016,6 +2027,7 @@ const char *evictPolicyToString(void); struct redisMemOverhead *getMemoryOverheadData(void); void freeMemoryOverheadData(struct redisMemOverhead *mh); void checkChildrenDone(void); +int setOOMScoreAdj(int process_class); #define RESTART_SERVER_NONE 0 #define RESTART_SERVER_GRACEFULLY (1<<0) /* Do proper shutdown. */ diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 7ce0d545e..d0f962762 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -68,6 +68,7 @@ set ::all_tests { unit/pendingquerybuf unit/tls unit/tracking + unit/oom-score-adj } # Index to the next test to run in the ::all_tests list. 
set ::next_test 0
diff --git a/tests/unit/oom-score-adj.tcl b/tests/unit/oom-score-adj.tcl
new file mode 100644
index 000000000..894a70fb2
--- /dev/null
+++ b/tests/unit/oom-score-adj.tcl
@@ -0,0 +1,81 @@
+set system_name [string tolower [exec uname -s]]
+set user_id [exec id -u]
+
+if {$system_name eq {linux}} {
+    start_server {tags {"oom-score-adj"}} {
+        proc get_oom_score_adj {{pid ""}} {
+            if {$pid == ""} {
+                set pid [srv 0 pid]
+            }
+            set fd [open "/proc/$pid/oom_score_adj" "r"]
+            set val [gets $fd]
+            close $fd
+
+            return $val
+        }
+
+        proc get_child_pid {} {
+            set pid [srv 0 pid]
+            set fd [open "|ps --ppid $pid -o pid -h" "r"]
+            set child_pid [string trim [read $fd]]
+            close $fd
+
+            return $child_pid
+        }
+
+        test {CONFIG SET oom-score-adj works as expected} {
+            set base [get_oom_score_adj]
+
+            # Enable oom-score-adj with explicit values, check the master class
+            r config set oom-score-adj-values "10 20 30"
+            r config set oom-score-adj yes
+
+            assert {[get_oom_score_adj] == [expr $base + 10]}
+
+            # Modify current class
+            r config set oom-score-adj-values "15 20 30"
+            assert {[get_oom_score_adj] == [expr $base + 15]}
+
+            # Check replica class
+            r replicaof localhost 1
+            assert {[get_oom_score_adj] == [expr $base + 20]}
+            r replicaof no one
+            assert {[get_oom_score_adj] == [expr $base + 15]}
+
+            # Check child process
+            r set key-a value-a
+            r config set rdb-key-save-delay 100000
+            r bgsave
+
+            set child_pid [get_child_pid]
+            assert {[get_oom_score_adj $child_pid] == [expr $base + 30]}
+        }
+
+        # Failure tests need an unprivileged user, which cannot lower its score
+        if {$user_id != 0} {
+            test {CONFIG SET oom-score-adj handles configuration failures} {
+                # Bad config
+                r config set oom-score-adj no
+                r config set oom-score-adj-values "-1000 -1000 -1000"
+
+                # Enabling must fail: the score cannot be lowered this far
+                catch {r config set oom-score-adj yes} e
+                assert_match {*Failed to set*} $e
+
+                # Make sure it remains off
+                assert {[r config get oom-score-adj] == "oom-score-adj no"}
+
+                # Fix config
+                r config set oom-score-adj-values "0 100 100"
+                r config set oom-score-adj yes
+
+                # Lowering the values while enabled must fail as well
+                catch {r config set oom-score-adj-values "-1000 -1000 -1000"} e
+                assert_match {*Failed*} $e
+
+                # Make sure previous values remain
+                assert {[r config get oom-score-adj-values] == {oom-score-adj-values {0 100 100}}}
+            }
+        }
+    }
+}