From c3df27d1ea3eeff8ad4d0bec2bfc2646efb9843f Mon Sep 17 00:00:00 2001 From: Yossi Gottlieb Date: Mon, 22 Mar 2021 13:25:58 +0200 Subject: [PATCH] Fix slowdown due to child reporting CoW. (#8645) Reading CoW from /proc//smaps can be slow with large processes on some platforms. This measures the time it takes to read CoW info and limits the duty cycle of future updates to roughly 1/100. As current_cow_size no longer represnets a current, fixed interval value there is also a new current_cow_size_age field that provides information about the age of the size value, in seconds. --- src/childinfo.c | 49 +++++++++++++++++++++++++++++++++++++------------ src/server.c | 5 +++++ src/server.h | 6 ++++++ src/zmalloc.c | 5 +++++ 4 files changed, 53 insertions(+), 12 deletions(-) diff --git a/src/childinfo.c b/src/childinfo.c index e3f33a96c..4f0a42001 100644 --- a/src/childinfo.c +++ b/src/childinfo.c @@ -33,6 +33,7 @@ typedef struct { size_t keys; size_t cow; + monotime cow_updated; double progress; childInfoType information_type; /* Type of information */ } child_info_data; @@ -69,18 +70,39 @@ void closeChildInfoPipe(void) { void sendChildInfoGeneric(childInfoType info_type, size_t keys, double progress, char *pname) { if (server.child_info_pipe[1] == -1) return; - child_info_data data = {0}; /* zero everything, including padding to sattisfy valgrind */ + static monotime cow_updated = 0; + static uint64_t cow_update_cost = 0; + static size_t cow = 0; + + child_info_data data = {0}; /* zero everything, including padding to satisfy valgrind */ + + /* When called to report current info, we need to throttle down CoW updates as they + * can be very expensive. To do that, we measure the time it takes to get a reading + * and schedule the next reading to happen not before time*CHILD_COW_COST_FACTOR + * passes. */ + + monotime now = getMonotonicUs(); + if (info_type != CHILD_INFO_TYPE_CURRENT_INFO || + !cow_updated || + now - cow_updated > cow_update_cost * CHILD_COW_DUTY_CYCLE) + { + cow = zmalloc_get_private_dirty(-1); + cow_updated = getMonotonicUs(); + cow_update_cost = cow_updated - now; + + if (cow) { + serverLog((info_type == CHILD_INFO_TYPE_CURRENT_INFO) ? LL_VERBOSE : LL_NOTICE, + "%s: %zu MB of memory used by copy-on-write", + pname, data.cow / (1024 * 1024)); + } + } + data.information_type = info_type; data.keys = keys; - data.cow = zmalloc_get_private_dirty(-1); + data.cow = cow; + data.cow_updated = cow_updated; data.progress = progress; - if (data.cow) { - serverLog((info_type == CHILD_INFO_TYPE_CURRENT_INFO) ? LL_VERBOSE : LL_NOTICE, - "%s: %zu MB of memory used by copy-on-write", - pname, data.cow/(1024*1024)); - } - ssize_t wlen = sizeof(data); if (write(server.child_info_pipe[1], &data, wlen) != wlen) { @@ -89,9 +111,10 @@ void sendChildInfoGeneric(childInfoType info_type, size_t keys, double progress, } /* Update Child info. */ -void updateChildInfo(childInfoType information_type, size_t cow, size_t keys, double progress) { +void updateChildInfo(childInfoType information_type, size_t cow, monotime cow_updated, size_t keys, double progress) { if (information_type == CHILD_INFO_TYPE_CURRENT_INFO) { server.stat_current_cow_bytes = cow; + server.stat_current_cow_updated = cow_updated; server.stat_current_save_keys_processed = keys; if (progress != -1) server.stat_module_progress = progress; } else if (information_type == CHILD_INFO_TYPE_AOF_COW_SIZE) { @@ -107,7 +130,7 @@ void updateChildInfo(childInfoType information_type, size_t cow, size_t keys, do * if complete data read into the buffer, * data is stored into *buffer, and returns 1. * otherwise, the partial data is left in the buffer, waiting for the next read, and returns 0. */ -int readChildInfo(childInfoType *information_type, size_t *cow, size_t *keys, double* progress) { +int readChildInfo(childInfoType *information_type, size_t *cow, monotime *cow_updated, size_t *keys, double* progress) { /* We are using here a static buffer in combination with the server.child_info_nread to handle short reads */ static child_info_data buffer; ssize_t wlen = sizeof(buffer); @@ -124,6 +147,7 @@ int readChildInfo(childInfoType *information_type, size_t *cow, size_t *keys, do if (server.child_info_nread == wlen) { *information_type = buffer.information_type; *cow = buffer.cow; + *cow_updated = buffer.cow_updated; *keys = buffer.keys; *progress = buffer.progress; return 1; @@ -137,12 +161,13 @@ void receiveChildInfo(void) { if (server.child_info_pipe[0] == -1) return; size_t cow; + monotime cow_updated; size_t keys; double progress; childInfoType information_type; /* Drain the pipe and update child info so that we get the final message. */ - while (readChildInfo(&information_type, &cow, &keys, &progress)) { - updateChildInfo(information_type, cow, keys, progress); + while (readChildInfo(&information_type, &cow, &cow_updated, &keys, &progress)) { + updateChildInfo(information_type, cow, cow_updated, keys, progress); } } diff --git a/src/server.c b/src/server.c index 0ed58c07f..e19ddbed8 100644 --- a/src/server.c +++ b/src/server.c @@ -1614,6 +1614,7 @@ void resetChildState() { server.child_type = CHILD_TYPE_NONE; server.child_pid = -1; server.stat_current_cow_bytes = 0; + server.stat_current_cow_updated = 0; server.stat_current_save_keys_processed = 0; server.stat_module_progress = 0; server.stat_current_save_keys_total = 0; @@ -3267,6 +3268,7 @@ void initServer(void) { server.stat_starttime = time(NULL); server.stat_peak_memory = 0; server.stat_current_cow_bytes = 0; + server.stat_current_cow_updated = 0; server.stat_current_save_keys_processed = 0; server.stat_current_save_keys_total = 0; server.stat_rdb_cow_bytes = 0; @@ -4835,6 +4837,7 @@ sds genRedisInfoString(const char *section) { "# Persistence\r\n" "loading:%d\r\n" "current_cow_size:%zu\r\n" + "current_cow_size_age:%lu\r\n" "current_fork_perc:%.2f\r\n" "current_save_keys_processed:%zu\r\n" "current_save_keys_total:%zu\r\n" @@ -4857,6 +4860,7 @@ sds genRedisInfoString(const char *section) { "module_fork_last_cow_size:%zu\r\n", (int)server.loading, server.stat_current_cow_bytes, + server.stat_current_cow_updated ? (unsigned long) elapsedMs(server.stat_current_cow_updated) / 1000 : 0, fork_perc, server.stat_current_save_keys_processed, server.stat_current_save_keys_total, @@ -5829,6 +5833,7 @@ int redisFork(int purpose) { server.child_pid = childpid; server.child_type = purpose; server.stat_current_cow_bytes = 0; + server.stat_current_cow_updated = 0; server.stat_current_save_keys_processed = 0; server.stat_module_progress = 0; server.stat_current_save_keys_total = dbTotalServerKeyCount(); diff --git a/src/server.h b/src/server.h index 81395f4bb..f0fbefef0 100644 --- a/src/server.h +++ b/src/server.h @@ -130,6 +130,11 @@ typedef long long ustime_t; /* microsecond time type. */ * special code. */ #define SERVER_CHILD_NOERROR_RETVAL 255 +/* Reading copy-on-write info is sometimes expensive and may slow down child + * processes that report it continuously. We measure the cost of obtaining it + * and hold back additional reading based on this factor. */ +#define CHILD_COW_DUTY_CYCLE 100 + /* Instantaneous metrics tracking. */ #define STATS_METRIC_SAMPLES 16 /* Number of samples per metric. */ #define STATS_METRIC_COMMAND 0 /* Number of commands executed. */ @@ -1279,6 +1284,7 @@ struct redisServer { redisAtomic long long stat_net_input_bytes; /* Bytes read from network. */ redisAtomic long long stat_net_output_bytes; /* Bytes written to network. */ size_t stat_current_cow_bytes; /* Copy on write bytes while child is active. */ + monotime stat_current_cow_updated; /* Last update time of stat_current_cow_bytes */ size_t stat_current_save_keys_processed; /* Processed keys while child is active. */ size_t stat_current_save_keys_total; /* Number of keys when child started. */ size_t stat_rdb_cow_bytes; /* Copy on write bytes during RDB saving. */ diff --git a/src/zmalloc.c b/src/zmalloc.c index 77eeea174..3645efcf1 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -613,6 +613,11 @@ size_t zmalloc_get_smap_bytes_by_field(char *field, long pid) { } #endif +/* Return the total number bytes in pages marked as Private Dirty. + * + * Note: depending on the platform and memory footprint of the process, this + * call can be slow, exceeding 1000ms! + */ size_t zmalloc_get_private_dirty(long pid) { return zmalloc_get_smap_bytes_by_field("Private_Dirty:",pid); }