Fix slowdown due to child reporting CoW. (#8645)

Reading CoW from /proc/<pid>/smaps can be slow with large processes on
some platforms.

This measures the time it takes to read CoW info and limits the duty
cycle of future updates to roughly 1/100.

As current_cow_size no longer represnets a current, fixed interval value
there is also a new current_cow_size_age field that provides information
about the age of the size value, in seconds.
This commit is contained in:
Yossi Gottlieb 2021-03-22 13:25:58 +02:00 committed by GitHub
parent 2f717c156a
commit c3df27d1ea
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 53 additions and 12 deletions

View File

@ -33,6 +33,7 @@
typedef struct { typedef struct {
size_t keys; size_t keys;
size_t cow; size_t cow;
monotime cow_updated;
double progress; double progress;
childInfoType information_type; /* Type of information */ childInfoType information_type; /* Type of information */
} child_info_data; } child_info_data;
@ -69,17 +70,38 @@ void closeChildInfoPipe(void) {
void sendChildInfoGeneric(childInfoType info_type, size_t keys, double progress, char *pname) { void sendChildInfoGeneric(childInfoType info_type, size_t keys, double progress, char *pname) {
if (server.child_info_pipe[1] == -1) return; if (server.child_info_pipe[1] == -1) return;
child_info_data data = {0}; /* zero everything, including padding to sattisfy valgrind */ static monotime cow_updated = 0;
data.information_type = info_type; static uint64_t cow_update_cost = 0;
data.keys = keys; static size_t cow = 0;
data.cow = zmalloc_get_private_dirty(-1);
data.progress = progress;
if (data.cow) { child_info_data data = {0}; /* zero everything, including padding to satisfy valgrind */
/* When called to report current info, we need to throttle down CoW updates as they
* can be very expensive. To do that, we measure the time it takes to get a reading
* and schedule the next reading to happen not before time*CHILD_COW_COST_FACTOR
* passes. */
monotime now = getMonotonicUs();
if (info_type != CHILD_INFO_TYPE_CURRENT_INFO ||
!cow_updated ||
now - cow_updated > cow_update_cost * CHILD_COW_DUTY_CYCLE)
{
cow = zmalloc_get_private_dirty(-1);
cow_updated = getMonotonicUs();
cow_update_cost = cow_updated - now;
if (cow) {
serverLog((info_type == CHILD_INFO_TYPE_CURRENT_INFO) ? LL_VERBOSE : LL_NOTICE, serverLog((info_type == CHILD_INFO_TYPE_CURRENT_INFO) ? LL_VERBOSE : LL_NOTICE,
"%s: %zu MB of memory used by copy-on-write", "%s: %zu MB of memory used by copy-on-write",
pname, data.cow / (1024 * 1024)); pname, data.cow / (1024 * 1024));
} }
}
data.information_type = info_type;
data.keys = keys;
data.cow = cow;
data.cow_updated = cow_updated;
data.progress = progress;
ssize_t wlen = sizeof(data); ssize_t wlen = sizeof(data);
@ -89,9 +111,10 @@ void sendChildInfoGeneric(childInfoType info_type, size_t keys, double progress,
} }
/* Update Child info. */ /* Update Child info. */
void updateChildInfo(childInfoType information_type, size_t cow, size_t keys, double progress) { void updateChildInfo(childInfoType information_type, size_t cow, monotime cow_updated, size_t keys, double progress) {
if (information_type == CHILD_INFO_TYPE_CURRENT_INFO) { if (information_type == CHILD_INFO_TYPE_CURRENT_INFO) {
server.stat_current_cow_bytes = cow; server.stat_current_cow_bytes = cow;
server.stat_current_cow_updated = cow_updated;
server.stat_current_save_keys_processed = keys; server.stat_current_save_keys_processed = keys;
if (progress != -1) server.stat_module_progress = progress; if (progress != -1) server.stat_module_progress = progress;
} else if (information_type == CHILD_INFO_TYPE_AOF_COW_SIZE) { } else if (information_type == CHILD_INFO_TYPE_AOF_COW_SIZE) {
@ -107,7 +130,7 @@ void updateChildInfo(childInfoType information_type, size_t cow, size_t keys, do
* if complete data read into the buffer, * if complete data read into the buffer,
* data is stored into *buffer, and returns 1. * data is stored into *buffer, and returns 1.
* otherwise, the partial data is left in the buffer, waiting for the next read, and returns 0. */ * otherwise, the partial data is left in the buffer, waiting for the next read, and returns 0. */
int readChildInfo(childInfoType *information_type, size_t *cow, size_t *keys, double* progress) { int readChildInfo(childInfoType *information_type, size_t *cow, monotime *cow_updated, size_t *keys, double* progress) {
/* We are using here a static buffer in combination with the server.child_info_nread to handle short reads */ /* We are using here a static buffer in combination with the server.child_info_nread to handle short reads */
static child_info_data buffer; static child_info_data buffer;
ssize_t wlen = sizeof(buffer); ssize_t wlen = sizeof(buffer);
@ -124,6 +147,7 @@ int readChildInfo(childInfoType *information_type, size_t *cow, size_t *keys, do
if (server.child_info_nread == wlen) { if (server.child_info_nread == wlen) {
*information_type = buffer.information_type; *information_type = buffer.information_type;
*cow = buffer.cow; *cow = buffer.cow;
*cow_updated = buffer.cow_updated;
*keys = buffer.keys; *keys = buffer.keys;
*progress = buffer.progress; *progress = buffer.progress;
return 1; return 1;
@ -137,12 +161,13 @@ void receiveChildInfo(void) {
if (server.child_info_pipe[0] == -1) return; if (server.child_info_pipe[0] == -1) return;
size_t cow; size_t cow;
monotime cow_updated;
size_t keys; size_t keys;
double progress; double progress;
childInfoType information_type; childInfoType information_type;
/* Drain the pipe and update child info so that we get the final message. */ /* Drain the pipe and update child info so that we get the final message. */
while (readChildInfo(&information_type, &cow, &keys, &progress)) { while (readChildInfo(&information_type, &cow, &cow_updated, &keys, &progress)) {
updateChildInfo(information_type, cow, keys, progress); updateChildInfo(information_type, cow, cow_updated, keys, progress);
} }
} }

View File

@ -1614,6 +1614,7 @@ void resetChildState() {
server.child_type = CHILD_TYPE_NONE; server.child_type = CHILD_TYPE_NONE;
server.child_pid = -1; server.child_pid = -1;
server.stat_current_cow_bytes = 0; server.stat_current_cow_bytes = 0;
server.stat_current_cow_updated = 0;
server.stat_current_save_keys_processed = 0; server.stat_current_save_keys_processed = 0;
server.stat_module_progress = 0; server.stat_module_progress = 0;
server.stat_current_save_keys_total = 0; server.stat_current_save_keys_total = 0;
@ -3267,6 +3268,7 @@ void initServer(void) {
server.stat_starttime = time(NULL); server.stat_starttime = time(NULL);
server.stat_peak_memory = 0; server.stat_peak_memory = 0;
server.stat_current_cow_bytes = 0; server.stat_current_cow_bytes = 0;
server.stat_current_cow_updated = 0;
server.stat_current_save_keys_processed = 0; server.stat_current_save_keys_processed = 0;
server.stat_current_save_keys_total = 0; server.stat_current_save_keys_total = 0;
server.stat_rdb_cow_bytes = 0; server.stat_rdb_cow_bytes = 0;
@ -4835,6 +4837,7 @@ sds genRedisInfoString(const char *section) {
"# Persistence\r\n" "# Persistence\r\n"
"loading:%d\r\n" "loading:%d\r\n"
"current_cow_size:%zu\r\n" "current_cow_size:%zu\r\n"
"current_cow_size_age:%lu\r\n"
"current_fork_perc:%.2f\r\n" "current_fork_perc:%.2f\r\n"
"current_save_keys_processed:%zu\r\n" "current_save_keys_processed:%zu\r\n"
"current_save_keys_total:%zu\r\n" "current_save_keys_total:%zu\r\n"
@ -4857,6 +4860,7 @@ sds genRedisInfoString(const char *section) {
"module_fork_last_cow_size:%zu\r\n", "module_fork_last_cow_size:%zu\r\n",
(int)server.loading, (int)server.loading,
server.stat_current_cow_bytes, server.stat_current_cow_bytes,
server.stat_current_cow_updated ? (unsigned long) elapsedMs(server.stat_current_cow_updated) / 1000 : 0,
fork_perc, fork_perc,
server.stat_current_save_keys_processed, server.stat_current_save_keys_processed,
server.stat_current_save_keys_total, server.stat_current_save_keys_total,
@ -5829,6 +5833,7 @@ int redisFork(int purpose) {
server.child_pid = childpid; server.child_pid = childpid;
server.child_type = purpose; server.child_type = purpose;
server.stat_current_cow_bytes = 0; server.stat_current_cow_bytes = 0;
server.stat_current_cow_updated = 0;
server.stat_current_save_keys_processed = 0; server.stat_current_save_keys_processed = 0;
server.stat_module_progress = 0; server.stat_module_progress = 0;
server.stat_current_save_keys_total = dbTotalServerKeyCount(); server.stat_current_save_keys_total = dbTotalServerKeyCount();

View File

@ -130,6 +130,11 @@ typedef long long ustime_t; /* microsecond time type. */
* special code. */ * special code. */
#define SERVER_CHILD_NOERROR_RETVAL 255 #define SERVER_CHILD_NOERROR_RETVAL 255
/* Reading copy-on-write info is sometimes expensive and may slow down child
* processes that report it continuously. We measure the cost of obtaining it
* and hold back additional reading based on this factor. */
#define CHILD_COW_DUTY_CYCLE 100
/* Instantaneous metrics tracking. */ /* Instantaneous metrics tracking. */
#define STATS_METRIC_SAMPLES 16 /* Number of samples per metric. */ #define STATS_METRIC_SAMPLES 16 /* Number of samples per metric. */
#define STATS_METRIC_COMMAND 0 /* Number of commands executed. */ #define STATS_METRIC_COMMAND 0 /* Number of commands executed. */
@ -1279,6 +1284,7 @@ struct redisServer {
redisAtomic long long stat_net_input_bytes; /* Bytes read from network. */ redisAtomic long long stat_net_input_bytes; /* Bytes read from network. */
redisAtomic long long stat_net_output_bytes; /* Bytes written to network. */ redisAtomic long long stat_net_output_bytes; /* Bytes written to network. */
size_t stat_current_cow_bytes; /* Copy on write bytes while child is active. */ size_t stat_current_cow_bytes; /* Copy on write bytes while child is active. */
monotime stat_current_cow_updated; /* Last update time of stat_current_cow_bytes */
size_t stat_current_save_keys_processed; /* Processed keys while child is active. */ size_t stat_current_save_keys_processed; /* Processed keys while child is active. */
size_t stat_current_save_keys_total; /* Number of keys when child started. */ size_t stat_current_save_keys_total; /* Number of keys when child started. */
size_t stat_rdb_cow_bytes; /* Copy on write bytes during RDB saving. */ size_t stat_rdb_cow_bytes; /* Copy on write bytes during RDB saving. */

View File

@ -613,6 +613,11 @@ size_t zmalloc_get_smap_bytes_by_field(char *field, long pid) {
} }
#endif #endif
/* Return the total number bytes in pages marked as Private Dirty.
*
* Note: depending on the platform and memory footprint of the process, this
* call can be slow, exceeding 1000ms!
*/
size_t zmalloc_get_private_dirty(long pid) { size_t zmalloc_get_private_dirty(long pid) {
return zmalloc_get_smap_bytes_by_field("Private_Dirty:",pid); return zmalloc_get_smap_bytes_by_field("Private_Dirty:",pid);
} }