
# Background

The RDB file is usually generated and used once and seldom used again, but its content stays in the page cache until the OS evicts it. A potential problem is that once free memory is exhausted, the OS has to reclaim memory from the page cache or swap anonymous pages out, which may cause jitter in the Redis service. Consider a concrete scenario: a high-capacity machine hosts many Redis instances, and we are upgrading all of them together. The page cache on the host machine grows as the RDB files are generated. Once free memory drops to the low watermark (which is more likely to happen on older Linux kernels such as 3.10: before [watermark_scale_factor](https://lore.kernel.org/lkml/1455813719-2395-1-git-send-email-hannes@cmpxchg.org/) was introduced, the `low watermark` was linear in the `min watermark`, so there was not much buffer space for `kswapd` to be woken up to reclaim memory), a `direct reclaim` happens, which means the process stalls while waiting for memory allocation.

# What the PR does

The PR introduces the capability to reclaim the cache when the RDB file is operated on. Generally there are two cases: reading and writing the RDB file. For reads, incremental reclaim is a little messy to implement, so the reclaim is done in one go in the background after the load has finished, to avoid blocking the work thread. For writes, incremental reclaim amortizes the reclaim work, so there is no need to move it to the background, and the peak cache watermark is reduced this way. Two cases are handled specially: replication and restart. In both of them the cache is leveraged to speed up processing, so the reclaim is postponed to an appropriate time. To do this, a flag is added to `rdbSave` and `rdbLoad` to control whether the cache needs to be kept, with the default value false.

# Things worth noting

1. Although `posix_fadvise` is part of the POSIX standard, only a few platforms support it, e.g. Linux and FreeBSD 10.0.
2. On Linux, `posix_fadvise` only takes effect on pages that have already been written back, so a `sync` (or `fsync`, `fdatasync`) is needed to flush the dirty pages before calling `posix_fadvise` when we reclaim the write cache.

# About the tests

A unit test is added to verify the effect of `posix_fadvise`. In the integration tests the overall cache increase is checked, as well as the cache backed by the RDB file, because a specific TCL test is executed in an isolated GitHub Actions job.
184 lines
8.7 KiB
C
184 lines
8.7 KiB
C
/*
|
|
* Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of Redis nor the names of its contributors may be used
|
|
* to endorse or promote products derived from this software without
|
|
* specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#ifndef __RDB_H
|
|
#define __RDB_H
|
|
|
|
#include <stdio.h>
|
|
#include "rio.h"
|
|
|
|
/* TBD: include only necessary headers. */
|
|
#include "server.h"
|
|
|
|
/* The current RDB version. When the format changes in a way that is no longer
|
|
* backward compatible this number gets incremented.
*
* NOTE(review): rdbLoadMillisecondTime() and rdbFunctionLoad(), declared in
* this header, take an explicit version argument (`rdbver` / `ver`),
* presumably so files written with older versions can still be read --
* confirm in rdb.c. */
|
|
#define RDB_VERSION 11
|
|
|
|
/* Defines related to the dump file format. To store 32 bits lengths for short
|
|
* keys requires a lot of space, so we check the most significant 2 bits of
|
|
* the first byte to interpret the length:
|
|
*
|
|
* 00|XXXXXX => if the two MSB are 00 the len is the 6 bits of this byte
|
|
* 01|XXXXXX XXXXXXXX => 01, the len is 14 bits, 6 bits + 8 bits of next byte
|
|
* 10|000000 [32 bit integer] => A full 32 bit len in net byte order will follow
|
|
* 10|000001 [64 bit integer] => A full 64 bit len in net byte order will follow
|
|
* 11|OBKIND this means: specially encoded object will follow. The six bits
|
|
* number specify the kind of object that follows.
|
|
* See the RDB_ENC_* defines.
|
|
*
|
|
* Lengths up to 63 are stored using a single byte, most DB keys, and many
|
|
* values, will fit inside. */
|
|
#define RDB_6BITLEN 0 /* Two MSB are 00: length is the low 6 bits of the byte. */
|
|
#define RDB_14BITLEN 1 /* Two MSB are 01: 14 bit length, 6 bits + next byte. */
|
|
#define RDB_32BITLEN 0x80 /* Whole first byte is 10|000000: 32 bit length follows. */
|
|
#define RDB_64BITLEN 0x81 /* Whole first byte is 10|000001: 64 bit length follows. */
|
|
#define RDB_ENCVAL 3 /* Two MSB are 11: a specially encoded object follows. */
|
|
#define RDB_LENERR UINT64_MAX /* NOTE(review): presumably the failure value of rdbLoadLen(), which returns uint64_t -- confirm in rdb.c. */
|
|
|
|
/* When a length of a string object stored on disk has the first two bits
|
|
* set, the remaining six bits specify a special encoding for the object
|
|
* according to the following defines: */
|
|
#define RDB_ENC_INT8 0 /* 8 bit signed integer */
|
|
#define RDB_ENC_INT16 1 /* 16 bit signed integer */
|
|
#define RDB_ENC_INT32 2 /* 32 bit signed integer */
|
|
#define RDB_ENC_LZF 3 /* string compressed with FASTLZ */
|
|
|
|
/* Map object types to RDB object types. Macros starting with OBJ_ are for
|
|
* memory storage and may change. Instead RDB types must be fixed because
|
|
* we store them on disk.
*
* The codes in this group cover 0-7, which is the first range accepted by
* rdbIsObjectType(). */
|
|
#define RDB_TYPE_STRING 0
|
|
#define RDB_TYPE_LIST 1
|
|
#define RDB_TYPE_SET 2
|
|
#define RDB_TYPE_ZSET 3
|
|
#define RDB_TYPE_HASH 4
|
|
#define RDB_TYPE_ZSET_2 5 /* ZSET version 2 with doubles stored in binary. */
|
|
#define RDB_TYPE_MODULE_PRE_GA 6 /* Used in 4.0 release candidates */
|
|
#define RDB_TYPE_MODULE_2 7 /* Module value with annotations for parsing without
|
|
the generating module being loaded. */
|
|
/* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType() BELOW */
|
|
|
|
/* Object types for encoded objects. The codes in this group cover 9-21, which
 * is the second range accepted by rdbIsObjectType(); 8 is not assigned to any
 * RDB_TYPE_* define visible in this header. */
|
|
#define RDB_TYPE_HASH_ZIPMAP 9
|
|
#define RDB_TYPE_LIST_ZIPLIST 10
|
|
#define RDB_TYPE_SET_INTSET 11
|
|
#define RDB_TYPE_ZSET_ZIPLIST 12
|
|
#define RDB_TYPE_HASH_ZIPLIST 13
|
|
#define RDB_TYPE_LIST_QUICKLIST 14
|
|
#define RDB_TYPE_STREAM_LISTPACKS 15
|
|
#define RDB_TYPE_HASH_LISTPACK 16
|
|
#define RDB_TYPE_ZSET_LISTPACK 17
|
|
#define RDB_TYPE_LIST_QUICKLIST_2 18
|
|
#define RDB_TYPE_STREAM_LISTPACKS_2 19
|
|
#define RDB_TYPE_SET_LISTPACK 20
|
|
#define RDB_TYPE_STREAM_LISTPACKS_3 21
|
|
/* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType() BELOW */
|
|
|
|
/* Test if a type is an object type. */
|
|
/* Evaluates to 1 when `t` is an RDB object type code, i.e. it lies inside
 * 0..7 or inside 9..21, and to 0 otherwise. Written as "not outside the
 * range" so the argument is still compared in the same left-to-right order:
 * lower bound first, then upper bound, first range before second. `t` may be
 * evaluated up to four times, so do not pass expressions with side effects. */
#define rdbIsObjectType(t) (!((t) < 0 || (t) > 7) || !((t) < 9 || (t) > 21))
|
|
|
|
/* Special RDB opcodes (saved/loaded with rdbSaveType/rdbLoadType).
 * The values defined here span 245-255, so none of them satisfies
 * rdbIsObjectType(), which only accepts 0-7 and 9-21. */
|
|
#define RDB_OPCODE_FUNCTION2 245 /* function library data */
|
|
#define RDB_OPCODE_FUNCTION_PRE_GA 246 /* old function library data for 7.0 rc1 and rc2 */
|
|
#define RDB_OPCODE_MODULE_AUX 247 /* Module auxiliary data. */
|
|
#define RDB_OPCODE_IDLE 248 /* LRU idle time. */
|
|
#define RDB_OPCODE_FREQ 249 /* LFU frequency. */
|
|
#define RDB_OPCODE_AUX 250 /* RDB aux field. */
|
|
#define RDB_OPCODE_RESIZEDB 251 /* Hash table resize hint. */
|
|
#define RDB_OPCODE_EXPIRETIME_MS 252 /* Expire time in milliseconds. */
|
|
#define RDB_OPCODE_EXPIRETIME 253 /* Old expire time in seconds. */
|
|
#define RDB_OPCODE_SELECTDB 254 /* DB number of the following keys. */
|
|
#define RDB_OPCODE_EOF 255 /* End of the RDB file. */
|
|
|
|
/* Sub-opcodes used inside module serialized values. The codes are
 * consecutive, 0 through 5. */
|
|
#define RDB_MODULE_OPCODE_EOF 0 /* End of module value. */
|
|
#define RDB_MODULE_OPCODE_SINT 1 /* Signed integer. */
|
|
#define RDB_MODULE_OPCODE_UINT 2 /* Unsigned integer. */
|
|
#define RDB_MODULE_OPCODE_FLOAT 3 /* Float. */
|
|
#define RDB_MODULE_OPCODE_DOUBLE 4 /* Double. */
|
|
#define RDB_MODULE_OPCODE_STRING 5 /* String. */
|
|
|
|
/* rdbLoad...() functions flags. RDB_LOAD_NONE is 0 and every other flag is a
 * distinct single bit.
 * NOTE(review): presumably these are what rdbGenericLoadStringObject() expects
 * in its `flags` argument -- confirm in rdb.c. */
|
|
#define RDB_LOAD_NONE 0
|
|
#define RDB_LOAD_ENC (1<<0)
|
|
#define RDB_LOAD_PLAIN (1<<1)
|
|
#define RDB_LOAD_SDS (1<<2)
|
|
|
|
/* Flags describing the purpose of an RDB save or load. RDBFLAGS_NONE is 0 and
 * every other flag is a distinct single bit.
 * NOTE(review): presumably OR-ed together into the `rdbflags` argument of the
 * rdbSave*()/rdbLoad*() functions declared in this header -- confirm in
 * rdb.c. */
|
|
#define RDBFLAGS_NONE 0 /* No special RDB loading. */
|
|
#define RDBFLAGS_AOF_PREAMBLE (1<<0) /* Load/save the RDB as AOF preamble. */
|
|
#define RDBFLAGS_REPLICATION (1<<1) /* Load/save for SYNC. */
|
|
#define RDBFLAGS_ALLOW_DUP (1<<2) /* Allow duplicated keys when loading.*/
|
|
#define RDBFLAGS_FEED_REPL (1<<3) /* Feed replication stream when loading.*/
|
|
#define RDBFLAGS_KEEP_CACHE (1<<4) /* Don't reclaim page cache after the RDB file is generated or loaded. */
|
|
|
|
/* When rdbLoadObject() returns NULL, the err flag is
|
|
* set to hold the type of error that occurred.
*
* NOTE(review): "the err flag" presumably refers to the `int *error`
* out-parameter in the rdbLoadObject() prototype -- confirm in rdb.c. */
|
|
#define RDB_LOAD_ERR_EMPTY_KEY 1 /* Error of empty key */
|
|
#define RDB_LOAD_ERR_OTHER 2 /* Any other errors */
|
|
|
|
/* Primitive helpers. Every function in this group operates on the `rio *rdb`
 * passed as its first argument.
 * NOTE(review): the `rio` type is not declared in this header; presumably it
 * comes from "rio.h" -- confirm. */
ssize_t rdbWriteRaw(rio *rdb, void *p, size_t len);
|
|
/* rdbSaveType()/rdbLoadType() are the functions the RDB_OPCODE_* comment names
 * as saving/loading the special opcodes. */
int rdbSaveType(rio *rdb, unsigned char type);
|
|
int rdbLoadType(rio *rdb);
|
|
time_t rdbLoadTime(rio *rdb);
|
|
int rdbSaveLen(rio *rdb, uint64_t len);
|
|
int rdbSaveMillisecondTime(rio *rdb, long long t);
|
|
long long rdbLoadMillisecondTime(rio *rdb, int rdbver);
|
|
/* NOTE(review): returns uint64_t, so RDB_LENERR (UINT64_MAX) is presumably its
 * failure value, and `isencoded` presumably reports the RDB_ENCVAL case --
 * confirm in rdb.c. */
uint64_t rdbLoadLen(rio *rdb, int *isencoded);
|
|
int rdbLoadLenByRef(rio *rdb, int *isencoded, uint64_t *lenptr);
|
|
int rdbSaveObjectType(rio *rdb, robj *o);
|
|
int rdbLoadObjectType(rio *rdb);
|
|
/* Entry points that take a file name, a request code and/or a child pid
 * rather than a `rio` stream.
 * NOTE(review): `rdbflags` is presumably a bitwise OR of the RDBFLAGS_* values
 * defined in this header -- confirm in rdb.c. */
int rdbLoad(char *filename, rdbSaveInfo *rsi, int rdbflags);
|
|
int rdbSaveBackground(int req, char *filename, rdbSaveInfo *rsi, int rdbflags);
|
|
int rdbSaveToSlavesSockets(int req, rdbSaveInfo *rsi);
|
|
void rdbRemoveTempFile(pid_t childpid, int from_signal);
|
|
int rdbSave(int req, char *filename, rdbSaveInfo *rsi, int rdbflags);
|
|
/* Object-level and stream-level save/load declarations.
 * NOTE(review): robj, sds, moduleType, rdbSaveInfo, rdbLoadingCtx and
 * functionsLibCtx are not declared in this header; presumably they come from
 * "server.h" -- confirm. */
ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid);
|
|
size_t rdbSavedObjectLen(robj *o, robj *key, int dbid);
|
|
/* Per the RDB_LOAD_ERR_* comment in this header, a NULL return is accompanied
 * by an error type; NOTE(review): presumably stored through `error` --
 * confirm in rdb.c. */
robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error);
|
|
void backgroundSaveDoneHandler(int exitcode, int bysignal);
|
|
int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, long long expiretime,int dbid);
|
|
ssize_t rdbSaveSingleModuleAux(rio *rdb, int when, moduleType *mt);
|
|
robj *rdbLoadCheckModuleValue(rio *rdb, char *modulename);
|
|
robj *rdbLoadStringObject(rio *rdb);
|
|
ssize_t rdbSaveStringObject(rio *rdb, robj *obj);
|
|
ssize_t rdbSaveRawString(rio *rdb, unsigned char *s, size_t len);
|
|
/* NOTE(review): `flags` is presumably a combination of the RDB_LOAD_* values
 * defined in this header -- confirm in rdb.c. */
void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr);
|
|
int rdbSaveBinaryDoubleValue(rio *rdb, double val);
|
|
int rdbLoadBinaryDoubleValue(rio *rdb, double *val);
|
|
int rdbSaveBinaryFloatValue(rio *rdb, float val);
|
|
int rdbLoadBinaryFloatValue(rio *rdb, float *val);
|
|
int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi);
|
|
int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadingCtx *rdb_loading_ctx);
|
|
int rdbFunctionLoad(rio *rdb, int ver, functionsLibCtx* lib_ctx, int rdbflags, sds *err);
|
|
int rdbSaveRio(int req, rio *rdb, int *error, int rdbflags, rdbSaveInfo *rsi);
|
|
ssize_t rdbSaveFunctions(rio *rdb);
|
|
rdbSaveInfo *rdbPopulateSaveInfo(rdbSaveInfo *rsi);
|
|
|
|
#endif
|