Use sync_file_range to optimize fsync if possible (#9409)

We implement incremental data sync in rio.c by calling fsync. On a slow disk, that may cost a lot of time.
sync_file_range can provide an asynchronous fsync, so we can serialize key/value data and sync file data at the same time.

> one tip for sync_file_range usage: http://lkml.iu.edu/hypermail/linux/kernel/1005.2/01845.html

Additionally, this change avoids issuing a single large write, which can result in a mass of dirty
pages in the kernel (increasing the risk that someone else's write blocks).

On HDD, the current solution cuts RDB dump time roughly in half:
this PR takes 50s to dump a 7.7G RDB, while the unstable branch takes 93s.
On NVMe SSD, this PR does not save much time: it takes 40s, while the unstable branch takes 48s.

Moreover, I found that syncing data every 4MB works better than syncing every 32MB.
This commit is contained in:
Wang Yuan 2021-08-30 15:24:53 +08:00 committed by GitHub
parent aefbc23451
commit 9a0c0617f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 47 additions and 13 deletions

View File

@ -1881,13 +1881,13 @@ hz 10
dynamic-hz yes
# When a child rewrites the AOF file, if the following option is enabled
# the file will be fsync-ed every 32 MB of data generated. This is useful
# the file will be fsync-ed every 4 MB of data generated. This is useful
# in order to commit the file to the disk more incrementally and avoid
# big latency spikes.
aof-rewrite-incremental-fsync yes
# When redis saves RDB file, if the following option is enabled
# the file will be fsync-ed every 32 MB of data generated. This is useful
# the file will be fsync-ed every 4 MB of data generated. This is useful
# in order to commit the file to the disk more incrementally and avoid
# big latency spikes.
rdb-save-incremental-fsync yes

View File

@ -120,6 +120,7 @@
/* Define rdb_fsync_range to sync_file_range() on Linux, otherwise we use
* the plain fsync() call. */
#if (defined(__linux__) && defined(SYNC_FILE_RANGE_WAIT_BEFORE))
#define HAVE_SYNC_FILE_RANGE 1
#define rdb_fsync_range(fd,off,size) sync_file_range(fd,off,size,SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE)
#else
#define rdb_fsync_range(fd,off,size) fsync(fd)

View File

@ -108,19 +108,52 @@ void rioInitWithBuffer(rio *r, sds s) {
/* Returns 1 or 0 for success/failure. */
static size_t rioFileWrite(rio *r, const void *buf, size_t len) {
size_t retval;
if (!r->io.file.autosync) return fwrite(buf,len,1,r->io.file.fp);
retval = fwrite(buf,len,1,r->io.file.fp);
r->io.file.buffered += len;
size_t nwritten = 0;
/* Incrementally write data to the file, avoid a single write larger than
* the autosync threshold (so that the kernel's buffer cache never has too
* many dirty pages at once). */
while (len != nwritten) {
serverAssert(r->io.file.autosync > r->io.file.buffered);
size_t nalign = (size_t)(r->io.file.autosync - r->io.file.buffered);
size_t towrite = nalign > len-nwritten ? len-nwritten : nalign;
if (r->io.file.autosync &&
r->io.file.buffered >= r->io.file.autosync)
{
fflush(r->io.file.fp);
if (redis_fsync(fileno(r->io.file.fp)) == -1) return 0;
r->io.file.buffered = 0;
if (fwrite((char*)buf+nwritten,towrite,1,r->io.file.fp) == 0) return 0;
nwritten += towrite;
r->io.file.buffered += towrite;
if (r->io.file.buffered >= r->io.file.autosync) {
fflush(r->io.file.fp);
size_t processed = r->processed_bytes + nwritten;
serverAssert(processed % r->io.file.autosync == 0);
serverAssert(r->io.file.buffered == r->io.file.autosync);
#if HAVE_SYNC_FILE_RANGE
/* Start writeout asynchronously. */
if (sync_file_range(fileno(r->io.file.fp),
processed - r->io.file.autosync, r->io.file.autosync,
SYNC_FILE_RANGE_WRITE) == -1)
return 0;
if (processed >= (size_t)r->io.file.autosync * 2) {
/* To keep the promise to 'autosync', we should make sure last
* asynchronous writeout persists into disk. This call may block
* if last writeout is not finished since disk is slow. */
if (sync_file_range(fileno(r->io.file.fp),
processed - r->io.file.autosync*2,
r->io.file.autosync, SYNC_FILE_RANGE_WAIT_BEFORE|
SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER) == -1)
return 0;
}
#else
if (redis_fsync(fileno(r->io.file.fp)) == -1) return 0;
#endif
r->io.file.buffered = 0;
}
}
return retval;
return 1;
}
/* Returns 1 or 0 for success/failure. */

View File

@ -151,7 +151,7 @@ typedef long long ustime_t; /* microsecond time type. */
#define PROTO_MBULK_BIG_ARG (1024*32)
#define PROTO_RESIZE_THRESHOLD (1024*32) /* Threshold for determining whether to resize query buffer */
#define LONG_STR_SIZE 21 /* Bytes needed for long -> str + '\0' */
#define REDIS_AUTOSYNC_BYTES (1024*1024*32) /* fdatasync every 32MB */
#define REDIS_AUTOSYNC_BYTES (1024*1024*4) /* Sync file every 4MB. */
#define LIMIT_PENDING_QUERYBUF (4*1024*1024) /* 4mb */