diff --git a/redis.conf b/redis.conf
index 84f45fdb7..09107dcea 100644
--- a/redis.conf
+++ b/redis.conf
@@ -1881,13 +1881,13 @@ hz 10
 dynamic-hz yes
 
 # When a child rewrites the AOF file, if the following option is enabled
-# the file will be fsync-ed every 32 MB of data generated. This is useful
+# the file will be fsync-ed every 4 MB of data generated. This is useful
 # in order to commit the file to the disk more incrementally and avoid
 # big latency spikes.
 aof-rewrite-incremental-fsync yes
 
 # When redis saves RDB file, if the following option is enabled
-# the file will be fsync-ed every 32 MB of data generated. This is useful
+# the file will be fsync-ed every 4 MB of data generated. This is useful
 # in order to commit the file to the disk more incrementally and avoid
 # big latency spikes.
 rdb-save-incremental-fsync yes
diff --git a/src/config.h b/src/config.h
index 4700e7208..5a4223fbd 100644
--- a/src/config.h
+++ b/src/config.h
@@ -120,6 +120,7 @@
 /* Define rdb_fsync_range to sync_file_range() on Linux, otherwise we use
  * the plain fsync() call. */
 #if (defined(__linux__) && defined(SYNC_FILE_RANGE_WAIT_BEFORE))
+#define HAVE_SYNC_FILE_RANGE 1
 #define rdb_fsync_range(fd,off,size) sync_file_range(fd,off,size,SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE)
 #else
 #define rdb_fsync_range(fd,off,size) fsync(fd)
diff --git a/src/rio.c b/src/rio.c
index 2234ec6e3..70817347b 100644
--- a/src/rio.c
+++ b/src/rio.c
@@ -108,19 +108,62 @@ void rioInitWithBuffer(rio *r, sds s) {
 
-/* Returns 1 or 0 for success/failure. */
+/* Write 'len' bytes from 'buf' to the file behind 'r'.
+ *
+ * Without autosync this is a single fwrite(). With autosync the data is
+ * written in pieces that never cross an autosync boundary; each time a
+ * boundary is reached the stdio buffer is flushed and the data is pushed
+ * to disk (sync_file_range() where available, redis_fsync() otherwise).
+ *
+ * Returns 1 or 0 for success/failure. */
 static size_t rioFileWrite(rio *r, const void *buf, size_t len) {
-    size_t retval;
+    if (!r->io.file.autosync) return fwrite(buf,len,1,r->io.file.fp);
 
-    retval = fwrite(buf,len,1,r->io.file.fp);
-    r->io.file.buffered += len;
+    size_t nwritten = 0;
+    /* Incrementally write data to the file, avoid a single write larger than
+     * the autosync threshold (so that the kernel's buffer cache never has too
+     * many dirty pages at once). */
+    while (len != nwritten) {
+        serverAssert(r->io.file.autosync > r->io.file.buffered);
+        size_t nalign = (size_t)(r->io.file.autosync - r->io.file.buffered);
+        size_t towrite = nalign > len-nwritten ? len-nwritten : nalign;
 
-    if (r->io.file.autosync &&
-        r->io.file.buffered >= r->io.file.autosync)
-    {
-        fflush(r->io.file.fp);
-        if (redis_fsync(fileno(r->io.file.fp)) == -1) return 0;
-        r->io.file.buffered = 0;
+        if (fwrite((const char*)buf+nwritten,towrite,1,r->io.file.fp) == 0)
+            return 0;
+        nwritten += towrite;
+        r->io.file.buffered += towrite;
+
+        if (r->io.file.buffered >= r->io.file.autosync) {
+            /* The sync calls below only cover data already handed to the
+             * kernel, so a failed flush must be reported as a write error. */
+            if (fflush(r->io.file.fp) == EOF) return 0;
+
+            size_t processed = r->processed_bytes + nwritten;
+            serverAssert(processed % r->io.file.autosync == 0);
+            serverAssert(r->io.file.buffered == r->io.file.autosync);
+
+#if HAVE_SYNC_FILE_RANGE
+            /* Start writeout asynchronously. */
+            if (sync_file_range(fileno(r->io.file.fp),
+                    processed - r->io.file.autosync, r->io.file.autosync,
+                    SYNC_FILE_RANGE_WRITE) == -1)
+                return 0;
+
+            if (processed >= (size_t)r->io.file.autosync * 2) {
+                /* To keep the promise to 'autosync', we should make sure last
+                 * asynchronous writeout persists into disk. This call may block
+                 * if last writeout is not finished since disk is slow. */
+                if (sync_file_range(fileno(r->io.file.fp),
+                        processed - r->io.file.autosync*2,
+                        r->io.file.autosync, SYNC_FILE_RANGE_WAIT_BEFORE|
+                        SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER) == -1)
+                    return 0;
+            }
+#else
+            if (redis_fsync(fileno(r->io.file.fp)) == -1) return 0;
+#endif
+            r->io.file.buffered = 0;
+        }
     }
-    return retval;
+    return 1;
 }
 
 /* Returns 1 or 0 for success/failure. */
diff --git a/src/server.h b/src/server.h
index 177525aa4..1d7a84615 100644
--- a/src/server.h
+++ b/src/server.h
@@ -151,7 +151,7 @@ typedef long long ustime_t; /* microsecond time type. */
 #define PROTO_MBULK_BIG_ARG (1024*32)
 #define PROTO_RESIZE_THRESHOLD (1024*32) /* Threshold for determining whether to resize query buffer */
 #define LONG_STR_SIZE 21 /* Bytes needed for long -> str + '\0' */
-#define REDIS_AUTOSYNC_BYTES (1024*1024*32) /* fdatasync every 32MB */
+#define REDIS_AUTOSYNC_BYTES (1024*1024*4) /* Sync file every 4MB. */
 
 #define LIMIT_PENDING_QUERYBUF (4*1024*1024) /* 4mb */
 