AOF: recover from last write error after turn on appendonly again (#8030)
The key point is how to recover from last AOF write error, for example: 1. start redis with appendonly yes, and append some write commands 2. short write or something else error happen, `server.aof_last_write_status` changed to `C_ERR`, now redis doesn't accept write commands 3. execute `CONFIG SET appendonly no` to avoid the above problem, now redis can accept write commands again 4. disk error resolved, and execute `CONFIG SET appendonly yes` to reopen AOF, but `server.aof_last_write_status` cannot be changed to `C_OK` (if background aof rewrite run less then 1 second, it will free `server.aof_buf` and then serverCron cannot fix `aof_last_write_status`), then redis cannot accept write commands forever. This PR use a simple way to fix it: 1. just free `server.aof_buf` when stop appendonly to save memory, if error happens in `flushAppendOnlyFile(1)`, the `server.aof_buf` may contains some data which has not be written to aof, I think we can ignore it because we turn off the appendonly. 2. reset fsync status after stop appendonly and call `flushAppendOnlyFile` only when `aof_state` is ON 3. reset `server.last_write_status` when reopen aof to accept write commands
This commit is contained in:
parent
0d18a1e85f
commit
49b3663332
19
src/aof.c
19
src/aof.c
@ -235,6 +235,8 @@ void stopAppendOnly(void) {
|
|||||||
serverAssert(server.aof_state != AOF_OFF);
|
serverAssert(server.aof_state != AOF_OFF);
|
||||||
flushAppendOnlyFile(1);
|
flushAppendOnlyFile(1);
|
||||||
redis_fsync(server.aof_fd);
|
redis_fsync(server.aof_fd);
|
||||||
|
server.aof_fsync_offset = server.aof_current_size;
|
||||||
|
server.aof_last_fsync = server.unixtime;
|
||||||
close(server.aof_fd);
|
close(server.aof_fd);
|
||||||
|
|
||||||
server.aof_fd = -1;
|
server.aof_fd = -1;
|
||||||
@ -242,6 +244,8 @@ void stopAppendOnly(void) {
|
|||||||
server.aof_state = AOF_OFF;
|
server.aof_state = AOF_OFF;
|
||||||
server.aof_rewrite_scheduled = 0;
|
server.aof_rewrite_scheduled = 0;
|
||||||
killAppendOnlyChild();
|
killAppendOnlyChild();
|
||||||
|
sdsfree(server.aof_buf);
|
||||||
|
server.aof_buf = sdsempty();
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Called when the user switches from "appendonly no" to "appendonly yes"
|
/* Called when the user switches from "appendonly no" to "appendonly yes"
|
||||||
@ -285,6 +289,12 @@ int startAppendOnly(void) {
|
|||||||
server.aof_state = AOF_WAIT_REWRITE;
|
server.aof_state = AOF_WAIT_REWRITE;
|
||||||
server.aof_last_fsync = server.unixtime;
|
server.aof_last_fsync = server.unixtime;
|
||||||
server.aof_fd = newfd;
|
server.aof_fd = newfd;
|
||||||
|
|
||||||
|
/* If AOF was in error state, we just ignore it and log the event. */
|
||||||
|
if (server.aof_last_write_status == C_ERR) {
|
||||||
|
serverLog(LL_WARNING,"AOF reopen, just ignore the last error.");
|
||||||
|
server.aof_last_write_status = C_OK;
|
||||||
|
}
|
||||||
return C_OK;
|
return C_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1855,18 +1865,22 @@ void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
|
|||||||
close(newfd);
|
close(newfd);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
latencyEndMonitor(latency);
|
||||||
|
latencyAddSampleIfNeeded("aof-rewrite-diff-write",latency);
|
||||||
|
|
||||||
if (server.aof_fsync == AOF_FSYNC_EVERYSEC) {
|
if (server.aof_fsync == AOF_FSYNC_EVERYSEC) {
|
||||||
aof_background_fsync(newfd);
|
aof_background_fsync(newfd);
|
||||||
} else if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
|
} else if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
|
||||||
|
latencyStartMonitor(latency);
|
||||||
if (redis_fsync(newfd) == -1) {
|
if (redis_fsync(newfd) == -1) {
|
||||||
serverLog(LL_WARNING,
|
serverLog(LL_WARNING,
|
||||||
"Error trying to fsync the parent diff to the rewritten AOF: %s", strerror(errno));
|
"Error trying to fsync the parent diff to the rewritten AOF: %s", strerror(errno));
|
||||||
close(newfd);
|
close(newfd);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
latencyEndMonitor(latency);
|
||||||
|
latencyAddSampleIfNeeded("aof-rewrite-done-fsync",latency);
|
||||||
}
|
}
|
||||||
latencyEndMonitor(latency);
|
|
||||||
latencyAddSampleIfNeeded("aof-rewrite-diff-write",latency);
|
|
||||||
|
|
||||||
serverLog(LL_NOTICE,
|
serverLog(LL_NOTICE,
|
||||||
"Residual parent diff successfully flushed to the rewritten AOF (%.2f MB)", (double) aofRewriteBufferSize() / (1024*1024));
|
"Residual parent diff successfully flushed to the rewritten AOF (%.2f MB)", (double) aofRewriteBufferSize() / (1024*1024));
|
||||||
@ -1938,6 +1952,7 @@ void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
|
|||||||
aofUpdateCurrentSize();
|
aofUpdateCurrentSize();
|
||||||
server.aof_rewrite_base_size = server.aof_current_size;
|
server.aof_rewrite_base_size = server.aof_current_size;
|
||||||
server.aof_fsync_offset = server.aof_current_size;
|
server.aof_fsync_offset = server.aof_current_size;
|
||||||
|
server.aof_last_fsync = server.unixtime;
|
||||||
|
|
||||||
/* Clear regular AOF buffer since its contents was just written to
|
/* Clear regular AOF buffer since its contents was just written to
|
||||||
* the new AOF from the background rewrite buffer. */
|
* the new AOF from the background rewrite buffer. */
|
||||||
|
@ -2174,14 +2174,15 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
|
|||||||
|
|
||||||
/* AOF postponed flush: Try at every cron cycle if the slow fsync
|
/* AOF postponed flush: Try at every cron cycle if the slow fsync
|
||||||
* completed. */
|
* completed. */
|
||||||
if (server.aof_flush_postponed_start) flushAppendOnlyFile(0);
|
if (server.aof_state == AOF_ON && server.aof_flush_postponed_start)
|
||||||
|
flushAppendOnlyFile(0);
|
||||||
|
|
||||||
/* AOF write errors: in this case we have a buffer to flush as well and
|
/* AOF write errors: in this case we have a buffer to flush as well and
|
||||||
* clear the AOF error in case of success to make the DB writable again,
|
* clear the AOF error in case of success to make the DB writable again,
|
||||||
* however to try every second is enough in case of 'hz' is set to
|
* however to try every second is enough in case of 'hz' is set to
|
||||||
* a higher frequency. */
|
* a higher frequency. */
|
||||||
run_with_period(1000) {
|
run_with_period(1000) {
|
||||||
if (server.aof_last_write_status == C_ERR)
|
if (server.aof_state == AOF_ON && server.aof_last_write_status == C_ERR)
|
||||||
flushAppendOnlyFile(0);
|
flushAppendOnlyFile(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2418,7 +2419,8 @@ void beforeSleep(struct aeEventLoop *eventLoop) {
|
|||||||
trackingBroadcastInvalidationMessages();
|
trackingBroadcastInvalidationMessages();
|
||||||
|
|
||||||
/* Write the AOF buffer on disk */
|
/* Write the AOF buffer on disk */
|
||||||
flushAppendOnlyFile(0);
|
if (server.aof_state == AOF_ON)
|
||||||
|
flushAppendOnlyFile(0);
|
||||||
|
|
||||||
/* Handle writes with pending output buffers. */
|
/* Handle writes with pending output buffers. */
|
||||||
handleClientsWithPendingWritesUsingThreads();
|
handleClientsWithPendingWritesUsingThreads();
|
||||||
|
Loading…
x
Reference in New Issue
Block a user