Add latency stats around cluster config file operations (#1534)

When the cluster changes, we need to persist the cluster configuration,
and these file IO operations may cause latency.

Signed-off-by: Binbin <binloveplay1314@qq.com>
This commit is contained in:
Binbin 2025-01-11 11:03:10 +08:00 committed by GitHub
parent 10357ceda5
commit 11cb8ee27c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 40 additions and 2 deletions

View File

@ -817,6 +817,7 @@ int clusterSaveConfig(int do_fsync) {
ssize_t written_bytes;
int fd = -1;
int retval = C_ERR;
mstime_t latency;
server.cluster->todo_before_sleep &= ~CLUSTER_TODO_SAVE_CONFIG;
@ -830,11 +831,15 @@ int clusterSaveConfig(int do_fsync) {
/* Create a temp file with the new content. */
tmpfilename = sdscatfmt(sdsempty(), "%s.tmp-%i-%I", server.cluster_configfile, (int)getpid(), mstime());
latencyStartMonitor(latency);
if ((fd = open(tmpfilename, O_WRONLY | O_CREAT, 0644)) == -1) {
serverLog(LL_WARNING, "Could not open temp cluster config file: %s", strerror(errno));
goto cleanup;
}
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("cluster-config-open", latency);
latencyStartMonitor(latency);
while (offset < content_size) {
written_bytes = write(fd, ci + offset, content_size - offset);
if (written_bytes <= 0) {
@ -845,31 +850,52 @@ int clusterSaveConfig(int do_fsync) {
}
offset += written_bytes;
}
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("cluster-config-write", latency);
if (do_fsync) {
latencyStartMonitor(latency);
server.cluster->todo_before_sleep &= ~CLUSTER_TODO_FSYNC_CONFIG;
if (valkey_fsync(fd) == -1) {
serverLog(LL_WARNING, "Could not sync tmp cluster config file: %s", strerror(errno));
goto cleanup;
}
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("cluster-config-fsync", latency);
}
latencyStartMonitor(latency);
if (rename(tmpfilename, server.cluster_configfile) == -1) {
serverLog(LL_WARNING, "Could not rename tmp cluster config file: %s", strerror(errno));
goto cleanup;
}
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("cluster-config-rename", latency);
if (do_fsync) {
latencyStartMonitor(latency);
if (fsyncFileDir(server.cluster_configfile) == -1) {
serverLog(LL_WARNING, "Could not sync cluster config file dir: %s", strerror(errno));
goto cleanup;
}
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("cluster-config-dir-fsync", latency);
}
retval = C_OK; /* If we reached this point, everything is fine. */
cleanup:
if (fd != -1) close(fd);
if (retval == C_ERR) unlink(tmpfilename);
if (fd != -1) {
latencyStartMonitor(latency);
close(fd);
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("cluster-config-close", latency);
}
if (retval == C_ERR) {
latencyStartMonitor(latency);
unlink(tmpfilename);
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("cluster-config-unlink", latency);
}
sdsfree(tmpfilename);
sdsfree(ci);
return retval;

View File

@ -189,3 +189,15 @@ tags {"needs:debug"} {
assert_match "*wrong number of arguments for 'latency|help' command" $e
}
}
start_cluster 1 1 {tags {"latency-monitor cluster external:skip needs:latency"} overrides {latency-monitor-threshold 1}} {
test "Cluster config file latency" {
# This test just a sanity test so that we can make sure the code path is cover.
# We don't assert anything since we can't be sure whether it will be counted.
R 0 cluster saveconfig
R 1 cluster saveconfig
R 1 cluster failover force
R 0 latency latest
R 1 latency latest
}
}