Set replicas to panic on disk errors, and optionally panic on replication errors (#10504)

* Till now, replicas that were unable to persist, would still execute the commands they got from the master, now they'll panic by default, and we add a new `replica-ignore-disk-errors` config to change that. * Till now, when a command failed on a replica or AOF-loading, it only logged a warning and a stat, we add a new `propagation-error-behavior` config to allow panicking in that state (may become the default one day) Note that commands that fail on the replica can either indicate a bug that could cause data inconsistency between the replica and the master, or they could be in some cases (specifically in previous versions), a result of a command (e.g. EVAL) that failed on the master, but still had to be propagated to fail on the replica as well.
2022-04-26 03:25:33 -07:00 · 2022-04-26 03:25:33 -07:00 · 6fa8e4f7af
commit 6fa8e4f7af
parent efcd1bf394
8 changed files with 136 additions and 7 deletions
--- a/redis.conf
+++ b/redis.conf
@ -728,6 +728,31 @@ repl-disable-tcp-nodelay no
 # By default the priority is 100.
 replica-priority 100

+# The propagation error behavior controls how Redis will behave when it is
+# unable to handle a command being processed in the replication stream from a master
+# or processed while reading from an AOF file. Errors that occur during propagation
+# are unexpected, and can cause data inconsistency. However, there are edge cases
+# in earlier versions of Redis where it was possible for the server to replicate or persist
+# commands that would fail on future versions. For this reason the default behavior
+# is to ignore such errors and continue processing commands.
+#
+# If an application wants to ensure there is no data divergence, this configuration
+# should be set to 'panic' instead. The value can also be set to 'panic-on-replicas'
+# to only panic when a replica encounters an error on the replication stream. One of
+# these two panic values will become the default value in the future once there are
+# sufficient safety mechanisms in place to prevent false positive crashes.
+#
+# propagation-error-behavior ignore
+
+# Replica ignore disk write errors controls the behavior of a replica when it is
+# unable to persist a write command received from its master to disk. By default,
+# this configuration is set to 'no' and will crash the replica in this condition.
+# It is not recommended to change this default, however in order to be compatible
+# with older versions of Redis this config can be toggled to 'yes' which will just
+# log a warning and execute the write command it got from the master.
+#
+# replica-ignore-disk-write-errors no
+
 # -----------------------------------------------------------------------------
 # By default, Redis Sentinel includes all replicas in its reports. A replica
 # can be excluded from Redis Sentinel's announcements. An unannounced replica
--- a/src/config.c
+++ b/src/config.c
@ -143,6 +143,13 @@ configEnum cluster_preferred_endpoint_type_enum[] = {
    {NULL, 0}
 };

+configEnum propagation_error_behavior_enum[] = {
+    {"ignore", PROPAGATION_ERR_BEHAVIOR_IGNORE},
+    {"panic", PROPAGATION_ERR_BEHAVIOR_PANIC},
+    {"panic-on-replicas", PROPAGATION_ERR_BEHAVIOR_PANIC_ON_REPLICAS},
+    {NULL, 0}
+};
+
 /* Output buffer limits presets. */
 clientBufferLimitsConfig clientBufferLimitsDefaults[CLIENT_TYPE_OBUF_COUNT] = {
    {0, 0, 0}, /* normal */
@ -2893,7 +2900,8 @@ standardConfig static_configs[] = {
    createBoolConfig("replica-announced", NULL, MODIFIABLE_CONFIG, server.replica_announced, 1, NULL, NULL),
    createBoolConfig("latency-tracking", NULL, MODIFIABLE_CONFIG, server.latency_tracking_enabled, 1, NULL, NULL),
    createBoolConfig("aof-disable-auto-gc", NULL, MODIFIABLE_CONFIG, server.aof_disable_auto_gc, 0, NULL, updateAofAutoGCEnabled),
-    
+    createBoolConfig("replica-ignore-disk-write-errors", NULL, MODIFIABLE_CONFIG, server.repl_ignore_disk_write_error, 0, NULL, NULL),
+
    /* String Configs */
    createStringConfig("aclfile", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, server.acl_filename, "", NULL, NULL),
    createStringConfig("unixsocket", NULL, IMMUTABLE_CONFIG, EMPTY_STRING_IS_NULL, server.unixsocket, NULL, NULL, NULL),
@ -2934,6 +2942,7 @@ standardConfig static_configs[] = {
    createEnumConfig("enable-debug-command", NULL, IMMUTABLE_CONFIG, protected_action_enum, server.enable_debug_cmd, PROTECTED_ACTION_ALLOWED_NO, NULL, NULL),
    createEnumConfig("enable-module-command", NULL, IMMUTABLE_CONFIG, protected_action_enum, server.enable_module_cmd, PROTECTED_ACTION_ALLOWED_NO, NULL, NULL),
    createEnumConfig("cluster-preferred-endpoint-type", NULL, MODIFIABLE_CONFIG, cluster_preferred_endpoint_type_enum, server.cluster_preferred_endpoint_type, CLUSTER_ENDPOINT_TYPE_IP, NULL, NULL),
+    createEnumConfig("propagation-error-behavior", NULL, MODIFIABLE_CONFIG, propagation_error_behavior_enum, server.propagation_error_behavior, PROPAGATION_ERR_BEHAVIOR_IGNORE, NULL, NULL),

    /* Integer configs */
    createIntConfig("databases", NULL, IMMUTABLE_CONFIG, 1, INT_MAX, server.dbnum, 16, INTEGER_CONFIG, NULL, NULL),
--- a/src/debug.c
+++ b/src/debug.c
@ -416,6 +416,8 @@ void debugCommand(client *c) {
 "    Like HTSTATS but for the hash table stored at <key>'s value.",
 "LOADAOF",
 "    Flush the AOF buffers on disk and reload the AOF in memory.",
+"REPLICATE <string>",
+"    Replicates the provided string to replicas, allowing data divergence.",
 #ifdef USE_JEMALLOC
 "MALLCTL <key> [<val>]",
 "    Get or set a malloc tuning integer.",
@ -849,6 +851,10 @@ NULL
    {
        server.aof_flush_sleep = atoi(c->argv[2]->ptr);
        addReply(c,shared.ok);
+    } else if (!strcasecmp(c->argv[1]->ptr,"replicate") && c->argc >= 3) {
+        replicationFeedSlaves(server.slaves, server.slaveseldb,
+                c->argv + 2, c->argc - 2);
+        addReply(c,shared.ok);
    } else if (!strcasecmp(c->argv[1]->ptr,"error") && c->argc == 3) {
        sds errstr = sdsnewlen("-",1);

--- a/src/networking.c
+++ b/src/networking.c
@ -539,6 +539,21 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) {
            showLatestBacklog();
        }
        server.stat_unexpected_error_replies++;
+
+        /* Based off the propagation error behavior, check if we need to panic here. There
+         * are currently two checked cases:
+         * * If this command was from our master and we are not a writable replica.
+         * * We are reading from an AOF file. */
+        int panic_in_replicas = (ctype == CLIENT_TYPE_MASTER && server.repl_slave_ro)
+            && (server.propagation_error_behavior == PROPAGATION_ERR_BEHAVIOR_PANIC ||
+            server.propagation_error_behavior == PROPAGATION_ERR_BEHAVIOR_PANIC_ON_REPLICAS);
+        int panic_in_aof = c->id == CLIENT_ID_AOF 
+            && server.propagation_error_behavior == PROPAGATION_ERR_BEHAVIOR_PANIC;
+        if (panic_in_replicas || panic_in_aof) {
+            serverPanic("This %s panicked sending an error to its %s"
+                " after processing the command '%s'",
+                from, to, cmdname ? cmdname : "<unknown>");
+        }
    }
 }

--- a/src/server.c
+++ b/src/server.c
@ -3755,15 +3755,29 @@ int processCommand(client *c) {
    if (server.tracking_clients) trackingLimitUsedSlots();

    /* Don't accept write commands if there are problems persisting on disk
-     * unless coming from our master. */
+     * unless coming from our master, in which case check the replica ignore
+     * disk write error config to either log or crash. */
    int deny_write_type = writeCommandsDeniedByDiskError();
    if (deny_write_type != DISK_ERROR_TYPE_NONE &&
-        !obey_client &&
-        (is_write_command ||c->cmd->proc == pingCommand))
+        (is_write_command || c->cmd->proc == pingCommand))
    {
-        sds err = writeCommandsGetDiskErrorMessage(deny_write_type);
-        rejectCommandSds(c, err);
-        return C_OK;
+        if (obey_client) {
+            if (!server.repl_ignore_disk_write_error && c->cmd->proc != pingCommand) {
+                serverPanic("Replica was unable to write command to disk.");
+            } else {
+                static mstime_t last_log_time_ms = 0;
+                const mstime_t log_interval_ms = 10000;
+                if (server.mstime > last_log_time_ms + log_interval_ms) {
+                    last_log_time_ms = server.mstime;
+                    serverLog(LL_WARNING, "Replica is applying a command even though "
+                                          "it is unable to write to disk.");
+                }
+            }
+        } else {
+            sds err = writeCommandsGetDiskErrorMessage(deny_write_type);
+            rejectCommandSds(c, err);
+            return C_OK;
+        }
    }

    /* Don't accept write commands if there are not enough good slaves and
--- a/src/server.h
+++ b/src/server.h
@ -1325,6 +1325,15 @@ struct redisMemOverhead {
    } *db;
 };

+/* Replication error behavior determines the replica behavior
+ * when it receives an error over the replication stream. In
+ * either case the error is logged. */
+enum {
+    PROPAGATION_ERR_BEHAVIOR_IGNORE = 0,
+    PROPAGATION_ERR_BEHAVIOR_PANIC,
+    PROPAGATION_ERR_BEHAVIOR_PANIC_ON_REPLICAS
+} replicationErrorBehavior;
+
 /* This structure can be optionally passed to RDB save/load functions in
 * order to implement additional functionalities, by storing and loading
 * metadata to the RDB file.
@ -1772,6 +1781,10 @@ struct redisServer {
    int replica_announced;          /* If true, replica is announced by Sentinel */
    int slave_announce_port;        /* Give the master this listening port. */
    char *slave_announce_ip;        /* Give the master this ip address. */
+    int propagation_error_behavior; /* Configures the behavior of the replica
+                                     * when it receives an error on the replication stream */
+    int repl_ignore_disk_write_error;   /* Configures whether replicas panic when unable to
+                                         * persist writes to AOF. */
    /* The following two fields is where we store master PSYNC replid/offset
     * while the PSYNC is in progress. At the end we'll copy the fields into
     * the server->master client structure. */
--- a/tests/assets/default.conf
+++ b/tests/assets/default.conf
@ -30,3 +30,5 @@ activerehashing yes
 enable-protected-configs yes
 enable-debug-command yes
 enable-module-command yes
+
+propagation-error-behavior panic
--- a/tests/integration/replication-4.tcl
+++ b/tests/integration/replication-4.tcl
@ -195,3 +195,48 @@ start_server {tags {"repl external:skip"}} {
        }
    }
 }
+
+start_server {tags {"repl external:skip"}} {
+    start_server {} {
+        set master [srv -1 client]
+        set master_host [srv -1 host]
+        set master_port [srv -1 port]
+        set replica [srv 0 client]
+
+        test {First server should have role slave after SLAVEOF} {
+            $replica slaveof $master_host $master_port
+            wait_for_condition 50 100 {
+                [s 0 role] eq {slave}
+            } else {
+                fail "Replication not started."
+            }
+            wait_for_sync $replica
+        }
+
+        test {Data divergence can happen under default conditions} {       
+            $replica config set propagation-error-behavior ignore     
+            $master debug replicate fake-command-1
+
+            # Wait for replication to normalize
+            $master set foo bar2
+            $master wait 1 2000
+
+            # Make sure we triggered the error, by finding the critical
+            # message and the fake command.
+            assert_equal [count_log_message 0 "fake-command-1"] 1
+            assert_equal [count_log_message 0 "== CRITICAL =="] 1
+        }
+
+        test {Data divergence is allowed on writable replicas} {            
+            $replica config set replica-read-only no
+            $replica set number2 foo
+            $master incrby number2 1
+            $master wait 1 2000
+
+            assert_equal [$master get number2] 1
+            assert_equal [$replica get number2] foo
+
+            assert_equal [count_log_message 0 "incrby"] 1
+        }
+    }
+}