# futriix/tests/integration/shutdown.tcl

# This test suite tests shutdown when there are lagging replicas connected.

# Fill up the OS socket send buffer for the replica connection 1M at a time.
#
# Each round calls `populate 1024 junk$i: 1024 $idx` (about 1M of data) and
# then samples mem_total_replication_buffers on the server at srv index $idx.
# When the replication buffer memory increases beyond 2M (often after writing
# 4M or so), we assume it's because the OS socket send buffer can't swallow
# any more.
#
# Arguments:
#   idx        - srv index of the server to write to (e.g. -1 for the master).
#   max_rounds - optional upper bound on the number of 1M rounds (default
#                1000), so a buffer that never fills up cannot hang the suite.
#
# Fails the running test if the replication buffer has not grown beyond 2M
# after max_rounds rounds.
proc fill_up_os_socket_send_buffer_for_repl {idx {max_rounds 1000}} {
    set limit [expr {2*1024*1024}]
    for {set i 1} {$i <= $max_rounds} {incr i} {
        populate 1024 junk$i: 1024 $idx
        # Short pause before sampling the replication buffer size.
        after 10
        if {[s $idx mem_total_replication_buffers] > $limit} {
            return
        }
    }
    fail "Replication buffers did not grow beyond $limit bytes after $max_rounds rounds of writes."
}

# Run the catch-up scenario twice: once triggering the shutdown with SIGTERM
# and once with the SHUTDOWN command sent from a deferring client.
foreach how {sigterm shutdown} {
    test "Shutting down master waits for replica to catch up ($how)" {
        start_server {overrides {save "" repl-backlog-size 1MB}} {
            start_server {overrides {save "" repl-backlog-size 1MB}} {
                # The outer server (-1) is used as master, the inner one (0)
                # as replica.
                set master [srv -1 client]
                set master_host [srv -1 host]
                set master_port [srv -1 port]
                set master_pid [srv -1 pid]
                set replica [srv 0 client]
                set replica_pid [srv 0 pid]

                # Config master.
                $master config set shutdown-timeout 300; # 5min for slow CI
                $master config set repl-backlog-size 1;  # small as possible
                $master config set hz 100;               # cron runs every 10ms

                # Config replica.
                $replica replicaof $master_host $master_port
                wait_for_sync $replica

                # Preparation: Set k to 1 on both master and replica.
                $master set k 1
                wait_for_ofs_sync $master $replica

                # Pause the replica.
                pause_process $replica_pid

                # Fill up the OS socket send buffer for the replica connection
                # to prevent the following INCR from reaching the replica via
                # the OS.
                fill_up_os_socket_send_buffer_for_repl -1

                # Incr k and immediately shutdown master.
                $master incr k
                switch $how {
                    sigterm {
                        exec kill -SIGTERM $master_pid
                    }
                    shutdown {
                        # Use a deferring client so the test does not block
                        # waiting for a reply to SHUTDOWN.
                        set rd [valkey_deferring_client -1]
                        $rd shutdown
                    }
                }
                wait_for_condition 50 100 {
                    [s -1 shutdown_in_milliseconds] > 0
                } else {
                    fail "Master not indicating ongoing shutdown."
                }

                # Wake up replica and check if master has waited for it.
                after 20; # 2 cron intervals
                resume_process $replica_pid
                wait_for_condition 300 1000 {
                    [$replica get k] eq 2
                } else {
                    fail "Master exited before replica could catch up."
                }

                # Check shutdown log messages on master
                wait_for_log_messages -1 {"*ready to exit, bye bye*"} 0 100 500
                assert_equal 0 [count_log_message -1 "*Lagging replica*"]
                verify_log_message -1 "*1 of 1 replicas are in sync*" 0

                # Release the deferring client that issued SHUTDOWN so its
                # connection is not leaked. The master has logged its exit
                # message by now, so tolerate errors while closing the link.
                if {$how eq "shutdown"} {
                    catch {$rd close}
                }
            }
        }
    } {} {repl external:skip}
}

# A paused replica never catches up, so the master is expected to give up once
# shutdown-timeout expires and exit with the replica still lagging.
test {Shutting down master waits for replica timeout} {
    start_server {overrides {save "" repl-backlog-size 1MB}} {
        start_server {overrides {save "" repl-backlog-size 1MB}} {
            # srv -1 is the master side, srv 0 is the replica side.
            set primary      [srv -1 client]
            set primary_host [srv -1 host]
            set primary_port [srv -1 port]
            set primary_pid  [srv -1 pid]
            set follower     [srv 0 client]
            set follower_pid [srv 0 pid]

            # Master waits at most one second for lagging replicas.
            $primary config set shutdown-timeout 1; # second

            # Attach the replica and wait for the initial sync.
            $follower replicaof $primary_host $primary_port
            wait_for_sync $follower

            # Baseline: k is 1 on both sides before the replica is stopped.
            $primary set k 1
            wait_for_ofs_sync $primary $follower

            # Freeze the replica process.
            pause_process $follower_pid

            # Saturate the OS socket send buffer of the replica link so that
            # the INCR below cannot reach the replica through the OS.
            fill_up_os_socket_send_buffer_for_repl -1

            # Bump k, then ask the master to terminate right away.
            $primary incr k
            exec kill -SIGTERM $primary_pid
            wait_for_condition 50 100 {
                [s -1 shutdown_in_milliseconds] > 0
            } else {
                fail "Master not indicating ongoing shutdown."
            }

            # Wait for the master to finish shutting down, then inspect its log.
            wait_for_log_messages -1 {"*ready to exit, bye bye*"} 0 100 100
            verify_log_message -1 "*Lagging replica*" 0
            verify_log_message -1 "*0 of 1 replicas are in sync*" 0

            # Thaw the replica; it must still hold the old value of k.
            resume_process $follower_pid
            set k_on_replica [$follower get k]
            assert_equal 1 $k_on_replica
        }
    }
} {} {repl external:skip}

# Two clients block on SHUTDOWN while the replica is paused. Once the replica
# is resumed the shutdown proceeds, but a slow initial AOF rewrite makes it
# fail, and both blocked clients must receive the error.
test "Shutting down master waits for replica then fails" {
    start_server {overrides {save "" repl-backlog-size 1MB}} {
        start_server {overrides {save "" repl-backlog-size 1MB}} {
            # srv -1 is the master side, srv 0 is the replica side.
            set primary      [srv -1 client]
            set primary_host [srv -1 host]
            set primary_port [srv -1 port]
            set primary_pid  [srv -1 pid]
            set follower     [srv 0 client]
            set follower_pid [srv 0 pid]

            # Attach the replica and wait for the initial sync.
            $follower replicaof $primary_host $primary_port
            wait_for_sync $follower

            # Freeze the replica, then write a key on the master.
            pause_process $follower_pid
            $primary incr k

            # Issue blocking SHUTDOWN from two deferring clients in parallel.
            set blocked1 [valkey_deferring_client -1]
            set blocked2 [valkey_deferring_client -1]
            $blocked1 shutdown
            $blocked2 shutdown
            wait_for_condition 50 100 {
                [llength [lsearch -all -glob [split [string trim [$primary client list]] "\r\n"] *cmd=shutdown*]] == 2
            } else {
                fail "SHUTDOWN not called on all clients"
            }

            # Three connections in total, two of them blocked.
            set client_stats [$primary info clients]
            assert_match "*connected_clients:3*" $client_stats
            assert_match "*blocked_clients:2*" $client_stats

            # Kick off a very slow initial AOFRW, which will prevent shutdown.
            $primary config set rdb-key-save-delay 30000000; # 30 seconds
            $primary config set appendonly yes

            # Thaw the replica so the master continues shutting down.
            resume_process $follower_pid

            # Both clients blocked on SHUTDOWN get an error reply.
            catch { $blocked1 read } err1
            catch { $blocked2 read } err2
            foreach shutdown_err [list $err1 $err2] {
                assert_match "*Errors trying to SHUTDOWN. Check logs*" $shutdown_err
            }
            foreach blocked [list $blocked1 $blocked2] {
                $blocked close
            }

            # The master log must show the sync, the AOF obstacle and the failure.
            foreach expected_log {
                "*1 of 1 replicas are in sync*"
                "*Writing initial AOF, can't exit*"
                "*Errors trying to shut down*"
            } {
                verify_log_message -1 $expected_log 0
            }

            # Make the master exit fast, without waiting for the very slow AOFRW.
            catch {$primary shutdown nosave force}
        }
    }
} {} {repl external:skip}

# Two clients block on SHUTDOWN while the replica is paused, then a third
# client aborts the shutdown. Both blocked clients must receive an error and
# the master must log the manual abort.
test "Shutting down master waits for replica then aborted" {
    start_server {overrides {save "" repl-backlog-size 1MB}} {
        start_server {overrides {save "" repl-backlog-size 1MB}} {
            # srv -1 is the master side, srv 0 is the replica side.
            set primary      [srv -1 client]
            set primary_host [srv -1 host]
            set primary_port [srv -1 port]
            set primary_pid  [srv -1 pid]
            set follower     [srv 0 client]
            set follower_pid [srv 0 pid]

            # Attach the replica and wait for the initial sync.
            $follower replicaof $primary_host $primary_port
            wait_for_sync $follower

            # Freeze the replica, then write a key on the master.
            pause_process $follower_pid
            $primary incr k

            # Issue blocking SHUTDOWN from two deferring clients in parallel.
            set blocked1 [valkey_deferring_client -1]
            set blocked2 [valkey_deferring_client -1]
            $blocked1 shutdown
            $blocked2 shutdown
            wait_for_condition 50 100 {
                [llength [lsearch -all -glob [split [string trim [$primary client list]] "\r\n"] *cmd=shutdown*]] == 2
            } else {
                fail "SHUTDOWN not called on all clients"
            }

            # Three connections in total, two of them blocked.
            set client_stats [$primary info clients]
            assert_match "*connected_clients:3*" $client_stats
            assert_match "*blocked_clients:2*" $client_stats

            # Cancel the pending shutdown from the regular client.
            $primary shutdown abort

            # Thaw the replica again now that the shutdown has been aborted.
            resume_process $follower_pid

            # Both clients blocked on SHUTDOWN get an error reply.
            catch { $blocked1 read } err1
            catch { $blocked2 read } err2
            foreach shutdown_err [list $err1 $err2] {
                assert_match "*Errors trying to SHUTDOWN. Check logs*" $shutdown_err
            }
            foreach blocked [list $blocked1 $blocked2] {
                $blocked close
            }

            # The master log must record the manual abort.
            verify_log_message -1 "*Shutdown manually aborted*" 0
        }
    }
} {} {repl external:skip}