2024-06-27 19:00:45 +08:00
|
|
|
start_server {tags {needs:repl external:skip}} {
|
|
|
|
start_server {} {
|
2024-08-18 15:20:53 +02:00
|
|
|
# Client handle for the outer server (srv -1), which the tests below use as
# the primary.
set primary [srv -1 client]
|
2024-06-27 19:00:45 +08:00
|
|
|
# Network address of the outer server (srv -1); used both as the REPLICAOF
# target and to build the expected "REDIRECT host:port" replies.
set primary_host [srv -1 host]
set primary_port [srv -1 port]
|
To avoid bouncing -REDIRECT during FAILOVER (#871)
Fix #821
During the `FAILOVER` process, when conditions are met (such as when the
force time is reached or the primary and replica offsets are
consistent), the primary actively becomes the replica and transitions to
the `FAILOVER_IN_PROGRESS` state. After the primary becomes the replica,
and after handshaking and other operations, it will eventually send the
`PSYNC FAILOVER` command to the replica, after which the replica will
become the primary. This means that the upgrade of the replica to the
primary is an asynchronous operation, which implies that during the
`FAILOVER_IN_PROGRESS` state, there may be a period of time where both
nodes are replicas. In this scenario, if a `-REDIRECT` is returned, the
request will be redirected to the replica and then redirected back,
causing back and forth redirection. To avoid this situation, during the
`FAILOVER_IN_PROGRESS` state, we temporarily suspend the clients that
need to be redirected until the replica truly becomes the primary, and
then resume the execution.
---------
Signed-off-by: zhaozhao.zz <zhaozhao.zz@alibaba-inc.com>
2024-08-14 14:04:29 +08:00
|
|
|
# Process id of the outer server (srv -1).
set primary_pid [srv -1 pid]

# Address and process id of the inner server (srv 0), which is turned into a
# replica by the first test; the pid is used to pause/resume the process in
# the failover test.
set replica_host [srv 0 host]
set replica_port [srv 0 port]
set replica_pid [srv 0 pid]
|
2024-06-27 19:00:45 +08:00
|
|
|
|
2024-08-30 10:17:53 +08:00
|
|
|
test {write command inside MULTI is QUEUED, EXEC should be REDIRECT} {
    # Open a dedicated connection that opts in to redirection and start a
    # transaction on it while this instance is not yet a replica.
    set txn_client [valkey_client]
    $txn_client client capa redirect
    $txn_client multi
    assert_equal "QUEUED" [$txn_client set foo bar]

    # Change the current instance to be a replica while the transaction is
    # still open, and wait until the primary reports it online.
    r replicaof $primary_host $primary_port
    wait_replica_online $primary

    # The already-queued write must now be answered with a redirect at EXEC.
    assert_error "REDIRECT*" {$txn_client exec}
    $txn_client close
}
|
|
|
|
|
|
|
|
test {write command inside MULTI is REDIRECT, EXEC should be EXECABORT} {
    # Fresh redirect-capable connection; the instance is already a replica
    # at this point (it was attached to the primary by the previous test).
    set txn_client [valkey_client]
    $txn_client client capa redirect
    $txn_client multi

    # The write is rejected with a redirect while queueing ...
    assert_error "REDIRECT*" {$txn_client set foo bar}
    # ... and EXEC then reports the transaction as aborted.
    assert_error "EXECABORT*" {$txn_client exec}
    $txn_client close
}
|
2024-06-27 19:00:45 +08:00
|
|
|
|
|
|
|
test {replica allow read command by default} {
    # Without CLIENT CAPA REDIRECT a plain read is served locally; the test
    # expects an empty reply for key "foo".
    r get foo
} {}
|
|
|
|
|
|
|
|
test {replica reply READONLY error for write command by default} {
    # Without CLIENT CAPA REDIRECT a write is refused with a READONLY error
    # instead of a redirect.
    assert_error {READONLY*} {r set foo bar}
}
|
|
|
|
|
|
|
|
test {replica redirect read and write command after CLIENT CAPA REDIRECT} {
    # Opt the shared connection in to redirection.
    r client capa redirect

    # Both writes and reads must now be redirected to the primary's address.
    set expected_reply "REDIRECT $primary_host:$primary_port"
    assert_error $expected_reply {r set foo bar}
    assert_error $expected_reply {r get foo}
}
|
|
|
|
|
|
|
|
test {non-data access commands are not redirected} {
    # PING touches no keys, so it is answered locally even though the
    # connection has enabled the redirect capability.
    r ping
} {PONG}
|
|
|
|
|
|
|
|
test {replica allow read command in READONLY mode} {
    # After READONLY the connection may read from the replica again, even
    # though it previously enabled the redirect capability.
    r readonly

    # The read is served locally; the test expects an empty reply.
    r get foo
} {}
|
To avoid bouncing -REDIRECT during FAILOVER (#871)
Fix #821
During the `FAILOVER` process, when conditions are met (such as when the
force time is reached or the primary and replica offsets are
consistent), the primary actively becomes the replica and transitions to
the `FAILOVER_IN_PROGRESS` state. After the primary becomes the replica,
and after handshaking and other operations, it will eventually send the
`PSYNC FAILOVER` command to the replica, after which the replica will
become the primary. This means that the upgrade of the replica to the
primary is an asynchronous operation, which implies that during the
`FAILOVER_IN_PROGRESS` state, there may be a period of time where both
nodes are replicas. In this scenario, if a `-REDIRECT` is returned, the
request will be redirected to the replica and then redirected back,
causing back and forth redirection. To avoid this situation, during the
`FAILOVER_IN_PROGRESS` state, we temporarily suspend the clients that
need to be redirected until the replica truly becomes the primary, and
then resume the execution.
---------
Signed-off-by: zhaozhao.zz <zhaozhao.zz@alibaba-inc.com>
2024-08-14 14:04:29 +08:00
|
|
|
|
|
|
|
test {client paused during failover-in-progress} {
    # Freeze the replica process so it cannot take part in the failover.
    pause_process $replica_pid

    # replica will never acknowledge this write
    r -1 set foo bar
    r -1 failover to $replica_host $replica_port TIMEOUT 100 FORCE

    # Wait for primary to give up on sync attempt and start failover
    wait_for_condition 50 100 {
        [s -1 master_failover_state] == "failover-in-progress"
    } else {
        fail "Failover from primary to replica did not timeout"
    }

    # Issue a write on the old primary through a deferring, redirect-capable
    # connection; its reply is only collected at the end of the test.
    set paused_client [valkey_deferring_client -1]
    $paused_client client capa redirect
    assert_match "OK" [$paused_client read]
    $paused_client set foo bar

    # Client paused during failover-in-progress, see more details in PR #871
    wait_for_blocked_clients_count 1 100 10 -1

    # Let the replica run again so the failover can complete.
    resume_process $replica_pid

    # Wait for failover to end
    wait_for_condition 50 100 {
        [s -1 master_failover_state] == "no-failover"
    } else {
        fail "Failover from primary to replica did not finish"
    }

    # The two servers have swapped roles.
    assert_match *master* [r role]
    assert_match *slave* [r -1 role]

    # The suspended write is finally answered with a redirect that points at
    # the former replica.
    set expected_reply "REDIRECT $replica_host:$replica_port"
    assert_error $expected_reply {$paused_client read}
    $paused_client close
}
|
2024-06-27 19:00:45 +08:00
|
|
|
}
|
|
|
|
}
|