Try to stabilize the failover call in the slot migration test (#1078)

CI reports that the replica returns the following error when performing
CLUSTER FAILOVER:
```
-ERR Master is down or failed, please use CLUSTER FAILOVER FORCE
```

This may be because the primary is in the fail state or the cluster
connection is disconnected during the primary pause. In this PR, we add
some waits to wait_for_role: if the role is replica, we also wait for the
replication link and the cluster link to be ok.

Signed-off-by: Binbin <binloveplay1314@qq.com>
This commit is contained in:
Binbin 2024-11-07 13:42:20 +08:00 committed by GitHub
parent a0b1cbad83
commit 22bc49c4a6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 52 additions and 0 deletions

View File

@ -277,6 +277,14 @@ proc cluster_get_myself id {
return {}
}
# Return, as a dictionary, the CLUSTER NODES entry of the node that instance
# 'id' lists as its primary. The lookup is done against the CLUSTER NODES
# view of instance 'id' itself.
proc cluster_get_myself_primary id {
    # The "slaveof" field of our own entry is used as the node ID to look up.
    set primary_id [dict get [cluster_get_myself $id] slaveof]
    return [cluster_get_node_by_id $id $primary_id]
}
# Get a specific node by ID by parsing the CLUSTER NODES output
# of the instance Number 'instance_id'
proc cluster_get_node_by_id {instance_id node_id} {

View File

@ -14,17 +14,61 @@ proc get_cluster_role {srv_idx} {
return $role
}
# Return the "flags" field of the CLUSTER NODES entry that server srv_idx
# holds for its own primary.
proc get_myself_primary_flags {srv_idx} {
    set primary_entry [cluster_get_myself_primary $srv_idx]
    return [dict get $primary_entry flags]
}
# Return the "linkstate" field of the CLUSTER NODES entry that server srv_idx
# holds for its own primary.
proc get_myself_primary_linkstate {srv_idx} {
    set primary_entry [cluster_get_myself_primary $srv_idx]
    return [dict get $primary_entry linkstate]
}
# Block until server srv_idx has settled into the given role, failing the
# test if any individual check does not succeed in time.
#
# Arguments:
#   srv_idx - index of the server, as accepted by R / s / get_cluster_role.
#   role    - expected role name; the value "slave" enables the extra
#             replica-only checks on the replication and cluster links.
#
# Every check is polled through "wait_for_condition 100 100". The checks run
# strictly in the order written below, and the proc finishes by calling
# wait_for_cluster_propagation.
proc wait_for_role {srv_idx role} {
    # Replica-only checks are keyed on the literal role name "slave".
    set is_replica [expr {$role eq "slave"}]

    # 1. Replication layer: the first word of the ROLE reply must be $role.
    wait_for_condition 100 100 {
        [lindex [split [R $srv_idx ROLE] " "] 0] eq $role
    } else {
        puts "R $srv_idx ROLE: [R $srv_idx ROLE]"
        fail "R $srv_idx didn't assume the replication $role in time"
    }

    # 2. Replicas only: the replication link to the primary must be up.
    if {$is_replica} {
        wait_for_condition 100 100 {
            [s -$srv_idx master_link_status] eq "up"
        } else {
            puts "R $srv_idx INFO REPLICATION: [R $srv_idx INFO REPLICATION]"
            fail "R $srv_idx didn't assume the replication link in time"
        }
    }

    # 3. Cluster layer: the role reported by get_cluster_role must be $role.
    wait_for_condition 100 100 {
        [get_cluster_role $srv_idx] eq $role
    } else {
        puts "R $srv_idx CLUSTER NODES: [R $srv_idx CLUSTER NODES]"
        fail "R $srv_idx didn't assume the cluster $role in time"
    }

    if {$is_replica} {
        # 4. Replicas only: the primary's flags must be exactly "master",
        #    i.e. this node does not see its primary as failed.
        wait_for_condition 100 100 {
            [get_myself_primary_flags $srv_idx] eq "master"
        } else {
            puts "R $srv_idx CLUSTER NODES: [R $srv_idx CLUSTER NODES]"
            fail "R $srv_idx didn't assume the primary state in time"
        }

        # 5. Replicas only: the cluster bus link to the primary must be
        #    reported as connected.
        wait_for_condition 100 100 {
            [get_myself_primary_linkstate $srv_idx] eq "connected"
        } else {
            puts "R $srv_idx CLUSTER NODES: [R $srv_idx CLUSTER NODES]"
            fail "R $srv_idx didn't assume the cluster primary link in time"
        }
    }

    wait_for_cluster_propagation
}