From cc7bc8f4ef5318f36a28b3c4ca0927a8c5792f0a Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 7 Nov 2024 13:42:20 +0800 Subject: [PATCH] Try to stabilize the failover call in the slot migration test (#1078) CI reported that the replica returns the following error when performing CLUSTER FAILOVER: ``` -ERR Master is down or failed, please use CLUSTER FAILOVER FORCE ``` This may be because the primary is in the fail state or the cluster connection is disconnected during the primary pause. In this PR, we add some waits to wait_for_role: if the role is replica, we wait for the replication link and the cluster link to be ok. Signed-off-by: Binbin --- tests/support/cluster_util.tcl | 8 +++++ tests/unit/cluster/slot-migration.tcl | 44 +++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index dd5cd84df..4b399214b 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -277,6 +277,14 @@ proc cluster_get_myself id { return {} } +# Returns the parsed "myself's primary" CLUSTER NODES entry as a dictionary. 
+proc cluster_get_myself_primary id { + set myself [cluster_get_myself $id] + set replicaof [dict get $myself slaveof] + set node [cluster_get_node_by_id $id $replicaof] + return $node +} + # Get a specific node by ID by parsing the CLUSTER NODES output # of the instance Number 'instance_id' proc cluster_get_node_by_id {instance_id node_id} { diff --git a/tests/unit/cluster/slot-migration.tcl b/tests/unit/cluster/slot-migration.tcl index d79897196..289c20578 100644 --- a/tests/unit/cluster/slot-migration.tcl +++ b/tests/unit/cluster/slot-migration.tcl @@ -14,17 +14,61 @@ proc get_cluster_role {srv_idx} { return $role } +proc get_myself_primary_flags {srv_idx} { + set flags [dict get [cluster_get_myself_primary $srv_idx] flags] + return $flags +} + +proc get_myself_primary_linkstate {srv_idx} { + set linkstate [dict get [cluster_get_myself_primary $srv_idx] linkstate] + return $linkstate +} + proc wait_for_role {srv_idx role} { + # Wait for the role, make sure the replication role matches. wait_for_condition 100 100 { [lindex [split [R $srv_idx ROLE] " "] 0] eq $role } else { + puts "R $srv_idx ROLE: [R $srv_idx ROLE]" fail "R $srv_idx didn't assume the replication $role in time" } + + if {$role eq "slave"} { + # Wait for the replication link, make sure the replication link is normal. + wait_for_condition 100 100 { + [s -$srv_idx master_link_status] eq "up" + } else { + puts "R $srv_idx INFO REPLICATION: [R $srv_idx INFO REPLICATION]" + fail "R $srv_idx didn't assume the replication link in time" + } + } + + # Wait for the cluster role, make sure the cluster role matches. wait_for_condition 100 100 { [get_cluster_role $srv_idx] eq $role } else { + puts "R $srv_idx CLUSTER NODES: [R $srv_idx CLUSTER NODES]" fail "R $srv_idx didn't assume the cluster $role in time" } + + if {$role eq "slave"} { + # Wait for the flags, make sure the primary node is not failed. 
+ wait_for_condition 100 100 { + [get_myself_primary_flags $srv_idx] eq "master" + } else { + puts "R $srv_idx CLUSTER NODES: [R $srv_idx CLUSTER NODES]" + fail "R $srv_idx didn't assume the primary state in time" + } + + # Wait for the cluster link, make sure that the cluster connection is normal. + wait_for_condition 100 100 { + [get_myself_primary_linkstate $srv_idx] eq "connected" + } else { + puts "R $srv_idx CLUSTER NODES: [R $srv_idx CLUSTER NODES]" + fail "R $srv_idx didn't assume the cluster primary link in time" + } + } + wait_for_cluster_propagation }