Try to stabilize the failover call in the slot migration test (#1078)

CI reported that a replica can return the following error when performing CLUSTER FAILOVER: ``` -ERR Master is down or failed, please use CLUSTER FAILOVER FORCE ``` This may be because the primary is in the fail state or the cluster connection is disconnected during the primary pause. In this PR, we add some waits to wait_for_role: if the role is replica, we also wait for the replication link and the cluster link to be ok. Signed-off-by: Binbin <binloveplay1314@qq.com>
2024-11-07 13:42:20 +08:00 · 2024-11-07 13:42:20 +08:00 · 22bc49c4a6
commit 22bc49c4a6
parent a0b1cbad83
2 changed files with 52 additions and 0 deletions
--- a/tests/support/cluster_util.tcl
+++ b/tests/support/cluster_util.tcl
@ -277,6 +277,14 @@ proc cluster_get_myself id {
    return {}
 }

+# Returns the CLUSTER NODES entry, parsed as a dictionary, of the node that
+# instance 'id' names in the "slaveof" field of its own (myself) entry.
+# The lookup is done from the point of view of instance 'id' itself.
+proc cluster_get_myself_primary id {
+    set primary_id [dict get [cluster_get_myself $id] slaveof]
+    return [cluster_get_node_by_id $id $primary_id]
+}
+
 # Get a specific node by ID by parsing the CLUSTER NODES output
 # of the instance Number 'instance_id'
 proc cluster_get_node_by_id {instance_id node_id} {
--- a/tests/unit/cluster/slot-migration.tcl
+++ b/tests/unit/cluster/slot-migration.tcl
@ -14,17 +14,61 @@ proc get_cluster_role {srv_idx} {
    return $role
 }

+# Returns the "flags" field of the primary's entry as seen by server srv_idx.
+proc get_myself_primary_flags {srv_idx} {
+    return [dict get [cluster_get_myself_primary $srv_idx] flags]
+}
+
+# Returns the "linkstate" field of the primary's entry as seen by server srv_idx.
+proc get_myself_primary_linkstate {srv_idx} {
+    return [dict get [cluster_get_myself_primary $srv_idx] linkstate]
+}
+
 proc wait_for_role {srv_idx role} {
+    # Blocks until server $srv_idx has settled into $role, calling `fail` if any
+    # of the waits below times out. When $role is "slave", extra checks on the
+    # replication link and on the primary's cluster entry are performed, because
+    # CLUSTER FAILOVER is rejected with "Master is down or failed" when the
+    # primary is considered failed or its cluster link is down.
+    # NOTE(review): "100 100" is presumably 100 retries, 100 ms apart -- confirm
+    # against the definition of wait_for_condition.
+    #
+    # Step 1: the first word of the ROLE reply must equal $role.
     wait_for_condition 100 100 {
         [lindex [split [R $srv_idx ROLE] " "] 0] eq $role
     } else {
+        puts "R $srv_idx ROLE: [R $srv_idx ROLE]"
         fail "R $srv_idx didn't assume the replication $role in time"
     }
+
+    if {$role eq "slave"} {
+        # Step 2 (replica only): master_link_status must read "up", i.e. the
+        # replication link to the primary is established.
+        wait_for_condition 100 100 {
+            [s -$srv_idx master_link_status] eq "up"
+        } else {
+            puts "R $srv_idx INFO REPLICATION: [R $srv_idx INFO REPLICATION]"
+            fail "R $srv_idx didn't assume the replication link in time"
+        }
+    }
+
+    # Step 3: the role reported by get_cluster_role must also equal $role.
     wait_for_condition 100 100 {
         [get_cluster_role $srv_idx] eq $role
     } else {
+        puts "R $srv_idx CLUSTER NODES: [R $srv_idx CLUSTER NODES]"
         fail "R $srv_idx didn't assume the cluster $role in time"
     }
+
+    if {$role eq "slave"} {
+        # Step 4 (replica only): the primary's flags, as seen by this node,
+        # must be exactly "master" -- any additional flag keeps us waiting.
+        wait_for_condition 100 100 {
+            [get_myself_primary_flags $srv_idx] eq "master"
+        } else {
+            puts "R $srv_idx CLUSTER NODES: [R $srv_idx CLUSTER NODES]"
+            fail "R $srv_idx didn't assume the primary state in time"
+        }
+
+        # Step 5 (replica only): the linkstate of the primary's entry, as seen
+        # by this node, must be "connected".
+        wait_for_condition 100 100 {
+            [get_myself_primary_linkstate $srv_idx] eq "connected"
+        } else {
+            puts "R $srv_idx CLUSTER NODES: [R $srv_idx CLUSTER NODES]"
+            fail "R $srv_idx didn't assume the cluster primary link in time"
+        }
+    }
+
+    # Final step: run the shared cluster-propagation wait before returning.
     wait_for_cluster_propagation
  }