From 493e31e3ad299c99cbb96b8581b7598b19b23892 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Thu, 8 Feb 2024 22:39:58 +0800
Subject: [PATCH] Add new DEBUG dict-resizing command to disable the dict
 resize (#13043)

The test fails here and there:
```
*** [err]: expire scan should skip dictionaries with lot's of empty buckets in tests/unit/expire.tcl
scan didn't handle slot skipping logic.
```

There are two cases:
1. In the case of passing the test, we use a child process to avoid the
dict resize, but that cannot completely prevent it, since dictDelete
still has a chance to trigger the resize (by hitting the force ratio).
The reason our test passed before is that the expire dict was still in
the rehashing process, so the dictShrinkIfNeeded call in dictDelete could
not trigger the resize.

2. In the case of failing the test, the expire dict has finished
rehashing, so in the last dictDelete, dictShrinkIfNeeded triggers the
dict resize since it hits the force ratio, and the skipping logic fails.

This PR adds a new DEBUG command to disable the dict resize.
---
 src/debug.c           |  5 +++++
 src/server.c          |  4 ++++
 src/server.h          |  1 +
 tests/unit/expire.tcl | 16 ++++++----------
 tests/unit/other.tcl  | 28 ++++++++--------------------
 5 files changed, 24 insertions(+), 30 deletions(-)

diff --git a/src/debug.c b/src/debug.c
index 1b89a7f6e..fcd62855c 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -496,6 +496,8 @@ void debugCommand(client *c) {
 "    In case RESET is provided the peak reset time will be restored to the default value",
 "REPLYBUFFER RESIZING <0|1>",
 "    Enable or disable the reply buffer resize cron job",
+"DICT-RESIZING <0|1>",
+"    Enable or disable the main dict and expire dict resizing.",
 NULL
         };
         addExtendedReplyHelp(c, help, clusterDebugCommandExtendedHelp());
@@ -1021,6 +1023,9 @@ NULL
             return;
         }
         addReply(c, shared.ok);
+    } else if (!strcasecmp(c->argv[1]->ptr, "dict-resizing") && c->argc == 3) {
+        server.dict_resizing = atoi(c->argv[2]->ptr);
+        addReply(c, shared.ok);
     } else if(!handleDebugClusterCommand(c)) {
         addReplySubcommandSyntaxError(c);
         return;
diff --git a/src/server.c b/src/server.c
index e743eeb8c..1040c50c8 100644
--- a/src/server.c
+++ b/src/server.c
@@ -429,6 +429,9 @@ uint64_t dictEncObjHash(const void *key) {
  * but to guarantee the performance of redis, we still allow dict to expand
  * if dict load factor exceeds HASHTABLE_MAX_LOAD_FACTOR. */
 int dictResizeAllowed(size_t moreMem, double usedRatio) {
+    /* for debug purposes: dict is not allowed to be resized. */
+    if (!server.dict_resizing) return 0;
+
     if (usedRatio <= HASHTABLE_MAX_LOAD_FACTOR) {
         return !overMaxmemoryAfterAlloc(moreMem);
     } else {
@@ -2079,6 +2082,7 @@ void initServerConfig(void) {
     server.next_client_id = 1; /* Client IDs, start from 1 .*/
     server.page_size = sysconf(_SC_PAGESIZE);
     server.pause_cron = 0;
+    server.dict_resizing = 1;
 
     server.latency_tracking_info_percentiles_len = 3;
     server.latency_tracking_info_percentiles = zmalloc(sizeof(double)*(server.latency_tracking_info_percentiles_len));
diff --git a/src/server.h b/src/server.h
index 067b5df93..3b0e9e96e 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1754,6 +1754,7 @@ struct redisServer {
     char *proc_title_template;      /* Process title template format */
     clientBufferLimitsConfig client_obuf_limits[CLIENT_TYPE_OBUF_COUNT];
     int pause_cron;                 /* Don't run cron tasks (debug) */
+    int dict_resizing;              /* Whether to allow main dict and expired dict to be resized (debug) */
     int latency_tracking_enabled;   /* 1 if extended latency tracking is enabled, 0 otherwise. */
     double *latency_tracking_info_percentiles; /* Extended latency tracking info output percentile list configuration. */
     int latency_tracking_info_percentiles_len;
diff --git a/tests/unit/expire.tcl b/tests/unit/expire.tcl
index f05a4407d..3e58bd4f7 100644
--- a/tests/unit/expire.tcl
+++ b/tests/unit/expire.tcl
@@ -853,9 +853,9 @@ start_cluster 1 0 {tags {"expire external:skip cluster slow"}} {
         # hashslot(key) is 12539
         r psetex key 500 val
 
-        # disable resizing
-        r config set rdb-key-save-delay 10000000
-        r bgsave
+        # disable resizing, the reason for not using slow bgsave is because
+        # it will hit the dict_force_resize_ratio.
+        r debug dict-resizing 0
 
         # delete data to have lot's (99%) of empty buckets (slot 12182 should be skipped)
         for {set j 1} {$j <= 99} {incr j} {
@@ -872,20 +872,16 @@ start_cluster 1 0 {tags {"expire external:skip cluster slow"}} {
             [r dbsize] eq 1
         } else {
             if {[r dbsize] eq 0} {
+                puts [r debug htstats 0]
                 fail "scan didn't handle slot skipping logic."
             } else {
+                puts [r debug htstats 0]
                 fail "scan didn't process all valid slots."
             }
         }
 
         # Enable resizing
-        r config set rdb-key-save-delay 0
-        catch {exec kill -9 [get_child_pid 0]}
-        wait_for_condition 1000 10 {
-            [s rdb_bgsave_in_progress] eq 0
-        } else {
-            fail "bgsave did not stop in time."
-        }
+        r debug dict-resizing 1
 
         # put some data into slot 12182 and trigger the resize
         r psetex "{foo}0" 500 a
diff --git a/tests/unit/other.tcl b/tests/unit/other.tcl
index 718266e65..b42abaafe 100644
--- a/tests/unit/other.tcl
+++ b/tests/unit/other.tcl
@@ -438,9 +438,9 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} {
         }
         assert_match "*table size: 128*" [r debug HTSTATS 0]
 
-        # disable resizing
-        r config set rdb-key-save-delay 10000000
-        r bgsave
+        # disable resizing, the reason for not using slow bgsave is because
+        # it will hit the dict_force_resize_ratio.
+        r debug dict-resizing 0
 
         # delete data to have lot's (96%) of empty buckets
         for {set j 1} {$j <= 123} {incr j} {
@@ -449,13 +449,7 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} {
         assert_match "*table size: 128*" [r debug HTSTATS 0]
 
         # enable resizing
-        r config set rdb-key-save-delay 0
-        catch {exec kill -9 [get_child_pid 0]}
-        wait_for_condition 1000 10 {
-            [s rdb_bgsave_in_progress] eq 0
-        } else {
-            fail "bgsave did not stop in time."
-        }
+        r debug dict-resizing 1
 
         # waiting for serverCron to resize the tables
         wait_for_condition 1000 10 {
@@ -474,22 +468,16 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} {
             r set "{alice}$j" a
         }
 
-        # disable resizing
-        r config set rdb-key-save-delay 10000000
-        r bgsave
+        # disable resizing, the reason for not using slow bgsave is because
+        # it will hit the dict_force_resize_ratio.
+        r debug dict-resizing 0
 
         for {set j 1} {$j <= 123} {incr j} {
             r del "{alice}$j"
         }
 
         # enable resizing
-        r config set rdb-key-save-delay 0
-        catch {exec kill -9 [get_child_pid 0]}
-        wait_for_condition 1000 10 {
-            [s rdb_bgsave_in_progress] eq 0
-        } else {
-            fail "bgsave did not stop in time."
-        }
+        r debug dict-resizing 1
 
         # waiting for serverCron to resize the tables
         wait_for_condition 1000 10 {