Skip tracking clients OOM test when I/O threads are enabled (#764)

Fix feedback loop in key eviction with tracking clients when using I/O threads. Current issue: Evicting keys while tracking clients or key space-notification exist creates a feedback loop when using I/O threads: While evicting keys we send tracking async writes to I/O threads, preventing immediate release of tracking clients' COB memory consumption. Before the I/O thread finishes its write, we recheck used_memory, which now includes the tracking clients' COB and thus continue to evict more keys. **Fix:** We will skip the test for now while IO threads are active. We may consider avoiding sending writes in `processPendingWrites` to I/O threads for tracking clients when we are out of memory. --------- Signed-off-by: Uri Yagelnik <uriy@amazon.com> Signed-off-by: Madelyn Olson <madelyneolson@gmail.com> Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
2024-08-22 03:02:57 +03:00 · 2024-08-22 03:02:57 +03:00 · 2e937773ec
commit 2e937773ec
parent 0f551a2dca
6 changed files with 64 additions and 52 deletions
--- a/.github/workflows/daily.yml
+++ b/.github/workflows/daily.yml
@ -379,10 +379,10 @@ jobs:
        run: sudo apt-get install tcl8.6 tclx
      - name: test
        if: true && !contains(github.event.inputs.skiptests, 'valkey')
-        run: ./runtest --config io-threads 2 --config events-per-io-thread 0 --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}}
+        run: ./runtest --io-threads --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}}
      - name: cluster tests
        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster --config io-threads 2 --config events-per-io-thread 0 ${{github.event.inputs.cluster_test_args}}
+        run: ./runtest-cluster --io-threads ${{github.event.inputs.cluster_test_args}}

  test-ubuntu-reclaim-cache:
    runs-on: ubuntu-latest
--- a/tests/support/server.tcl
+++ b/tests/support/server.tcl
@ -241,6 +241,11 @@ proc tags_acceptable {tags err_return} {
        return 0
    }

+    if {$::io_threads && [lsearch $tags "io-threads:skip"] >= 0} {
+        set err "Not supported in io-threads mode"
+        return 0
+    }
+
    if {$::tcl_version < 8.6 && [lsearch $tags "ipv6"] >= 0} {
        set err "TCL version is too low and does not support this"
        return 0
@ -502,6 +507,12 @@ proc start_server {options {code undefined}} {
        dict set config "tls-ca-cert-file" [format "%s/tests/tls/ca.crt" [pwd]]
        dict set config "loglevel" "debug"
    }
+
+    if {$::io_threads} {
+        dict set config "io-threads" 2
+        dict set config "events-per-io-thread" 0
+    }
+
    foreach line $data {
        if {[string length $line] > 0 && [string index $line 0] ne "#"} {
            set elements [split $line " "]
--- a/tests/test_helper.tcl
+++ b/tests/test_helper.tcl
@ -41,6 +41,7 @@ set ::traceleaks 0
 set ::valgrind 0
 set ::durable 0
 set ::tls 0
+set ::io_threads 0
 set ::tls_module 0
 set ::stack_logging 0
 set ::verbose 0
@ -576,6 +577,7 @@ proc print_help_screen {} {
        "--loops <count>    Execute the specified set of tests several times."
        "--wait-server      Wait after server is started (so that you can attach a debugger)."
        "--dump-logs        Dump server log on test failure."
+        "--io-threads       Run tests with IO threads."
        "--tls              Run tests in TLS mode."
        "--tls-module       Run tests in TLS mode with Valkey module."
        "--host <addr>      Run tests against an external host."
@ -630,6 +632,8 @@ for {set j 0} {$j < [llength $argv]} {incr j} {
        }
    } elseif {$opt eq {--quiet}} {
        set ::quiet 1
+   } elseif {$opt eq {--io-threads}} {
+        set ::io_threads 1
    } elseif {$opt eq {--tls} || $opt eq {--tls-module}} {
        package require tls 1.6
        set ::tls 1
--- a/tests/unit/info.tcl
+++ b/tests/unit/info.tcl
@ -308,51 +308,49 @@ start_server {tags {"info" "external:skip"}} {
            assert_equal "count=2" [errorstat ERR]
        }

-        # skip the following 2 tests if we are running with io-threads as the eventloop metrics are different in that case.
-        if {[r config get io-threads] eq 0} {
-            test {stats: eventloop metrics} {
-                set info1 [r info stats]
-                set cycle1 [getInfoProperty $info1 eventloop_cycles]
-                set el_sum1 [getInfoProperty $info1 eventloop_duration_sum]
-                set cmd_sum1 [getInfoProperty $info1 eventloop_duration_cmd_sum]
-                assert_morethan $cycle1 0
-                assert_morethan $el_sum1 0
-                assert_morethan $cmd_sum1 0
-                after 110 ;# default hz is 10, wait for a cron tick. 
-                set info2 [r info stats]
-                set cycle2 [getInfoProperty $info2 eventloop_cycles]
-                set el_sum2 [getInfoProperty $info2 eventloop_duration_sum]
-                set cmd_sum2 [getInfoProperty $info2 eventloop_duration_cmd_sum]
-                if {$::verbose} { puts "eventloop metrics cycle1: $cycle1, cycle2: $cycle2" }
-                assert_morethan $cycle2 $cycle1
-                assert_lessthan $cycle2 [expr $cycle1+10] ;# we expect 2 or 3 cycles here, but allow some tolerance
-                if {$::verbose} { puts "eventloop metrics el_sum1: $el_sum1, el_sum2: $el_sum2" }
-                assert_morethan $el_sum2 $el_sum1
-                assert_lessthan $el_sum2 [expr $el_sum1+30000] ;# we expect roughly 100ms here, but allow some tolerance
-                if {$::verbose} { puts "eventloop metrics cmd_sum1: $cmd_sum1, cmd_sum2: $cmd_sum2" }
-                assert_morethan $cmd_sum2 $cmd_sum1
-                assert_lessthan $cmd_sum2 [expr $cmd_sum1+15000] ;# we expect about tens of ms here, but allow some tolerance
+        test {stats: eventloop metrics} {
+            set info1 [r info stats]
+            set cycle1 [getInfoProperty $info1 eventloop_cycles]
+            set el_sum1 [getInfoProperty $info1 eventloop_duration_sum]
+            set cmd_sum1 [getInfoProperty $info1 eventloop_duration_cmd_sum]
+            assert_morethan $cycle1 0
+            assert_morethan $el_sum1 0
+            assert_morethan $cmd_sum1 0
+            after 110 ;# default hz is 10, wait for a cron tick. 
+            set info2 [r info stats]
+            set cycle2 [getInfoProperty $info2 eventloop_cycles]
+            set el_sum2 [getInfoProperty $info2 eventloop_duration_sum]
+            set cmd_sum2 [getInfoProperty $info2 eventloop_duration_cmd_sum]
+            if {$::verbose} { puts "eventloop metrics cycle1: $cycle1, cycle2: $cycle2" }
+            assert_morethan $cycle2 $cycle1
+            assert_lessthan $cycle2 [expr $cycle1+10] ;# we expect 2 or 3 cycles here, but allow some tolerance
+            if {$::verbose} { puts "eventloop metrics el_sum1: $el_sum1, el_sum2: $el_sum2" }
+            assert_morethan $el_sum2 $el_sum1
+            assert_lessthan $el_sum2 [expr $el_sum1+30000] ;# we expect roughly 100ms here, but allow some tolerance
+            if {$::verbose} { puts "eventloop metrics cmd_sum1: $cmd_sum1, cmd_sum2: $cmd_sum2" }
+            assert_morethan $cmd_sum2 $cmd_sum1
+            assert_lessthan $cmd_sum2 [expr $cmd_sum1+15000] ;# we expect about tens of ms here, but allow some tolerance
+        } {} {io-threads:skip} ; # skip with io-threads as the eventloop metrics are different in that case.
+
+        test {stats: instantaneous metrics} {
+            r config resetstat
+            set retries 0
+            for {set retries 1} {$retries < 4} {incr retries} {
+                after 1600 ;# hz is 10, wait for 16 cron tick so that sample array is fulfilled
+                set value [s instantaneous_eventloop_cycles_per_sec]
+                if {$value > 0} break
            }
-    
-            test {stats: instantaneous metrics} {
-                r config resetstat
-                set retries 0
-                for {set retries 1} {$retries < 4} {incr retries} {
-                    after 1600 ;# hz is 10, wait for 16 cron tick so that sample array is fulfilled
-                    set value [s instantaneous_eventloop_cycles_per_sec]
-                    if {$value > 0} break
-                }
-    
-                assert_lessthan $retries 4
-                if {$::verbose} { puts "instantaneous metrics instantaneous_eventloop_cycles_per_sec: $value" }
-                assert_morethan $value 0
-                assert_lessthan $value [expr $retries*15] ;# default hz is 10
-                set value [s instantaneous_eventloop_duration_usec]
-                if {$::verbose} { puts "instantaneous metrics instantaneous_eventloop_duration_usec: $value" }
-                assert_morethan $value 0
-                assert_lessthan $value [expr $retries*22000] ;# default hz is 10, so duration < 1000 / 10, allow some tolerance
-            }
-        }
+
+            assert_lessthan $retries 4
+            if {$::verbose} { puts "instantaneous metrics instantaneous_eventloop_cycles_per_sec: $value" }
+            assert_morethan $value 0
+            assert_lessthan $value [expr $retries*15] ;# default hz is 10
+            set value [s instantaneous_eventloop_duration_usec]
+            if {$::verbose} { puts "instantaneous metrics instantaneous_eventloop_duration_usec: $value" }
+            assert_morethan $value 0
+            assert_lessthan $value [expr $retries*22000] ;# default hz is 10, so duration < 1000 / 10, allow some tolerance
+        } {} {io-threads:skip} ; # skip with io-threads as the eventloop metrics are different in that case.
+        

        test {stats: debug metrics} {
            # make sure debug info is hidden
--- a/tests/unit/maxmemory.tcl
+++ b/tests/unit/maxmemory.tcl
@ -456,13 +456,15 @@ start_server {tags {"maxmemory external:skip"}} {
    } {4098}
 }

-start_server {tags {"maxmemory external:skip"}} {
+# Skip the following test when running with IO threads
+# With IO threads, we asynchronously write to tracking clients.
+# This invalidates the assumption that their output buffers will be free within the same event loop.
+start_server {tags {"maxmemory external:skip io-threads:skip"}} {
    test {client tracking don't cause eviction feedback loop} {
        r config set latency-tracking no
        r config set maxmemory 0
        r config set maxmemory-policy allkeys-lru
        r config set maxmemory-eviction-tenacity 100
-
        # 10 clients listening on tracking messages
        set clients {}
        for {set j 0} {$j < 10} {incr j} {
--- a/tests/unit/memefficiency.tcl
+++ b/tests/unit/memefficiency.tcl
@ -404,8 +404,6 @@ run_solo {defrag} {
            r save ;# saving an rdb iterates over all the data / pointers
        } {OK}

-        # Skip the following two tests if we are running with IO threads, as the IO threads allocate the command arguments in a different arena. As a result, fragmentation is not as expected.
-        if {[r config get io-threads] eq 0} {
        test "Active defrag pubsub: $type" {
            r flushdb
            r config resetstat
@ -503,8 +501,7 @@ run_solo {defrag} {
                $rd_pubsub read
            }
            $rd_pubsub close
-        }
-        } ;# io-threads
+        } {0} {io-threads:skip} ; # skip with io-threads as the threads may allocate the command arguments in a different arena. As a result, fragmentation is not as expected.

        if {$type eq "standalone"} { ;# skip in cluster mode
        test "Active defrag big list: $type" {