Fix possible int overflow when hashing an sds. (#9916)
This caused a crash when adding elements larger than 2GB to a set (the same goes for hash keys). See #8455.

Details:
* The fix makes the dict hash functions receive a `size_t` instead of an `int`. In practice the dict hash functions call siphash, which already receives a `size_t`, and the callers of the hash functions already pass a `size_t`, so the fix is trivial.
* The issue was reproduced by attempting to add a >2GB value to a set. Appropriate tests were added that create a set with large elements and check basic functionality on it (SADD, SCARD, SPOP, etc.).
* While adding the tests I also refactored the test code that runs under the `--large-memory` flag. This removes the duplication between the test framework's `write_big_bulk` and `read_big_bulk` helpers and avoids allocating the framework's huge helper string when the tests are not run under `--large-memory`.
* I also added the _violations.tcl_ unit tests to the full test suite and cleaned up the non-relevant list-related tests that were in there. This was done in this PR because most of the _violations_ tests are "large memory" tests.
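To make the first bullet concrete, here is a minimal, standalone C sketch of the narrowing problem; it is not Redis source. The function names are illustrative stand-ins for the fixed prototypes, and the stub bodies simply return the length they received (instead of forwarding it to siphash) so the truncation is visible.

```c
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

/* Old-style prototype: the length is silently narrowed to int at the call
 * site, so any sds length above INT_MAX (i.e. elements of 2GB or more) is
 * mangled before the hash ever sees it. */
static uint64_t hash_with_int_len(const void *key, int len) {
    (void)key;
    return (uint64_t)len;   /* stand-in for siphash(key, len, seed) */
}

/* Fixed prototype, matching the diff: the size_t length produced by the
 * caller flows through unchanged. */
static uint64_t hash_with_size_t_len(const void *key, size_t len) {
    (void)key;
    return (uint64_t)len;
}

int main(void) {
    size_t sds_len = (size_t)INT_MAX + 123;   /* an element a bit over 2GB */

    /* The implicit size_t -> int conversion at this call is the overflow the
     * commit removes: the value the hash sees is implementation-defined
     * (typically negative) instead of the real length. */
    printf("old int prototype sees:      %lld\n",
           (long long)hash_with_int_len(NULL, sds_len));
    printf("fixed size_t prototype sees: %llu\n",
           (unsigned long long)hash_with_size_t_len(NULL, sds_len));
    return 0;
}
```

The actual change is just the two prototypes in the C hunks below (the definitions and the declarations); siphash() already took a `size_t`, so no further plumbing was needed.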
parent c40d23b89f
commit c7dc17fc0f
@@ -83,11 +83,11 @@ uint8_t *dictGetHashFunctionSeed(void) {
 uint64_t siphash(const uint8_t *in, const size_t inlen, const uint8_t *k);
 uint64_t siphash_nocase(const uint8_t *in, const size_t inlen, const uint8_t *k);
 
-uint64_t dictGenHashFunction(const void *key, int len) {
+uint64_t dictGenHashFunction(const void *key, size_t len) {
     return siphash(key,len,dict_hash_function_seed);
 }
 
-uint64_t dictGenCaseHashFunction(const unsigned char *buf, int len) {
+uint64_t dictGenCaseHashFunction(const unsigned char *buf, size_t len) {
     return siphash_nocase(buf,len,dict_hash_function_seed);
 }
 
@@ -192,8 +192,8 @@ dictEntry *dictGetRandomKey(dict *d);
 dictEntry *dictGetFairRandomKey(dict *d);
 unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count);
 void dictGetStats(char *buf, size_t bufsize, dict *d);
-uint64_t dictGenHashFunction(const void *key, int len);
-uint64_t dictGenCaseHashFunction(const unsigned char *buf, int len);
+uint64_t dictGenHashFunction(const void *key, size_t len);
+uint64_t dictGenCaseHashFunction(const unsigned char *buf, size_t len);
 void dictEmpty(dict *d, void(callback)(dict*));
 void dictEnableResize(void);
 void dictDisableResize(void);
@@ -907,3 +907,74 @@ proc delete_lines_with_pattern {filename tmpfilename pattern} {
     close $fh_out
     file rename -force $tmpfilename $filename
 }
+
+# The following functions and variables are used only when running large-memory
+# tests. We avoid defining them when not running large-memory tests because the
+# global variables takes up lots of memory.
+proc init_large_mem_vars {} {
+    if {![info exists ::str500]} {
+        set ::str500 [string repeat x 500000000] ;# 500mb
+        set ::str500_len [string length $::str500]
+    }
+}
+
+# Utility function to write big argument into redis client connection
+proc write_big_bulk {size {prefix ""} {skip_read no}} {
+    init_large_mem_vars
+
+    assert {[string length prefix] <= $size}
+    r write "\$$size\r\n"
+    r write $prefix
+    incr size -[string length $prefix]
+    while {$size >= 500000000} {
+        r write $::str500
+        incr size -500000000
+    }
+    if {$size > 0} {
+        r write [string repeat x $size]
+    }
+    r write "\r\n"
+    if {!$skip_read} {
+        r flush
+        r read
+    }
+}
+
+# Utility to read big bulk response (work around Tcl limitations)
+proc read_big_bulk {code {compare no} {prefix ""}} {
+    init_large_mem_vars
+
+    r readraw 1
+    set resp_len [uplevel 1 $code] ;# get the first line of the RESP response
+    assert_equal [string range $resp_len 0 0] "$"
+    set resp_len [string range $resp_len 1 end]
+    set prefix_len [string length $prefix]
+    if {$compare} {
+        assert {$prefix_len <= $resp_len}
+        assert {$prefix_len <= $::str500_len}
+    }
+
+    set remaining $resp_len
+    while {$remaining > 0} {
+        set l $remaining
+        if {$l > $::str500_len} {set l $::str500_len} ; # can't read more than 2gb at a time, so read 500mb so we can easily verify read data
+        set read_data [r rawread $l]
+        set nbytes [string length $read_data]
+        if {$compare} {
+            set comp_len $nbytes
+            # Compare prefix part
+            if {$remaining == $resp_len} {
+                assert_equal $prefix [string range $read_data 0 [expr $prefix_len - 1]]
+                set read_data [string range $read_data $prefix_len $nbytes]
+                incr comp_len -$prefix_len
+            }
+            # Compare rest of data, evaluate and then assert to avoid huge print in case of failure
+            set data_equal [expr {$read_data == [string range $::str500 0 [expr $comp_len - 1]]}]
+            assert $data_equal
+        }
+        incr remaining -$nbytes
+    }
+    assert_equal [r rawread 2] "\r\n"
+    r readraw 0
+    return $resp_len
+}
@@ -85,6 +85,7 @@ set ::all_tests {
     unit/networking
     unit/cluster
     unit/client-eviction
+    unit/violations
 }
 # Index to the next test to run in the ::all_tests list.
 set ::next_test 0
@@ -1,38 +1,3 @@
-set ::str500 [string repeat x 500000000] ;# 500mb
-
-# Utility function to write big argument into redis client connection
-proc write_big_bulk {size} {
-    r write "\$$size\r\n"
-    while {$size >= 500000000} {
-        r write $::str500
-        incr size -500000000
-    }
-    if {$size > 0} {
-        r write [string repeat x $size]
-    }
-    r write "\r\n"
-    r flush
-    r read
-}
-
-# Utility to read big bulk response (work around Tcl limitations)
-proc read_big_bulk {code} {
-    r readraw 1
-    set resp_len [uplevel 1 $code] ;# get the first line of the RESP response
-    assert_equal [string range $resp_len 0 0] "$"
-    set resp_len [string range $resp_len 1 end]
-    set remaining $resp_len
-    while {$remaining > 0} {
-        set l $remaining
-        if {$l > 2147483647} {set l 2147483647}
-        set nbytes [string length [r rawread $l]]
-        incr remaining [expr {- $nbytes}]
-    }
-    assert_equal [r rawread 2] "\r\n"
-    r readraw 0
-    return $resp_len
-}
-
 # check functionality compression of plain and zipped nodes
 start_server [list overrides [list save ""] ] {
     r config set list-compress-depth 2
@@ -934,3 +934,38 @@ start_server {
         }
     }
 }
+
+start_server [list overrides [list save ""] ] {
+
+    # test if the server supports such large configs (avoid 32 bit builds)
+    catch {
+        r config set proto-max-bulk-len 10000000000 ;#10gb
+        r config set client-query-buffer-limit 10000000000 ;#10gb
+    }
+    if {[lindex [r config get proto-max-bulk-len] 1] == 10000000000} {
+
+        set str_length 4400000000 ;#~4.4GB
+
+        test {SADD, SCARD, SISMEMBER - large data} {
+            r flushdb
+            r write "*3\r\n\$4\r\nSADD\r\n\$5\r\nmyset\r\n"
+            assert_equal 1 [write_big_bulk $str_length "aaa"]
+            r write "*3\r\n\$4\r\nSADD\r\n\$5\r\nmyset\r\n"
+            assert_equal 1 [write_big_bulk $str_length "bbb"]
+            r write "*3\r\n\$4\r\nSADD\r\n\$5\r\nmyset\r\n"
+            assert_equal 0 [write_big_bulk $str_length "aaa"]
+            assert_encoding hashtable myset
+            set s0 [s used_memory]
+            assert {$s0 > [expr $str_length * 2]}
+            assert_equal 2 [r scard myset]
+
+            r write "*3\r\n\$9\r\nSISMEMBER\r\n\$5\r\nmyset\r\n"
+            assert_equal 1 [write_big_bulk $str_length "aaa"]
+            r write "*3\r\n\$9\r\nSISMEMBER\r\n\$5\r\nmyset\r\n"
+            assert_equal 0 [write_big_bulk $str_length "ccc"]
+            r write "*3\r\n\$4\r\nSREM\r\n\$5\r\nmyset\r\n"
+            assert_equal 1 [write_big_bulk $str_length "bbb"]
+            assert_equal [read_big_bulk {r spop myset} yes "aaa"] $str_length
+        } {} {large-memory}
+    } ;# skip 32bit builds
+}
@@ -1,20 +1,3 @@
-# These tests consume massive amounts of memory, and are not
-# suitable to be executed as part of the normal test suite
-set ::str500 [string repeat x 500000000] ;# 500mb
-
-# Utility function to write big argument into redis client connection
-proc write_big_bulk {size} {
-    r write "\$$size\r\n"
-    while {$size >= 500000000} {
-        r write $::str500
-        incr size -500000000
-    }
-    if {$size > 0} {
-        r write [string repeat x $size]
-    }
-    r write "\r\n"
-}
-
 # One XADD with one huge 5GB field
 # Expected to fail resulting in an empty stream
 start_server [list overrides [list save ""] ] {
@@ -23,12 +6,12 @@ start_server [list overrides [list save ""] ] {
         r config set client-query-buffer-limit 10000000000 ;#10gb
         r write "*5\r\n\$4\r\nXADD\r\n\$2\r\nS1\r\n\$1\r\n*\r\n"
         r write "\$1\r\nA\r\n"
-        write_big_bulk 5000000000 ;#5gb
-        r flush
-        catch {r read} err
+        catch {
+            write_big_bulk 5000000000 ;#5gb
+        } err
         assert_match {*too large*} $err
         r xlen S1
-    } {0}
+    } {0} {large-memory}
 }
 
 # One XADD with one huge (exactly nearly) 4GB field
@@ -40,12 +23,12 @@ start_server [list overrides [list save ""] ] {
         r config set client-query-buffer-limit 10000000000 ;#10gb
         r write "*5\r\n\$4\r\nXADD\r\n\$2\r\nS1\r\n\$1\r\n*\r\n"
         r write "\$1\r\nA\r\n"
-        write_big_bulk 4294967295 ;#4gb-1
-        r flush
-        catch {r read} err
+        catch {
+            write_big_bulk 4294967295 ;#4gb-1
+        } err
         assert_match {*too large*} $err
         r xlen S1
-    } {0}
+    } {0} {large-memory}
 }
 
 # Gradually add big stream fields using repeated XADD calls
@@ -57,7 +40,7 @@ start_server [list overrides [list save ""] ] {
         }
         r ping
         r xlen stream
-    } {10}
+    } {10} {large-memory}
 }
 
 # Add over 4GB to a single stream listpack (one XADD command)
@@ -67,13 +50,13 @@ start_server [list overrides [list save ""] ] {
         r write "*23\r\n\$4\r\nXADD\r\n\$1\r\nS\r\n\$1\r\n*\r\n"
         for {set j 0} {$j<10} {incr j} {
             r write "\$1\r\n$j\r\n"
-            write_big_bulk 500000000 ;#500mb
+            write_big_bulk 500000000 "" yes ;#500mb
         }
         r flush
         catch {r read} err
         assert_match {*too large*} $err
         r xlen S
-    } {0}
+    } {0} {large-memory}
 }
 
 # Gradually add big hash fields using repeated HSET calls
@@ -86,7 +69,7 @@ start_server [list overrides [list save ""] ] {
             r hset h $j $::str500
         }
         r object encoding h
-    } {hashtable}
+    } {hashtable} {large-memory}
 }
 
 # Add over 4GB to a single hash field (one HSET command)
@@ -99,47 +82,10 @@ start_server [list overrides [list save ""] ] {
         r write "*4\r\n\$4\r\nHSET\r\n\$2\r\nH1\r\n"
         r write "\$1\r\nA\r\n"
         write_big_bulk 5000000000 ;#5gb
         r flush
         r read
         r object encoding H1
-    } {hashtable}
+    } {hashtable} {large-memory}
 }
-
-# Add over 4GB to a single list member (one LPUSH command)
-# Currently unsupported, and expected to fail rather than being truncated
-# Expected to fail resulting in a non-existing list
-start_server [list overrides [list save ""] ] {
-    test {list with one huge field} {
-        r config set proto-max-bulk-len 10000000000 ;#10gb
-        r config set client-query-buffer-limit 10000000000 ;#10gb
-        r write "*3\r\n\$5\r\nLPUSH\r\n\$2\r\nL1\r\n"
-        write_big_bulk 5000000000 ;#5gb
-        r flush
-        catch {r read} err
-        assert_match {*too large*} $err
-        r exists L1
-    } {0}
-}
-
-# SORT which attempts to store an element larger than 4GB into a list.
-# Currently unsupported and results in an assertion instead of truncation
-start_server [list overrides [list save ""] ] {
-    test {SORT adds huge field to list} {
-        r config set proto-max-bulk-len 10000000000 ;#10gb
-        r config set client-query-buffer-limit 10000000000 ;#10gb
-        r write "*3\r\n\$3\r\nSET\r\n\$2\r\nS1\r\n"
-        write_big_bulk 5000000000 ;#5gb
-        r flush
-        r read
-        assert_equal [r strlen S1] 5000000000
-        r set S2 asdf
-        r sadd myset 1 2
-        r mset D1 1 D2 2
-        catch {r sort myset by D* get S* store mylist}
-        assert_equal [count_log_message 0 "crashed by signal"] 0
-        assert_equal [count_log_message 0 "ASSERTION FAILED"] 1
-    }
-}
-
 # SORT which stores an integer encoded element into a list.
 # Just for coverage, no news here.
@@ -152,5 +98,5 @@ start_server [list overrides [list save ""] ] {
         r mset D1 1 D2 2
         r sort myset by D* get S* store mylist
         r llen mylist
-    } {2}
+    } {2} {cluster:skip}
 }