diff --git a/src/dict.c b/src/dict.c
index 1d9934b4b..6760da154 100644
--- a/src/dict.c
+++ b/src/dict.c
@@ -294,9 +294,12 @@ int dictTryExpand(dict *d, unsigned long size) {
  * work it does would be unbound and the function may block for a long time. */
 int dictRehash(dict *d, int n) {
     int empty_visits = n*10; /* Max number of empty buckets to visit. */
+    unsigned long s0 = DICTHT_SIZE(d->ht_size_exp[0]);
+    unsigned long s1 = DICTHT_SIZE(d->ht_size_exp[1]);
     if (dict_can_resize == DICT_RESIZE_FORBID || !dictIsRehashing(d)) return 0;
     if (dict_can_resize == DICT_RESIZE_AVOID &&
-        (DICTHT_SIZE(d->ht_size_exp[1]) / DICTHT_SIZE(d->ht_size_exp[0]) < dict_force_resize_ratio))
+        ((s1 > s0 && s1 / s0 < dict_force_resize_ratio) ||
+         (s1 < s0 && s0 / s1 < dict_force_resize_ratio)))
     {
         return 0;
     }
@@ -1095,19 +1098,30 @@ unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count) {
         } else {
             emptylen = 0;
             while (he) {
-                /* Collect all the elements of the buckets found non
-                 * empty while iterating. */
-                *des = he;
-                des++;
+                /* Collect all the elements of the buckets found non empty while iterating.
+                 * To avoid being unable to sample the tail of a long chain, we use the
+                 * Reservoir Sampling algorithm: even when the maximum number of samples
+                 * has been reached, we keep sampling until the end of the chain, replacing
+                 * already collected entries with decreasing probability.
+                 * See https://en.wikipedia.org/wiki/Reservoir_sampling. */
+                if (stored < count) {
+                    des[stored] = he;
+                } else {
+                    unsigned long r = randomULong() % (stored + 1);
+                    if (r < count) des[r] = he;
+                }
+
                 he = dictGetNext(he);
                 stored++;
-                if (stored == count) return stored;
             }
+            if (stored >= count) goto end;
         }
         i = (i+1) & maxsizemask;
     }
-    return stored;
+
+end:
+    return stored > count ? count : stored;
 }
@@ -1510,7 +1524,9 @@ size_t _dictGetStatsHt(char *buf, size_t bufsize, dict *d, int htidx, int full)
 
     if (d->ht_used[htidx] == 0) {
         return snprintf(buf,bufsize,
-            "No stats available for empty dictionaries\n");
+            "Hash table %d stats (%s):\n"
+            "No stats available for empty dictionaries\n",
+            htidx, (htidx == 0) ? "main hash table" : "rehashing target");
     }
 
     if (!full) {
diff --git a/tests/unit/type/set.tcl b/tests/unit/type/set.tcl
index 9fd552747..29275622d 100644
--- a/tests/unit/type/set.tcl
+++ b/tests/unit/type/set.tcl
@@ -1002,6 +1002,129 @@ foreach type {single multiple single_multiple} {
         }
     }
 
+    proc is_rehashing {myset} {
+        set htstats [r debug HTSTATS-KEY $myset]
+        return [string match {*rehashing target*} $htstats]
+    }
+
+    proc rem_hash_set_top_N {myset n} {
+        set cursor 0
+        set members {}
+        set enough 0
+        while 1 {
+            set res [r sscan $myset $cursor]
+            set cursor [lindex $res 0]
+            set k [lindex $res 1]
+            foreach m $k {
+                lappend members $m
+                if {[llength $members] >= $n} {
+                    set enough 1
+                    break
+                }
+            }
+            if {$enough || $cursor == 0} {
+                break
+            }
+        }
+        r srem $myset {*}$members
+    }
+
+    test "SRANDMEMBER with a dict containing long chain" {
+        set origin_save [config_get_set save ""]
+        set origin_max_lp [config_get_set set-max-listpack-entries 0]
+        set origin_save_delay [config_get_set rdb-key-save-delay 2147483647]
+
+        # 1) Create a hash set with 100000 members.
+        set members {}
+        for {set i 0} {$i < 100000} {incr i} {
+            lappend members [format "m:%d" $i]
+        }
+        create_set myset $members
+
+        # 2) Wait for the hash set rehashing to finish.
+        while {[is_rehashing myset]} {
+            r srandmember myset 100
+        }
+
+        # 3) Turn off the rehashing of this set, and trim it down to 500 members.
+        r bgsave
+        rem_hash_set_top_N myset [expr {[r scard myset] - 500}]
+        assert_equal [r scard myset] 500
+
+        # 4) Kill the RDB child process so rehashing can restart.
+        set pid1 [get_child_pid 0]
+        catch {exec kill -9 $pid1}
+        waitForBgsave r
+
+        # 5) Trigger the set to start rehashing.
+        r spop myset 1
+        assert [is_rehashing myset]
+
+        # 6) Verify that rehashing is still performed while RDB saving is in progress
+        # (because the size ratio is extreme), by waiting for it to finish during an active bgsave.
+        r bgsave
+
+        while {[is_rehashing myset]} {
+            r srandmember myset 1
+        }
+        if {$::verbose} {
+            puts [r debug HTSTATS-KEY myset full]
+        }
+
+        set pid1 [get_child_pid 0]
+        catch {exec kill -9 $pid1}
+        waitForBgsave r
+
+        # 7) Check that, eventually, SRANDMEMBER returns all elements.
+        array set allmyset {}
+        foreach ele [r smembers myset] {
+            set allmyset($ele) 1
+        }
+        unset -nocomplain auxset
+        set iterations 1000
+        while {$iterations != 0} {
+            incr iterations -1
+            set res [r srandmember myset -10]
+            foreach ele $res {
+                set auxset($ele) 1
+            }
+            if {[lsort [array names allmyset]] eq
+                [lsort [array names auxset]]} {
+                break
+            }
+        }
+        assert {$iterations != 0}
+
+        # 8) Trim the set down to 30 members in order to compute the Chi-Square statistic;
+        # otherwise we would need many more iterations.
+        rem_hash_set_top_N myset [expr {[r scard myset] - 30}]
+        assert_equal [r scard myset] 30
+        assert {[is_rehashing myset]}
+
+        # Now we have a hash set with only one long-chain bucket.
+        set htstats [r debug HTSTATS-KEY myset full]
+        assert {[regexp {different slots: ([0-9]+)} $htstats - different_slots]}
+        assert {[regexp {max chain length: ([0-9]+)} $htstats - max_chain_length]}
+        assert {$different_slots == 1 && $max_chain_length == 30}
+
+        # 9) Use a positive count (PATH 4) to get 10 elements (out of 30) each time.
+        unset -nocomplain allkey
+        set iterations 1000
+        while {$iterations != 0} {
+            incr iterations -1
+            set res [r srandmember myset 10]
+            foreach ele $res {
+                lappend allkey $ele
+            }
+        }
+        # Validate even distribution of the random sampling (df = 29; 73 means 0.00001 probability).
+        assert_lessthan [chi_square_value $allkey] 73
+
+        r config set save $origin_save
+        r config set set-max-listpack-entries $origin_max_lp
+        r config set rdb-key-save-delay $origin_save_delay
+    } {OK} {needs:debug slow}
+
     proc setup_move {} {
         r del myset3{t} myset4{t}
         create_set myset1{t} {1 a b}
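
A note on the dictRehash guard above: the old expression relied on truncating integer division. While shrinking, ht[1] is smaller than ht[0], so s1 / s0 evaluated to 0, the "< dict_force_resize_ratio" test always held, and DICT_RESIZE_AVOID suppressed rehashing no matter how extreme the shrink was. The patched guard compares the larger table against the smaller one in whichever direction applies. A tiny self-contained check of the arithmetic (the sizes and the ratio of 5 are made-up demo values, not Redis configuration):

    #include <stdio.h>

    int main(void) {
        unsigned long s0 = 1024, s1 = 8; /* shrinking: the target table is smaller */
        unsigned long ratio = 5;         /* stand-in for dict_force_resize_ratio */

        /* Old guard: s1 / s0 truncates to 0, which is always < ratio,
         * so rehashing was skipped even for a 128x shrink. */
        printf("old quotient: %lu\n", s1 / s0);

        /* New guard: measure the ratio in the applicable direction. */
        int avoid = (s1 > s0 && s1 / s0 < ratio) ||
                    (s1 < s0 && s0 / s1 < ratio);
        printf("avoid rehash: %d (s0/s1 = %lu)\n", avoid, s0 / s1);
        return 0;
    }

With these values the old quotient prints 0 (rehash always avoided), while the new guard sees s0/s1 = 128 >= 5 and lets rehashing proceed.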
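
The dictGetSomeKeys change is classic reservoir sampling (Algorithm R) applied to a single bucket chain. Below is a minimal standalone sketch of the same scheme; the node type, the chain_sample helper, and the use of rand() in place of Redis's randomULong() are illustrative assumptions, not code from dict.c:

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct node {
        int value;
        struct node *next;
    } node;

    /* Sample up to 'count' entries from a chain into 'des'. Each of the
     * n nodes ends up in the sample with equal probability count/n,
     * including nodes past the first 'count'. */
    static unsigned long chain_sample(node *head, node **des, unsigned long count) {
        unsigned long stored = 0;
        for (node *he = head; he != NULL; he = he->next) {
            if (stored < count) {
                des[stored] = he;                /* Fill the reservoir first. */
            } else {
                unsigned long r = (unsigned long)rand() % (stored + 1);
                if (r < count) des[r] = he;      /* Replace with prob count/(stored+1). */
            }
            stored++;
        }
        return stored > count ? count : stored;
    }

    int main(void) {
        /* Build a chain 0 -> 1 -> ... -> 29, then sample 10 of its nodes. */
        node nodes[30];
        for (int i = 0; i < 30; i++) {
            nodes[i].value = i;
            nodes[i].next = (i < 29) ? &nodes[i + 1] : NULL;
        }
        node *sample[10];
        unsigned long got = chain_sample(&nodes[0], sample, 10);
        for (unsigned long i = 0; i < got; i++) printf("%d ", sample[i]->value);
        printf("\n");
        return 0;
    }

This is why the early "if (stored == count) return stored;" had to go: returning as soon as the reservoir is full means the tail of a long chain can never be sampled, which is exactly the bias the patch removes.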
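
Step 9 of the test checks the uniformity of that sampling with a Chi-Square statistic via the suite's chi_square_value Tcl helper. The C sketch below shows the statistic being bounded; it assumes uniform expected frequencies and a pre-counted table of observations, and is an illustration rather than a port of the helper:

    #include <stdio.h>

    /* chi2 = sum((observed - expected)^2 / expected), with expected = total/k. */
    static double chi_square(const unsigned long *observed, int k, unsigned long total) {
        double expected = (double)total / k;
        double chi2 = 0.0;
        for (int i = 0; i < k; i++) {
            double diff = (double)observed[i] - expected;
            chi2 += diff * diff / expected;
        }
        return chi2;
    }

    int main(void) {
        /* 30 members drawn 9000 times: a perfectly even spread gives chi2 = 0. */
        unsigned long observed[30];
        for (int i = 0; i < 30; i++) observed[i] = 300;
        printf("chi2 = %f\n", chi_square(observed, 30, 30 * 300));
        return 0;
    }

With 30 categories the statistic has 29 degrees of freedom, so the test's threshold of 73 makes a false failure of a truly uniform sampler occur with probability around 0.00001, as the comment in the test notes.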