Replace dict with new hashtable for sets datatype (#1176)

The new `hashtable` provides faster lookups and uses less memory than
`dict`.

A TCL test case "SRANDMEMBER with a dict containing long chain" is
deleted because it's covered by a hashtable unit test
"test_random_entry_with_long_chain", which is already present.

This change also moves some logic from dismissMemory (object.c) to
zmadvise_dontneed (zmalloc.c), so the hashtable implementation which
needs the dismiss functionality doesn't need to depend on object.c and
server.h.

This PR follows #1186.

---------

Signed-off-by: Rain Valentine <rsg000@gmail.com>
Signed-off-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
This commit is contained in:
Rain Valentine 2024-12-14 11:53:48 -08:00 committed by GitHub
parent 0e96bb311e
commit 88942c8e61
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 326 additions and 371 deletions

View File

@ -978,7 +978,7 @@ void keysScanCallback(void *privdata, void *entry) {
/* This callback is used by scanGenericCommand in order to collect elements
* returned by the dictionary iterator into a list. */
void scanCallback(void *privdata, const dictEntry *de) {
void dictScanCallback(void *privdata, const dictEntry *de) {
scanData *data = (scanData *)privdata;
list *keys = data->keys;
robj *o = data->o;
@ -998,9 +998,7 @@ void scanCallback(void *privdata, const dictEntry *de) {
}
}
if (o->type == OBJ_SET) {
key = keysds;
} else if (o->type == OBJ_HASH) {
if (o->type == OBJ_HASH) {
key = keysds;
if (!data->only_keys) {
val = dictGetVal(de);
@ -1013,13 +1011,33 @@ void scanCallback(void *privdata, const dictEntry *de) {
val = sdsnewlen(buf, len);
}
} else {
serverPanic("Type not handled in SCAN callback.");
serverPanic("Type not handled in dict SCAN callback.");
}
listAddNodeTail(keys, key);
if (val) listAddNodeTail(keys, val);
}
/* Scan callback used by scanGenericCommand when iterating a set stored in a
 * hashtable. 'privdata' is the scanData of the running scan and 'entry' is the
 * set member (an sds string).
 *
 * Every visited entry is counted in data->sampled, also those rejected by the
 * pattern filter. Matching members are appended to data->keys as the very same
 * pointer that lives in the hashtable (no copy is made). */
void hashtableScanCallback(void *privdata, void *entry) {
    scanData *scan = privdata;
    robj *o = scan->o;

    scan->sampled++;

    /* Only hashtable encoded sets are supported by this callback for now. */
    serverAssert(o && o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HASHTABLE);
    sds member = entry; /* For OBJ_SET the entry is the member itself. */

    /* Without a pattern everything matches; otherwise apply the glob filter. */
    int matches = !scan->pattern ||
                  stringmatchlen(scan->pattern, sdslen(scan->pattern), member, sdslen(member), 0);
    if (matches) listAddNodeTail(scan->keys, member);
}
/* Try to parse a SCAN cursor stored at object 'o':
* if the cursor is valid, store it as unsigned integer into *cursor and
* returns C_OK. Otherwise return C_ERR and send an error to the
@ -1083,7 +1101,6 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) {
sds typename = NULL;
long long type = LLONG_MAX;
int patlen = 0, use_pattern = 0, only_keys = 0;
dict *ht;
/* Object must be NULL (to iterate key names), or the type of the object
* must be Set, Sorted Set, or Hash. */
@ -1152,34 +1169,35 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) {
* just return everything inside the object in a single call, setting the
* cursor to zero to signal the end of the iteration. */
/* Handle the case of a hash table. */
ht = NULL;
/* Handle the case of kvstore, dict or hashtable. */
dict *dict_table = NULL;
hashtable *hashtable_table = NULL;
int shallow_copied_list_items = 0;
if (o == NULL) {
ht = NULL;
} else if (o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HT) {
ht = o->ptr;
shallow_copied_list_items = 1;
} else if (o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HASHTABLE) {
hashtable_table = o->ptr;
shallow_copied_list_items = 1;
} else if (o->type == OBJ_HASH && o->encoding == OBJ_ENCODING_HT) {
ht = o->ptr;
dict_table = o->ptr;
shallow_copied_list_items = 1;
} else if (o->type == OBJ_ZSET && o->encoding == OBJ_ENCODING_SKIPLIST) {
zset *zs = o->ptr;
ht = zs->dict;
dict_table = zs->dict;
/* scanning ZSET allocates temporary strings even though it's a dict */
shallow_copied_list_items = 0;
}
list *keys = listCreate();
/* Set a free callback for the contents of the collected keys list.
* For the main keyspace dict, and when we scan a key that's dict encoded
* (we have 'ht'), we don't need to define free method because the strings
* in the list are just a shallow copy from the pointer in the dictEntry.
* When scanning a key with other encodings (e.g. listpack), we need to
* free the temporary strings we add to that list.
* The exception to the above is ZSET, where we do allocate temporary
* strings even when scanning a dict. */
if (o && (!ht || o->type == OBJ_ZSET)) {
/* Set a free callback for the contents of the collected keys list if they
* are deep copied temporary strings. We must not free them if they are just
* a shallow copy - a pointer to the actual data in the data structure */
if (!shallow_copied_list_items) {
listSetFreeMethod(keys, (void (*)(void *))sdsfree);
}
/* For main dictionary scan or data structure using hashtable. */
if (!o || ht) {
/* For main hash table scan or scannable data structure. */
if (!o || dict_table || hashtable_table) {
/* We set the max number of iterations to ten times the specified
* COUNT, so if the hash table is in a pathological state (very
* sparsely populated) we avoid to block too much time at the cost
@ -1188,7 +1206,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) {
/* We pass scanData which have three pointers to the callback:
* 1. data.keys: the list to which it will add new elements;
* 2. data.o: the object containing the dictionary so that
* 2. data.o: the object containing the hash table so that
* it is possible to fetch more data in a type-dependent way;
* 3. data.type: the specified type scan in the db, LLONG_MAX means
* type matching is not needed;
@ -1219,8 +1237,10 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) {
* If cursor is empty, we should try exploring next non-empty slot. */
if (o == NULL) {
cursor = kvstoreScan(c->db->keys, cursor, onlydidx, keysScanCallback, NULL, &data);
} else if (dict_table) {
cursor = dictScan(dict_table, cursor, dictScanCallback, &data);
} else {
cursor = dictScan(ht, cursor, scanCallback, &data);
cursor = hashtableScan(hashtable_table, cursor, hashtableScanCallback, &data);
}
} while (cursor && maxiterations-- && data.sampled < count);
} else if (o->type == OBJ_SET) {

View File

@ -916,30 +916,35 @@ void debugCommand(client *c) {
addReplyVerbatim(c, stats, sdslen(stats), "txt");
sdsfree(stats);
} else if (!strcasecmp(c->argv[1]->ptr, "htstats-key") && c->argc >= 3) {
robj *o;
dict *ht = NULL;
int full = 0;
if (c->argc >= 4 && !strcasecmp(c->argv[3]->ptr, "full")) full = 1;
if ((o = objectCommandLookupOrReply(c, c->argv[2], shared.nokeyerr)) == NULL) return;
robj *o = objectCommandLookupOrReply(c, c->argv[2], shared.nokeyerr);
if (o == NULL) return;
/* Get the hash table reference from the object, if possible. */
/* Get the dict or hashtable reference from the object, if possible. */
dict *d = NULL;
hashtable *ht = NULL;
switch (o->encoding) {
case OBJ_ENCODING_SKIPLIST: {
zset *zs = o->ptr;
ht = zs->dict;
d = zs->dict;
} break;
case OBJ_ENCODING_HT: ht = o->ptr; break;
case OBJ_ENCODING_HT: d = o->ptr; break;
case OBJ_ENCODING_HASHTABLE: ht = o->ptr; break;
}
if (ht == NULL) {
if (d != NULL) {
char buf[4096];
dictGetStats(buf, sizeof(buf), d, full);
addReplyVerbatim(c, buf, strlen(buf), "txt");
} else if (ht != NULL) {
char buf[4096];
hashtableGetStats(buf, sizeof(buf), ht, full);
addReplyVerbatim(c, buf, strlen(buf), "txt");
} else {
addReplyError(c, "The value stored at the specified key is not "
"represented using an hash table");
} else {
char buf[4096];
dictGetStats(buf, sizeof(buf), ht, full);
addReplyVerbatim(c, buf, strlen(buf), "txt");
}
} else if (!strcasecmp(c->argv[1]->ptr, "change-repl-id") && c->argc == 2) {
serverLog(LL_NOTICE, "Changing replication IDs after receiving DEBUG change-repl-id");

View File

@ -34,6 +34,7 @@
*/
#include "server.h"
#include "hashtable.h"
#include "script.h"
#include <stddef.h>
@ -379,6 +380,20 @@ static void activeDefragSdsDict(dict *d, int val_type) {
} while (cursor != 0);
}
/* Defrag scan callback for hashtables whose entries are sds strings.
 * 'entry_ref' points at the slot holding the sds pointer, so the slot can be
 * rewritten in place. activeDefragSds() is asked to defrag the string; a
 * non-NULL result is stored back into the slot, a NULL result leaves the slot
 * untouched. 'privdata' is not used. */
void activeDefragSdsHashtableCallback(void *privdata, void *entry_ref) {
    UNUSED(privdata);
    sds *slot = entry_ref;
    sds moved = activeDefragSds(*slot);
    if (moved) *slot = moved;
}
/* Defrag all sds entries of 'ht' in one go: hashtableScanDefrag() is called
 * repeatedly, feeding back the cursor it returns, until it reports cursor 0.
 * Entries are emitted by reference (HASHTABLE_SCAN_EMIT_REF) so the callback
 * can replace the stored pointers. */
void activeDefragSdsHashtable(hashtable *ht) {
    unsigned long cursor = 0;
    for (;;) {
        cursor = hashtableScanDefrag(ht, cursor, activeDefragSdsHashtableCallback, NULL, activeDefragAlloc,
                                     HASHTABLE_SCAN_EMIT_REF);
        if (cursor == 0) break;
    }
}
/* Defrag a list of ptr, sds or robj string values */
static void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) {
quicklistNode *newnode, *node = *node_ref;
@ -497,11 +512,9 @@ static void scanCallbackCountScanned(void *privdata, const dictEntry *de) {
}
static void scanLaterSet(robj *ob, unsigned long *cursor) {
if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HT) return;
dict *d = ob->ptr;
dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc,
.defragKey = (dictDefragAllocFunction *)activeDefragSds};
*cursor = dictScanDefrag(d, *cursor, scanCallbackCountScanned, &defragfns, NULL);
if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HASHTABLE) return;
hashtable *ht = ob->ptr;
*cursor = hashtableScanDefrag(ht, *cursor, activeDefragSdsHashtableCallback, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF);
}
static void scanLaterHash(robj *ob, unsigned long *cursor) {
@ -560,15 +573,16 @@ static void defragHash(robj *ob) {
}
static void defragSet(robj *ob) {
dict *d, *newd;
serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HT);
d = ob->ptr;
if (dictSize(d) > server.active_defrag_max_scan_fields)
serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HASHTABLE);
hashtable *ht = ob->ptr;
if (hashtableSize(ht) > server.active_defrag_max_scan_fields) {
defragLater(ob);
else
activeDefragSdsDict(d, DEFRAG_SDS_DICT_NO_VAL);
/* defrag the dict struct and tables */
if ((newd = dictDefragTables(ob->ptr))) ob->ptr = newd;
} else {
activeDefragSdsHashtable(ht);
}
/* defrag the hashtable struct and tables */
hashtable *newHashtable = hashtableDefragTables(ht, activeDefragAlloc);
if (newHashtable) ob->ptr = newHashtable;
}
/* Defrag callback for radix tree iterator, called for each node,
@ -766,7 +780,7 @@ static void defragKey(defragKeysCtx *ctx, robj **elemref) {
serverPanic("Unknown list encoding");
}
} else if (ob->type == OBJ_SET) {
if (ob->encoding == OBJ_ENCODING_HT) {
if (ob->encoding == OBJ_ENCODING_HASHTABLE) {
defragSet(ob);
} else if (ob->encoding == OBJ_ENCODING_INTSET || ob->encoding == OBJ_ENCODING_LISTPACK) {
void *newptr, *ptr = ob->ptr;

View File

@ -1023,7 +1023,7 @@ void *hashtableMetadata(hashtable *ht) {
}
/* Returns the number of entries stored. */
size_t hashtableSize(hashtable *ht) {
size_t hashtableSize(const hashtable *ht) {
return ht->used[0] + ht->used[1];
}
@ -1180,6 +1180,14 @@ hashtable *hashtableDefragTables(hashtable *ht, void *(*defragfn)(void *)) {
return ht1;
}
/* Used for releasing memory to OS to avoid unnecessary CoW. Called when we've
 * forked and memory won't be used again. See zmadvise_dontneed().
 *
 * Both bucket tables are handed to zmadvise_dontneed() together with a size
 * hint derived from the table's bucket exponent.
 *
 * NOTE(review): the hint multiplies the bucket count by sizeof(bucket *); if
 * the tables are arrays of bucket structs rather than arrays of pointers, the
 * hint understates the real size - confirm against the table allocation. */
void dismissHashtable(hashtable *ht) {
    int table = 0;
    while (table < 2) {
        zmadvise_dontneed(ht->tables[table], numBuckets(ht->bucket_exp[table]) * sizeof(bucket *));
        table++;
    }
}
/* Returns 1 if an entry was found matching the key. Also points *found to it,
* if found is provided. Returns 0 if no matching entry was found. */
int hashtableFind(hashtable *ht, const void *key, void **found) {

View File

@ -108,7 +108,7 @@ void hashtableRelease(hashtable *ht);
void hashtableEmpty(hashtable *ht, void(callback)(hashtable *));
hashtableType *hashtableGetType(hashtable *ht);
void *hashtableMetadata(hashtable *ht);
size_t hashtableSize(hashtable *ht);
size_t hashtableSize(const hashtable *ht);
size_t hashtableBuckets(hashtable *ht);
size_t hashtableChainedBuckets(hashtable *ht, int table);
size_t hashtableMemUsage(hashtable *ht);
@ -123,6 +123,7 @@ int hashtableTryExpand(hashtable *ht, size_t size);
int hashtableExpandIfNeeded(hashtable *ht);
int hashtableShrinkIfNeeded(hashtable *ht);
hashtable *hashtableDefragTables(hashtable *ht, void *(*defragfn)(void *));
void dismissHashtable(hashtable *ht);
/* Entries */
int hashtableFind(hashtable *ht, const void *key, void **found);

View File

@ -116,9 +116,9 @@ size_t lazyfreeGetFreeEffort(robj *key, robj *obj, int dbid) {
if (obj->type == OBJ_LIST && obj->encoding == OBJ_ENCODING_QUICKLIST) {
quicklist *ql = obj->ptr;
return ql->len;
} else if (obj->type == OBJ_SET && obj->encoding == OBJ_ENCODING_HT) {
dict *ht = obj->ptr;
return dictSize(ht);
} else if (obj->type == OBJ_SET && obj->encoding == OBJ_ENCODING_HASHTABLE) {
hashtable *ht = obj->ptr;
return hashtableSize(ht);
} else if (obj->type == OBJ_ZSET && obj->encoding == OBJ_ENCODING_SKIPLIST) {
zset *zs = obj->ptr;
return zs->zsl->length;

View File

@ -11017,20 +11017,20 @@ typedef struct {
ValkeyModuleScanKeyCB fn;
} ScanKeyCBData;
static void moduleScanKeyCallback(void *privdata, const dictEntry *de) {
static void moduleScanKeyDictCallback(void *privdata, const dictEntry *de) {
ScanKeyCBData *data = privdata;
sds key = dictGetKey(de);
robj *o = data->key->value;
robj *field = createStringObject(key, sdslen(key));
robj *value = NULL;
if (o->type == OBJ_SET) {
value = NULL;
} else if (o->type == OBJ_HASH) {
if (o->type == OBJ_HASH) {
sds val = dictGetVal(de);
value = createStringObject(val, sdslen(val));
} else if (o->type == OBJ_ZSET) {
double *val = (double *)dictGetVal(de);
value = createStringObjectFromLongDouble(*val, 0);
} else {
serverPanic("unexpected object type");
}
data->fn(data->key, field, value, data->user_data);
@ -11038,6 +11038,17 @@ static void moduleScanKeyCallback(void *privdata, const dictEntry *de) {
if (value) decrRefCount(value);
}
/* hashtableScan callback used by VM_ScanKey for sets stored in a hashtable.
 * 'privdata' is the ScanKeyCBData of the scan and 'entry' is the set member
 * (an sds string). The member is wrapped in a temporary string object, passed
 * to the module callback as the field with a NULL value, and the temporary
 * object's reference is dropped afterwards. */
static void moduleScanKeyHashtableCallback(void *privdata, void *entry) {
    ScanKeyCBData *ctx = privdata;
    robj *o = ctx->key->value;
    serverAssert(o->type == OBJ_SET);

    sds member = entry;
    robj *field = createStringObject(member, sdslen(member));

    /* Sets have no values, so the module callback always gets NULL. */
    ctx->fn(ctx->key, field, NULL, ctx->user_data);

    decrRefCount(field);
}
/* Scan api that allows a module to scan the elements in a hash, set or sorted set key
*
* Callback for scan implementation.
@ -11091,14 +11102,15 @@ int VM_ScanKey(ValkeyModuleKey *key, ValkeyModuleScanCursor *cursor, ValkeyModul
errno = EINVAL;
return 0;
}
dict *ht = NULL;
dict *d = NULL;
hashtable *ht = NULL;
robj *o = key->value;
if (o->type == OBJ_SET) {
if (o->encoding == OBJ_ENCODING_HT) ht = o->ptr;
if (o->encoding == OBJ_ENCODING_HASHTABLE) ht = o->ptr;
} else if (o->type == OBJ_HASH) {
if (o->encoding == OBJ_ENCODING_HT) ht = o->ptr;
if (o->encoding == OBJ_ENCODING_HT) d = o->ptr;
} else if (o->type == OBJ_ZSET) {
if (o->encoding == OBJ_ENCODING_SKIPLIST) ht = ((zset *)o->ptr)->dict;
if (o->encoding == OBJ_ENCODING_SKIPLIST) d = ((zset *)o->ptr)->dict;
} else {
errno = EINVAL;
return 0;
@ -11108,9 +11120,16 @@ int VM_ScanKey(ValkeyModuleKey *key, ValkeyModuleScanCursor *cursor, ValkeyModul
return 0;
}
int ret = 1;
if (ht) {
if (d) {
ScanKeyCBData data = {key, privdata, fn};
cursor->cursor = dictScan(ht, cursor->cursor, moduleScanKeyCallback, &data);
cursor->cursor = dictScan(d, cursor->cursor, moduleScanKeyDictCallback, &data);
if (cursor->cursor == 0) {
cursor->done = 1;
ret = 0;
}
} else if (ht) {
ScanKeyCBData data = {key, privdata, fn};
cursor->cursor = hashtableScan(ht, cursor->cursor, moduleScanKeyHashtableCallback, &data);
if (cursor->cursor == 0) {
cursor->done = 1;
ret = 0;

View File

@ -429,9 +429,9 @@ robj *createListListpackObject(void) {
}
robj *createSetObject(void) {
dict *d = dictCreate(&setDictType);
robj *o = createObject(OBJ_SET, d);
o->encoding = OBJ_ENCODING_HT;
hashtable *ht = hashtableCreate(&setHashtableType);
robj *o = createObject(OBJ_SET, ht);
o->encoding = OBJ_ENCODING_HASHTABLE;
return o;
}
@ -506,7 +506,7 @@ void freeListObject(robj *o) {
void freeSetObject(robj *o) {
switch (o->encoding) {
case OBJ_ENCODING_HT: dictRelease((dict *)o->ptr); break;
case OBJ_ENCODING_HASHTABLE: hashtableRelease((hashtable *)o->ptr); break;
case OBJ_ENCODING_INTSET:
case OBJ_ENCODING_LISTPACK: zfree(o->ptr); break;
default: serverPanic("Unknown set encoding type");
@ -622,23 +622,23 @@ void dismissListObject(robj *o, size_t size_hint) {
/* See dismissObject() */
void dismissSetObject(robj *o, size_t size_hint) {
if (o->encoding == OBJ_ENCODING_HT) {
dict *set = o->ptr;
serverAssert(dictSize(set) != 0);
if (o->encoding == OBJ_ENCODING_HASHTABLE) {
hashtable *ht = o->ptr;
serverAssert(hashtableSize(ht) != 0);
/* We iterate all nodes only when average member size is bigger than a
* page size, and there's a high chance we'll actually dismiss something. */
if (size_hint / dictSize(set) >= server.page_size) {
dictEntry *de;
dictIterator *di = dictGetIterator(set);
while ((de = dictNext(di)) != NULL) {
dismissSds(dictGetKey(de));
if (size_hint / hashtableSize(ht) >= server.page_size) {
hashtableIterator iter;
hashtableInitIterator(&iter, ht);
void *next;
while (hashtableNext(&iter, &next)) {
sds item = next;
dismissSds(item);
}
dictReleaseIterator(di);
hashtableResetIterator(&iter);
}
/* Dismiss hash table memory. */
dismissMemory(set->ht_table[0], DICTHT_SIZE(set->ht_size_exp[0]) * sizeof(dictEntry *));
dismissMemory(set->ht_table[1], DICTHT_SIZE(set->ht_size_exp[1]) * sizeof(dictEntry *));
dismissHashtable(ht);
} else if (o->encoding == OBJ_ENCODING_INTSET) {
dismissMemory(o->ptr, intsetBlobLen((intset *)o->ptr));
} else if (o->encoding == OBJ_ENCODING_LISTPACK) {
@ -728,7 +728,7 @@ void dismissStreamObject(robj *o, size_t size_hint) {
* modifies any keys due to write traffic, it'll cause CoW which consume
* physical memory. In the child process, after serializing the key and value,
* the data is definitely not accessed again, so to avoid unnecessary CoW, we
* try to release their memory back to OS. see dismissMemory().
* try to release their memory back to OS. see zmadvise_dontneed().
*
* Because of the cost of iterating all node/field/member/entry of complex data
* types, we iterate and dismiss them only when approximate average we estimate
@ -1109,6 +1109,7 @@ char *strEncoding(int encoding) {
case OBJ_ENCODING_RAW: return "raw";
case OBJ_ENCODING_INT: return "int";
case OBJ_ENCODING_HT: return "hashtable";
case OBJ_ENCODING_HASHTABLE: return "hashtable";
case OBJ_ENCODING_QUICKLIST: return "quicklist";
case OBJ_ENCODING_LISTPACK: return "listpack";
case OBJ_ENCODING_INTSET: return "intset";
@ -1160,17 +1161,20 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) {
serverPanic("Unknown list encoding");
}
} else if (o->type == OBJ_SET) {
if (o->encoding == OBJ_ENCODING_HT) {
d = o->ptr;
di = dictGetIterator(d);
asize = sizeof(*o) + sizeof(dict) + (sizeof(struct dictEntry *) * dictBuckets(d));
while ((de = dictNext(di)) != NULL && samples < sample_size) {
ele = dictGetKey(de);
elesize += dictEntryMemUsage(de) + sdsAllocSize(ele);
if (o->encoding == OBJ_ENCODING_HASHTABLE) {
hashtable *ht = o->ptr;
asize = sizeof(*o) + hashtableMemUsage(ht);
hashtableIterator iter;
hashtableInitIterator(&iter, ht);
void *next;
while (hashtableNext(&iter, &next) && samples < sample_size) {
sds element = next;
elesize += sdsAllocSize(element);
samples++;
}
dictReleaseIterator(di);
if (samples) asize += (double)elesize / samples * dictSize(d);
hashtableResetIterator(&iter);
if (samples) asize += (double)elesize / samples * hashtableSize(ht);
} else if (o->encoding == OBJ_ENCODING_INTSET) {
asize = sizeof(*o) + zmalloc_size(o->ptr);
} else if (o->encoding == OBJ_ENCODING_LISTPACK) {

View File

@ -692,7 +692,7 @@ int rdbSaveObjectType(rio *rdb, robj *o) {
case OBJ_SET:
if (o->encoding == OBJ_ENCODING_INTSET)
return rdbSaveType(rdb, RDB_TYPE_SET_INTSET);
else if (o->encoding == OBJ_ENCODING_HT)
else if (o->encoding == OBJ_ENCODING_HASHTABLE)
return rdbSaveType(rdb, RDB_TYPE_SET);
else if (o->encoding == OBJ_ENCODING_LISTPACK)
return rdbSaveType(rdb, RDB_TYPE_SET_LISTPACK);
@ -876,26 +876,26 @@ ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid) {
}
} else if (o->type == OBJ_SET) {
/* Save a set value */
if (o->encoding == OBJ_ENCODING_HT) {
dict *set = o->ptr;
dictIterator *di = dictGetIterator(set);
dictEntry *de;
if (o->encoding == OBJ_ENCODING_HASHTABLE) {
hashtable *set = o->ptr;
if ((n = rdbSaveLen(rdb, dictSize(set))) == -1) {
dictReleaseIterator(di);
if ((n = rdbSaveLen(rdb, hashtableSize(set))) == -1) {
return -1;
}
nwritten += n;
while ((de = dictNext(di)) != NULL) {
sds ele = dictGetKey(de);
hashtableIterator iterator;
hashtableInitIterator(&iterator, set);
void *next;
while (hashtableNext(&iterator, &next)) {
sds ele = next;
if ((n = rdbSaveRawString(rdb, (unsigned char *)ele, sdslen(ele))) == -1) {
dictReleaseIterator(di);
hashtableResetIterator(&iterator);
return -1;
}
nwritten += n;
}
dictReleaseIterator(di);
hashtableResetIterator(&iterator);
} else if (o->encoding == OBJ_ENCODING_INTSET) {
size_t l = intsetBlobLen((intset *)o->ptr);
@ -1909,8 +1909,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) {
o = createSetObject();
/* It's faster to expand the hashtable to the right size asap in order
* to avoid rehashing */
if (len > DICT_HT_INITIAL_SIZE && dictTryExpand(o->ptr, len) != DICT_OK) {
rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len);
if (!hashtableTryExpand(o->ptr, len)) {
rdbReportCorruptRDB("OOM in hashtableTryExpand %llu", (unsigned long long)len);
decrRefCount(o);
return NULL;
}
@ -1949,8 +1949,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) {
* of many small ones. It's OK since lpSafeToAdd doesn't
* care about individual elements, only the total size. */
setTypeConvert(o, OBJ_ENCODING_LISTPACK);
} else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HT, len, 0) != C_OK) {
rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len);
} else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HASHTABLE, len, 0) != C_OK) {
rdbReportCorruptRDB("OOM in hashtableTryExpand %llu", (unsigned long long)len);
sdsfree(sdsele);
decrRefCount(o);
return NULL;
@ -1970,8 +1970,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) {
return NULL;
}
o->ptr = lpAppend(o->ptr, (unsigned char *)sdsele, elelen);
} else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HT, len, 0) != C_OK) {
rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len);
} else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HASHTABLE, len, 0) != C_OK) {
rdbReportCorruptRDB("OOM in hashtableTryExpand %llu", (unsigned long long)len);
sdsfree(sdsele);
decrRefCount(o);
return NULL;
@ -1980,8 +1980,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) {
/* This will also be called when the set was just converted
* to a regular hash table encoded set. */
if (o->encoding == OBJ_ENCODING_HT) {
if (dictAdd((dict *)o->ptr, sdsele, NULL) != DICT_OK) {
if (o->encoding == OBJ_ENCODING_HASHTABLE) {
if (!hashtableAdd((hashtable *)o->ptr, sdsele)) {
rdbReportCorruptRDB("Duplicate set members detected");
decrRefCount(o);
sdsfree(sdsele);
@ -2356,7 +2356,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) {
}
o->type = OBJ_SET;
o->encoding = OBJ_ENCODING_INTSET;
if (intsetLen(o->ptr) > server.set_max_intset_entries) setTypeConvert(o, OBJ_ENCODING_HT);
if (intsetLen(o->ptr) > server.set_max_intset_entries) setTypeConvert(o, OBJ_ENCODING_HASHTABLE);
break;
case RDB_TYPE_SET_LISTPACK:
if (deep_integrity_validation) server.stat_dump_payload_sanitizations++;
@ -2376,7 +2376,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) {
decrRefCount(o);
goto emptykey;
}
if (setTypeSize(o) > server.set_max_listpack_entries) setTypeConvert(o, OBJ_ENCODING_HT);
if (setTypeSize(o) > server.set_max_listpack_entries) setTypeConvert(o, OBJ_ENCODING_HASHTABLE);
break;
case RDB_TYPE_ZSET_ZIPLIST: {
unsigned char *lp = lpNew(encoded_len);

View File

@ -372,6 +372,7 @@ void dictDictDestructor(void *val) {
dictRelease((dict *)val);
}
/* Returns 1 when keys match */
int dictSdsKeyCompare(const void *key1, const void *key2) {
int l1, l2;
l1 = sdslen((sds)key1);
@ -380,6 +381,12 @@ int dictSdsKeyCompare(const void *key1, const void *key2) {
return memcmp(key1, key2, l1) == 0;
}
/* Key comparison callback for hashtables keyed by sds strings.
 * Returns 0 when the two keys are equal and 1 otherwise. Note that this is
 * the opposite convention of dictSdsKeyCompare, which returns 1 on a match. */
int hashtableSdsKeyCompare(const void *key1, const void *key2) {
    const sds a = (const sds)key1;
    const sds b = (const sds)key2;

    /* Different lengths can never be equal; skip the content comparison. */
    if (sdslen(a) != sdslen(b)) return 1;
    return sdscmp(a, b) != 0;
}
size_t dictSdsEmbedKey(unsigned char *buf, size_t buf_len, const void *key, uint8_t *key_offset) {
return sdscopytobuffer(buf, buf_len, (sds)key, key_offset);
}
@ -542,17 +549,11 @@ dictType objectKeyHeapPointerValueDictType = {
NULL /* allow to expand */
};
/* Set dictionary type. Keys are SDS strings, values are not used. */
dictType setDictType = {
dictSdsHash, /* hash function */
NULL, /* key dup */
dictSdsKeyCompare, /* key compare */
dictSdsDestructor, /* key destructor */
NULL, /* val destructor */
NULL, /* allow to expand */
.no_value = 1, /* no values in this dict */
.keys_are_odd = 1 /* an SDS string is always an odd pointer */
};
/* Set hashtable type, used by createSetObject(). Each entry is the SDS string
 * of one set member; there is no separate value. Members are hashed with
 * dictSdsHash and compared with hashtableSdsKeyCompare (0 means equal).
 * dictSdsDestructor is registered as the entry destructor - presumably it is
 * invoked when an entry is removed or the table is released; confirm against
 * the hashtable implementation. */
hashtableType setHashtableType = {
.hashFunction = dictSdsHash,
.keyCompare = hashtableSdsKeyCompare,
.entryDestructor = dictSdsDestructor};
/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
dictType zsetDictType = {
@ -572,11 +573,6 @@ const void *hashtableObjectGetKey(const void *entry) {
return objectGetKey(entry);
}
int hashtableSdsKeyCompare(const void *key1, const void *key2) {
const sds sds1 = (const sds)key1, sds2 = (const sds)key2;
return sdslen(sds1) != sdslen(sds2) || sdscmp(sds1, sds2);
}
int hashtableObjKeyCompare(const void *key1, const void *key2) {
const robj *o1 = key1, *o2 = key2;
return hashtableSdsKeyCompare(o1->ptr, o2->ptr);
@ -645,6 +641,11 @@ dictType sdsReplyDictType = {
NULL /* allow to expand */
};
/* Hashtable type for SDS entries with no entry destructor registered.
 * NOTE(review): hashing uses dictSdsCaseHash while comparison uses
 * hashtableSdsKeyCompare; if the hash is case-insensitive (as its name
 * suggests), lookups still distinguish case because of the comparison -
 * confirm this pairing is intended. */
hashtableType sdsReplyHashtableType = {
.hashFunction = dictSdsCaseHash,
.keyCompare = hashtableSdsKeyCompare};
/* Keylist hash table type has unencoded Objects as keys and
* lists as values. It's used for blocking operations (BLPOP) and to
* map swapped keys to a list of clients waiting for this keys to be loaded. */
@ -6521,27 +6522,7 @@ void sendChildInfo(childInfoType info_type, size_t keys, char *pname) {
sendChildInfoGeneric(info_type, keys, -1, pname);
}
/* Try to release pages back to the OS directly (bypassing the allocator),
* in an effort to decrease CoW during fork. For small allocations, we can't
* release any full page, so in an effort to avoid getting the size of the
* allocation from the allocator (malloc_size) when we already know it's small,
* we check the size_hint. If the size is not already known, passing a size_hint
* of 0 will lead the checking the real size of the allocation.
* Also please note that the size may be not accurate, so in order to make this
* solution effective, the judgement for releasing memory pages should not be
* too strict. */
void dismissMemory(void *ptr, size_t size_hint) {
if (ptr == NULL) return;
/* madvise(MADV_DONTNEED) can not release pages if the size of memory
* is too small, we try to release only for the memory which the size
* is more than half of page size. */
if (size_hint && size_hint <= server.page_size / 2) return;
zmadvise_dontneed(ptr);
}
/* Dismiss big chunks of memory inside a client structure, see dismissMemory() */
/* Dismiss big chunks of memory inside a client structure, see zmadvise_dontneed() */
void dismissClientMemory(client *c) {
/* Dismiss client query buffer and static reply buffer. */
dismissMemory(c->buf, c->buf_usable_size);
@ -6572,7 +6553,7 @@ void dismissClientMemory(client *c) {
/* In the child process, we don't need some buffers anymore, and these are
* likely to change in the parent when there's heavy write traffic.
* We dismiss them right away, to avoid CoW.
* see dismissMemory(). */
* see zmadvise_dontneed(). */
void dismissMemoryInChild(void) {
/* madvise(MADV_DONTNEED) may not work if Transparent Huge Pages is enabled. */
if (server.thp_enabled) return;

View File

@ -83,6 +83,8 @@ typedef long long ustime_t; /* microsecond time type. */
#include "connection.h" /* Connection abstraction */
#include "memory_prefetch.h"
#define dismissMemory zmadvise_dontneed
#define VALKEYMODULE_CORE 1
typedef struct serverObject robj;
#include "valkeymodule.h" /* Modules API defines. */
@ -873,6 +875,7 @@ struct ValkeyModuleDigest {
#define OBJ_ENCODING_QUICKLIST 9 /* Encoded as linked list of listpacks */
#define OBJ_ENCODING_STREAM 10 /* Encoded as a radix tree of listpacks */
#define OBJ_ENCODING_LISTPACK 11 /* Encoded as a listpack */
#define OBJ_ENCODING_HASHTABLE 12 /* Encoded as a hashtable */
#define LRU_BITS 24
#define LRU_CLOCK_MAX ((1 << LRU_BITS) - 1) /* Max value of obj->lru */
@ -2634,7 +2637,7 @@ typedef struct {
robj *subject;
int encoding;
int ii; /* intset iterator */
dictIterator *di;
hashtableIterator *hashtable_iterator;
unsigned char *lpi; /* listpack iterator */
} setTypeIterator;
@ -2665,7 +2668,7 @@ extern struct valkeyServer server;
extern struct sharedObjectsStruct shared;
extern dictType objectKeyPointerValueDictType;
extern dictType objectKeyHeapPointerValueDictType;
extern dictType setDictType;
extern hashtableType setHashtableType;
extern dictType BenchmarkDictType;
extern dictType zsetDictType;
extern hashtableType kvstoreKeysHashtableType;
@ -2680,6 +2683,7 @@ extern dictType objToDictDictType;
extern hashtableType kvstoreChannelHashtableType;
extern dictType modulesDictType;
extern dictType sdsReplyDictType;
extern hashtableType sdsReplyHashtableType;
extern dictType keylistDictType;
extern dict *modules;
@ -3374,7 +3378,6 @@ void rejectCommandFormat(client *c, const char *fmt, ...);
void *activeDefragAlloc(void *ptr);
robj *activeDefragStringOb(robj *ob);
void dismissSds(sds s);
void dismissMemory(void *ptr, size_t size_hint);
void dismissMemoryInChild(void);
#define RESTART_SERVER_NONE 0

View File

@ -28,6 +28,7 @@
*/
#include "server.h"
#include "hashtable.h"
#include "intset.h" /* Compact integer set structure */
/*-----------------------------------------------------------------------------
@ -50,7 +51,7 @@ robj *setTypeCreate(sds value, size_t size_hint) {
/* We may oversize the set by using the hint if the hint is not accurate,
* but we will assume this is acceptable to maximize performance. */
robj *o = createSetObject();
dictExpand(o->ptr, size_hint);
hashtableExpand(o->ptr, size_hint);
return o;
}
@ -59,7 +60,7 @@ robj *setTypeCreate(sds value, size_t size_hint) {
void setTypeMaybeConvert(robj *set, size_t size_hint) {
if ((set->encoding == OBJ_ENCODING_LISTPACK && size_hint > server.set_max_listpack_entries) ||
(set->encoding == OBJ_ENCODING_INTSET && size_hint > server.set_max_intset_entries)) {
setTypeConvertAndExpand(set, OBJ_ENCODING_HT, size_hint, 1);
setTypeConvertAndExpand(set, OBJ_ENCODING_HASHTABLE, size_hint, 1);
}
}
@ -74,7 +75,7 @@ static size_t intsetMaxEntries(void) {
/* Converts intset to a hashtable encoded set if it contains too many entries. */
static void maybeConvertIntset(robj *subject) {
serverAssert(subject->encoding == OBJ_ENCODING_INTSET);
if (intsetLen(subject->ptr) > intsetMaxEntries()) setTypeConvert(subject, OBJ_ENCODING_HT);
if (intsetLen(subject->ptr) > intsetMaxEntries()) setTypeConvert(subject, OBJ_ENCODING_HASHTABLE);
}
/* When you know all set elements are integers, call this to convert the set to
@ -91,7 +92,7 @@ static void maybeConvertToIntset(robj *set) {
while (setTypeNext(si, &str, &len, &llval) != -1) {
if (str) {
/* If the element is returned as a string, we may be able to convert
* it to integer. This happens for OBJ_ENCODING_HT. */
* it to integer. This happens for OBJ_ENCODING_HASHTABLE. */
serverAssert(string2ll(str, len, (long long *)&llval));
}
uint8_t success = 0;
@ -134,20 +135,21 @@ int setTypeAddAux(robj *set, char *str, size_t len, int64_t llval, int str_is_sd
}
serverAssert(str);
if (set->encoding == OBJ_ENCODING_HT) {
if (set->encoding == OBJ_ENCODING_HASHTABLE) {
/* Avoid duping the string if it is an sds string. */
sds sdsval = str_is_sds ? (sds)str : sdsnewlen(str, len);
dict *ht = set->ptr;
void *position = dictFindPositionForInsert(ht, sdsval, NULL);
if (position) {
hashtable *ht = set->ptr;
hashtablePosition position;
if (hashtableFindPositionForInsert(ht, sdsval, &position, NULL)) {
/* Key doesn't already exist in the set. Add it but dup the key. */
if (sdsval == str) sdsval = sdsdup(sdsval);
dictInsertAtPosition(ht, sdsval, position);
hashtableInsertAtPosition(ht, sdsval, &position);
return 1;
} else if (sdsval != str) {
/* String is already a member. Free our temporary sds copy. */
sdsfree(sdsval);
return 0;
}
return (position != NULL);
} else if (set->encoding == OBJ_ENCODING_LISTPACK) {
unsigned char *lp = set->ptr;
unsigned char *p = lpFirst(lp);
@ -166,8 +168,8 @@ int setTypeAddAux(robj *set, char *str, size_t len, int64_t llval, int str_is_sd
set->ptr = lp;
} else {
/* Size limit is reached. Convert to hashtable and add. */
setTypeConvertAndExpand(set, OBJ_ENCODING_HT, lpLength(lp) + 1, 1);
serverAssert(dictAdd(set->ptr, sdsnewlen(str, len), NULL) == DICT_OK);
setTypeConvertAndExpand(set, OBJ_ENCODING_HASHTABLE, lpLength(lp) + 1, 1);
serverAssert(hashtableAdd(set->ptr, sdsnewlen(str, len)));
}
return 1;
}
@ -204,10 +206,10 @@ int setTypeAddAux(robj *set, char *str, size_t len, int64_t llval, int str_is_sd
set->ptr = lp;
return 1;
} else {
setTypeConvertAndExpand(set, OBJ_ENCODING_HT, intsetLen(set->ptr) + 1, 1);
setTypeConvertAndExpand(set, OBJ_ENCODING_HASHTABLE, intsetLen(set->ptr) + 1, 1);
/* The set *was* an intset and this value is not integer
* encodable, so dictAdd should always work. */
serverAssert(dictAdd(set->ptr, sdsnewlen(str, len), NULL) == DICT_OK);
* encodable, so hashtableAdd should always work. */
serverAssert(hashtableAdd(set->ptr, sdsnewlen(str, len)));
return 1;
}
}
@ -242,9 +244,9 @@ int setTypeRemoveAux(robj *setobj, char *str, size_t len, int64_t llval, int str
str_is_sds = 0;
}
if (setobj->encoding == OBJ_ENCODING_HT) {
if (setobj->encoding == OBJ_ENCODING_HASHTABLE) {
sds sdsval = str_is_sds ? (sds)str : sdsnewlen(str, len);
int deleted = (dictDelete(setobj->ptr, sdsval) == DICT_OK);
int deleted = hashtableDelete(setobj->ptr, sdsval);
if (sdsval != str) sdsfree(sdsval); /* free temp copy */
return deleted;
} else if (setobj->encoding == OBJ_ENCODING_LISTPACK) {
@ -298,11 +300,11 @@ int setTypeIsMemberAux(robj *set, char *str, size_t len, int64_t llval, int str_
} else if (set->encoding == OBJ_ENCODING_INTSET) {
long long llval;
return string2ll(str, len, &llval) && intsetFind(set->ptr, llval);
} else if (set->encoding == OBJ_ENCODING_HT && str_is_sds) {
return dictFind(set->ptr, (sds)str) != NULL;
} else if (set->encoding == OBJ_ENCODING_HT) {
} else if (set->encoding == OBJ_ENCODING_HASHTABLE && str_is_sds) {
return hashtableFind(set->ptr, (sds)str, NULL);
} else if (set->encoding == OBJ_ENCODING_HASHTABLE) {
sds sdsval = sdsnewlen(str, len);
int result = dictFind(set->ptr, sdsval) != NULL;
int result = hashtableFind(set->ptr, sdsval, NULL);
sdsfree(sdsval);
return result;
} else {
@ -314,8 +316,8 @@ setTypeIterator *setTypeInitIterator(robj *subject) {
setTypeIterator *si = zmalloc(sizeof(setTypeIterator));
si->subject = subject;
si->encoding = subject->encoding;
if (si->encoding == OBJ_ENCODING_HT) {
si->di = dictGetIterator(subject->ptr);
if (si->encoding == OBJ_ENCODING_HASHTABLE) {
si->hashtable_iterator = hashtableCreateIterator(subject->ptr);
} else if (si->encoding == OBJ_ENCODING_INTSET) {
si->ii = 0;
} else if (si->encoding == OBJ_ENCODING_LISTPACK) {
@ -327,7 +329,7 @@ setTypeIterator *setTypeInitIterator(robj *subject) {
}
void setTypeReleaseIterator(setTypeIterator *si) {
if (si->encoding == OBJ_ENCODING_HT) dictReleaseIterator(si->di);
if (si->encoding == OBJ_ENCODING_HASHTABLE) hashtableReleaseIterator(si->hashtable_iterator);
zfree(si);
}
@ -340,7 +342,7 @@ void setTypeReleaseIterator(setTypeIterator *si) {
* (str and len) or (llele) depending on whether the value is stored as a string
* or as an integer internally.
*
* If OBJ_ENCODING_HT is returned, then str points to an sds string and can be
* If OBJ_ENCODING_HASHTABLE is returned, then str points to an sds string and can be
* used as such. If OBJ_ENCODING_INTSET, then llele is populated and str is
* pointed to NULL. If OBJ_ENCODING_LISTPACK is returned, the value can be
* either a string or an integer. If *str is not NULL, then str and len are
@ -353,10 +355,10 @@ void setTypeReleaseIterator(setTypeIterator *si) {
*
* When there are no more elements -1 is returned. */
int setTypeNext(setTypeIterator *si, char **str, size_t *len, int64_t *llele) {
if (si->encoding == OBJ_ENCODING_HT) {
dictEntry *de = dictNext(si->di);
if (de == NULL) return -1;
*str = dictGetKey(de);
if (si->encoding == OBJ_ENCODING_HASHTABLE) {
void *next;
if (!hashtableNext(si->hashtable_iterator, &next)) return -1;
*str = next;
*len = sdslen(*str);
*llele = -123456789; /* Not needed. Defensive. */
} else if (si->encoding == OBJ_ENCODING_INTSET) {
@ -406,15 +408,16 @@ sds setTypeNextObject(setTypeIterator *si) {
* object. The return value of the function is the object->encoding
* field of the object and can be used by the caller to check if the
* int64_t pointer or the str and len pointers were populated, as for
* setTypeNext. If OBJ_ENCODING_HT is returned, str is pointed to a
* setTypeNext. If OBJ_ENCODING_HASHTABLE is returned, str is pointed to a
* string which is actually an sds string and it can be used as such.
*
* Note that all of the str, len and llele pointers should be passed and cannot
* be NULL. If str is set to NULL, the value is an integer stored in llele. */
int setTypeRandomElement(robj *setobj, char **str, size_t *len, int64_t *llele) {
if (setobj->encoding == OBJ_ENCODING_HT) {
dictEntry *de = dictGetFairRandomKey(setobj->ptr);
*str = dictGetKey(de);
if (setobj->encoding == OBJ_ENCODING_HASHTABLE) {
void *entry = NULL;
hashtableFairRandomEntry(setobj->ptr, &entry);
*str = entry;
*len = sdslen(*str);
*llele = -123456789; /* Not needed. Defensive. */
} else if (setobj->encoding == OBJ_ENCODING_INTSET) {
@ -457,14 +460,14 @@ robj *setTypePopRandom(robj *set) {
obj = createStringObject(str, len);
else
obj = createStringObjectFromLongLong(llele);
setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HT);
setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HASHTABLE);
}
return obj;
}
unsigned long setTypeSize(const robj *subject) {
if (subject->encoding == OBJ_ENCODING_HT) {
return dictSize((const dict *)subject->ptr);
if (subject->encoding == OBJ_ENCODING_HASHTABLE) {
return hashtableSize((const hashtable *)subject->ptr);
} else if (subject->encoding == OBJ_ENCODING_INTSET) {
return intsetLen((const intset *)subject->ptr);
} else if (subject->encoding == OBJ_ENCODING_LISTPACK) {
@ -474,7 +477,7 @@ unsigned long setTypeSize(const robj *subject) {
}
}
/* Convert the set to specified encoding. The resulting dict (when converting
/* Convert the set to specified encoding. The resulting hashtable (when converting
* to a hash table) is presized to hold the number of elements in the original
* set. */
void setTypeConvert(robj *setobj, int enc) {
@ -489,28 +492,28 @@ int setTypeConvertAndExpand(robj *setobj, int enc, unsigned long cap, int panic)
setTypeIterator *si;
serverAssertWithInfo(NULL, setobj, setobj->type == OBJ_SET && setobj->encoding != enc);
if (enc == OBJ_ENCODING_HT) {
dict *d = dictCreate(&setDictType);
if (enc == OBJ_ENCODING_HASHTABLE) {
hashtable *ht = hashtableCreate(&setHashtableType);
sds element;
/* Presize the dict to avoid rehashing */
/* Presize the hashtable to avoid rehashing */
if (panic) {
dictExpand(d, cap);
} else if (dictTryExpand(d, cap) != DICT_OK) {
dictRelease(d);
hashtableExpand(ht, cap);
} else if (!hashtableTryExpand(ht, cap)) {
hashtableRelease(ht);
return C_ERR;
}
/* To add the elements we extract integers and create Objects */
si = setTypeInitIterator(setobj);
while ((element = setTypeNextObject(si)) != NULL) {
serverAssert(dictAdd(d, element, NULL) == DICT_OK);
serverAssert(hashtableAdd(ht, element));
}
setTypeReleaseIterator(si);
freeSetObject(setobj); /* frees the internals but not setobj itself */
setobj->encoding = OBJ_ENCODING_HT;
setobj->ptr = d;
setobj->encoding = OBJ_ENCODING_HASHTABLE;
setobj->ptr = ht;
} else if (enc == OBJ_ENCODING_LISTPACK) {
/* Preallocate the minimum two bytes per element (enc/value + backlen) */
size_t estcap = cap * 2;
@ -568,10 +571,10 @@ robj *setTypeDup(robj *o) {
memcpy(new_lp, lp, sz);
set = createObject(OBJ_SET, new_lp);
set->encoding = OBJ_ENCODING_LISTPACK;
} else if (o->encoding == OBJ_ENCODING_HT) {
} else if (o->encoding == OBJ_ENCODING_HASHTABLE) {
set = createSetObject();
dict *d = o->ptr;
dictExpand(set->ptr, dictSize(d));
hashtable *ht = o->ptr;
hashtableExpand(set->ptr, hashtableSize(ht));
si = setTypeInitIterator(o);
char *str;
size_t len;
@ -891,8 +894,8 @@ void spopWithCountCommand(client *c) {
if (!newset) {
newset = str ? createSetListpackObject() : createIntsetObject();
}
setTypeAddAux(newset, str, len, llele, encoding == OBJ_ENCODING_HT);
setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HT);
setTypeAddAux(newset, str, len, llele, encoding == OBJ_ENCODING_HASHTABLE);
setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HASHTABLE);
}
}
@ -1001,8 +1004,6 @@ void srandmemberWithCountCommand(client *c) {
size_t len;
int64_t llele;
dict *d;
if (getRangeLongFromObjectOrReply(c, c->argv[2], -LONG_MAX, LONG_MAX, &l, NULL) != C_OK) return;
if (l >= 0) {
count = (unsigned long)l;
@ -1111,8 +1112,8 @@ void srandmemberWithCountCommand(client *c) {
return;
}
/* For CASE 3 and CASE 4 we need an auxiliary dictionary. */
d = dictCreate(&sdsReplyDictType);
/* For CASE 3 and CASE 4 we need an auxiliary hashtable. */
hashtable *ht = hashtableCreate(&sdsReplyHashtableType);
/* CASE 3:
* The number of elements inside the set is not greater than
@ -1126,29 +1127,25 @@ void srandmemberWithCountCommand(client *c) {
if (count * SRANDMEMBER_SUB_STRATEGY_MUL > size) {
setTypeIterator *si;
/* Add all the elements into the temporary dictionary. */
/* Add all the elements into the temporary hashtable. */
si = setTypeInitIterator(set);
dictExpand(d, size);
hashtableExpand(ht, size);
while (setTypeNext(si, &str, &len, &llele) != -1) {
int retval = DICT_ERR;
if (str == NULL) {
retval = dictAdd(d, sdsfromlonglong(llele), NULL);
serverAssert(hashtableAdd(ht, (void *)sdsfromlonglong(llele)));
} else {
retval = dictAdd(d, sdsnewlen(str, len), NULL);
serverAssert(hashtableAdd(ht, (void *)sdsnewlen(str, len)));
}
serverAssert(retval == DICT_OK);
}
setTypeReleaseIterator(si);
serverAssert(dictSize(d) == size);
serverAssert(hashtableSize(ht) == size);
/* Remove random elements to reach the right count. */
while (size > count) {
dictEntry *de;
de = dictGetFairRandomKey(d);
dictUnlink(d, dictGetKey(de));
sdsfree(dictGetKey(de));
dictFreeUnlinkedEntry(d, de);
void *element;
hashtableFairRandomEntry(ht, &element);
hashtableDelete(ht, element);
sdsfree((sds)element);
size--;
}
}
@ -1161,7 +1158,7 @@ void srandmemberWithCountCommand(client *c) {
unsigned long added = 0;
sds sdsele;
dictExpand(d, count);
hashtableExpand(ht, count);
while (added < count) {
setTypeRandomElement(set, &str, &len, &llele);
if (str == NULL) {
@ -1172,7 +1169,7 @@ void srandmemberWithCountCommand(client *c) {
/* Try to add the object to the hashtable. If it already exists
* free it, otherwise increment the number of objects we have
* in the result hashtable. */
if (dictAdd(d, sdsele, NULL) == DICT_OK)
if (hashtableAdd(ht, sdsele))
added++;
else
sdsfree(sdsele);
@ -1181,14 +1178,15 @@ void srandmemberWithCountCommand(client *c) {
/* CASE 3 & 4: send the result to the user. */
{
dictIterator *di;
dictEntry *de;
hashtableIterator iter;
hashtableInitIterator(&iter, ht);
addReplyArrayLen(c, count);
di = dictGetIterator(d);
while ((de = dictNext(di)) != NULL) addReplyBulkSds(c, dictGetKey(de));
dictReleaseIterator(di);
dictRelease(d);
serverAssert(count == hashtableSize(ht));
void *element;
while (hashtableNext(&iter, &element)) addReplyBulkSds(c, (sds)element);
hashtableResetIterator(&iter);
hashtableRelease(ht);
}
}
@ -1336,7 +1334,7 @@ void sinterGenericCommand(client *c,
while ((encoding = setTypeNext(si, &str, &len, &intobj)) != -1) {
for (j = 1; j < setnum; j++) {
if (sets[j] == sets[0]) continue;
if (!setTypeIsMemberAux(sets[j], str, len, intobj, encoding == OBJ_ENCODING_HT)) break;
if (!setTypeIsMemberAux(sets[j], str, len, intobj, encoding == OBJ_ENCODING_HASHTABLE)) break;
}
/* Only take action when all sets contain the member */
@ -1355,7 +1353,7 @@ void sinterGenericCommand(client *c,
} else {
if (str && only_integers) {
/* It may be an integer although we got it as a string. */
if (encoding == OBJ_ENCODING_HT && string2ll(str, len, (long long *)&intobj)) {
if (encoding == OBJ_ENCODING_HASHTABLE && string2ll(str, len, (long long *)&intobj)) {
if (dstset->encoding == OBJ_ENCODING_LISTPACK || dstset->encoding == OBJ_ENCODING_INTSET) {
/* Adding it as an integer is more efficient. */
str = NULL;
@ -1365,7 +1363,7 @@ void sinterGenericCommand(client *c,
only_integers = 0;
}
}
setTypeAddAux(dstset, str, len, intobj, encoding == OBJ_ENCODING_HT);
setTypeAddAux(dstset, str, len, intobj, encoding == OBJ_ENCODING_HASHTABLE);
}
}
}
@ -1467,7 +1465,7 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke
/* For a SET's encoding, according to the factory method setTypeCreate(), there are currently 3 types:
* 1. OBJ_ENCODING_INTSET
* 2. OBJ_ENCODING_LISTPACK
* 3. OBJ_ENCODING_HT
* 3. OBJ_ENCODING_HASHTABLE
* 'dstset_encoding' is used to determine which kind of encoding to use when initializing 'dstset'.
*
* If all sets are all OBJ_ENCODING_INTSET encoding or 'dstkey' is not null, keep 'dstset'
@ -1478,8 +1476,8 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke
* the hashtable is more efficient when find and compare than the listpack. The corresponding
* time complexity are O(1) vs O(n). */
if (!dstkey && dstset_encoding == OBJ_ENCODING_INTSET &&
(setobj->encoding == OBJ_ENCODING_LISTPACK || setobj->encoding == OBJ_ENCODING_HT)) {
dstset_encoding = OBJ_ENCODING_HT;
(setobj->encoding == OBJ_ENCODING_LISTPACK || setobj->encoding == OBJ_ENCODING_HASHTABLE)) {
dstset_encoding = OBJ_ENCODING_HASHTABLE;
}
sets[j] = setobj;
if (j > 0 && sets[0] == sets[j]) {
@ -1536,7 +1534,7 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke
si = setTypeInitIterator(sets[j]);
while ((encoding = setTypeNext(si, &str, &len, &llval)) != -1) {
cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT);
cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE);
}
setTypeReleaseIterator(si);
}
@ -1556,11 +1554,11 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke
for (j = 1; j < setnum; j++) {
if (!sets[j]) continue; /* no key is an empty set. */
if (sets[j] == sets[0]) break; /* same set! */
if (setTypeIsMemberAux(sets[j], str, len, llval, encoding == OBJ_ENCODING_HT)) break;
if (setTypeIsMemberAux(sets[j], str, len, llval, encoding == OBJ_ENCODING_HASHTABLE)) break;
}
if (j == setnum) {
/* There is no other set with this element. Add it. */
cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT);
cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE);
}
}
setTypeReleaseIterator(si);
@ -1578,9 +1576,9 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke
si = setTypeInitIterator(sets[j]);
while ((encoding = setTypeNext(si, &str, &len, &llval)) != -1) {
if (j == 0) {
cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT);
cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE);
} else {
cardinality -= setTypeRemoveAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT);
cardinality -= setTypeRemoveAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE);
}
}
setTypeReleaseIterator(si);

View File

@ -2069,9 +2069,7 @@ typedef struct {
int ii;
} is;
struct {
dict *dict;
dictIterator *di;
dictEntry *de;
hashtableIterator *iter;
} ht;
struct {
unsigned char *lp;
@ -2126,10 +2124,8 @@ void zuiInitIterator(zsetopsrc *op) {
if (op->encoding == OBJ_ENCODING_INTSET) {
it->is.is = op->subject->ptr;
it->is.ii = 0;
} else if (op->encoding == OBJ_ENCODING_HT) {
it->ht.dict = op->subject->ptr;
it->ht.di = dictGetIterator(op->subject->ptr);
it->ht.de = dictNext(it->ht.di);
} else if (op->encoding == OBJ_ENCODING_HASHTABLE) {
it->ht.iter = hashtableCreateIterator(op->subject->ptr);
} else if (op->encoding == OBJ_ENCODING_LISTPACK) {
it->lp.lp = op->subject->ptr;
it->lp.p = lpFirst(it->lp.lp);
@ -2166,8 +2162,8 @@ void zuiClearIterator(zsetopsrc *op) {
iterset *it = &op->iter.set;
if (op->encoding == OBJ_ENCODING_INTSET) {
UNUSED(it); /* skip */
} else if (op->encoding == OBJ_ENCODING_HT) {
dictReleaseIterator(it->ht.di);
} else if (op->encoding == OBJ_ENCODING_HASHTABLE) {
hashtableReleaseIterator(it->ht.iter);
} else if (op->encoding == OBJ_ENCODING_LISTPACK) {
UNUSED(it);
} else {
@ -2235,13 +2231,11 @@ int zuiNext(zsetopsrc *op, zsetopval *val) {
/* Move to next element. */
it->is.ii++;
} else if (op->encoding == OBJ_ENCODING_HT) {
if (it->ht.de == NULL) return 0;
val->ele = dictGetKey(it->ht.de);
} else if (op->encoding == OBJ_ENCODING_HASHTABLE) {
void *next;
if (!hashtableNext(it->ht.iter, &next)) return 0;
val->ele = next;
val->score = 1.0;
/* Move to next element. */
it->ht.de = dictNext(it->ht.di);
} else if (op->encoding == OBJ_ENCODING_LISTPACK) {
if (it->lp.p == NULL) return 0;
val->estr = lpGetValue(it->lp.p, &val->elen, &val->ell);

View File

@ -451,15 +451,25 @@ void zmalloc_set_oom_handler(void (*oom_handler)(size_t)) {
zmalloc_oom_handler = oom_handler;
}
/* Use 'MADV_DONTNEED' to release memory to operating system quickly.
* We do that in a fork child process to avoid CoW when the parent modifies
* these shared pages. */
void zmadvise_dontneed(void *ptr) {
/* Try to release pages back to the OS directly using 'MADV_DONTNEED' (bypassing
* the allocator) in a fork child process to avoid CoW when the parent modifies
* those shared pages. For small allocations, we can't release any full page,
* so in an effort to avoid getting the size of the allocation from the
* allocator (malloc_size) when we already know it's small, we check the
* size_hint. If the size is not already known, passing a size_hint of 0 will
* lead to checking the real size of the allocation.
* Also please note that the size may not be accurate, so in order to make this
* solution effective, the judgement for releasing memory pages should not be
* too strict. */
void zmadvise_dontneed(void *ptr, size_t size_hint) {
#if defined(USE_JEMALLOC) && defined(__linux__)
if (ptr == NULL) return;
static size_t page_size = 0;
if (page_size == 0) page_size = sysconf(_SC_PAGESIZE);
size_t page_size_mask = page_size - 1;
if (size_hint && size_hint / 2 < page_size) return;
size_t real_size = zmalloc_size(ptr);
if (real_size < page_size) return;
@ -473,6 +483,7 @@ void zmadvise_dontneed(void *ptr) {
}
#else
(void)(ptr);
(void)(size_hint);
#endif
}

View File

@ -139,7 +139,7 @@ size_t zmalloc_get_smap_bytes_by_field(char *field, long pid);
size_t zmalloc_get_memory_size(void);
void zlibc_free(void *ptr);
void zlibc_trim(void);
void zmadvise_dontneed(void *ptr);
void zmadvise_dontneed(void *ptr, size_t size_hint);
#ifndef HAVE_MALLOC_SIZE
size_t zmalloc_size(void *ptr);

View File

@ -515,10 +515,10 @@ start_server {tags {"info" "external:skip"}} {
set info_mem [r info memory]
set mem_stats [r memory stats]
assert_equal [getInfoProperty $info_mem mem_overhead_db_hashtable_rehashing] {0}
# overhead.db.hashtable.lut = memory overhead of hashset including hashset struct and tables
set hashset_overhead [dict get $mem_stats overhead.db.hashtable.lut]
if {$hashset_overhead < 140} {
# 32-bit version (hashset struct + 1 bucket of 64 bytes)
# overhead.db.hashtable.lut = memory overhead of hashtable including hashtable struct and tables
set hashtable_overhead [dict get $mem_stats overhead.db.hashtable.lut]
if {$hashtable_overhead < 140} {
# 32-bit version (hashtable struct + 1 bucket of 64 bytes)
set bits 32
} else {
set bits 64

View File

@ -33,6 +33,7 @@ start_server {
assert_equal {0 1} [r smismember myset bla foo]
assert_equal {0} [r smismember myset bla]
assert_equal "bar $initelems($type)" [lsort [r smembers myset]]
r memory usage myset
}
}
@ -51,6 +52,7 @@ start_server {
assert_equal {0 1} [r smismember myset 18 16]
assert_equal {0} [r smismember myset 18]
assert_equal {16 17} [lsort [r smembers myset]]
r memory usage myset
}
test {SMISMEMBER SMEMBERS SCARD against non set} {
@ -1029,111 +1031,6 @@ foreach type {single multiple single_multiple} {
r srem $myset {*}$members
}
proc verify_rehashing_completed_key {myset table_size keys} {
set htstats [r debug HTSTATS-KEY $myset]
assert {![string match {*rehashing target*} $htstats]}
return {[string match {*table size: $table_size*number of elements: $keys*} $htstats]}
}
test "SRANDMEMBER with a dict containing long chain" {
set origin_save [config_get_set save ""]
set origin_max_lp [config_get_set set-max-listpack-entries 0]
set origin_save_delay [config_get_set rdb-key-save-delay 2147483647]
# 1) Create a hash set with 100000 members.
set members {}
for {set i 0} {$i < 100000} {incr i} {
lappend members [format "m:%d" $i]
}
create_set myset $members
# 2) Wait for the hash set rehashing to finish.
while {[is_rehashing myset]} {
r srandmember myset 100
}
# 3) Turn off the rehashing of this set, and remove the members to 500.
r bgsave
rem_hash_set_top_N myset [expr {[r scard myset] - 500}]
assert_equal [r scard myset] 500
# 4) Kill RDB child process to restart rehashing.
set pid1 [get_child_pid 0]
catch {exec kill -9 $pid1}
waitForBgsave r
# 5) Let the set hash to start rehashing
r spop myset 1
assert [is_rehashing myset]
# 6) Verify that when rdb saving is in progress, rehashing will still be performed (because
# the ratio is extreme) by waiting for it to finish during an active bgsave.
r bgsave
while {[is_rehashing myset]} {
r srandmember myset 1
}
if {$::verbose} {
puts [r debug HTSTATS-KEY myset full]
}
set pid1 [get_child_pid 0]
catch {exec kill -9 $pid1}
waitForBgsave r
# 7) Check that eventually, SRANDMEMBER returns all elements.
array set allmyset {}
foreach ele [r smembers myset] {
set allmyset($ele) 1
}
unset -nocomplain auxset
set iterations 1000
while {$iterations != 0} {
incr iterations -1
set res [r srandmember myset -10]
foreach ele $res {
set auxset($ele) 1
}
if {[lsort [array names allmyset]] eq
[lsort [array names auxset]]} {
break;
}
}
assert {$iterations != 0}
# 8) Remove the members to 30 in order to calculate the value of Chi-Square Distribution,
# otherwise we would need more iterations.
rem_hash_set_top_N myset [expr {[r scard myset] - 30}]
assert_equal [r scard myset] 30
# Hash set rehashing would be completed while removing members from the `myset`
# We also check the size and members in the hash table.
verify_rehashing_completed_key myset 64 30
# Now that we have a hash set with only one long chain bucket.
set htstats [r debug HTSTATS-KEY myset full]
assert {[regexp {different slots: ([0-9]+)} $htstats - different_slots]}
assert {[regexp {max chain length: ([0-9]+)} $htstats - max_chain_length]}
assert {$different_slots == 1 && $max_chain_length == 30}
# 9) Use positive count (PATH 4) to get 10 elements (out of 30) each time.
unset -nocomplain allkey
set iterations 1000
while {$iterations != 0} {
incr iterations -1
set res [r srandmember myset 10]
foreach ele $res {
lappend allkey $ele
}
}
# validate even distribution of random sampling (df = 29, 73 means 0.00001 probability)
assert_lessthan [chi_square_value $allkey] 73
r config set save $origin_save
r config set set-max-listpack-entries $origin_max_lp
r config set rdb-key-save-delay $origin_save_delay
} {OK} {needs:debug slow}
proc setup_move {} {
r del myset3{t} myset4{t}
create_set myset1{t} {1 a b}