futriix/src/lazyfree.c
Wang Yuan 75f9dec644
Limit the main db and expires dictionaries to expand (#7954)
As we know, Redis may reject users' requests or evict some keys when
used memory exceeds maxmemory. Dictionary expansion can make things
worse: big dictionaries, such as the main db and expires dicts, may
grab a huge chunk of memory at once to allocate a new, larger hash
table, ending up far beyond maxmemory after expanding.
Related issues: #4213 #4583

In more detail: when a dict expands, Redis allocates a new ht[1] that
is generally double the size of ht[0], so ht[1] will be very large if
ht[0] is already large. For the db dict with more than 64 million keys,
expanding costs 1GB just for ht[1] (2^27 buckets * 8 bytes per bucket
pointer = 1GiB).

If the sum of used memory and the memory needed for the dict's new
hash table exceeds maxmemory, we shouldn't allow the dict to expand.
Even with key eviction enabled, we still couldn't add many more keys
after eviction and rehashing; worse, Redis would keep fewer keys,
because part of its scarce remaining memory would hold the new hash
table instead of users' data. And with key eviction disabled, users
couldn't write to Redis at all.

What this commit changes:

Add a new member function, expandAllowed, to the dict type; it gives
the caller a way to allow or deny an expand. The function receives two
parameters: the additional memory needed for expanding and the dict's
current load factor, so users can implement a function that decides
based on them.
The main db dict and expires dict may be very big and cost huge
amounts of memory to expand, so for their dict types we implement a
judgement function: we provisionally stop the dict from expanding if
used memory would exceed maxmemory after the expand, but to guarantee
Redis's performance we still allow the expand if the dict's load
factor exceeds the safe load factor.
Add test cases verifying that the main db is not allowed to expand
when the remaining memory is insufficient, thereby avoiding key
eviction.
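As a rough sketch, the judgement function can be shaped like the
following (the helper overMaxmemoryAfterAlloc() and the 1.618 safe
load factor here are assumptions borrowed from upstream Redis; this
tree's exact code may differ):

static int dictExpandAllowed(size_t moreMem, double usedRatio) {
    /* Below the safe load factor, expand only if the extra memory for
     * the new hash table still fits under maxmemory; above it, always
     * allow the expand to keep lookup performance acceptable. */
    if (usedRatio <= 1.618)
        return !overMaxmemoryAfterAlloc(moreMem);
    return 1;
}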

Other changes:

New hash table size on expand: before this commit, the target size was
double the dict's used count, later rounded up by _dictNextPower.
Since we actually aim to keep the load factor between 0.5 and 1.0, we
now replace *2 with +1: because the expand check requires used >=
size, the outcome is usually the same as _dictNextPower(used+1). The
only case where it differs is when dict_can_resize is false during a
fork, where the old _dictNextPower(used*2) made the dict jump to 4x
(i.e. _dictNextPower(1025*2) returns 4096).
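For reference, the rounding helper behaves like this minimal sketch
(modeled on dict.c's _dictNextPower, with the LONG_MAX guard omitted;
illustrative only):

static unsigned long nextPower(unsigned long size) {
    unsigned long i = 4; /* DICT_HT_INITIAL_SIZE */
    while (i < size) i *= 2; /* smallest power of two >= size */
    return i;
}

so nextPower(1025*2) == 4096, while nextPower(1025+1) == 2048.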
Fix the rehash test cases affected by the changed sizing algorithm.
2020-12-06 11:53:04 +02:00

#include "server.h"
#include "bio.h"
#include "atomicvar.h"
#include "cluster.h"

static redisAtomic size_t lazyfree_objects = 0;
static redisAtomic size_t lazyfreed_objects = 0;

/* Return the number of currently pending objects to free. */
size_t lazyfreeGetPendingObjectsCount(void) {
    size_t aux;
    atomicGet(lazyfree_objects,aux);
    return aux;
}

/* Return the number of objects that have been freed. */
size_t lazyfreeGetFreedObjectsCount(void) {
    size_t aux;
    atomicGet(lazyfreed_objects,aux);
    return aux;
}
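
/* Hedged example (not part of the original file): a caller could
 * surface these counters for monitoring, e.g. via the server log. The
 * function below is illustrative; upstream Redis reports the values
 * through the INFO command instead. */
void logLazyfreeStats(void) {
    serverLog(LL_VERBOSE,"lazyfree: %zu objects pending, %zu freed",
        lazyfreeGetPendingObjectsCount(),
        lazyfreeGetFreedObjectsCount());
}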

/* Return the amount of work needed in order to free an object.
 * The return value is not always the actual number of allocations the
 * object is composed of, but a number proportional to it.
 *
 * For strings the function always returns 1.
 *
 * For aggregated objects represented by hash tables or other data structures
 * the function just returns the number of elements the object is composed of.
 *
 * Objects composed of single allocations are always reported as having a
 * single item even if they are actually logically composed of multiple
 * elements.
 *
 * For lists the function returns the number of elements in the quicklist
 * representing the list. */
size_t lazyfreeGetFreeEffort(robj *key, robj *obj) {
    if (obj->type == OBJ_LIST) {
        quicklist *ql = obj->ptr;
        return ql->len;
    } else if (obj->type == OBJ_SET && obj->encoding == OBJ_ENCODING_HT) {
        dict *ht = obj->ptr;
        return dictSize(ht);
    } else if (obj->type == OBJ_ZSET && obj->encoding == OBJ_ENCODING_SKIPLIST){
        zset *zs = obj->ptr;
        return zs->zsl->length;
    } else if (obj->type == OBJ_HASH && obj->encoding == OBJ_ENCODING_HT) {
        dict *ht = obj->ptr;
        return dictSize(ht);
    } else if (obj->type == OBJ_STREAM) {
        size_t effort = 0;
        stream *s = obj->ptr;

        /* Make a best effort estimate to maintain constant runtime. Every
         * macro node in the Stream is one allocation. */
        effort += s->rax->numnodes;

        /* Every consumer group is an allocation and so are the entries in its
         * PEL. We use the size of the first group's PEL as an estimate for
         * all the others. */
        if (s->cgroups) {
            raxIterator ri;
            streamCG *cg;
            raxStart(&ri,s->cgroups);
            raxSeek(&ri,"^",NULL,0);
            /* There must be at least one group so the following should always
             * work. */
            serverAssert(raxNext(&ri));
            cg = ri.data;
            effort += raxSize(s->cgroups)*(1+raxSize(cg->pel));
            raxStop(&ri);
        }
        return effort;
    } else if (obj->type == OBJ_MODULE) {
        moduleValue *mv = obj->ptr;
        moduleType *mt = mv->type;
        if (mt->free_effort != NULL) {
            size_t effort = mt->free_effort(key,mv->value);
            /* If the module's free_effort returns 0, the object is freed
             * asynchronously by default. */
            return effort == 0 ? ULONG_MAX : effort;
        } else {
            return 1;
        }
    } else {
        return 1; /* Everything else is a single allocation. */
    }
}

/* Delete a key, value, and associated expiration entry if any, from the DB.
 * If there are enough allocations to free, the value object may be put into
 * a lazy free list instead of being freed synchronously. The lazy free list
 * will be reclaimed in a different bio.c thread. */
#define LAZYFREE_THRESHOLD 64
int dbAsyncDelete(redisDb *db, robj *key) {
    /* Deleting an entry from the expires dict will not free the sds of
     * the key, because it is shared with the main dictionary. */
    if (dictSize(db->expires) > 0) dictDelete(db->expires,key->ptr);

    /* If the value is composed of a few allocations, freeing it in a lazy
     * way is actually just slower... So under a certain limit we just free
     * the object synchronously. */
    dictEntry *de = dictUnlink(db->dict,key->ptr);
    if (de) {
        robj *val = dictGetVal(de);

        /* Tell the module that the key has been unlinked from the database. */
        moduleNotifyKeyUnlink(key,val);

        size_t free_effort = lazyfreeGetFreeEffort(key,val);

        /* If releasing the object is too much work, do it in the background
         * by adding the object to the lazy free list.
         * Note that if the object is shared, it is not possible to reclaim
         * it now. This rarely happens, however sometimes the implementation
         * of parts of the Redis core may call incrRefCount() to protect
         * objects, and then call dbDelete(). In this case we'll fall
         * through and reach the dictFreeUnlinkedEntry() call, which will be
         * equivalent to just calling decrRefCount(). */
        if (free_effort > LAZYFREE_THRESHOLD && val->refcount == 1) {
            atomicIncr(lazyfree_objects,1);
            bioCreateBackgroundJob(BIO_LAZY_FREE,val,NULL,NULL);
            dictSetVal(db->dict,de,NULL);
        }
    }

    /* Release the key-val pair, or just the key if we set the val
     * field to NULL in order to lazy free it later. */
    if (de) {
        dictFreeUnlinkedEntry(db->dict,de);
        if (server.cluster_enabled) slotToKeyDel(key->ptr);
        return 1;
    } else {
        return 0;
    }
}

/* Free an object; if the object is big enough, free it in an async way. */
void freeObjAsync(robj *key, robj *obj) {
    size_t free_effort = lazyfreeGetFreeEffort(key,obj);
    if (free_effort > LAZYFREE_THRESHOLD && obj->refcount == 1) {
        atomicIncr(lazyfree_objects,1);
        bioCreateBackgroundJob(BIO_LAZY_FREE,obj,NULL,NULL);
    } else {
        decrRefCount(obj);
    }
}

/* Empty a Redis DB asynchronously. What the function actually does is
 * create a new empty set of hash tables and schedule the old ones for
 * lazy freeing. */
void emptyDbAsync(redisDb *db) {
    dict *oldht1 = db->dict, *oldht2 = db->expires;
    db->dict = dictCreate(&dbDictType,NULL);
    db->expires = dictCreate(&dbExpiresDictType,NULL);
    atomicIncr(lazyfree_objects,dictSize(oldht1));
    bioCreateBackgroundJob(BIO_LAZY_FREE,NULL,oldht1,oldht2);
}

/* Release the radix tree mapping Redis Cluster keys to slots asynchronously. */
void freeSlotsToKeysMapAsync(rax *rt) {
    atomicIncr(lazyfree_objects,rt->numele);
    bioCreateBackgroundJob(BIO_LAZY_FREE,NULL,NULL,rt);
}

/* Release objects from the lazyfree thread. It's just decrRefCount()
 * plus updating the counters of pending and freed objects. */
void lazyfreeFreeObjectFromBioThread(robj *o) {
    decrRefCount(o);
    atomicDecr(lazyfree_objects,1);
    atomicIncr(lazyfreed_objects,1);
}

/* Release a database from the lazyfree thread. The two hash tables are
 * the ones of a database which was substituted with fresh ones in the
 * main thread when the database was logically deleted. */
void lazyfreeFreeDatabaseFromBioThread(dict *ht1, dict *ht2) {
    size_t numkeys = dictSize(ht1);
    dictRelease(ht1);
    dictRelease(ht2);
    atomicDecr(lazyfree_objects,numkeys);
    atomicIncr(lazyfreed_objects,numkeys);
}

/* Release the radix tree mapping Redis Cluster keys to slots in the
 * lazyfree thread. */
void lazyfreeFreeSlotsMapFromBioThread(rax *rt) {
    size_t len = rt->numele;
    raxFree(rt);
    atomicDecr(lazyfree_objects,len);
    atomicIncr(lazyfreed_objects,len);
}