
The LRU eviction code used to make local choices: for each DB visited it selected the best key to evict, and this was repeated for every DB. This meant that DBs holding very frequently accessed keys could still be targeted by the LRU algorithm while other DBs contained many better candidates to expire. This commit fixes the problem for the LRU policy. The TTL policy is not fixed by this commit and will be addressed in a successive commit. This is an initial (partial, because of the TTL policy) fix for issue #2647.
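To illustrate why local choices are a problem, here is a minimal, self-contained sketch (hypothetical data and names, not the actual patch): with per-DB selection every DB, even one holding only hot keys, eventually gives up a key, while a single cross-DB selection always evicts the globally best candidate.

    #include <stdio.h>

    #define NUM_DBS 3
    #define KEYS_PER_DB 4

    int main(void) {
        /* Hypothetical idle times (higher = better eviction candidate). */
        int idle[NUM_DBS][KEYS_PER_DB] = {
            {900, 850, 800, 700},   /* DB 0: many old keys. */
            {10, 20, 15, 5},        /* DB 1: hot keys only. */
            {600, 30, 40, 50},      /* DB 2: mixed. */
        };

        /* Old behavior: evict the best candidate of each DB in turn, so even
         * DB 1 (all hot keys) loses a key on its turn. */
        for (int db = 0; db < NUM_DBS; db++) {
            int best = 0;
            for (int k = 1; k < KEYS_PER_DB; k++)
                if (idle[db][k] > idle[db][best]) best = k;
            printf("old: evict db %d key %d (idle %d)\n", db, best, idle[db][best]);
        }

        /* New behavior: consider candidates from all DBs together and evict
         * the globally best one, so DBs full of hot keys are left alone. */
        int bestdb = 0, bestkey = 0;
        for (int db = 0; db < NUM_DBS; db++)
            for (int k = 0; k < KEYS_PER_DB; k++)
                if (idle[db][k] > idle[bestdb][bestkey]) { bestdb = db; bestkey = k; }
        printf("new: evict db %d key %d (idle %d)\n", bestdb, bestkey, idle[bestdb][bestkey]);
        return 0;
    }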
/* Maxmemory directive handling (LRU eviction and other policies).
 *
 * ----------------------------------------------------------------------------
 *
 * Copyright (c) 2009-2016, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "server.h"
|
|
#include "bio.h"
|
|
|
|
/* ----------------------------------------------------------------------------
|
|
* Data structures
|
|
* --------------------------------------------------------------------------*/
|
|
|
|
/* To improve the quality of the LRU approximation we take a set of keys
 * that are good candidates for eviction across freeMemoryIfNeeded() calls.
 *
 * Entries inside the eviction pool are taken ordered by idle time, putting
 * greater idle times to the right (ascending order).
 *
 * Empty entries have the key pointer set to NULL. */
#define EVPOOL_SIZE 16
#define EVPOOL_CACHED_SDS_SIZE 255
struct evictionPoolEntry {
    unsigned long long idle;    /* Object idle time. */
    sds key;                    /* Key name. */
    sds cached;                 /* Cached SDS object for key name. */
    int dbid;                   /* Key DB number. */
};

static struct evictionPoolEntry *EvictionPoolLRU;

/* ----------------------------------------------------------------------------
 * Implementation of eviction, aging and LRU
 * --------------------------------------------------------------------------*/

/* Return the LRU clock, based on the clock resolution. This is a time
 * in a reduced-bits format that can be used to set and check the
 * object->lru field of redisObject structures. */
unsigned int getLRUClock(void) {
    return (mstime()/LRU_CLOCK_RESOLUTION) & LRU_CLOCK_MAX;
}

/* Given an object returns the minimum number of milliseconds for which the
 * object was never requested, using an approximated LRU algorithm. */
unsigned long long estimateObjectIdleTime(robj *o) {
    unsigned long long lruclock = LRU_CLOCK();
    if (lruclock >= o->lru) {
        return (lruclock - o->lru) * LRU_CLOCK_RESOLUTION;
    } else {
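        /* The LRU clock wrapped around: the object was last touched before
         * the wrap, so the elapsed time is the span up to LRU_CLOCK_MAX plus
         * the current clock value. */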
        return (lruclock + (LRU_CLOCK_MAX - o->lru)) *
                    LRU_CLOCK_RESOLUTION;
    }
}

/* freeMemoryIfNeeded() gets called when 'maxmemory' is set on the config
 * file to limit the max memory used by the server, before processing a
 * command.
 *
 * The goal of the function is to free enough memory to keep Redis under the
 * configured memory limit.
 *
 * The function starts calculating how many bytes should be freed to keep
 * Redis under the limit, and enters a loop selecting the best keys to
 * evict according to the configured policy.
 *
 * If all the bytes needed to return back under the limit were freed the
 * function returns C_OK, otherwise C_ERR is returned, and the caller
 * should block the execution of commands that will result in more memory
 * used by the server.
 *
 * ------------------------------------------------------------------------
 *
 * LRU approximation algorithm
 *
 * Redis uses an approximation of the LRU algorithm that runs in constant
 * memory. Every time there is a key to expire, we sample N keys (with
 * N very small, usually around 5) to populate a pool of M best keys to
 * evict (the pool size is defined by EVPOOL_SIZE).
 *
 * The N keys sampled are added to the pool of good keys to expire (the ones
 * with an old access time) if they are better than one of the current keys
 * in the pool.
 *
 * After the pool is populated, the best key we have in the pool is expired.
 * However note that we don't remove keys from the pool when they are deleted
 * so the pool may contain keys that no longer exist.
 *
 * When we try to evict a key, and all the entries in the pool don't exist,
 * we populate it again. This time we'll be sure that the pool has at least
 * one key that can be evicted, if there is at least one key that can be
 * evicted in the whole database. */

/* Create a new eviction pool. */
void evictionPoolAlloc(void) {
    struct evictionPoolEntry *ep;
    int j;

    ep = zmalloc(sizeof(*ep)*EVPOOL_SIZE);
    for (j = 0; j < EVPOOL_SIZE; j++) {
        ep[j].idle = 0;
        ep[j].key = NULL;
        ep[j].cached = sdsnewlen(NULL,EVPOOL_CACHED_SDS_SIZE);
        ep[j].dbid = 0;
    }
    EvictionPoolLRU = ep;
}

/* This is a helper function for freeMemoryIfNeeded(), it is used in order
 * to populate the evictionPool with a few entries every time we want to
 * expire a key. Keys with idle time smaller than one of the current
 * keys are added. Keys are always added if there are free entries.
 *
 * We insert keys in place in ascending order, so keys with the smaller
 * idle time are on the left, and keys with the higher idle time on the
 * right. */

void evictionPoolPopulate(int dbid, dict *sampledict, dict *keydict, struct evictionPoolEntry *pool) {
    int j, k, count;
    dictEntry *samples[server.maxmemory_samples];

    count = dictGetSomeKeys(sampledict,samples,server.maxmemory_samples);
    for (j = 0; j < count; j++) {
        unsigned long long idle;
        sds key;
        robj *o;
        dictEntry *de;

        de = samples[j];
        key = dictGetKey(de);
        /* If the dictionary we are sampling from is not the main
         * dictionary (but the expires one) we need to look up the key
         * again in the key dictionary to obtain the value object. */
        if (sampledict != keydict) de = dictFind(keydict, key);
        o = dictGetVal(de);
        idle = estimateObjectIdleTime(o);

        /* Insert the element inside the pool.
         * First, find the first empty bucket or the first populated
         * bucket that has an idle time smaller than our idle time. */
        k = 0;
        while (k < EVPOOL_SIZE &&
               pool[k].key &&
               pool[k].idle < idle) k++;
        if (k == 0 && pool[EVPOOL_SIZE-1].key != NULL) {
            /* Can't insert if the element is < the worst element we have
             * and there are no empty buckets. */
            continue;
        } else if (k < EVPOOL_SIZE && pool[k].key == NULL) {
            /* Inserting into empty position. No setup needed before insert. */
        } else {
            /* Inserting in the middle. Now k points to the first element
             * greater than the element to insert. */
            if (pool[EVPOOL_SIZE-1].key == NULL) {
                /* Free space on the right? Insert at k shifting
                 * all the elements from k to end to the right. */

                /* Save SDS before overwriting. */
                sds cached = pool[EVPOOL_SIZE-1].cached;
                memmove(pool+k+1,pool+k,
                    sizeof(pool[0])*(EVPOOL_SIZE-k-1));
                pool[k].cached = cached;
            } else {
                /* No free space on right? Insert at k-1 */
                k--;
                /* Shift all elements on the left of k (included) to the
                 * left, so we discard the element with smaller idle time. */
                sds cached = pool[0].cached; /* Save SDS before overwriting. */
                if (pool[0].key != pool[0].cached) sdsfree(pool[0].key);
                memmove(pool,pool+1,sizeof(pool[0])*k);
                pool[k].cached = cached;
            }
        }

        /* Try to reuse the cached SDS string allocated in the pool entry,
         * because allocating and deallocating this object is costly
         * (according to the profiler, not my fantasy. Remember:
         * premature optimizbla bla bla bla.) */
        int klen = sdslen(key);
        if (klen > EVPOOL_CACHED_SDS_SIZE) {
            pool[k].key = sdsdup(key);
        } else {
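            /* The key fits in the preallocated buffer, so copy it (including
             * the null terminator that SDS strings always keep) into the
             * reusable cached SDS string. */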
            memcpy(pool[k].cached,key,klen+1);
            sdssetlen(pool[k].cached,klen);
            pool[k].key = pool[k].cached;
        }
        pool[k].idle = idle;
        pool[k].dbid = dbid;
    }
}

int freeMemoryIfNeeded(void) {
    size_t mem_reported, mem_used, mem_tofree, mem_freed;
    int slaves = listLength(server.slaves);
    mstime_t latency, eviction_latency;
    long long delta;

    /* Check if we are over the memory usage limit. If we are not, no need
     * to subtract the slaves output buffers. We can just return ASAP. */
    mem_reported = zmalloc_used_memory();
    if (mem_reported <= server.maxmemory) return C_OK;

    /* Remove the size of slaves output buffers and AOF buffer from the
     * count of used memory. */
    mem_used = mem_reported;
    if (slaves) {
        listIter li;
        listNode *ln;

        listRewind(server.slaves,&li);
        while((ln = listNext(&li))) {
            client *slave = listNodeValue(ln);
            unsigned long obuf_bytes = getClientOutputBufferMemoryUsage(slave);
            if (obuf_bytes > mem_used)
                mem_used = 0;
            else
                mem_used -= obuf_bytes;
        }
    }
    if (server.aof_state != AOF_OFF) {
        mem_used -= sdslen(server.aof_buf);
        mem_used -= aofRewriteBufferSize();
    }

    /* Check if we are still over the memory limit. */
    if (mem_used <= server.maxmemory) return C_OK;

    /* Compute how much memory we need to free. */
    mem_tofree = mem_used - server.maxmemory;
    mem_freed = 0;

    if (server.maxmemory_policy == MAXMEMORY_NO_EVICTION)
        goto cant_free; /* We need to free memory, but policy forbids. */

    latencyStartMonitor(latency);
    while (mem_freed < mem_tofree) {
        int j, k, i, keys_freed = 0;
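        /* 'next_db' is used by the random and TTL policies below to rotate
         * across the different DBs between calls. */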
        static int next_db = 0;
        sds bestkey = NULL;
        int bestdbid;
        redisDb *db;
        dict *dict;
        dictEntry *de;

        if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_LRU ||
            server.maxmemory_policy == MAXMEMORY_VOLATILE_LRU)
        {
            struct evictionPoolEntry *pool = EvictionPoolLRU;

            while(bestkey == NULL) {
                unsigned long total_keys = 0, keys;

                /* We don't want to make local-db choices when expiring keys,
                 * so to start, populate the eviction pool by sampling keys
                 * from every DB. */
                for (i = 0; i < server.dbnum; i++) {
                    db = server.db+i;
                    dict = (server.maxmemory_policy == MAXMEMORY_ALLKEYS_LRU) ?
                            db->dict : db->expires;
                    if ((keys = dictSize(dict)) != 0) {
                        evictionPoolPopulate(i, dict, db->dict, pool);
                        total_keys += keys;
                    }
                }
                if (!total_keys) break; /* No keys to evict. */

                /* Go backward from best to worst element to evict. */
                for (k = EVPOOL_SIZE-1; k >= 0; k--) {
                    if (pool[k].key == NULL) continue;
                    bestdbid = pool[k].dbid;

                    if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_LRU) {
                        de = dictFind(server.db[pool[k].dbid].dict,
                            pool[k].key);
                    } else {
                        de = dictFind(server.db[pool[k].dbid].expires,
                            pool[k].key);
                    }

                    /* Remove the entry from the pool. */
                    if (pool[k].key != pool[k].cached)
                        sdsfree(pool[k].key);
                    pool[k].key = NULL;
                    pool[k].idle = 0;

                    /* If the key exists, it is our pick. Otherwise it is
                     * a ghost and we need to try the next element. */
                    if (de) {
                        bestkey = dictGetKey(de);
                        break;
                    } else {
                        /* Ghost... Iterate again. */
                    }
                }
            }
        }

        /* volatile-random and allkeys-random policy */
        else if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM ||
                 server.maxmemory_policy == MAXMEMORY_VOLATILE_RANDOM)
        {
            /* When evicting a random key, we try to evict a key for
             * each DB, so we use the static 'next_db' variable to
             * incrementally visit all DBs. */
            for (i = 0; i < server.dbnum; i++) {
                j = (++next_db) % server.dbnum;
                db = server.db+j;
                dict = (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM) ?
                        db->dict : db->expires;
                if (dictSize(dict) != 0) {
                    de = dictGetRandomKey(dict);
                    bestkey = dictGetKey(de);
                    bestdbid = j;
                    break;
                }
            }
        }

        /* volatile-ttl */
        else if (server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) {
            long bestttl = 0; /* Initialized to avoid warning. */

            /* In this policy we scan a single DB per iteration (visiting
             * a different DB per call), expiring the key with the smallest
             * TTL among the few sampled.
             *
             * Note that this algorithm makes local-DB choices, and should
             * use a pool and code more similar to the one used in the
             * LRU eviction policies in the future. */
            for (i = 0; i < server.dbnum; i++) {
                j = (++next_db) % server.dbnum;
                db = server.db+j;
                dict = db->expires;
                if (dictSize(dict) != 0) {
                    for (k = 0; k < server.maxmemory_samples; k++) {
                        sds thiskey;
                        long thisttl;

                        de = dictGetRandomKey(dict);
                        thiskey = dictGetKey(de);
                        thisttl = (long) dictGetVal(de);

                        /* Keys expiring sooner (smaller unix timestamp) are
                         * better candidates for deletion */
                        if (bestkey == NULL || thisttl < bestttl) {
                            bestkey = thiskey;
                            bestttl = thisttl;
                            bestdbid = j;
                        }
                    }
                }
            }
        }

        /* Finally remove the selected key. */
        if (bestkey) {
            db = server.db+bestdbid;
            robj *keyobj = createStringObject(bestkey,sdslen(bestkey));
            propagateExpire(db,keyobj,server.lazyfree_lazy_eviction);
            /* We compute the amount of memory freed by db*Delete() alone.
             * It is possible that the memory needed to propagate the DEL
             * in the AOF and replication link is actually greater than the
             * memory we are freeing by removing the key, but we can't
             * account for that, otherwise we would never exit the loop.
             *
             * AOF and Output buffer memory will be freed eventually so
             * we only care about memory used by the key space. */
            delta = (long long) zmalloc_used_memory();
            latencyStartMonitor(eviction_latency);
            if (server.lazyfree_lazy_eviction)
                dbAsyncDelete(db,keyobj);
            else
                dbSyncDelete(db,keyobj);
            latencyEndMonitor(eviction_latency);
            latencyAddSampleIfNeeded("eviction-del",eviction_latency);
            latencyRemoveNestedEvent(latency,eviction_latency);
            delta -= (long long) zmalloc_used_memory();
            mem_freed += delta;
            server.stat_evictedkeys++;
            notifyKeyspaceEvent(NOTIFY_EVICTED, "evicted",
                keyobj, db->id);
            decrRefCount(keyobj);
            keys_freed++;

            /* When the memory to free starts to be big enough, we may
             * start spending so much time here that it is impossible to
             * deliver data to the slaves fast enough, so we force the
             * transmission here inside the loop. */
            if (slaves) flushSlavesOutputBuffers();
        }

        if (!keys_freed) {
            latencyEndMonitor(latency);
            latencyAddSampleIfNeeded("eviction-cycle",latency);
            goto cant_free; /* nothing to free... */
        }
    }
    latencyEndMonitor(latency);
    latencyAddSampleIfNeeded("eviction-cycle",latency);
    return C_OK;

cant_free:
    /* We are here if we are not able to reclaim memory. There is only one
     * last thing we can try: check if the lazyfree thread has jobs in queue
     * and wait... */
    while(bioPendingJobsOfType(BIO_LAZY_FREE)) {
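        /* If the background lazy free jobs already released enough memory
         * to reach the target, stop waiting. */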
        if (((mem_reported - zmalloc_used_memory()) + mem_freed) >= mem_tofree)
            break;
        usleep(1000);
    }
    return C_ERR;
}