/* Implementation of EXPIRE (keys with fixed time to live). * * ---------------------------------------------------------------------------- * * Copyright (c) 2009-2016, Salvatore Sanfilippo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Redis nor the names of its contributors may be used * to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "server.h" #include "cron.h" void activeExpireCycleExpireFullKey(redisDb *db, const char *key) { robj *keyobj = createStringObject(key,sdslen(key)); propagateExpire(db,keyobj,g_pserver->lazyfree_lazy_expire); if (g_pserver->lazyfree_lazy_expire) dbAsyncDelete(db,keyobj); else dbSyncDelete(db,keyobj); notifyKeyspaceEvent(NOTIFY_EXPIRED, "expired",keyobj,db->id); if (g_pserver->tracking_clients) trackingInvalidateKey(keyobj); decrRefCount(keyobj); g_pserver->stat_expiredkeys++; } /*----------------------------------------------------------------------------- * Incremental collection of expired keys. * * When keys are accessed they are expired on-access. However we need a * mechanism in order to ensure keys are eventually removed when expired even * if no access is performed on them. *----------------------------------------------------------------------------*/ /* Helper function for the activeExpireCycle() function. * This function will try to expire the key that is stored in the hash table * entry 'de' of the 'expires' hash table of a Redis database. * * If the key is found to be expired, it is removed from the database and * 1 is returned. Otherwise no operation is performed and 0 is returned. * * When a key is expired, g_pserver->stat_expiredkeys is incremented. * * The parameter 'now' is the current time in milliseconds as is passed * to the function to avoid too many gettimeofday() syscalls. */ void activeExpireCycleExpire(redisDb *db, expireEntry &e, long long now) { if (!e.FFat()) { activeExpireCycleExpireFullKey(db, e.key()); return; } expireEntryFat *pfat = e.pfatentry(); dictEntry *de = dictFind(db->pdict, e.key()); robj *val = (robj*)dictGetVal(de); int deleted = 0; while (!pfat->FEmpty()) { if (pfat->nextExpireEntry().when > now) break; // Is it the full key expiration? if (pfat->nextExpireEntry().spsubkey == nullptr) { activeExpireCycleExpireFullKey(db, e.key()); return; } switch (val->type) { case OBJ_SET: if (setTypeRemove(val,pfat->nextExpireEntry().spsubkey.get())) { deleted++; if (setTypeSize(val) == 0) { activeExpireCycleExpireFullKey(db, e.key()); return; } } break; case OBJ_HASH: if (hashTypeDelete(val,(sds)pfat->nextExpireEntry().spsubkey.get())) { deleted++; if (hashTypeLength(val) == 0) { activeExpireCycleExpireFullKey(db, e.key()); return; } } break; case OBJ_ZSET: if (zsetDel(val,(sds)pfat->nextExpireEntry().spsubkey.get())) { deleted++; if (zsetLength(val) == 0) { activeExpireCycleExpireFullKey(db, e.key()); return; } } break; case OBJ_CRON: executeCronJobExpireHook(e.key(), val); return; case OBJ_LIST: default: serverAssert(false); } pfat->popfrontExpireEntry(); } if (deleted) { robj objT; switch (val->type) { case OBJ_SET: initStaticStringObject(objT, (char*)e.key()); signalModifiedKey(db,&objT); notifyKeyspaceEvent(NOTIFY_SET,"srem",&objT,db->id); break; } } if (pfat->FEmpty()) { robj *keyobj = createStringObject(e.key(),sdslen(e.key())); removeExpire(db, keyobj); decrRefCount(keyobj); } } int parseUnitString(const char *sz) { if (strcasecmp(sz, "s") == 0) return UNIT_SECONDS; if (strcasecmp(sz, "ms") == 0) return UNIT_MILLISECONDS; return -1; } void expireMemberCore(client *c, robj *key, robj *subkey, long long basetime, long long when, int unit) { switch (unit) { case UNIT_SECONDS: when *= 1000; case UNIT_MILLISECONDS: break; default: addReplyError(c, "Invalid unit arg"); return; } when += basetime; /* No key, return zero. */ robj *val = lookupKeyWriteOrReply(c, key, shared.czero); if (val == NULL) { return; } double dblT; switch (val->type) { case OBJ_SET: if (!setTypeIsMember(val, szFromObj(subkey))) { addReply(c,shared.czero); return; } break; case OBJ_HASH: if (!hashTypeExists(val, szFromObj(subkey))) { addReply(c,shared.czero); return; } break; case OBJ_ZSET: if (zsetScore(val, szFromObj(subkey), &dblT) == C_ERR) { addReply(c,shared.czero); return; } break; default: addReplyError(c, "object type is unsupported"); return; } setExpire(c, c->db, key, subkey, when); addReply(c, shared.cone); } void expireMemberCommand(client *c) { long long when; if (getLongLongFromObjectOrReply(c, c->argv[3], &when, NULL) != C_OK) return; if (c->argc > 5) { addReplyError(c, "Invalid number of arguments"); return; } int unit = UNIT_SECONDS; if (c->argc == 5) { unit = parseUnitString(szFromObj(c->argv[4])); } expireMemberCore(c, c->argv[1], c->argv[2], mstime(), when, unit); } void expireMemberAtCommand(client *c) { long long when; if (getLongLongFromObjectOrReply(c, c->argv[3], &when, NULL) != C_OK) return; expireMemberCore(c, c->argv[1], c->argv[2], 0, when, UNIT_SECONDS); } /* Try to expire a few timed out keys. The algorithm used is adaptive and * will use few CPU cycles if there are few expiring keys, otherwise * it will get more aggressive to avoid that too much memory is used by * keys that can be removed from the keyspace. * * No more than CRON_DBS_PER_CALL databases are tested at every * iteration. * * This kind of call is used when Redis detects that timelimit_exit is * true, so there is more work to do, and we do it more incrementally from * the beforeSleep() function of the event loop. * * Expire cycle type: * * If type is ACTIVE_EXPIRE_CYCLE_FAST the function will try to run a * "fast" expire cycle that takes no longer than EXPIRE_FAST_CYCLE_DURATION * microseconds, and is not repeated again before the same amount of time. * * If type is ACTIVE_EXPIRE_CYCLE_SLOW, that normal expire cycle is * executed, where the time limit is a percentage of the REDIS_HZ period * as specified by the ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC define. */ void activeExpireCycle(int type) { /* This function has some global state in order to continue the work * incrementally across calls. */ static unsigned int current_db = 0; /* Last DB tested. */ static int timelimit_exit = 0; /* Time limit hit in previous call? */ static long long last_fast_cycle = 0; /* When last fast cycle ran. */ int j, iteration = 0; int dbs_per_call = CRON_DBS_PER_CALL; long long start = ustime(), timelimit, elapsed; /* When clients are paused the dataset should be static not just from the * POV of clients not being able to write, but also from the POV of * expires and evictions of keys not being performed. */ if (clientsArePaused()) return; if (type == ACTIVE_EXPIRE_CYCLE_FAST) { /* Don't start a fast cycle if the previous cycle did not exit * for time limit. Also don't repeat a fast cycle for the same period * as the fast cycle total duration itself. */ if (!timelimit_exit) return; if (start < last_fast_cycle + ACTIVE_EXPIRE_CYCLE_FAST_DURATION*2) return; last_fast_cycle = start; } /* We usually should test CRON_DBS_PER_CALL per iteration, with * two exceptions: * * 1) Don't test more DBs than we have. * 2) If last time we hit the time limit, we want to scan all DBs * in this iteration, as there is work to do in some DB and we don't want * expired keys to use memory for too much time. */ if (dbs_per_call > cserver.dbnum || timelimit_exit) dbs_per_call = cserver.dbnum; /* We can use at max ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC percentage of CPU time * per iteration. Since this function gets called with a frequency of * g_pserver->hz times per second, the following is the max amount of * microseconds we can spend in this function. */ timelimit = 1000000*ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC/g_pserver->hz/100; timelimit_exit = 0; if (timelimit <= 0) timelimit = 1; if (type == ACTIVE_EXPIRE_CYCLE_FAST) timelimit = ACTIVE_EXPIRE_CYCLE_FAST_DURATION; /* in microseconds. */ /* Accumulate some global stats as we expire keys, to have some idea * about the number of keys that are already logically expired, but still * existing inside the database. */ long total_sampled = 0; long total_expired = 0; for (j = 0; j < dbs_per_call && timelimit_exit == 0; j++) { redisDb *db = g_pserver->db+(current_db % cserver.dbnum); /* Increment the DB now so we are sure if we run out of time * in the current DB we'll restart from the next. This allows to * distribute the time evenly across DBs. */ current_db++; long long now; iteration++; now = mstime(); /* If there is nothing to expire try next DB ASAP. */ if (db->setexpire->empty()) { db->avg_ttl = 0; db->last_expire_set = now; continue; } size_t expired = 0; size_t tried = 0; long long check = ACTIVE_EXPIRE_CYCLE_FAST_DURATION; // assume a check is roughly 1us. It isn't but good enough db->expireitr = db->setexpire->enumerate(db->expireitr, now, [&](expireEntry &e) __attribute__((always_inline)) { if (e.when() < now) { activeExpireCycleExpire(db, e, now); ++expired; } ++tried; if ((tried % ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP) == 0) { /* We can't block forever here even if there are many keys to * expire. So after a given amount of milliseconds return to the * caller waiting for the other active expire cycle. */ elapsed = ustime()-start; if (elapsed > timelimit) { timelimit_exit = 1; g_pserver->stat_expired_time_cap_reached_count++; return false; } check = ACTIVE_EXPIRE_CYCLE_FAST_DURATION; } return true; }, &check); total_expired += expired; } elapsed = ustime()-start; latencyAddSampleIfNeeded("expire-cycle",elapsed/1000); /* Update our estimate of keys existing but yet to be expired. * Running average with this sample accounting for 5%. */ double current_perc; if (total_sampled) { current_perc = (double)total_expired/total_sampled; } else current_perc = 0; g_pserver->stat_expired_stale_perc = (current_perc*0.05)+ (g_pserver->stat_expired_stale_perc*0.95); } /*----------------------------------------------------------------------------- * Expires of keys created in writable slaves * * Normally slaves do not process expires: they wait the masters to synthesize * DEL operations in order to retain consistency. However writable slaves are * an exception: if a key is created in the replica and an expire is assigned * to it, we need a way to expire such a key, since the master does not know * anything about such a key. * * In order to do so, we track keys created in the replica side with an expire * set, and call the expireSlaveKeys() function from time to time in order to * reclaim the keys if they already expired. * * Note that the use case we are trying to cover here, is a popular one where * slaves are put in writable mode in order to compute slow operations in * the replica side that are mostly useful to actually read data in a more * processed way. Think at sets intersections in a tmp key, with an expire so * that it is also used as a cache to avoid intersecting every time. * * This implementation is currently not perfect but a lot better than leaking * the keys as implemented in 3.2. *----------------------------------------------------------------------------*/ /* The dictionary where we remember key names and database ID of keys we may * want to expire from the replica. Since this function is not often used we * don't even care to initialize the database at startup. We'll do it once * the feature is used the first time, that is, when rememberSlaveKeyWithExpire() * is called. * * The dictionary has an SDS string representing the key as the hash table * key, while the value is a 64 bit unsigned integer with the bits corresponding * to the DB where the keys may exist set to 1. Currently the keys created * with a DB id > 63 are not expired, but a trivial fix is to set the bitmap * to the max 64 bit unsigned value when we know there is a key with a DB * ID greater than 63, and check all the configured DBs in such a case. */ dict *slaveKeysWithExpire = NULL; /* Check the set of keys created by the master with an expire set in order to * check if they should be evicted. */ void expireSlaveKeys(void) { if (slaveKeysWithExpire == NULL || dictSize(slaveKeysWithExpire) == 0) return; int cycles = 0, noexpire = 0; mstime_t start = mstime(); while(1) { dictEntry *de = dictGetRandomKey(slaveKeysWithExpire); sds keyname = (sds)dictGetKey(de); uint64_t dbids = dictGetUnsignedIntegerVal(de); uint64_t new_dbids = 0; /* Check the key against every database corresponding to the * bits set in the value bitmap. */ int dbid = 0; while(dbids && dbid < cserver.dbnum) { if ((dbids & 1) != 0) { redisDb *db = g_pserver->db+dbid; // the expire is hashed based on the key pointer, so we need the point in the main db dictEntry *deMain = dictFind(db->pdict, keyname); auto itr = db->setexpire->end(); if (deMain != nullptr) itr = db->setexpire->find((sds)dictGetKey(deMain)); int expired = 0; if (itr != db->setexpire->end()) { if (itr->when() < start) { activeExpireCycleExpire(g_pserver->db+dbid,*itr,start); expired = 1; } } /* If the key was not expired in this DB, we need to set the * corresponding bit in the new bitmap we set as value. * At the end of the loop if the bitmap is zero, it means we * no longer need to keep track of this key. */ if (itr != db->setexpire->end() && !expired) { noexpire++; new_dbids |= (uint64_t)1 << dbid; } } dbid++; dbids >>= 1; } /* Set the new bitmap as value of the key, in the dictionary * of keys with an expire set directly in the writable replica. Otherwise * if the bitmap is zero, we no longer need to keep track of it. */ if (new_dbids) dictSetUnsignedIntegerVal(de,new_dbids); else dictDelete(slaveKeysWithExpire,keyname); /* Stop conditions: found 3 keys we cna't expire in a row or * time limit was reached. */ cycles++; if (noexpire > 3) break; if ((cycles % 64) == 0 && mstime()-start > 1) break; if (dictSize(slaveKeysWithExpire) == 0) break; } } /* Track keys that received an EXPIRE or similar command in the context * of a writable replica. */ void rememberSlaveKeyWithExpire(redisDb *db, robj *key) { if (slaveKeysWithExpire == NULL) { static dictType dt = { dictSdsHash, /* hash function */ NULL, /* key dup */ NULL, /* val dup */ dictSdsKeyCompare, /* key compare */ dictSdsDestructor, /* key destructor */ NULL /* val destructor */ }; slaveKeysWithExpire = dictCreate(&dt,NULL); } if (db->id > 63) return; dictEntry *de = dictAddOrFind(slaveKeysWithExpire,ptrFromObj(key)); /* If the entry was just created, set it to a copy of the SDS string * representing the key: we don't want to need to take those keys * in sync with the main DB. The keys will be removed by expireSlaveKeys() * as it scans to find keys to remove. */ if (de->key == ptrFromObj(key)) { de->key = sdsdup(szFromObj(key)); dictSetUnsignedIntegerVal(de,0); } uint64_t dbids = dictGetUnsignedIntegerVal(de); dbids |= (uint64_t)1 << db->id; dictSetUnsignedIntegerVal(de,dbids); } /* Return the number of keys we are tracking. */ size_t getSlaveKeyWithExpireCount(void) { if (slaveKeysWithExpire == NULL) return 0; return dictSize(slaveKeysWithExpire); } /* Remove the keys in the hash table. We need to do that when data is * flushed from the g_pserver-> We may receive new keys from the master with * the same name/db and it is no longer a good idea to expire them. * * Note: technically we should handle the case of a single DB being flushed * but it is not worth it since anyway race conditions using the same set * of key names in a wriatable replica and in its master will lead to * inconsistencies. This is just a best-effort thing we do. */ void flushSlaveKeysWithExpireList(void) { if (slaveKeysWithExpire) { dictRelease(slaveKeysWithExpire); slaveKeysWithExpire = NULL; } } /*----------------------------------------------------------------------------- * Expires Commands *----------------------------------------------------------------------------*/ /* This is the generic command implementation for EXPIRE, PEXPIRE, EXPIREAT * and PEXPIREAT. Because the commad second argument may be relative or absolute * the "basetime" argument is used to signal what the base time is (either 0 * for *AT variants of the command, or the current time for relative expires). * * unit is either UNIT_SECONDS or UNIT_MILLISECONDS, and is only used for * the argv[2] parameter. The basetime is always specified in milliseconds. */ void expireGenericCommand(client *c, long long basetime, int unit) { robj *key = c->argv[1], *param = c->argv[2]; long long when; /* unix time in milliseconds when the key will expire. */ if (getLongLongFromObjectOrReply(c, param, &when, NULL) != C_OK) return; if (unit == UNIT_SECONDS) when *= 1000; when += basetime; /* No key, return zero. */ if (lookupKeyWrite(c->db,key) == NULL) { addReply(c,shared.czero); return; } /* EXPIRE with negative TTL, or EXPIREAT with a timestamp into the past * should never be executed as a DEL when load the AOF or in the context * of a replica instance. * * Instead we take the other branch of the IF statement setting an expire * (possibly in the past) and wait for an explicit DEL from the master. */ if (when <= mstime() && !g_pserver->loading && (!listLength(g_pserver->masters) || g_pserver->fActiveReplica)) { robj *aux; int deleted = g_pserver->lazyfree_lazy_expire ? dbAsyncDelete(c->db,key) : dbSyncDelete(c->db,key); serverAssertWithInfo(c,key,deleted); g_pserver->dirty++; /* Replicate/AOF this as an explicit DEL or UNLINK. */ aux = g_pserver->lazyfree_lazy_expire ? shared.unlink : shared.del; rewriteClientCommandVector(c,2,aux,key); signalModifiedKey(c->db,key); notifyKeyspaceEvent(NOTIFY_GENERIC,"del",key,c->db->id); addReply(c, shared.cone); return; } else { setExpire(c,c->db,key,nullptr,when); addReply(c,shared.cone); signalModifiedKey(c->db,key); notifyKeyspaceEvent(NOTIFY_GENERIC,"expire",key,c->db->id); g_pserver->dirty++; return; } } /* EXPIRE key seconds */ void expireCommand(client *c) { expireGenericCommand(c,mstime(),UNIT_SECONDS); } /* EXPIREAT key time */ void expireatCommand(client *c) { expireGenericCommand(c,0,UNIT_SECONDS); } /* PEXPIRE key milliseconds */ void pexpireCommand(client *c) { expireGenericCommand(c,mstime(),UNIT_MILLISECONDS); } /* PEXPIREAT key ms_time */ void pexpireatCommand(client *c) { expireGenericCommand(c,0,UNIT_MILLISECONDS); } /* Implements TTL and PTTL */ void ttlGenericCommand(client *c, int output_ms) { long long expire = -1, ttl = -1; /* If the key does not exist at all, return -2 */ if (lookupKeyReadWithFlags(c->db,c->argv[1],LOOKUP_NOTOUCH) == nullptr) { addReplyLongLong(c,-2); return; } /* The key exists. Return -1 if it has no expire, or the actual * TTL value otherwise. */ expireEntry *pexpire = getExpire(c->db,c->argv[1]); if (c->argc == 2) { // primary expire if (pexpire != nullptr) pexpire->FGetPrimaryExpire(&expire); } else if (c->argc == 3) { // We want a subkey expire if (pexpire && pexpire->FFat()) { for (auto itr : *pexpire) { if (itr.subkey() == nullptr) continue; if (sdscmp((sds)itr.subkey(), szFromObj(c->argv[2])) == 0) { expire = itr.when(); break; } } } } else { addReplyError(c, "Invalid arguments"); return; } if (expire != -1) { ttl = expire-mstime(); if (ttl < 0) ttl = 0; } if (ttl == -1) { addReplyLongLong(c,-1); } else { addReplyLongLong(c,output_ms ? ttl : ((ttl+500)/1000)); } } /* TTL key */ void ttlCommand(client *c) { ttlGenericCommand(c, 0); } /* PTTL key */ void pttlCommand(client *c) { ttlGenericCommand(c, 1); } /* PERSIST key */ void persistCommand(client *c) { if (lookupKeyWrite(c->db,c->argv[1])) { if (c->argc == 2) { if (removeExpire(c->db,c->argv[1])) { addReply(c,shared.cone); g_pserver->dirty++; } else { addReply(c,shared.czero); } } else if (c->argc == 3) { if (removeSubkeyExpire(c->db, c->argv[1], c->argv[2])) { addReply(c,shared.cone); g_pserver->dirty++; } else { addReply(c,shared.czero); } } else { addReplyError(c, "Invalid arguments"); } } else { addReply(c,shared.czero); } } /* TOUCH key1 [key2 key3 ... keyN] */ void touchCommand(client *c) { int touched = 0; for (int j = 1; j < c->argc; j++) if (lookupKeyRead(c->db,c->argv[j]) != nullptr) touched++; addReplyLongLong(c,touched); }