Merge branch 'keydbpro' into PRO_RELEASE_6
Former-commit-id: ed98be0ba81ffdc501847ea0d2486f5f01391319
This commit is contained in:
commit
dcf607622c
@ -1,6 +1,5 @@
|
||||

|
||||

|
||||
[](https://gitter.im/KeyDB/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
||||
[](https://stackshare.io/eq-alpha-technology-inc/eq-alpha-technology-inc)
|
||||
|
||||
##### New! Want to extend KeyDB with Javascript? Try [ModJS](https://github.com/JohnSully/ModJS)
|
||||
|
5
deps/Makefile
vendored
5
deps/Makefile
vendored
@ -1,6 +1,7 @@
|
||||
# Redis dependency Makefile
|
||||
|
||||
uname_S:= $(shell sh -c 'uname -s 2>/dev/null || echo not')
|
||||
uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
|
||||
|
||||
CCCOLOR="\033[34m"
|
||||
LINKCOLOR="\033[34;1m"
|
||||
@ -94,6 +95,10 @@ jemalloc: .make-prerequisites
|
||||
|
||||
rocksdb: .make-prerequisites
|
||||
@printf '%b %b\n' $(MAKECOLOR)MAKE$(ENDCOLOR) $(BINCOLOR)$@$(ENDCOLOR)
|
||||
ifeq ($(uname_M),x86_64)
|
||||
cd rocksdb && PORTABLE=1 USE_SSE=1 FORCE_SSE42=1 $(MAKE) static_lib
|
||||
else
|
||||
cd rocksdb && PORTABLE=1 $(MAKE) static_lib
|
||||
endif
|
||||
|
||||
.PHONY: rocksdb
|
||||
|
@ -45,7 +45,7 @@ void AsyncWorkQueue::WorkerThreadMain()
|
||||
ProcessPendingAsyncWrites();
|
||||
aeReleaseLock();
|
||||
g_pserver->garbageCollector.endEpoch(serverTL->gcEpoch);
|
||||
serverTL->gcEpoch = 0;
|
||||
serverTL->gcEpoch.reset();
|
||||
}
|
||||
|
||||
listRelease(vars.clients_pending_asyncwrite);
|
||||
|
@ -47,6 +47,11 @@ endif
|
||||
|
||||
USEASM?=true
|
||||
|
||||
ifeq ($(NOMVCC),)
|
||||
CFLAGS+= -DENABLE_MVCC
|
||||
CXXFLAGS+= -DENABLE_MVCC
|
||||
endif
|
||||
|
||||
ifneq ($(SANITIZE),)
|
||||
CFLAGS+= -fsanitize=$(SANITIZE) -DSANITIZE
|
||||
CXXFLAGS+= -fsanitize=$(SANITIZE) -DSANITIZE
|
||||
|
@ -261,9 +261,11 @@ int aeCreateRemoteFileEvent(aeEventLoop *eventLoop, int fd, int mask,
|
||||
|
||||
if (fSynchronous)
|
||||
{
|
||||
{
|
||||
std::unique_lock<std::mutex> ulock(cmd.pctl->mutexcv, std::adopt_lock);
|
||||
cmd.pctl->cv.wait(ulock);
|
||||
ret = cmd.pctl->rval;
|
||||
}
|
||||
delete cmd.pctl;
|
||||
}
|
||||
|
||||
@ -315,9 +317,11 @@ int aePostFunction(aeEventLoop *eventLoop, std::function<void()> fn, bool fSynch
|
||||
int ret = AE_OK;
|
||||
if (fSynchronous)
|
||||
{
|
||||
{
|
||||
std::unique_lock<std::mutex> ulock(cmd.pctl->mutexcv, std::adopt_lock);
|
||||
cmd.pctl->cv.wait(ulock);
|
||||
ret = cmd.pctl->rval;
|
||||
}
|
||||
delete cmd.pctl;
|
||||
}
|
||||
return ret;
|
||||
|
@ -4920,9 +4920,11 @@ void createDumpPayload(rio *payload, robj_roptr o, robj *key) {
|
||||
rioInitWithBuffer(payload,sdsempty());
|
||||
serverAssert(rdbSaveObjectType(payload,o));
|
||||
serverAssert(rdbSaveObject(payload,o,key));
|
||||
#ifdef ENABLE_MVCC
|
||||
char szT[32];
|
||||
snprintf(szT, 32, "%" PRIu64, o->mvcc_tstamp);
|
||||
serverAssert(rdbSaveAuxFieldStrStr(payload,"mvcc-tstamp", szT) != -1);
|
||||
#endif
|
||||
|
||||
/* Write the footer; this is what it looks like:
|
||||
* ----------------+---------------------+---------------+
|
||||
@ -5064,9 +5066,11 @@ void restoreCommand(client *c) {
|
||||
decrRefCount(auxkey);
|
||||
goto eoferr;
|
||||
}
|
||||
#ifdef ENABLE_MVCC
|
||||
if (strcasecmp(szFromObj(auxkey), "mvcc-tstamp") == 0) {
|
||||
obj->mvcc_tstamp = strtoull(szFromObj(auxval), nullptr, 10);
|
||||
}
|
||||
#endif
|
||||
decrRefCount(auxkey);
|
||||
decrRefCount(auxval);
|
||||
}
|
||||
|
45
src/db.cpp
45
src/db.cpp
@ -91,7 +91,9 @@ static robj* lookupKey(redisDb *db, robj *key, int flags) {
|
||||
robj *val = itr.val();
|
||||
lookupKeyUpdateObj(val, flags);
|
||||
if (flags & LOOKUP_UPDATEMVCC) {
|
||||
#ifdef ENABLE_MVCC
|
||||
val->mvcc_tstamp = getMvccTstamp();
|
||||
#endif
|
||||
db->trackkey(key, true /* fUpdate */);
|
||||
}
|
||||
return val;
|
||||
@ -218,8 +220,10 @@ robj *lookupKeyWriteOrReply(client *c, robj *key, robj *reply) {
|
||||
bool dbAddCore(redisDb *db, robj *key, robj *val, bool fAssumeNew = false) {
|
||||
serverAssert(!val->FExpires());
|
||||
sds copy = sdsdupshared(szFromObj(key));
|
||||
#ifdef ENABLE_MVCC
|
||||
if (g_pserver->fActiveReplica)
|
||||
val->mvcc_tstamp = key->mvcc_tstamp = getMvccTstamp();
|
||||
#endif
|
||||
|
||||
bool fInserted = db->insert(copy, val, fAssumeNew);
|
||||
|
||||
@ -270,7 +274,9 @@ void redisDb::dbOverwriteCore(redisDb::iter itr, robj *key, robj *val, bool fUpd
|
||||
if (fUpdateMvcc) {
|
||||
if (val->getrefcount(std::memory_order_relaxed) == OBJ_SHARED_REFCOUNT)
|
||||
val = dupStringObject(val);
|
||||
#ifdef ENABLE_MVCC
|
||||
val->mvcc_tstamp = getMvccTstamp();
|
||||
#endif
|
||||
}
|
||||
|
||||
if (g_pserver->lazyfree_lazy_server_del)
|
||||
@ -303,13 +309,15 @@ int dbMerge(redisDb *db, robj *key, robj *val, int fReplace)
|
||||
if (itr == nullptr)
|
||||
return (dbAddCore(db, key, val) == true);
|
||||
|
||||
#ifdef ENABLE_MVCC
|
||||
robj *old = itr.val();
|
||||
if (old->mvcc_tstamp <= val->mvcc_tstamp)
|
||||
{
|
||||
db->dbOverwriteCore(itr, key, val, false, true);
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
return false;
|
||||
}
|
||||
else
|
||||
@ -330,6 +338,7 @@ int dbMerge(redisDb *db, robj *key, robj *val, int fReplace)
|
||||
* The client 'c' argument may be set to NULL if the operation is performed
|
||||
* in a context where there is no clear client performing the operation. */
|
||||
void genericSetKey(client *c, redisDb *db, robj *key, robj *val, int keepttl, int signal) {
|
||||
db->prepOverwriteForSnapshot(szFromObj(key));
|
||||
if (!dbAddCore(db, key, val)) {
|
||||
dbOverwrite(db, key, val, !keepttl);
|
||||
}
|
||||
@ -421,8 +430,9 @@ bool redisDbPersistentData::syncDelete(robj *key)
|
||||
auto itr = m_pdbSnapshot->find_cached_threadsafe(szFromObj(key));
|
||||
if (itr != nullptr)
|
||||
{
|
||||
sds keyTombstone = sdsdup(szFromObj(key));
|
||||
if (dictAdd(m_pdictTombstone, keyTombstone, nullptr) != DICT_OK)
|
||||
sds keyTombstone = sdsdupshared(itr.key());
|
||||
uint64_t hash = dictGetHash(m_pdict, keyTombstone);
|
||||
if (dictAdd(m_pdictTombstone, keyTombstone, (void*)hash) != DICT_OK)
|
||||
sdsfree(keyTombstone);
|
||||
}
|
||||
}
|
||||
@ -2290,7 +2300,7 @@ void redisDbPersistentData::initialize()
|
||||
{
|
||||
m_pdbSnapshot = nullptr;
|
||||
m_pdict = dictCreate(&dbDictType,this);
|
||||
m_pdictTombstone = dictCreate(&dbDictType,this);
|
||||
m_pdictTombstone = dictCreate(&dbTombstoneDictType,this);
|
||||
m_setexpire = new(MALLOC_LOCAL) expireset();
|
||||
m_fAllChanged = 0;
|
||||
m_fTrackingChanges = 0;
|
||||
@ -2349,6 +2359,24 @@ bool redisDbPersistentData::insert(char *key, robj *o, bool fAssumeNew)
|
||||
return (res == DICT_OK);
|
||||
}
|
||||
|
||||
// This is a performance tool to prevent us copying over an object we're going to overwrite anyways
|
||||
void redisDbPersistentData::prepOverwriteForSnapshot(char *key)
|
||||
{
|
||||
if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LFU)
|
||||
return;
|
||||
|
||||
if (m_pdbSnapshot != nullptr)
|
||||
{
|
||||
auto itr = m_pdbSnapshot->find_cached_threadsafe(key);
|
||||
if (itr.key() != nullptr)
|
||||
{
|
||||
sds keyNew = sdsdupshared(itr.key());
|
||||
if (dictAdd(m_pdictTombstone, keyNew, (void*)dictHashKey(m_pdict, key)) != DICT_OK)
|
||||
sdsfree(keyNew);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void redisDbPersistentData::tryResize()
|
||||
{
|
||||
if (htNeedsResize(m_pdict))
|
||||
@ -2470,15 +2498,20 @@ void redisDbPersistentData::ensure(const char *sdsKey, dictEntry **pde)
|
||||
sdsfree(strT);
|
||||
dictAdd(m_pdict, keyNew, objNew);
|
||||
serverAssert(objNew->getrefcount(std::memory_order_relaxed) == 1);
|
||||
#ifdef ENABLE_MVCC
|
||||
serverAssert(objNew->mvcc_tstamp == itr.val()->mvcc_tstamp);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
dictAdd(m_pdict, keyNew, nullptr);
|
||||
}
|
||||
*pde = dictFind(m_pdict, sdsKey);
|
||||
dictAdd(m_pdictTombstone, sdsdupshared(itr.key()), nullptr);
|
||||
uint64_t hash = dictGetHash(m_pdict, sdsKey);
|
||||
dictEntry **deT;
|
||||
dictht *ht;
|
||||
*pde = dictFindWithPrev(m_pdict, sdsKey, hash, &deT, &ht);
|
||||
dictAdd(m_pdictTombstone, sdsdupshared(itr.key()), (void*)hash);
|
||||
}
|
||||
}
|
||||
|
||||
|
92
src/dict.cpp
92
src/dict.cpp
@ -179,6 +179,9 @@ int dictExpand(dict *d, unsigned long size)
|
||||
|
||||
int dictMerge(dict *dst, dict *src)
|
||||
{
|
||||
#define MERGE_BLOCK_SIZE 4
|
||||
dictEntry *rgdeT[MERGE_BLOCK_SIZE];
|
||||
|
||||
assert(dst != src);
|
||||
if (dictSize(src) == 0)
|
||||
return DICT_OK;
|
||||
@ -197,6 +200,8 @@ int dictMerge(dict *dst, dict *src)
|
||||
std::swap(dst->iterators, src->iterators);
|
||||
}
|
||||
|
||||
src->rehashidx = -1;
|
||||
|
||||
if (!dictIsRehashing(dst) && !dictIsRehashing(src))
|
||||
{
|
||||
if (dst->ht[0].size >= src->ht[0].size)
|
||||
@ -210,6 +215,50 @@ int dictMerge(dict *dst, dict *src)
|
||||
}
|
||||
_dictReset(&src->ht[0]);
|
||||
dst->rehashidx = 0;
|
||||
assert(dictIsRehashing(dst));
|
||||
assert((dictSize(src)+dictSize(dst)) == expectedSize);
|
||||
return DICT_OK;
|
||||
}
|
||||
|
||||
if (!dictIsRehashing(src) && dictSize(src) > 0 &&
|
||||
(src->ht[0].size == dst->ht[0].size || src->ht[0].size == dst->ht[1].size))
|
||||
{
|
||||
auto &htDst = (src->ht[0].size == dst->ht[0].size) ? dst->ht[0] : dst->ht[1];
|
||||
|
||||
assert(src->ht[0].size == htDst.size);
|
||||
for (size_t ide = 0; ide < src->ht[0].size; ide += MERGE_BLOCK_SIZE)
|
||||
{
|
||||
if (src->ht[0].used == 0)
|
||||
break;
|
||||
|
||||
for (int dde = 0; dde < MERGE_BLOCK_SIZE; ++dde) {
|
||||
rgdeT[dde] = src->ht[0].table[ide + dde];
|
||||
src->ht[0].table[ide + dde] = nullptr;
|
||||
}
|
||||
|
||||
for (;;) {
|
||||
bool fAnyFound = false;
|
||||
for (int dde = 0; dde < MERGE_BLOCK_SIZE; ++dde) {
|
||||
if (rgdeT[dde] == nullptr)
|
||||
continue;
|
||||
dictEntry *deNext = rgdeT[dde]->next;
|
||||
rgdeT[dde]->next = htDst.table[ide+dde];
|
||||
htDst.table[ide+dde] = rgdeT[dde];
|
||||
rgdeT[dde] = deNext;
|
||||
htDst.used++;
|
||||
src->ht[0].used--;
|
||||
|
||||
fAnyFound = fAnyFound || (deNext != nullptr);
|
||||
}
|
||||
|
||||
if (!fAnyFound)
|
||||
break;
|
||||
}
|
||||
}
|
||||
// If we copied to the base hash table of a rehashing dst, reset the rehash
|
||||
if (dictIsRehashing(dst) && src->ht[0].size == dst->ht[0].size)
|
||||
dst->rehashidx = 0;
|
||||
assert(dictSize(src) == 0);
|
||||
assert((dictSize(src)+dictSize(dst)) == expectedSize);
|
||||
return DICT_OK;
|
||||
}
|
||||
@ -218,10 +267,34 @@ int dictMerge(dict *dst, dict *src)
|
||||
auto &htDst = dictIsRehashing(dst) ? dst->ht[1] : dst->ht[0];
|
||||
for (int iht = 0; iht < 2; ++iht)
|
||||
{
|
||||
for (size_t ide = 0; ide < src->ht[iht].size; ++ide)
|
||||
for (size_t ide = 0; ide < src->ht[iht].size; ide += MERGE_BLOCK_SIZE)
|
||||
{
|
||||
if (src->ht[iht].used == 0)
|
||||
break;
|
||||
|
||||
for (int dde = 0; dde < MERGE_BLOCK_SIZE; ++dde) {
|
||||
rgdeT[dde] = src->ht[iht].table[ide + dde];
|
||||
src->ht[iht].table[ide + dde] = nullptr;
|
||||
}
|
||||
|
||||
for (;;) {
|
||||
bool fAnyFound = false;
|
||||
for (int dde = 0; dde < MERGE_BLOCK_SIZE; ++dde) {
|
||||
if (rgdeT[dde] == nullptr)
|
||||
continue;
|
||||
uint64_t h = dictHashKey(dst, rgdeT[dde]->key) & htDst.sizemask;
|
||||
dictEntry *deNext = rgdeT[dde]->next;
|
||||
rgdeT[dde]->next = htDst.table[h];
|
||||
htDst.table[h] = rgdeT[dde];
|
||||
rgdeT[dde] = deNext;
|
||||
htDst.used++;
|
||||
src->ht[iht].used--;
|
||||
fAnyFound = fAnyFound || (deNext != nullptr);
|
||||
}
|
||||
if (!fAnyFound)
|
||||
break;
|
||||
}
|
||||
#if 0
|
||||
dictEntry *de = src->ht[iht].table[ide];
|
||||
src->ht[iht].table[ide] = nullptr;
|
||||
while (de != nullptr)
|
||||
@ -236,6 +309,7 @@ int dictMerge(dict *dst, dict *src)
|
||||
de = deNext;
|
||||
src->ht[iht].used--;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
assert((dictSize(src)+dictSize(dst)) == expectedSize);
|
||||
@ -326,7 +400,7 @@ int dictRehashMilliseconds(dict *d, int ms) {
|
||||
static void _dictRehashStep(dict *d) {
|
||||
unsigned long iterators;
|
||||
__atomic_load(&d->iterators, &iterators, __ATOMIC_RELAXED);
|
||||
if (iterators == 0) dictRehash(d,1);
|
||||
if (iterators == 0) dictRehash(d,2);
|
||||
}
|
||||
|
||||
/* Add an element to the target hash table */
|
||||
@ -541,21 +615,20 @@ void dictRelease(dict *d)
|
||||
zfree(d);
|
||||
}
|
||||
|
||||
dictEntry *dictFindWithPrev(dict *d, const void *key, dictEntry ***dePrevPtr, dictht **pht)
|
||||
dictEntry *dictFindWithPrev(dict *d, const void *key, uint64_t h, dictEntry ***dePrevPtr, dictht **pht, bool fShallowCompare)
|
||||
{
|
||||
dictEntry *he;
|
||||
uint64_t h, idx, table;
|
||||
uint64_t idx, table;
|
||||
|
||||
if (dictSize(d) == 0) return NULL; /* dict is empty */
|
||||
if (dictIsRehashing(d)) _dictRehashStep(d);
|
||||
h = dictHashKey(d, key);
|
||||
for (table = 0; table <= 1; table++) {
|
||||
*pht = d->ht + table;
|
||||
idx = h & d->ht[table].sizemask;
|
||||
he = d->ht[table].table[idx];
|
||||
*dePrevPtr = &d->ht[table].table[idx];
|
||||
while(he) {
|
||||
if (key==he->key || dictCompareKeys(d, key, he->key)) {
|
||||
if (key==he->key || (!fShallowCompare && dictCompareKeys(d, key, he->key))) {
|
||||
return he;
|
||||
}
|
||||
*dePrevPtr = &he->next;
|
||||
@ -570,7 +643,8 @@ dictEntry *dictFind(dict *d, const void *key)
|
||||
{
|
||||
dictEntry **deT;
|
||||
dictht *ht;
|
||||
return dictFindWithPrev(d, key, &deT, &ht);
|
||||
uint64_t h = dictHashKey(d, key);
|
||||
return dictFindWithPrev(d, key, h, &deT, &ht);
|
||||
}
|
||||
|
||||
void *dictFetchValue(dict *d, const void *key) {
|
||||
@ -1220,7 +1294,9 @@ void dictGetStats(char *buf, size_t bufsize, dict *d) {
|
||||
|
||||
void dictForceRehash(dict *d)
|
||||
{
|
||||
while (dictIsRehashing(d)) _dictRehashStep(d);
|
||||
unsigned long iterators;
|
||||
__atomic_load(&d->iterators, &iterators, __ATOMIC_RELAXED);
|
||||
while (iterators == 0 && dictIsRehashing(d)) _dictRehashStep(d);
|
||||
}
|
||||
|
||||
/* ------------------------------- Benchmark ---------------------------------*/
|
||||
|
@ -167,7 +167,7 @@ dictEntry *dictUnlink(dict *ht, const void *key);
|
||||
void dictFreeUnlinkedEntry(dict *d, dictEntry *he);
|
||||
void dictRelease(dict *d);
|
||||
dictEntry * dictFind(dict *d, const void *key);
|
||||
dictEntry * dictFindWithPrev(dict *d, const void *key, dictEntry ***dePrevPtr, dictht **ht);
|
||||
dictEntry * dictFindWithPrev(dict *d, const void *key, uint64_t h, dictEntry ***dePrevPtr, dictht **ht, bool fShallowCompare = false);
|
||||
void *dictFetchValue(dict *d, const void *key);
|
||||
int dictResize(dict *d);
|
||||
dictIterator *dictGetIterator(dict *d);
|
||||
|
6
src/gc.h
6
src/gc.h
@ -3,6 +3,12 @@
|
||||
#include <assert.h>
|
||||
#include <unordered_set>
|
||||
|
||||
/* Base interface for objects that are owned and eventually destroyed through
 * an ICollectable pointer. The virtual destructor makes such deletion safe
 * for derived types. */
struct ICollectable
{
    virtual ~ICollectable() = default;

    /* Debug query; the base implementation always answers false.
     * Declared const so it can be called through a const pointer or
     * reference. */
    bool FWillFreeChildDebug() const { return false; }
};
|
||||
|
||||
template<typename T>
|
||||
class GarbageCollector
|
||||
{
|
||||
|
@ -62,7 +62,10 @@ bool redisDbPersistentData::asyncDelete(robj *key) {
|
||||
dictEntry *de = dictUnlink(m_pdict,ptrFromObj(key));
|
||||
if (de) {
|
||||
if (m_pdbSnapshot != nullptr && m_pdbSnapshot->find_cached_threadsafe(szFromObj(key)) != nullptr)
|
||||
dictAdd(m_pdictTombstone, sdsdup((sds)dictGetKey(de)), nullptr);
|
||||
{
|
||||
uint64_t hash = dictGetHash(m_pdict, szFromObj(key));
|
||||
dictAdd(m_pdictTombstone, sdsdup((sds)dictGetKey(de)), (void*)hash);
|
||||
}
|
||||
|
||||
robj *val = (robj*)dictGetVal(de);
|
||||
if (val->FExpires())
|
||||
|
@ -46,7 +46,9 @@ robj *createObject(int type, void *ptr) {
|
||||
o->encoding = OBJ_ENCODING_RAW;
|
||||
o->m_ptr = ptr;
|
||||
o->setrefcount(1);
|
||||
#ifdef ENABLE_MVCC
|
||||
o->mvcc_tstamp = OBJ_MVCC_INVALID;
|
||||
#endif
|
||||
|
||||
/* Set the LRU to the current lruclock (minutes resolution), or
|
||||
* alternatively the LFU counter. */
|
||||
@ -92,6 +94,7 @@ robj *createRawStringObject(const char *ptr, size_t len) {
|
||||
* an object where the sds string is actually an unmodifiable string
|
||||
* allocated in the same chunk as the object itself. */
|
||||
robj *createEmbeddedStringObject(const char *ptr, size_t len) {
|
||||
serverAssert(len <= UINT8_MAX);
|
||||
size_t allocsize = sizeof(struct sdshdr8)+len+1;
|
||||
if (allocsize < sizeof(void*))
|
||||
allocsize = sizeof(void*);
|
||||
@ -101,7 +104,9 @@ robj *createEmbeddedStringObject(const char *ptr, size_t len) {
|
||||
o->type = OBJ_STRING;
|
||||
o->encoding = OBJ_ENCODING_EMBSTR;
|
||||
o->setrefcount(1);
|
||||
#ifdef ENABLE_MVCC
|
||||
o->mvcc_tstamp = OBJ_MVCC_INVALID;
|
||||
#endif
|
||||
|
||||
if (g_pserver->maxmemory_policy & MAXMEMORY_FLAG_LFU) {
|
||||
o->lru = (LFUGetTimeInMinutes()<<8) | LFU_INIT_VAL;
|
||||
@ -129,7 +134,12 @@ robj *createEmbeddedStringObject(const char *ptr, size_t len) {
|
||||
*
|
||||
* The current limit (48 with ENABLE_MVCC, 56 otherwise) is chosen so that the biggest string object
|
||||
* we allocate as EMBSTR will still fit into the 64 byte arena of jemalloc. */
|
||||
#ifdef ENABLE_MVCC
|
||||
#define OBJ_ENCODING_EMBSTR_SIZE_LIMIT 48
|
||||
#else
|
||||
#define OBJ_ENCODING_EMBSTR_SIZE_LIMIT 56
|
||||
#endif
|
||||
|
||||
static_assert((sizeof(redisObject)+OBJ_ENCODING_EMBSTR_SIZE_LIMIT-8) == 64, "Max EMBSTR obj should be 64 bytes total");
|
||||
robj *createStringObject(const char *ptr, size_t len) {
|
||||
if (len <= OBJ_ENCODING_EMBSTR_SIZE_LIMIT)
|
||||
@ -1316,10 +1326,12 @@ NULL
|
||||
* because we update the access time only
|
||||
* when the key is read or overwritten. */
|
||||
addReplyLongLong(c,LFUDecrAndReturn(o.unsafe_robjcast()));
|
||||
#ifdef ENABLE_MVCC
|
||||
} else if (!strcasecmp(szFromObj(c->argv[1]), "lastmodified") && c->argc == 3) {
|
||||
if ((o = objectCommandLookupOrReply(c,c->argv[2],shared.null[c->resp]))
|
||||
== nullptr) return;
|
||||
addReplyLongLong(c, (g_pserver->mstime - (o->mvcc_tstamp >> MVCC_MS_SHIFT)) / 1000);
|
||||
#endif
|
||||
} else {
|
||||
addReplySubcommandSyntaxError(c);
|
||||
}
|
||||
@ -1579,9 +1591,11 @@ robj *deserializeStoredObjectCore(const void *data, size_t cb)
|
||||
decrRefCount(auxkey);
|
||||
goto eoferr;
|
||||
}
|
||||
#ifdef ENABLE_MVCC
|
||||
if (strcasecmp(szFromObj(auxkey), "mvcc-tstamp") == 0) {
|
||||
obj->mvcc_tstamp = strtoull(szFromObj(auxval), nullptr, 10);
|
||||
}
|
||||
#endif
|
||||
decrRefCount(auxkey);
|
||||
decrRefCount(auxval);
|
||||
}
|
||||
|
26
src/rdb.cpp
26
src/rdb.cpp
@ -349,20 +349,24 @@ writeerr:
|
||||
}
|
||||
|
||||
ssize_t rdbSaveLzfStringObject(rio *rdb, const unsigned char *s, size_t len) {
|
||||
char rgbuf[2048];
|
||||
size_t comprlen, outlen;
|
||||
void *out;
|
||||
void *out = rgbuf;
|
||||
|
||||
/* We require at least four bytes compression for this to be worth it */
|
||||
if (len <= 4) return 0;
|
||||
outlen = len-4;
|
||||
if ((out = zmalloc(outlen+1, MALLOC_LOCAL)) == NULL) return 0;
|
||||
if (outlen >= sizeof(rgbuf))
|
||||
if ((out = zmalloc(outlen+1, MALLOC_LOCAL)) == NULL) return 0;
|
||||
comprlen = lzf_compress(s, len, out, outlen);
|
||||
if (comprlen == 0) {
|
||||
zfree(out);
|
||||
if (out != rgbuf)
|
||||
zfree(out);
|
||||
return 0;
|
||||
}
|
||||
ssize_t nwritten = rdbSaveLzfBlob(rdb, out, comprlen, len);
|
||||
zfree(out);
|
||||
if (out != rgbuf)
|
||||
zfree(out);
|
||||
return nwritten;
|
||||
}
|
||||
|
||||
@ -1092,8 +1096,12 @@ int rdbSaveKeyValuePair(rio *rdb, robj_roptr key, robj_roptr val, const expireEn
|
||||
}
|
||||
|
||||
char szT[32];
|
||||
snprintf(szT, 32, "%" PRIu64, val->mvcc_tstamp);
|
||||
if (rdbSaveAuxFieldStrStr(rdb,"mvcc-tstamp", szT) == -1) return -1;
|
||||
#ifdef ENABLE_MVCC
|
||||
if (g_pserver->fActiveReplica) {
|
||||
snprintf(szT, 32, "%" PRIu64, val->mvcc_tstamp);
|
||||
if (rdbSaveAuxFieldStrStr(rdb,"mvcc-tstamp", szT) == -1) return -1;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Save type, key, value */
|
||||
if (rdbSaveObjectType(rdb,val) == -1) return -1;
|
||||
@ -2131,7 +2139,9 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, uint64_t mvcc_tstamp) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_MVCC
|
||||
o->mvcc_tstamp = mvcc_tstamp;
|
||||
#endif
|
||||
serverAssert(!o->FExpires());
|
||||
return o;
|
||||
}
|
||||
@ -2489,7 +2499,11 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) {
|
||||
key = nullptr;
|
||||
goto eoferr;
|
||||
}
|
||||
#ifdef ENABLE_MVCC
|
||||
bool fStaleMvccKey = (rsi) ? val->mvcc_tstamp < rsi->mvccMinThreshold : false;
|
||||
#else
|
||||
bool fStaleMvccKey = false;
|
||||
#endif
|
||||
|
||||
/* Check if the key already expired. This function is used when loading
|
||||
* an RDB file from disk, either at startup, or when an RDB was
|
||||
|
@ -1387,14 +1387,14 @@ dictType dbDictType = {
|
||||
dictObjectDestructor /* val destructor */
|
||||
};
|
||||
|
||||
/* db->pdict, keys are sds strings, vals uints. */
|
||||
dictType dbDictTypeTombstone = {
|
||||
/* Tombstone dict: keys are sds strings, vals are the keys' hashes stored directly in the value pointer (never freed). */
|
||||
dictType dbTombstoneDictType = {
|
||||
dictSdsHash, /* hash function */
|
||||
NULL, /* key dup */
|
||||
NULL, /* val dup */
|
||||
dictSdsKeyCompare, /* key compare */
|
||||
dictDbKeyDestructor, /* key destructor */
|
||||
NULL /* val destructor */
|
||||
dictDbKeyDestructor, /* key destructor */
|
||||
NULL /* val destructor */
|
||||
};
|
||||
|
||||
dictType dbSnapshotDictType = {
|
||||
@ -1539,8 +1539,9 @@ void tryResizeHashTables(int dbid) {
|
||||
* is returned. */
|
||||
int redisDbPersistentData::incrementallyRehash() {
|
||||
/* Keys dictionary */
|
||||
if (dictIsRehashing(m_pdict)) {
|
||||
if (dictIsRehashing(m_pdict) || dictIsRehashing(m_pdictTombstone)) {
|
||||
dictRehashMilliseconds(m_pdict,1);
|
||||
dictRehashMilliseconds(m_pdictTombstone,1);
|
||||
return 1; /* already used our millisecond for this loop... */
|
||||
}
|
||||
return 0;
|
||||
@ -2219,11 +2220,22 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
|
||||
CONFIG_BGSAVE_RETRY_DELAY ||
|
||||
g_pserver->lastbgsave_status == C_OK))
|
||||
{
|
||||
serverLog(LL_NOTICE,"%d changes in %d seconds. Saving...",
|
||||
sp->changes, (int)sp->seconds);
|
||||
rdbSaveInfo rsi, *rsiptr;
|
||||
rsiptr = rdbPopulateSaveInfo(&rsi);
|
||||
rdbSaveBackground(rsiptr);
|
||||
// Ensure rehashing is complete
|
||||
bool fRehashInProgress = false;
|
||||
if (g_pserver->activerehashing) {
|
||||
for (int idb = 0; idb < cserver.dbnum && !fRehashInProgress; ++idb) {
|
||||
if (g_pserver->db[idb]->FRehashing())
|
||||
fRehashInProgress = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!fRehashInProgress) {
|
||||
serverLog(LL_NOTICE,"%d changes in %d seconds. Saving...",
|
||||
sp->changes, (int)sp->seconds);
|
||||
rdbSaveInfo rsi, *rsiptr;
|
||||
rsiptr = rdbPopulateSaveInfo(&rsi);
|
||||
rdbSaveBackground(rsiptr);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -2312,14 +2324,16 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
|
||||
}
|
||||
}
|
||||
|
||||
bool fAnySnapshots = false;
|
||||
for (int idb = 0; idb < cserver.dbnum && !fAnySnapshots; ++idb)
|
||||
fAnySnapshots = fAnySnapshots || g_pserver->db[0]->FSnapshot();
|
||||
if (fAnySnapshots)
|
||||
{
|
||||
g_pserver->asyncworkqueue->AddWorkFunction([]{
|
||||
g_pserver->db[0]->consolidate_snapshot();
|
||||
}, true /*HiPri*/);
|
||||
run_with_period(100) {
|
||||
bool fAnySnapshots = false;
|
||||
for (int idb = 0; idb < cserver.dbnum && !fAnySnapshots; ++idb)
|
||||
fAnySnapshots = fAnySnapshots || g_pserver->db[0]->FSnapshot();
|
||||
if (fAnySnapshots)
|
||||
{
|
||||
g_pserver->asyncworkqueue->AddWorkFunction([]{
|
||||
g_pserver->db[0]->consolidate_snapshot();
|
||||
}, true /*HiPri*/);
|
||||
}
|
||||
}
|
||||
|
||||
/* Fire the cron loop modules event. */
|
||||
@ -2477,17 +2491,17 @@ void beforeSleep(struct aeEventLoop *eventLoop) {
|
||||
latencyAddSampleIfNeeded("storage-commit", commit_latency);
|
||||
|
||||
handleClientsWithPendingWrites(iel, aof_state);
|
||||
if (serverTL->gcEpoch != 0)
|
||||
if (!serverTL->gcEpoch.isReset())
|
||||
g_pserver->garbageCollector.endEpoch(serverTL->gcEpoch, true /*fNoFree*/);
|
||||
serverTL->gcEpoch = 0;
|
||||
serverTL->gcEpoch.reset();
|
||||
aeAcquireLock();
|
||||
|
||||
/* Close clients that need to be closed asynchronous */
|
||||
freeClientsInAsyncFreeQueue(iel);
|
||||
|
||||
if (serverTL->gcEpoch != 0)
|
||||
if (!serverTL->gcEpoch.isReset())
|
||||
g_pserver->garbageCollector.endEpoch(serverTL->gcEpoch, true /*fNoFree*/);
|
||||
serverTL->gcEpoch = 0;
|
||||
serverTL->gcEpoch.reset();
|
||||
|
||||
/* Before we are going to sleep, let the threads access the dataset by
|
||||
* releasing the GIL. Redis main thread will not touch anything at this
|
||||
@ -2503,7 +2517,7 @@ void afterSleep(struct aeEventLoop *eventLoop) {
|
||||
UNUSED(eventLoop);
|
||||
if (moduleCount()) moduleAcquireGIL(TRUE /*fServerThread*/);
|
||||
|
||||
serverAssert(serverTL->gcEpoch == 0);
|
||||
serverAssert(serverTL->gcEpoch.isReset());
|
||||
serverTL->gcEpoch = g_pserver->garbageCollector.startEpoch();
|
||||
aeAcquireLock();
|
||||
for (int idb = 0; idb < cserver.dbnum; ++idb)
|
||||
@ -5159,12 +5173,20 @@ sds genRedisInfoString(const char *section) {
|
||||
}
|
||||
|
||||
if (allsections || defsections || !strcasecmp(section,"keydb")) {
|
||||
// Compute the MVCC depth
|
||||
int mvcc_depth = 0;
|
||||
for (int idb = 0; idb < cserver.dbnum; ++idb) {
|
||||
mvcc_depth = std::max(mvcc_depth, g_pserver->db[idb]->snapshot_depth());
|
||||
}
|
||||
|
||||
if (sections++) info = sdscat(info,"\r\n");
|
||||
info = sdscatprintf(info,
|
||||
"# KeyDB\r\n"
|
||||
"variant:pro\r\n"
|
||||
"license_status:%s\r\n",
|
||||
cserver.license_key ? "OK" : "Trial"
|
||||
"license_status:%s\r\n"
|
||||
"mvcc_depth:%d\r\n",
|
||||
cserver.license_key ? "OK" : "Trial",
|
||||
mvcc_depth
|
||||
);
|
||||
}
|
||||
|
||||
|
77
src/server.h
77
src/server.h
@ -877,7 +877,9 @@ typedef struct redisObject {
|
||||
private:
|
||||
mutable std::atomic<unsigned> refcount {0};
|
||||
public:
|
||||
#ifdef ENABLE_MVCC
|
||||
uint64_t mvcc_tstamp;
|
||||
#endif
|
||||
void *m_ptr;
|
||||
|
||||
inline bool FExpires() const { return refcount.load(std::memory_order_relaxed) >> 31; }
|
||||
@ -888,7 +890,11 @@ public:
|
||||
void addref() const { refcount.fetch_add(1, std::memory_order_relaxed); }
|
||||
unsigned release() const { return refcount.fetch_sub(1, std::memory_order_seq_cst) & ~(1U << 31); }
|
||||
} robj;
|
||||
#ifdef ENABLE_MVCC
|
||||
static_assert(sizeof(redisObject) == 24, "object size is critical, don't increase");
|
||||
#else
|
||||
static_assert(sizeof(redisObject) == 16, "object size is critical, don't increase");
|
||||
#endif
|
||||
|
||||
__attribute__((always_inline)) inline const void *ptrFromObj(robj_roptr &o)
|
||||
{
|
||||
@ -1320,6 +1326,9 @@ public:
|
||||
void setExpire(robj *key, robj *subkey, long long when);
|
||||
void setExpire(expireEntry &&e);
|
||||
void initialize();
|
||||
void prepOverwriteForSnapshot(char *key);
|
||||
|
||||
bool FRehashing() const { return dictIsRehashing(m_pdict) || dictIsRehashing(m_pdictTombstone); }
|
||||
|
||||
void setStorageProvider(StorageCache *pstorage);
|
||||
|
||||
@ -1411,14 +1420,16 @@ private:
|
||||
class redisDbPersistentDataSnapshot : protected redisDbPersistentData
|
||||
{
|
||||
friend class redisDbPersistentData;
|
||||
private:
|
||||
bool iterate_threadsafe_core(std::function<bool(const char*, robj_roptr o)> &fn, bool fKeyOnly, bool fCacheOnly, bool fTop) const;
|
||||
|
||||
protected:
|
||||
bool m_fConsolidated = false;
|
||||
static void gcDisposeSnapshot(redisDbPersistentDataSnapshot *psnapshot);
|
||||
int snapshot_depth() const;
|
||||
void consolidate_children(redisDbPersistentData *pdbPrimary, bool fForce);
|
||||
void freeTombstoneObjects(int depth);
|
||||
bool freeTombstoneObjects(int depth);
|
||||
|
||||
public:
|
||||
int snapshot_depth() const;
|
||||
bool FWillFreeChildDebug() const { return m_spdbSnapshotHOLDER != nullptr; }
|
||||
|
||||
bool iterate_threadsafe(std::function<bool(const char*, robj_roptr o)> fn, bool fKeyOnly = false, bool fCacheOnly = false) const;
|
||||
@ -1521,6 +1532,8 @@ struct redisDb : public redisDbPersistentDataSnapshot
|
||||
using redisDbPersistentData::dictUnsafeKeyOnly;
|
||||
using redisDbPersistentData::resortExpire;
|
||||
using redisDbPersistentData::prefetchKeysAsync;
|
||||
using redisDbPersistentData::prepOverwriteForSnapshot;
|
||||
using redisDbPersistentData::FRehashing;
|
||||
|
||||
public:
|
||||
expireset::setiter expireitr;
|
||||
@ -1949,6 +1962,58 @@ struct clusterState;
|
||||
#define MAX_EVENT_LOOPS 16
|
||||
#define IDX_EVENT_LOOP_MAIN 0
|
||||
|
||||
class GarbageCollectorCollection
|
||||
{
|
||||
GarbageCollector<redisDbPersistentDataSnapshot> garbageCollectorSnapshot;
|
||||
GarbageCollector<ICollectable> garbageCollectorGeneric;
|
||||
|
||||
public:
|
||||
struct Epoch
|
||||
{
|
||||
uint64_t epochSnapshot = 0;
|
||||
uint64_t epochGeneric = 0;
|
||||
|
||||
void reset() {
|
||||
epochSnapshot = 0;
|
||||
epochGeneric = 0;
|
||||
}
|
||||
|
||||
bool isReset() const {
|
||||
return epochSnapshot == 0 && epochGeneric == 0;
|
||||
}
|
||||
};
|
||||
|
||||
Epoch startEpoch()
|
||||
{
|
||||
Epoch e;
|
||||
e.epochSnapshot = garbageCollectorSnapshot.startEpoch();
|
||||
e.epochGeneric = garbageCollectorGeneric.startEpoch();
|
||||
return e;
|
||||
}
|
||||
|
||||
void endEpoch(Epoch e, bool fNoFree = false)
|
||||
{
|
||||
garbageCollectorSnapshot.endEpoch(e.epochSnapshot, fNoFree);
|
||||
garbageCollectorGeneric.endEpoch(e.epochGeneric, fNoFree);
|
||||
}
|
||||
|
||||
void shutdown()
|
||||
{
|
||||
garbageCollectorSnapshot.shutdown();
|
||||
garbageCollectorGeneric.shutdown();
|
||||
}
|
||||
|
||||
void enqueue(Epoch e, std::unique_ptr<redisDbPersistentDataSnapshot> &&sp)
|
||||
{
|
||||
garbageCollectorSnapshot.enqueue(e.epochSnapshot, std::move(sp));
|
||||
}
|
||||
|
||||
void enqueue(Epoch e, std::unique_ptr<ICollectable> &&sp)
|
||||
{
|
||||
garbageCollectorGeneric.enqueue(e.epochGeneric, std::move(sp));
|
||||
}
|
||||
};
|
||||
|
||||
// Per-thread variables that may be accessed without a lock
|
||||
struct redisServerThreadVars {
|
||||
aeEventLoop *el;
|
||||
@ -1970,7 +2035,7 @@ struct redisServerThreadVars {
|
||||
struct fastlock lockPendingWrite { "thread pending write" };
|
||||
char neterr[ANET_ERR_LEN]; /* Error buffer for anet.c */
|
||||
long unsigned commandsExecuted = 0;
|
||||
uint64_t gcEpoch = 0;
|
||||
GarbageCollectorCollection::Epoch gcEpoch;
|
||||
const redisDbPersistentDataSnapshot **rgdbSnapshot = nullptr;
|
||||
bool fRetrySetAofEvent = false;
|
||||
|
||||
@ -2424,7 +2489,7 @@ struct redisServer {
|
||||
/* System hardware info */
|
||||
size_t system_memory_size; /* Total memory in system as reported by OS */
|
||||
|
||||
GarbageCollector<redisDbPersistentDataSnapshot> garbageCollector;
|
||||
GarbageCollectorCollection garbageCollector;
|
||||
|
||||
IStorageFactory *m_pstorageFactory = nullptr;
|
||||
int storage_flush_period; // The time between flushes in the CRON job
|
||||
@ -2553,7 +2618,7 @@ extern dictType zsetDictType;
|
||||
extern dictType clusterNodesDictType;
|
||||
extern dictType clusterNodesBlackListDictType;
|
||||
extern dictType dbDictType;
|
||||
extern dictType dbDictTypeTombstone;
|
||||
extern dictType dbTombstoneDictType;
|
||||
extern dictType dbSnapshotDictType;
|
||||
extern dictType shaScriptObjectDictType;
|
||||
extern double R_Zero, R_PosInf, R_NegInf, R_Nan;
|
||||
|
180
src/snapshot.cpp
180
src/snapshot.cpp
@ -2,6 +2,29 @@
|
||||
#include "aelocker.h"
|
||||
|
||||
static const size_t c_elementsSmallLimit = 500000;
|
||||
static fastlock s_lock {"consolidate_children"}; // this lock ensures only one thread is consolidating at a time
|
||||
|
||||
class LazyFree : public ICollectable
|
||||
{
|
||||
public:
|
||||
virtual ~LazyFree()
|
||||
{
|
||||
for (auto *de : vecde)
|
||||
{
|
||||
dbDictType.keyDestructor(nullptr, dictGetKey(de));
|
||||
dbDictType.valDestructor(nullptr, dictGetVal(de));
|
||||
zfree(de);
|
||||
}
|
||||
for (robj *o : vecobjLazyFree)
|
||||
decrRefCount(o);
|
||||
for (dict *d : vecdictLazyFree)
|
||||
dictRelease(d);
|
||||
}
|
||||
|
||||
std::vector<dict*> vecdictLazyFree;
|
||||
std::vector<robj*> vecobjLazyFree;
|
||||
std::vector<dictEntry*> vecde;
|
||||
};
|
||||
|
||||
const redisDbPersistentDataSnapshot *redisDbPersistentData::createSnapshot(uint64_t mvccCheckpoint, bool fOptional)
|
||||
{
|
||||
@ -70,6 +93,7 @@ const redisDbPersistentDataSnapshot *redisDbPersistentData::createSnapshot(uint6
|
||||
|
||||
auto spdb = std::unique_ptr<redisDbPersistentDataSnapshot>(new (MALLOC_LOCAL) redisDbPersistentDataSnapshot());
|
||||
|
||||
dictRehashMilliseconds(m_pdict, 50); // Give us the best chance at a fast cleanup
|
||||
spdb->m_fAllChanged = false;
|
||||
spdb->m_fTrackingChanges = 0;
|
||||
spdb->m_pdict = m_pdict;
|
||||
@ -90,8 +114,13 @@ const redisDbPersistentDataSnapshot *redisDbPersistentData::createSnapshot(uint6
|
||||
spdb->m_setexpire->pause_rehash(); // needs to be const
|
||||
}
|
||||
|
||||
if (dictIsRehashing(spdb->m_pdict) || dictIsRehashing(spdb->m_pdictTombstone)) {
|
||||
serverLog(LL_NOTICE, "NOTICE: Suboptimal snapshot");
|
||||
}
|
||||
|
||||
m_pdict = dictCreate(&dbDictType,this);
|
||||
m_pdictTombstone = dictCreate(&dbDictTypeTombstone, this);
|
||||
dictExpand(m_pdict, 1024); // minimize rehash overhead
|
||||
m_pdictTombstone = dictCreate(&dbTombstoneDictType, this);
|
||||
|
||||
serverAssert(spdb->m_pdict->iterators == 1);
|
||||
|
||||
@ -183,7 +212,18 @@ void redisDbPersistentData::restoreSnapshot(const redisDbPersistentDataSnapshot
|
||||
void redisDbPersistentData::endSnapshotAsync(const redisDbPersistentDataSnapshot *psnapshot)
|
||||
{
|
||||
mstime_t latency;
|
||||
aeAcquireLock(); latencyStartMonitor(latency);
|
||||
|
||||
aeAcquireLock();
|
||||
while (dictIsRehashing(m_pdict) || dictIsRehashing(m_pdictTombstone)) {
|
||||
dictRehashMilliseconds(m_pdict, 1);
|
||||
dictRehashMilliseconds(m_pdictTombstone, 1);
|
||||
// Give someone else a chance
|
||||
aeReleaseLock();
|
||||
usleep(300);
|
||||
aeAcquireLock();
|
||||
}
|
||||
|
||||
latencyStartMonitor(latency);
|
||||
if (m_pdbSnapshotASYNC && m_pdbSnapshotASYNC->m_mvccCheckpoint <= psnapshot->m_mvccCheckpoint)
|
||||
{
|
||||
// Free a stale async snapshot so consolidate_children can clean it up later
|
||||
@ -209,11 +249,22 @@ void redisDbPersistentData::endSnapshotAsync(const redisDbPersistentDataSnapshot
|
||||
auto psnapshotT = createSnapshot(LLONG_MAX, false);
|
||||
endSnapshot(psnapshot); // this will just dec the ref count since our new snapshot has a ref
|
||||
psnapshot = nullptr;
|
||||
aeReleaseLock(); latencyEndMonitor(latency);
|
||||
|
||||
latencyEndMonitor(latency);
|
||||
latencyAddSampleIfNeeded("end-snapshot-async-phase-1", latency);
|
||||
aeReleaseLock();
|
||||
|
||||
// do the expensive work of merging snapshots outside the ref
|
||||
const_cast<redisDbPersistentDataSnapshot*>(psnapshotT)->freeTombstoneObjects(1); // depth is one because we just creted it
|
||||
if (const_cast<redisDbPersistentDataSnapshot*>(psnapshotT)->freeTombstoneObjects(1)) // depth is one because we just creted it
|
||||
{
|
||||
aeAcquireLock();
|
||||
if (m_pdbSnapshotASYNC != nullptr)
|
||||
endSnapshot(m_pdbSnapshotASYNC);
|
||||
m_pdbSnapshotASYNC = nullptr;
|
||||
endSnapshot(psnapshotT);
|
||||
aeReleaseLock();
|
||||
return;
|
||||
}
|
||||
const_cast<redisDbPersistentDataSnapshot*>(psnapshotT)->consolidate_children(this, true);
|
||||
|
||||
// Final Cleanup
|
||||
@ -222,33 +273,80 @@ void redisDbPersistentData::endSnapshotAsync(const redisDbPersistentDataSnapshot
|
||||
m_pdbSnapshotASYNC = psnapshotT;
|
||||
else
|
||||
endSnapshot(psnapshotT); // finally clean up our temp snapshot
|
||||
aeReleaseLock(); latencyEndMonitor(latency);
|
||||
|
||||
|
||||
latencyEndMonitor(latency);
|
||||
latencyAddSampleIfNeeded("end-snapshot-async-phase-2", latency);
|
||||
aeReleaseLock();
|
||||
}
|
||||
|
||||
void redisDbPersistentDataSnapshot::freeTombstoneObjects(int depth)
|
||||
bool redisDbPersistentDataSnapshot::freeTombstoneObjects(int depth)
|
||||
{
|
||||
if (m_pdbSnapshot == nullptr)
|
||||
return;
|
||||
{
|
||||
serverAssert(dictSize(m_pdictTombstone) == 0);
|
||||
return true;
|
||||
}
|
||||
|
||||
const_cast<redisDbPersistentDataSnapshot*>(m_pdbSnapshot)->freeTombstoneObjects(depth+1);
|
||||
if (!const_cast<redisDbPersistentDataSnapshot*>(m_pdbSnapshot)->freeTombstoneObjects(depth+1))
|
||||
return false;
|
||||
|
||||
{
|
||||
AeLocker ae;
|
||||
ae.arm(nullptr);
|
||||
if (m_pdbSnapshot->m_refCount != depth && (m_pdbSnapshot->m_refCount != (m_refCount+1)))
|
||||
return;
|
||||
return false;
|
||||
ae.disarm();
|
||||
}
|
||||
|
||||
std::unique_lock<fastlock> lock(s_lock, std::defer_lock);
|
||||
if (!lock.try_lock())
|
||||
return false; // this is a best effort function
|
||||
|
||||
std::unique_ptr<LazyFree> splazy = std::make_unique<LazyFree>();
|
||||
|
||||
dict *dictTombstoneNew = dictCreate(&dbTombstoneDictType, nullptr);
|
||||
dictIterator *di = dictGetIterator(m_pdictTombstone);
|
||||
dictEntry *de;
|
||||
std::vector<dictEntry*> vecdeFree;
|
||||
vecdeFree.reserve(dictSize(m_pdictTombstone));
|
||||
unsigned rgcremoved[2] = {0};
|
||||
while ((de = dictNext(di)) != nullptr)
|
||||
{
|
||||
dictEntry *deObj = dictFind(m_pdbSnapshot->m_pdict, dictGetKey(de));
|
||||
if (deObj != nullptr && dictGetVal(deObj) != nullptr)
|
||||
dictEntry **dePrev = nullptr;
|
||||
dictht *ht = nullptr;
|
||||
sds key = (sds)dictGetKey(de);
|
||||
// BUG BUG: Why can't we do a shallow search here?
|
||||
dictEntry *deObj = dictFindWithPrev(m_pdbSnapshot->m_pdict, key, (uint64_t)dictGetVal(de), &dePrev, &ht, false);
|
||||
|
||||
if (deObj != nullptr)
|
||||
{
|
||||
decrRefCount((robj*)dictGetVal(deObj));
|
||||
void *ptrSet = nullptr;
|
||||
__atomic_store(&deObj->v.val, &ptrSet, __ATOMIC_RELAXED);
|
||||
// Now unlink the DE
|
||||
__atomic_store(dePrev, &deObj->next, __ATOMIC_RELEASE);
|
||||
if (ht == &m_pdbSnapshot->m_pdict->ht[0])
|
||||
rgcremoved[0]++;
|
||||
else
|
||||
rgcremoved[1]++;
|
||||
splazy->vecde.push_back(deObj);
|
||||
} else {
|
||||
serverAssert(dictFind(m_pdbSnapshot->m_pdict, key) == nullptr);
|
||||
serverAssert(m_pdbSnapshot->find_cached_threadsafe(key) != nullptr);
|
||||
dictAdd(dictTombstoneNew, sdsdupshared((sds)dictGetKey(de)), dictGetVal(de));
|
||||
}
|
||||
}
|
||||
dictReleaseIterator(di);
|
||||
|
||||
dictForceRehash(dictTombstoneNew);
|
||||
aeAcquireLock();
|
||||
dict *dT = m_pdbSnapshot->m_pdict;
|
||||
splazy->vecdictLazyFree.push_back(m_pdictTombstone);
|
||||
__atomic_store(&m_pdictTombstone, &dictTombstoneNew, __ATOMIC_RELEASE);
|
||||
__atomic_fetch_sub(&dT->ht[0].used, rgcremoved[0], __ATOMIC_RELEASE);
|
||||
__atomic_fetch_sub(&dT->ht[1].used, rgcremoved[1], __ATOMIC_RELEASE);
|
||||
serverLog(LL_WARNING, "tombstones removed: %u, remain: %lu", rgcremoved[0]+rgcremoved[1], dictSize(m_pdictTombstone));
|
||||
g_pserver->garbageCollector.enqueue(serverTL->gcEpoch, std::move(splazy));
|
||||
aeReleaseLock();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void redisDbPersistentData::endSnapshot(const redisDbPersistentDataSnapshot *psnapshot)
|
||||
@ -299,15 +397,20 @@ void redisDbPersistentData::endSnapshot(const redisDbPersistentDataSnapshot *psn
|
||||
// Stage 1 Loop through all the tracked deletes and remove them from the snapshot DB
|
||||
dictIterator *di = dictGetIterator(m_pdictTombstone);
|
||||
dictEntry *de;
|
||||
m_spdbSnapshotHOLDER->m_pdict->iterators++;
|
||||
auto splazy = std::make_unique<LazyFree>();
|
||||
while ((de = dictNext(di)) != NULL)
|
||||
{
|
||||
dictEntry **dePrev;
|
||||
dictht *ht;
|
||||
dictEntry *deSnapshot = dictFindWithPrev(m_spdbSnapshotHOLDER->m_pdict, dictGetKey(de), &dePrev, &ht);
|
||||
// BUG BUG Why not a shallow search?
|
||||
dictEntry *deSnapshot = dictFindWithPrev(m_spdbSnapshotHOLDER->m_pdict, dictGetKey(de), (uint64_t)dictGetVal(de), &dePrev, &ht, false /*!!sdsisshared((sds)dictGetKey(de))*/);
|
||||
if (deSnapshot == nullptr && m_spdbSnapshotHOLDER->m_pdbSnapshot)
|
||||
{
|
||||
// The tombstone is for a grandchild, propagate it (or possibly in the storage provider - but an extra tombstone won't hurt)
|
||||
#ifdef CHECKED_BUILD
|
||||
serverAssert(m_spdbSnapshotHOLDER->m_pdbSnapshot->find_cached_threadsafe((const char*)dictGetKey(de)) != nullptr);
|
||||
#endif
|
||||
dictAdd(m_spdbSnapshotHOLDER->m_pdictTombstone, sdsdupshared((sds)dictGetKey(de)), nullptr);
|
||||
continue;
|
||||
}
|
||||
@ -318,15 +421,16 @@ void redisDbPersistentData::endSnapshot(const redisDbPersistentDataSnapshot *psn
|
||||
}
|
||||
|
||||
// Delete the object from the source dict, we don't use dictDelete to avoid a second search
|
||||
dictFreeKey(m_spdbSnapshotHOLDER->m_pdict, deSnapshot);
|
||||
dictFreeVal(m_spdbSnapshotHOLDER->m_pdict, deSnapshot);
|
||||
serverAssert(*dePrev == deSnapshot);
|
||||
splazy->vecde.push_back(deSnapshot);
|
||||
*dePrev = deSnapshot->next;
|
||||
zfree(deSnapshot);
|
||||
ht->used--;
|
||||
}
|
||||
|
||||
|
||||
m_spdbSnapshotHOLDER->m_pdict->iterators--;
|
||||
dictReleaseIterator(di);
|
||||
dictEmpty(m_pdictTombstone, nullptr);
|
||||
splazy->vecdictLazyFree.push_back(m_pdictTombstone);
|
||||
m_pdictTombstone = dictCreate(&dbTombstoneDictType, nullptr);
|
||||
|
||||
// Stage 2 Move all new keys to the snapshot DB
|
||||
dictMerge(m_spdbSnapshotHOLDER->m_pdict, m_pdict);
|
||||
@ -355,8 +459,10 @@ void redisDbPersistentData::endSnapshot(const redisDbPersistentDataSnapshot *psn
|
||||
|
||||
auto spsnapshotFree = std::move(m_spdbSnapshotHOLDER);
|
||||
m_spdbSnapshotHOLDER = std::move(spsnapshotFree->m_spdbSnapshotHOLDER);
|
||||
if (serverTL != nullptr)
|
||||
if (serverTL != nullptr) {
|
||||
g_pserver->garbageCollector.enqueue(serverTL->gcEpoch, std::move(spsnapshotFree));
|
||||
g_pserver->garbageCollector.enqueue(serverTL->gcEpoch, std::move(splazy));
|
||||
}
|
||||
|
||||
// Sanity Checks
|
||||
serverAssert(m_spdbSnapshotHOLDER != nullptr || m_pdbSnapshot == nullptr);
|
||||
@ -393,8 +499,10 @@ dict_iter redisDbPersistentDataSnapshot::random_cache_threadsafe(bool fPrimaryOn
|
||||
|
||||
dict_iter redisDbPersistentData::find_cached_threadsafe(const char *key) const
|
||||
{
|
||||
dict *dictTombstone;
|
||||
__atomic_load(&m_pdictTombstone, &dictTombstone, __ATOMIC_ACQUIRE);
|
||||
dictEntry *de = dictFind(m_pdict, key);
|
||||
if (de == nullptr && m_pdbSnapshot != nullptr && dictFind(m_pdictTombstone, key) == nullptr)
|
||||
if (de == nullptr && m_pdbSnapshot != nullptr && dictFind(dictTombstone, key) == nullptr)
|
||||
{
|
||||
auto itr = m_pdbSnapshot->find_cached_threadsafe(key);
|
||||
if (itr != nullptr)
|
||||
@ -460,11 +568,20 @@ unsigned long redisDbPersistentDataSnapshot::scan_threadsafe(unsigned long itera
|
||||
}
|
||||
|
||||
// Entry point for snapshot iteration: forwards fn and both flags unchanged to
// iterate_threadsafe_core(), passing true for its final (fFirst) argument, and
// returns whatever the core routine returns.
bool redisDbPersistentDataSnapshot::iterate_threadsafe(std::function<bool(const char*, robj_roptr o)> fn, bool fKeyOnly, bool fCacheOnly) const
{
    const bool fTopLevelCall = true;
    const bool fCompleted = iterate_threadsafe_core(fn, fKeyOnly, fCacheOnly, fTopLevelCall);
    return fCompleted;
}
|
||||
|
||||
bool redisDbPersistentDataSnapshot::iterate_threadsafe_core(std::function<bool(const char*, robj_roptr o)> &fn, bool fKeyOnly, bool fCacheOnly, bool fFirst) const
|
||||
{
|
||||
// Take the size so we can ensure we visited every element exactly once
|
||||
// use volatile to ensure it's not checked too late. This makes it more
|
||||
// likely we'll detect races (but it won't guarantee it)
|
||||
aeAcquireLock();
|
||||
dict *dictTombstone;
|
||||
__atomic_load(&m_pdictTombstone, &dictTombstone, __ATOMIC_ACQUIRE);
|
||||
volatile ssize_t celem = (ssize_t)size();
|
||||
aeReleaseLock();
|
||||
|
||||
dictEntry *de = nullptr;
|
||||
bool fResult = true;
|
||||
@ -510,19 +627,22 @@ bool redisDbPersistentDataSnapshot::iterate_threadsafe(std::function<bool(const
|
||||
__atomic_load(&m_pdbSnapshot, &psnapshot, __ATOMIC_ACQUIRE);
|
||||
if (fResult && psnapshot != nullptr)
|
||||
{
|
||||
fResult = psnapshot->iterate_threadsafe([this, &fn, &celem](const char *key, robj_roptr o) {
|
||||
dictEntry *deTombstone = dictFind(m_pdictTombstone, key);
|
||||
std::function<bool(const char*, robj_roptr o)> fnNew = [this, &fn, &celem, dictTombstone](const char *key, robj_roptr o) {
|
||||
dictEntry *deTombstone = dictFind(dictTombstone, key);
|
||||
if (deTombstone != nullptr)
|
||||
return true;
|
||||
|
||||
// Alright it's a key in the use keyspace, let's ensure it and then pass it off
|
||||
--celem;
|
||||
return fn(key, o);
|
||||
}, fKeyOnly, fCacheOnly);
|
||||
};
|
||||
fResult = psnapshot->iterate_threadsafe_core(fnNew, fKeyOnly, fCacheOnly, false);
|
||||
}
|
||||
|
||||
// we should have hit all keys or had a good reason not to
|
||||
serverAssert(!fResult || celem == 0 || (m_spstorage && fCacheOnly));
|
||||
if (!(!fResult || celem == 0 || (m_spstorage && fCacheOnly)))
|
||||
serverLog(LL_WARNING, "celem: %ld", celem);
|
||||
serverAssert(!fResult || celem == 0 || (m_spstorage && fCacheOnly) || !fFirst);
|
||||
return fResult;
|
||||
}
|
||||
|
||||
@ -538,11 +658,12 @@ void redisDbPersistentData::consolidate_snapshot()
|
||||
{
|
||||
aeAcquireLock();
|
||||
auto psnapshot = (m_pdbSnapshot != nullptr) ? m_spdbSnapshotHOLDER.get() : nullptr;
|
||||
if (psnapshot == nullptr)
|
||||
if (psnapshot == nullptr || psnapshot->snapshot_depth() == 0)
|
||||
{
|
||||
aeReleaseLock();
|
||||
return;
|
||||
}
|
||||
|
||||
psnapshot->m_refCount++; // ensure it's not free'd
|
||||
aeReleaseLock();
|
||||
psnapshot->consolidate_children(this, false /* fForce */);
|
||||
@ -554,8 +675,6 @@ void redisDbPersistentData::consolidate_snapshot()
|
||||
// only call this on the "real" database to consolidate the first child
|
||||
void redisDbPersistentDataSnapshot::consolidate_children(redisDbPersistentData *pdbPrimary, bool fForce)
|
||||
{
|
||||
static fastlock s_lock {"consolidate_children"}; // this lock ensures only one thread is consolidating at a time
|
||||
|
||||
std::unique_lock<fastlock> lock(s_lock, std::defer_lock);
|
||||
if (!lock.try_lock())
|
||||
return; // this is a best effort function
|
||||
@ -615,7 +734,6 @@ void redisDbPersistentDataSnapshot::consolidate_children(redisDbPersistentData *
|
||||
|
||||
serverLog(LL_VERBOSE, "cleaned %d snapshots", snapshot_depth()-1);
|
||||
spdb->m_refCount = depth;
|
||||
spdb->m_fConsolidated = true;
|
||||
// Drop our refs from this snapshot and its children
|
||||
psnapshotT = this;
|
||||
std::vector<redisDbPersistentDataSnapshot*> vecT;
|
||||
|
Loading…
x
Reference in New Issue
Block a user