From c6fc1bcfe3923ba8c84c07cfbf051eac98d49546 Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 23 Mar 2021 03:44:20 +0000 Subject: [PATCH 01/99] Perform GET command inline Former-commit-id: 5623936d99e334ab103f3dc1541b145c125d0ee8 --- src/db.cpp | 74 +++++++++++++++++++++++++++------------------- src/networking.cpp | 8 +++-- src/server.cpp | 6 ++++ src/server.h | 2 +- 4 files changed, 55 insertions(+), 35 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index 20b70fe02..70fd2d1dc 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -2998,10 +2998,10 @@ int dbnumFromDb(redisDb *db) serverPanic("invalid database pointer"); } -void redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command) +bool redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command, bool fExecOK) { if (m_spstorage == nullptr) - return; + return false; AeLocker lock; @@ -3010,7 +3010,7 @@ void redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command getKeysResult result = GETKEYS_RESULT_INIT; auto cmd = lookupCommand(szFromObj(command.argv[0])); if (cmd == nullptr) - return; // Bad command? It's not for us to judge, just bail + return false; // Bad command? It's not for us to judge, just bail int numkeys = getKeysFromCommand(cmd, command.argv, command.argc, &result); for (int ikey = 0; ikey < numkeys; ++ikey) { @@ -3042,41 +3042,53 @@ void redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command } } - lock.arm(c); - for (auto &tuple : vecInserts) - { - sds sharedKey = std::get<0>(tuple); - robj *o = std::get<1>(tuple); - std::unique_ptr spexpire = std::move(std::get<2>(tuple)); - - if (o != nullptr) + if (!vecInserts.empty()) { + lock.arm(c); + for (auto &tuple : vecInserts) { - if (this->find_cached_threadsafe(sharedKey) != nullptr) + sds sharedKey = std::get<0>(tuple); + robj *o = std::get<1>(tuple); + std::unique_ptr spexpire = std::move(std::get<2>(tuple)); + + if (o != nullptr) { - // While unlocked this was already ensured - decrRefCount(o); - sdsfree(sharedKey); + if (this->find_cached_threadsafe(sharedKey) != nullptr) + { + // While unlocked this was already ensured + decrRefCount(o); + sdsfree(sharedKey); + } + else + { + dictAdd(m_pdict, sharedKey, o); + o->SetFExpires(spexpire != nullptr); + + if (spexpire != nullptr) + { + auto itr = m_setexpire->find(sharedKey); + if (itr != m_setexpire->end()) + m_setexpire->erase(itr); + m_setexpire->insert(std::move(*spexpire)); + serverAssert(m_setexpire->find(sharedKey) != m_setexpire->end()); + } + serverAssert(o->FExpires() == (m_setexpire->find(sharedKey) != m_setexpire->end())); + } } else { - dictAdd(m_pdict, sharedKey, o); - o->SetFExpires(spexpire != nullptr); - - if (spexpire != nullptr) - { - auto itr = m_setexpire->find(sharedKey); - if (itr != m_setexpire->end()) - m_setexpire->erase(itr); - m_setexpire->insert(std::move(*spexpire)); - serverAssert(m_setexpire->find(sharedKey) != m_setexpire->end()); - } - serverAssert(o->FExpires() == (m_setexpire->find(sharedKey) != m_setexpire->end())); + if (sharedKey != nullptr) + sdsfree(sharedKey); // BUG but don't bother crashing } } - else - { - if (sharedKey != nullptr) - sdsfree(sharedKey); // BUG but don't bother crashing + lock.disarm(); + } + + if (fExecOK && cmd->proc == getCommand && !vecInserts.empty()) { + robj *o = std::get<1>(vecInserts[0]); + if (o != nullptr) { + addReplyBulk(c, o); + return true; } } + return false; } \ No newline at end of file diff --git a/src/networking.cpp b/src/networking.cpp index 15aa6f43a..e9959c263 100644 --- 
a/src/networking.cpp +++ b/src/networking.cpp @@ -2343,7 +2343,7 @@ void parseClientCommandBuffer(client *c) { } } - size_t cqueries = c->vecqueuedcmd.size(); + size_t cqueriesStart = c->vecqueuedcmd.size(); if (c->reqtype == PROTO_REQ_INLINE) { if (processInlineBuffer(c) != C_OK) break; } else if (c->reqtype == PROTO_REQ_MULTIBULK) { @@ -2359,10 +2359,12 @@ void parseClientCommandBuffer(client *c) { } /* Prefetch if we have a storage provider and we're not in the global lock */ - if (cqueries < c->vecqueuedcmd.size() && g_pserver->m_pstorageFactory != nullptr && !GlobalLocksAcquired()) { + if (cqueriesStart < c->vecqueuedcmd.size() && g_pserver->m_pstorageFactory != nullptr && !GlobalLocksAcquired()) { auto &query = c->vecqueuedcmd.back(); if (query.argc > 0 && query.argc == query.argcMax) { - c->db->prefetchKeysAsync(c, query); + if (c->db->prefetchKeysAsync(c, query, c->vecqueuedcmd.size() == 1)) { + c->vecqueuedcmd.erase(c->vecqueuedcmd.begin()); + } } } c->reqtype = 0; diff --git a/src/server.cpp b/src/server.cpp index 96d62e4ea..814909bf6 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2168,6 +2168,9 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { aeAcquireLock(); } + if (g_pserver->maxmemory && g_pserver->m_pstorageFactory) + freeMemoryIfNeededAndSafe(false, false); + /* If another threads unblocked one of our clients, and this thread has been idle then beforeSleep won't have a chance to process the unblocking. So we also process them here in the cron job to ensure they don't starve. @@ -2455,6 +2458,9 @@ int serverCronLite(struct aeEventLoop *eventLoop, long long id, void *clientData aeAcquireLock(); } + if (g_pserver->maxmemory && g_pserver->m_pstorageFactory) + freeMemoryIfNeededAndSafe(false, false); + int iel = ielFromEventLoop(eventLoop); serverAssert(iel != IDX_EVENT_LOOP_MAIN); diff --git a/src/server.h b/src/server.h index 07f025c46..9ffccd6a0 100644 --- a/src/server.h +++ b/src/server.h @@ -1134,7 +1134,7 @@ public: bool removeCachedValue(const char *key); void removeAllCachedValues(); - void prefetchKeysAsync(client *c, struct parsed_command &command); + bool prefetchKeysAsync(client *c, struct parsed_command &command, bool fExecOK); bool FSnapshot() const { return m_spdbSnapshotHOLDER != nullptr; } From 18da2dd0919315cb3414d56e7882e9925458010d Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 24 Mar 2021 19:58:51 +0000 Subject: [PATCH 02/99] Fix bug where we skip valid dict elements in dictGetRandomKey Former-commit-id: 291a3610a679cb1d17caadf6ab067cad41885935 --- src/dict.cpp | 11 +++++++++-- src/dict.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/dict.cpp b/src/dict.cpp index c682e2ec9..831a32f8f 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -373,6 +373,7 @@ int dictRehash(dict *d, int n) { dictAsyncRehashCtl::dictAsyncRehashCtl(struct dict *d, dictAsyncRehashCtl *next) : dict(d), next(next) { queue.reserve(c_targetQueueSize); __atomic_fetch_add(&d->refcount, 1, __ATOMIC_RELEASE); + this->rehashIdxBase = d->rehashidx; } dictAsyncRehashCtl *dictRehashAsyncStart(dict *d, int buckets) { @@ -931,12 +932,18 @@ dictEntry *dictGetRandomKey(dict *d) if (dictSize(d) == 0) return NULL; if (dictIsRehashing(d)) _dictRehashStep(d); if (dictIsRehashing(d)) { + long rehashidx = d->rehashidx; + auto async = d->asyncdata; + while (async != nullptr) { + rehashidx = std::min((long)async->rehashIdxBase, rehashidx); + async = async->next; + } do { /* We are sure there are no elements in indexes from 0 * to rehashidx-1 
*/ - h = d->rehashidx + (random() % (d->ht[0].size + + h = rehashidx + (random() % (d->ht[0].size + d->ht[1].size - - d->rehashidx)); + rehashidx)); he = (h >= d->ht[0].size) ? d->ht[1].table[h - d->ht[0].size] : d->ht[0].table[h]; } while(he == NULL); diff --git a/src/dict.h b/src/dict.h index ab57a7d7f..f24108d32 100644 --- a/src/dict.h +++ b/src/dict.h @@ -100,6 +100,7 @@ struct dictAsyncRehashCtl { struct dict *dict = nullptr; std::vector queue; size_t hashIdx = 0; + long rehashIdxBase; dictAsyncRehashCtl *next = nullptr; std::atomic done { false }; std::atomic abondon { false }; From fa244c930e37a949a3d010b88207191760ceb87b Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 24 Mar 2021 20:12:43 +0000 Subject: [PATCH 03/99] Bump RocksDB version for better perf Former-commit-id: ab4ae61b9c54b3c28dc5fd775d0df3d377c4846a From 54fb01e24a6725c3d8832cdd5a440c015df89d5e Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 24 Mar 2021 20:13:42 +0000 Subject: [PATCH 04/99] Don't run code in evict unless we really have to Former-commit-id: b665b1c2b2df96883a6e2237f7bf3f9b1bec2a89 --- src/evict.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evict.cpp b/src/evict.cpp index 31cadeae5..887b100b9 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -587,7 +587,7 @@ int freeMemoryIfNeeded(bool fQuickCycle, bool fPreSnapshot) { /* volatile-random and allkeys-random policy */ if (g_pserver->maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM || g_pserver->maxmemory_policy == MAXMEMORY_VOLATILE_RANDOM - || fEvictToStorage) + || fFallback) { /* When evicting a random key, we try to evict a key for * each DB, so we use the static 'next_db' variable to From 111bdabbaeed3b56cad87aca296252d5dc7db3db Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 24 Mar 2021 19:58:51 +0000 Subject: [PATCH 05/99] Fix bug where we skip valid dict elements in dictGetRandomKey Former-commit-id: c25a9a3b84c967428b3598c99a65b14ed2417571 --- src/dict.cpp | 11 +++++++++-- src/dict.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/dict.cpp b/src/dict.cpp index c682e2ec9..831a32f8f 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -373,6 +373,7 @@ int dictRehash(dict *d, int n) { dictAsyncRehashCtl::dictAsyncRehashCtl(struct dict *d, dictAsyncRehashCtl *next) : dict(d), next(next) { queue.reserve(c_targetQueueSize); __atomic_fetch_add(&d->refcount, 1, __ATOMIC_RELEASE); + this->rehashIdxBase = d->rehashidx; } dictAsyncRehashCtl *dictRehashAsyncStart(dict *d, int buckets) { @@ -931,12 +932,18 @@ dictEntry *dictGetRandomKey(dict *d) if (dictSize(d) == 0) return NULL; if (dictIsRehashing(d)) _dictRehashStep(d); if (dictIsRehashing(d)) { + long rehashidx = d->rehashidx; + auto async = d->asyncdata; + while (async != nullptr) { + rehashidx = std::min((long)async->rehashIdxBase, rehashidx); + async = async->next; + } do { /* We are sure there are no elements in indexes from 0 * to rehashidx-1 */ - h = d->rehashidx + (random() % (d->ht[0].size + + h = rehashidx + (random() % (d->ht[0].size + d->ht[1].size - - d->rehashidx)); + rehashidx)); he = (h >= d->ht[0].size) ? 
d->ht[1].table[h - d->ht[0].size] : d->ht[0].table[h]; } while(he == NULL); diff --git a/src/dict.h b/src/dict.h index ab57a7d7f..f24108d32 100644 --- a/src/dict.h +++ b/src/dict.h @@ -100,6 +100,7 @@ struct dictAsyncRehashCtl { struct dict *dict = nullptr; std::vector queue; size_t hashIdx = 0; + long rehashIdxBase; dictAsyncRehashCtl *next = nullptr; std::atomic done { false }; std::atomic abondon { false }; From aa8800abafd05a5c4e30682d49cc60948add77a4 Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 28 Mar 2021 18:27:14 +0000 Subject: [PATCH 06/99] Enable LTO Former-commit-id: 3ec75184bae92c0e7af579eda8cbe6cfa2375327 --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index d76255fd4..927782e51 100644 --- a/src/Makefile +++ b/src/Makefile @@ -15,7 +15,7 @@ release_hdr := $(shell sh -c './mkreleasehdr.sh') uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not') -OPTIMIZATION?=-O2 +OPTIMIZATION?=-O2 -flto DEPENDENCY_TARGETS=hiredis linenoise lua rocksdb NODEPS:=clean distclean From 39f4615893cde5b3358cfbdefcd895b90741f517 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 26 Mar 2021 23:44:42 +0000 Subject: [PATCH 07/99] Eliminate unnecessary lookup in ensure when there is no snapshot Former-commit-id: 1f363ed7c13c186f0c120ab4f3e321144667f50f --- src/db.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/db.cpp b/src/db.cpp index 20b70fe02..524cdb66c 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -2581,6 +2581,8 @@ void redisDbPersistentData::updateValue(dict_iter itr, robj *val) void redisDbPersistentData::ensure(const char *key) { + if (m_pdbSnapshot == nullptr && m_spstorage == nullptr) + return; dictEntry *de = dictFind(m_pdict, key); ensure(key, &de); } From afeb4db219393e73195fab4cf5ecbb4129047773 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 26 Mar 2021 23:48:24 +0000 Subject: [PATCH 08/99] Prefetch keys even in pure RAM scenarios Former-commit-id: d7219de186d60a5a437c1828ac97117eaad34819 --- src/db.cpp | 25 ++++++++++++++++++++++++- src/networking.cpp | 4 ++-- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index 524cdb66c..9978a8ca5 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -35,6 +35,11 @@ #include #include +// Needed for prefetch +#if defined(__x86_64__) || defined(__i386__) +#include +#endif + /* Database backup. */ struct dbBackup { const redisDbPersistentDataSnapshot **dbarray; @@ -3002,8 +3007,26 @@ int dbnumFromDb(redisDb *db) void redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command) { - if (m_spstorage == nullptr) + if (m_spstorage == nullptr) { +#if defined(__x86_64__) || defined(__i386__) + // We do a quick 'n dirty check for set & get. Anything else is too slow. + // Should the user do something weird like remap them then the worst that will + // happen is we don't prefetch or we prefetch wrong data. 
A mild perf hit, but + // not dangerous + const char *cmd = szFromObj(command.argv[0]); + if (!strcasecmp(cmd, "set") || !strcasecmp(cmd, "get")) { + auto h = dictSdsHash(szFromObj(command.argv[1])); + for (int iht = 0; iht < 2; ++iht) { + auto hT = h & c->db->m_pdict->ht[iht].sizemask; + if (c->db->m_pdict->ht[iht].table != nullptr) + _mm_prefetch(c->db->m_pdict->ht[iht].table[hT], _MM_HINT_T1); + if (!dictIsRehashing(c->db->m_pdict)) + break; + } + } +#endif return; + } AeLocker lock; diff --git a/src/networking.cpp b/src/networking.cpp index 15aa6f43a..2919750bc 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -2358,8 +2358,8 @@ void parseClientCommandBuffer(client *c) { serverAssert(c->vecqueuedcmd.back().reploff >= 0); } - /* Prefetch if we have a storage provider and we're not in the global lock */ - if (cqueries < c->vecqueuedcmd.size() && g_pserver->m_pstorageFactory != nullptr && !GlobalLocksAcquired()) { + /* Prefetch outside the lock for better perf */ + if (cqueries < c->vecqueuedcmd.size() && !GlobalLocksAcquired()) { auto &query = c->vecqueuedcmd.back(); if (query.argc > 0 && query.argc == query.argcMax) { c->db->prefetchKeysAsync(c, query); From 33197a128d78ce41ae732481028cb37713389c61 Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 28 Mar 2021 17:58:43 +0000 Subject: [PATCH 09/99] Fix thread safety issues with the cache prefetch logic Former-commit-id: 4892122fc02109d98684a350bd732a0b08a8c7b4 --- src/db.cpp | 22 +++++++++++++--------- src/dict.cpp | 8 +++++--- src/redis-benchmark.cpp | 4 ++++ src/redis-cli.c | 5 +++++ src/server.cpp | 9 +++++++++ src/server.h | 20 ++++++++++++++++++++ 6 files changed, 56 insertions(+), 12 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index 9978a8ca5..59eb08a68 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3013,15 +3013,19 @@ void redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command // Should the user do something weird like remap them then the worst that will // happen is we don't prefetch or we prefetch wrong data. A mild perf hit, but // not dangerous - const char *cmd = szFromObj(command.argv[0]); - if (!strcasecmp(cmd, "set") || !strcasecmp(cmd, "get")) { - auto h = dictSdsHash(szFromObj(command.argv[1])); - for (int iht = 0; iht < 2; ++iht) { - auto hT = h & c->db->m_pdict->ht[iht].sizemask; - if (c->db->m_pdict->ht[iht].table != nullptr) - _mm_prefetch(c->db->m_pdict->ht[iht].table[hT], _MM_HINT_T1); - if (!dictIsRehashing(c->db->m_pdict)) - break; + if (command.argc >= 2) { + const char *cmd = szFromObj(command.argv[0]); + if (!strcasecmp(cmd, "set") || !strcasecmp(cmd, "get")) { + auto h = dictSdsHash(szFromObj(command.argv[1])); + for (int iht = 0; iht < 2; ++iht) { + auto hT = h & c->db->m_pdict->ht[iht].sizemask; + dictEntry **table; + __atomic_load(&c->db->m_pdict->ht[iht].table, &table, __ATOMIC_RELAXED); + if (table != nullptr) + _mm_prefetch(table[hT], _MM_HINT_T2); + if (!dictIsRehashing(c->db->m_pdict)) + break; + } } } #endif diff --git a/src/dict.cpp b/src/dict.cpp index 831a32f8f..4b9f0f6d1 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -74,6 +74,8 @@ static int _dictInit(dict *ht, dictType *type, void *privDataPtr); static uint8_t dict_hash_function_seed[16]; +extern "C" void asyncFreeDictTable(dictEntry **de); + void dictSetHashFunctionSeed(uint8_t *seed) { memcpy(dict_hash_function_seed,seed,sizeof(dict_hash_function_seed)); } @@ -359,7 +361,7 @@ int dictRehash(dict *d, int n) { /* Check if we already rehashed the whole table... 
*/
     if (d->ht[0].used == 0) {
-        zfree(d->ht[0].table);
+        asyncFreeDictTable(d->ht[0].table);
         d->ht[0] = d->ht[1];
         _dictReset(&d->ht[1]);
         d->rehashidx = -1;
@@ -487,7 +489,7 @@ void dictCompleteRehashAsync(dictAsyncRehashCtl *ctl, bool fFree) {
 
         /* Check if we already rehashed the whole table... */
         if (d->ht[0].used == 0 && d->asyncdata == nullptr) {
-            zfree(d->ht[0].table);
+            asyncFreeDictTable(d->ht[0].table);
             d->ht[0] = d->ht[1];
             _dictReset(&d->ht[1]);
             d->rehashidx = -1;
@@ -762,7 +764,7 @@ int _dictClear(dict *d, dictht *ht, void(callback)(void *)) {
         }
     }
     /* Free the table and the allocated cache structure */
-    zfree(ht->table);
+    asyncFreeDictTable(ht->table);
     /* Re-initialize the table */
     _dictReset(ht);
     return DICT_OK; /* never fails */
diff --git a/src/redis-benchmark.cpp b/src/redis-benchmark.cpp
index 41449099d..3b130679d 100644
--- a/src/redis-benchmark.cpp
+++ b/src/redis-benchmark.cpp
@@ -813,6 +813,10 @@ static int ipow(int base, int exp) {
     return result;
 }
 
+extern "C" void asyncFreeDictTable(dictEntry **de) {
+    zfree(de);
+}
+
 static void showLatencyReport(void) {
     int i, curlat = 0;
     int usbetweenlat = ipow(10, MAX_LATENCY_PRECISION-config.precision);
diff --git a/src/redis-cli.c b/src/redis-cli.c
index c555a711e..1bfe4c8a2 100644
--- a/src/redis-cli.c
+++ b/src/redis-cli.c
@@ -144,6 +144,11 @@ static void cliRefreshPrompt(void) {
     sdsfree(prompt);
 }
 
+struct dictEntry;
+void asyncFreeDictTable(struct dictEntry **de) {
+    zfree(de);
+}
+
 /* Return the name of the dotfile for the specified 'dotfilename'.
  * Normally it just concatenates user $HOME to the file specified
  * in 'dotfilename'. However if the environment variable 'envoverride'
diff --git a/src/server.cpp b/src/server.cpp
index 96d62e4ea..e4eba3453 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -2479,6 +2479,15 @@ int serverCronLite(struct aeEventLoop *eventLoop, long long id, void *clientData
     return 1000/g_pserver->hz;
 }
 
+extern "C" void asyncFreeDictTable(dictEntry **de)
+{
+    if (de == nullptr || serverTL == nullptr || serverTL->gcEpoch.isReset()) {
+        zfree(de);
+    } else {
+        g_pserver->garbageCollector.enqueueCPtr(serverTL->gcEpoch, de);
+    }
+}
+
 extern int ProcessingEventsWhileBlocked;
 void processClients();
 
diff --git a/src/server.h b/src/server.h
index 07f025c46..3612d93f0 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1786,6 +1786,19 @@ class GarbageCollectorCollection
     GarbageCollector garbageCollectorSnapshot;
     GarbageCollector garbageCollectorGeneric;
 
+    class CPtrCollectable : public ICollectable
+    {
+        void *m_pv;
+
+    public:
+        CPtrCollectable(void *pv)
+            : m_pv(pv)
+        {}
+        ~CPtrCollectable() {
+            zfree(m_pv);
+        }
+    };
+
 public:
     struct Epoch
     {
@@ -1831,6 +1844,13 @@ public:
     {
         garbageCollectorGeneric.enqueue(e.epochGeneric, std::move(sp));
     }
+
+    template<typename T>
+    void enqueueCPtr(Epoch e, T p)
+    {
+        auto sp = std::make_unique<CPtrCollectable>(reinterpret_cast<void*>(p));
+        enqueue(e, std::move(sp));
+    }
 };
 
 // Per-thread variabels that may be accessed without a lock
 
From 22b1ac1e8d487348b5e8856ace05d1dab0d0e349 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Sun, 28 Mar 2021 17:59:02 +0000
Subject: [PATCH 10/99] Excessive rehashing adds latency

Former-commit-id: ee5a4528d61420a18f89a07f4ac63e2181a19738
---
 src/dict.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dict.cpp b/src/dict.cpp
index 4b9f0f6d1..9acc01dc5 100644
--- a/src/dict.cpp
+++ b/src/dict.cpp
@@ -546,7 +546,7 @@ int dictRehashMilliseconds(dict *d, int ms) {
 static void _dictRehashStep(dict *d) {
     unsigned iterators;
     __atomic_load(&d->iterators, &iterators,
__ATOMIC_RELAXED); - if (iterators == 0) dictRehash(d,2); + if (iterators == 0) dictRehash(d,1); } /* Add an element to the target hash table */ From 999dba6619c05021a1be4390cf402621c9af562f Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 28 Mar 2021 18:27:00 +0000 Subject: [PATCH 11/99] Make some asserts debug only for perf Former-commit-id: dc66209f2cf8eadb794dad302bd1ea92890e75b0 --- src/networking.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 2919750bc..ff7f83d06 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1662,7 +1662,7 @@ int writeToClient(client *c, int handler_installed) { ssize_t nwritten = 0, totwritten = 0; clientReplyBlock *o; - AssertCorrectThread(c); + serverAssertDebug(FCorrectThread(c)); std::unique_locklock)> lock(c->lock); @@ -1881,7 +1881,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { processed += (int)vec.size(); for (client *c : vec) { - AssertCorrectThread(c); + serverAssertDebug(FCorrectThread(c)); uint64_t flags = c->flags.fetch_and(~CLIENT_PENDING_WRITE, std::memory_order_relaxed); @@ -2428,8 +2428,8 @@ void readQueryFromClient(connection *conn) { int nread, readlen; size_t qblen; - serverAssert(FCorrectThread(c)); - serverAssert(!GlobalLocksAcquired()); + serverAssertDebug(FCorrectThread(c) sdfsdf); + serverAssertDebug(!GlobalLocksAcquired()); AeLocker aelock; AssertCorrectThread(c); From e15f035bfbc9bc85843227a0bdf73b89c77472d3 Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 25 Mar 2021 23:14:48 +0000 Subject: [PATCH 12/99] Prevent unnecessary copies in replication scenarios Former-commit-id: b152a9bd88c081ce98eebe9a7af49649e60e5523 --- src/networking.cpp | 10 ++++++++++ src/replication.cpp | 3 +++ 2 files changed, 13 insertions(+) diff --git a/src/networking.cpp b/src/networking.cpp index ff7f83d06..e2819ff19 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -312,6 +312,16 @@ int prepareClientToWrite(client *c) { * Low level functions to add more data to output buffers. * -------------------------------------------------------------------------- */ +void _clientAsyncReplyBufferReserve(client *c, size_t len) { + if (c->replyAsync != nullptr) + return; + size_t newsize = std::max(len, (size_t)PROTO_ASYNC_REPLY_CHUNK_BYTES); + clientReplyBlock *replyNew = (clientReplyBlock*)zmalloc(sizeof(clientReplyBlock) + newsize); + replyNew->size = zmalloc_usable(replyNew) - sizeof(clientReplyBlock); + replyNew->used = 0; + c->replyAsync = replyNew; +} + /* Attempts to add the reply to the static buffer in the client struct. * Returns C_ERR if the buffer is full, or the reply list is not empty, * in which case the reply must be added to the reply list. 
*/ diff --git a/src/replication.cpp b/src/replication.cpp index 0589868cb..e78df9a62 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -4449,6 +4449,7 @@ void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long sdsfree(szFromObj(&objTtl)); } +void _clientAsyncReplyBufferReserve(client *c, size_t len); void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); @@ -4486,6 +4487,8 @@ void flushReplBacklogToClients() addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbCopy); } else { auto cbPhase1 = g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart; + if (fAsyncWrite) + _clientAsyncReplyBufferReserve(replica, cbPhase1 + g_pserver->repl_backlog_idx); addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1); addReplyProto(replica, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); serverAssert((cbPhase1 + g_pserver->repl_backlog_idx) == (g_pserver->master_repl_offset - g_pserver->repl_batch_offStart)); From 0b6a66ca55d6078e8deb9573a162b47e139682c7 Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 30 Mar 2021 20:44:22 +0000 Subject: [PATCH 13/99] Fix crash in RDB save Former-commit-id: b032809b3e978fe571b791179d32ecdc9c067045 --- src/gc.h | 6 +++--- src/server.h | 31 +++++++++++++++++++++++++++---- tests/unit/memefficiency.tcl | 7 ++++--- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/src/gc.h b/src/gc.h index 4715bc8de..5d92e38cb 100644 --- a/src/gc.h +++ b/src/gc.h @@ -52,7 +52,7 @@ public: void endEpoch(uint64_t epoch, bool fNoFree = false) { std::unique_lock lock(m_lock); - assert(m_setepochOutstanding.find(epoch) != m_setepochOutstanding.end()); + serverAssert(m_setepochOutstanding.find(epoch) != m_setepochOutstanding.end()); bool fMinElement = *std::min_element(m_setepochOutstanding.begin(), m_setepochOutstanding.end()); m_setepochOutstanding.erase(epoch); if (fNoFree) @@ -91,8 +91,8 @@ public: void enqueue(uint64_t epoch, std::unique_ptr &&sp) { std::unique_lock lock(m_lock); - assert(m_setepochOutstanding.find(epoch) != m_setepochOutstanding.end()); - assert(sp->FWillFreeChildDebug() == false); + serverAssert(m_setepochOutstanding.find(epoch) != m_setepochOutstanding.end()); + serverAssert(sp->FWillFreeChildDebug() == false); auto itr = std::find(m_vecepochs.begin(), m_vecepochs.end(), m_epochNext+1); if (itr == m_vecepochs.end()) diff --git a/src/server.h b/src/server.h index 3612d93f0..1fbb77c60 100644 --- a/src/server.h +++ b/src/server.h @@ -1794,7 +1794,13 @@ class GarbageCollectorCollection CPtrCollectable(void *pv) : m_pv(pv) {} - ~CPtrCollectable() { + + CPtrCollectable(CPtrCollectable &&move) { + m_pv = move.m_pv; + move.m_pv = nullptr; + } + + virtual ~CPtrCollectable() { zfree(m_pv); } }; @@ -1810,6 +1816,20 @@ public: epochGeneric = 0; } + Epoch() = default; + + Epoch (const Epoch &other) { + epochSnapshot = other.epochSnapshot; + epochGeneric = other.epochGeneric; + } + + Epoch &operator=(const Epoch &other) { + serverAssert(isReset()); + epochSnapshot = other.epochSnapshot; + epochGeneric = other.epochGeneric; + return *this; + } + bool isReset() const { return epochSnapshot == 0 && epochGeneric == 0; } @@ -1823,10 +1843,13 @@ public: return e; } - void endEpoch(Epoch e, bool fNoFree = false) + void endEpoch(Epoch &e, bool fNoFree = false) { - garbageCollectorSnapshot.endEpoch(e.epochSnapshot, fNoFree); - garbageCollectorGeneric.endEpoch(e.epochGeneric, fNoFree); + auto epochSnapshot = e.epochSnapshot; + auto epochGeneric = 
e.epochGeneric; + e.reset(); // We must do this early as GC'd dtors can themselves try to enqueue more data + garbageCollectorSnapshot.endEpoch(epochSnapshot, fNoFree); + garbageCollectorGeneric.endEpoch(epochGeneric, fNoFree); } void shutdown() diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index c0a6ec4d7..5bf69787b 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -154,7 +154,7 @@ start_server {tags {"defrag"} overrides {server-threads 1} } { $rd read ; # Discard replies } - set expected_frag 1.7 + set expected_frag 1.5 if {$::accurate} { # scale the hash to 1m fields in order to have a measurable the latency for {set j 10000} {$j < 1000000} {incr j} { @@ -265,7 +265,7 @@ start_server {tags {"defrag"} overrides {server-threads 1} } { # create big keys with 10k items set rd [redis_deferring_client] - set expected_frag 1.7 + set expected_frag 1.5 # add a mass of list nodes to two lists (allocations are interlaced) set val [string repeat A 100] ;# 5 items of 100 bytes puts us in the 640 bytes bin, which has 32 regs, so high potential for fragmentation set elements 500000 @@ -543,4 +543,5 @@ start_server {tags {"defrag"} overrides {server-threads 1 active-replica yes} } } {OK} } } -} ;# run solo \ No newline at end of file +} ;# run solo + From 6b2567298d809e74d6c17cf3bb6aab27342ceb5a Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 1 Apr 2021 18:17:05 +0000 Subject: [PATCH 14/99] Don't ensure if we don't have to Former-commit-id: b7b678a3ada531890e67313c867b7b49b01fe41e --- src/db.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/db.cpp b/src/db.cpp index 59eb08a68..39749f59d 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -2464,7 +2464,7 @@ void redisDb::storageProviderInitialize() bool redisDbPersistentData::insert(char *key, robj *o, bool fAssumeNew) { - if (!fAssumeNew) + if (!fAssumeNew && (g_pserver->m_pstorageFactory != nullptr || m_pdbSnapshot != nullptr)) ensure(key); int res = dictAdd(m_pdict, key, o); serverAssert(FImplies(fAssumeNew, res == DICT_OK)); From 6601905d8c3f10e444fd4ca431909c63a95d928b Mon Sep 17 00:00:00 2001 From: Cloud User Date: Tue, 6 Apr 2021 22:39:20 +0000 Subject: [PATCH 15/99] Make high pri time thread configurable Former-commit-id: 7e94207765d2166f46792aea0919786f3f30d7b3 --- src/config.cpp | 1 + src/server.cpp | 2 ++ src/server.h | 1 + tests/unit/introspection.tcl | 1 + 4 files changed, 5 insertions(+) diff --git a/src/config.cpp b/src/config.cpp index ca9181788..9385b440c 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -2535,6 +2535,7 @@ standardConfig configs[] = { createBoolConfig("use-fork", NULL, IMMUTABLE_CONFIG, cserver.fForkBgSave, 0, NULL, NULL), createBoolConfig("allow-write-during-load", NULL, MODIFIABLE_CONFIG, g_pserver->fWriteDuringActiveLoad, 0, NULL, NULL), createBoolConfig("io-threads-do-reads", NULL, IMMUTABLE_CONFIG, fDummy, 0, NULL, NULL), + createBoolConfig("time-thread-priority", NULL, IMMUTABLE_CONFIG, cserver.time_thread_priority, 0, NULL, NULL), /* String Configs */ createStringConfig("aclfile", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, g_pserver->acl_filename, "", NULL, NULL), diff --git a/src/server.cpp b/src/server.cpp index e4eba3453..f4a068892 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -6443,9 +6443,11 @@ int main(int argc, char **argv) { serverAssert(cserver.cthreads > 0 && cserver.cthreads <= MAX_EVENT_LOOPS); pthread_create(&cserver.time_thread_id, nullptr, timeThreadMain, nullptr); +if (cserver.time_thread_priority) { struct 
sched_param time_thread_priority; time_thread_priority.sched_priority = sched_get_priority_max(SCHED_FIFO); pthread_setschedparam(cserver.time_thread_id, SCHED_FIFO, &time_thread_priority); +} pthread_attr_t tattr; pthread_attr_init(&tattr); diff --git a/src/server.h b/src/server.h index 1fbb77c60..cb93a2047 100644 --- a/src/server.h +++ b/src/server.h @@ -2000,6 +2000,7 @@ struct redisServerConst { int storage_memory_model = STORAGE_WRITETHROUGH; char *storage_conf = nullptr; int fForkBgSave = false; + int time_thread_priority = false; }; struct redisServer { diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index 54d9dbbcd..30376ba45 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -90,6 +90,7 @@ start_server {tags {"introspection"}} { server_cpulist bio_cpulist aof_rewrite_cpulist + time-thread-priority bgsave_cpulist storage-cache-mode storage-provider-options From eb818d4c0095e8c9f8cc40c032670334607c5fe2 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 7 Apr 2021 15:25:22 +0000 Subject: [PATCH 16/99] Fix build break from merge conflicts Former-commit-id: 5052388bd43b0e42430fd741fb2b403544684246 --- src/db.cpp | 2 +- src/networking.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index e4f1a5f99..994ee2b41 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3029,7 +3029,7 @@ bool redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command } } #endif - return; + return false; } AeLocker lock; diff --git a/src/networking.cpp b/src/networking.cpp index fce10c52d..82b731e19 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -2369,7 +2369,7 @@ void parseClientCommandBuffer(client *c) { } /* Prefetch outside the lock for better perf */ - if (cqueries < c->vecqueuedcmd.size() && !GlobalLocksAcquired()) { + if (cqueriesStart < c->vecqueuedcmd.size() && !GlobalLocksAcquired()) { auto &query = c->vecqueuedcmd.back(); if (query.argc > 0 && query.argc == query.argcMax) { if (c->db->prefetchKeysAsync(c, query, c->vecqueuedcmd.size() == 1)) { From 55a118ecd005b351a75262fca761bb4fcf137c4a Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 7 Apr 2021 15:49:58 +0000 Subject: [PATCH 17/99] Reduce connection latency for cluster clients Former-commit-id: f1acee99bdaee4faf9e18cdf4734d51a73e78a41 --- src/networking.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/networking.cpp b/src/networking.cpp index 82b731e19..ab0a335e4 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1259,7 +1259,9 @@ void acceptOnThread(connection *conn, int flags, char *cip) } else { - ielTarget = chooseBestThreadForAccept(); + // Cluster connections are more transient, so its not worth the cost to balance + // we can trust that SO_REUSEPORT is doing its job of distributing connections + ielTarget = g_pserver->cluster_enabled ? 
ielCur : chooseBestThreadForAccept(); } rgacceptsInFlight[ielTarget].fetch_add(1, std::memory_order_relaxed); From 635ddad7fc78725296d99869dfbab643ed23e49b Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 7 Apr 2021 17:30:59 +0000 Subject: [PATCH 18/99] Fix issue where GC is not free'd until a BGSAVE Former-commit-id: e19350005b571591876e49219bfca75e905604a5 --- src/server.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/server.cpp b/src/server.cpp index 23cf9d343..71d2a5ed4 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2440,6 +2440,16 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { /* CRON functions may trigger async writes, so do this last */ ProcessPendingAsyncWrites(); + run_with_period(10) { + // Server threads don't free the GC, but if we don't have a + // a bgsave or some other async task then we'll hold onto the + // data for too long + g_pserver->asyncworkqueue->AddWorkFunction([]{ + auto epoch = g_pserver->garbageCollector.startEpoch(); + g_pserver->garbageCollector.endEpoch(epoch); + }); + } + g_pserver->cronloops++; return 1000/g_pserver->hz; } From 571718f774142e095be8caf24da3cc6efb22ad84 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 7 Apr 2021 17:41:20 +0000 Subject: [PATCH 19/99] Avoid posting unnecessary async tasks Former-commit-id: 08f63fdfd6c78388bba093ec2edda7d77fc9173e --- src/gc.h | 8 +++++++- src/server.cpp | 16 +++++++++------- src/server.h | 5 +++++ 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/gc.h b/src/gc.h index 5d92e38cb..8ed834ae9 100644 --- a/src/gc.h +++ b/src/gc.h @@ -49,6 +49,12 @@ public: m_setepochOutstanding.clear(); } + bool empty() const + { + std::unique_lock lock(m_lock); + return m_vecepochs.empty(); + } + void endEpoch(uint64_t epoch, bool fNoFree = false) { std::unique_lock lock(m_lock); @@ -109,7 +115,7 @@ public: } private: - fastlock m_lock { "Garbage Collector"}; + mutable fastlock m_lock { "Garbage Collector"}; std::vector m_vecepochs; std::unordered_set m_setepochOutstanding; diff --git a/src/server.cpp b/src/server.cpp index 71d2a5ed4..a323053f9 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2441,13 +2441,15 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { ProcessPendingAsyncWrites(); run_with_period(10) { - // Server threads don't free the GC, but if we don't have a - // a bgsave or some other async task then we'll hold onto the - // data for too long - g_pserver->asyncworkqueue->AddWorkFunction([]{ - auto epoch = g_pserver->garbageCollector.startEpoch(); - g_pserver->garbageCollector.endEpoch(epoch); - }); + if (!g_pserver->garbageCollector.empty()) { + // Server threads don't free the GC, but if we don't have a + // a bgsave or some other async task then we'll hold onto the + // data for too long + g_pserver->asyncworkqueue->AddWorkFunction([]{ + auto epoch = g_pserver->garbageCollector.startEpoch(); + g_pserver->garbageCollector.endEpoch(epoch); + }); + } } g_pserver->cronloops++; diff --git a/src/server.h b/src/server.h index 2f95f02bc..0e8bf8ccb 100644 --- a/src/server.h +++ b/src/server.h @@ -1852,6 +1852,11 @@ public: garbageCollectorGeneric.endEpoch(epochGeneric, fNoFree); } + bool empty() + { + return garbageCollectorGeneric.empty() && garbageCollectorSnapshot.empty(); + } + void shutdown() { garbageCollectorSnapshot.shutdown(); From 69f7a194bc260111f02a05ed52ccd7ebf9fcccb5 Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 8 Apr 2021 19:52:38 +0000 Subject: [PATCH 20/99] Make prefetch 
configurable Former-commit-id: 16996c048042bd3799c8051645bbe7c54137d54c --- src/config.cpp | 1 + src/networking.cpp | 2 +- src/server.h | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/config.cpp b/src/config.cpp index 9385b440c..f7609feb7 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -2536,6 +2536,7 @@ standardConfig configs[] = { createBoolConfig("allow-write-during-load", NULL, MODIFIABLE_CONFIG, g_pserver->fWriteDuringActiveLoad, 0, NULL, NULL), createBoolConfig("io-threads-do-reads", NULL, IMMUTABLE_CONFIG, fDummy, 0, NULL, NULL), createBoolConfig("time-thread-priority", NULL, IMMUTABLE_CONFIG, cserver.time_thread_priority, 0, NULL, NULL), + createBoolConfig("prefetch-enabled", NULL, MODIFIABLE_CONFIG, g_pserver->prefetch_enabled, 1, NULL, NULL), /* String Configs */ createStringConfig("aclfile", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, g_pserver->acl_filename, "", NULL, NULL), diff --git a/src/networking.cpp b/src/networking.cpp index ab0a335e4..b8d3ef0b2 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -2371,7 +2371,7 @@ void parseClientCommandBuffer(client *c) { } /* Prefetch outside the lock for better perf */ - if (cqueriesStart < c->vecqueuedcmd.size() && !GlobalLocksAcquired()) { + if (g_pserver->prefetch_enabled && cqueriesStart < c->vecqueuedcmd.size() && !GlobalLocksAcquired()) { auto &query = c->vecqueuedcmd.back(); if (query.argc > 0 && query.argc == query.argcMax) { if (c->db->prefetchKeysAsync(c, query, c->vecqueuedcmd.size() == 1)) { diff --git a/src/server.h b/src/server.h index 0e8bf8ccb..8619ee28e 100644 --- a/src/server.h +++ b/src/server.h @@ -2399,6 +2399,8 @@ struct redisServer { char *aof_rewrite_cpulist; /* cpu affinity list of aof rewrite process. */ char *bgsave_cpulist; /* cpu affinity list of bgsave process. 
*/
 
+    int prefetch_enabled = 1;
+
     long long repl_batch_offStart = -1;
     long long repl_batch_idxStart = -1;
 
From 2d2ae90f30c70e0d22be52939b749d4991f3cc72 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Mon, 12 Apr 2021 03:22:22 +0000
Subject: [PATCH 21/99] Reduce P99 latency with async rehash

Former-commit-id: 6c045837c7cf92dc92be35465229b482e09e46d2
---
 src/dict.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dict.cpp b/src/dict.cpp
index 9acc01dc5..88ad116bb 100644
--- a/src/dict.cpp
+++ b/src/dict.cpp
@@ -384,7 +384,7 @@ dictAsyncRehashCtl *dictRehashAsyncStart(dict *d, int buckets) {
 
     d->asyncdata = new dictAsyncRehashCtl(d, d->asyncdata);
 
-    int empty_visits = buckets * 10;
+    int empty_visits = buckets;
 
     while (d->asyncdata->queue.size() < (size_t)buckets && (size_t)d->rehashidx < d->ht[0].size) {
         dictEntry *de;
 
From 5ccaa9265c2654c486dfdcee7f240bbe99c00a26 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Mon, 12 Apr 2021 03:23:49 +0000
Subject: [PATCH 22/99] Don't prefetch when lock contention is low, it increases latency

Former-commit-id: 9b2629f6a20368cec8e55f0d006f3a67c8b770b7
---
 src/ae.cpp | 16 ++++++++++++++++
 src/ae.h | 2 ++
 src/networking.cpp | 3 ++-
 src/server.cpp | 21 +++++++++++++++++++--
 src/server.h | 5 +++++
 5 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/src/ae.cpp b/src/ae.cpp
index d88328ded..f96ef4f6c 100644
--- a/src/ae.cpp
+++ b/src/ae.cpp
@@ -870,3 +870,19 @@ int aeThreadOwnsLock()
 {
     return g_lock.fOwnLock();
 }
+
+int aeLockContested(int threshold)
+{
+    return g_lock.m_ticket.m_active < static_cast<uint16_t>(g_lock.m_ticket.m_avail - threshold);
+}
+
+int aeLockContention()
+{
+    ticket ticketT;
+    __atomic_load(&g_lock.m_ticket.u, &ticketT.u, __ATOMIC_RELAXED);
+    int32_t avail = ticketT.m_avail;
+    int32_t active = ticketT.m_active;
+    if (avail < active)
+        avail += 0x10000;
+    return avail - active;
+}
\ No newline at end of file
diff --git a/src/ae.h b/src/ae.h
index 3a240877d..aec1df154 100644
--- a/src/ae.h
+++ b/src/ae.h
@@ -169,6 +169,8 @@ void aeAcquireLock();
 int aeTryAcquireLock(int fWeak);
 void aeReleaseLock();
 int aeThreadOwnsLock();
+int aeLockContested(int threshold);
+int aeLockContention(); // returns the number of instantaneous threads waiting on the lock
 
 #ifdef __cplusplus
 }
diff --git a/src/networking.cpp b/src/networking.cpp
index b8d3ef0b2..58ff54bac 100644
--- a/src/networking.cpp
+++ b/src/networking.cpp
@@ -2371,7 +2371,8 @@ void parseClientCommandBuffer(client *c) {
     }
 
     /* Prefetch outside the lock for better perf */
-    if (g_pserver->prefetch_enabled && cqueriesStart < c->vecqueuedcmd.size() && !GlobalLocksAcquired()) {
+    if (g_pserver->prefetch_enabled && cqueriesStart < c->vecqueuedcmd.size() &&
+        (g_pserver->m_pstorageFactory || aeLockContested(cserver.cthreads/2)) && !GlobalLocksAcquired()) {
         auto &query = c->vecqueuedcmd.back();
         if (query.argc > 0 && query.argc == query.argcMax) {
             if (c->db->prefetchKeysAsync(c, query, c->vecqueuedcmd.size() == 1)) {
diff --git a/src/server.cpp b/src/server.cpp
index a323053f9..fbcd0cc43 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -2452,6 +2452,14 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
         }
     }
 
+    // Measure lock contention from a different thread to be more accurate
+    g_pserver->asyncworkqueue->AddWorkFunction([]{
+        g_pserver->rglockSamples[g_pserver->ilockRingHead] = (uint16_t)aeLockContention();
+        ++g_pserver->ilockRingHead;
+        if (g_pserver->ilockRingHead >= redisServer::s_lockContentionSamples)
+            g_pserver->ilockRingHead = 0;
+    });
+
g_pserver->cronloops++; return 1000/g_pserver->hz; } @@ -5138,6 +5146,11 @@ sds genRedisInfoString(const char *section) { /* Stats */ if (allsections || defsections || !strcasecmp(section,"stats")) { + double avgLockContention = 0; + for (unsigned i = 0; i < redisServer::s_lockContentionSamples; ++i) + avgLockContention += g_pserver->rglockSamples[i]; + avgLockContention /= redisServer::s_lockContentionSamples; + if (sections++) info = sdscat(info,"\r\n"); info = sdscatprintf(info, "# Stats\r\n" @@ -5173,7 +5186,9 @@ sds genRedisInfoString(const char *section) { "tracking_total_prefixes:%lld\r\n" "unexpected_error_replies:%lld\r\n" "total_reads_processed:%lld\r\n" - "total_writes_processed:%lld\r\n", + "total_writes_processed:%lld\r\n" + "instantaneous_lock_contention:%d\r\n" + "avg_lock_contention:%f\r\n", g_pserver->stat_numconnections, g_pserver->stat_numcommands, getInstantaneousMetric(STATS_METRIC_COMMAND), @@ -5206,7 +5221,9 @@ sds genRedisInfoString(const char *section) { (unsigned long long) trackingGetTotalPrefixes(), g_pserver->stat_unexpected_error_replies, g_pserver->stat_total_reads_processed.load(std::memory_order_relaxed), - g_pserver->stat_total_writes_processed.load(std::memory_order_relaxed)); + g_pserver->stat_total_writes_processed.load(std::memory_order_relaxed), + aeLockContention(), + avgLockContention); } /* Replication */ diff --git a/src/server.h b/src/server.h index 8619ee28e..bb895a814 100644 --- a/src/server.h +++ b/src/server.h @@ -2404,6 +2404,11 @@ struct redisServer { long long repl_batch_offStart = -1; long long repl_batch_idxStart = -1; + /* Lock Contention Ring Buffer */ + static const size_t s_lockContentionSamples = 64; + uint16_t rglockSamples[s_lockContentionSamples]; + unsigned ilockRingHead = 0; + bool FRdbSaveInProgress() const { return rdbThreadVars.fRdbThreadActive; } }; From da545be5b5d72ada7538c9ea26349d23ab1752d0 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 12 Apr 2021 03:24:05 +0000 Subject: [PATCH 23/99] Make prefetch more aggressive Former-commit-id: 9cb0be197fb96d7fb3cb697c19b5be313b01337a --- src/db.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index 994ee2b41..00a001877 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3021,8 +3021,13 @@ bool redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command auto hT = h & c->db->m_pdict->ht[iht].sizemask; dictEntry **table; __atomic_load(&c->db->m_pdict->ht[iht].table, &table, __ATOMIC_RELAXED); - if (table != nullptr) - _mm_prefetch(table[hT], _MM_HINT_T2); + if (table != nullptr) { + dictEntry *de = table[hT]; + while (de != nullptr) { + _mm_prefetch(dictGetKey(de), _MM_HINT_T2); + de = de->next; + } + } if (!dictIsRehashing(c->db->m_pdict)) break; } From 9c1cf39b70c7c9670d5c7acf9f5194ae3bd7e47a Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 21 Apr 2021 01:18:38 +0000 Subject: [PATCH 24/99] _dictExpandIfNeeded is called too late to be useful Former-commit-id: 7f75ca5d3a9ed47465bceb22f5f74fd6f0760008 --- src/dict.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dict.cpp b/src/dict.cpp index 88ad116bb..41e1a05da 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -681,6 +681,7 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { } } d->ht[table].used--; + _dictExpandIfNeeded(d); return he; } prevHe = he; @@ -689,7 +690,6 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { if (!dictIsRehashing(d)) break; } - _dictExpandIfNeeded(d); return 
NULL; /* not found */ } From 6e546b3c441dc1dbde30ad708c8e571fb8a19cdb Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 21 Apr 2021 01:19:29 +0000 Subject: [PATCH 25/99] Fix issue where we reply a dangling pointer Former-commit-id: fd11e490c39fe876d979eace5eaec56c645cdfcf --- src/db.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/db.cpp b/src/db.cpp index 00a001877..afed04ee8 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3076,6 +3076,7 @@ bool redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command } } + bool fNoInsert = false; if (!vecInserts.empty()) { lock.arm(c); for (auto &tuple : vecInserts) @@ -3091,9 +3092,16 @@ bool redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command // While unlocked this was already ensured decrRefCount(o); sdsfree(sharedKey); + fNoInsert = true; } else { + if (spexpire != nullptr) { + if (spexpire->when() < mstime()) { + fNoInsert = true; + break; + } + } dictAdd(m_pdict, sharedKey, o); o->SetFExpires(spexpire != nullptr); @@ -3117,7 +3125,7 @@ bool redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command lock.disarm(); } - if (fExecOK && cmd->proc == getCommand && !vecInserts.empty()) { + if (fExecOK && !fNoInsert && cmd->proc == getCommand && !vecInserts.empty()) { robj *o = std::get<1>(vecInserts[0]); if (o != nullptr) { addReplyBulk(c, o); From 3966f8486254684e225f2d7f3cef41905997d746 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 21 Apr 2021 01:19:44 +0000 Subject: [PATCH 26/99] Better RocksDB perf tuning Former-commit-id: cbe1e0a7c1b5dc49c1adff07b7c32042e673acf4 From 95ae92a6911e9b2943bf3b062994eecc23a4824a Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 21 Apr 2021 01:20:13 +0000 Subject: [PATCH 27/99] Optimize freeMemory by lazy freeing objects Former-commit-id: cca31ed5ee2f42975f0051cfabf1e88720b3d678 --- src/db.cpp | 8 ++++-- src/evict.cpp | 75 ++++++++++++++++++++++++++++++++++++++++++++++++--- src/server.h | 2 +- 3 files changed, 78 insertions(+), 7 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index afed04ee8..13734aed5 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -2838,7 +2838,7 @@ size_t redisDbPersistentData::size() const + (m_pdbSnapshot ? 
(m_pdbSnapshot->size() - dictSize(m_pdictTombstone)) : 0);
 }
 
-bool redisDbPersistentData::removeCachedValue(const char *key)
+bool redisDbPersistentData::removeCachedValue(const char *key, dictEntry **ppde)
 {
     serverAssert(m_spstorage != nullptr);
     // First ensure its not a pending key
@@ -2854,7 +2854,11 @@ bool redisDbPersistentData::removeCachedValue(const char *key)
     }
 
     // since we write ASAP the database already has a valid copy so safe to delete
-    dictDelete(m_pdict, key);
+    if (ppde != nullptr) {
+        *ppde = dictUnlink(m_pdict, key);
+    } else {
+        dictDelete(m_pdict, key);
+    }
 
     if (m_spstorage != nullptr)
         m_spstorage->batch_unlock();
diff --git a/src/evict.cpp b/src/evict.cpp
index 887b100b9..3e1888ae1 100644
--- a/src/evict.cpp
+++ b/src/evict.cpp
@@ -34,6 +34,7 @@
 #include "bio.h"
 #include "atomicvar.h"
 #include
+#include
 
 /* ----------------------------------------------------------------------------
  * Data structures
@@ -475,6 +476,64 @@ int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *lev
     return C_ERR;
 }
 
+class FreeMemoryLazyFree : public ICollectable
+{
+    ssize_t m_cb = 0;
+    std::vector<std::pair<dict*, std::vector<dictEntry*>>> vecdictvecde;
+
+public:
+    static std::atomic<int> s_clazyFreesInProgress;
+
+    FreeMemoryLazyFree() {
+        s_clazyFreesInProgress++;
+    }
+
+    ~FreeMemoryLazyFree() {
+        --s_clazyFreesInProgress;
+        for (auto &pair : vecdictvecde) {
+            for (auto de : pair.second) {
+                dictFreeUnlinkedEntry(pair.first, de);
+            }
+        }
+    }
+
+    ssize_t addEntry(dict *d, dictEntry *de) {
+        ssize_t cbFreedNow = 0;
+        ssize_t cb = sizeof(dictEntry);
+        cb += sdsAllocSize((sds)dictGetKey(de));
+        robj *o = (robj*)dictGetVal(de);
+        switch (o->type) {
+        case OBJ_STRING:
+            cb += getStringObjectSdsUsedMemory(o)+sizeof(robj);
+            break;
+
+        default:
+            // If we don't know about it we can't accurately track the memory so free now
+            cbFreedNow = zmalloc_used_memory();
+            decrRefCount(o);
+            cbFreedNow -= zmalloc_used_memory();
+            de->v.val = nullptr;
+        }
+
+        auto itr = std::lower_bound(vecdictvecde.begin(), vecdictvecde.end(), d,
+            [](const std::pair<dict*, std::vector<dictEntry*>> &a, dict *d) -> bool {
+                return a.first < d;
+            }
+        );
+        if (itr == vecdictvecde.end() || itr->first != d) {
+            itr = vecdictvecde.insert(itr, std::make_pair(d, std::vector<dictEntry*>()));
+        }
+        serverAssert(itr->first == d);
+        itr->second.push_back(de);
+        m_cb += cb;
+        return cb + cbFreedNow;
+    }
+
+    size_t memory_queued() { return m_cb; }
+};
+
+std::atomic<int> FreeMemoryLazyFree::s_clazyFreesInProgress {0};
+
 /* This function is periodically called to see if there is memory to free
  * according to the current "maxmemory" settings. In case we are over the
  * memory limit, the function will try to free some memory to return back
@@ -490,6 +549,9 @@ int freeMemoryIfNeeded(bool fQuickCycle, bool fPreSnapshot) {
     /* By default replicas should ignore maxmemory
      * and just be masters exact copies. */
     if (g_pserver->m_pstorageFactory == nullptr && listLength(g_pserver->masters) && g_pserver->repl_slave_ignore_maxmemory && !g_pserver->fActiveReplica) return C_OK;
+
+    /* If we have a lazy free obj pending, our amounts will be off, wait for it to go away */
+    if (FreeMemoryLazyFree::s_clazyFreesInProgress > 0) return C_OK;
 
     size_t mem_reported, mem_tofree, mem_freed;
     mstime_t latency, eviction_latency, lazyfree_latency;
@@ -500,6 +562,8 @@ int freeMemoryIfNeeded(bool fQuickCycle, bool fPreSnapshot) {
     int ckeysFailed = 0;
     int keys_freed = 0;
 
+    std::unique_ptr<FreeMemoryLazyFree> splazy = std::make_unique<FreeMemoryLazyFree>();
+
     if (g_pserver->maxstorage && g_pserver->m_pstorageFactory != nullptr && g_pserver->m_pstorageFactory->totalDiskspaceUsed() >= g_pserver->maxstorage)
         goto cant_free_storage;
@@ -623,9 +687,9 @@ int freeMemoryIfNeeded(bool fQuickCycle, bool fPreSnapshot) {
             if (fEvictToStorage)
             {
                 // This key is in the storage so we only need to free the object
-                delta = (long long) zmalloc_used_memory();
-                if (db->removeCachedValue(bestkey)) {
-                    delta -= (long long) zmalloc_used_memory();
+                dictEntry *deT;
+                if (db->removeCachedValue(bestkey, &deT)) {
+                    mem_freed += splazy->addEntry(db->dictUnsafeKeyOnly(), deT);
                     ckeysFailed = 0;
                 }
                 else {
@@ -634,7 +698,6 @@ int freeMemoryIfNeeded(bool fQuickCycle, bool fPreSnapshot) {
                     if (ckeysFailed > 1024)
                         goto cant_free;
                 }
-                mem_freed += delta;
             }
             else
             {
@@ -691,6 +754,10 @@ int freeMemoryIfNeeded(bool fQuickCycle, bool fPreSnapshot) {
     }
 
     result = C_OK;
+    if (splazy != nullptr && splazy->memory_queued() > 0 && !serverTL->gcEpoch.isReset()) {
+        g_pserver->garbageCollector.enqueue(serverTL->gcEpoch, std::move(splazy));
+    }
+
 cant_free:
     if (g_pserver->m_pstorageFactory)
     {
diff --git a/src/server.h b/src/server.h
index bb895a814..9d3e70197 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1131,7 +1131,7 @@ public:
     void restoreSnapshot(const redisDbPersistentDataSnapshot *psnapshot);
 
     bool FStorageProvider() { return m_spstorage != nullptr; }
-    bool removeCachedValue(const char *key);
+    bool removeCachedValue(const char *key, dictEntry **ppde = nullptr);
     void removeAllCachedValues();
 
     bool prefetchKeysAsync(client *c, struct parsed_command &command, bool fExecOK);
 
From 05fe41b33a3b1ebc25a5c17748a5c00a3e0a2984 Mon Sep 17 00:00:00 2001
From: VivekSainiEQ
Date: Mon, 26 Apr 2021 22:13:32 +0000
Subject: [PATCH 28/99] Primitive implementation of bypassing client buffer, stats are all messed up and print statements everywhere

Former-commit-id: 8ae310fb0f7b53add826f76891da333b63860001
---
 src/networking.cpp | 159 +++++++++++++++++++++++++++++++++++++++-----
 src/replication.cpp | 156 +++++++++++++++++++++++++++++++++++++++++++
 src/server.h | 11 +++
 3 files changed, 308 insertions(+), 18 deletions(-)

diff --git a/src/networking.cpp b/src/networking.cpp
index 574a8bc6c..18ab382bd 100644
--- a/src/networking.cpp
+++ b/src/networking.cpp
@@ -224,6 +224,7 @@ void clientInstallWriteHandler(client *c) {
         (c->replstate == REPL_STATE_NONE ||
          (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack)))
     {
+        // serverLog(LL_NOTICE, "we installing boyz");
         AssertCorrectThread(c);
         serverAssert(c->lock.fOwnLock());
         /* Here instead of installing the write handler, we just flag the
@@ -301,7 +302,7 @@ int prepareClientToWrite(client *c) {
 
     /* Schedule the client to write the output buffers to the socket, unless
      * it should already be setup to do so (it has already pending data.
*/ - if (!fAsync && !clientHasPendingReplies(c)) clientInstallWriteHandler(c); + if (!fAsync && !clientHasPendingReplies(c) && c->repl_curr_idx == -1) clientInstallWriteHandler(c); if (fAsync && !(c->fPendingAsyncWrite)) clientInstallAsyncWriteHandler(c); /* Authorize the caller to queue in the output buffer of this client. */ @@ -1676,15 +1677,33 @@ int writeToClient(client *c, int handler_installed) { std::unique_locklock)> lock(c->lock); + /* if this is a write to a replica, it's coming straight from the replication backlog */ + long long repl_backlog_idx = g_pserver->repl_backlog_idx; + + bool wroteFromClientBuffer = false; /* True if you wrote from the client buffer in this function call */ + while(clientHasPendingReplies(c)) { + wroteFromClientBuffer = true; + if (c->flags & CLIENT_SLAVE && listLength(c->reply) % 10 == 0){ + + serverLog(LL_NOTICE, "-----------------------------------------"); + serverLog(LL_NOTICE, "replica w/ pending replies, with a reply list size of: %lu", listLength(c->reply)); + serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + serverLog(LL_NOTICE, "-----------------------------------------"); + + } if (c->bufpos > 0) { + // serverLog(LL_NOTICE, "Sending reply %d", x); + // serverLog(LL_NOTICE, "SUSSUS AMOGUS, %ld", c->bufpos); nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); if (nwritten <= 0) break; c->sentlen += nwritten; totwritten += nwritten; /* If the buffer was sent, set bufpos to zero to continue with - * the remainder of the reply. */ + * the remainder of the reply. */ + // serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); if ((int)c->sentlen == c->bufpos) { c->bufpos = 0; c->sentlen = 0; @@ -1714,23 +1733,112 @@ int writeToClient(client *c, int handler_installed) { } } /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT - * bytes, in a single threaded server it's a good idea to serve - * other clients as well, even if a very large request comes from - * super fast link that is always able to accept data (in real world - * scenario think about 'KEYS *' against the loopback interface). - * - * However if we are over the maxmemory limit we ignore that and - * just deliver as much data as it is possible to deliver. - * - * Moreover, we also send as much as possible if the client is - * a replica or a monitor (otherwise, on high-speed traffic, the - * replication/output buffer will grow indefinitely) */ + * bytes, in a single threaded server it's a good idea to serve + * other clients as well, even if a very large request comes from + * super fast link that is always able to accept data (in real world + * scenario think about 'KEYS *' against the loopback interface). + * + * However if we are over the maxmemory limit we ignore that and + * just deliver as much data as it is possible to deliver. 
+ * + * Moreover, we also send as much as possible if the client is + * a replica or a monitor (otherwise, on high-speed traffic, the + * replication/output buffer will grow indefinitely) */ if (totwritten > NET_MAX_WRITES_PER_EVENT && (g_pserver->maxmemory == 0 || - zmalloc_used_memory() < g_pserver->maxmemory) && + zmalloc_used_memory() < g_pserver->maxmemory) && !(c->flags & CLIENT_SLAVE)) break; } - + + /* If there are no more pending replies, then we have transmitted the RDB. + * This means further replication commands will be taken straight from the + * replication backlog from now on. */ + if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c)){ + if (!c->transmittedRDB) + serverLog(LL_NOTICE, "---------->>>>>>>> TRANSMISSION OF THE RDB HAS COMPLETED <<<<<<<<----------"); + c->transmittedRDB = true; + } + + /* For replicas, we don't store all the information in the client buffer + * Most of the time (aside from immediately after synchronizing), we read + * from the replication backlog directly */ + if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1 && c->transmittedRDB){ + /* copy global variables into local scope so if they change in between we don't care */ + long long repl_backlog_size = g_pserver->repl_backlog_size; + long long nwrittenPart2 = 0; + + ssize_t nrequested; /* The number of bytes requested to write */ + /* normal case with no wrap around */ + if (repl_backlog_idx >= c->repl_curr_idx){ + nrequested = repl_backlog_idx - c->repl_curr_idx; + nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_idx - c->repl_curr_idx); + /* wrap around case, v. rare */ + /* also v. buggy so there's that */ + } else { + serverLog(LL_NOTICE, "WRAP CASE"); + serverLog(LL_NOTICE, "-----------------------------------------"); + serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + serverLog(LL_NOTICE, "actually written: %ld", nwritten); + serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); + serverLog(LL_NOTICE, "nwritten: %ld", nwritten); + serverLog(LL_NOTICE, "-----------------------------------------"); + + nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; + nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); + /* only attempt wrapping if we write the correct number of bytes */ + if (nwritten == repl_backlog_size - c->repl_curr_idx){ + serverLog(LL_NOTICE, "SECOND STAGE"); + serverLog(LL_NOTICE, "-----------------------------------------"); + serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + serverLog(LL_NOTICE, "actually written: %ld", nwritten); + serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); + serverLog(LL_NOTICE, "-----------------------------------------"); + + long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); + if (nwrittenPart2 != -1) + nwritten += nwrittenPart2; + + serverLog(LL_NOTICE, "nwrittenPart2: %lld", nwrittenPart2); + serverLog(LL_NOTICE, "-----------------------------------------"); + } else { + serverLog(LL_NOTICE, "SUPER SHORT"); + } + + } + + /* only update the replica's current index 
if bytes were sent */ + + // if (nrequested != nwritten){ + serverLog(LL_NOTICE, "-----------------------------------------"); + serverLog(LL_NOTICE, "AFTER THE FACT"); + serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + serverLog(LL_NOTICE, "actually written: %ld", nwritten); + serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + serverLog(LL_NOTICE, "-----------------------------------------"); + // } + + + if (nwritten == nrequested){ + c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ + } + else if (nwritten > 0) + c->repl_curr_idx = (c->repl_curr_idx + nwritten) % repl_backlog_size; + + serverAssert(c->repl_curr_idx < repl_backlog_size); + + /* only increment bytes if an error didn't occur */ + if (nwritten > 0){ + totwritten += nwritten; + c->repl_curr_off += nwritten; + } + + /* If the second part of a write didn't go through, we still need to register that */ + if (nwrittenPart2 == -1) nwritten = -1; + } + g_pserver->stat_net_output_bytes += totwritten; if (nwritten == -1) { if (connGetState(c->conn) == CONN_STATE_CONNECTED) { @@ -1750,7 +1858,7 @@ int writeToClient(client *c, int handler_installed) { * We just rely on data / pings received for timeout detection. */ if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } - if (!clientHasPendingReplies(c)) { + if (!clientHasPendingReplies(c) && c->repl_curr_idx == -1) { c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -1904,6 +2012,12 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* Don't write to clients that are going to be closed anyway. */ if (c->flags & CLIENT_CLOSE_ASAP) continue; + // if (c->flags & CLIENT_SLAVE){ + // if(clientHasPendingReplies(c)) + // serverLog(LL_NOTICE, "somehow the client buffer has these values: %s", c->buf); + // serverLog(LL_NOTICE, "LOL"); + // } + /* Try to write buffers to the client socket. */ if (writeToClient(c,0) == C_ERR) { @@ -1920,7 +2034,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. */ - if (clientHasPendingReplies(c)) { + if (clientHasPendingReplies(c) || c->repl_curr_idx != -1) { if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) freeClientAsync(c); } @@ -3268,6 +3382,13 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { } } +/* In the case of a replica client, it is possible (and very likely) + * that writes to said replica are using data from the replication backlog + * as opposed to it's own internal buffer, this number should keep track of that */ +unsigned long getClientReplicationBacklogSharedUsage(client *c) { + return (c->repl_curr_idx == -1 && c->flags & CLIENT_SLAVE) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; +} + /* This function returns the number of bytes that Redis is * using to store the reply still not read by the client. * @@ -3276,9 +3397,11 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { * enforcing the client output length limits. 
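The send path above has to cope with the backlog being a circular buffer: when the write cursor has wrapped past the replica's cursor, the data goes out in two pieces, and the replica's index then advances modulo the buffer size. A simplified sketch of that arithmetic, assuming a plain file descriptor and using illustrative names rather than the actual KeyDB helpers:

#include <sys/types.h>
#include <unistd.h>

/* Write the circular-buffer region [start_idx, end_idx) to fd in at most two
 * write(2) calls.  Returns the bytes written, or -1 if the first write fails;
 * a failure on the second piece simply surfaces on the next attempt. */
ssize_t writeCircularRange(int fd, const char *buf, long long bufsize,
                           long long start_idx, long long end_idx) {
    if (start_idx <= end_idx)                       /* contiguous case */
        return write(fd, buf + start_idx, (size_t)(end_idx - start_idx));

    /* wrapped case: tail of the buffer first, then the head */
    ssize_t n1 = write(fd, buf + start_idx, (size_t)(bufsize - start_idx));
    if (n1 < 0) return -1;
    if (n1 < bufsize - start_idx) return n1;        /* short write: stop, retry later */
    ssize_t n2 = write(fd, buf, (size_t)end_idx);
    return (n2 < 0) ? n1 : n1 + n2;
}

/* Callers then advance their cursor by the returned count, modulo bufsize,
 * which is what the repl_curr_idx update above does. */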
*/ unsigned long getClientOutputBufferMemoryUsage(client *c) { unsigned long list_item_size = sizeof(listNode) + sizeof(clientReplyBlock); - return c->reply_bytes + (list_item_size*listLength(c->reply)) + (c->replyAsync ? c->replyAsync->size : 0); + return c->reply_bytes + (list_item_size*listLength(c->reply)) + (c->replyAsync ? c->replyAsync->size : 0) + getClientReplicationBacklogSharedUsage(c); } + + /* Get the class of a client, used in order to enforce limits to different * classes of clients. * diff --git a/src/replication.cpp b/src/replication.cpp index 2533bae52..ccb538a69 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -47,6 +47,9 @@ #include #include +#define BYPASS_BUFFER +// #define BYPASS_PSYNC + void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, connection *conn); void replicationSendAck(redisMaster *mi); @@ -59,6 +62,18 @@ static void propagateMasterStaleKeys(); * the instance is configured to have no persistence. */ int RDBGeneratedByReplication = 0; +void resizeReplicationBacklogForClients(long long newsize); + +void setReplIdx(client *c, long long idx, long long off){ + if (prepareClientToWrite(c) != C_OK) return; + // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); + // serverLog(LL_NOTICE, "What is this value? %lld", c->repl_curr_idx); + if (c->repl_curr_idx == -1){ + c->repl_curr_idx = idx; + c->repl_curr_off = off; + } +} + /* --------------------------- Utility functions ---------------------------- */ /* Return the pointer to a string representing the replica ip:listening_port @@ -213,6 +228,8 @@ void resizeReplicationBacklog(long long newsize) { newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; if (g_pserver->repl_backlog_size == newsize) return; + serverLog(LL_NOTICE, "WE HAD TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); + if (g_pserver->repl_backlog != NULL) { /* What we actually do is to flush the old buffer and realloc a new * empty one. It will refill with new data incrementally. @@ -253,6 +270,80 @@ void resizeReplicationBacklog(long long newsize) { g_pserver->repl_backlog_size = newsize; } + +/* The above but for when clients need extra replication backlog because ??? */ +void resizeReplicationBacklogForClients(long long newsize) { + if (newsize < CONFIG_REPL_BACKLOG_MIN_SIZE) + newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; + if (g_pserver->repl_backlog_size == newsize) return; + + serverLog(LL_NOTICE, "WE HAD TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); + /* get the critical client size, i.e. the size of the data unflushed to clients */ + long long earliest_off = LONG_LONG_MAX; + long long earliest_idx = -1; + listIter li; + listNode *ln; + listRewind(g_pserver->slaves, &li); + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + if (replica->repl_curr_off != -1 && replica->repl_curr_off < earliest_off){ + earliest_off = replica->repl_curr_off; + earliest_idx = replica->repl_curr_idx; + } + } + + if (g_pserver->repl_backlog != NULL) { + /* What we actually do is to flush the old buffer and realloc a new + * empty one. It will refill with new data incrementally. + * The reason is that copying a few gigabytes adds latency and even + * worse often we need to alloc additional space before freeing the + * old buffer. 
*/ + + if (earliest_idx >= 0) { + // We need to keep critical data so we can't shrink less than the hot data in the buffer + newsize = std::max(newsize, g_pserver->master_repl_offset - earliest_off); + char *backlog = (char*)zmalloc(newsize); + g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - earliest_off; + + if (g_pserver->repl_backlog_idx >= earliest_idx) { + auto cbActiveBacklog = g_pserver->repl_backlog_idx - earliest_idx; + memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbActiveBacklog); + serverLog(LL_NOTICE, "g_pserver->master_repl_offset: %lld, earliest_off: %lld, g_pserver->repl_backlog_idx: %lld, earliest_idx: %lld", + g_pserver->master_repl_offset, earliest_off, g_pserver->repl_backlog_idx, earliest_idx); + serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); + } else { + auto cbPhase1 = g_pserver->repl_backlog_size - earliest_idx; + memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbPhase1); + memcpy(backlog + cbPhase1, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); + auto cbActiveBacklog = cbPhase1 + g_pserver->repl_backlog_idx; + serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); + } + zfree(g_pserver->repl_backlog); + g_pserver->repl_backlog = backlog; + g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; + listRewind(g_pserver->slaves, &li); + /* Go through the clients and update their replication indicies */ + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + if (replica->repl_curr_idx != -1){ + replica->repl_curr_idx -= earliest_idx; + if (replica->repl_curr_idx < 0) + replica->repl_curr_idx += g_pserver->repl_backlog_size; + } + } + g_pserver->repl_batch_idxStart = 0; + } else { + zfree(g_pserver->repl_backlog); + g_pserver->repl_backlog = (char*)zmalloc(newsize); + g_pserver->repl_backlog_histlen = 0; + g_pserver->repl_backlog_idx = 0; + /* Next byte we have is... the next since the buffer is empty. */ + g_pserver->repl_backlog_off = g_pserver->master_repl_offset+1; + } + } + g_pserver->repl_backlog_size = newsize; +} + void freeReplicationBacklog(void) { serverAssert(GlobalLocksAcquired()); listIter li; @@ -683,6 +774,10 @@ long long addReplyReplicationBacklog(client *c, long long offset) { * split the reply in two parts if we are cross-boundary. */ len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); + serverLog(LL_NOTICE, "Coming through from the addReplicationBacklog"); +#ifdef BYPASS_PSYNC + setReplIdx(c, j, offset); +#else while(len) { long long thislen = ((g_pserver->repl_backlog_size - j) < len) ? @@ -693,6 +788,8 @@ long long addReplyReplicationBacklog(client *c, long long offset) { len -= thislen; j = 0; } +#endif + serverLog(LL_NOTICE, "rdb transmitted? %d, pending replies? %d", c->transmittedRDB, clientHasPendingReplies(c)); return g_pserver->repl_backlog_histlen - skip; } @@ -731,6 +828,8 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) { * a SELECT statement in the replication stream. */ g_pserver->replicaseldb = -1; + serverLog(LL_NOTICE, "We are setting up here lad"); + /* Don't send this reply to slaves that approached us with * the old SYNC command. 
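resizeReplicationBacklogForClients above preserves the region some replica still has to read: it finds the earliest unread offset, copies the live bytes to the front of the new allocation (in two memcpy calls when they wrap), and rebases every replica's index. A condensed sketch of just the copy step, assuming a single producer and using illustrative names:

#include <algorithm>
#include <cstdlib>
#include <cstring>

/* Copy the still-unsent ("hot") region of a circular backlog into a fresh
 * allocation.  hot_idx is where that region starts in the old buffer and
 * hot_len is its length; after the copy it begins at index 0 of the new
 * buffer, so every consumer index must be rebased by subtracting hot_idx
 * (adding the old size first if that would go negative). */
char *compactBacklog(const char *oldbuf, long long oldsize,
                     long long hot_idx, long long hot_len,
                     long long *newsize /* in/out: never below hot_len */) {
    *newsize = std::max(*newsize, hot_len);                   /* never drop hot data */
    char *newbuf = (char *)malloc((size_t)*newsize);
    if (newbuf == nullptr) return nullptr;

    long long first = std::min(hot_len, oldsize - hot_idx);   /* up to the wrap point */
    memcpy(newbuf, oldbuf + hot_idx, (size_t)first);
    if (first < hot_len)                                      /* wrapped: finish from the head */
        memcpy(newbuf + first, oldbuf, (size_t)(hot_len - first));
    return newbuf;                                            /* new write cursor == hot_len */
}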
*/ if (!(replica->flags & CLIENT_PRE_PSYNC)) { @@ -989,6 +1088,7 @@ void syncCommand(client *c) { if (!strcasecmp((const char*)ptrFromObj(c->argv[0]),"psync")) { if (masterTryPartialResynchronization(c) == C_OK) { g_pserver->stat_sync_partial_ok++; + // c->repl_curr_idx = g_pserver->repl_backlog_idx; return; /* No full resync needed, return. */ } else { char *master_replid = (char*)ptrFromObj(c->argv[1]); @@ -1016,6 +1116,7 @@ void syncCommand(client *c) { connDisableTcpNoDelay(c->conn); /* Non critical if it fails. */ c->repldbfd = -1; c->flags |= CLIENT_SLAVE; + // c->repl_curr_idx = g_pserver->repl_backlog_idx; listAddNodeTail(g_pserver->slaves,c); /* Create the replication backlog if needed. */ @@ -1035,6 +1136,7 @@ void syncCommand(client *c) { if (g_pserver->FRdbSaveInProgress() && g_pserver->rdb_child_type == RDB_CHILD_TYPE_DISK) { + serverLog(LL_NOTICE, "case 1"); /* Ok a background save is in progress. Let's check if it is a good * one for replication, i.e. if there is another replica that is * registering differences since the server forked to save. */ @@ -1066,6 +1168,7 @@ void syncCommand(client *c) { } else if (g_pserver->FRdbSaveInProgress() && g_pserver->rdb_child_type == RDB_CHILD_TYPE_SOCKET) { + serverLog(LL_NOTICE, "case 2"); /* There is an RDB child process but it is writing directly to * children sockets. We need to wait for the next BGSAVE * in order to synchronize. */ @@ -1073,6 +1176,7 @@ void syncCommand(client *c) { /* CASE 3: There is no BGSAVE is progress. */ } else { + serverLog(LL_NOTICE, "case 3"); if (g_pserver->repl_diskless_sync && (c->slave_capa & SLAVE_CAPA_EOF)) { /* Diskless replication RDB child is created inside * replicationCron() since we want to delay its start a @@ -1278,6 +1382,7 @@ void replconfCommand(client *c) { * 3) Update the count of "good replicas". */ void putSlaveOnline(client *replica) { replica->replstate = SLAVE_STATE_ONLINE; + replica->repl_put_online_on_ack = 0; replica->repl_ack_time = g_pserver->unixtime; /* Prevent false timeout. */ if (connSetWriteHandler(replica->conn, sendReplyToClient, true) == C_ERR) { @@ -1415,11 +1520,13 @@ void sendBulkToSlave(connection *conn) { replica->repldboff += nwritten; g_pserver->stat_net_output_bytes += nwritten; + // replica->repl_curr_idx = g_pserver->repl_backlog_idx; if (replica->repldboff == replica->repldbsize) { close(replica->repldbfd); replica->repldbfd = -1; connSetWriteHandler(replica->conn,NULL); putSlaveOnline(replica); + serverLog(LL_NOTICE, "ABOUT TO DIE HERE"); } } @@ -4450,6 +4557,21 @@ void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long } void _clientAsyncReplyBufferReserve(client *c, size_t len); + +/* Has the end of the replication backlog overflowed past the beginning? */ +bool replOverflowHasOccured(){ + if (g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart){ + long long repl_idx_difference = g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart ? 
+ g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart : + (g_pserver->repl_backlog_size + g_pserver->repl_backlog_idx) - g_pserver->repl_batch_idxStart; + + return g_pserver->master_repl_offset - g_pserver->repl_batch_offStart > repl_idx_difference; + } + return false; +} + +thread_local int transmittedCount = 0; + void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); @@ -4463,11 +4585,31 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); + serverAssert(!replOverflowHasOccured()); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); + +#if 0 + // check for potential overflow first while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); + // serverLog(LL_NOTICE, "replica state: %d", replica->replstate); + + if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; + if (replica->flags & CLIENT_CLOSE_ASAP) continue; + if (replica->repl_curr_idx == -1) continue; + + std::unique_lock ul(replica->lock, std::defer_lock); + if (FCorrectThread(replica)) + ul.lock(); + else + fAsyncWrite = true; +#endif + + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + // serverLog(LL_NOTICE, "replica state: %d", replica->replstate); if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; @@ -4478,6 +4620,19 @@ void flushReplBacklogToClients() else fAsyncWrite = true; + +#ifdef BYPASS_BUFFER + /* If we are online and the RDB has been sent, there is no need to feed the client buffer + * We will send our replies directly from the replication backlog instead */ + if (replica->replstate == SLAVE_STATE_ONLINE && replica->transmittedRDB){ + setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); + continue; + } +#else + if (replica->replstate == SLAVE_STATE_ONLINE){ + // serverLog(LL_NOTICE, "would be calling this garbage function w/ offset: %lld", g_pserver->repl_batch_idxStart); + } +#endif if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { long long cbCopy = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; serverAssert((g_pserver->master_repl_offset - g_pserver->repl_batch_offStart) == cbCopy); @@ -4491,6 +4646,7 @@ void flushReplBacklogToClients() _clientAsyncReplyBufferReserve(replica, cbPhase1 + g_pserver->repl_backlog_idx); addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1); addReplyProto(replica, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); + serverAssert((cbPhase1 + g_pserver->repl_backlog_idx) == (g_pserver->master_repl_offset - g_pserver->repl_batch_offStart)); } } diff --git a/src/server.h b/src/server.h index 878154bc5..cfd6c34a0 100644 --- a/src/server.h +++ b/src/server.h @@ -1516,6 +1516,8 @@ struct client { long long psync_initial_offset; /* FULLRESYNC reply offset other slaves copying this replica output buffer should use. */ + long long repl_curr_idx = -1; /* Replication index sent, if this is a replica */ + long long repl_curr_off = -1; char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). 
*/ int slave_listening_port; /* As configured with: REPLCONF listening-port */ char slave_ip[NET_IP_STR_LEN]; /* Optionally given by REPLCONF ip-address */ @@ -1575,6 +1577,9 @@ struct client { robj **argv; size_t argv_len_sumActive = 0; + bool transmittedRDB = false; /* Have we finished transmitting the RDB to this replica? */ + /* If so, we can read from the replication backlog instead of the client buffer */ + // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); size_t argv_len_sum() const; @@ -3470,6 +3475,8 @@ void mixDigest(unsigned char *digest, const void *ptr, size_t len); void xorDigest(unsigned char *digest, const void *ptr, size_t len); int populateCommandTableParseFlags(struct redisCommand *c, const char *strflags); + + int moduleGILAcquiredByModule(void); extern int g_fInCrash; static inline int GlobalLocksAcquired(void) // Used in asserts to verify all global locks are correctly acquired for a server-thread to operate @@ -3526,6 +3533,8 @@ void tlsInit(void); void tlsInitThread(); int tlsConfigure(redisTLSContextConfig *ctx_config); +int prepareClientToWrite(client *c); + class ShutdownException {}; @@ -3538,3 +3547,5 @@ class ShutdownException int iAmMaster(void); #endif + + From d8367a92b2bea452d39561c201085a377c2021a6 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 29 Apr 2021 17:01:06 +0000 Subject: [PATCH 29/99] Updated resize logic Former-commit-id: e6d892ef21b7fc6f51433f32b01198038e555419 --- src/networking.cpp | 104 +++++++++++++++---------------------- src/replication.cpp | 123 ++++++++++++++++++++++++++++++++++++++------ src/server.cpp | 3 +- 3 files changed, 151 insertions(+), 79 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 18ab382bd..cac58ff07 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1680,22 +1680,8 @@ int writeToClient(client *c, int handler_installed) { /* if this is a write to a replica, it's coming straight from the replication backlog */ long long repl_backlog_idx = g_pserver->repl_backlog_idx; - bool wroteFromClientBuffer = false; /* True if you wrote from the client buffer in this function call */ - while(clientHasPendingReplies(c)) { - wroteFromClientBuffer = true; - if (c->flags & CLIENT_SLAVE && listLength(c->reply) % 10 == 0){ - - serverLog(LL_NOTICE, "-----------------------------------------"); - serverLog(LL_NOTICE, "replica w/ pending replies, with a reply list size of: %lu", listLength(c->reply)); - serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - serverLog(LL_NOTICE, "-----------------------------------------"); - - } if (c->bufpos > 0) { - // serverLog(LL_NOTICE, "Sending reply %d", x); - // serverLog(LL_NOTICE, "SUSSUS AMOGUS, %ld", c->bufpos); nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); if (nwritten <= 0) break; c->sentlen += nwritten; @@ -1753,9 +1739,7 @@ int writeToClient(client *c, int handler_installed) { /* If there are no more pending replies, then we have transmitted the RDB. * This means further replication commands will be taken straight from the * replication backlog from now on. 
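The check that follows formalises the hand-off this comment describes: once an online replica has nothing left in its reply buffers, the RDB snapshot is fully delivered and later traffic can come straight from the backlog. A tiny sketch of that predicate, with illustrative field names standing in for the client flags and buffers used here:

/* Illustrative stand-in for the client fields consulted by the check below. */
struct ReplicaWriteState {
    bool online = false;              /* SLAVE_STATE_ONLINE equivalent            */
    bool buffered_replies = false;    /* anything left in buf/reply               */
    bool async_reply_pending = false; /* anything still parked in an async buffer */
    bool rdb_transmitted = false;     /* once true, stream from the backlog       */
};

/* Called after a write pass: if the replica is online and every buffered byte
 * has been flushed, the RDB is fully delivered and the replica may switch to
 * being served straight from the shared replication backlog. */
inline void maybeMarkRdbTransmitted(ReplicaWriteState &r) {
    if (r.online && !r.buffered_replies && !r.async_reply_pending)
        r.rdb_transmitted = true;
}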
*/ - if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c)){ - if (!c->transmittedRDB) - serverLog(LL_NOTICE, "---------->>>>>>>> TRANSMISSION OF THE RDB HAS COMPLETED <<<<<<<<----------"); + if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ c->transmittedRDB = true; } @@ -1775,49 +1759,27 @@ int writeToClient(client *c, int handler_installed) { /* wrap around case, v. rare */ /* also v. buggy so there's that */ } else { - serverLog(LL_NOTICE, "WRAP CASE"); - serverLog(LL_NOTICE, "-----------------------------------------"); - serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - serverLog(LL_NOTICE, "actually written: %ld", nwritten); - serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); - serverLog(LL_NOTICE, "nwritten: %ld", nwritten); - serverLog(LL_NOTICE, "-----------------------------------------"); - nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); /* only attempt wrapping if we write the correct number of bytes */ if (nwritten == repl_backlog_size - c->repl_curr_idx){ - serverLog(LL_NOTICE, "SECOND STAGE"); - serverLog(LL_NOTICE, "-----------------------------------------"); - serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - serverLog(LL_NOTICE, "actually written: %ld", nwritten); - serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); - serverLog(LL_NOTICE, "-----------------------------------------"); - long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); if (nwrittenPart2 != -1) nwritten += nwrittenPart2; - serverLog(LL_NOTICE, "nwrittenPart2: %lld", nwrittenPart2); - serverLog(LL_NOTICE, "-----------------------------------------"); - } else { - serverLog(LL_NOTICE, "SUPER SHORT"); - } - + } } /* only update the replica's current index if bytes were sent */ // if (nrequested != nwritten){ - serverLog(LL_NOTICE, "-----------------------------------------"); - serverLog(LL_NOTICE, "AFTER THE FACT"); - serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - serverLog(LL_NOTICE, "actually written: %ld", nwritten); - serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - serverLog(LL_NOTICE, "-----------------------------------------"); + // serverLog(LL_NOTICE, "-----------------------------------------"); + // serverLog(LL_NOTICE, "AFTER THE FACT"); + // serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + // serverLog(LL_NOTICE, "actually written: %ld", nwritten); + // serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + // serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + // 
serverLog(LL_NOTICE, "-----------------------------------------"); // } @@ -1902,25 +1864,36 @@ void ProcessPendingAsyncWrites() serverAssert(c->fPendingAsyncWrite); if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_CLOSE_AFTER_REPLY)) { - zfree(c->replyAsync); - c->replyAsync = nullptr; + if (c->replyAsync != nullptr){ + zfree(c->replyAsync); + c->replyAsync = nullptr; + } c->fPendingAsyncWrite = FALSE; continue; } - int size = c->replyAsync->used; + /* since writes from master to replica can come directly from the replication backlog, + * writes may have been signalled without having been copied to the replyAsync buffer, + * thus causing the buffer to be NULL */ + if (c->replyAsync != nullptr){ + int size = c->replyAsync->used; - if (listLength(c->reply) == 0 && size <= (PROTO_REPLY_CHUNK_BYTES - c->bufpos)) { - memcpy(c->buf + c->bufpos, c->replyAsync->buf(), size); - c->bufpos += size; - } else { - c->reply_bytes += c->replyAsync->size; - listAddNodeTail(c->reply, c->replyAsync); + if (listLength(c->reply) == 0 && size <= (PROTO_REPLY_CHUNK_BYTES - c->bufpos)) { + memcpy(c->buf + c->bufpos, c->replyAsync->buf(), size); + c->bufpos += size; + } else { + c->reply_bytes += c->replyAsync->size; + listAddNodeTail(c->reply, c->replyAsync); + c->replyAsync = nullptr; + } + + zfree(c->replyAsync); c->replyAsync = nullptr; + } else { + /* Only replicas should have empty async reply buffers */ + serverAssert(c->flags & CLIENT_SLAVE); } - zfree(c->replyAsync); - c->replyAsync = nullptr; c->fPendingAsyncWrite = FALSE; // Now install the write event handler @@ -1935,17 +1908,17 @@ void ProcessPendingAsyncWrites() { ae_flags |= AE_BARRIER; } - + if (!((c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack)))) continue; - + asyncCloseClientOnOutputBufferLimitReached(c); if (c->flags & CLIENT_CLOSE_ASAP) continue; // we will never write this so don't post an op - + std::atomic_thread_fence(std::memory_order_seq_cst); - + if (FCorrectThread(c)) { prepareClientToWrite(c); // queue an event @@ -3386,7 +3359,12 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { * that writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long getClientReplicationBacklogSharedUsage(client *c) { - return (c->repl_curr_idx == -1 && c->flags & CLIENT_SLAVE) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; + if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1){ + // serverLog(LL_NOTICE, "repl_backlog_size %lld, repl_backlog_idx %lld, master_repl_offset %lld, repl_curr_idx %lld, repl_curr_off %lld", + // g_pserver->repl_backlog_size, g_pserver->repl_backlog_idx, g_pserver->master_repl_offset, c->repl_curr_idx, c->repl_curr_off); + } + + return (!(c->flags & CLIENT_SLAVE) || c->repl_curr_idx == -1) ? 
0 : g_pserver->master_repl_offset - c->repl_curr_off; } /* This function returns the number of bytes that Redis is diff --git a/src/replication.cpp b/src/replication.cpp index ccb538a69..ef33fbfd9 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -47,8 +47,8 @@ #include #include -#define BYPASS_BUFFER -// #define BYPASS_PSYNC +// #define BYPASS_BUFFER +// #define RESIZE_BACKLOG void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, connection *conn); @@ -57,6 +57,30 @@ void putSlaveOnline(client *replica); int cancelReplicationHandshake(redisMaster *mi); static void propagateMasterStaleKeys(); +/* gets the lowest offset amongst all of the replicas */ +long long getLowestOffsetAmongReplicas(){ + serverAssert(GlobalLocksAcquired()); + long long min_offset = LONG_LONG_MAX; + listIter li; + listNode *ln; + listRewind(g_pserver->slaves, &li); + // check for potential overflow first + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + + if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; + if (replica->flags & CLIENT_CLOSE_ASAP) continue; + if (replica->repl_curr_idx == -1) continue; + + std::unique_lock ul(replica->lock, std::defer_lock); + if (FCorrectThread(replica)) + ul.lock(); + + min_offset = std::min(min_offset, replica->repl_curr_off); + } + /* return -1 if no other minimum was found */ + return min_offset == LONG_LONG_MAX ? -1 : min_offset; +} /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case * the instance is configured to have no persistence. */ @@ -67,11 +91,13 @@ void resizeReplicationBacklogForClients(long long newsize); void setReplIdx(client *c, long long idx, long long off){ if (prepareClientToWrite(c) != C_OK) return; // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); - // serverLog(LL_NOTICE, "What is this value? %lld", c->repl_curr_idx); + // serverLog(LL_NOTICE, "Repl Index started at: %lld", c->repl_curr_idx); if (c->repl_curr_idx == -1){ c->repl_curr_idx = idx; c->repl_curr_off = off; } + // serverLog(LL_NOTICE, "Repl Index has become: %lld", c->repl_curr_idx); + } /* --------------------------- Utility functions ---------------------------- */ @@ -277,7 +303,7 @@ void resizeReplicationBacklogForClients(long long newsize) { newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; if (g_pserver->repl_backlog_size == newsize) return; - serverLog(LL_NOTICE, "WE HAD TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); + serverLog(LL_NOTICE, "WE HAVE TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); /* get the critical client size, i.e. 
the size of the data unflushed to clients */ long long earliest_off = LONG_LONG_MAX; long long earliest_idx = -1; @@ -290,6 +316,20 @@ void resizeReplicationBacklogForClients(long long newsize) { earliest_off = replica->repl_curr_off; earliest_idx = replica->repl_curr_idx; } + serverLog(LL_NOTICE, "repl_curr_idx: %lld, earlistidx: %lld", replica->repl_curr_idx, earliest_idx); + } + serverLog(LL_NOTICE, "We are starting with: master_repl_offset: %lld, repl_batch_offStart: %lld, earliest_off: %lld, " + "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, earliest_idx: %lld, repl_backlog_size: %lld", + g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, earliest_off, + g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, earliest_idx, g_pserver->repl_backlog_size + ); + + long long new_off = 0, new_idx = 0; + + /* if no earliest offset is found amongst the clients, they are all up to date with the flushed index */ + if (earliest_off == LONG_LONG_MAX && earliest_idx == -1){ + earliest_idx = g_pserver->repl_batch_idxStart; + earliest_off = g_pserver->repl_batch_offStart; } if (g_pserver->repl_backlog != NULL) { @@ -330,8 +370,11 @@ void resizeReplicationBacklogForClients(long long newsize) { if (replica->repl_curr_idx < 0) replica->repl_curr_idx += g_pserver->repl_backlog_size; } + new_idx = replica->repl_curr_idx; } - g_pserver->repl_batch_idxStart = 0; + g_pserver->repl_batch_idxStart -= earliest_idx; + if (g_pserver->repl_batch_idxStart < 0) + g_pserver->repl_batch_idxStart += g_pserver->repl_backlog_size; } else { zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = (char*)zmalloc(newsize); @@ -342,6 +385,12 @@ void resizeReplicationBacklogForClients(long long newsize) { } } g_pserver->repl_backlog_size = newsize; + + serverLog(LL_NOTICE, "We are ending with: master_repl_offset: %lld, repl_batch_offStart: %lld, new_off: %lld, " + "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, new_idx: %lld, repl_backlog_size: %lld", + g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, new_off, + g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, new_idx, g_pserver->repl_backlog_size + ); } void freeReplicationBacklog(void) { @@ -367,20 +416,41 @@ void feedReplicationBacklog(const void *ptr, size_t len) { const unsigned char *p = (const unsigned char*)ptr; if (g_pserver->repl_batch_idxStart >= 0) { - long long minimumsize = g_pserver->master_repl_offset + len - g_pserver->repl_batch_offStart+1; + /* we are lower bounded by the lower client offset or the offStart if all the clients are up to date */ + long long lower_bound = getLowestOffsetAmongReplicas(); + if (lower_bound == -1) + lower_bound = g_pserver->repl_batch_offStart; + long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { flushReplBacklogToClients(); - minimumsize = g_pserver->master_repl_offset + len - g_pserver->repl_batch_offStart+1; + minimumsize = g_pserver->master_repl_offset + len - lower_bound +1; if (minimumsize > g_pserver->repl_backlog_size) { // This is an emergency overflow, we better resize to fit long long newsize = std::max(g_pserver->repl_backlog_size*2, minimumsize); serverLog(LL_WARNING, "Replication backlog is too small, resizing to: %lld", newsize); - resizeReplicationBacklog(newsize); + resizeReplicationBacklogForClients(newsize); } } +#ifdef RESIZE_BACKLOG + long long lowest_replica_offset = getLowestOffsetAmongReplicas(); + minimumsize = g_pserver->master_repl_offset + len - lowest_replica_offset; + if 
(lowest_replica_offset != -1 && minimumsize > g_pserver->repl_backlog_size){ + serverLog(LL_WARNING, "THE REPLICATION BACKLOG SIZE IS TOO SMALL, THIS IS A PROBLEM"); + long long oldsize = g_pserver->repl_backlog_size; + resizeReplicationBacklogForClients(std::max(g_pserver->repl_backlog_size * 2, minimumsize)); + serverLog(LL_WARNING, "changed size from %lld to %lld", oldsize, g_pserver->repl_backlog_size); + flushReplBacklogToClients(); + } +#endif } + // serverLog(LL_NOTICE, "Pt2 start with: master_repl_offset: %lld, repl_batch_offStart: %lld, " + // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", + // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, + // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size + // ); + g_pserver->master_repl_offset += len; /* This is a circular buffer, so write as much data we can at every @@ -395,12 +465,23 @@ void feedReplicationBacklog(const void *ptr, size_t len) { len -= thislen; p += thislen; g_pserver->repl_backlog_histlen += thislen; + // serverLog(LL_NOTICE, "Pt2 intermediate with: master_repl_offset: %lld, repl_batch_offStart: %lld, " + // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", + // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, + // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size + // ); } if (g_pserver->repl_backlog_histlen > g_pserver->repl_backlog_size) g_pserver->repl_backlog_histlen = g_pserver->repl_backlog_size; /* Set the offset of the first byte we have in the backlog. */ g_pserver->repl_backlog_off = g_pserver->master_repl_offset - g_pserver->repl_backlog_histlen + 1; + + // serverLog(LL_NOTICE, "Pt2 end with: master_repl_offset: %lld, repl_batch_offStart: %lld, " + // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", + // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, + // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size + // ); } /* Wrapper for feedReplicationBacklog() that takes Redis string objects @@ -774,7 +855,6 @@ long long addReplyReplicationBacklog(client *c, long long offset) { * split the reply in two parts if we are cross-boundary. */ len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); - serverLog(LL_NOTICE, "Coming through from the addReplicationBacklog"); #ifdef BYPASS_PSYNC setReplIdx(c, j, offset); #else @@ -789,7 +869,6 @@ long long addReplyReplicationBacklog(client *c, long long offset) { j = 0; } #endif - serverLog(LL_NOTICE, "rdb transmitted? %d, pending replies? %d", c->transmittedRDB, clientHasPendingReplies(c)); return g_pserver->repl_backlog_histlen - skip; } @@ -1520,13 +1599,11 @@ void sendBulkToSlave(connection *conn) { replica->repldboff += nwritten; g_pserver->stat_net_output_bytes += nwritten; - // replica->repl_curr_idx = g_pserver->repl_backlog_idx; if (replica->repldboff == replica->repldbsize) { close(replica->repldbfd); replica->repldbfd = -1; connSetWriteHandler(replica->conn,NULL); putSlaveOnline(replica); - serverLog(LL_NOTICE, "ABOUT TO DIE HERE"); } } @@ -4560,6 +4637,7 @@ void _clientAsyncReplyBufferReserve(client *c, size_t len); /* Has the end of the replication backlog overflowed past the beginning? 
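The minimum-size rule used in feedReplicationBacklog above is that the backlog must always be able to hold everything between the slowest backlog-reading replica and the new end of the stream. A standalone version of that computation, with illustrative names (an offset of -1 meaning the replica is not reading from the backlog):

#include <algorithm>
#include <climits>
#include <vector>

/* How large the backlog must be so that appending 'len' more bytes cannot
 * overwrite data some replica still has to read.  Offsets are absolute
 * replication offsets; when nobody lags, the current batch start bounds it. */
long long requiredBacklogSize(long long master_repl_offset, long long len,
                              long long batch_off_start,
                              const std::vector<long long> &replica_offsets) {
    long long lowest = LLONG_MAX;
    for (long long off : replica_offsets)
        if (off != -1)
            lowest = std::min(lowest, off);
    if (lowest == LLONG_MAX)
        lowest = batch_off_start;        /* every replica is caught up */
    return master_repl_offset + len - lowest + 1;
}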
*/ bool replOverflowHasOccured(){ + if (g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart){ long long repl_idx_difference = g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart ? g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart : @@ -4575,8 +4653,13 @@ thread_local int transmittedCount = 0; void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); - if (g_pserver->repl_batch_offStart < 0) + if (g_pserver->repl_batch_offStart < 0){ + if (getLowestOffsetAmongReplicas() == -1){ + serverLog(LL_NOTICE, "this is a case i probably have to handle"); + } return; + } + if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { bool fAsyncWrite = false; @@ -4585,7 +4668,7 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); - serverAssert(!replOverflowHasOccured()); + // serverAssert(!replOverflowHasOccured()); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); @@ -4605,11 +4688,21 @@ void flushReplBacklogToClients() ul.lock(); else fAsyncWrite = true; + + if (g_pserver->master_repl_offset - replica->repl_curr_off > g_pserver->repl_backlog_size){ + serverLog(LL_WARNING, "THE REPLICATION BACKLOG SIZE IS TOO SMALL, THIS IS A PROBLEM"); + long long oldsize = g_pserver->repl_backlog_size; + resizeReplicationBacklogForClients(std::max(g_pserver->repl_backlog_size * 2, g_pserver->master_repl_offset - replica->repl_curr_off)); + serverLog(LL_WARNING, "changing size from %lld to %lld", oldsize, g_pserver->repl_backlog_size); + } + + } + + listRewind(g_pserver->slaves, &li); #endif while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); - // serverLog(LL_NOTICE, "replica state: %d", replica->replstate); if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; diff --git a/src/server.cpp b/src/server.cpp index 3d547f748..9664a4a6b 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1796,6 +1796,7 @@ int clientsCronTrackClientsMemUsage(client *c) { mem += zmalloc_size(c); mem += c->argv_len_sum(); if (c->argv) mem += zmalloc_size(c->argv); + // serverLog(LL_NOTICE, "Mem here is : %lu", mem); /* Now that we have the memory used by the client, remove the old * value from the old category, and add it back. */ g_pserver->stat_clients_type_memory[c->client_cron_last_memory_type] -= @@ -1854,7 +1855,7 @@ void clientsCron(int iel) { while(listLength(g_pserver->clients) && iterations--) { client *c; listNode *head; - + // serverLog(LL_NOTICE, "we are at iteration: %d", iterations); /* Rotate the list, take the current head, process. * This way if the client must be removed from the list it's the * first element and we don't incur into O(N) computation. 
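The rotation trick described in this comment keeps the cron incremental: rotate the tail to the head, work on the head, and any removal stays O(1). A compact sketch of the same pattern over a std::list, with an illustrative helper name; the iteration budget mirrors the numclients/hz heuristic used here:

#include <algorithm>
#include <iterator>
#include <list>

/* Process roughly clients.size()/hz entries per tick so every client is
 * visited about once per second; the element being processed is always the
 * head, so deleting it (e.g. on timeout) never needs a list scan. */
template <class T, class Fn>
void cronProcessSome(std::list<T> &clients, int hz, Fn &&process) {
    if (hz <= 0) hz = 10;                                   /* fall back to a sane tick rate */
    size_t iterations = clients.size() / (size_t)hz;
    if (iterations < 5)
        iterations = std::min<size_t>(clients.size(), 5);   /* minimum batch, as in clientsCron */
    while (!clients.empty() && iterations--) {
        /* move the tail to the front, then work on the new head */
        clients.splice(clients.begin(), clients, std::prev(clients.end()));
        process(clients.front());
    }
}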
*/ From 7ef58a333f9331e8fd144163626ed7a6ccaa1a59 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 29 Apr 2021 18:51:30 +0000 Subject: [PATCH 30/99] Performance optimizations Former-commit-id: 7fd83d467784d293f7da78b74f9b9763ce387238 --- src/replication.cpp | 71 ++------------------------------------------- 1 file changed, 3 insertions(+), 68 deletions(-) diff --git a/src/replication.cpp b/src/replication.cpp index ef33fbfd9..1bae2773a 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -47,8 +47,7 @@ #include #include -// #define BYPASS_BUFFER -// #define RESIZE_BACKLOG +#define BYPASS_BUFFER void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, connection *conn); @@ -89,10 +88,10 @@ int RDBGeneratedByReplication = 0; void resizeReplicationBacklogForClients(long long newsize); void setReplIdx(client *c, long long idx, long long off){ - if (prepareClientToWrite(c) != C_OK) return; // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); // serverLog(LL_NOTICE, "Repl Index started at: %lld", c->repl_curr_idx); if (c->repl_curr_idx == -1){ + if (prepareClientToWrite(c) != C_OK) return; c->repl_curr_idx = idx; c->repl_curr_off = off; } @@ -432,17 +431,6 @@ void feedReplicationBacklog(const void *ptr, size_t len) { resizeReplicationBacklogForClients(newsize); } } -#ifdef RESIZE_BACKLOG - long long lowest_replica_offset = getLowestOffsetAmongReplicas(); - minimumsize = g_pserver->master_repl_offset + len - lowest_replica_offset; - if (lowest_replica_offset != -1 && minimumsize > g_pserver->repl_backlog_size){ - serverLog(LL_WARNING, "THE REPLICATION BACKLOG SIZE IS TOO SMALL, THIS IS A PROBLEM"); - long long oldsize = g_pserver->repl_backlog_size; - resizeReplicationBacklogForClients(std::max(g_pserver->repl_backlog_size * 2, minimumsize)); - serverLog(LL_WARNING, "changed size from %lld to %lld", oldsize, g_pserver->repl_backlog_size); - flushReplBacklogToClients(); - } -#endif } // serverLog(LL_NOTICE, "Pt2 start with: master_repl_offset: %lld, repl_batch_offStart: %lld, " @@ -4635,30 +4623,11 @@ void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long void _clientAsyncReplyBufferReserve(client *c, size_t len); -/* Has the end of the replication backlog overflowed past the beginning? */ -bool replOverflowHasOccured(){ - - if (g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart){ - long long repl_idx_difference = g_pserver->repl_backlog_idx > g_pserver->repl_batch_idxStart ? 
- g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart : - (g_pserver->repl_backlog_size + g_pserver->repl_backlog_idx) - g_pserver->repl_batch_idxStart; - - return g_pserver->master_repl_offset - g_pserver->repl_batch_offStart > repl_idx_difference; - } - return false; -} - -thread_local int transmittedCount = 0; - void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); - if (g_pserver->repl_batch_offStart < 0){ - if (getLowestOffsetAmongReplicas() == -1){ - serverLog(LL_NOTICE, "this is a case i probably have to handle"); - } + if (g_pserver->repl_batch_offStart < 0) return; - } if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { @@ -4668,39 +4637,9 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); - // serverAssert(!replOverflowHasOccured()); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); - -#if 0 - // check for potential overflow first - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - // serverLog(LL_NOTICE, "replica state: %d", replica->replstate); - - if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; - if (replica->flags & CLIENT_CLOSE_ASAP) continue; - if (replica->repl_curr_idx == -1) continue; - - std::unique_lock ul(replica->lock, std::defer_lock); - if (FCorrectThread(replica)) - ul.lock(); - else - fAsyncWrite = true; - - if (g_pserver->master_repl_offset - replica->repl_curr_off > g_pserver->repl_backlog_size){ - serverLog(LL_WARNING, "THE REPLICATION BACKLOG SIZE IS TOO SMALL, THIS IS A PROBLEM"); - long long oldsize = g_pserver->repl_backlog_size; - resizeReplicationBacklogForClients(std::max(g_pserver->repl_backlog_size * 2, g_pserver->master_repl_offset - replica->repl_curr_off)); - serverLog(LL_WARNING, "changing size from %lld to %lld", oldsize, g_pserver->repl_backlog_size); - } - - } - - listRewind(g_pserver->slaves, &li); -#endif - while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); @@ -4721,10 +4660,6 @@ void flushReplBacklogToClients() setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); continue; } -#else - if (replica->replstate == SLAVE_STATE_ONLINE){ - // serverLog(LL_NOTICE, "would be calling this garbage function w/ offset: %lld", g_pserver->repl_batch_idxStart); - } #endif if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { long long cbCopy = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; From 556f394ed90d00f528eb12784ec90b4c869e0cff Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 30 Apr 2021 17:32:54 +0000 Subject: [PATCH 31/99] Initial implementation of multithread load Former-commit-id: 87b0657c3acd7a3c89964afe1702851b44467c9a --- src/rdb.cpp | 237 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 181 insertions(+), 56 deletions(-) diff --git a/src/rdb.cpp b/src/rdb.cpp index ec510546d..a9d701933 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2370,18 +2370,169 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { } } + +struct rdbInsertJob +{ + redisDb *db; + sds key; + robj *val; + long long lru_clock; + long long expiretime; + long long lru_idle; + long long lfu_freq; +}; + +class rdbAsyncWorkThread +{ + rdbSaveInfo *rsi; + int rdbflags; + std::vector queuejobs; + std::vector> queuefn; // for custom jobs + std::mutex mutex; + std::condition_variable 
cv; + bool fExit = false; + std::atomic ckeysLoaded; + std::thread m_thread; + +public: + + rdbAsyncWorkThread(rdbSaveInfo *rsi, int rdbflags) + : rsi(rsi), rdbflags(rdbflags) + { + ckeysLoaded = 0; + } + + ~rdbAsyncWorkThread() { + if (!fExit && m_thread.joinable()) + endWork(); + } + + void start() { + m_thread = std::thread(&rdbAsyncWorkThread::loadWorkerThreadMain, this); + } + + void enqueue(rdbInsertJob &job) { + std::unique_lock l(mutex); + bool fNotify = queuejobs.empty(); + queuejobs.push_back(job); + if (fNotify) + cv.notify_one(); + } + + void enqueue(std::function &&fn) { + std::unique_lock l(mutex); + bool fNotify = queuefn.empty(); + queuefn.push_back(std::move(fn)); + if (fNotify) + cv.notify_one(); + } + + size_t ckeys() { return ckeysLoaded; } + + size_t endWork() { + std::unique_lock l(mutex); + fExit = true; + cv.notify_one(); + l.unlock(); + m_thread.join(); + return ckeysLoaded; + } + + static void loadWorkerThreadMain(rdbAsyncWorkThread *pqueue) { + rdbAsyncWorkThread &queue = *pqueue; + for (;;) { + std::unique_lock lock(queue.mutex); + if (queue.queuejobs.empty() && queue.queuefn.empty()) { + if (queue.fExit) + break; + queue.cv.wait(lock); + if (queue.fExit) + break; + } + + auto queuejobs = std::move(queue.queuejobs); + queue.queuejobs.reserve(1024); + auto queuefn = std::move(queue.queuefn); + lock.unlock(); + + for (auto &fn : queuefn) { + fn(); + } + + bool f1024thKey = false; + for (auto &job : queuejobs) { + redisObjectStack keyobj; + initStaticStringObject(keyobj,job.key); + + /* Add the new object in the hash table */ + int fInserted = dbMerge(job.db, &keyobj, job.val, (queue.rsi && queue.rsi->fForceSetKey) || (queue.rdbflags & RDBFLAGS_ALLOW_DUP)); // Note: dbMerge will incrRef + + if (fInserted) + { + auto ckeys = queue.ckeysLoaded.fetch_add(1, std::memory_order_relaxed); + f1024thKey = f1024thKey || (ckeys % 1024) == 0; + + /* Set the expire time if needed */ + if (job.expiretime != -1) + { + setExpire(NULL,job.db,&keyobj,nullptr,job.expiretime); + } + + /* Set usage information (for eviction). */ + objectSetLRUOrLFU(job.val,job.lfu_freq,job.lru_idle,job.lru_clock,1000); + + /* call key space notification on key loaded for modules only */ + moduleNotifyKeyspaceEvent(NOTIFY_LOADED, "loaded", &keyobj, job.db->id); + + replicationNotifyLoadedKey(job.db, &keyobj, job.val, job.expiretime); + } + else + { + decrRefCount(job.val); + } + if (job.key != nullptr) + { + sdsfree(job.key); + } + } + + /* If we have a storage provider check if we need to evict some keys to stay under our memory limit, + do this every 16 keys to limit the perf impact */ + if (g_pserver->m_pstorageFactory && f1024thKey) + { + bool fHighMemory = (getMaxmemoryState(NULL,NULL,NULL,NULL) != C_OK); + if (fHighMemory || f1024thKey) + { + for (int idb = 0; idb < cserver.dbnum; ++idb) + { + if (g_pserver->db[idb]->processChanges(false)) + g_pserver->db[idb]->commitChanges(); + if (fHighMemory && !(queue.rsi && queue.rsi->fForceSetKey)) { + g_pserver->db[idb]->removeAllCachedValues(); // During load we don't go through the normal eviction unless we're merging (i.e. an active replica) + fHighMemory = false; // we took care of it + } + g_pserver->db[idb]->trackChanges(false, 1024); + } + if (fHighMemory) + freeMemoryIfNeeded(false /*fQuickCycle*/, false /* fPreSnapshot*/); + } + } + } + } +}; + /* Load an RDB file from the rio stream 'rdb'. On success C_OK is returned, * otherwise C_ERR is returned and 'errno' is set accordingly. 
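The class above is a single-consumer work queue: producers append under a mutex and signal a condition variable, the worker swaps the whole batch out and runs it without holding the lock, and the exit flag is honoured only once the queue is drained. A trimmed-down generic sketch of the same pattern, using an illustrative class name and std::function jobs instead of rdbInsertJob:

#include <condition_variable>
#include <functional>
#include <mutex>
#include <thread>
#include <utility>
#include <vector>

class WorkQueueThread {
    std::mutex mtx;
    std::condition_variable cv;
    std::vector<std::function<void()>> queue;
    bool exiting = false;
    std::thread worker;

    void run() {
        for (;;) {
            std::unique_lock<std::mutex> l(mtx);
            cv.wait(l, [&]{ return !queue.empty() || exiting; });
            if (queue.empty() && exiting) break;   /* only quit once drained */
            std::vector<std::function<void()>> batch;
            batch.swap(queue);                     /* take the whole batch at once */
            l.unlock();
            for (auto &job : batch) job();         /* run jobs without the lock */
        }
    }

public:
    ~WorkQueueThread() { endWork(); }

    void start() { worker = std::thread(&WorkQueueThread::run, this); }

    void enqueue(std::function<void()> fn) {
        std::lock_guard<std::mutex> l(mtx);
        queue.push_back(std::move(fn));
        cv.notify_one();
    }

    void endWork() {                               /* flush remaining jobs, then join */
        { std::lock_guard<std::mutex> l(mtx); exiting = true; }
        cv.notify_one();
        if (worker.joinable()) worker.join();
    }
};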
*/ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { uint64_t dbid = 0; int type, rdbver; - redisDb *db = g_pserver->db[dbid]; + redisDb *dbCur = g_pserver->db[dbid]; char buf[1024]; /* Key-specific attributes, set by opcodes before the key type. */ long long lru_idle = -1, lfu_freq = -1, expiretime = -1, now; long long lru_clock = 0; uint64_t mvcc_tstamp = OBJ_MVCC_INVALID; - size_t ckeysLoaded = 0; + rdbAsyncWorkThread wqueue(rsi, rdbflags); robj *subexpireKey = nullptr; sds key = nullptr; bool fLastKeyExpired = false; @@ -2409,6 +2560,7 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { now = mstime(); lru_clock = LRU_CLOCK(); + wqueue.start(); while(1) { robj *val; @@ -2456,7 +2608,7 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { "databases. Exiting\n", cserver.dbnum); exit(1); } - db = g_pserver->db[dbid]; + dbCur = g_pserver->db[dbid]; continue; /* Read next opcode. */ } else if (type == RDB_OPCODE_RESIZEDB) { /* RESIZEDB: Hint about the size of the keys in the currently @@ -2466,7 +2618,9 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { goto eoferr; if ((expires_size = rdbLoadLen(rdb,NULL)) == RDB_LENERR) goto eoferr; - db->expand(db_size); + wqueue.enqueue([dbCur, db_size]{ + dbCur->expand(db_size); + }); continue; /* Read next opcode. */ } else if (type == RDB_OPCODE_AUX) { /* AUX: generic string-string fields. Use to add state to RDB @@ -2540,12 +2694,14 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { } } else { - redisObjectStack keyobj; - initStaticStringObject(keyobj,key); long long expireT = strtoll(szFromObj(auxval), nullptr, 10); - setExpire(NULL, db, &keyobj, subexpireKey, expireT); - replicateSubkeyExpire(db, &keyobj, subexpireKey, expireT); - decrRefCount(subexpireKey); + wqueue.enqueue([dbCur, subexpireKey, key, expireT]{ + redisObjectStack keyobj; + initStaticStringObject(keyobj,key); + setExpire(NULL, dbCur, &keyobj, subexpireKey, expireT); + replicateSubkeyExpire(dbCur, &keyobj, subexpireKey, expireT); + decrRefCount(subexpireKey); + }); subexpireKey = nullptr; } } else { @@ -2637,6 +2793,7 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { initStaticStringObject(keyobj,key); bool fExpiredKey = iAmMaster() && !(rdbflags&RDBFLAGS_AOF_PREAMBLE) && expiretime != -1 && expiretime < now; if (fStaleMvccKey || fExpiredKey) { + #if 0 // TODO! if (fStaleMvccKey && !fExpiredKey && rsi != nullptr && rsi->mi != nullptr && rsi->mi->staleKeyMap != nullptr && lookupKeyRead(db, &keyobj) == nullptr) { // We have a key that we've already deleted and is not back in our database. // We'll need to inform the sending master of the delete if it is also a replica of us @@ -2648,56 +2805,21 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { key = nullptr; decrRefCount(val); val = nullptr; + #endif } else { - /* If we have a storage provider check if we need to evict some keys to stay under our memory limit, - do this every 16 keys to limit the perf impact */ - if (g_pserver->m_pstorageFactory && (ckeysLoaded % 128) == 0) - { - bool fHighMemory = (getMaxmemoryState(NULL,NULL,NULL,NULL) != C_OK); - if (fHighMemory || (ckeysLoaded % (1024)) == 0) - { - for (int idb = 0; idb < cserver.dbnum; ++idb) - { - if (g_pserver->db[idb]->processChanges(false)) - g_pserver->db[idb]->commitChanges(); - if (fHighMemory && !(rsi && rsi->fForceSetKey)) { - g_pserver->db[idb]->removeAllCachedValues(); // During load we don't go through the normal eviction unless we're merging (i.e. 
an active replica) - fHighMemory = false; // we took care of it - } - g_pserver->db[idb]->trackChanges(false, 1024); - } - if (fHighMemory) - freeMemoryIfNeeded(false /*fQuickCycle*/, false /* fPreSnapshot*/); - } - } - - /* Add the new object in the hash table */ - int fInserted = dbMerge(db, &keyobj, val, (rsi && rsi->fForceSetKey) || (rdbflags & RDBFLAGS_ALLOW_DUP)); // Note: dbMerge will incrRef fLastKeyExpired = false; + rdbInsertJob job; + job.db = dbCur; + job.key = key; + job.val = val; + job.lru_clock = lru_clock; + job.expiretime = expiretime; + job.lru_idle = lru_idle; + job.lfu_freq = lfu_freq; + wqueue.enqueue(job); - if (fInserted) - { - ++ckeysLoaded; - - /* Set the expire time if needed */ - if (expiretime != -1) - { - setExpire(NULL,db,&keyobj,nullptr,expiretime); - } - - /* Set usage information (for eviction). */ - objectSetLRUOrLFU(val,lfu_freq,lru_idle,lru_clock,1000); - - /* call key space notification on key loaded for modules only */ - moduleNotifyKeyspaceEvent(NOTIFY_LOADED, "loaded", &keyobj, db->id); - - replicationNotifyLoadedKey(db, &keyobj, val, expiretime); - } - else - { - decrRefCount(val); - val = nullptr; - } + key = nullptr; + val = nullptr; } if (g_pserver->key_load_delay) @@ -2744,6 +2866,8 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { } } + wqueue.endWork(); + for (int idb = 0; idb < cserver.dbnum; ++idb) { if (g_pserver->db[idb]->processChanges(false)) @@ -2756,6 +2880,7 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { * the RDB file from a socket during initial SYNC (diskless replica mode), * we'll report the error to the caller, so that we can retry. */ eoferr: + wqueue.endWork(); if (key != nullptr) { sdsfree(key); From 7352e4a45f64ccd97d03331c904bcf1905772d0f Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 30 Apr 2021 18:45:37 +0000 Subject: [PATCH 32/99] Fix majority of test issues with multithread load Former-commit-id: 4db88176e33e3615ffb90852b49e76b12d5b4622 --- src/ae.cpp | 8 ++++- src/ae.h | 1 + src/rdb.cpp | 92 +++++++++++++++++++++++++++++++---------------------- src/rio.cpp | 5 +++ src/rio.h | 1 + 5 files changed, 68 insertions(+), 39 deletions(-) diff --git a/src/ae.cpp b/src/ae.cpp index f96ef4f6c..29d687077 100644 --- a/src/ae.cpp +++ b/src/ae.cpp @@ -846,6 +846,7 @@ void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *aftersleep, } thread_local spin_worker tl_worker = nullptr; +thread_local bool fOwnLockOverride = false; void setAeLockSetThreadSpinWorker(spin_worker worker) { tl_worker = worker; @@ -866,9 +867,14 @@ void aeReleaseLock() g_lock.unlock(); } +void aeSetThreadOwnsLockOverride(bool fOverride) +{ + fOwnLockOverride = fOverride; +} + int aeThreadOwnsLock() { - return g_lock.fOwnLock(); + return fOwnLockOverride || g_lock.fOwnLock(); } int aeLockContested(int threshold) diff --git a/src/ae.h b/src/ae.h index aec1df154..9d8821143 100644 --- a/src/ae.h +++ b/src/ae.h @@ -169,6 +169,7 @@ void aeAcquireLock(); int aeTryAcquireLock(int fWeak); void aeReleaseLock(); int aeThreadOwnsLock(); +void aeSetThreadOwnsLockOverride(bool fOverride); int aeLockContested(int threshold); int aeLockContention(); // returns the number of instantaneous threads waiting on the lock diff --git a/src/rdb.cpp b/src/rdb.cpp index a9d701933..ad2823197 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2336,41 +2336,6 @@ void stopSaving(int success) { NULL); } -/* Track loading progress in order to serve client's from time to time - and if needed calculate rdb checksum */ -void rdbLoadProgressCallback(rio *r, 
const void *buf, size_t len) { - if (g_pserver->rdb_checksum) - rioGenericUpdateChecksum(r, buf, len); - - if ((g_pserver->loading_process_events_interval_bytes && - (r->processed_bytes + len)/g_pserver->loading_process_events_interval_bytes > r->processed_bytes/g_pserver->loading_process_events_interval_bytes) || - (g_pserver->loading_process_events_interval_keys && - (r->keys_since_last_callback >= g_pserver->loading_process_events_interval_keys))) - { - listIter li; - listNode *ln; - listRewind(g_pserver->masters, &li); - while ((ln = listNext(&li))) - { - struct redisMaster *mi = (struct redisMaster*)listNodeValue(ln); - if (mi->repl_state == REPL_STATE_TRANSFER) - replicationSendNewlineToMaster(mi); - } - loadingProgress(r->processed_bytes); - processEventsWhileBlocked(serverTL - g_pserver->rgthreadvar); - processModuleLoadingProgressEvent(0); - - robj *ping_argv[1]; - - ping_argv[0] = createStringObject("PING",4); - replicationFeedSlaves(g_pserver->slaves, g_pserver->replicaseldb, ping_argv, 1); - decrRefCount(ping_argv[0]); - - r->keys_since_last_callback = 0; - } -} - - struct rdbInsertJob { redisDb *db; @@ -2393,6 +2358,7 @@ class rdbAsyncWorkThread bool fExit = false; std::atomic ckeysLoaded; std::thread m_thread; + list *clients_pending_async_write = nullptr; public: @@ -2405,10 +2371,14 @@ public: ~rdbAsyncWorkThread() { if (!fExit && m_thread.joinable()) endWork(); + if (clients_pending_async_write) + listRelease(clients_pending_async_write); } void start() { - m_thread = std::thread(&rdbAsyncWorkThread::loadWorkerThreadMain, this); + if (clients_pending_async_write == nullptr) + clients_pending_async_write = listCreate(); + m_thread = std::thread(&rdbAsyncWorkThread::loadWorkerThreadMain, this, clients_pending_async_write); } void enqueue(rdbInsertJob &job) { @@ -2435,18 +2405,24 @@ public: cv.notify_one(); l.unlock(); m_thread.join(); + listJoin(serverTL->clients_pending_asyncwrite, clients_pending_async_write); + ProcessPendingAsyncWrites(); return ckeysLoaded; } - static void loadWorkerThreadMain(rdbAsyncWorkThread *pqueue) { + static void loadWorkerThreadMain(rdbAsyncWorkThread *pqueue, list *clients_pending_asyncwrite) { rdbAsyncWorkThread &queue = *pqueue; + redisServerThreadVars vars; + vars.clients_pending_asyncwrite = clients_pending_asyncwrite; + serverTL = &vars; + aeSetThreadOwnsLockOverride(true); for (;;) { std::unique_lock lock(queue.mutex); if (queue.queuejobs.empty() && queue.queuefn.empty()) { if (queue.fExit) break; queue.cv.wait(lock); - if (queue.fExit) + if (queue.queuejobs.empty() && queue.queuefn.empty() && queue.fExit) break; } @@ -2518,9 +2494,48 @@ public: } } } + aeSetThreadOwnsLockOverride(false); } }; +/* Track loading progress in order to serve client's from time to time + and if needed calculate rdb checksum */ +void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { + if (g_pserver->rdb_checksum) + rioGenericUpdateChecksum(r, buf, len); + + if ((g_pserver->loading_process_events_interval_bytes && + (r->processed_bytes + len)/g_pserver->loading_process_events_interval_bytes > r->processed_bytes/g_pserver->loading_process_events_interval_bytes) || + (g_pserver->loading_process_events_interval_keys && + (r->keys_since_last_callback >= g_pserver->loading_process_events_interval_keys))) + { + rdbAsyncWorkThread *pwthread = reinterpret_cast(r->chksum_arg); + pwthread->endWork(); // We can't have the work queue modifying the database while processEventsWhileBlocked does its thing + listIter li; + listNode *ln; + 
listRewind(g_pserver->masters, &li); + while ((ln = listNext(&li))) + { + struct redisMaster *mi = (struct redisMaster*)listNodeValue(ln); + if (mi->repl_state == REPL_STATE_TRANSFER) + replicationSendNewlineToMaster(mi); + } + loadingProgress(r->processed_bytes); + processEventsWhileBlocked(serverTL - g_pserver->rgthreadvar); + processModuleLoadingProgressEvent(0); + + robj *ping_argv[1]; + + ping_argv[0] = createStringObject("PING",4); + replicationFeedSlaves(g_pserver->slaves, g_pserver->replicaseldb, ping_argv, 1); + decrRefCount(ping_argv[0]); + pwthread->start(); + + r->keys_since_last_callback = 0; + } +} + + /* Load an RDB file from the rio stream 'rdb'. On success C_OK is returned, * otherwise C_ERR is returned and 'errno' is set accordingly. */ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { @@ -2543,6 +2558,7 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { } rdb->update_cksum = rdbLoadProgressCallback; + rdb->chksum_arg = &wqueue; rdb->max_processing_chunk = g_pserver->loading_process_events_interval_bytes; if (rioRead(rdb,buf,9) == 0) goto eoferr; buf[9] = '\0'; diff --git a/src/rio.cpp b/src/rio.cpp index 99c8e98c6..82a3969f3 100644 --- a/src/rio.cpp +++ b/src/rio.cpp @@ -99,6 +99,7 @@ static const rio rioBufferIO = { rioBufferTell, rioBufferFlush, NULL, /* update_checksum */ + NULL, /* update checksum arg */ 0, /* current checksum */ 0, /* flags */ 0, /* bytes read or written */ @@ -113,6 +114,7 @@ static const rio rioConstBufferIO = { rioBufferTell, rioBufferFlush, NULL, /* update_checksum */ + NULL, /* update checksum arg */ 0, /* current checksum */ 0, /* flags */ 0, /* bytes read or written */ @@ -176,6 +178,7 @@ static const rio rioFileIO = { rioFileTell, rioFileFlush, NULL, /* update_checksum */ + NULL, /* update checksum arg */ 0, /* current checksum */ 0, /* flags */ 0, /* bytes read or written */ @@ -272,6 +275,7 @@ static const rio rioConnIO = { rioConnTell, rioConnFlush, NULL, /* update_checksum */ + NULL, /* update checksum arg */ 0, /* current checksum */ 0, /* flags */ 0, /* bytes read or written */ @@ -391,6 +395,7 @@ static const rio rioFdIO = { rioFdTell, rioFdFlush, NULL, /* update_checksum */ + NULL, /* update checksum arg */ 0, /* current checksum */ 0, /* flags */ 0, /* bytes read or written */ diff --git a/src/rio.h b/src/rio.h index d48474fcb..86f3fa465 100644 --- a/src/rio.h +++ b/src/rio.h @@ -58,6 +58,7 @@ struct _rio { * and len fields pointing to the new block of data to add to the checksum * computation. 
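The chksum_arg field added to rio below is the usual C idiom of pairing a callback pointer with an opaque context pointer, so rdbLoadProgressCallback can reach per-load state without globals. A standalone sketch of the idiom (the stream/progress names are illustrative, not the rio API):

    #include <cstddef>
    #include <cstdio>

    struct stream {
        // Callback invoked for every chunk processed, plus an opaque user pointer for its state.
        void (*on_chunk)(stream *s, const void *buf, size_t len);
        void *on_chunk_arg;
        size_t processed;
    };

    struct progress_state { size_t chunks; };

    static void progress_cb(stream *s, const void *, size_t len) {
        auto *st = static_cast<progress_state *>(s->on_chunk_arg);  // recover the typed context
        st->chunks++;
        s->processed += len;
    }

    int main() {
        progress_state st{};
        stream s{progress_cb, &st, 0};
        char buf[64] = {};
        for (int i = 0; i < 3; i++)
            s.on_chunk(&s, buf, sizeof buf);   // the reader drives the callback per chunk
        std::printf("%zu chunks, %zu bytes\n", st.chunks, s.processed);
    }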
*/ void (*update_cksum)(struct _rio *, const void *buf, size_t len); + void *chksum_arg; /* The current checksum and flags (see RIO_FLAG_*) */ uint64_t cksum, flags; From 4677c62428e98dc10bc04c535407f9698bd4681d Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 30 Apr 2021 18:48:23 +0000 Subject: [PATCH 33/99] Fix issue where async load thread misses work Former-commit-id: a24a7b093295c5f5d69feee9fbc37c64cfa8aa03 --- src/rdb.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rdb.cpp b/src/rdb.cpp index ad2823197..92a466f68 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2419,7 +2419,7 @@ public: for (;;) { std::unique_lock lock(queue.mutex); if (queue.queuejobs.empty() && queue.queuefn.empty()) { - if (queue.fExit) + if (queue.queuejobs.empty() && queue.queuefn.empty() && queue.fExit) break; queue.cv.wait(lock); if (queue.queuejobs.empty() && queue.queuefn.empty() && queue.fExit) From 5583fbb83851bb9af7677ec662cee7143fd89ab9 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 3 May 2021 02:37:02 +0000 Subject: [PATCH 34/99] Fix remaining test failures Former-commit-id: 37e607f9b13b6601ff52e74e613fb369cab22b56 --- src/ae.cpp | 2 +- src/ae.h | 2 +- src/rdb.cpp | 177 +++++++++++++++++++++++++++++----------------------- src/rdb.h | 9 +-- 4 files changed, 105 insertions(+), 85 deletions(-) diff --git a/src/ae.cpp b/src/ae.cpp index 29d687077..3b27f43dd 100644 --- a/src/ae.cpp +++ b/src/ae.cpp @@ -867,7 +867,7 @@ void aeReleaseLock() g_lock.unlock(); } -void aeSetThreadOwnsLockOverride(bool fOverride) +void aeSetThreadOwnsLockOverride(int fOverride) { fOwnLockOverride = fOverride; } diff --git a/src/ae.h b/src/ae.h index 9d8821143..8a1cdc304 100644 --- a/src/ae.h +++ b/src/ae.h @@ -169,7 +169,7 @@ void aeAcquireLock(); int aeTryAcquireLock(int fWeak); void aeReleaseLock(); int aeThreadOwnsLock(); -void aeSetThreadOwnsLockOverride(bool fOverride); +void aeSetThreadOwnsLockOverride(int fOverride); int aeLockContested(int threshold); int aeLockContention(); // returns the number of instantaneous threads waiting on the lock diff --git a/src/rdb.cpp b/src/rdb.cpp index 92a466f68..d969b4f54 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -519,7 +519,12 @@ void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr) { if (len == RDB_LENERR) return NULL; if (plain || sds) { - void *buf = plain ? zmalloc(len, MALLOC_SHARED) : sdsnewlen(SDS_NOINIT,len); + ssize_t lenSigned = (ssize_t)len; + if (flags & RDB_LOAD_SDS_SHARED) + lenSigned = -lenSigned; + void *buf = plain ? 
zmalloc(len, MALLOC_SHARED) : sdsnewlen(SDS_NOINIT, lenSigned); + if (buf == nullptr) + return nullptr; if (lenptr) *lenptr = len; if (len && rioRead(rdb,buf,len) == 0) { if (plain) @@ -2355,30 +2360,29 @@ class rdbAsyncWorkThread std::vector> queuefn; // for custom jobs std::mutex mutex; std::condition_variable cv; + bool fLaunched = false; bool fExit = false; std::atomic ckeysLoaded; std::thread m_thread; - list *clients_pending_async_write = nullptr; + long long now; public: - rdbAsyncWorkThread(rdbSaveInfo *rsi, int rdbflags) - : rsi(rsi), rdbflags(rdbflags) + rdbAsyncWorkThread(rdbSaveInfo *rsi, int rdbflags, long long now) + : rsi(rsi), rdbflags(rdbflags), now(now) { ckeysLoaded = 0; } ~rdbAsyncWorkThread() { - if (!fExit && m_thread.joinable()) - endWork(); - if (clients_pending_async_write) - listRelease(clients_pending_async_write); + if (m_thread.joinable()) + endWork(); } void start() { - if (clients_pending_async_write == nullptr) - clients_pending_async_write = listCreate(); - m_thread = std::thread(&rdbAsyncWorkThread::loadWorkerThreadMain, this, clients_pending_async_write); + serverAssert(!fLaunched); + m_thread = std::thread(&rdbAsyncWorkThread::loadWorkerThreadMain, this); + fLaunched = true; } void enqueue(rdbInsertJob &job) { @@ -2401,25 +2405,28 @@ public: size_t endWork() { std::unique_lock l(mutex); + serverAssert(fLaunched); fExit = true; cv.notify_one(); l.unlock(); m_thread.join(); - listJoin(serverTL->clients_pending_asyncwrite, clients_pending_async_write); - ProcessPendingAsyncWrites(); + fLaunched = false; + fExit = false; + serverAssert(queuejobs.empty()); + serverAssert(queuefn.empty()); return ckeysLoaded; } - static void loadWorkerThreadMain(rdbAsyncWorkThread *pqueue, list *clients_pending_asyncwrite) { + static void loadWorkerThreadMain(rdbAsyncWorkThread *pqueue) { rdbAsyncWorkThread &queue = *pqueue; - redisServerThreadVars vars; - vars.clients_pending_asyncwrite = clients_pending_asyncwrite; + redisServerThreadVars vars = {}; + vars.clients_pending_asyncwrite = listCreate(); serverTL = &vars; aeSetThreadOwnsLockOverride(true); for (;;) { std::unique_lock lock(queue.mutex); if (queue.queuejobs.empty() && queue.queuefn.empty()) { - if (queue.queuejobs.empty() && queue.queuefn.empty() && queue.fExit) + if (queue.fExit) break; queue.cv.wait(lock); if (queue.queuejobs.empty() && queue.queuefn.empty() && queue.fExit) @@ -2430,47 +2437,70 @@ public: queue.queuejobs.reserve(1024); auto queuefn = std::move(queue.queuefn); lock.unlock(); - - for (auto &fn : queuefn) { - fn(); - } bool f1024thKey = false; for (auto &job : queuejobs) { redisObjectStack keyobj; initStaticStringObject(keyobj,job.key); - /* Add the new object in the hash table */ - int fInserted = dbMerge(job.db, &keyobj, job.val, (queue.rsi && queue.rsi->fForceSetKey) || (queue.rdbflags & RDBFLAGS_ALLOW_DUP)); // Note: dbMerge will incrRef + bool fStaleMvccKey = (pqueue->rsi) ? mvccFromObj(job.val) < pqueue->rsi->mvccMinThreshold : false; - if (fInserted) - { - auto ckeys = queue.ckeysLoaded.fetch_add(1, std::memory_order_relaxed); - f1024thKey = f1024thKey || (ckeys % 1024) == 0; - - /* Set the expire time if needed */ - if (job.expiretime != -1) - { - setExpire(NULL,job.db,&keyobj,nullptr,job.expiretime); + /* Check if the key already expired. This function is used when loading + * an RDB file from disk, either at startup, or when an RDB was + * received from the master. In the latter case, the master is + * responsible for key expiry. 
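The per-job drop decision the worker makes here can be read in isolation: a loaded key is discarded when its absolute expiry is already in the past, but only if this node is a master and the load is not an AOF preamble; replicas keep expired keys because the master owns expiry. A hedged sketch of that predicate (flag and parameter names are illustrative):

    #include <cassert>

    constexpr int LOAD_AOF_PREAMBLE = 1 << 0;

    // True when a freshly loaded key should be thrown away instead of inserted.
    // expiretime is an absolute unix time in milliseconds, or -1 for "no expiry".
    bool shouldDropLoadedKey(bool fMaster, int loadflags, long long expiretime, long long nowMs) {
        bool expired = expiretime != -1 && expiretime < nowMs;
        // Replicas keep expired keys: the master will propagate explicit deletes, so
        // dropping here could leave the replica out of sync with the master's snapshot.
        return fMaster && !(loadflags & LOAD_AOF_PREAMBLE) && expired;
    }

    int main() {
        const long long now = 1700000000000LL;
        assert(shouldDropLoadedKey(true,  0, now - 1, now));   // master, already expired -> drop
        assert(!shouldDropLoadedKey(false, 0, now - 1, now));  // replica keeps it
        assert(!shouldDropLoadedKey(true,  0, -1,      now));  // no TTL -> keep
    }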
If we would expire keys here, the + * snapshot taken by the master may not be reflected on the replica. */ + bool fExpiredKey = iAmMaster() && !(pqueue->rdbflags&RDBFLAGS_AOF_PREAMBLE) && job.expiretime != -1 && job.expiretime < pqueue->now; + if (fStaleMvccKey || fExpiredKey) { + if (fStaleMvccKey && !fExpiredKey && pqueue->rsi != nullptr && pqueue->rsi->mi != nullptr && pqueue->rsi->mi->staleKeyMap != nullptr && lookupKeyRead(job.db, &keyobj) == nullptr) { + // We have a key that we've already deleted and is not back in our database. + // We'll need to inform the sending master of the delete if it is also a replica of us + robj_sharedptr objKeyDup(createStringObject(job.key, sdslen(job.key))); + pqueue->rsi->mi->staleKeyMap->operator[](job.db->id).push_back(objKeyDup); } - - /* Set usage information (for eviction). */ - objectSetLRUOrLFU(job.val,job.lfu_freq,job.lru_idle,job.lru_clock,1000); - - /* call key space notification on key loaded for modules only */ - moduleNotifyKeyspaceEvent(NOTIFY_LOADED, "loaded", &keyobj, job.db->id); - - replicationNotifyLoadedKey(job.db, &keyobj, job.val, job.expiretime); - } - else - { + sdsfree(job.key); + job.key = nullptr; decrRefCount(job.val); + job.val = nullptr; + } else { + /* Add the new object in the hash table */ + int fInserted = dbMerge(job.db, &keyobj, job.val, (queue.rsi && queue.rsi->fForceSetKey) || (queue.rdbflags & RDBFLAGS_ALLOW_DUP)); // Note: dbMerge will incrRef + + if (fInserted) + { + auto ckeys = queue.ckeysLoaded.fetch_add(1, std::memory_order_relaxed); + f1024thKey = f1024thKey || (ckeys % 1024) == 0; + + /* Set the expire time if needed */ + if (job.expiretime != -1) + { + setExpire(NULL,job.db,&keyobj,nullptr,job.expiretime); + } + + /* Set usage information (for eviction). */ + objectSetLRUOrLFU(job.val,job.lfu_freq,job.lru_idle,job.lru_clock,1000); + + /* call key space notification on key loaded for modules only */ + moduleNotifyKeyspaceEvent(NOTIFY_LOADED, "loaded", &keyobj, job.db->id); + + replicationNotifyLoadedKey(job.db, &keyobj, job.val, job.expiretime); + } + else + { + decrRefCount(job.val); + } } + + if (job.key != nullptr) { sdsfree(job.key); } } + + for (auto &fn : queuefn) { + fn(); + } /* If we have a storage provider check if we need to evict some keys to stay under our memory limit, do this every 16 keys to limit the perf impact */ @@ -2494,6 +2524,11 @@ public: } } } + std::unique_lock lock(queue.mutex); + serverAssert(queue.queuefn.empty()); + serverAssert(queue.queuejobs.empty()); + ProcessPendingAsyncWrites(); + listRelease(vars.clients_pending_asyncwrite); aeSetThreadOwnsLockOverride(false); } }; @@ -2547,7 +2582,8 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { long long lru_idle = -1, lfu_freq = -1, expiretime = -1, now; long long lru_clock = 0; uint64_t mvcc_tstamp = OBJ_MVCC_INVALID; - rdbAsyncWorkThread wqueue(rsi, rdbflags); + now = mstime(); + rdbAsyncWorkThread wqueue(rsi, rdbflags, now); robj *subexpireKey = nullptr; sds key = nullptr; bool fLastKeyExpired = false; @@ -2574,7 +2610,6 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { return C_ERR; } - now = mstime(); lru_clock = LRU_CLOCK(); wqueue.start(); @@ -2711,12 +2746,14 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { } else { long long expireT = strtoll(szFromObj(auxval), nullptr, 10); - wqueue.enqueue([dbCur, subexpireKey, key, expireT]{ + sds keyT = sdsdupshared(key); + wqueue.enqueue([dbCur, subexpireKey, keyT, expireT]{ redisObjectStack keyobj; - initStaticStringObject(keyobj,key); + 
initStaticStringObject(keyobj,keyT); setExpire(NULL, dbCur, &keyobj, subexpireKey, expireT); replicateSubkeyExpire(dbCur, &keyobj, subexpireKey, expireT); decrRefCount(subexpireKey); + sdsfree(keyT); }); subexpireKey = nullptr; } @@ -2790,7 +2827,7 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { key = nullptr; } - if ((key = (sds)rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL)) == NULL) + if ((key = (sds)rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS_SHARED,NULL)) == NULL) goto eoferr; /* Read value */ if ((val = rdbLoadObject(type,rdb,key,mvcc_tstamp)) == NULL) { @@ -2798,45 +2835,27 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { key = nullptr; goto eoferr; } + + bool fStaleMvccKey = (rsi) ? mvccFromObj(val) < rsi->mvccMinThreshold : false; + rdbInsertJob job; + job.db = dbCur; + job.key = sdsdupshared(key); + job.val = val; + job.lru_clock = lru_clock; + job.expiretime = expiretime; + job.lru_idle = lru_idle; + job.lfu_freq = lfu_freq; + wqueue.enqueue(job); + val = nullptr; /* Check if the key already expired. This function is used when loading * an RDB file from disk, either at startup, or when an RDB was * received from the master. In the latter case, the master is * responsible for key expiry. If we would expire keys here, the * snapshot taken by the master may not be reflected on the replica. */ - redisObjectStack keyobj; - initStaticStringObject(keyobj,key); bool fExpiredKey = iAmMaster() && !(rdbflags&RDBFLAGS_AOF_PREAMBLE) && expiretime != -1 && expiretime < now; - if (fStaleMvccKey || fExpiredKey) { - #if 0 // TODO! - if (fStaleMvccKey && !fExpiredKey && rsi != nullptr && rsi->mi != nullptr && rsi->mi->staleKeyMap != nullptr && lookupKeyRead(db, &keyobj) == nullptr) { - // We have a key that we've already deleted and is not back in our database. - // We'll need to inform the sending master of the delete if it is also a replica of us - robj_sharedptr objKeyDup(createStringObject(key, sdslen(key))); - rsi->mi->staleKeyMap->operator[](db->id).push_back(objKeyDup); - } - fLastKeyExpired = true; - sdsfree(key); - key = nullptr; - decrRefCount(val); - val = nullptr; - #endif - } else { - fLastKeyExpired = false; - rdbInsertJob job; - job.db = dbCur; - job.key = key; - job.val = val; - job.lru_clock = lru_clock; - job.expiretime = expiretime; - job.lru_idle = lru_idle; - job.lfu_freq = lfu_freq; - wqueue.enqueue(job); - - key = nullptr; - val = nullptr; - } + fLastKeyExpired = fStaleMvccKey || fExpiredKey; if (g_pserver->key_load_delay) usleep(g_pserver->key_load_delay); diff --git a/src/rdb.h b/src/rdb.h index c561a8799..38fded807 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -119,10 +119,11 @@ #define RDB_MODULE_OPCODE_STRING 5 /* String. */ /* rdbLoad...() functions flags. */ -#define RDB_LOAD_NONE 0 -#define RDB_LOAD_ENC (1<<0) -#define RDB_LOAD_PLAIN (1<<1) -#define RDB_LOAD_SDS (1<<2) +#define RDB_LOAD_NONE 0 +#define RDB_LOAD_ENC (1<<0) +#define RDB_LOAD_PLAIN (1<<1) +#define RDB_LOAD_SDS (1<<2) +#define RDB_LOAD_SDS_SHARED ((1 << 3) | RDB_LOAD_SDS) /* flags on the purpose of rdb save or load */ #define RDBFLAGS_NONE 0 /* No special RDB loading. 
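RDB_LOAD_SDS_SHARED above is deliberately defined as a superset of RDB_LOAD_SDS: existing call sites that only test the SDS bit keep working, while new code can test the extra bit to request a shareable allocation. A small illustration of the composite-flag behaviour (standalone constants, not the real header):

    #include <cassert>

    constexpr int LOAD_SDS        = 1 << 2;
    constexpr int LOAD_SDS_SHARED = (1 << 3) | LOAD_SDS;   // "shared" implies "sds"

    int main() {
        int flags = LOAD_SDS_SHARED;
        assert(flags & LOAD_SDS);                                 // old checks still see an SDS request
        assert((flags & LOAD_SDS_SHARED) == LOAD_SDS_SHARED);     // new checks can detect the shared variant
        assert((LOAD_SDS & LOAD_SDS_SHARED) != LOAD_SDS_SHARED);  // plain SDS is not mistaken for shared
    }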
*/ From eaaff16cca95ff5eb4bf2d4d9bb27f32a0b166bd Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 3 May 2021 04:15:46 +0000 Subject: [PATCH 35/99] Don't sync too often it hurts perf Former-commit-id: a20e89a457a0a682483c22f0f1cdb5c93c574d28 --- src/rdb.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/rdb.cpp b/src/rdb.cpp index d969b4f54..2191f7bd8 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2545,7 +2545,8 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { (r->keys_since_last_callback >= g_pserver->loading_process_events_interval_keys))) { rdbAsyncWorkThread *pwthread = reinterpret_cast(r->chksum_arg); - pwthread->endWork(); // We can't have the work queue modifying the database while processEventsWhileBlocked does its thing + if (pwthread && g_pserver->fActiveReplica) + pwthread->endWork(); // We can't have the work queue modifying the database while processEventsWhileBlocked does its thing listIter li; listNode *ln; listRewind(g_pserver->masters, &li); @@ -2564,7 +2565,8 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { ping_argv[0] = createStringObject("PING",4); replicationFeedSlaves(g_pserver->slaves, g_pserver->replicaseldb, ping_argv, 1); decrRefCount(ping_argv[0]); - pwthread->start(); + if (pwthread && g_pserver->fActiveReplica) + pwthread->start(); r->keys_since_last_callback = 0; } From f6305ed15bca84719504890f85dd0f1297e05365 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Mon, 3 May 2021 16:29:11 +0000 Subject: [PATCH 36/99] Now tracks memory and resizes 'accurately', need to fix cluster Former-commit-id: 5f0e01cc199427ab6dfd7f8f28321f6a1f34fd1c --- src/config.cpp | 1 + src/evict.cpp | 10 +++++++++- src/networking.cpp | 20 +++++++++++++------- src/replication.cpp | 16 ++++++++++++++++ 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index 9d7f14007..b546ef607 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -2347,6 +2347,7 @@ static int updateReplBacklogSize(long long val, long long prev, const char **err UNUSED(err); g_pserver->repl_backlog_size = prev; resizeReplicationBacklog(val); + g_pserver->repl_backlog_config_size = g_pserver->repl_backlog_size; return 1; } diff --git a/src/evict.cpp b/src/evict.cpp index 31cadeae5..36837e17d 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -392,9 +392,16 @@ size_t freeMemoryGetNotCountedMemory(void) { while((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); std::unique_lock(replica->lock); - overhead += getClientOutputBufferMemoryUsage(replica); + /* we don't wish to multiple count the replication backlog shared usage */ + overhead += (getClientOutputBufferMemoryUsage(replica) - getClientReplicationBacklogSharedUsage(replica)); } } + + /* also don't count the replication backlog memory + * that's where the replication clients get their memory from */ + overhead += (g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size); + + if (g_pserver->aof_state != AOF_OFF) { overhead += sdsalloc(g_pserver->aof_buf)+aofRewriteBufferSize(); } @@ -516,6 +523,7 @@ int freeMemoryIfNeeded(bool fQuickCycle, bool fPreSnapshot) { if (g_pserver->maxmemory_policy == MAXMEMORY_NO_EVICTION) goto cant_free; /* We need to free memory, but policy forbids. 
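The eviction change in this patch boils down to one piece of accounting: memory that exists only to serve replication (each replica's output buffer, minus the portion that is really the shared backlog, plus the replication backlog itself) is subtracted from used memory before comparing against maxmemory. A simplified sketch of that computation, assuming the per-replica numbers are already known (names are illustrative):

    #include <cstdio>
    #include <vector>

    struct ReplicaMem {
        size_t output_buffer;   // bytes queued in this replica's own output buffer
        size_t backlog_shared;  // portion of that which is actually served from the shared backlog
    };

    // Memory that should not count toward the maxmemory limit.
    size_t notCountedMemory(const std::vector<ReplicaMem> &replicas, size_t backlog_size) {
        size_t overhead = backlog_size;                      // the backlog is shared; count it once here
        for (const auto &r : replicas)
            overhead += r.output_buffer - r.backlog_shared;  // avoid double counting the shared part
        return overhead;
    }

    int main() {
        std::vector<ReplicaMem> replicas = {{64 * 1024, 48 * 1024}, {32 * 1024, 32 * 1024}};
        size_t used = 10 * 1024 * 1024, backlog = 1024 * 1024;
        std::printf("effective usage for eviction: %zu bytes\n", used - notCountedMemory(replicas, backlog));
    }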
*/ + serverLog(LL_NOTICE, "evicting i guess lol, the overhead was %ld, the repl_backlog_size, %lld", freeMemoryGetNotCountedMemory(), g_pserver->repl_backlog_size); while (mem_freed < mem_tofree) { int j, k, i; static unsigned int next_db = 0; diff --git a/src/networking.cpp b/src/networking.cpp index cac58ff07..c51a02a1d 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -224,7 +224,6 @@ void clientInstallWriteHandler(client *c) { (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) { - // serverLog(LL_NOTICE, "we installing boyz"); AssertCorrectThread(c); serverAssert(c->lock.fOwnLock()); /* Here instead of installing the write handler, we just flag the @@ -1801,6 +1800,9 @@ int writeToClient(client *c, int handler_installed) { if (nwrittenPart2 == -1) nwritten = -1; } + if (c->flags & CLIENT_SLAVE && handler_installed) + serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); + g_pserver->stat_net_output_bytes += totwritten; if (nwritten == -1) { if (connGetState(c->conn) == CONN_STATE_CONNECTED) { @@ -1821,6 +1823,11 @@ int writeToClient(client *c, int handler_installed) { if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } if (!clientHasPendingReplies(c) && c->repl_curr_idx == -1) { + if(c->flags & CLIENT_SLAVE && handler_installed){ + serverLog(LL_NOTICE, "Uninstalling handler"); + serverLog(LL_NOTICE, "handler repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + } c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -1836,6 +1843,7 @@ int writeToClient(client *c, int handler_installed) { /* Write event handler. Just send data to the client. */ void sendReplyToClient(connection *conn) { client *c = (client*)connGetPrivateData(conn); + serverLog(LL_NOTICE, "called the sendreplytoclient"); if (writeToClient(c,1) == C_ERR) { AeLocker ae; @@ -1970,6 +1978,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { auto vec = std::move(g_pserver->rgthreadvar[iel].clients_pending_write); processed += (int)vec.size(); + // serverLog(LL_NOTICE, "entered handleClientsWithPendingWrites"); for (client *c : vec) { serverAssertDebug(FCorrectThread(c)); @@ -2008,8 +2017,10 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. 
*/ if (clientHasPendingReplies(c) || c->repl_curr_idx != -1) { - if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) + serverLog(LL_NOTICE, "Setting a write handler for later"); + if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) { freeClientAsync(c); + } } } @@ -3359,11 +3370,6 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { * that writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long getClientReplicationBacklogSharedUsage(client *c) { - if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1){ - // serverLog(LL_NOTICE, "repl_backlog_size %lld, repl_backlog_idx %lld, master_repl_offset %lld, repl_curr_idx %lld, repl_curr_off %lld", - // g_pserver->repl_backlog_size, g_pserver->repl_backlog_idx, g_pserver->master_repl_offset, c->repl_curr_idx, c->repl_curr_off); - } - return (!(c->flags & CLIENT_SLAVE) || c->repl_curr_idx == -1) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; } diff --git a/src/replication.cpp b/src/replication.cpp index 1bae2773a..60f25052a 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -4684,5 +4684,21 @@ void flushReplBacklogToClients() // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; + } else if (getLowestOffsetAmongReplicas() != -1){ + listIter li; + listNode *ln; + listRewind(g_pserver->slaves, &li); + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + + std::unique_lock ul(replica->lock, std::defer_lock); + if (FCorrectThread(replica)) + ul.lock(); + + /* try to force prepare client to write i guess? 
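getClientReplicationBacklogSharedUsage above reduces to the replica's lag inside the shared ring: the master's global replication offset minus the offset already written to that replica, or zero when the client is not a replica currently reading from the backlog. A standalone version of that arithmetic (field names are illustrative):

    #include <cassert>

    struct Replica {
        bool reads_from_backlog;   // corresponds to CLIENT_SLAVE with repl_curr_idx != -1
        long long curr_off;        // replication offset already handed to this replica's socket
    };

    // Bytes of the shared backlog this replica still accounts for.
    long long backlogSharedUsage(const Replica &r, long long master_repl_offset) {
        if (!r.reads_from_backlog)
            return 0;
        return master_repl_offset - r.curr_off;   // non-negative while curr_off trails the master
    }

    int main() {
        assert(backlogSharedUsage({true,  900},  1000) == 100);  // lagging replica
        assert(backlogSharedUsage({true,  1000}, 1000) == 0);    // fully caught up
        assert(backlogSharedUsage({false, 0},    1000) == 0);    // ordinary client
    }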
*/ + if (replica->repl_curr_idx != -1){ + if (prepareClientToWrite(replica) != C_OK) continue; + } + } } } From c58739bbcbb99f5910aa7000a144609e11dfca10 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 3 May 2021 16:33:16 +0000 Subject: [PATCH 37/99] Respect replica output buffer limits when adding large commands to the ring buffer Former-commit-id: 37ec01cfd8a8da1e895c7cdc358d382d35ad59dd --- src/replication.cpp | 22 ++++++++++++++++++++-- tests/integration/replication-2.tcl | 23 +++++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/src/replication.cpp b/src/replication.cpp index e78df9a62..d0cd15b37 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -279,9 +279,10 @@ void feedReplicationBacklog(const void *ptr, size_t len) { long long minimumsize = g_pserver->master_repl_offset + len - g_pserver->repl_batch_offStart+1; if (minimumsize > g_pserver->repl_backlog_size) { flushReplBacklogToClients(); + serverAssert(g_pserver->master_repl_offset == g_pserver->repl_batch_offStart); minimumsize = g_pserver->master_repl_offset + len - g_pserver->repl_batch_offStart+1; - if (minimumsize > g_pserver->repl_backlog_size) { + if (minimumsize > g_pserver->repl_backlog_size && minimumsize < (long long)cserver.client_obuf_limits[CLIENT_TYPE_SLAVE].hard_limit_bytes) { // This is an emergency overflow, we better resize to fit long long newsize = std::max(g_pserver->repl_backlog_size*2, minimumsize); serverLog(LL_WARNING, "Replication backlog is too small, resizing to: %lld", newsize); @@ -4458,8 +4459,24 @@ void flushReplBacklogToClients() if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { bool fAsyncWrite = false; - // Ensure no overflow + serverAssert(g_pserver->repl_batch_offStart < g_pserver->master_repl_offset); + if (g_pserver->master_repl_offset - g_pserver->repl_batch_offStart > g_pserver->repl_backlog_size) { + // We overflowed + listIter li; + listNode *ln; + listRewind(g_pserver->slaves, &li); + while ((ln = listNext(&li))) { + client *c = (client*)listNodeValue(ln); + sds sdsClient = catClientInfoString(sdsempty(),c); + freeClientAsync(c); + serverLog(LL_WARNING,"Client %s scheduled to be closed ASAP for overcoming of output buffer limits.", sdsClient); + sdsfree(sdsClient); + } + goto LDone; + } + + // Ensure no overflow if we get here serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); @@ -4497,6 +4514,7 @@ void flushReplBacklogToClients() if (fAsyncWrite) ProcessPendingAsyncWrites(); +LDone: // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; diff --git a/tests/integration/replication-2.tcl b/tests/integration/replication-2.tcl index 08905f11e..02687c619 100644 --- a/tests/integration/replication-2.tcl +++ b/tests/integration/replication-2.tcl @@ -86,5 +86,28 @@ start_server {tags {"repl"}} { } assert_equal [r debug digest] [r -1 debug digest] } + + test {REPL Backlog handles large value} { + # initialize bigval to 64-bytes + r flushall + r config set repl-backlog-size 1K + r config set client-output-buffer-limit "replica 1024 1024 0" + set bigval "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + for {set i 0} { $i < 20 } { incr i } { + append bigval $bigval + } + r set bigkey $bigval + # We expect the 
replication to be disconnected so wait a bit + wait_for_condition 50 100 { + [s -1 master_link_status] eq {down} + } else { + fail "Memory limit exceeded but not detected" + } + wait_for_condition 50 100 { + [r debug digest] eq [r -1 debug digest] + } else { + fail "Replica did not reconnect" + } + } } } From 33a7b52899a10432e1f9085027ed9e30c07dda32 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Mon, 3 May 2021 16:49:09 +0000 Subject: [PATCH 38/99] Forgot to add server.h in last commit Former-commit-id: 34fa6119c9a3f1533cc3e6e5d118dc6424a70891 --- src/server.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/server.h b/src/server.h index cfd6c34a0..6c5265fbd 100644 --- a/src/server.h +++ b/src/server.h @@ -2411,6 +2411,9 @@ struct redisServer { uint16_t rglockSamples[s_lockContentionSamples]; unsigned ilockRingHead = 0; + long long repl_backlog_config_size = 1024*1024; /* This is a hack to ignore the resizing of the replication backlog + when using it as a defacto for the client buffer */ + bool FRdbSaveInProgress() const { return rdbThreadVars.fRdbThreadActive; } }; @@ -2657,6 +2660,7 @@ sds getAllClientsInfoString(int type); void rewriteClientCommandVector(client *c, int argc, ...); void rewriteClientCommandArgument(client *c, int i, robj *newval); void replaceClientCommandVector(client *c, int argc, robj **argv); +unsigned long getClientReplicationBacklogSharedUsage(client *c); unsigned long getClientOutputBufferMemoryUsage(client *c); int freeClientsInAsyncFreeQueue(int iel); void asyncCloseClientOnOutputBufferLimitReached(client *c); From 0c5585e5ded16582e62f19eb86de0f5aa19ff7f6 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 3 May 2021 20:18:45 +0000 Subject: [PATCH 39/99] Ensure multithread load works with FLASH storage Former-commit-id: 24e2991c7aa2cef90a89b1640f7095235c5d34ed --- src/rdb.cpp | 98 ++++++++++++++++++++++++++++++++------------------ src/server.cpp | 4 +-- 2 files changed, 65 insertions(+), 37 deletions(-) diff --git a/src/rdb.cpp b/src/rdb.cpp index 2191f7bd8..ac35c219c 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2356,27 +2356,35 @@ class rdbAsyncWorkThread { rdbSaveInfo *rsi; int rdbflags; - std::vector queuejobs; + list *listJobs; std::vector> queuefn; // for custom jobs std::mutex mutex; std::condition_variable cv; + std::condition_variable cvThrottle; bool fLaunched = false; bool fExit = false; std::atomic ckeysLoaded; std::thread m_thread; long long now; + static void listFreeMethod(const void *v) { + delete reinterpret_cast(v); + } + public: rdbAsyncWorkThread(rdbSaveInfo *rsi, int rdbflags, long long now) : rsi(rsi), rdbflags(rdbflags), now(now) { ckeysLoaded = 0; + listJobs = listCreate(); + listSetFreeMethod(listJobs, listFreeMethod); } ~rdbAsyncWorkThread() { if (m_thread.joinable()) - endWork(); + endWork(); + listRelease(listJobs); } void start() { @@ -2385,10 +2393,18 @@ public: fLaunched = true; } + void throttle(std::unique_lock &l) { + if (listLength(listJobs) > 0 && (listLength(listJobs) % 1024 == 0) && (getMaxmemoryState(NULL,NULL,NULL,NULL) != C_OK)) { + cvThrottle.wait(l); + } + } + void enqueue(rdbInsertJob &job) { + rdbInsertJob *pjob = new rdbInsertJob(job); std::unique_lock l(mutex); - bool fNotify = queuejobs.empty(); - queuejobs.push_back(job); + throttle(l); + bool fNotify = listLength(listJobs) == 0; + listAddNodeTail(listJobs, pjob); if (fNotify) cv.notify_one(); } @@ -2412,7 +2428,7 @@ public: m_thread.join(); fLaunched = false; fExit = false; - serverAssert(queuejobs.empty()); + serverAssert(listLength(listJobs) == 
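The throttle()/cvThrottle pair added in this patch is a bounded producer-consumer handshake: the loader blocks once too many jobs are queued, and the worker wakes it as soon as it starts draining a batch. A compact, self-contained sketch of the pattern (the 1024-entry bound mirrors the patch, which additionally gates on memory pressure; everything else is illustrative):

    #include <condition_variable>
    #include <cstdio>
    #include <deque>
    #include <mutex>
    #include <thread>

    struct JobQueue {
        std::deque<int> jobs;
        std::mutex mtx;
        std::condition_variable cvWork, cvThrottle;
        bool done = false;
        static constexpr size_t kMaxQueued = 1024;

        void enqueue(int job) {
            std::unique_lock<std::mutex> l(mtx);
            cvThrottle.wait(l, [&] { return jobs.size() < kMaxQueued; }); // producer backs off
            bool wasEmpty = jobs.empty();
            jobs.push_back(job);
            if (wasEmpty) cvWork.notify_one();                            // wake an idle worker
        }

        void finish() {
            std::lock_guard<std::mutex> l(mtx);
            done = true;
            cvWork.notify_one();
        }

        void workerLoop(long long &sum) {
            for (;;) {
                std::unique_lock<std::mutex> l(mtx);
                cvWork.wait(l, [&] { return !jobs.empty() || done; });
                if (jobs.empty() && done) return;
                std::deque<int> batch;
                batch.swap(jobs);          // take the whole batch, release the lock quickly
                cvThrottle.notify_one();   // the producer may resume filling the queue
                l.unlock();
                for (int j : batch) sum += j;
            }
        }
    };

    int main() {
        JobQueue q;
        long long sum = 0;
        std::thread worker(&JobQueue::workerLoop, &q, std::ref(sum));
        for (int i = 1; i <= 10000; i++) q.enqueue(i);
        q.finish();
        worker.join();
        std::printf("sum = %lld\n", sum);   // 50005000
    }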
0); serverAssert(queuefn.empty()); return ckeysLoaded; } @@ -2425,24 +2441,30 @@ public: aeSetThreadOwnsLockOverride(true); for (;;) { std::unique_lock lock(queue.mutex); - if (queue.queuejobs.empty() && queue.queuefn.empty()) { + if (listLength(queue.listJobs) == 0 && queue.queuefn.empty()) { if (queue.fExit) break; queue.cv.wait(lock); - if (queue.queuejobs.empty() && queue.queuefn.empty() && queue.fExit) + if (listLength(queue.listJobs) == 0 && queue.queuefn.empty() && queue.fExit) break; } + pqueue->cvThrottle.notify_one(); - auto queuejobs = std::move(queue.queuejobs); - queue.queuejobs.reserve(1024); + list *listJobs = queue.listJobs; + queue.listJobs = listCreate(); + listSetFreeMethod(queue.listJobs, listFreeMethod); + auto queuefn = std::move(queue.queuefn); lock.unlock(); - bool f1024thKey = false; - for (auto &job : queuejobs) { + vars.gcEpoch = g_pserver->garbageCollector.startEpoch(); + while (listLength(listJobs)) { + rdbInsertJob &job = *((rdbInsertJob*)listNodeValue(listFirst(listJobs))); + redisObjectStack keyobj; initStaticStringObject(keyobj,job.key); + bool f1024thKey = false; bool fStaleMvccKey = (pqueue->rsi) ? mvccFromObj(job.val) < pqueue->rsi->mvccMinThreshold : false; /* Check if the key already expired. This function is used when loading @@ -2469,7 +2491,7 @@ public: if (fInserted) { auto ckeys = queue.ckeysLoaded.fetch_add(1, std::memory_order_relaxed); - f1024thKey = f1024thKey || (ckeys % 1024) == 0; + f1024thKey = (ckeys % 1024) == 0; /* Set the expire time if needed */ if (job.expiretime != -1) @@ -2496,37 +2518,43 @@ public: { sdsfree(job.key); } + + /* If we have a storage provider check if we need to evict some keys to stay under our memory limit, + do this every 16 keys to limit the perf impact */ + if (g_pserver->m_pstorageFactory && f1024thKey) + { + bool fHighMemory = (getMaxmemoryState(NULL,NULL,NULL,NULL) != C_OK); + if (fHighMemory || f1024thKey) + { + for (int idb = 0; idb < cserver.dbnum; ++idb) + { + if (g_pserver->db[idb]->processChanges(false)) + g_pserver->db[idb]->commitChanges(); + if (fHighMemory && !(queue.rsi && queue.rsi->fForceSetKey)) { + g_pserver->db[idb]->removeAllCachedValues(); // During load we don't go through the normal eviction unless we're merging (i.e. an active replica) + fHighMemory = false; // we took care of it + } + g_pserver->db[idb]->trackChanges(false, 1024); + } + if (fHighMemory) + freeMemoryIfNeeded(false /*fQuickCycle*/, false /* fPreSnapshot*/); + } + } + + // Pop from the list + listDelNode(listJobs, listFirst(listJobs)); } - + listRelease(listJobs); + for (auto &fn : queuefn) { fn(); } - /* If we have a storage provider check if we need to evict some keys to stay under our memory limit, - do this every 16 keys to limit the perf impact */ - if (g_pserver->m_pstorageFactory && f1024thKey) - { - bool fHighMemory = (getMaxmemoryState(NULL,NULL,NULL,NULL) != C_OK); - if (fHighMemory || f1024thKey) - { - for (int idb = 0; idb < cserver.dbnum; ++idb) - { - if (g_pserver->db[idb]->processChanges(false)) - g_pserver->db[idb]->commitChanges(); - if (fHighMemory && !(queue.rsi && queue.rsi->fForceSetKey)) { - g_pserver->db[idb]->removeAllCachedValues(); // During load we don't go through the normal eviction unless we're merging (i.e. 
an active replica) - fHighMemory = false; // we took care of it - } - g_pserver->db[idb]->trackChanges(false, 1024); - } - if (fHighMemory) - freeMemoryIfNeeded(false /*fQuickCycle*/, false /* fPreSnapshot*/); - } - } + g_pserver->garbageCollector.endEpoch(vars.gcEpoch); } std::unique_lock lock(queue.mutex); serverAssert(queue.queuefn.empty()); - serverAssert(queue.queuejobs.empty()); + serverAssert(listLength(queue.listJobs) == 0); ProcessPendingAsyncWrites(); listRelease(vars.clients_pending_asyncwrite); aeSetThreadOwnsLockOverride(false); diff --git a/src/server.cpp b/src/server.cpp index fbcd0cc43..d4ace1aae 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2424,7 +2424,7 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { g_pserver->rdb_bgsave_scheduled = 0; } - if (cserver.storage_memory_model == STORAGE_WRITEBACK && g_pserver->m_pstorageFactory) { + if (cserver.storage_memory_model == STORAGE_WRITEBACK && g_pserver->m_pstorageFactory && !g_pserver->loading) { run_with_period(g_pserver->storage_flush_period) { flushStorageWeak(); } @@ -2611,7 +2611,7 @@ void beforeSleep(struct aeEventLoop *eventLoop) { static thread_local bool fFirstRun = true; // note: we also copy the DB pointer in case a DB swap is done while the lock is released std::vector vecdb; // note we cache the database pointer in case a dbswap is done while the lock is released - if (cserver.storage_memory_model == STORAGE_WRITETHROUGH && g_pserver->m_pstorageFactory != nullptr) + if (cserver.storage_memory_model == STORAGE_WRITETHROUGH && g_pserver->m_pstorageFactory != nullptr && !g_pserver->loading) { if (!fFirstRun) { mstime_t storage_process_latency; From eb35d7e9ec36d73e0aa8fa2bdb0eb7bb808e4627 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 5 May 2021 16:37:02 +0000 Subject: [PATCH 40/99] Updated maxmemory tests to account for overhead in new replication backlog behaviour Former-commit-id: 4cd197959693dfe4d1497c3f703cf6aaa27d34ad --- tests/unit/maxmemory.tcl | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl index 414733d1e..23879c38a 100644 --- a/tests/unit/maxmemory.tcl +++ b/tests/unit/maxmemory.tcl @@ -33,7 +33,8 @@ start_server {tags {"maxmemory"}} { # Get the current memory limit and calculate a new limit. # We just add 100k to the current memory size so that it is # fast for us to reach that limit. - set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used+100*1024}] r config set maxmemory $limit r config set maxmemory-policy $policy @@ -42,7 +43,7 @@ start_server {tags {"maxmemory"}} { while 1 { r setex [randomKey] 10000 x incr numkeys - if {[s used_memory]+4096 > $limit} { + if {[expr {[s used_memory] - $overhead + 4096}] > $limit} { assert {$numkeys > 10} break } @@ -52,7 +53,8 @@ start_server {tags {"maxmemory"}} { for {set j 0} {$j < $numkeys} {incr j} { r setex [randomKey] 10000 x } - assert {[s used_memory] < ($limit+4096)} + set used_amt [expr [s used_memory] - $overhead] + assert {$used_amt < ($limit+4096)} } } @@ -65,7 +67,8 @@ start_server {tags {"maxmemory"}} { # Get the current memory limit and calculate a new limit. # We just add 100k to the current memory size so that it is # fast for us to reach that limit. 
- set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used+100*1024}] r config set maxmemory $limit r config set maxmemory-policy $policy @@ -74,7 +77,7 @@ start_server {tags {"maxmemory"}} { while 1 { r set [randomKey] x incr numkeys - if {[s used_memory]+4096 > $limit} { + if {[expr [s used_memory] - $overhead]+4096 > $limit} { assert {$numkeys > 10} break } @@ -91,7 +94,7 @@ start_server {tags {"maxmemory"}} { } } if {[string match allkeys-* $policy]} { - assert {[s used_memory] < ($limit+4096)} + assert {[expr [s used_memory] - $overhead] < ($limit+4096)} } else { assert {$err == 1} } @@ -107,7 +110,8 @@ start_server {tags {"maxmemory"}} { # Get the current memory limit and calculate a new limit. # We just add 100k to the current memory size so that it is # fast for us to reach that limit. - set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used+100*1024}] r config set maxmemory $limit r config set maxmemory-policy $policy @@ -121,7 +125,7 @@ start_server {tags {"maxmemory"}} { } else { r set "key:$numkeys" x } - if {[s used_memory]+4096 > $limit} { + if {[expr [s used_memory] - $overhead]+4096 > $limit} { assert {$numkeys > 10} break } @@ -135,7 +139,7 @@ start_server {tags {"maxmemory"}} { catch {r setex "foo:$j" 10000 x} } # We should still be under the limit. - assert {[s used_memory] < ($limit+4096)} + assert {[expr [s used_memory] - $overhead] < ($limit+4096)} # However all our non volatile keys should be here. for {set j 0} {$j < $numkeys} {incr j 2} { assert {[r exists "key:$j"]} @@ -284,7 +288,8 @@ start_server {tags {"maxmemory"} overrides {server-threads 1}} { # we need to make sure to evict keynames of a total size of more than # 16kb since the (PROTO_REPLY_CHUNK_BYTES), only after that the # invalidation messages have a chance to trigger further eviction. - set used [s used_memory] + set overhead [s mem_not_counted_for_evict] + set used [expr [s used_memory] - $overhead] set limit [expr {$used - 40000}] r config set maxmemory $limit From f6a714db2658a4b8baa924e62dd0ab2c6a7adb9f Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 5 May 2021 17:04:08 +0000 Subject: [PATCH 41/99] Updated overhead calculation to only use repl_backlog_size Former-commit-id: 6f93c7eb44d84bb143b4ad4fff3c6a5436ebaaf7 --- src/evict.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/evict.cpp b/src/evict.cpp index 36837e17d..e7f0a10ef 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -399,8 +399,8 @@ size_t freeMemoryGetNotCountedMemory(void) { /* also don't count the replication backlog memory * that's where the replication clients get their memory from */ - overhead += (g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size); - + // overhead += (g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size); + overhead += g_pserver->repl_backlog_size; if (g_pserver->aof_state != AOF_OFF) { overhead += sdsalloc(g_pserver->aof_buf)+aofRewriteBufferSize(); From 7ff2fb716a4a92bf78a06a888fec82f246889c74 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 5 May 2021 22:26:36 +0000 Subject: [PATCH 42/99] Fixed data race? 
Seems to be passing multithreaded test cases now Former-commit-id: cb13edd1200c1230fa7e313d69c69e06129951d3 --- src/networking.cpp | 2 +- src/replication.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index c51a02a1d..6f4aa6268 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1782,7 +1782,7 @@ int writeToClient(client *c, int handler_installed) { // } - if (nwritten == nrequested){ + if (nwritten == nrequested && g_pserver->repl_backlog_idx == c->repl_curr_idx){ c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ } else if (nwritten > 0) diff --git a/src/replication.cpp b/src/replication.cpp index 60f25052a..d3df6d12a 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -90,7 +90,7 @@ void resizeReplicationBacklogForClients(long long newsize); void setReplIdx(client *c, long long idx, long long off){ // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); // serverLog(LL_NOTICE, "Repl Index started at: %lld", c->repl_curr_idx); - if (c->repl_curr_idx == -1){ + if (c->repl_curr_idx == -1 && off >= c->repl_curr_off){ if (prepareClientToWrite(c) != C_OK) return; c->repl_curr_idx = idx; c->repl_curr_off = off; From 40fdb3ce05217d93c521e7028c6d902e6493fee2 Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 6 May 2021 00:09:07 +0000 Subject: [PATCH 43/99] Add endurance testing to better detect threading bugs Former-commit-id: 945e428aa110968479fdcdfc2d5c5308a99eadc3 --- src/config.cpp | 1 + src/rdb.cpp | 8 +++++--- src/server.h | 1 + tests/integration/rdb.tcl | 12 ++++++++++++ 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index f7609feb7..fea7b3076 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -2537,6 +2537,7 @@ standardConfig configs[] = { createBoolConfig("io-threads-do-reads", NULL, IMMUTABLE_CONFIG, fDummy, 0, NULL, NULL), createBoolConfig("time-thread-priority", NULL, IMMUTABLE_CONFIG, cserver.time_thread_priority, 0, NULL, NULL), createBoolConfig("prefetch-enabled", NULL, MODIFIABLE_CONFIG, g_pserver->prefetch_enabled, 1, NULL, NULL), + createBoolConfig("allow-rdb-resize-op", NULL, MODIFIABLE_CONFIG, g_pserver->allowRdbResizeOp, 1, NULL, NULL), /* String Configs */ createStringConfig("aclfile", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, g_pserver->acl_filename, "", NULL, NULL), diff --git a/src/rdb.cpp b/src/rdb.cpp index ac35c219c..8c03fa60b 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2699,9 +2699,11 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { goto eoferr; if ((expires_size = rdbLoadLen(rdb,NULL)) == RDB_LENERR) goto eoferr; - wqueue.enqueue([dbCur, db_size]{ - dbCur->expand(db_size); - }); + if (g_pserver->allowRdbResizeOp) { + wqueue.enqueue([dbCur, db_size]{ + dbCur->expand(db_size); + }); + } continue; /* Read next opcode. */ } else if (type == RDB_OPCODE_AUX) { /* AUX: generic string-string fields. Use to add state to RDB diff --git a/src/server.h b/src/server.h index 9d3e70197..65b1a4a5b 100644 --- a/src/server.h +++ b/src/server.h @@ -2161,6 +2161,7 @@ struct redisServer { sds aof_child_diff; /* AOF diff accumulator child side. */ int aof_rewrite_pending = 0; /* is a call to aofChildWriteDiffData already queued? 
*/ /* RDB persistence */ + int allowRdbResizeOp; /* Debug situations we may want rehash to be ocurring, so ignore resize */ long long dirty; /* Changes to DB from the last save */ long long dirty_before_bgsave; /* Used to restore dirty on failed BGSAVE */ struct _rdbThreadVars diff --git a/tests/integration/rdb.tcl b/tests/integration/rdb.tcl index 58dc6c968..29af7af42 100644 --- a/tests/integration/rdb.tcl +++ b/tests/integration/rdb.tcl @@ -189,3 +189,15 @@ test {client freed during loading} { exec kill [srv 0 pid] } } + +test {repeated load} { + start_server [list overrides [list server-threads 3 allow-rdb-resize-op no]] { + r debug populate 500000 key 1000 + + set digest [r debug digest] + for {set j 0} {$j < 10} {incr j} { + r debug reload + assert_equal $digest [r debug digest] + } + } +} From 442aa5bbd9e52cdf86f12175a53308e6f50eb719 Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 6 May 2021 00:42:49 +0000 Subject: [PATCH 44/99] Pause execution during rdbLoadProgressCallback as its too risky to let it run Former-commit-id: e70c01cb3e756d1e02ed190b76c73b7b7010c0d3 --- src/rdb.cpp | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/rdb.cpp b/src/rdb.cpp index 8c03fa60b..7198124c3 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2361,6 +2361,7 @@ class rdbAsyncWorkThread std::mutex mutex; std::condition_variable cv; std::condition_variable cvThrottle; + fastlock m_lockPause { "rdbAsyncWork-Pause"}; bool fLaunched = false; bool fExit = false; std::atomic ckeysLoaded; @@ -2409,6 +2410,14 @@ public: cv.notify_one(); } + void pauseExecution() { + m_lockPause.lock(); + } + + void resumeExecution() { + m_lockPause.unlock(); + } + void enqueue(std::function &&fn) { std::unique_lock l(mutex); bool fNotify = queuefn.empty(); @@ -2459,6 +2468,7 @@ public: vars.gcEpoch = g_pserver->garbageCollector.startEpoch(); while (listLength(listJobs)) { + std::unique_lock ulPause(pqueue->m_lockPause); rdbInsertJob &job = *((rdbInsertJob*)listNodeValue(listFirst(listJobs))); redisObjectStack keyobj; @@ -2547,6 +2557,7 @@ public: listRelease(listJobs); for (auto &fn : queuefn) { + std::unique_lock ulPause(pqueue->m_lockPause); fn(); } @@ -2573,8 +2584,8 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { (r->keys_since_last_callback >= g_pserver->loading_process_events_interval_keys))) { rdbAsyncWorkThread *pwthread = reinterpret_cast(r->chksum_arg); - if (pwthread && g_pserver->fActiveReplica) - pwthread->endWork(); // We can't have the work queue modifying the database while processEventsWhileBlocked does its thing + if (pwthread) + pwthread->pauseExecution(); // We can't have the work queue modifying the database while processEventsWhileBlocked does its thing listIter li; listNode *ln; listRewind(g_pserver->masters, &li); @@ -2593,8 +2604,8 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { ping_argv[0] = createStringObject("PING",4); replicationFeedSlaves(g_pserver->slaves, g_pserver->replicaseldb, ping_argv, 1); decrRefCount(ping_argv[0]); - if (pwthread && g_pserver->fActiveReplica) - pwthread->start(); + if (pwthread) + pwthread->resumeExecution(); r->keys_since_last_callback = 0; } From 4fd76c47911f506909e54a138bf8f72b0fea8687 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Fri, 21 May 2021 17:05:55 +0000 Subject: [PATCH 45/99] Fixed single threaded for real this time, need to add synchronization for multi threaded Former-commit-id: 4d858dac1a503f4d518477212ba585069af22574 --- src/networking.cpp | 8 +++++--- 
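The pauseExecution()/resumeExecution() pair in the patch above works because the worker re-acquires m_lockPause around every job: whoever grabs that lock stalls the worker at the next job boundary rather than mid-insert, which is what rdbLoadProgressCallback needs around processEventsWhileBlocked(). A minimal sketch of the same pattern with a plain std::mutex instead of fastlock (names are illustrative):

    #include <atomic>
    #include <chrono>
    #include <cstdio>
    #include <mutex>
    #include <thread>

    std::mutex pauseLock;             // held by a controller to stall the worker
    std::atomic<bool> stop{false};
    std::atomic<long> processed{0};

    void workerLoop() {
        while (!stop.load()) {
            std::lock_guard<std::mutex> hold(pauseLock);  // blocks here while the controller holds the lock
            processed.fetch_add(1);                       // one "job" per iteration
        }
    }

    int main() {
        std::thread worker(workerLoop);
        std::this_thread::sleep_for(std::chrono::milliseconds(10));

        pauseLock.lock();                                 // pauseExecution(): worker stalls at its next job
        long before = processed.load();
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
        long during = processed.load();
        pauseLock.unlock();                               // resumeExecution(): worker picks up where it left off

        stop.store(true);
        worker.join();
        std::printf("jobs run while paused: %ld (expected 0)\n", during - before);
    }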
src/replication.cpp | 5 +++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 6f4aa6268..c39d8ce42 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1676,8 +1676,7 @@ int writeToClient(client *c, int handler_installed) { std::unique_locklock)> lock(c->lock); - /* if this is a write to a replica, it's coming straight from the replication backlog */ - long long repl_backlog_idx = g_pserver->repl_backlog_idx; + while(clientHasPendingReplies(c)) { if (c->bufpos > 0) { @@ -1742,6 +1741,9 @@ int writeToClient(client *c, int handler_installed) { c->transmittedRDB = true; } + /* if this is a write to a replica, it's coming straight from the replication backlog */ + long long repl_backlog_idx = g_pserver->repl_backlog_idx; + /* For replicas, we don't store all the information in the client buffer * Most of the time (aside from immediately after synchronizing), we read * from the replication backlog directly */ @@ -1782,7 +1784,7 @@ int writeToClient(client *c, int handler_installed) { // } - if (nwritten == nrequested && g_pserver->repl_backlog_idx == c->repl_curr_idx){ + if (nwritten == nrequested && g_pserver->repl_backlog_idx == repl_backlog_idx){ c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ } else if (nwritten > 0) diff --git a/src/replication.cpp b/src/replication.cpp index d3df6d12a..1d4e01289 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -3059,6 +3059,11 @@ void syncWithMaster(connection *conn) { if (psync_result == PSYNC_CONTINUE) { serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Master accepted a Partial Resynchronization."); + /* Reset the bulklen information in case it is lingering from the last connection + * The partial sync will start from the beginning of a command so these should be reset */ + mi->master->reqtype = 0; + mi->master->multibulklen = 0; + mi->master->bulklen = -1; if (cserver.supervised_mode == SUPERVISED_SYSTEMD) { redisCommunicateSystemd("STATUS=MASTER <-> REPLICA sync: Partial Resynchronization accepted. Ready to accept connections.\n"); redisCommunicateSystemd("READY=1\n"); From 6080ee8f2f33fd21de8dfa9c103ba569759bc127 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 26 May 2021 20:10:33 +0000 Subject: [PATCH 46/99] Added transmitted RDB lock Former-commit-id: 4b32167afc85742d85ff9b47b2c2e0b6b02e140a --- src/networking.cpp | 13 +++++++++++-- src/replication.cpp | 15 ++++++++++----- src/server.h | 2 ++ 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index c39d8ce42..176693501 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -319,6 +319,7 @@ void _clientAsyncReplyBufferReserve(client *c, size_t len) { clientReplyBlock *replyNew = (clientReplyBlock*)zmalloc(sizeof(clientReplyBlock) + newsize); replyNew->size = zmalloc_usable(replyNew) - sizeof(clientReplyBlock); replyNew->used = 0; + std::unique_lock tRDBLock (c->transmittedRDBLock); c->replyAsync = replyNew; } @@ -332,6 +333,7 @@ int _addReplyToBuffer(client *c, const char *s, size_t len) { if (fAsync) { serverAssert(GlobalLocksAcquired()); + std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync == nullptr || (c->replyAsync->size - c->replyAsync->used) < len) { if (c->replyAsync == nullptr) { @@ -1737,9 +1739,14 @@ int writeToClient(client *c, int handler_installed) { /* If there are no more pending replies, then we have transmitted the RDB. 
* This means further replication commands will be taken straight from the * replication backlog from now on. */ + + std::unique_lock tRDBLock (c->transmittedRDBLock); + if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ c->transmittedRDB = true; } + bool transmittedRDB = c->transmittedRDB; + tRDBLock.unlock(); /* if this is a write to a replica, it's coming straight from the replication backlog */ long long repl_backlog_idx = g_pserver->repl_backlog_idx; @@ -1747,7 +1754,7 @@ int writeToClient(client *c, int handler_installed) { /* For replicas, we don't store all the information in the client buffer * Most of the time (aside from immediately after synchronizing), we read * from the replication backlog directly */ - if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1 && c->transmittedRDB){ + if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1 && transmittedRDB){ /* copy global variables into local scope so if they change in between we don't care */ long long repl_backlog_size = g_pserver->repl_backlog_size; long long nwrittenPart2 = 0; @@ -1874,6 +1881,7 @@ void ProcessPendingAsyncWrites() serverAssert(c->fPendingAsyncWrite); if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_CLOSE_AFTER_REPLY)) { + std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync != nullptr){ zfree(c->replyAsync); c->replyAsync = nullptr; @@ -1885,6 +1893,7 @@ void ProcessPendingAsyncWrites() /* since writes from master to replica can come directly from the replication backlog, * writes may have been signalled without having been copied to the replyAsync buffer, * thus causing the buffer to be NULL */ + std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync != nullptr){ int size = c->replyAsync->used; @@ -1905,7 +1914,7 @@ void ProcessPendingAsyncWrites() } c->fPendingAsyncWrite = FALSE; - + tRDBLock.unlock(); // Now install the write event handler int ae_flags = AE_WRITABLE|AE_WRITE_THREADSAFE; /* For the fsync=always policy, we want that a given FD is never diff --git a/src/replication.cpp b/src/replication.cpp index 1d4e01289..ad79f4887 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -441,6 +441,8 @@ void feedReplicationBacklog(const void *ptr, size_t len) { g_pserver->master_repl_offset += len; + + /* This is a circular buffer, so write as much data we can at every * iteration and rewind the "idx" index if we reach the limit. 
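The feed path here is a textbook circular-buffer append: copy as much as fits before the end of the array, wrap the write index back to zero, and repeat until the payload is consumed, while the logical replication offset keeps growing monotonically. A standalone sketch of that loop (struct and field names are illustrative, not the server globals):

    #include <algorithm>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    struct Backlog {
        std::vector<char> buf;
        long long idx = 0;      // physical write position inside buf
        long long offset = 0;   // logical replication offset, grows forever

        explicit Backlog(size_t size) : buf(size) {}

        void feed(const void *ptr, size_t len) {
            const char *p = static_cast<const char *>(ptr);
            offset += len;
            while (len) {
                // Copy at most up to the end of the array, then wrap idx back to zero.
                size_t thislen = std::min(len, buf.size() - static_cast<size_t>(idx));
                std::memcpy(buf.data() + idx, p, thislen);
                idx += thislen;
                if (static_cast<size_t>(idx) == buf.size()) idx = 0;
                p += thislen;
                len -= thislen;
            }
        }
    };

    int main() {
        Backlog b(8);
        b.feed("abcdefg", 7);    // fills most of the ring
        b.feed("HIJ", 3);        // wraps: 'I' and 'J' land back at the start of the ring
        std::printf("idx=%lld offset=%lld\n", b.idx, b.offset);   // idx=2 offset=10
    }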
*/ while(len) { @@ -4659,11 +4661,14 @@ void flushReplBacklogToClients() #ifdef BYPASS_BUFFER - /* If we are online and the RDB has been sent, there is no need to feed the client buffer - * We will send our replies directly from the replication backlog instead */ - if (replica->replstate == SLAVE_STATE_ONLINE && replica->transmittedRDB){ - setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); - continue; + { + /* If we are online and the RDB has been sent, there is no need to feed the client buffer + * We will send our replies directly from the replication backlog instead */ + std::unique_lock tRDBLock (replica->transmittedRDBLock); + if (replica->replstate == SLAVE_STATE_ONLINE && replica->transmittedRDB){ + setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); + continue; + } } #endif if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { diff --git a/src/server.h b/src/server.h index 6c5265fbd..14005e7d5 100644 --- a/src/server.h +++ b/src/server.h @@ -1582,6 +1582,7 @@ struct client { // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); + fastlock transmittedRDBLock {"transmittedRDB"}; size_t argv_len_sum() const; }; @@ -2228,6 +2229,7 @@ struct redisServer { that is the next byte will'll write to.*/ long long repl_backlog_off; /* Replication "master offset" of first byte in the replication backlog buffer.*/ + fastlock repl_backlog_lock {"replication backlog"}; time_t repl_backlog_time_limit; /* Time without slaves after the backlog gets released. */ time_t repl_no_slaves_since; /* We have no slaves since that time. From bf120245faa6867db1b464f319ee3944c017ad28 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 27 May 2021 18:57:23 +0000 Subject: [PATCH 47/99] Added more synchronization and fixed some data races Former-commit-id: 183e015dac6f85df1c94d0761e89bc23d9f53319 --- src/multi.cpp | 2 + src/networking.cpp | 141 +++++++++++++++++++++++--------------------- src/replication.cpp | 57 ++++++++---------- src/server.cpp | 1 + src/server.h | 3 + 5 files changed, 105 insertions(+), 99 deletions(-) diff --git a/src/multi.cpp b/src/multi.cpp index 9df72383d..9fd5206fb 100644 --- a/src/multi.cpp +++ b/src/multi.cpp @@ -237,6 +237,8 @@ void execCommand(client *c) { * backlog with the final EXEC. */ if (g_pserver->repl_backlog && was_master && !is_master) { const char *execcmd = "*1\r\n$4\r\nEXEC\r\n"; + updateLowestOffsetAmongReplicas(); + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); feedReplicationBacklog(execcmd,strlen(execcmd)); } } diff --git a/src/networking.cpp b/src/networking.cpp index 176693501..caefd6d1e 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -125,6 +125,7 @@ client *createClient(connection *conn, int iel) { client_id = g_pserver->next_client_id.fetch_add(1); c->iel = iel; c->id = client_id; + sprintf(c->lock.szName, "client %lu", client_id); c->resp = 2; c->conn = conn; c->name = NULL; @@ -1677,8 +1678,7 @@ int writeToClient(client *c, int handler_installed) { serverAssertDebug(FCorrectThread(c)); std::unique_locklock)> lock(c->lock); - - + // serverLog(LL_NOTICE, "acq client"); while(clientHasPendingReplies(c)) { if (c->bufpos > 0) { @@ -1736,82 +1736,87 @@ int writeToClient(client *c, int handler_installed) { !(c->flags & CLIENT_SLAVE)) break; } - /* If there are no more pending replies, then we have transmitted the RDB. 
- * This means further replication commands will be taken straight from the - * replication backlog from now on. */ + /* We can only directly read from the replication backlog if the client + is a replica, so only attempt to do so if that's the case. */ + if (c->flags & CLIENT_SLAVE) { + /* If there are no more pending replies, then we have transmitted the RDB. + * This means further replication commands will be taken straight from the + * replication backlog from now on. */ + std::unique_lock tRDBLock (c->transmittedRDBLock); - std::unique_lock tRDBLock (c->transmittedRDBLock); + if (c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ + c->transmittedRDB = true; + } + bool transmittedRDB = c->transmittedRDB; + tRDBLock.unlock(); - if (c->flags & CLIENT_SLAVE && c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ - c->transmittedRDB = true; - } - bool transmittedRDB = c->transmittedRDB; - tRDBLock.unlock(); + /* For replicas, we don't store all the information in the client buffer + * Most of the time (aside from immediately after synchronizing), we read + * from the replication backlog directly */ + if (c->repl_curr_idx != -1 && transmittedRDB){ + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); - /* if this is a write to a replica, it's coming straight from the replication backlog */ - long long repl_backlog_idx = g_pserver->repl_backlog_idx; + /* copy global variables into local scope so if they change in between we don't care */ + long long repl_backlog_idx = g_pserver->repl_backlog_idx; + long long repl_backlog_size = g_pserver->repl_backlog_size; + long long nwrittenPart2 = 0; - /* For replicas, we don't store all the information in the client buffer - * Most of the time (aside from immediately after synchronizing), we read - * from the replication backlog directly */ - if (c->flags & CLIENT_SLAVE && c->repl_curr_idx != -1 && transmittedRDB){ - /* copy global variables into local scope so if they change in between we don't care */ - long long repl_backlog_size = g_pserver->repl_backlog_size; - long long nwrittenPart2 = 0; + ssize_t nrequested; /* The number of bytes requested to write */ + /* normal case with no wrap around */ + if (repl_backlog_idx >= c->repl_curr_idx){ + nrequested = repl_backlog_idx - c->repl_curr_idx; + nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_idx - c->repl_curr_idx); + /* wrap around case, v. rare */ + /* also v. buggy so there's that */ + } else { + nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; + nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); + /* only attempt wrapping if we write the correct number of bytes */ + if (nwritten == repl_backlog_size - c->repl_curr_idx){ + long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); + if (nwrittenPart2 != -1) + nwritten += nwrittenPart2; - ssize_t nrequested; /* The number of bytes requested to write */ - /* normal case with no wrap around */ - if (repl_backlog_idx >= c->repl_curr_idx){ - nrequested = repl_backlog_idx - c->repl_curr_idx; - nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_idx - c->repl_curr_idx); - /* wrap around case, v. rare */ - /* also v. 
buggy so there's that */ - } else { - nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; - nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); - /* only attempt wrapping if we write the correct number of bytes */ - if (nwritten == repl_backlog_size - c->repl_curr_idx){ - long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); - if (nwrittenPart2 != -1) - nwritten += nwrittenPart2; + } + } - } + /* only update the replica's current index if bytes were sent */ + + // if (nrequested != nwritten){ + // serverLog(LL_NOTICE, "-----------------------------------------"); + // serverLog(LL_NOTICE, "AFTER THE FACT"); + // serverLog(LL_NOTICE, "requested to write: %ld", nrequested); + // serverLog(LL_NOTICE, "actually written: %ld", nwritten); + // serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + // serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + // serverLog(LL_NOTICE, "-----------------------------------------"); + // } + + + if (nwritten == nrequested && g_pserver->repl_backlog_idx == repl_backlog_idx){ + c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ + } + else if (nwritten > 0) + c->repl_curr_idx = (c->repl_curr_idx + nwritten) % repl_backlog_size; + + serverAssert(c->repl_curr_idx < repl_backlog_size); + + /* only increment bytes if an error didn't occur */ + if (nwritten > 0){ + totwritten += nwritten; + c->repl_curr_off += nwritten; + } + + /* If the second part of a write didn't go through, we still need to register that */ + if (nwrittenPart2 == -1) nwritten = -1; } - /* only update the replica's current index if bytes were sent */ + if (c->flags & CLIENT_SLAVE && handler_installed) + serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); - // if (nrequested != nwritten){ - // serverLog(LL_NOTICE, "-----------------------------------------"); - // serverLog(LL_NOTICE, "AFTER THE FACT"); - // serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - // serverLog(LL_NOTICE, "actually written: %ld", nwritten); - // serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - // serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - // serverLog(LL_NOTICE, "-----------------------------------------"); - // } - - - if (nwritten == nrequested && g_pserver->repl_backlog_idx == repl_backlog_idx){ - c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ - } - else if (nwritten > 0) - c->repl_curr_idx = (c->repl_curr_idx + nwritten) % repl_backlog_size; - - serverAssert(c->repl_curr_idx < repl_backlog_size); - - /* only increment bytes if an error didn't occur */ - if (nwritten > 0){ - totwritten += nwritten; - c->repl_curr_off += nwritten; - } - - /* If the second part of a write didn't go through, we still need to register that */ - if (nwrittenPart2 == -1) nwritten = -1; } - if (c->flags & CLIENT_SLAVE && handler_installed) - serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); - + // serverLog(LL_NOTICE, "rel client"); g_pserver->stat_net_output_bytes += totwritten; if (nwritten == -1) { if 
(connGetState(c->conn) == CONN_STATE_CONNECTED) { @@ -1834,7 +1839,7 @@ int writeToClient(client *c, int handler_installed) { if (!clientHasPendingReplies(c) && c->repl_curr_idx == -1) { if(c->flags & CLIENT_SLAVE && handler_installed){ serverLog(LL_NOTICE, "Uninstalling handler"); - serverLog(LL_NOTICE, "handler repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); + serverLog(LL_NOTICE, "handler repl_curr_idx: %lld, repl_backlog_size: %lld", c->repl_curr_idx, g_pserver->repl_backlog_size); serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); } c->sentlen = 0; diff --git a/src/replication.cpp b/src/replication.cpp index ad79f4887..d1181bdf4 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -56,9 +56,11 @@ void putSlaveOnline(client *replica); int cancelReplicationHandshake(redisMaster *mi); static void propagateMasterStaleKeys(); -/* gets the lowest offset amongst all of the replicas */ -long long getLowestOffsetAmongReplicas(){ +/* gets the lowest offset amongst all of the replicas and stores it globally*/ +void updateLowestOffsetAmongReplicas(){ serverAssert(GlobalLocksAcquired()); + serverAssert(!g_pserver->repl_backlog_lock.fOwnLock()); + // serverLog(LL_NOTICE, "off- have repl"); long long min_offset = LONG_LONG_MAX; listIter li; listNode *ln; @@ -69,16 +71,15 @@ long long getLowestOffsetAmongReplicas(){ if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; - if (replica->repl_curr_idx == -1) continue; - std::unique_lock ul(replica->lock, std::defer_lock); - if (FCorrectThread(replica)) - ul.lock(); + std::unique_lock ul(replica->lock); + // serverLog(LL_NOTICE, "off- acq client"); - min_offset = std::min(min_offset, replica->repl_curr_off); + min_offset = std::min(min_offset, replica->repl_curr_off); + // serverLog(LL_NOTICE, "off- rel client"); } /* return -1 if no other minimum was found */ - return min_offset == LONG_LONG_MAX ? -1 : min_offset; + g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? -1 : min_offset, std::memory_order_seq_cst); } /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case @@ -412,11 +413,12 @@ void freeReplicationBacklog(void) { * the backlog without incrementing the offset. */ void feedReplicationBacklog(const void *ptr, size_t len) { serverAssert(GlobalLocksAcquired()); + serverAssert(g_pserver->repl_backlog_lock.fOwnLock()); const unsigned char *p = (const unsigned char*)ptr; if (g_pserver->repl_batch_idxStart >= 0) { /* we are lower bounded by the lower client offset or the offStart if all the clients are up to date */ - long long lower_bound = getLowestOffsetAmongReplicas(); + long long lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); if (lower_bound == -1) lower_bound = g_pserver->repl_batch_offStart; long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; @@ -441,10 +443,9 @@ void feedReplicationBacklog(const void *ptr, size_t len) { g_pserver->master_repl_offset += len; - - /* This is a circular buffer, so write as much data we can at every * iteration and rewind the "idx" index if we reach the limit. 
*/ + while(len) { size_t thislen = g_pserver->repl_backlog_size - g_pserver->repl_backlog_idx; if (thislen > len) thislen = len; @@ -598,6 +599,8 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) serverAssert(!(listLength(slaves) != 0 && g_pserver->repl_backlog == NULL)); bool fSendRaw = !g_pserver->fActiveReplica; + updateLowestOffsetAmongReplicas(); + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); /* Send SELECT command to every replica if needed. */ if (g_pserver->replicaseldb != dictid) { @@ -619,7 +622,9 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) /* Add the SELECT command into the backlog. */ /* We don't do this for advanced replication because this will be done later when it adds the whole RREPLAY command */ - if (g_pserver->repl_backlog && fSendRaw) feedReplicationBacklogWithObject(selectcmd); + if (g_pserver->repl_backlog && fSendRaw) { + feedReplicationBacklogWithObject(selectcmd); + } if (dictid < 0 || dictid >= PROTO_SHARED_SELECT_CMDS) decrRefCount(selectcmd); @@ -632,7 +637,6 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) if (fSendRaw) { char aux[LONG_STR_SIZE+3]; - /* Add the multi bulk reply length. */ aux[0] = '*'; int multilen = ll2string(aux+1,sizeof(aux)-1,argc); @@ -759,7 +763,11 @@ void replicationFeedSlavesFromMasterStream(char *buf, size_t buflen) { printf("\n"); } - if (g_pserver->repl_backlog) feedReplicationBacklog(buf,buflen); + if (g_pserver->repl_backlog){ + updateLowestOffsetAmongReplicas(); + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + feedReplicationBacklog(buf,buflen); + } } void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, int argc) { @@ -4662,6 +4670,9 @@ void flushReplBacklogToClients() #ifdef BYPASS_BUFFER { + std::unique_lock asyncUl(replica->lock, std::defer_lock); + if (!FCorrectThread(replica)) + asyncUl.lock(); /* If we are online and the RDB has been sent, there is no need to feed the client buffer * We will send our replies directly from the replication backlog instead */ std::unique_lock tRDBLock (replica->transmittedRDBLock); @@ -4694,21 +4705,5 @@ void flushReplBacklogToClients() // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; - } else if (getLowestOffsetAmongReplicas() != -1){ - listIter li; - listNode *ln; - listRewind(g_pserver->slaves, &li); - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - - std::unique_lock ul(replica->lock, std::defer_lock); - if (FCorrectThread(replica)) - ul.lock(); - - /* try to force prepare client to write i guess? 
*/ - if (replica->repl_curr_idx != -1){ - if (prepareClientToWrite(replica) != C_OK) continue; - } - } - } + } } diff --git a/src/server.cpp b/src/server.cpp index 9664a4a6b..439e1aeff 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2924,6 +2924,7 @@ void initServerConfig(void) { g_pserver->enable_multimaster = CONFIG_DEFAULT_ENABLE_MULTIMASTER; g_pserver->repl_syncio_timeout = CONFIG_REPL_SYNCIO_TIMEOUT; g_pserver->master_repl_offset = 0; + g_pserver->repl_lowest_off.store(-1, std::memory_order_seq_cst); /* Replication partial resync backlog */ g_pserver->repl_backlog = NULL; diff --git a/src/server.h b/src/server.h index 14005e7d5..da1fce52e 100644 --- a/src/server.h +++ b/src/server.h @@ -2241,6 +2241,8 @@ struct redisServer { int repl_diskless_load; /* Slave parse RDB directly from the socket. * see REPL_DISKLESS_LOAD_* enum */ int repl_diskless_sync_delay; /* Delay to start a diskless repl BGSAVE. */ + std::atomic repl_lowest_off; /* The lowest offset amongst all clients + Updated before calls to feed the replication backlog */ /* Replication (replica) */ list *masters; int enable_multimaster; @@ -2838,6 +2840,7 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, void rdbPipeWriteHandlerConnRemoved(struct connection *conn); void replicationNotifyLoadedKey(redisDb *db, robj_roptr key, robj_roptr val, long long expire); void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long long expire); +void updateLowestOffsetAmongReplicas(void); /* Generic persistence functions */ void startLoadingFile(FILE* fp, const char * filename, int rdbflags); From d0e69e4c4778997ab69da6ca02c8256eb9d73a3c Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 1 Jun 2021 20:01:41 +0000 Subject: [PATCH 48/99] Reduce lock contention when loading to a storage provider Former-commit-id: 58bc777f2215918043325753b6e2bf89dc3108f7 --- src/db.cpp | 28 +++++++++++++++++++++++++ src/rdb.cpp | 59 ++++++++++++++++++++++++++++++++-------------------- src/server.h | 2 ++ 3 files changed, 67 insertions(+), 22 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index 13734aed5..e4cc6b8f7 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -2765,6 +2765,34 @@ bool redisDbPersistentData::processChanges(bool fSnapshot) return (m_spstorage != nullptr); } +void redisDbPersistentData::processChangesAsync(std::atomic &pendingJobs) +{ + ++pendingJobs; + dictEmpty(m_dictChanged, nullptr); + dict *dictNew = dictCreate(&dbDictType, nullptr); + std::swap(dictNew, m_pdict); + m_cnewKeysPending = 0; + g_pserver->asyncworkqueue->AddWorkFunction([dictNew, this, &pendingJobs]{ + dictIterator *di = dictGetIterator(dictNew); + dictEntry *de; + std::vector veckeys; + std::vector vecvals; + while ((de = dictNext(di)) != nullptr) + { + robj *o = (robj*)dictGetVal(de); + sds temp = serializeStoredObjectAndExpire(this, (const char*) dictGetKey(de), o); + veckeys.push_back((sds)dictGetKey(de)); + vecvals.push_back(temp); + } + m_spstorage->bulkInsert(veckeys.data(), vecvals.data(), veckeys.size()); + for (auto val : vecvals) + sdsfree(val); + dictReleaseIterator(di); + dictRelease(dictNew); + --pendingJobs; + }); +} + void redisDbPersistentData::commitChanges(const redisDbPersistentDataSnapshot **psnapshotFree) { if (m_pdbSnapshotStorageFlush) diff --git a/src/rdb.cpp b/src/rdb.cpp index 7198124c3..c940a11cc 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2365,6 +2365,8 @@ class rdbAsyncWorkThread bool fLaunched = false; bool fExit = false; std::atomic ckeysLoaded; + std::atomic cstorageWritesInFlight; + 
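For orientation, the pattern processChangesAsync() introduces above — detach the dict of changed keys, serialize its entries on a worker, and push them into the storage provider as one bulk insert while an atomic counter (the cstorageWritesInFlight member added here) tracks outstanding writes — can be sketched in isolation roughly as follows. This is an illustrative stand-in only: StorageSink, serialize() and processChangesAsyncSketch() are made-up names, and a detached std::thread stands in for the async work queue.

// Illustrative sketch of the "swap out the pending dict and bulk-insert it on a
// worker" pattern; StorageSink and serialize() are hypothetical stand-ins.
#include <atomic>
#include <string>
#include <thread>
#include <unordered_map>
#include <utility>
#include <vector>

struct StorageSink {
    // Hypothetical bulk-insert API: persist all key/value pairs in one batch.
    void bulkInsert(const std::vector<std::pair<std::string, std::string>> &batch) {
        (void)batch; // a real provider would write the batch to disk here
    }
};

static std::string serialize(const std::string &value) {
    return value; // stand-in for serializing the stored object and its expire
}

// sink and pendingJobs are assumed to outlive the queued job.
void processChangesAsyncSketch(std::unordered_map<std::string, std::string> &pending,
                               StorageSink &sink,
                               std::atomic<int> &pendingJobs) {
    ++pendingJobs;                      // account for the write before it is queued
    auto detached = std::move(pending); // detach the changed keys; new writes go to a fresh map
    pending.clear();

    std::thread([detached = std::move(detached), &sink, &pendingJobs]() mutable {
        std::vector<std::pair<std::string, std::string>> batch;
        batch.reserve(detached.size());
        for (auto &kv : detached)
            batch.emplace_back(kv.first, serialize(kv.second));
        sink.bulkInsert(batch);         // one storage round-trip instead of one per key
        --pendingJobs;                  // the loader can throttle on this counter
    }).detach();
}

In the patch itself the work runs on the async work queue rather than a detached thread, and the RDB loader throttles on cstorageWritesInFlight while memory is over the limit.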
std::atomic workerThreadDone; std::thread m_thread; long long now; @@ -2378,6 +2380,7 @@ public: : rsi(rsi), rdbflags(rdbflags), now(now) { ckeysLoaded = 0; + cstorageWritesInFlight = 0; listJobs = listCreate(); listSetFreeMethod(listJobs, listFreeMethod); } @@ -2397,6 +2400,14 @@ public: void throttle(std::unique_lock &l) { if (listLength(listJobs) > 0 && (listLength(listJobs) % 1024 == 0) && (getMaxmemoryState(NULL,NULL,NULL,NULL) != C_OK)) { cvThrottle.wait(l); + while (cstorageWritesInFlight.load(std::memory_order_relaxed) && (getMaxmemoryState(NULL,NULL,NULL,NULL) != C_OK)) { + l.unlock(); + usleep(100); + pauseExecution(); + processEventsWhileBlocked(serverTL - g_pserver->rgthreadvar); + resumeExecution(); + l.lock(); + } } } @@ -2404,9 +2415,8 @@ public: rdbInsertJob *pjob = new rdbInsertJob(job); std::unique_lock l(mutex); throttle(l); - bool fNotify = listLength(listJobs) == 0; listAddNodeTail(listJobs, pjob); - if (fNotify) + if (listLength(listJobs) == 1) cv.notify_one(); } @@ -2434,7 +2444,15 @@ public: fExit = true; cv.notify_one(); l.unlock(); + while (!workerThreadDone) { + usleep(100); + processEventsWhileBlocked(serverTL - g_pserver->rgthreadvar); + } m_thread.join(); + while (cstorageWritesInFlight.load(std::memory_order_seq_cst)) { + usleep(100); + processEventsWhileBlocked(serverTL - g_pserver->rgthreadvar); + } fLaunched = false; fExit = false; serverAssert(listLength(listJobs) == 0); @@ -2538,13 +2556,10 @@ public: { for (int idb = 0; idb < cserver.dbnum; ++idb) { - if (g_pserver->db[idb]->processChanges(false)) - g_pserver->db[idb]->commitChanges(); - if (fHighMemory && !(queue.rsi && queue.rsi->fForceSetKey)) { - g_pserver->db[idb]->removeAllCachedValues(); // During load we don't go through the normal eviction unless we're merging (i.e. 
an active replica) - fHighMemory = false; // we took care of it + if (g_pserver->m_pstorageFactory) { + g_pserver->db[idb]->processChangesAsync(queue.cstorageWritesInFlight); + fHighMemory = false; } - g_pserver->db[idb]->trackChanges(false, 1024); } if (fHighMemory) freeMemoryIfNeeded(false /*fQuickCycle*/, false /* fPreSnapshot*/); @@ -2563,6 +2578,13 @@ public: g_pserver->garbageCollector.endEpoch(vars.gcEpoch); } + + if (g_pserver->m_pstorageFactory) { + for (int idb = 0; idb < cserver.dbnum; ++idb) + g_pserver->db[idb]->processChangesAsync(queue.cstorageWritesInFlight); + } + + queue.workerThreadDone = true; std::unique_lock lock(queue.mutex); serverAssert(queue.queuefn.empty()); serverAssert(listLength(queue.listJobs) == 0); @@ -2584,8 +2606,6 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { (r->keys_since_last_callback >= g_pserver->loading_process_events_interval_keys))) { rdbAsyncWorkThread *pwthread = reinterpret_cast(r->chksum_arg); - if (pwthread) - pwthread->pauseExecution(); // We can't have the work queue modifying the database while processEventsWhileBlocked does its thing listIter li; listNode *ln; listRewind(g_pserver->masters, &li); @@ -2596,7 +2616,14 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { replicationSendNewlineToMaster(mi); } loadingProgress(r->processed_bytes); + + if (pwthread) + pwthread->pauseExecution(); processEventsWhileBlocked(serverTL - g_pserver->rgthreadvar); + if (pwthread) + pwthread->resumeExecution(); + + processModuleLoadingProgressEvent(0); robj *ping_argv[1]; @@ -2604,8 +2631,6 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { ping_argv[0] = createStringObject("PING",4); replicationFeedSlaves(g_pserver->slaves, g_pserver->replicaseldb, ping_argv, 1); decrRefCount(ping_argv[0]); - if (pwthread) - pwthread->resumeExecution(); r->keys_since_last_callback = 0; } @@ -2629,11 +2654,6 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { sds key = nullptr; bool fLastKeyExpired = false; - for (int idb = 0; idb < cserver.dbnum; ++idb) - { - g_pserver->db[idb]->trackChanges(true, 1024); - } - rdb->update_cksum = rdbLoadProgressCallback; rdb->chksum_arg = &wqueue; rdb->max_processing_chunk = g_pserver->loading_process_events_interval_bytes; @@ -2946,11 +2966,6 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { wqueue.endWork(); - for (int idb = 0; idb < cserver.dbnum; ++idb) - { - if (g_pserver->db[idb]->processChanges(false)) - g_pserver->db[idb]->commitChanges(); - } return C_OK; /* Unexpected end of file is handled here calling rdbReportReadError(): diff --git a/src/server.h b/src/server.h index c0b34defb..bd411aba8 100644 --- a/src/server.h +++ b/src/server.h @@ -1114,6 +1114,7 @@ public: // either release the global lock or keep the same global lock between the two functions as // a second look is kept to ensure writes to secondary storage are ordered bool processChanges(bool fSnapshot); + void processChangesAsync(std::atomic &pendingJobs); void commitChanges(const redisDbPersistentDataSnapshot **psnapshotFree = nullptr); // This should only be used if you look at the key, we do not fixup @@ -1278,6 +1279,7 @@ struct redisDb : public redisDbPersistentDataSnapshot using redisDbPersistentData::setExpire; using redisDbPersistentData::trackChanges; using redisDbPersistentData::processChanges; + using redisDbPersistentData::processChangesAsync; using redisDbPersistentData::commitChanges; using redisDbPersistentData::setexpireUnsafe; using 
redisDbPersistentData::setexpire; From ef41f966336d1b9248cf023fe9ace1a1a90774bc Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 1 Jun 2021 22:15:35 +0000 Subject: [PATCH 49/99] Fix lock mismatch Former-commit-id: 98eb0e778bc3a5ff7da917d39997b2fdb4adbca6 --- src/StorageCache.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/StorageCache.cpp b/src/StorageCache.cpp index 29908c7f5..e33c97ff7 100644 --- a/src/StorageCache.cpp +++ b/src/StorageCache.cpp @@ -140,5 +140,6 @@ size_t StorageCache::count() const void StorageCache::beginWriteBatch() { serverAssert(GlobalLocksAcquired()); // Otherwise we deadlock + m_lock.lock(); m_spstorage->beginWriteBatch(); } \ No newline at end of file From af81622bfc8e398105851d12f7c579d9f0c2e292 Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 1 Jun 2021 23:59:22 +0000 Subject: [PATCH 50/99] We need to refactor to gurantee the key is visible when loading subexpires. Keys may be temporarily invisible while waiting to be added to the storage Former-commit-id: 222eecb95925f7c60e28a5717d73163ad64b522b --- src/rdb.cpp | 102 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 38 deletions(-) diff --git a/src/rdb.cpp b/src/rdb.cpp index c940a11cc..962eba589 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2343,13 +2343,41 @@ void stopSaving(int success) { struct rdbInsertJob { - redisDb *db; - sds key; - robj *val; + redisDb *db = nullptr; + sds key = nullptr; + robj *val = nullptr; long long lru_clock; long long expiretime; long long lru_idle; long long lfu_freq; + std::vector> vecsubexpires; + + void addSubexpireKey(robj *subkey, long long when) { + vecsubexpires.push_back(std::make_pair(robj_sharedptr(subkey), when)); + decrRefCount(subkey); + } + + rdbInsertJob() = default; + rdbInsertJob(rdbInsertJob &&src) { + db = src.db; + src.db = nullptr; + key = src.key; + src.key = nullptr; + val = src.val; + src.val = nullptr; + lru_clock = src.lru_clock; + expiretime = src.expiretime; + lru_idle = src.lru_idle; + lfu_freq = src.lfu_freq; + vecsubexpires = std::move(src.vecsubexpires); + } + + ~rdbInsertJob() { + if (key) + sdsfree(key); + if (val) + decrRefCount(val); + } }; class rdbAsyncWorkThread @@ -2402,7 +2430,7 @@ public: cvThrottle.wait(l); while (cstorageWritesInFlight.load(std::memory_order_relaxed) && (getMaxmemoryState(NULL,NULL,NULL,NULL) != C_OK)) { l.unlock(); - usleep(100); + usleep(10); pauseExecution(); processEventsWhileBlocked(serverTL - g_pserver->rgthreadvar); resumeExecution(); @@ -2411,11 +2439,10 @@ public: } } - void enqueue(rdbInsertJob &job) { - rdbInsertJob *pjob = new rdbInsertJob(job); + void enqueue(std::unique_ptr &spjob) { std::unique_lock l(mutex); throttle(l); - listAddNodeTail(listJobs, pjob); + listAddNodeTail(listJobs, spjob.release()); if (listLength(listJobs) == 1) cv.notify_one(); } @@ -2445,12 +2472,14 @@ public: cv.notify_one(); l.unlock(); while (!workerThreadDone) { - usleep(100); + usleep(10); + pauseExecution(); processEventsWhileBlocked(serverTL - g_pserver->rgthreadvar); + resumeExecution(); } m_thread.join(); while (cstorageWritesInFlight.load(std::memory_order_seq_cst)) { - usleep(100); + usleep(10); processEventsWhileBlocked(serverTL - g_pserver->rgthreadvar); } fLaunched = false; @@ -2534,17 +2563,15 @@ public: moduleNotifyKeyspaceEvent(NOTIFY_LOADED, "loaded", &keyobj, job.db->id); replicationNotifyLoadedKey(job.db, &keyobj, job.val, job.expiretime); - } - else - { - decrRefCount(job.val); - } - } - - if (job.key != nullptr) - { - sdsfree(job.key); + for (auto &pair : 
job.vecsubexpires) + { + setExpire(NULL, job.db, &keyobj, pair.first, pair.second); + replicateSubkeyExpire(job.db, &keyobj, pair.first.get(), pair.second); + } + + job.val = nullptr; // don't free this as we moved ownership to the DB + } } /* If we have a storage provider check if we need to evict some keys to stay under our memory limit, @@ -2606,6 +2633,7 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { (r->keys_since_last_callback >= g_pserver->loading_process_events_interval_keys))) { rdbAsyncWorkThread *pwthread = reinterpret_cast(r->chksum_arg); + listIter li; listNode *ln; listRewind(g_pserver->masters, &li); @@ -2622,7 +2650,6 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { processEventsWhileBlocked(serverTL - g_pserver->rgthreadvar); if (pwthread) pwthread->resumeExecution(); - processModuleLoadingProgressEvent(0); @@ -2653,6 +2680,7 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { robj *subexpireKey = nullptr; sds key = nullptr; bool fLastKeyExpired = false; + std::unique_ptr spjob; rdb->update_cksum = rdbLoadProgressCallback; rdb->chksum_arg = &wqueue; @@ -2809,15 +2837,9 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { } else { long long expireT = strtoll(szFromObj(auxval), nullptr, 10); - sds keyT = sdsdupshared(key); - wqueue.enqueue([dbCur, subexpireKey, keyT, expireT]{ - redisObjectStack keyobj; - initStaticStringObject(keyobj,keyT); - setExpire(NULL, dbCur, &keyobj, subexpireKey, expireT); - replicateSubkeyExpire(dbCur, &keyobj, subexpireKey, expireT); - decrRefCount(subexpireKey); - sdsfree(keyT); - }); + serverAssert(spjob != nullptr); + serverAssert(sdscmp(key, spjob->key) == 0); + spjob->addSubexpireKey(subexpireKey, expireT); subexpireKey = nullptr; } } else { @@ -2901,15 +2923,16 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { bool fStaleMvccKey = (rsi) ? mvccFromObj(val) < rsi->mvccMinThreshold : false; - rdbInsertJob job; - job.db = dbCur; - job.key = sdsdupshared(key); - job.val = val; - job.lru_clock = lru_clock; - job.expiretime = expiretime; - job.lru_idle = lru_idle; - job.lfu_freq = lfu_freq; - wqueue.enqueue(job); + if (spjob != nullptr) + wqueue.enqueue(spjob); + spjob = std::make_unique(); + spjob->db = dbCur; + spjob->key = sdsdupshared(key); + spjob->val = val; + spjob->lru_clock = lru_clock; + spjob->expiretime = expiretime; + spjob->lru_idle = lru_idle; + spjob->lfu_freq = lfu_freq; val = nullptr; /* Check if the key already expired. 
This function is used when loading @@ -2932,6 +2955,9 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { lru_idle = -1; } + if (spjob != nullptr) + wqueue.enqueue(spjob); + if (key != nullptr) { sdsfree(key); From 2a6848a65a513926d3da6608d334351ed6878089 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 2 Jun 2021 23:41:36 +0000 Subject: [PATCH 51/99] Sync works single threaded properly, passes all but one testcase (which hangs) Former-commit-id: 9a6ca3a5d906b9d87fe70652d218decbb2775ac1 --- src/Makefile | 2 +- src/networking.cpp | 165 ++++++++++++++++++++++++++------------------ src/replication.cpp | 106 +++++++++++----------------- src/server.h | 9 +-- 4 files changed, 145 insertions(+), 137 deletions(-) diff --git a/src/Makefile b/src/Makefile index 966ce4400..a0ee5fe2a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -15,7 +15,7 @@ release_hdr := $(shell sh -c './mkreleasehdr.sh') uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not') -OPTIMIZATION?=-O2 -flto +OPTIMIZATION?=-O2 DEPENDENCY_TARGETS=hiredis linenoise lua rocksdb NODEPS:=clean distclean diff --git a/src/networking.cpp b/src/networking.cpp index caefd6d1e..80120d0ca 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -146,6 +146,7 @@ client *createClient(connection *conn, int iel) { c->flags = 0; c->fPendingAsyncWrite = FALSE; c->fPendingAsyncWriteHandler = FALSE; + c->fPendingReplicaWrite = FALSE; c->ctime = c->lastinteraction = g_pserver->unixtime; /* If the default user does not require authentication, the user is * directly authenticated. */ @@ -221,6 +222,10 @@ void clientInstallWriteHandler(client *c) { /* Schedule the client to write the output buffers to the socket only * if not already done and, for slaves, if the replica can actually receive * writes at this stage. */ + + if (c->flags & CLIENT_SLAVE) + serverLog(LL_NOTICE, "installing write handler"); + if (!(c->flags & CLIENT_PENDING_WRITE) && (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) @@ -272,6 +277,9 @@ void clientInstallAsyncWriteHandler(client *c) { int prepareClientToWrite(client *c) { bool fAsync = !FCorrectThread(c); // Not async if we're on the right thread + if (c->flags & CLIENT_SLAVE) + serverLog(LL_NOTICE, "got into prepareClientToWrite"); + if (!fAsync) { serverAssert(c->conn == nullptr || c->lock.fOwnLock()); } else { @@ -302,7 +310,7 @@ int prepareClientToWrite(client *c) { /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ - if (!fAsync && !clientHasPendingReplies(c) && c->repl_curr_idx == -1) clientInstallWriteHandler(c); + if (!fAsync && !clientHasPendingReplies(c) && !c->fPendingReplicaWrite) clientInstallWriteHandler(c); if (fAsync && !(c->fPendingAsyncWrite)) clientInstallAsyncWriteHandler(c); /* Authorize the caller to queue in the output buffer of this client. 
*/ @@ -320,7 +328,6 @@ void _clientAsyncReplyBufferReserve(client *c, size_t len) { clientReplyBlock *replyNew = (clientReplyBlock*)zmalloc(sizeof(clientReplyBlock) + newsize); replyNew->size = zmalloc_usable(replyNew) - sizeof(clientReplyBlock); replyNew->used = 0; - std::unique_lock tRDBLock (c->transmittedRDBLock); c->replyAsync = replyNew; } @@ -334,7 +341,6 @@ int _addReplyToBuffer(client *c, const char *s, size_t len) { if (fAsync) { serverAssert(GlobalLocksAcquired()); - std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync == nullptr || (c->replyAsync->size - c->replyAsync->used) < len) { if (c->replyAsync == nullptr) { @@ -1661,6 +1667,16 @@ client *lookupClientByID(uint64_t id) { return (c == raxNotFound) ? NULL : c; } +/* Compute the corresponding index from a replication backlog offset + * by taking the distance between the input offset and the replication backlog offset + * and applying that to the replication backlog index, wrapping around if the index + * becomes negative. + * TODO: Rewrite comment for new logic */ +long long getReplIndexFromOffset(long long offset){ + long long index = (offset - g_pserver->repl_backlog_start) % g_pserver->repl_backlog_size; + return index; +} + /* Write data in output buffers to client. Return C_OK if the client * is still valid after the call, C_ERR if it was freed because of some * error. If handler_installed is set, it will attempt to clear the @@ -1680,7 +1696,11 @@ int writeToClient(client *c, int handler_installed) { std::unique_locklock)> lock(c->lock); // serverLog(LL_NOTICE, "acq client"); + if (c->flags & CLIENT_SLAVE) + serverLog(LL_NOTICE, "writeToClient has happened"); + while(clientHasPendingReplies(c)) { + serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR); if (c->bufpos > 0) { nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); if (nwritten <= 0) break; @@ -1739,80 +1759,67 @@ int writeToClient(client *c, int handler_installed) { /* We can only directly read from the replication backlog if the client is a replica, so only attempt to do so if that's the case. */ if (c->flags & CLIENT_SLAVE) { - /* If there are no more pending replies, then we have transmitted the RDB. - * This means further replication commands will be taken straight from the - * replication backlog from now on. */ - std::unique_lock tRDBLock (c->transmittedRDBLock); - - if (c->replstate == SLAVE_STATE_ONLINE && !clientHasPendingReplies(c) && c->replyAsync == nullptr){ - c->transmittedRDB = true; - } - bool transmittedRDB = c->transmittedRDB; - tRDBLock.unlock(); - /* For replicas, we don't store all the information in the client buffer - * Most of the time (aside from immediately after synchronizing), we read - * from the replication backlog directly */ - if (c->repl_curr_idx != -1 && transmittedRDB){ - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + * We always read from the replication backlog directly */ + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); - /* copy global variables into local scope so if they change in between we don't care */ - long long repl_backlog_idx = g_pserver->repl_backlog_idx; - long long repl_backlog_size = g_pserver->repl_backlog_size; + /* Right now, we're bringing in the offStart into the scope + * If repl_batch_offStart is equal to -1, that means the mechanism is disabled + * which implies there is no data to flush and that the global offset is accurate */ + long long offStart = g_pserver->repl_batch_offStart == -1 ? 
g_pserver->master_repl_offset : g_pserver->repl_batch_offStart; + long long idxStart = getReplIndexFromOffset(offStart); + if (g_pserver->repl_batch_offStart != -1) + serverAssert(idxStart == g_pserver->repl_batch_idxStart); + else + serverAssert(idxStart == g_pserver->repl_backlog_idx); + + if (c->repl_curr_off != -1 && c->repl_curr_off != offStart){ + serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", + c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); + + long long curr_idx = getReplIndexFromOffset(c->repl_curr_off); long long nwrittenPart2 = 0; - - ssize_t nrequested; /* The number of bytes requested to write */ /* normal case with no wrap around */ - if (repl_backlog_idx >= c->repl_curr_idx){ - nrequested = repl_backlog_idx - c->repl_curr_idx; - nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_idx - c->repl_curr_idx); + if (idxStart >= curr_idx){ + nwritten = connWrite(c->conn, g_pserver->repl_backlog + curr_idx, idxStart - curr_idx); /* wrap around case, v. rare */ /* also v. buggy so there's that */ } else { - nrequested = repl_backlog_size + repl_backlog_idx - c->repl_curr_idx; - nwritten = connWrite(c->conn, g_pserver->repl_backlog + c->repl_curr_idx, repl_backlog_size - c->repl_curr_idx); + serverLog(LL_NOTICE, "ROAD OF RESISTANCE"); + nwritten = connWrite(c->conn, g_pserver->repl_backlog + curr_idx, g_pserver->repl_backlog_size - curr_idx); /* only attempt wrapping if we write the correct number of bytes */ - if (nwritten == repl_backlog_size - c->repl_curr_idx){ - long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, repl_backlog_idx); + if (nwritten == g_pserver->repl_backlog_size - curr_idx){ + long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, idxStart); if (nwrittenPart2 != -1) nwritten += nwrittenPart2; - } } - /* only update the replica's current index if bytes were sent */ - - // if (nrequested != nwritten){ - // serverLog(LL_NOTICE, "-----------------------------------------"); - // serverLog(LL_NOTICE, "AFTER THE FACT"); - // serverLog(LL_NOTICE, "requested to write: %ld", nrequested); - // serverLog(LL_NOTICE, "actually written: %ld", nwritten); - // serverLog(LL_NOTICE, "repl_backlog_idx: %lld, repl_curr_idx: %lld, repl_backlog_size: %lld", repl_backlog_idx, c->repl_curr_idx, g_pserver->repl_backlog_size); - // serverLog(LL_NOTICE, "repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - // serverLog(LL_NOTICE, "-----------------------------------------"); - // } - - - if (nwritten == nrequested && g_pserver->repl_backlog_idx == repl_backlog_idx){ - c->repl_curr_idx = -1; /* -1 denotes no more replica writes */ - } - else if (nwritten > 0) - c->repl_curr_idx = (c->repl_curr_idx + nwritten) % repl_backlog_size; - - serverAssert(c->repl_curr_idx < repl_backlog_size); - /* only increment bytes if an error didn't occur */ if (nwritten > 0){ totwritten += nwritten; c->repl_curr_off += nwritten; + if (1){ + serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", + c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); + } + serverAssert(c->repl_curr_off <= offStart); + /* If the client offset matches the global offset, we wrote all we needed to, + * in which case, there is no pending write */ + if (c->repl_curr_off == offStart){ + 
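To restate the arithmetic this block relies on: an absolute replication offset maps to a backlog index as (offset - repl_backlog_start) % repl_backlog_size, and a read that crosses the end of the circular buffer is performed in two segments. A compact standalone sketch of that mapping and copy, with memcpy standing in for connWrite() and all names invented for illustration:

// Minimal sketch of reading a byte range out of a circular replication backlog:
// map absolute offsets to buffer indices, then copy at most two segments when
// the range wraps past the end of the buffer.
#include <algorithm>
#include <cstring>
#include <vector>

struct BacklogSketch {
    std::vector<char> buf;
    long long start = 0;   // absolute offset stored at buf[0] (cf. repl_backlog_start)

    explicit BacklogSketch(size_t size) : buf(size) {}

    long long indexFromOffset(long long offset) const {
        return (offset - start) % (long long)buf.size();
    }

    // Copy the bytes covering offsets [fromOff, toOff) into out, handling wrap-around.
    // Assumes the range is still resident, i.e. toOff - fromOff <= buf.size().
    size_t copyRange(long long fromOff, long long toOff, char *out) const {
        long long idx = indexFromOffset(fromOff);
        size_t len = (size_t)(toOff - fromOff);
        size_t first = std::min(len, buf.size() - (size_t)idx); // run up to the end of the buffer
        std::memcpy(out, buf.data() + idx, first);
        if (first < len)                                        // wrapped: continue from the front
            std::memcpy(out + first, buf.data(), len - first);
        return len;
    }
};

The pair of connWrite() calls above performs the same two-segment copy against the replica's socket, with the second write only attempted once the first has sent the full run up to the end of the buffer.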
serverLog(LL_NOTICE, "good, %lld", offStart); + c->fPendingReplicaWrite = false; + } else { + serverLog(LL_NOTICE, "mismatch between repl_curr_off (%lld) and offStart (%lld)", c->repl_curr_off, offStart); + } } /* If the second part of a write didn't go through, we still need to register that */ if (nwrittenPart2 == -1) nwritten = -1; } - if (c->flags & CLIENT_SLAVE && handler_installed) - serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); + // if (c->flags & CLIENT_SLAVE && handler_installed) + // serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); } @@ -1836,12 +1843,12 @@ int writeToClient(client *c, int handler_installed) { * We just rely on data / pings received for timeout detection. */ if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } - if (!clientHasPendingReplies(c) && c->repl_curr_idx == -1) { - if(c->flags & CLIENT_SLAVE && handler_installed){ - serverLog(LL_NOTICE, "Uninstalling handler"); - serverLog(LL_NOTICE, "handler repl_curr_idx: %lld, repl_backlog_size: %lld", c->repl_curr_idx, g_pserver->repl_backlog_size); - serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - } + if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) { + // if(c->flags & CLIENT_SLAVE && handler_installed){ + // serverLog(LL_NOTICE, "Uninstalling handler"); + // serverLog(LL_NOTICE, "handler repl_curr_idx: %lld, repl_backlog_size: %lld", c->repl_curr_idx, g_pserver->repl_backlog_size); + // serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); + // } c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -1857,7 +1864,7 @@ int writeToClient(client *c, int handler_installed) { /* Write event handler. Just send data to the client. */ void sendReplyToClient(connection *conn) { client *c = (client*)connGetPrivateData(conn); - serverLog(LL_NOTICE, "called the sendreplytoclient"); + // serverLog(LL_NOTICE, "called the sendreplytoclient"); if (writeToClient(c,1) == C_ERR) { AeLocker ae; @@ -1886,7 +1893,6 @@ void ProcessPendingAsyncWrites() serverAssert(c->fPendingAsyncWrite); if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_CLOSE_AFTER_REPLY)) { - std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync != nullptr){ zfree(c->replyAsync); c->replyAsync = nullptr; @@ -1898,7 +1904,6 @@ void ProcessPendingAsyncWrites() /* since writes from master to replica can come directly from the replication backlog, * writes may have been signalled without having been copied to the replyAsync buffer, * thus causing the buffer to be NULL */ - std::unique_lock tRDBLock (c->transmittedRDBLock); if (c->replyAsync != nullptr){ int size = c->replyAsync->used; @@ -1919,7 +1924,6 @@ void ProcessPendingAsyncWrites() } c->fPendingAsyncWrite = FALSE; - tRDBLock.unlock(); // Now install the write event handler int ae_flags = AE_WRITABLE|AE_WRITE_THREADSAFE; /* For the fsync=always policy, we want that a given FD is never @@ -2032,8 +2036,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. 
*/ - if (clientHasPendingReplies(c) || c->repl_curr_idx != -1) { - serverLog(LL_NOTICE, "Setting a write handler for later"); + if (clientHasPendingReplies(c) || c->fPendingReplicaWrite) { if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) { freeClientAsync(c); } @@ -2214,6 +2217,34 @@ static void setProtocolError(const char *errstr, client *c) { c->flags |= (CLIENT_CLOSE_AFTER_REPLY|CLIENT_PROTOCOL_ERROR); } +static void printQueryBuffer(client *c) { + if (cserver.verbosity <= LL_VERBOSE || c->flags & CLIENT_MASTER) { + sds client = catClientInfoString(sdsempty(),c); + + /* Sample some protocol to given an idea about what was inside. */ + char buf[PROTO_DUMP_LEN*2]; + if (sdslen(c->querybuf)-c->qb_pos < PROTO_DUMP_LEN) { + snprintf(buf,sizeof(buf),"%s", c->querybuf+c->qb_pos); + } else { + snprintf(buf,sizeof(buf),"%.*s (... more %zu bytes ...) %.*s", PROTO_DUMP_LEN/2, c->querybuf+c->qb_pos, sdslen(c->querybuf)-c->qb_pos-PROTO_DUMP_LEN, PROTO_DUMP_LEN/2, c->querybuf+sdslen(c->querybuf)-PROTO_DUMP_LEN/2); + } + + /* Remove non printable chars. */ + char *p = buf; + while (*p != '\0') { + if (!isprint(*p)) *p = '.'; + p++; + } + + /* Log all the client and protocol info. */ + int loglevel = (c->flags & CLIENT_MASTER) ? LL_WARNING : + LL_VERBOSE; + serverLog(loglevel, + "Query buffer from client %lu: %s. %s", c->id, client, buf); + sdsfree(client); + } +} + /* Process the query buffer for client 'c', setting up the client argument * vector for command execution. Returns C_OK if after running the function * the client has a well-formed ready to be processed command, otherwise @@ -2468,6 +2499,8 @@ void parseClientCommandBuffer(client *c) { } size_t cqueriesStart = c->vecqueuedcmd.size(); + // if (c->flags & CLIENT_MASTER) + // printQueryBuffer(c); if (c->reqtype == PROTO_REQ_INLINE) { if (processInlineBuffer(c) != C_OK) break; } else if (c->reqtype == PROTO_REQ_MULTIBULK) { diff --git a/src/replication.cpp b/src/replication.cpp index d1181bdf4..97638e833 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -88,18 +88,6 @@ int RDBGeneratedByReplication = 0; void resizeReplicationBacklogForClients(long long newsize); -void setReplIdx(client *c, long long idx, long long off){ - // serverLog(LL_NOTICE, "calling this garbage function w/ idx and off: %lld, %lld, %lld", idx, off, off-idx); - // serverLog(LL_NOTICE, "Repl Index started at: %lld", c->repl_curr_idx); - if (c->repl_curr_idx == -1 && off >= c->repl_curr_off){ - if (prepareClientToWrite(c) != C_OK) return; - c->repl_curr_idx = idx; - c->repl_curr_off = off; - } - // serverLog(LL_NOTICE, "Repl Index has become: %lld", c->repl_curr_idx); - -} - /* --------------------------- Utility functions ---------------------------- */ /* Return the pointer to a string representing the replica ip:listening_port @@ -232,6 +220,7 @@ void createReplicationBacklog(void) { g_pserver->repl_backlog = (char*)zmalloc(g_pserver->repl_backlog_size, MALLOC_LOCAL); g_pserver->repl_backlog_histlen = 0; g_pserver->repl_backlog_idx = 0; + g_pserver->repl_backlog_start = g_pserver->master_repl_offset; /* We don't have any data inside our buffer, but virtually the first * byte we have is the next byte that will be generated for the @@ -284,6 +273,7 @@ void resizeReplicationBacklog(long long newsize) { g_pserver->repl_backlog = backlog; g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; g_pserver->repl_batch_idxStart = 0; + g_pserver->repl_backlog_start = g_pserver->master_repl_offset; } else { 
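A constraint worth keeping in mind around these resize paths: the backlog can never shrink below the span of data the slowest replica has not yet been sent, i.e. master_repl_offset minus the lowest replica offset now cached in g_pserver->repl_lowest_off. A rough standalone illustration of that bookkeeping, with ReplicaView and the two helpers below invented for the example (the real code also skips replicas that are still waiting for their BGSAVE or are being closed):

// Rough illustration of tracking the lowest replica offset and of the floor it
// places on the backlog size; ReplicaView is a made-up stand-in for client.
#include <algorithm>
#include <atomic>
#include <vector>

struct ReplicaView {
    long long repl_curr_off = -1;   // -1 means "not synchronized yet"
};

// Record the smallest offset any replica still needs, or -1 if none applies.
void updateLowestOffsetSketch(const std::vector<ReplicaView> &replicas,
                              std::atomic<long long> &lowestOff) {
    long long minOff = -1;
    for (const auto &r : replicas) {
        if (r.repl_curr_off == -1) continue;
        if (minOff == -1 || r.repl_curr_off < minOff) minOff = r.repl_curr_off;
    }
    lowestOff.store(minOff, std::memory_order_seq_cst);
}

// The backlog may not shrink below the data still owed to the slowest replica.
long long clampedBacklogSize(long long requested, long long masterReplOffset,
                             long long lowestOff) {
    if (lowestOff == -1) return requested;                      // every replica is caught up
    return std::max(requested, masterReplOffset - lowestOff);   // keep the unsent tail
}

resizeReplicationBacklogForClients() below applies the same floor before copying the surviving tail into the newly allocated buffer.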
zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = (char*)zmalloc(newsize); @@ -296,6 +286,7 @@ void resizeReplicationBacklog(long long newsize) { g_pserver->repl_backlog_size = newsize; } +long long getReplIndexFromOffset(long long offset); /* The above but for when clients need extra replication backlog because ??? */ void resizeReplicationBacklogForClients(long long newsize) { @@ -305,32 +296,8 @@ void resizeReplicationBacklogForClients(long long newsize) { serverLog(LL_NOTICE, "WE HAVE TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); /* get the critical client size, i.e. the size of the data unflushed to clients */ - long long earliest_off = LONG_LONG_MAX; - long long earliest_idx = -1; - listIter li; - listNode *ln; - listRewind(g_pserver->slaves, &li); - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - if (replica->repl_curr_off != -1 && replica->repl_curr_off < earliest_off){ - earliest_off = replica->repl_curr_off; - earliest_idx = replica->repl_curr_idx; - } - serverLog(LL_NOTICE, "repl_curr_idx: %lld, earlistidx: %lld", replica->repl_curr_idx, earliest_idx); - } - serverLog(LL_NOTICE, "We are starting with: master_repl_offset: %lld, repl_batch_offStart: %lld, earliest_off: %lld, " - "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, earliest_idx: %lld, repl_backlog_size: %lld", - g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, earliest_off, - g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, earliest_idx, g_pserver->repl_backlog_size - ); + long long earliest_off = g_pserver->repl_lowest_off.load(); - long long new_off = 0, new_idx = 0; - - /* if no earliest offset is found amongst the clients, they are all up to date with the flushed index */ - if (earliest_off == LONG_LONG_MAX && earliest_idx == -1){ - earliest_idx = g_pserver->repl_batch_idxStart; - earliest_off = g_pserver->repl_batch_offStart; - } if (g_pserver->repl_backlog != NULL) { /* What we actually do is to flush the old buffer and realloc a new @@ -339,17 +306,18 @@ void resizeReplicationBacklogForClients(long long newsize) { * worse often we need to alloc additional space before freeing the * old buffer. 
*/ - if (earliest_idx >= 0) { + if (earliest_off != -1) { // We need to keep critical data so we can't shrink less than the hot data in the buffer newsize = std::max(newsize, g_pserver->master_repl_offset - earliest_off); char *backlog = (char*)zmalloc(newsize); g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - earliest_off; + long long earliest_idx = getReplIndexFromOffset(earliest_off); if (g_pserver->repl_backlog_idx >= earliest_idx) { auto cbActiveBacklog = g_pserver->repl_backlog_idx - earliest_idx; memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbActiveBacklog); - serverLog(LL_NOTICE, "g_pserver->master_repl_offset: %lld, earliest_off: %lld, g_pserver->repl_backlog_idx: %lld, earliest_idx: %lld", - g_pserver->master_repl_offset, earliest_off, g_pserver->repl_backlog_idx, earliest_idx); + serverLog(LL_NOTICE, "g_pserver->master_repl_offset: %lld, earliest_off: %lld, g_pserver->repl_backlog_idx: %lld, earliest_idx: %lld, repl_backlog_start: %lld", + g_pserver->master_repl_offset, earliest_off, g_pserver->repl_backlog_idx, earliest_idx, g_pserver->repl_backlog_start); serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); } else { auto cbPhase1 = g_pserver->repl_backlog_size - earliest_idx; @@ -361,20 +329,10 @@ void resizeReplicationBacklogForClients(long long newsize) { zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = backlog; g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; - listRewind(g_pserver->slaves, &li); - /* Go through the clients and update their replication indicies */ - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - if (replica->repl_curr_idx != -1){ - replica->repl_curr_idx -= earliest_idx; - if (replica->repl_curr_idx < 0) - replica->repl_curr_idx += g_pserver->repl_backlog_size; - } - new_idx = replica->repl_curr_idx; - } g_pserver->repl_batch_idxStart -= earliest_idx; if (g_pserver->repl_batch_idxStart < 0) g_pserver->repl_batch_idxStart += g_pserver->repl_backlog_size; + g_pserver->repl_backlog_start = earliest_off; } else { zfree(g_pserver->repl_backlog); g_pserver->repl_backlog = (char*)zmalloc(newsize); @@ -382,14 +340,15 @@ void resizeReplicationBacklogForClients(long long newsize) { g_pserver->repl_backlog_idx = 0; /* Next byte we have is... the next since the buffer is empty. 
*/ g_pserver->repl_backlog_off = g_pserver->master_repl_offset+1; + g_pserver->repl_backlog_start = g_pserver->master_repl_offset; } } g_pserver->repl_backlog_size = newsize; serverLog(LL_NOTICE, "We are ending with: master_repl_offset: %lld, repl_batch_offStart: %lld, new_off: %lld, " "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, new_idx: %lld, repl_backlog_size: %lld", - g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, new_off, - g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, new_idx, g_pserver->repl_backlog_size + g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, 0LL, + g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, 0LL, g_pserver->repl_backlog_size ); } @@ -456,11 +415,6 @@ void feedReplicationBacklog(const void *ptr, size_t len) { len -= thislen; p += thislen; g_pserver->repl_backlog_histlen += thislen; - // serverLog(LL_NOTICE, "Pt2 intermediate with: master_repl_offset: %lld, repl_batch_offStart: %lld, " - // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", - // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, - // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size - // ); } if (g_pserver->repl_backlog_histlen > g_pserver->repl_backlog_size) g_pserver->repl_backlog_histlen = g_pserver->repl_backlog_size; @@ -722,7 +676,7 @@ void replicationFeedSlaves(list *replicas, int dictid, robj **argv, int argc) { void showLatestBacklog(void) { if (g_pserver->repl_backlog == NULL) return; - long long dumplen = 256; + long long dumplen = 1024; if (g_pserver->repl_backlog_histlen < dumplen) dumplen = g_pserver->repl_backlog_histlen; @@ -813,7 +767,7 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, } decrRefCount(cmdobj); } - +#define BYPASS_PSYNC /* Feed the replica 'c' with the replication backlog starting from the * specified 'offset' up to the end of the backlog. */ long long addReplyReplicationBacklog(client *c, long long offset) { @@ -854,7 +808,8 @@ long long addReplyReplicationBacklog(client *c, long long offset) { len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); #ifdef BYPASS_PSYNC - setReplIdx(c, j, offset); + c->repl_curr_off = offset - 1; + serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", c->id, getClientPeerId(c), c->repl_curr_off); #else while(len) { long long thislen = @@ -900,6 +855,11 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) { replica->psync_initial_offset = offset; replica->replstate = SLAVE_STATE_WAIT_BGSAVE_END; + + replica->repl_curr_off = offset; + + serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); + /* We are going to accumulate the incremental changes for this * replica as well. Set replicaseldb to -1 in order to force to re-emit * a SELECT statement in the replication stream. */ @@ -2006,7 +1966,6 @@ void replicationCreateMasterClient(redisMaster *mi, connection *conn, int dbid) mi->master->reploff_skipped = 0; mi->master->read_reploff = mi->master->reploff; mi->master->puser = NULL; /* This client can do everything. 
*/ - memcpy(mi->master->uuid, mi->master_uuid, UUID_BINARY_LEN); memset(mi->master_uuid, 0, UUID_BINARY_LEN); // make sure people don't use this temp storage buffer @@ -4652,12 +4611,17 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); + serverLog(LL_NOTICE, "the master repl offset is %lld", g_pserver->master_repl_offset); + showLatestBacklog(); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); + // serverLog(LL_NOTICE, "client %lu is in the party", replica->id); + + // serverLog(LL_NOTICE, "is there a write pending for %lu, %d", replica->id, replica->fPendingReplicaWrite); if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; @@ -4675,11 +4639,21 @@ void flushReplBacklogToClients() asyncUl.lock(); /* If we are online and the RDB has been sent, there is no need to feed the client buffer * We will send our replies directly from the replication backlog instead */ - std::unique_lock tRDBLock (replica->transmittedRDBLock); - if (replica->replstate == SLAVE_STATE_ONLINE && replica->transmittedRDB){ - setReplIdx(replica, g_pserver->repl_batch_idxStart, g_pserver->repl_batch_offStart); - continue; + if (replica->repl_curr_off == -1){ + replica->repl_curr_off = g_pserver->repl_batch_offStart; + + serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); + } + + /* Only if the there isn't already a pending write do we prepare the client to write */ + if (!replica->fPendingReplicaWrite){ + serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); + prepareClientToWrite(replica); + replica->fPendingReplicaWrite = true; + } + + continue; } #endif if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { diff --git a/src/server.h b/src/server.h index da1fce52e..9fdf5e0ef 100644 --- a/src/server.h +++ b/src/server.h @@ -1516,8 +1516,11 @@ struct client { long long psync_initial_offset; /* FULLRESYNC reply offset other slaves copying this replica output buffer should use. */ + long long repl_curr_idx = -1; /* Replication index sent, if this is a replica */ long long repl_curr_off = -1; + int fPendingReplicaWrite; + char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ int slave_listening_port; /* As configured with: REPLCONF listening-port */ char slave_ip[NET_IP_STR_LEN]; /* Optionally given by REPLCONF ip-address */ @@ -1577,12 +1580,8 @@ struct client { robj **argv; size_t argv_len_sumActive = 0; - bool transmittedRDB = false; /* Have we finished transmitting the RDB to this replica? 
*/ - /* If so, we can read from the replication backlog instead of the client buffer */ - // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); - fastlock transmittedRDBLock {"transmittedRDB"}; size_t argv_len_sum() const; }; @@ -2229,6 +2228,8 @@ struct redisServer { that is the next byte will'll write to.*/ long long repl_backlog_off; /* Replication "master offset" of first byte in the replication backlog buffer.*/ + long long repl_backlog_start; /* Used to compute indicies from offsets + basically, index = (offset - start) % size */ fastlock repl_backlog_lock {"replication backlog"}; time_t repl_backlog_time_limit; /* Time without slaves after the backlog gets released. */ From 0f41d34ba200ebfd70d82601be2fc0ddabbbd9df Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 3 Jun 2021 04:43:38 +0000 Subject: [PATCH 52/99] Fix collab issue #26 Former-commit-id: 2392879772a77fc30c856488b9911d194ced827b --- src/rdb.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/rdb.cpp b/src/rdb.cpp index a1a3d7301..500291794 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2797,6 +2797,19 @@ public: vars.clients_pending_asyncwrite = listCreate(); serverTL = &vars; aeSetThreadOwnsLockOverride(true); + + // We will inheret the server thread's affinity mask, clear it as we want to run on a different core. + cpu_set_t *cpuset = CPU_ALLOC(std::thread::hardware_concurrency()); + if (cpuset != nullptr) { + size_t size = CPU_ALLOC_SIZE(std::thread::hardware_concurrency()); + CPU_ZERO_S(size, cpuset); + for (unsigned i = 0; i < std::thread::hardware_concurrency(); ++i) { + CPU_SET_S(i, size, cpuset); + } + pthread_setaffinity_np(pthread_self(), size, cpuset); + CPU_FREE(cpuset); + } + for (;;) { std::unique_lock lock(queue.mutex); if (listLength(queue.listJobs) == 0 && queue.queuefn.empty()) { From 2e9c7aed031f5822ddbe955803b6a09c6c1a9aca Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 3 Jun 2021 20:44:32 +0000 Subject: [PATCH 53/99] Single threaded tests work now Former-commit-id: 0e760d7c71231c7f52102909a31fc8db1b3e2860 --- src/networking.cpp | 2 +- src/replication.cpp | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/networking.cpp b/src/networking.cpp index 80120d0ca..e8ede3338 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -3419,7 +3419,7 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { * that writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long getClientReplicationBacklogSharedUsage(client *c) { - return (!(c->flags & CLIENT_SLAVE) || c->repl_curr_idx == -1) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; + return (!(c->flags & CLIENT_SLAVE) || !c->fPendingReplicaWrite ) ? 
0 : g_pserver->master_repl_offset - c->repl_curr_off; } /* This function returns the number of bytes that Redis is diff --git a/src/replication.cpp b/src/replication.cpp index 97638e833..a7a2aa79e 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -810,6 +810,10 @@ long long addReplyReplicationBacklog(client *c, long long offset) { #ifdef BYPASS_PSYNC c->repl_curr_off = offset - 1; serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", c->id, getClientPeerId(c), c->repl_curr_off); + + /* Force the partial sync to be queued */ + prepareClientToWrite(c); + c->fPendingReplicaWrite = true; #else while(len) { long long thislen = From 667d2763c0df3ca48b52949a365d3237dbcc0c52 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Thu, 3 Jun 2021 21:47:33 +0000 Subject: [PATCH 54/99] Removed unused variables Former-commit-id: 48663bc480f7279a94c68aeebdd9721ca64f7038 --- src/config.cpp | 1 - src/evict.cpp | 1 - src/replication.cpp | 2 -- src/server.h | 6 +----- 4 files changed, 1 insertion(+), 9 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index b546ef607..9d7f14007 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -2347,7 +2347,6 @@ static int updateReplBacklogSize(long long val, long long prev, const char **err UNUSED(err); g_pserver->repl_backlog_size = prev; resizeReplicationBacklog(val); - g_pserver->repl_backlog_config_size = g_pserver->repl_backlog_size; return 1; } diff --git a/src/evict.cpp b/src/evict.cpp index e7f0a10ef..7ec223f6d 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -399,7 +399,6 @@ size_t freeMemoryGetNotCountedMemory(void) { /* also don't count the replication backlog memory * that's where the replication clients get their memory from */ - // overhead += (g_pserver->repl_backlog_size - g_pserver->repl_backlog_config_size); overhead += g_pserver->repl_backlog_size; if (g_pserver->aof_state != AOF_OFF) { diff --git a/src/replication.cpp b/src/replication.cpp index a7a2aa79e..3a48963ab 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -1129,7 +1129,6 @@ void syncCommand(client *c) { if (!strcasecmp((const char*)ptrFromObj(c->argv[0]),"psync")) { if (masterTryPartialResynchronization(c) == C_OK) { g_pserver->stat_sync_partial_ok++; - // c->repl_curr_idx = g_pserver->repl_backlog_idx; return; /* No full resync needed, return. */ } else { char *master_replid = (char*)ptrFromObj(c->argv[1]); @@ -1157,7 +1156,6 @@ void syncCommand(client *c) { connDisableTcpNoDelay(c->conn); /* Non critical if it fails. */ c->repldbfd = -1; c->flags |= CLIENT_SLAVE; - // c->repl_curr_idx = g_pserver->repl_backlog_idx; listAddNodeTail(g_pserver->slaves,c); /* Create the replication backlog if needed. */ diff --git a/src/server.h b/src/server.h index 9fdf5e0ef..2aba985ed 100644 --- a/src/server.h +++ b/src/server.h @@ -1517,8 +1517,7 @@ struct client { copying this replica output buffer should use. */ - long long repl_curr_idx = -1; /* Replication index sent, if this is a replica */ - long long repl_curr_off = -1; + long long repl_curr_off = -1; /* Replication offset of the client, only if it's a replica*/ int fPendingReplicaWrite; char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). 
*/ @@ -2416,9 +2415,6 @@ struct redisServer { uint16_t rglockSamples[s_lockContentionSamples]; unsigned ilockRingHead = 0; - long long repl_backlog_config_size = 1024*1024; /* This is a hack to ignore the resizing of the replication backlog - when using it as a defacto for the client buffer */ - bool FRdbSaveInProgress() const { return rdbThreadVars.fRdbThreadActive; } }; From da0b7a3900ba50b37a2e3ac0cac1196aa19d734d Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Fri, 4 Jun 2021 20:09:47 +0000 Subject: [PATCH 55/99] Seems to pass multithreaded test cases, thank the lord Former-commit-id: 6cbf70cfff5735f3d4ef2e980945b4b1a1f85971 --- src/networking.cpp | 19 +++++++++---------- src/replication.cpp | 15 +++++++++------ src/server.h | 1 + 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index e8ede3338..cead76998 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -277,8 +277,9 @@ void clientInstallAsyncWriteHandler(client *c) { int prepareClientToWrite(client *c) { bool fAsync = !FCorrectThread(c); // Not async if we're on the right thread - if (c->flags & CLIENT_SLAVE) + if (c->flags & CLIENT_SLAVE){ serverLog(LL_NOTICE, "got into prepareClientToWrite"); + } if (!fAsync) { serverAssert(c->conn == nullptr || c->lock.fOwnLock()); @@ -1758,7 +1759,7 @@ int writeToClient(client *c, int handler_installed) { /* We can only directly read from the replication backlog if the client is a replica, so only attempt to do so if that's the case. */ - if (c->flags & CLIENT_SLAVE) { + if (c->flags & CLIENT_SLAVE && !(c->flags & CLIENT_MONITOR)) { /* For replicas, we don't store all the information in the client buffer * We always read from the replication backlog directly */ std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); @@ -1766,14 +1767,12 @@ int writeToClient(client *c, int handler_installed) { /* Right now, we're bringing in the offStart into the scope * If repl_batch_offStart is equal to -1, that means the mechanism is disabled * which implies there is no data to flush and that the global offset is accurate */ - long long offStart = g_pserver->repl_batch_offStart == -1 ? g_pserver->master_repl_offset : g_pserver->repl_batch_offStart; + // long long offStart = g_pserver->repl_batch_offStart == -1 ? 
g_pserver->master_repl_offset : g_pserver->repl_batch_offStart; + long long offStart = c->repl_end_off; long long idxStart = getReplIndexFromOffset(offStart); - if (g_pserver->repl_batch_offStart != -1) - serverAssert(idxStart == g_pserver->repl_batch_idxStart); - else - serverAssert(idxStart == g_pserver->repl_backlog_idx); - - if (c->repl_curr_off != -1 && c->repl_curr_off != offStart){ + + serverAssert(c->repl_curr_off != -1); + if (c->repl_curr_off != offStart){ serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); @@ -1846,7 +1845,7 @@ int writeToClient(client *c, int handler_installed) { if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) { // if(c->flags & CLIENT_SLAVE && handler_installed){ // serverLog(LL_NOTICE, "Uninstalling handler"); - // serverLog(LL_NOTICE, "handler repl_curr_idx: %lld, repl_backlog_size: %lld", c->repl_curr_idx, g_pserver->repl_backlog_size); + // serverLog(LL_NOTICE, "repl_backlog_size: %lld", g_pserver->repl_backlog_size); // serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); // } c->sentlen = 0; diff --git a/src/replication.cpp b/src/replication.cpp index 3a48963ab..96bf161f9 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -382,7 +382,9 @@ void feedReplicationBacklog(const void *ptr, size_t len) { lower_bound = g_pserver->repl_batch_offStart; long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { + g_pserver->repl_backlog_lock.unlock(); flushReplBacklogToClients(); + g_pserver->repl_backlog_lock.lock(); minimumsize = g_pserver->master_repl_offset + len - lower_bound +1; if (minimumsize > g_pserver->repl_backlog_size) { @@ -809,6 +811,7 @@ long long addReplyReplicationBacklog(client *c, long long offset) { serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); #ifdef BYPASS_PSYNC c->repl_curr_off = offset - 1; + c->repl_end_off = g_pserver->master_repl_offset; serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", c->id, getClientPeerId(c), c->repl_curr_off); /* Force the partial sync to be queued */ @@ -861,6 +864,7 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) { replica->replstate = SLAVE_STATE_WAIT_BGSAVE_END; replica->repl_curr_off = offset; + replica->repl_end_off = g_pserver->master_repl_offset; serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); @@ -4634,19 +4638,18 @@ void flushReplBacklogToClients() fAsyncWrite = true; + /* If we are online and the RDB has been sent, there is no need to feed the client buffer + * We will send our replies directly from the replication backlog instead */ #ifdef BYPASS_BUFFER { std::unique_lock asyncUl(replica->lock, std::defer_lock); if (!FCorrectThread(replica)) asyncUl.lock(); - /* If we are online and the RDB has been sent, there is no need to feed the client buffer - * We will send our replies directly from the replication backlog instead */ - if (replica->repl_curr_off == -1){ - replica->repl_curr_off = g_pserver->repl_batch_offStart; - serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); + /* We should have set the repl_curr_off when synchronizing, so it 
shouldn't be -1 here */ + serverAssert(replica->repl_curr_off != -1); - } + replica->repl_end_off = g_pserver->master_repl_offset; /* Only if the there isn't already a pending write do we prepare the client to write */ if (!replica->fPendingReplicaWrite){ diff --git a/src/server.h b/src/server.h index 2aba985ed..64a2ca515 100644 --- a/src/server.h +++ b/src/server.h @@ -1518,6 +1518,7 @@ struct client { should use. */ long long repl_curr_off = -1; /* Replication offset of the client, only if it's a replica*/ + long long repl_end_off = -1; /* Replication offset to write to */ int fPendingReplicaWrite; char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ From 9db8556e91d46c5e2fb7f96ea5fb3880d56274aa Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Tue, 8 Jun 2021 23:10:53 +0000 Subject: [PATCH 56/99] Cleaned up code a bit, need to rewrite some comments to reflect new behaviour Former-commit-id: 850ec766cd71614ce9e61c12414545cd212d3878 --- src/evict.cpp | 1 - src/networking.cpp | 108 ++++---------------------- src/replication.cpp | 179 +++++++------------------------------------- src/server.cpp | 2 - src/server.h | 1 - 5 files changed, 43 insertions(+), 248 deletions(-) diff --git a/src/evict.cpp b/src/evict.cpp index 7ec223f6d..54153dc27 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -522,7 +522,6 @@ int freeMemoryIfNeeded(bool fQuickCycle, bool fPreSnapshot) { if (g_pserver->maxmemory_policy == MAXMEMORY_NO_EVICTION) goto cant_free; /* We need to free memory, but policy forbids. */ - serverLog(LL_NOTICE, "evicting i guess lol, the overhead was %ld, the repl_backlog_size, %lld", freeMemoryGetNotCountedMemory(), g_pserver->repl_backlog_size); while (mem_freed < mem_tofree) { int j, k, i; static unsigned int next_db = 0; diff --git a/src/networking.cpp b/src/networking.cpp index cead76998..aba1f1705 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -223,9 +223,6 @@ void clientInstallWriteHandler(client *c) { * if not already done and, for slaves, if the replica can actually receive * writes at this stage. */ - if (c->flags & CLIENT_SLAVE) - serverLog(LL_NOTICE, "installing write handler"); - if (!(c->flags & CLIENT_PENDING_WRITE) && (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) @@ -277,10 +274,6 @@ void clientInstallAsyncWriteHandler(client *c) { int prepareClientToWrite(client *c) { bool fAsync = !FCorrectThread(c); // Not async if we're on the right thread - if (c->flags & CLIENT_SLAVE){ - serverLog(LL_NOTICE, "got into prepareClientToWrite"); - } - if (!fAsync) { serverAssert(c->conn == nullptr || c->lock.fOwnLock()); } else { @@ -1695,10 +1688,6 @@ int writeToClient(client *c, int handler_installed) { serverAssertDebug(FCorrectThread(c)); std::unique_locklock)> lock(c->lock); - // serverLog(LL_NOTICE, "acq client"); - - if (c->flags & CLIENT_SLAVE) - serverLog(LL_NOTICE, "writeToClient has happened"); while(clientHasPendingReplies(c)) { serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR); @@ -1710,7 +1699,6 @@ int writeToClient(client *c, int handler_installed) { /* If the buffer was sent, set bufpos to zero to continue with * the remainder of the reply. 
*/ - // serverLog(LL_NOTICE, "buf pos: %d, sentlen: %ld", c->bufpos, c->sentlen); if ((int)c->sentlen == c->bufpos) { c->bufpos = 0; c->sentlen = 0; @@ -1764,33 +1752,24 @@ int writeToClient(client *c, int handler_installed) { * We always read from the replication backlog directly */ std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); - /* Right now, we're bringing in the offStart into the scope - * If repl_batch_offStart is equal to -1, that means the mechanism is disabled - * which implies there is no data to flush and that the global offset is accurate */ - // long long offStart = g_pserver->repl_batch_offStart == -1 ? g_pserver->master_repl_offset : g_pserver->repl_batch_offStart; - long long offStart = c->repl_end_off; - long long idxStart = getReplIndexFromOffset(offStart); + long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); serverAssert(c->repl_curr_off != -1); - if (c->repl_curr_off != offStart){ - serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", - c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); - - long long curr_idx = getReplIndexFromOffset(c->repl_curr_off); - long long nwrittenPart2 = 0; + if (c->repl_curr_off != c->repl_end_off){ + long long repl_curr_idx = getReplIndexFromOffset(c->repl_curr_off); + long long nwritten2ndStage = 0; /* How much was written from the start of the replication backlog + * in the event of a wrap around write */ /* normal case with no wrap around */ - if (idxStart >= curr_idx){ - nwritten = connWrite(c->conn, g_pserver->repl_backlog + curr_idx, idxStart - curr_idx); - /* wrap around case, v. rare */ - /* also v. buggy so there's that */ + if (repl_end_idx >= repl_curr_idx){ + nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, repl_end_idx - repl_curr_idx); + /* wrap around case */ } else { - serverLog(LL_NOTICE, "ROAD OF RESISTANCE"); - nwritten = connWrite(c->conn, g_pserver->repl_backlog + curr_idx, g_pserver->repl_backlog_size - curr_idx); + nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, g_pserver->repl_backlog_size - repl_curr_idx); /* only attempt wrapping if we write the correct number of bytes */ - if (nwritten == g_pserver->repl_backlog_size - curr_idx){ - long long nwrittenPart2 = connWrite(c->conn, g_pserver->repl_backlog, idxStart); - if (nwrittenPart2 != -1) - nwritten += nwrittenPart2; + if (nwritten == g_pserver->repl_backlog_size - repl_curr_idx){ + nwritten2ndStage = connWrite(c->conn, g_pserver->repl_backlog, repl_end_idx); + if (nwritten2ndStage != -1) + nwritten += nwritten2ndStage; } } @@ -1798,31 +1777,19 @@ int writeToClient(client *c, int handler_installed) { if (nwritten > 0){ totwritten += nwritten; c->repl_curr_off += nwritten; - if (1){ - serverLog(LL_NOTICE, "printing the stats for client %lu: c->repl_curr_off: %lld, repl_batch_offStart: %lld, nwritten: %ld, offStart: %lld", - c->id, c->repl_curr_off, g_pserver->repl_batch_offStart, nwritten, offStart); - } - serverAssert(c->repl_curr_off <= offStart); + serverAssert(c->repl_curr_off <= c->repl_end_off); /* If the client offset matches the global offset, we wrote all we needed to, * in which case, there is no pending write */ - if (c->repl_curr_off == offStart){ - serverLog(LL_NOTICE, "good, %lld", offStart); + if (c->repl_curr_off == c->repl_end_off){ c->fPendingReplicaWrite = false; - } else { - serverLog(LL_NOTICE, "mismatch between repl_curr_off (%lld) and offStart (%lld)", 
c->repl_curr_off, offStart); } } /* If the second part of a write didn't go through, we still need to register that */ - if (nwrittenPart2 == -1) nwritten = -1; + if (nwritten2ndStage == -1) nwritten = -1; } - - // if (c->flags & CLIENT_SLAVE && handler_installed) - // serverLog(LL_NOTICE, "Total bytes written, %ld, write handler installed?: %d", totwritten, handler_installed); - } - // serverLog(LL_NOTICE, "rel client"); g_pserver->stat_net_output_bytes += totwritten; if (nwritten == -1) { if (connGetState(c->conn) == CONN_STATE_CONNECTED) { @@ -1843,11 +1810,6 @@ int writeToClient(client *c, int handler_installed) { if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) { - // if(c->flags & CLIENT_SLAVE && handler_installed){ - // serverLog(LL_NOTICE, "Uninstalling handler"); - // serverLog(LL_NOTICE, "repl_backlog_size: %lld", g_pserver->repl_backlog_size); - // serverLog(LL_NOTICE, "handler repl_curr_off: %lld, master_repl_offset: %lld", c->repl_curr_off, g_pserver->master_repl_offset); - // } c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -1863,7 +1825,6 @@ int writeToClient(client *c, int handler_installed) { /* Write event handler. Just send data to the client. */ void sendReplyToClient(connection *conn) { client *c = (client*)connGetPrivateData(conn); - // serverLog(LL_NOTICE, "called the sendreplytoclient"); if (writeToClient(c,1) == C_ERR) { AeLocker ae; @@ -1997,7 +1958,6 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { auto vec = std::move(g_pserver->rgthreadvar[iel].clients_pending_write); processed += (int)vec.size(); - // serverLog(LL_NOTICE, "entered handleClientsWithPendingWrites"); for (client *c : vec) { serverAssertDebug(FCorrectThread(c)); @@ -2013,12 +1973,6 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* Don't write to clients that are going to be closed anyway. */ if (c->flags & CLIENT_CLOSE_ASAP) continue; - // if (c->flags & CLIENT_SLAVE){ - // if(clientHasPendingReplies(c)) - // serverLog(LL_NOTICE, "somehow the client buffer has these values: %s", c->buf); - // serverLog(LL_NOTICE, "LOL"); - // } - /* Try to write buffers to the client socket. */ if (writeToClient(c,0) == C_ERR) { @@ -2216,34 +2170,6 @@ static void setProtocolError(const char *errstr, client *c) { c->flags |= (CLIENT_CLOSE_AFTER_REPLY|CLIENT_PROTOCOL_ERROR); } -static void printQueryBuffer(client *c) { - if (cserver.verbosity <= LL_VERBOSE || c->flags & CLIENT_MASTER) { - sds client = catClientInfoString(sdsempty(),c); - - /* Sample some protocol to given an idea about what was inside. */ - char buf[PROTO_DUMP_LEN*2]; - if (sdslen(c->querybuf)-c->qb_pos < PROTO_DUMP_LEN) { - snprintf(buf,sizeof(buf),"%s", c->querybuf+c->qb_pos); - } else { - snprintf(buf,sizeof(buf),"%.*s (... more %zu bytes ...) %.*s", PROTO_DUMP_LEN/2, c->querybuf+c->qb_pos, sdslen(c->querybuf)-c->qb_pos-PROTO_DUMP_LEN, PROTO_DUMP_LEN/2, c->querybuf+sdslen(c->querybuf)-PROTO_DUMP_LEN/2); - } - - /* Remove non printable chars. */ - char *p = buf; - while (*p != '\0') { - if (!isprint(*p)) *p = '.'; - p++; - } - - /* Log all the client and protocol info. */ - int loglevel = (c->flags & CLIENT_MASTER) ? LL_WARNING : - LL_VERBOSE; - serverLog(loglevel, - "Query buffer from client %lu: %s. %s", c->id, client, buf); - sdsfree(client); - } -} - /* Process the query buffer for client 'c', setting up the client argument * vector for command execution. 
Returns C_OK if after running the function * the client has a well-formed ready to be processed command, otherwise @@ -2498,8 +2424,6 @@ void parseClientCommandBuffer(client *c) { } size_t cqueriesStart = c->vecqueuedcmd.size(); - // if (c->flags & CLIENT_MASTER) - // printQueryBuffer(c); if (c->reqtype == PROTO_REQ_INLINE) { if (processInlineBuffer(c) != C_OK) break; } else if (c->reqtype == PROTO_REQ_MULTIBULK) { diff --git a/src/replication.cpp b/src/replication.cpp index 96bf161f9..ebdb8af78 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -60,7 +60,6 @@ static void propagateMasterStaleKeys(); void updateLowestOffsetAmongReplicas(){ serverAssert(GlobalLocksAcquired()); serverAssert(!g_pserver->repl_backlog_lock.fOwnLock()); - // serverLog(LL_NOTICE, "off- have repl"); long long min_offset = LONG_LONG_MAX; listIter li; listNode *ln; @@ -73,14 +72,13 @@ void updateLowestOffsetAmongReplicas(){ if (replica->flags & CLIENT_CLOSE_ASAP) continue; std::unique_lock ul(replica->lock); - // serverLog(LL_NOTICE, "off- acq client"); - min_offset = std::min(min_offset, replica->repl_curr_off); - // serverLog(LL_NOTICE, "off- rel client"); + min_offset = std::min(min_offset, replica->repl_curr_off); } /* return -1 if no other minimum was found */ g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? -1 : min_offset, std::memory_order_seq_cst); } + /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case * the instance is configured to have no persistence. */ @@ -232,6 +230,8 @@ void createReplicationBacklog(void) { g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; } +long long getReplIndexFromOffset(long long offset); + /* This function is called when the user modifies the replication backlog * size at runtime. It is up to the function to both update the * g_pserver->repl_backlog_size and to resize the buffer and setup it so that @@ -243,8 +243,6 @@ void resizeReplicationBacklog(long long newsize) { newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; if (g_pserver->repl_backlog_size == newsize) return; - serverLog(LL_NOTICE, "WE HAD TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); - if (g_pserver->repl_backlog != NULL) { /* What we actually do is to flush the old buffer and realloc a new * empty one. It will refill with new data incrementally. @@ -252,59 +250,8 @@ void resizeReplicationBacklog(long long newsize) { * worse often we need to alloc additional space before freeing the * old buffer. 
*/ - if (g_pserver->repl_batch_idxStart >= 0) { - // We need to keep critical data so we can't shrink less than the hot data in the buffer - newsize = std::max(newsize, g_pserver->master_repl_offset - g_pserver->repl_batch_offStart); - char *backlog = (char*)zmalloc(newsize); - g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - g_pserver->repl_batch_offStart; - - if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { - auto cbActiveBacklog = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; - memcpy(backlog, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbActiveBacklog); - serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); - } else { - auto cbPhase1 = g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart; - memcpy(backlog, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1); - memcpy(backlog + cbPhase1, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); - auto cbActiveBacklog = cbPhase1 + g_pserver->repl_backlog_idx; - serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); - } - zfree(g_pserver->repl_backlog); - g_pserver->repl_backlog = backlog; - g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; - g_pserver->repl_batch_idxStart = 0; - g_pserver->repl_backlog_start = g_pserver->master_repl_offset; - } else { - zfree(g_pserver->repl_backlog); - g_pserver->repl_backlog = (char*)zmalloc(newsize); - g_pserver->repl_backlog_histlen = 0; - g_pserver->repl_backlog_idx = 0; - /* Next byte we have is... the next since the buffer is empty. */ - g_pserver->repl_backlog_off = g_pserver->master_repl_offset+1; - } - } - g_pserver->repl_backlog_size = newsize; -} - -long long getReplIndexFromOffset(long long offset); - -/* The above but for when clients need extra replication backlog because ??? */ -void resizeReplicationBacklogForClients(long long newsize) { - if (newsize < CONFIG_REPL_BACKLOG_MIN_SIZE) - newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; - if (g_pserver->repl_backlog_size == newsize) return; - - serverLog(LL_NOTICE, "WE HAVE TO RESIZE from %lld to %lld", g_pserver->repl_backlog_size, newsize); - /* get the critical client size, i.e. the size of the data unflushed to clients */ - long long earliest_off = g_pserver->repl_lowest_off.load(); - - - if (g_pserver->repl_backlog != NULL) { - /* What we actually do is to flush the old buffer and realloc a new - * empty one. It will refill with new data incrementally. - * The reason is that copying a few gigabytes adds latency and even - * worse often we need to alloc additional space before freeing the - * old buffer. */ + /* get the critical client size, i.e. 
the size of the data unflushed to clients */ + long long earliest_off = g_pserver->repl_lowest_off.load(); if (earliest_off != -1) { // We need to keep critical data so we can't shrink less than the hot data in the buffer @@ -316,8 +263,6 @@ void resizeReplicationBacklogForClients(long long newsize) { if (g_pserver->repl_backlog_idx >= earliest_idx) { auto cbActiveBacklog = g_pserver->repl_backlog_idx - earliest_idx; memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbActiveBacklog); - serverLog(LL_NOTICE, "g_pserver->master_repl_offset: %lld, earliest_off: %lld, g_pserver->repl_backlog_idx: %lld, earliest_idx: %lld, repl_backlog_start: %lld", - g_pserver->master_repl_offset, earliest_off, g_pserver->repl_backlog_idx, earliest_idx, g_pserver->repl_backlog_start); serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); } else { auto cbPhase1 = g_pserver->repl_backlog_size - earliest_idx; @@ -344,14 +289,9 @@ void resizeReplicationBacklogForClients(long long newsize) { } } g_pserver->repl_backlog_size = newsize; - - serverLog(LL_NOTICE, "We are ending with: master_repl_offset: %lld, repl_batch_offStart: %lld, new_off: %lld, " - "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, new_idx: %lld, repl_backlog_size: %lld", - g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, 0LL, - g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, 0LL, g_pserver->repl_backlog_size - ); } + void freeReplicationBacklog(void) { serverAssert(GlobalLocksAcquired()); listIter li; @@ -391,17 +331,11 @@ void feedReplicationBacklog(const void *ptr, size_t len) { // This is an emergency overflow, we better resize to fit long long newsize = std::max(g_pserver->repl_backlog_size*2, minimumsize); serverLog(LL_WARNING, "Replication backlog is too small, resizing to: %lld", newsize); - resizeReplicationBacklogForClients(newsize); + resizeReplicationBacklog(newsize); } } } - // serverLog(LL_NOTICE, "Pt2 start with: master_repl_offset: %lld, repl_batch_offStart: %lld, " - // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", - // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, - // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size - // ); - g_pserver->master_repl_offset += len; /* This is a circular buffer, so write as much data we can at every @@ -423,12 +357,6 @@ void feedReplicationBacklog(const void *ptr, size_t len) { /* Set the offset of the first byte we have in the backlog. */ g_pserver->repl_backlog_off = g_pserver->master_repl_offset - g_pserver->repl_backlog_histlen + 1; - - // serverLog(LL_NOTICE, "Pt2 end with: master_repl_offset: %lld, repl_batch_offStart: %lld, " - // "repl_backlog_idx: %lld, repl_batch_idxStart: %lld, repl_backlog_size: %lld", - // g_pserver->master_repl_offset, g_pserver->repl_batch_offStart, - // g_pserver->repl_backlog_idx, g_pserver->repl_batch_idxStart, g_pserver->repl_backlog_size - // ); } /* Wrapper for feedReplicationBacklog() that takes Redis string objects @@ -578,9 +506,7 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) /* Add the SELECT command into the backlog. 
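
The resize path must never discard bytes a replica still needs, so it copies everything from the earliest outstanding index up to the current write index, un-wrapping it to the front of the new allocation. A simplified sketch of that copy with plain malloc, hypothetical field names, and error handling omitted:

    #include <algorithm>
    #include <cstdlib>
    #include <cstring>

    struct RingBacklog {
        char *buf;
        long long size;
        long long write_idx;     // next byte the master will write
        long long earliest_idx;  // oldest byte some replica still needs
        long long histlen;       // bytes between earliest_idx and write_idx
    };

    // Reallocate the circular buffer, keeping the still-needed region and
    // moving it to the start of the new buffer so indices restart from 0.
    void resizeBacklog(RingBacklog &b, long long newsize) {
        newsize = std::max(newsize, b.histlen);        // never shrink below the hot data
        char *nbuf = (char *)malloc(newsize);
        if (b.write_idx >= b.earliest_idx) {           // contiguous
            memcpy(nbuf, b.buf + b.earliest_idx, b.histlen);
        } else {                                       // wrapped around the end
            long long tail = b.size - b.earliest_idx;
            memcpy(nbuf, b.buf + b.earliest_idx, tail);
            memcpy(nbuf + tail, b.buf, b.write_idx);
        }
        free(b.buf);
        b.buf = nbuf;
        b.size = newsize;
        b.earliest_idx = 0;
        b.write_idx = b.histlen;
    }
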
*/ /* We don't do this for advanced replication because this will be done later when it adds the whole RREPLAY command */ - if (g_pserver->repl_backlog && fSendRaw) { - feedReplicationBacklogWithObject(selectcmd); - } + if (g_pserver->repl_backlog && fSendRaw) feedReplicationBacklogWithObject(selectcmd); if (dictid < 0 || dictid >= PROTO_SHARED_SELECT_CMDS) decrRefCount(selectcmd); @@ -678,7 +604,7 @@ void replicationFeedSlaves(list *replicas, int dictid, robj **argv, int argc) { void showLatestBacklog(void) { if (g_pserver->repl_backlog == NULL) return; - long long dumplen = 1024; + long long dumplen = 256; if (g_pserver->repl_backlog_histlen < dumplen) dumplen = g_pserver->repl_backlog_histlen; @@ -769,7 +695,9 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, } decrRefCount(cmdobj); } -#define BYPASS_PSYNC + +int prepareClientToWrite(client *c); + /* Feed the replica 'c' with the replication backlog starting from the * specified 'offset' up to the end of the backlog. */ long long addReplyReplicationBacklog(client *c, long long offset) { @@ -809,26 +737,14 @@ long long addReplyReplicationBacklog(client *c, long long offset) { * split the reply in two parts if we are cross-boundary. */ len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); -#ifdef BYPASS_PSYNC + c->repl_curr_off = offset - 1; c->repl_end_off = g_pserver->master_repl_offset; - serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", c->id, getClientPeerId(c), c->repl_curr_off); /* Force the partial sync to be queued */ prepareClientToWrite(c); - c->fPendingReplicaWrite = true; -#else - while(len) { - long long thislen = - ((g_pserver->repl_backlog_size - j) < len) ? - (g_pserver->repl_backlog_size - j) : len; + c->fPendingReplicaWrite = true; - serverLog(LL_DEBUG, "[PSYNC] addReply() length: %lld", thislen); - addReplySds(c,sdsnewlen(g_pserver->repl_backlog + j, thislen)); - len -= thislen; - j = 0; - } -#endif return g_pserver->repl_backlog_histlen - skip; } @@ -866,15 +782,11 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) { replica->repl_curr_off = offset; replica->repl_end_off = g_pserver->master_repl_offset; - serverLog(LL_NOTICE, "This client %lu at addr %s synchronized to %lld", replica->id, getClientPeerId(replica), replica->repl_curr_off); - /* We are going to accumulate the incremental changes for this * replica as well. Set replicaseldb to -1 in order to force to re-emit * a SELECT statement in the replication stream. */ g_pserver->replicaseldb = -1; - serverLog(LL_NOTICE, "We are setting up here lad"); - /* Don't send this reply to slaves that approached us with * the old SYNC command. */ if (!(replica->flags & CLIENT_PRE_PSYNC)) { @@ -1179,7 +1091,6 @@ void syncCommand(client *c) { if (g_pserver->FRdbSaveInProgress() && g_pserver->rdb_child_type == RDB_CHILD_TYPE_DISK) { - serverLog(LL_NOTICE, "case 1"); /* Ok a background save is in progress. Let's check if it is a good * one for replication, i.e. if there is another replica that is * registering differences since the server forked to save. */ @@ -1211,7 +1122,6 @@ void syncCommand(client *c) { } else if (g_pserver->FRdbSaveInProgress() && g_pserver->rdb_child_type == RDB_CHILD_TYPE_SOCKET) { - serverLog(LL_NOTICE, "case 2"); /* There is an RDB child process but it is writing directly to * children sockets. We need to wait for the next BGSAVE * in order to synchronize. 
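
With the buffer bypass in place, a successful partial resync no longer copies the requested backlog range into the reply; it only works out how much retained history the replica still needs and points the replica's offset window at it, leaving the actual send to the socket write path. The arithmetic in isolation, with hypothetical names and simplified types:

    struct PsyncPlan {
        long long skip;          // retained history the replica already has
        long long len;           // bytes that still need to be streamed to it
        long long repl_curr_off; // replica is considered synchronized up to here
        long long repl_end_off;  // and may read up to the current end of the stream
    };

    PsyncPlan planPartialResync(long long requested_offset,
                                long long repl_backlog_off,      // oldest offset kept
                                long long repl_backlog_histlen,  // bytes of history kept
                                long long master_repl_offset) {  // current end of stream
        PsyncPlan p;
        p.skip = requested_offset - repl_backlog_off;
        p.len = repl_backlog_histlen - p.skip;
        p.repl_curr_off = requested_offset - 1;
        p.repl_end_off = master_repl_offset;
        return p;
    }
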
*/ @@ -1219,7 +1129,6 @@ void syncCommand(client *c) { /* CASE 3: There is no BGSAVE is progress. */ } else { - serverLog(LL_NOTICE, "case 3"); if (g_pserver->repl_diskless_sync && (c->slave_capa & SLAVE_CAPA_EOF)) { /* Diskless replication RDB child is created inside * replicationCron() since we want to delay its start a @@ -4606,9 +4515,10 @@ void _clientAsyncReplyBufferReserve(client *c, size_t len); void flushReplBacklogToClients() { serverAssert(GlobalLocksAcquired()); + /* If we have the repl backlog lock, we will deadlock */ + serverAssert(!g_pserver->repl_backlog_lock.fOwnLock()); if (g_pserver->repl_batch_offStart < 0) return; - if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { bool fAsyncWrite = false; @@ -4617,66 +4527,31 @@ void flushReplBacklogToClients() serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); serverAssert(g_pserver->repl_batch_idxStart != g_pserver->repl_backlog_idx); - serverLog(LL_NOTICE, "the master repl offset is %lld", g_pserver->master_repl_offset); - showLatestBacklog(); listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); - // serverLog(LL_NOTICE, "client %lu is in the party", replica->id); - - // serverLog(LL_NOTICE, "is there a write pending for %lu, %d", replica->id, replica->fPendingReplicaWrite); if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; - std::unique_lock ul(replica->lock, std::defer_lock); - if (FCorrectThread(replica)) - ul.lock(); - else + std::unique_lock ul(replica->lock); + if (!FCorrectThread(replica)) fAsyncWrite = true; - - /* If we are online and the RDB has been sent, there is no need to feed the client buffer - * We will send our replies directly from the replication backlog instead */ -#ifdef BYPASS_BUFFER - { - std::unique_lock asyncUl(replica->lock, std::defer_lock); - if (!FCorrectThread(replica)) - asyncUl.lock(); + /* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */ + serverAssert(replica->repl_curr_off != -1); - /* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */ - serverAssert(replica->repl_curr_off != -1); + replica->repl_end_off = g_pserver->master_repl_offset; - replica->repl_end_off = g_pserver->master_repl_offset; - - /* Only if the there isn't already a pending write do we prepare the client to write */ - if (!replica->fPendingReplicaWrite){ - serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); - prepareClientToWrite(replica); - replica->fPendingReplicaWrite = true; - } - - continue; + /* Only if the there isn't already a pending write do we prepare the client to write */ + if (!replica->fPendingReplicaWrite){ + serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); + prepareClientToWrite(replica); + replica->fPendingReplicaWrite = true; } -#endif - if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) { - long long cbCopy = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart; - serverAssert((g_pserver->master_repl_offset - g_pserver->repl_batch_offStart) == cbCopy); - serverAssert((g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart) >= (cbCopy)); - serverAssert((g_pserver->repl_batch_idxStart + cbCopy) <= g_pserver->repl_backlog_size); - - addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbCopy); - } else { - auto 
cbPhase1 = g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart; - if (fAsyncWrite) - _clientAsyncReplyBufferReserve(replica, cbPhase1 + g_pserver->repl_backlog_idx); - addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1); - addReplyProto(replica, g_pserver->repl_backlog, g_pserver->repl_backlog_idx); - serverAssert((cbPhase1 + g_pserver->repl_backlog_idx) == (g_pserver->master_repl_offset - g_pserver->repl_batch_offStart)); - } } if (fAsyncWrite) ProcessPendingAsyncWrites(); diff --git a/src/server.cpp b/src/server.cpp index 439e1aeff..362569bfa 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1796,7 +1796,6 @@ int clientsCronTrackClientsMemUsage(client *c) { mem += zmalloc_size(c); mem += c->argv_len_sum(); if (c->argv) mem += zmalloc_size(c->argv); - // serverLog(LL_NOTICE, "Mem here is : %lu", mem); /* Now that we have the memory used by the client, remove the old * value from the old category, and add it back. */ g_pserver->stat_clients_type_memory[c->client_cron_last_memory_type] -= @@ -1855,7 +1854,6 @@ void clientsCron(int iel) { while(listLength(g_pserver->clients) && iterations--) { client *c; listNode *head; - // serverLog(LL_NOTICE, "we are at iteration: %d", iterations); /* Rotate the list, take the current head, process. * This way if the client must be removed from the list it's the * first element and we don't incur into O(N) computation. */ diff --git a/src/server.h b/src/server.h index 64a2ca515..0fcd8f5ef 100644 --- a/src/server.h +++ b/src/server.h @@ -3540,7 +3540,6 @@ void tlsInit(void); void tlsInitThread(); int tlsConfigure(redisTLSContextConfig *ctx_config); -int prepareClientToWrite(client *c); class ShutdownException From 5998dc233afa724060fe2a8855d226ab98112e90 Mon Sep 17 00:00:00 2001 From: malavan Date: Wed, 9 Jun 2021 21:49:15 +0000 Subject: [PATCH 57/99] add global locks to FreeMemoryLazyFree Former-commit-id: d850ce20219a3e29a6a816ebfa0d714963d6a88b --- src/evict.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/evict.cpp b/src/evict.cpp index b673e165d..8438064a4 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -470,11 +470,13 @@ public: FreeMemoryLazyFree(FreeMemoryLazyFree&&) = default; ~FreeMemoryLazyFree() { + aeAcquireLock(); for (auto &pair : vecdictvecde) { for (auto de : pair.second) { dictFreeUnlinkedEntry(pair.first, de); } } + aeReleaseLock(); --s_clazyFreesInProgress; } From bdc29a935e886c92a05208cb47800c99af7b83fe Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 14 Jun 2021 06:32:58 +0000 Subject: [PATCH 58/99] Fix deadlock in storage cache Former-commit-id: e74711e8131cd29a1e0294fbb28e1737ee98afce --- src/StorageCache.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/StorageCache.cpp b/src/StorageCache.cpp index e33c97ff7..98c797c71 100644 --- a/src/StorageCache.cpp +++ b/src/StorageCache.cpp @@ -130,9 +130,10 @@ void StorageCache::retrieve(sds key, IStorage::callbackSingle fn) const size_t StorageCache::count() const { - std::unique_lock ul(m_lock); + std::unique_lock ul(m_lock, std::defer_lock); + bool fLocked = ul.try_lock(); size_t count = m_spstorage->count(); - if (m_pdict != nullptr) { + if (m_pdict != nullptr && fLocked) { serverAssert(bulkInsertsInProgress.load(std::memory_order_seq_cst) || count == (dictSize(m_pdict) + m_collisionCount)); } return count; @@ -140,6 +141,5 @@ size_t StorageCache::count() const void StorageCache::beginWriteBatch() { serverAssert(GlobalLocksAcquired()); // Otherwise we deadlock - m_lock.lock(); 
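
The StorageCache::count() change above is a small deadlock-avoidance idiom: the expensive consistency assertion is only run when the cache lock can be taken without blocking, so callers that already hold other locks can still ask for the count safely. A generic sketch of the same pattern:

    #include <atomic>
    #include <cassert>
    #include <cstddef>
    #include <mutex>

    class CountedCache {
        mutable std::mutex m_lock;             // guards the in-memory index
        size_t m_indexSize = 0;                // protected by m_lock
        std::atomic<size_t> m_storeSize{0};    // backing store keeps its own count

    public:
        size_t count() const {
            // Only take the lock if it is free: blocking here while the caller
            // holds other locks is exactly how the original deadlock happened.
            std::unique_lock<std::mutex> ul(m_lock, std::defer_lock);
            bool locked = ul.try_lock();
            size_t n = m_storeSize.load();
            if (locked)
                assert(n == m_indexSize);      // consistency check is best-effort
            return n;
        }
    };
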
m_spstorage->beginWriteBatch(); } \ No newline at end of file From 80dddab0c4e587332b497cbe8157f39dcc417eb4 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 14 Jun 2021 19:30:49 +0000 Subject: [PATCH 59/99] Relaxed locking, should run faster now Former-commit-id: 5cec4d026dc1766b9ecbade6ec4b9d0e75a94e0f --- src/multi.cpp | 1 - src/networking.cpp | 6 ++++++ src/replication.cpp | 18 ++++++++++-------- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/multi.cpp b/src/multi.cpp index f74748e90..589dba589 100644 --- a/src/multi.cpp +++ b/src/multi.cpp @@ -268,7 +268,6 @@ void execCommand(client *c) { if (g_pserver->repl_backlog && was_master && !is_master) { const char *execcmd = "*1\r\n$4\r\nEXEC\r\n"; updateLowestOffsetAmongReplicas(); - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); feedReplicationBacklog(execcmd,strlen(execcmd)); } afterPropagateExec(); diff --git a/src/networking.cpp b/src/networking.cpp index d8d91751d..07312a9ee 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1856,6 +1856,8 @@ int writeToClient(client *c, int handler_installed) { * We always read from the replication backlog directly */ std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + // serverLog(LL_NOTICE, "written to handler"); + long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); serverAssert(c->repl_curr_off != -1); @@ -1884,8 +1886,12 @@ int writeToClient(client *c, int handler_installed) { serverAssert(c->repl_curr_off <= c->repl_end_off); /* If the client offset matches the global offset, we wrote all we needed to, * in which case, there is no pending write */ + if (c->repl_curr_off == c->repl_end_off){ + // serverLog(LL_NOTICE, "Successfully wrote up until %lld", c->repl_end_off); c->fPendingReplicaWrite = false; + } else { + // serverLog(LL_NOTICE, "Wrote to %lld out of %lld", c->repl_curr_off, c->repl_end_off); } } diff --git a/src/replication.cpp b/src/replication.cpp index d10bac99a..a5f9c3acf 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -241,6 +241,8 @@ void resizeReplicationBacklog(long long newsize) { newsize = CONFIG_REPL_BACKLOG_MIN_SIZE; if (g_pserver->repl_backlog_size == newsize) return; + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); + if (g_pserver->repl_backlog != NULL) { /* What we actually do is to flush the old buffer and realloc a new * empty one. It will refill with new data incrementally. @@ -310,9 +312,9 @@ void freeReplicationBacklog(void) { * the backlog without incrementing the offset. 
*/ void feedReplicationBacklog(const void *ptr, size_t len) { serverAssert(GlobalLocksAcquired()); - serverAssert(g_pserver->repl_backlog_lock.fOwnLock()); const unsigned char *p = (const unsigned char*)ptr; + if (g_pserver->repl_batch_idxStart >= 0) { /* we are lower bounded by the lower client offset or the offStart if all the clients are up to date */ long long lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); @@ -320,10 +322,11 @@ void feedReplicationBacklog(const void *ptr, size_t len) { lower_bound = g_pserver->repl_batch_offStart; long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { - g_pserver->repl_backlog_lock.unlock(); flushReplBacklogToClients(); - g_pserver->repl_backlog_lock.lock(); - minimumsize = g_pserver->master_repl_offset + len - lower_bound +1; + minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; + + serverLog(LL_NOTICE, "minimumsize: %lld, g_pserver->master_repl_offset: %lld, len: %lu, lower_bound: %lld", + minimumsize, g_pserver->master_repl_offset, len, lower_bound); if (minimumsize > g_pserver->repl_backlog_size) { // This is an emergency overflow, we better resize to fit @@ -492,7 +495,6 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) bool fSendRaw = !g_pserver->fActiveReplica; updateLowestOffsetAmongReplicas(); - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); /* Send SELECT command to every replica if needed. */ if (g_pserver->replicaseldb != dictid) { @@ -655,7 +657,6 @@ void replicationFeedSlavesFromMasterStream(char *buf, size_t buflen) { if (g_pserver->repl_backlog){ updateLowestOffsetAmongReplicas(); - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); feedReplicationBacklog(buf,buflen); } } @@ -750,7 +751,7 @@ long long addReplyReplicationBacklog(client *c, long long offset) { serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); c->repl_curr_off = offset - 1; - serverLog(LL_NOTICE, "Client %s, replica offset %lld in psync", replicationGetSlaveName(c), c->repl_curr_off); + // serverLog(LL_NOTICE, "Client %s, replica offset %lld in psync", replicationGetSlaveName(c), c->repl_curr_off); c->repl_end_off = g_pserver->master_repl_offset; /* Force the partial sync to be queued */ @@ -4988,7 +4989,7 @@ void flushReplBacklogToClients() if (!canFeedReplicaReplBuffer(replica)) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; - serverLog(LL_NOTICE, "Client %s, replica offset %lld", replicationGetSlaveName(replica), replica->repl_curr_off); + // serverLog(LL_NOTICE, "Client %s, replica offset %lld", replicationGetSlaveName(replica), replica->repl_curr_off); std::unique_lock ul(replica->lock); if (!FCorrectThread(replica)) @@ -5013,6 +5014,7 @@ void flushReplBacklogToClients() // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; + updateLowestOffsetAmongReplicas(); } } From 6a65b8bbaa318429c69dadd852f62fb6364414fd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 15 Jun 2021 23:13:49 +0000 Subject: [PATCH 60/99] Optimized use of repl_lowest_off to reduce lock contention Former-commit-id: 30a957e5399fe94675f0b6d2d34c24112d5a9734 --- src/multi.cpp | 1 - src/replication.cpp | 34 ++++++++-------------------------- 2 files changed, 8 insertions(+), 27 deletions(-) diff --git a/src/multi.cpp 
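
The sizing rule here is easy to misread: after appending len bytes, the backlog must still contain everything from the lowest offset any replica has yet to read, or from the start of the unflushed batch if every replica is caught up. Pulled out as plain arithmetic with hypothetical helper names:

    #include <algorithm>

    // Smallest backlog that, after appending `len` bytes, still holds every
    // byte from `lowest_pending_off` (or from `batch_off_start` if -1) onward.
    long long requiredBacklogSize(long long master_repl_offset, long long len,
                                  long long lowest_pending_off,
                                  long long batch_off_start) {
        long long lower = (lowest_pending_off == -1) ? batch_off_start
                                                     : lowest_pending_off;
        return master_repl_offset + len - lower + 1;
    }

    // Emergency growth policy when the requirement exceeds the current size:
    // at least double, and never less than what is strictly required.
    long long emergencyBacklogSize(long long current, long long required) {
        return std::max(current * 2, required);
    }
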
b/src/multi.cpp index 589dba589..1b91a05a0 100644 --- a/src/multi.cpp +++ b/src/multi.cpp @@ -267,7 +267,6 @@ void execCommand(client *c) { * backlog with the final EXEC. */ if (g_pserver->repl_backlog && was_master && !is_master) { const char *execcmd = "*1\r\n$4\r\nEXEC\r\n"; - updateLowestOffsetAmongReplicas(); feedReplicationBacklog(execcmd,strlen(execcmd)); } afterPropagateExec(); diff --git a/src/replication.cpp b/src/replication.cpp index a5f9c3acf..cb0b562b1 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -56,29 +56,6 @@ void putSlaveOnline(client *replica); int cancelReplicationHandshake(redisMaster *mi, int reconnect); static void propagateMasterStaleKeys(); -/* gets the lowest offset amongst all of the replicas and stores it globally*/ -void updateLowestOffsetAmongReplicas(){ - serverAssert(GlobalLocksAcquired()); - serverAssert(!g_pserver->repl_backlog_lock.fOwnLock()); - long long min_offset = LONG_LONG_MAX; - listIter li; - listNode *ln; - listRewind(g_pserver->slaves, &li); - // check for potential overflow first - while ((ln = listNext(&li))) { - client *replica = (client*)listNodeValue(ln); - - if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue; - if (replica->flags & CLIENT_CLOSE_ASAP) continue; - - std::unique_lock ul(replica->lock); - - min_offset = std::min(min_offset, replica->repl_curr_off); - } - /* return -1 if no other minimum was found */ - g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? -1 : min_offset, std::memory_order_seq_cst); -} - /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case * the instance is configured to have no persistence. */ @@ -323,6 +300,10 @@ void feedReplicationBacklog(const void *ptr, size_t len) { long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; if (minimumsize > g_pserver->repl_backlog_size) { flushReplBacklogToClients(); + lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); + if (lower_bound == -1) + lower_bound = g_pserver->repl_batch_offStart; + minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; serverLog(LL_NOTICE, "minimumsize: %lld, g_pserver->master_repl_offset: %lld, len: %lu, lower_bound: %lld", @@ -494,7 +475,6 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc) serverAssert(!(listLength(slaves) != 0 && g_pserver->repl_backlog == NULL)); bool fSendRaw = !g_pserver->fActiveReplica; - updateLowestOffsetAmongReplicas(); /* Send SELECT command to every replica if needed. 
*/ if (g_pserver->replicaseldb != dictid) { @@ -656,7 +636,6 @@ void replicationFeedSlavesFromMasterStream(char *buf, size_t buflen) { } if (g_pserver->repl_backlog){ - updateLowestOffsetAmongReplicas(); feedReplicationBacklog(buf,buflen); } } @@ -4975,6 +4954,7 @@ void flushReplBacklogToClients() if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) { bool fAsyncWrite = false; + long long min_offset = LONG_LONG_MAX; // Ensure no overflow serverAssert(g_pserver->repl_batch_offStart < g_pserver->master_repl_offset); serverAssert(g_pserver->master_repl_offset - g_pserver->repl_batch_offStart <= g_pserver->repl_backlog_size); @@ -4998,6 +4978,8 @@ void flushReplBacklogToClients() /* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */ serverAssert(replica->repl_curr_off != -1); + min_offset = std::min(min_offset, replica->repl_curr_off); + replica->repl_end_off = g_pserver->master_repl_offset; /* Only if the there isn't already a pending write do we prepare the client to write */ @@ -5014,7 +4996,7 @@ void flushReplBacklogToClients() // This may be called multiple times per "frame" so update with our progress flushing to clients g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx; g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; - updateLowestOffsetAmongReplicas(); + g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? -1 : min_offset, std::memory_order_seq_cst); } } From 29f4c661799107ed6db8168ecb297b1e0b64f575 Mon Sep 17 00:00:00 2001 From: VivekSainiEQ Date: Wed, 16 Jun 2021 19:41:55 +0000 Subject: [PATCH 61/99] More code cleanup Former-commit-id: 8e9962b9b7b9093399451bf93d30e5b5d26e3d33 --- src/evict.cpp | 2 ++ src/networking.cpp | 52 +++++++++++++++------------------------------ src/replication.cpp | 51 ++++++++++++++++++-------------------------- src/server.h | 14 ++++++------ 4 files changed, 47 insertions(+), 72 deletions(-) diff --git a/src/evict.cpp b/src/evict.cpp index ba426f0ee..d336bc8b8 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -354,6 +354,8 @@ unsigned long LFUDecrAndReturn(robj_roptr o) { return counter; } +unsigned long getClientReplicationBacklogSharedUsage(client *c); + /* We don't want to count AOF buffers and slaves output buffers as * used memory: the eviction should use mostly data size. This function * returns the sum of AOF and slaves buffer. */ diff --git a/src/networking.cpp b/src/networking.cpp index 07312a9ee..767fe9c2b 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1765,15 +1765,7 @@ client *lookupClientByID(uint64_t id) { return (c == raxNotFound) ? NULL : c; } -/* Compute the corresponding index from a replication backlog offset - * by taking the distance between the input offset and the replication backlog offset - * and applying that to the replication backlog index, wrapping around if the index - * becomes negative. - * TODO: Rewrite comment for new logic */ -long long getReplIndexFromOffset(long long offset){ - long long index = (offset - g_pserver->repl_backlog_start) % g_pserver->repl_backlog_size; - return index; -} +long long getReplIndexFromOffset(long long offset); /* Write data in output buffers to client. 
Return C_OK if the client * is still valid after the call, C_ERR if it was freed because of some @@ -1832,35 +1824,31 @@ int writeToClient(client *c, int handler_installed) { } } /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT - * bytes, in a single threaded server it's a good idea to serve - * other clients as well, even if a very large request comes from - * super fast link that is always able to accept data (in real world - * scenario think about 'KEYS *' against the loopback interface). - * - * However if we are over the maxmemory limit we ignore that and - * just deliver as much data as it is possible to deliver. - * - * Moreover, we also send as much as possible if the client is - * a replica or a monitor (otherwise, on high-speed traffic, the - * replication/output buffer will grow indefinitely) */ + * bytes, in a single threaded server it's a good idea to serve + * other clients as well, even if a very large request comes from + * super fast link that is always able to accept data (in real world + * scenario think about 'KEYS *' against the loopback interface). + * + * However if we are over the maxmemory limit we ignore that and + * just deliver as much data as it is possible to deliver. + * + * Moreover, we also send as much as possible if the client is + * a replica or a monitor (otherwise, on high-speed traffic, the + * replication/output buffer will grow indefinitely) */ if (totwritten > NET_MAX_WRITES_PER_EVENT && (g_pserver->maxmemory == 0 || - zmalloc_used_memory() < g_pserver->maxmemory) && + zmalloc_used_memory() < g_pserver->maxmemory) && !(c->flags & CLIENT_SLAVE)) break; } /* We can only directly read from the replication backlog if the client is a replica, so only attempt to do so if that's the case. */ if (c->flags & CLIENT_SLAVE && !(c->flags & CLIENT_MONITOR)) { - /* For replicas, we don't store all the information in the client buffer - * We always read from the replication backlog directly */ + std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); - - // serverLog(LL_NOTICE, "written to handler"); - long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); - serverAssert(c->repl_curr_off != -1); + if (c->repl_curr_off != c->repl_end_off){ long long repl_curr_idx = getReplIndexFromOffset(c->repl_curr_off); long long nwritten2ndStage = 0; /* How much was written from the start of the replication backlog @@ -1884,14 +1872,9 @@ int writeToClient(client *c, int handler_installed) { totwritten += nwritten; c->repl_curr_off += nwritten; serverAssert(c->repl_curr_off <= c->repl_end_off); - /* If the client offset matches the global offset, we wrote all we needed to, - * in which case, there is no pending write */ - + /* If the client's current offset matches the last offset it can read from, there is no pending write */ if (c->repl_curr_off == c->repl_end_off){ - // serverLog(LL_NOTICE, "Successfully wrote up until %lld", c->repl_end_off); c->fPendingReplicaWrite = false; - } else { - // serverLog(LL_NOTICE, "Wrote to %lld out of %lld", c->repl_curr_off, c->repl_end_off); } } @@ -3719,8 +3702,7 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { } } -/* In the case of a replica client, it is possible (and very likely) - * that writes to said replica are using data from the replication backlog +/* In the case of a replica client, writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long 
getClientReplicationBacklogSharedUsage(client *c) { return (!(c->flags & CLIENT_SLAVE) || !c->fPendingReplicaWrite ) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; diff --git a/src/replication.cpp b/src/replication.cpp index cb0b562b1..b9465680e 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -47,8 +47,6 @@ #include #include -#define BYPASS_BUFFER - void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, connection *conn); void replicationSendAck(redisMaster *mi); @@ -61,8 +59,6 @@ static void propagateMasterStaleKeys(); * the instance is configured to have no persistence. */ int RDBGeneratedByReplication = 0; -void resizeReplicationBacklogForClients(long long newsize); - /* --------------------------- Utility functions ---------------------------- */ /* Return the pointer to a string representing the replica ip:listening_port @@ -205,7 +201,14 @@ void createReplicationBacklog(void) { g_pserver->repl_batch_offStart = g_pserver->master_repl_offset; } -long long getReplIndexFromOffset(long long offset); +/* Compute the corresponding index from a replication backlog offset + * Since this computation needs the size of the replication backlog, + * you need to have the repl_backlog_lock in order to call it */ +long long getReplIndexFromOffset(long long offset){ + serverAssert(g_pserver->repl_backlog_lock.fOwnLock()); + long long index = (offset - g_pserver->repl_backlog_start) % g_pserver->repl_backlog_size; + return index; +} /* This function is called when the user modifies the replication backlog * size at runtime. It is up to the function to both update the @@ -293,7 +296,7 @@ void feedReplicationBacklog(const void *ptr, size_t len) { if (g_pserver->repl_batch_idxStart >= 0) { - /* we are lower bounded by the lower client offset or the offStart if all the clients are up to date */ + /* We are lower bounded by the lowest replica offset, or the batch offset start if not applicable */ long long lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst); if (lower_bound == -1) lower_bound = g_pserver->repl_batch_offStart; @@ -306,9 +309,6 @@ void feedReplicationBacklog(const void *ptr, size_t len) { minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1; - serverLog(LL_NOTICE, "minimumsize: %lld, g_pserver->master_repl_offset: %lld, len: %lu, lower_bound: %lld", - minimumsize, g_pserver->master_repl_offset, len, lower_bound); - if (minimumsize > g_pserver->repl_backlog_size) { // This is an emergency overflow, we better resize to fit long long newsize = std::max(g_pserver->repl_backlog_size*2, minimumsize); @@ -635,9 +635,7 @@ void replicationFeedSlavesFromMasterStream(char *buf, size_t buflen) { printf("\n"); } - if (g_pserver->repl_backlog){ - feedReplicationBacklog(buf,buflen); - } + if (g_pserver->repl_backlog) feedReplicationBacklog(buf,buflen); } void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, int argc) { @@ -689,13 +687,12 @@ int prepareClientToWrite(client *c); /* Feed the replica 'c' with the replication backlog starting from the * specified 'offset' up to the end of the backlog. 
*/ long long addReplyReplicationBacklog(client *c, long long offset) { - long long j, skip, len; + long long skip, len; serverLog(LL_DEBUG, "[PSYNC] Replica request offset: %lld", offset); if (g_pserver->repl_backlog_histlen == 0) { serverLog(LL_DEBUG, "[PSYNC] Backlog history len is zero"); - serverLog(LL_NOTICE, "REOAD TO RESIST"); c->repl_curr_off = g_pserver->master_repl_offset; c->repl_end_off = g_pserver->master_repl_offset; return 0; @@ -714,30 +711,20 @@ long long addReplyReplicationBacklog(client *c, long long offset) { skip = offset - g_pserver->repl_backlog_off; serverLog(LL_DEBUG, "[PSYNC] Skipping: %lld", skip); - /* Point j to the oldest byte, that is actually our - * g_pserver->repl_backlog_off byte. */ - j = (g_pserver->repl_backlog_idx + - (g_pserver->repl_backlog_size-g_pserver->repl_backlog_histlen)) % - g_pserver->repl_backlog_size; - serverLog(LL_DEBUG, "[PSYNC] Index of first byte: %lld", j); - - /* Discard the amount of data to seek to the specified 'offset'. */ - j = (j + skip) % g_pserver->repl_backlog_size; - - /* Feed replica with data. Since it is a circular buffer we have to - * split the reply in two parts if we are cross-boundary. */ len = g_pserver->repl_backlog_histlen - skip; serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len); + /* Set the start and end offsets for the replica so that a future + * writeToClient will send the backlog from the given offset to + * the current end of the backlog to said replica */ c->repl_curr_off = offset - 1; - // serverLog(LL_NOTICE, "Client %s, replica offset %lld in psync", replicationGetSlaveName(c), c->repl_curr_off); c->repl_end_off = g_pserver->master_repl_offset; /* Force the partial sync to be queued */ prepareClientToWrite(c); c->fPendingReplicaWrite = true; - return g_pserver->repl_backlog_histlen - skip; + return len; } /* Return the offset to provide as reply to the PSYNC command received @@ -4963,14 +4950,18 @@ void flushReplBacklogToClients() listIter li; listNode *ln; listRewind(g_pserver->slaves, &li); + /* We don't actually write any data in this function since we send data + * directly from the replication backlog to replicas in writeToClient. + * + * What we do however, is set the end offset of each replica here. This way, + * future calls to writeToClient will know up to where in the replication + * backlog is valid for writing. */ while ((ln = listNext(&li))) { client *replica = (client*)listNodeValue(ln); if (!canFeedReplicaReplBuffer(replica)) continue; if (replica->flags & CLIENT_CLOSE_ASAP) continue; - // serverLog(LL_NOTICE, "Client %s, replica offset %lld", replicationGetSlaveName(replica), replica->repl_curr_off); - std::unique_lock ul(replica->lock); if (!FCorrectThread(replica)) fAsyncWrite = true; diff --git a/src/server.h b/src/server.h index cb3973969..0d6f766ce 100644 --- a/src/server.h +++ b/src/server.h @@ -1590,9 +1590,11 @@ struct client { copying this replica output buffer should use. */ - long long repl_curr_off = -1; /* Replication offset of the client, only if it's a replica*/ - long long repl_end_off = -1; /* Replication offset to write to */ - int fPendingReplicaWrite; + long long repl_curr_off = -1;/* Replication offset of the replica, also where in the backlog we need to start from + * when sending data to this replica. */ + long long repl_end_off = -1; /* Replication offset to write to, stored in the replica, as opposed to using the global offset + * to prevent needing the global lock */ + int fPendingReplicaWrite; /* Is there a write queued for this replica? 
*/ char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ int slave_listening_port; /* As configured with: REPLCONF listening-port */ @@ -2375,8 +2377,8 @@ struct redisServer { int repl_diskless_load; /* Slave parse RDB directly from the socket. * see REPL_DISKLESS_LOAD_* enum */ int repl_diskless_sync_delay; /* Delay to start a diskless repl BGSAVE. */ - std::atomic repl_lowest_off; /* The lowest offset amongst all clients - Updated before calls to feed the replication backlog */ + std::atomic repl_lowest_off; /* The lowest offset amongst all replicas + -1 if there are no replicas */ /* Replication (replica) */ list *masters; int enable_multimaster; @@ -2825,7 +2827,6 @@ sds getAllClientsInfoString(int type); void rewriteClientCommandVector(client *c, int argc, ...); void rewriteClientCommandArgument(client *c, int i, robj *newval); void replaceClientCommandVector(client *c, int argc, robj **argv); -unsigned long getClientReplicationBacklogSharedUsage(client *c); unsigned long getClientOutputBufferMemoryUsage(client *c); int freeClientsInAsyncFreeQueue(int iel); void asyncCloseClientOnOutputBufferLimitReached(client *c); @@ -3017,7 +3018,6 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, void rdbPipeWriteHandlerConnRemoved(struct connection *conn); void replicationNotifyLoadedKey(redisDb *db, robj_roptr key, robj_roptr val, long long expire); void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long long expire); -void updateLowestOffsetAmongReplicas(void); void clearFailoverState(void); void updateFailoverStatus(void); void abortFailover(redisMaster *mi, const char *err); From 815ebe1e6b0b7ad13db30dc342e8d6cb92330651 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 25 Jun 2021 01:54:38 +0000 Subject: [PATCH 62/99] Remove fPendingReplicaWrite flag which can instead be calculated on demand Former-commit-id: ae26afd13f955eb230b5c2cab20ec90db9b714ad --- src/networking.cpp | 128 +++++++++++++++++++++----------------------- src/replication.cpp | 8 +-- src/server.h | 5 +- 3 files changed, 67 insertions(+), 74 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 767fe9c2b..690b03a51 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -158,7 +158,6 @@ client *createClient(connection *conn, int iel) { c->flags = 0; c->fPendingAsyncWrite = FALSE; c->fPendingAsyncWriteHandler = FALSE; - c->fPendingReplicaWrite = FALSE; c->ctime = c->lastinteraction = g_pserver->unixtime; /* If the default user does not require authentication, the user is * directly authenticated. */ @@ -318,7 +317,7 @@ int prepareClientToWrite(client *c) { /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ - if (!fAsync && !clientHasPendingReplies(c) && !c->fPendingReplicaWrite) clientInstallWriteHandler(c); + if (!fAsync && (c->flags & CLIENT_SLAVE || !clientHasPendingReplies(c))) clientInstallWriteHandler(c); if (fAsync && !(c->fPendingAsyncWrite)) clientInstallAsyncWriteHandler(c); /* Authorize the caller to queue in the output buffer of this client. */ @@ -1132,7 +1131,7 @@ void copyClientOutputBuffer(client *dst, client *src) { /* Return true if the specified client has pending reply buffers to write to * the socket. 
*/ int clientHasPendingReplies(client *c) { - return (c->bufpos || listLength(c->reply)); + return (c->bufpos || listLength(c->reply) || c->FPendingReplicaWrite()); } static std::atomic rgacceptsInFlight[MAX_EVENT_LOOPS]; @@ -1785,66 +1784,9 @@ int writeToClient(client *c, int handler_installed) { std::unique_locklock)> lock(c->lock); - while(clientHasPendingReplies(c)) { - serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR); - if (c->bufpos > 0) { - nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); - if (nwritten <= 0) break; - c->sentlen += nwritten; - totwritten += nwritten; - - /* If the buffer was sent, set bufpos to zero to continue with - * the remainder of the reply. */ - if ((int)c->sentlen == c->bufpos) { - c->bufpos = 0; - c->sentlen = 0; - } - } else { - o = (clientReplyBlock*)listNodeValue(listFirst(c->reply)); - if (o->used == 0) { - c->reply_bytes -= o->size; - listDelNode(c->reply,listFirst(c->reply)); - continue; - } - - nwritten = connWrite(c->conn, o->buf() + c->sentlen, o->used - c->sentlen); - if (nwritten <= 0) break; - c->sentlen += nwritten; - totwritten += nwritten; - - /* If we fully sent the object on head go to the next one */ - if (c->sentlen == o->used) { - c->reply_bytes -= o->size; - listDelNode(c->reply,listFirst(c->reply)); - c->sentlen = 0; - /* If there are no longer objects in the list, we expect - * the count of reply bytes to be exactly zero. */ - if (listLength(c->reply) == 0) - serverAssert(c->reply_bytes == 0); - } - } - /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT - * bytes, in a single threaded server it's a good idea to serve - * other clients as well, even if a very large request comes from - * super fast link that is always able to accept data (in real world - * scenario think about 'KEYS *' against the loopback interface). - * - * However if we are over the maxmemory limit we ignore that and - * just deliver as much data as it is possible to deliver. - * - * Moreover, we also send as much as possible if the client is - * a replica or a monitor (otherwise, on high-speed traffic, the - * replication/output buffer will grow indefinitely) */ - if (totwritten > NET_MAX_WRITES_PER_EVENT && - (g_pserver->maxmemory == 0 || - zmalloc_used_memory() < g_pserver->maxmemory) && - !(c->flags & CLIENT_SLAVE)) break; - } - /* We can only directly read from the replication backlog if the client is a replica, so only attempt to do so if that's the case. 
*/ if (c->flags & CLIENT_SLAVE && !(c->flags & CLIENT_MONITOR)) { - std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); serverAssert(c->repl_curr_off != -1); @@ -1872,15 +1814,67 @@ int writeToClient(client *c, int handler_installed) { totwritten += nwritten; c->repl_curr_off += nwritten; serverAssert(c->repl_curr_off <= c->repl_end_off); - /* If the client's current offset matches the last offset it can read from, there is no pending write */ - if (c->repl_curr_off == c->repl_end_off){ - c->fPendingReplicaWrite = false; - } } /* If the second part of a write didn't go through, we still need to register that */ if (nwritten2ndStage == -1) nwritten = -1; } + } else { + while(clientHasPendingReplies(c)) { + serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR); + if (c->bufpos > 0) { + nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); + if (nwritten <= 0) break; + c->sentlen += nwritten; + totwritten += nwritten; + + /* If the buffer was sent, set bufpos to zero to continue with + * the remainder of the reply. */ + if ((int)c->sentlen == c->bufpos) { + c->bufpos = 0; + c->sentlen = 0; + } + } else { + o = (clientReplyBlock*)listNodeValue(listFirst(c->reply)); + if (o->used == 0) { + c->reply_bytes -= o->size; + listDelNode(c->reply,listFirst(c->reply)); + continue; + } + + nwritten = connWrite(c->conn, o->buf() + c->sentlen, o->used - c->sentlen); + if (nwritten <= 0) break; + c->sentlen += nwritten; + totwritten += nwritten; + + /* If we fully sent the object on head go to the next one */ + if (c->sentlen == o->used) { + c->reply_bytes -= o->size; + listDelNode(c->reply,listFirst(c->reply)); + c->sentlen = 0; + /* If there are no longer objects in the list, we expect + * the count of reply bytes to be exactly zero. */ + if (listLength(c->reply) == 0) + serverAssert(c->reply_bytes == 0); + } + } + /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT + * bytes, in a single threaded server it's a good idea to serve + * other clients as well, even if a very large request comes from + * super fast link that is always able to accept data (in real world + * scenario think about 'KEYS *' against the loopback interface). + * + * However if we are over the maxmemory limit we ignore that and + * just deliver as much data as it is possible to deliver. + * + * Moreover, we also send as much as possible if the client is + * a replica or a monitor (otherwise, on high-speed traffic, the + * replication/output buffer will grow indefinitely) */ + if (totwritten > NET_MAX_WRITES_PER_EVENT && + (g_pserver->maxmemory == 0 || + zmalloc_used_memory() < g_pserver->maxmemory) && + !(c->flags & CLIENT_SLAVE)) break; + } } g_pserver->stat_net_output_bytes += totwritten; @@ -1900,7 +1894,7 @@ int writeToClient(client *c, int handler_installed) { * We just rely on data / pings received for timeout detection. */ if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime; } - if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) { + if (!clientHasPendingReplies(c)) { c->sentlen = 0; if (handler_installed) connSetWriteHandler(c->conn, NULL); @@ -2080,7 +2074,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. 
*/ - if (clientHasPendingReplies(c) || c->fPendingReplicaWrite) { + if (clientHasPendingReplies(c)) { if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) { freeClientAsync(c); } @@ -3705,7 +3699,7 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { /* In the case of a replica client, writes to said replica are using data from the replication backlog * as opposed to it's own internal buffer, this number should keep track of that */ unsigned long getClientReplicationBacklogSharedUsage(client *c) { - return (!(c->flags & CLIENT_SLAVE) || !c->fPendingReplicaWrite ) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; + return (!(c->flags & CLIENT_SLAVE) || !c->FPendingReplicaWrite() ) ? 0 : g_pserver->master_repl_offset - c->repl_curr_off; } /* This function returns the number of bytes that Redis is diff --git a/src/replication.cpp b/src/replication.cpp index b9465680e..94b35e314 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -722,7 +722,6 @@ long long addReplyReplicationBacklog(client *c, long long offset) { /* Force the partial sync to be queued */ prepareClientToWrite(c); - c->fPendingReplicaWrite = true; return len; } @@ -4974,11 +4973,8 @@ void flushReplBacklogToClients() replica->repl_end_off = g_pserver->master_repl_offset; /* Only if the there isn't already a pending write do we prepare the client to write */ - if (!replica->fPendingReplicaWrite){ - serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); - prepareClientToWrite(replica); - replica->fPendingReplicaWrite = true; - } + serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); + prepareClientToWrite(replica); } if (fAsyncWrite) diff --git a/src/server.h b/src/server.h index 0d6f766ce..07608632e 100644 --- a/src/server.h +++ b/src/server.h @@ -1594,7 +1594,6 @@ struct client { * when sending data to this replica. */ long long repl_end_off = -1; /* Replication offset to write to, stored in the replica, as opposed to using the global offset * to prevent needing the global lock */ - int fPendingReplicaWrite; /* Is there a write queued for this replica? */ char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */ int slave_listening_port; /* As configured with: REPLCONF listening-port */ @@ -1657,6 +1656,10 @@ struct client { robj **argv; size_t argv_len_sumActive = 0; + bool FPendingReplicaWrite() const { + return repl_curr_off != repl_end_off; + } + // post a function from a non-client thread to run on its client thread bool postFunction(std::function fn, bool fLock = true); size_t argv_len_sum() const; From e6a82692b7be9d62a619f9968e0f9ae5f90ca71e Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 25 Jun 2021 02:31:17 +0000 Subject: [PATCH 63/99] Avoid holding the lockPendingWrite for too long and deadlocking due to lock inversion Former-commit-id: a4b49fbec60e2333a4407d24383ae204d5d2b413 --- src/networking.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/networking.cpp b/src/networking.cpp index 690b03a51..5ced371d1 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -2018,7 +2018,6 @@ void ProcessPendingAsyncWrites() * need to use a syscall in order to install the writable event handler, * get it called, and so forth. 
*/ int handleClientsWithPendingWrites(int iel, int aof_state) { - std::unique_lock lockf(g_pserver->rgthreadvar[iel].lockPendingWrite); int processed = 0; serverAssert(iel == (serverTL - g_pserver->rgthreadvar)); @@ -2041,7 +2040,9 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { ae_flags |= AE_BARRIER; } + std::unique_lock lockf(g_pserver->rgthreadvar[iel].lockPendingWrite); auto vec = std::move(g_pserver->rgthreadvar[iel].clients_pending_write); + lockf.unlock(); processed += (int)vec.size(); for (client *c : vec) { From 5949e253cab606c0bd7616e00c42e7ebcfca872a Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 25 Jun 2021 02:46:32 +0000 Subject: [PATCH 64/99] remove unnecessary newline Former-commit-id: 532af9cd0286ac6ece6f401c42aea18e36d16f7c --- src/replication.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/replication.cpp b/src/replication.cpp index 94b35e314..e9a503167 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -4975,7 +4975,6 @@ void flushReplBacklogToClients() /* Only if the there isn't already a pending write do we prepare the client to write */ serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset); prepareClientToWrite(replica); - } if (fAsyncWrite) ProcessPendingAsyncWrites(); From 9e8a28c0ed3582cfee0a6ddb7357498bed119759 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 23 Jul 2021 16:02:29 +0000 Subject: [PATCH 65/99] We cannot create time events on threads that don't have an event loop Former-commit-id: 3812586a41bb7f974b5d9820c8a68ff34ee8aa9a --- src/evict.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evict.cpp b/src/evict.cpp index da2a05a55..802784633 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -834,7 +834,7 @@ int performEvictions(bool fPreSnapshot) { * memory, don't want to spend too much time here. */ if (elapsedUs(evictionTimer) > eviction_time_limit_us) { // We still need to free memory - start eviction timer proc - if (!isEvictionProcRunning) { + if (!isEvictionProcRunning && serverTL->el != nullptr) { isEvictionProcRunning = 1; aeCreateTimeEvent(serverTL->el, 0, evictionTimeProc, NULL, NULL); From a8685235c30c683a0c455118156dc0a6cfbc0f1e Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 23 Jul 2021 19:31:22 +0000 Subject: [PATCH 66/99] Initialize el so we can detect if it is null Former-commit-id: ec0f833ea17c668971893aa8f198d22da2e1d289 --- src/server.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server.h b/src/server.h index 2c0a69b8b..5aea41b23 100644 --- a/src/server.h +++ b/src/server.h @@ -1999,7 +1999,7 @@ public: // Per-thread variabels that may be accessed without a lock struct redisServerThreadVars { - aeEventLoop *el; + aeEventLoop *el = nullptr; socketFds ipfd; /* TCP socket file descriptors */ socketFds tlsfd; /* TLS socket file descriptors */ int in_eval; /* Are we inside EVAL? */ From aef0bd877fc23c60f32bcdb177ddfc40737f6003 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 26 Jul 2021 22:30:31 +0000 Subject: [PATCH 67/99] Fix issue collab #32 Former-commit-id: 0d192cf00ebe9fc0d898404b86e1173476edaefb --- src/evict.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evict.cpp b/src/evict.cpp index 802784633..84bf21c36 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -832,7 +832,7 @@ int performEvictions(bool fPreSnapshot) { /* After some time, exit the loop early - even if memory limit * hasn't been reached. If we suddenly need to free a lot of * memory, don't want to spend too much time here. 
*/ - if (elapsedUs(evictionTimer) > eviction_time_limit_us) { + if (g_pserver->m_pstorageFactory == nullptr && elapsedUs(evictionTimer) > eviction_time_limit_us) { // We still need to free memory - start eviction timer proc if (!isEvictionProcRunning && serverTL->el != nullptr) { isEvictionProcRunning = 1; From c6a0c7b04f873aede35622c7d544710582bcc520 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 11 Aug 2021 01:19:39 +0000 Subject: [PATCH 68/99] Fix crash in load with storage provider set Former-commit-id: 6990818b7ca647819b50ae04224778e6f8f12a1a --- src/rdb.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/rdb.cpp b/src/rdb.cpp index cef3d15dd..ec4be50e0 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -3195,8 +3195,10 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { ckeysLoaded++; if (g_pserver->m_pstorageFactory && (ckeysLoaded % 128) == 0) { - g_pserver->garbageCollector.endEpoch(serverTL->gcEpoch); - serverTL->gcEpoch = g_pserver->garbageCollector.startEpoch(); + if (!serverTL->gcEpoch.isReset()) { + g_pserver->garbageCollector.endEpoch(serverTL->gcEpoch); + serverTL->gcEpoch = g_pserver->garbageCollector.startEpoch(); + } } if (g_pserver->key_load_delay) From 1a66c5fdab104e87c3ac4704ffd4db570a9b7bdb Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 11 Aug 2021 02:53:44 +0000 Subject: [PATCH 69/99] Fix crash loading RDB on start with a storage provider set Former-commit-id: ace6a44399a15ab48a7ef798dc656dbc1207b58d --- src/server.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/server.cpp b/src/server.cpp index a88e13d74..ef1039cea 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -6757,6 +6757,7 @@ void loadDataFromDisk(void) { serverLog(LL_NOTICE, "Loading the RDB even though we have a storage provider because the database is empty"); } + serverTL->gcEpoch = g_pserver->garbageCollector.startEpoch(); if (g_pserver->aof_state == AOF_ON) { if (loadAppendOnlyFile(g_pserver->aof_filename) == C_OK) serverLog(LL_NOTICE,"DB loaded from append only file: %.3f seconds",(float)(ustime()-start)/1000000); @@ -6802,6 +6803,8 @@ void loadDataFromDisk(void) { exit(1); } } + g_pserver->garbageCollector.endEpoch(serverTL->gcEpoch); + serverTL->gcEpoch.reset(); } void redisOutOfMemoryHandler(size_t allocation_size) { From f3fb4e320997c8e182f4444af1fac8265202eea4 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 1 Sep 2021 04:15:28 +0000 Subject: [PATCH 70/99] Don't be in tracking mode during load as processChangesAsync works outside the normal system Former-commit-id: 8d31ce6eafea1cea2f9f4ea25e44306efef28fa3 --- src/db.cpp | 1 + src/rdb.cpp | 23 +++++++++++++++++++++++ src/server.h | 2 ++ 3 files changed, 26 insertions(+) diff --git a/src/db.cpp b/src/db.cpp index 0079598f2..c883bddba 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -2902,6 +2902,7 @@ bool redisDbPersistentData::processChanges(bool fSnapshot) void redisDbPersistentData::processChangesAsync(std::atomic &pendingJobs) { ++pendingJobs; + serverAssert(!m_fAllChanged); dictEmpty(m_dictChanged, nullptr); dict *dictNew = dictCreate(&dbDictType, nullptr); std::swap(dictNew, m_pdict); diff --git a/src/rdb.cpp b/src/rdb.cpp index ec4be50e0..4858f75f1 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2927,6 +2927,16 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { bool fLastKeyExpired = false; std::unique_ptr spjob; + // If we're tracking changes we need to reset this + bool fTracking = g_pserver->db[0]->FTrackingChanges(); + if (fTracking) { + // We don't want to track here 
because processChangesAsync is outside the normal scope handling + for (int idb = 0; idb < cserver.dbnum; ++idb) { + if (g_pserver->db[idb]->processChanges(false)) + g_pserver->db[idb]->commitChanges(); + } + } + rdb->update_cksum = rdbLoadProgressCallback; rdb->chksum_arg = &wqueue; rdb->max_processing_chunk = g_pserver->loading_process_events_interval_bytes; @@ -3249,6 +3259,12 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { } wqueue.endWork(); + if (fTracking) { + // Reset track changes + for (int idb = 0; idb < cserver.dbnum; ++idb) { + g_pserver->db[idb]->trackChanges(false); + } + } return C_OK; @@ -3257,6 +3273,13 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { * the RDB file from a socket during initial SYNC (diskless replica mode), * we'll report the error to the caller, so that we can retry. */ eoferr: + if (fTracking) { + // Reset track changes + for (int idb = 0; idb < cserver.dbnum; ++idb) { + g_pserver->db[idb]->trackChanges(false); + } + } + wqueue.endWork(); if (key != nullptr) { diff --git a/src/server.h b/src/server.h index d35f12c4a..197a7a7be 100644 --- a/src/server.h +++ b/src/server.h @@ -1154,6 +1154,7 @@ public: void setStorageProvider(StorageCache *pstorage); void trackChanges(bool fBulk, size_t sizeHint = 0); + bool FTrackingChanges() const { return !!m_fTrackingChanges; } // Process and commit changes for secondary storage. Note that process and commit are seperated // to allow you to release the global lock before commiting. To prevent deadlocks you *must* @@ -1338,6 +1339,7 @@ struct redisDb : public redisDbPersistentDataSnapshot using redisDbPersistentData::prefetchKeysAsync; using redisDbPersistentData::prepOverwriteForSnapshot; using redisDbPersistentData::FRehashing; + using redisDbPersistentData::FTrackingChanges; public: expireset::setiter expireitr; From 6e7c4cbf1738615909d9947cc4fdd1253c00684c Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 1 Sep 2021 04:15:59 +0000 Subject: [PATCH 71/99] We need to send keepalives to masters while waiting to prevent disconnects Former-commit-id: 7cbd6758b1042198c14ca9e8da0f1f7bc05df93d --- src/rdb.cpp | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/rdb.cpp b/src/rdb.cpp index 4858f75f1..0afe08267 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2619,6 +2619,7 @@ class rdbAsyncWorkThread std::atomic workerThreadDone; std::thread m_thread; long long now; + long long lastPing = -1; static void listFreeMethod(const void *v) { delete reinterpret_cast(v); @@ -2654,7 +2655,7 @@ public: l.unlock(); usleep(1); pauseExecution(); - processEventsWhileBlocked(serverTL - g_pserver->rgthreadvar); + ProcessWhileBlocked(); resumeExecution(); l.lock(); } @@ -2685,6 +2686,23 @@ public: cv.notify_one(); } + void ProcessWhileBlocked() { + if ((mstime() - lastPing) > 1000) { // Ping if its been a second or longer + listIter li; + listNode *ln; + listRewind(g_pserver->masters, &li); + while ((ln = listNext(&li))) + { + struct redisMaster *mi = (struct redisMaster*)listNodeValue(ln); + if (mi->masterhost && mi->repl_state == REPL_STATE_TRANSFER) + replicationSendNewlineToMaster(mi); + } + lastPing = mstime(); + } + + processEventsWhileBlocked(serverTL - g_pserver->rgthreadvar); + } + size_t ckeys() { return ckeysLoaded; } size_t endWork() { @@ -2698,14 +2716,14 @@ public: while (!workerThreadDone) { usleep(10); pauseExecution(); - processEventsWhileBlocked(serverTL - g_pserver->rgthreadvar); + ProcessWhileBlocked(); resumeExecution(); } } m_thread.join(); while 
(cstorageWritesInFlight.load(std::memory_order_seq_cst)) { usleep(10); - processEventsWhileBlocked(serverTL - g_pserver->rgthreadvar); + ProcessWhileBlocked(); } fLaunched = false; fExit = false; From 570224a2c45a00962e0780bb2adffa090bd2b238 Mon Sep 17 00:00:00 2001 From: malavan Date: Fri, 13 Aug 2021 15:46:09 +0000 Subject: [PATCH 72/99] Remove duplicate code Former-commit-id: a4c3182afb71a0d7d6d25111c0e0cfe5dcfc6130 --- src/tls.cpp | 41 ----------------------------------------- 1 file changed, 41 deletions(-) diff --git a/src/tls.cpp b/src/tls.cpp index c1ad9158e..b2c96c497 100644 --- a/src/tls.cpp +++ b/src/tls.cpp @@ -140,47 +140,6 @@ static void initCryptoLocks(void) { } #endif /* USE_CRYPTO_LOCKS */ -/** - * OpenSSL global initialization and locking handling callbacks. - * Note that this is only required for OpenSSL < 1.1.0. - */ - -#if OPENSSL_VERSION_NUMBER < 0x10100000L -#define USE_CRYPTO_LOCKS -#endif - -#ifdef USE_CRYPTO_LOCKS - -static pthread_mutex_t *openssl_locks; - -static void sslLockingCallback(int mode, int lock_id, const char *f, int line) { - pthread_mutex_t *mt = openssl_locks + lock_id; - - if (mode & CRYPTO_LOCK) { - pthread_mutex_lock(mt); - } else { - pthread_mutex_unlock(mt); - } - - (void)f; - (void)line; -} - -static void initCryptoLocks(void) { - unsigned i, nlocks; - if (CRYPTO_get_locking_callback() != NULL) { - /* Someone already set the callback before us. Don't destroy it! */ - return; - } - nlocks = CRYPTO_num_locks(); - openssl_locks = zmalloc(sizeof(*openssl_locks) * nlocks); - for (i = 0; i < nlocks; i++) { - pthread_mutex_init(openssl_locks + i, NULL); - } - CRYPTO_set_locking_callback(sslLockingCallback); -} -#endif /* USE_CRYPTO_LOCKS */ - void tlsInit(void) { /* Enable configuring OpenSSL using the standard openssl.cnf * OPENSSL_config()/OPENSSL_init_crypto() should be the first From 35e5e85d95d7c2510de78564e43c7bf3702b89e8 Mon Sep 17 00:00:00 2001 From: malavan Date: Fri, 13 Aug 2021 20:02:06 +0000 Subject: [PATCH 73/99] don't use flags that aren't supported by certain versions of OPENSSL Former-commit-id: ca7342948fb12804905219ae106b4722368e9a2f --- src/tls.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tls.cpp b/src/tls.cpp index b2c96c497..0315962e5 100644 --- a/src/tls.cpp +++ b/src/tls.cpp @@ -149,6 +149,8 @@ void tlsInit(void) { */ #if OPENSSL_VERSION_NUMBER < 0x10100000L OPENSSL_config(NULL); + #elif OPENSSL_VERSION_NUMBER < 0x10101000L + OPENSSL_init_crypto(OPENSSL_INIT_LOAD_CONFIG, NULL); #else OPENSSL_init_crypto(OPENSSL_INIT_LOAD_CONFIG|OPENSSL_INIT_ATFORK, NULL); #endif From f695e2e26c6726526ec454c98294af221cab45f2 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 10 Sep 2021 00:38:08 +0000 Subject: [PATCH 74/99] In single thread mode don't batch Former-commit-id: 7daadae789cdca6f0eb0c3f553737d4f8efc0566 --- src/networking.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index e2e033be3..d8b3e9a5d 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -2547,7 +2547,7 @@ void parseClientCommandBuffer(client *c) { } /* Prefetch outside the lock for better perf */ - if (g_pserver->prefetch_enabled && cqueriesStart < c->vecqueuedcmd.size() && + if (g_pserver->prefetch_enabled && cserver.cthreads > 1 && cqueriesStart < c->vecqueuedcmd.size() && (g_pserver->m_pstorageFactory || aeLockContested(cserver.cthreads/2) || cserver.cthreads == 1) && !GlobalLocksAcquired()) { auto &query = c->vecqueuedcmd.back(); if (query.argc > 0 && query.argc == 
query.argcMax) { @@ -2622,7 +2622,7 @@ void readQueryFromClient(connection *conn) { int nread, readlen; size_t qblen; - serverAssertDebug(FCorrectThread(c) sdfsdf); + serverAssertDebug(FCorrectThread(c)); serverAssertDebug(!GlobalLocksAcquired()); AeLocker aelock; @@ -2694,9 +2694,16 @@ void readQueryFromClient(connection *conn) { return; } - parseClientCommandBuffer(c); - - serverTL->vecclientsProcess.push_back(c); + if (cserver.cthreads > 1) { + parseClientCommandBuffer(c); + serverTL->vecclientsProcess.push_back(c); + } else { + // If we're single threaded its actually better to just process the command here while the query is hot in the cache + // multithreaded lock contention dominates and batching is better + aeAcquireLock(); + runAndPropogateToReplicas(processInputBuffer, c, true /*fParse*/, CMD_CALL_FULL); + aeReleaseLock(); + } } void processClients() From 8210d67c2483430129f46f5621c2788adabebc9d Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 10 Sep 2021 00:38:49 +0000 Subject: [PATCH 75/99] Don't zero out potentially long buffers Former-commit-id: 91e76ab4a00546278100baf6dfe61c13e802b40e --- src/object.cpp | 3 ++- src/server.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/object.cpp b/src/object.cpp index 6ecaf3ba5..bde1bc302 100644 --- a/src/object.cpp +++ b/src/object.cpp @@ -102,10 +102,11 @@ robj *createEmbeddedStringObject(const char *ptr, size_t len) { allocsize = sizeof(void*); size_t mvccExtraBytes = g_pserver->fActiveReplica ? sizeof(redisObjectExtended) : 0; - char *oB = (char*)zcalloc(sizeof(robj)+allocsize-sizeof(redisObject::m_ptr)+mvccExtraBytes, MALLOC_SHARED); + char *oB = (char*)zmalloc(sizeof(robj)+allocsize-sizeof(redisObject::m_ptr)+mvccExtraBytes, MALLOC_SHARED); robj *o = reinterpret_cast(oB + mvccExtraBytes); struct sdshdr8 *sh = (sdshdr8*)(&o->m_ptr); + new (o) redisObject; o->type = OBJ_STRING; o->encoding = OBJ_ENCODING_EMBSTR; o->setrefcount(1); diff --git a/src/server.h b/src/server.h index 5aea41b23..fa2183149 100644 --- a/src/server.h +++ b/src/server.h @@ -950,6 +950,7 @@ struct redisObjectExtended { }; typedef struct redisObject { + friend redisObject *createEmbeddedStringObject(const char *ptr, size_t len); protected: redisObject() {} From 86784fe9ba8596adc4779dbfdec3379f3dc32fcc Mon Sep 17 00:00:00 2001 From: malavan Date: Tue, 14 Sep 2021 17:06:04 +0000 Subject: [PATCH 76/99] improve overwrite key performance Former-commit-id: 56f9d5528385ea78074a308c6d3987b920d6cc35 --- src/db.cpp | 27 +++++++++++++++++++-------- src/dict.cpp | 4 ++-- src/dict.h | 2 +- src/server.cpp | 2 +- src/server.h | 7 +++++-- 5 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index 067f4b74b..fa6e678a2 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -242,7 +242,7 @@ robj *lookupKeyWriteOrReply(client *c, robj *key, robj *reply) { return o; } -bool dbAddCore(redisDb *db, sds key, robj *val, bool fUpdateMvcc, bool fAssumeNew = false) { +bool dbAddCore(redisDb *db, sds key, robj *val, bool fUpdateMvcc, bool fAssumeNew = false, dict_iter *piterExisting = nullptr) { serverAssert(!val->FExpires()); sds copy = sdsdupshared(key); @@ -251,7 +251,7 @@ bool dbAddCore(redisDb *db, sds key, robj *val, bool fUpdateMvcc, bool fAssumeNe setMvccTstamp(val, mvcc); } - bool fInserted = db->insert(copy, val, fAssumeNew); + bool fInserted = db->insert(copy, val, fAssumeNew, piterExisting); if (fInserted) { @@ -321,8 +321,12 @@ void redisDb::dbOverwriteCore(redisDb::iter itr, sds keySds, robj *val, bool fUp * This function does 
not modify the expire time of the existing key. * * The program is aborted if the key was not already present. */ -void dbOverwrite(redisDb *db, robj *key, robj *val, bool fRemoveExpire) { - auto itr = db->find(key); +void dbOverwrite(redisDb *db, robj *key, robj *val, bool fRemoveExpire, dict_iter *pitrExisting) { + redisDb::iter itr; + if (pitrExisting != nullptr) + itr = *pitrExisting; + else + itr = db->find(key); serverAssertWithInfo(NULL,key,itr != nullptr); lookupKeyUpdateObj(itr.val(), LOOKUP_NONE); @@ -366,8 +370,9 @@ int dbMerge(redisDb *db, sds key, robj *val, int fReplace) * in a context where there is no clear client performing the operation. */ void genericSetKey(client *c, redisDb *db, robj *key, robj *val, int keepttl, int signal) { db->prepOverwriteForSnapshot(szFromObj(key)); - if (!dbAddCore(db, szFromObj(key), val, true /* fUpdateMvcc */)) { - dbOverwrite(db, key, val, !keepttl); + dict_iter iter; + if (!dbAddCore(db, szFromObj(key), val, true /* fUpdateMvcc */, false /*fAssumeNew*/, &iter)) { + dbOverwrite(db, key, val, !keepttl, &iter); } incrRefCount(val); if (signal) signalModifiedKey(c,db,key); @@ -2594,11 +2599,12 @@ void redisDb::storageProviderInitialize() } } -bool redisDbPersistentData::insert(char *key, robj *o, bool fAssumeNew) +bool redisDbPersistentData::insert(char *key, robj *o, bool fAssumeNew, dict_iter *piterExisting) { if (!fAssumeNew && (g_pserver->m_pstorageFactory != nullptr || m_pdbSnapshot != nullptr)) ensure(key); - int res = dictAdd(m_pdict, key, o); + dictEntry *de; + int res = dictAdd(m_pdict, key, o, &de); serverAssert(FImplies(fAssumeNew, res == DICT_OK)); if (res == DICT_OK) { @@ -2610,6 +2616,11 @@ bool redisDbPersistentData::insert(char *key, robj *o, bool fAssumeNew) #endif trackkey(key, false /* fUpdate */); } + else + { + if (piterExisting) + *piterExisting = dict_iter(m_pdict, de); + } return (res == DICT_OK); } diff --git a/src/dict.cpp b/src/dict.cpp index 1ed414b69..ac17fc7ab 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -573,9 +573,9 @@ static void _dictRehashStep(dict *d) { } /* Add an element to the target hash table */ -int dictAdd(dict *d, void *key, void *val) +int dictAdd(dict *d, void *key, void *val, dictEntry **existing) { - dictEntry *entry = dictAddRaw(d,key,NULL); + dictEntry *entry = dictAddRaw(d,key,existing); if (!entry) return DICT_ERR; dictSetVal(d, entry, val); diff --git a/src/dict.h b/src/dict.h index 64fdc98c9..ed33c9175 100644 --- a/src/dict.h +++ b/src/dict.h @@ -205,7 +205,7 @@ typedef void (dictScanBucketFunction)(void *privdata, dictEntry **bucketref); dict *dictCreate(dictType *type, void *privDataPtr); int dictExpand(dict *d, unsigned long size, bool fShrink = false); int dictTryExpand(dict *d, unsigned long size, bool fShrink); -int dictAdd(dict *d, void *key, void *val); +int dictAdd(dict *d, void *key, void *val, dictEntry **existing = nullptr); dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing); dictEntry *dictAddOrFind(dict *d, void *key); int dictReplace(dict *d, void *key, void *val); diff --git a/src/server.cpp b/src/server.cpp index ef1039cea..7cc1a9ac3 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2086,7 +2086,7 @@ void databasesCron(bool fMainThread) { /* Perform hash tables rehashing if needed, but only if there are no * other processes saving the DB on disk. Otherwise rehashing is bad * as will cause a lot of copy-on-write of memory pages. 
*/ - if (!hasActiveChildProcess() || g_pserver->FRdbSaveInProgress()) { + if (!(hasActiveChildProcess() || g_pserver->FRdbSaveInProgress())) { /* We use global counters so if we stop the computation at a given * DB we'll be able to start from the successive in the next * cron loop iteration. */ diff --git a/src/server.h b/src/server.h index fa2183149..40923d80e 100644 --- a/src/server.h +++ b/src/server.h @@ -1067,6 +1067,9 @@ class dict_iter : public dict_const_iter { dict *m_dict = nullptr; public: + dict_iter() + : dict_const_iter(nullptr) + {} explicit dict_iter(nullptr_t) : dict_const_iter(nullptr) {} @@ -1131,7 +1134,7 @@ public: void getStats(char *buf, size_t bufsize) { dictGetStats(buf, bufsize, m_pdict); } void getExpireStats(char *buf, size_t bufsize) { m_setexpire->getstats(buf, bufsize); } - bool insert(char *k, robj *o, bool fAssumeNew = false); + bool insert(char *k, robj *o, bool fAssumeNew = false, dict_iter *existing = nullptr); void tryResize(); int incrementallyRehash(); void updateValue(dict_iter itr, robj *val); @@ -3325,7 +3328,7 @@ int objectSetLRUOrLFU(robj *val, long long lfu_freq, long long lru_idle, #define LOOKUP_NONOTIFY (1<<1) #define LOOKUP_UPDATEMVCC (1<<2) void dbAdd(redisDb *db, robj *key, robj *val); -void dbOverwrite(redisDb *db, robj *key, robj *val, bool fRemoveExpire = false); +void dbOverwrite(redisDb *db, robj *key, robj *val, bool fRemoveExpire = false, dict_iter *pitrExisting = nullptr); int dbMerge(redisDb *db, sds key, robj *val, int fReplace); void genericSetKey(client *c, redisDb *db, robj *key, robj *val, int keepttl, int signal); void setKey(client *c, redisDb *db, robj *key, robj *val); From 765885bd7b51fc5f78e3da2bda5f1ea7bf527a3e Mon Sep 17 00:00:00 2001 From: malavan Date: Tue, 14 Sep 2021 17:17:08 +0000 Subject: [PATCH 77/99] if statement was actually correct Former-commit-id: d44b99afdf1ec92f8a36f0c091c37328008d867b --- src/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server.cpp b/src/server.cpp index 7cc1a9ac3..ef1039cea 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2086,7 +2086,7 @@ void databasesCron(bool fMainThread) { /* Perform hash tables rehashing if needed, but only if there are no * other processes saving the DB on disk. Otherwise rehashing is bad * as will cause a lot of copy-on-write of memory pages. */ - if (!(hasActiveChildProcess() || g_pserver->FRdbSaveInProgress())) { + if (!hasActiveChildProcess() || g_pserver->FRdbSaveInProgress()) { /* We use global counters so if we stop the computation at a given * DB we'll be able to start from the successive in the next * cron loop iteration. 
*/ From 27bf624baeec0827e5bbac647292d746523929c0 Mon Sep 17 00:00:00 2001 From: Malavan Sotheeswaran Date: Thu, 5 Aug 2021 20:07:31 +0000 Subject: [PATCH 78/99] Merge fix to dict resize during rdb load Former-commit-id: c398d5f8a027c67acac64bdbfbd01486dde555eb --- src/dict.cpp | 44 ++++++++++++++++++++++++++------------------ src/dict.h | 1 + 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/dict.cpp b/src/dict.cpp index 1ed414b69..8e10f3f63 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -131,19 +131,6 @@ int _dictInit(dict *d, dictType *type, return DICT_OK; } -/* Resize the table to the minimal size that contains all the elements, - * but with the invariant of a USED/BUCKETS ratio near to <= 1 */ -int dictResize(dict *d) -{ - unsigned long minimal; - - if (!dict_can_resize || dictIsRehashing(d)) return DICT_ERR; - minimal = d->ht[0].used; - if (minimal < DICT_HT_INITIAL_SIZE) - minimal = DICT_HT_INITIAL_SIZE; - return dictExpand(d, minimal, false /*fShirnk*/); -} - /* Expand or create the hash table, * when malloc_failed is non-NULL, it'll avoid panic if malloc fails (in which case it'll be set to 1). * Returns DICT_OK if expand was performed, and DICT_ERR if skipped. */ @@ -189,6 +176,19 @@ int _dictExpand(dict *d, unsigned long size, bool fShrink, int* malloc_failed) return DICT_OK; } +/* Resize the table to the minimal size that contains all the elements, + * but with the invariant of a USED/BUCKETS ratio near to <= 1 */ +int dictResize(dict *d) +{ + unsigned long minimal; + + if (!dict_can_resize || dictIsRehashing(d)) return DICT_ERR; + minimal = d->ht[0].used; + if (minimal < DICT_HT_INITIAL_SIZE) + minimal = DICT_HT_INITIAL_SIZE; + return _dictExpand(d, minimal, false /*fShirnk*/, nullptr); +} + int dictMerge(dict *dst, dict *src) { #define MERGE_BLOCK_SIZE 4 @@ -273,7 +273,7 @@ int dictMerge(dict *dst, dict *src) return DICT_OK; } - dictExpand(dst, dictSize(dst)+dictSize(src), false /* fShrink */); // start dst rehashing if necessary + _dictExpand(dst, dictSize(dst)+dictSize(src), false /* fShrink */, nullptr); // start dst rehashing if necessary auto &htDst = dictIsRehashing(dst) ? dst->ht[1] : dst->ht[0]; for (int iht = 0; iht < 2; ++iht) { @@ -328,12 +328,16 @@ int dictMerge(dict *dst, dict *src) /* return DICT_ERR if expand was not performed */ int dictExpand(dict *d, unsigned long size, bool fShrink) { + // External expand likely means mass insertion, and we don't want to shrink during that + d->noshrink = true; return _dictExpand(d, size, fShrink, NULL); } /* return DICT_ERR if expand failed due to memory allocation failure */ int dictTryExpand(dict *d, unsigned long size, bool fShrink) { int malloc_failed; + // External expand likely means mass insertion, and we don't want to shrink during that + d->noshrink = true; _dictExpand(d, size, fShrink, &malloc_failed); return malloc_failed? 
DICT_ERR : DICT_OK; } @@ -677,6 +681,9 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { dictEntry *he, *prevHe; int table; + // if we are deleting elements we probably aren't mass inserting anymore and it is safe to shrink + d->noshrink = false; + if (d->ht[0].used == 0 && d->ht[1].used == 0) return NULL; if (dictIsRehashing(d)) _dictRehashStep(d); @@ -715,6 +722,7 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { if (!dictIsRehashing(d)) break; } + _dictExpandIfNeeded(d); return NULL; /* not found */ } @@ -1317,7 +1325,7 @@ static int _dictExpandIfNeeded(dict *d) if (dictIsRehashing(d)) return DICT_OK; /* If the hash table is empty expand it to the initial size. */ - if (d->ht[0].size == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE, false /*fShrink*/); + if (d->ht[0].size == 0) return _dictExpand(d, DICT_HT_INITIAL_SIZE, false /*fShrink*/, nullptr); /* If we reached the 1:1 ratio, and we are allowed to resize the hash * table (global setting) or we should avoid it but the ratio between @@ -1328,12 +1336,12 @@ static int _dictExpandIfNeeded(dict *d) d->ht[0].used/d->ht[0].size > dict_force_resize_ratio) && dictTypeExpandAllowed(d)) { - return dictExpand(d, d->ht[0].used + 1, false /*fShrink*/); + return _dictExpand(d, d->ht[0].used + 1, false /*fShrink*/, nullptr); } - else if (d->ht[0].used > 0 && d->ht[0].size >= (1024*SHRINK_FACTOR) && (d->ht[0].used * 16) < d->ht[0].size && dict_can_resize) + else if (d->ht[0].used > 0 && d->ht[0].size >= (1024*SHRINK_FACTOR) && (d->ht[0].used * 16) < d->ht[0].size && dict_can_resize && !d->noshrink) { // If the dictionary has shurnk a lot we'll need to shrink the hash table instead - return dictExpand(d, d->ht[0].size/SHRINK_FACTOR, true /*fShrink*/); + return _dictExpand(d, d->ht[0].size/SHRINK_FACTOR, true /*fShrink*/, nullptr); } return DICT_OK; } diff --git a/src/dict.h b/src/dict.h index 64fdc98c9..c4e1931d8 100644 --- a/src/dict.h +++ b/src/dict.h @@ -124,6 +124,7 @@ typedef struct dict { unsigned refcount; dictAsyncRehashCtl *asyncdata; int16_t pauserehash; /* If >0 rehashing is paused (<0 indicates coding error) */ + uint8_t noshrink = false; } dict; /* If safe is set to 1 this is a safe iterator, that means, you can call From 4d605bea02fbda53d5e10158a811cd02f8184b40 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 17 Sep 2021 17:27:19 +0000 Subject: [PATCH 79/99] Unify job types so everything is processed in order Former-commit-id: 625aa97e4cf16337e8b052b7a27491a0ab09110f --- src/rdb.cpp | 75 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 22 deletions(-) diff --git a/src/rdb.cpp b/src/rdb.cpp index 0afe08267..edbf1ccaa 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2563,7 +2563,25 @@ void stopSaving(int success) { NULL); } -struct rdbInsertJob + +class JobBase +{ +public: + enum class JobType { + Function, + Insert + }; + + JobType type; + + JobBase(JobType type) + : type(type) + {} + + virtual ~JobBase() = default; +}; + +struct rdbInsertJob : public JobBase { redisDb *db = nullptr; sds key = nullptr; @@ -2579,8 +2597,13 @@ struct rdbInsertJob decrRefCount(subkey); } - rdbInsertJob() = default; - rdbInsertJob(rdbInsertJob &&src) { + rdbInsertJob() + : JobBase(JobBase::JobType::Insert) + {} + + rdbInsertJob(rdbInsertJob &&src) + : JobBase(JobBase::JobType::Insert) + { db = src.db; src.db = nullptr; key = src.key; @@ -2602,12 +2625,21 @@ struct rdbInsertJob } }; +struct rdbFunctionJob : public JobBase +{ +public: + std::function m_fn; + + 
rdbFunctionJob(std::function &&fn) + : JobBase(JobBase::JobType::Function), m_fn(fn) + {} +}; + class rdbAsyncWorkThread { rdbSaveInfo *rsi; int rdbflags; list *listJobs; - std::vector> queuefn; // for custom jobs std::mutex mutex; std::condition_variable cv; std::condition_variable cvThrottle; @@ -2622,7 +2654,7 @@ class rdbAsyncWorkThread long long lastPing = -1; static void listFreeMethod(const void *v) { - delete reinterpret_cast(v); + delete reinterpret_cast(v); } public: @@ -2679,10 +2711,11 @@ public: } void enqueue(std::function &&fn) { + JobBase *pjob = new rdbFunctionJob(std::move(fn)); std::unique_lock l(mutex); - bool fNotify = queuefn.empty(); - queuefn.push_back(std::move(fn)); - if (fNotify) + throttle(l); + listAddNodeTail(listJobs, pjob); + if (listLength(listJobs) == 1) cv.notify_one(); } @@ -2728,7 +2761,6 @@ public: fLaunched = false; fExit = false; serverAssert(listLength(listJobs) == 0); - serverAssert(queuefn.empty()); return ckeysLoaded; } @@ -2832,11 +2864,11 @@ public: for (;;) { std::unique_lock lock(queue.mutex); - if (listLength(queue.listJobs) == 0 && queue.queuefn.empty()) { + if (listLength(queue.listJobs) == 0) { if (queue.fExit) break; queue.cv.wait(lock); - if (listLength(queue.listJobs) == 0 && queue.queuefn.empty() && queue.fExit) + if (listLength(queue.listJobs) == 0 && queue.fExit) break; } pqueue->cvThrottle.notify_one(); @@ -2844,27 +2876,27 @@ public: list *listJobs = queue.listJobs; queue.listJobs = listCreate(); listSetFreeMethod(queue.listJobs, listFreeMethod); - - auto queuefn = std::move(queue.queuefn); lock.unlock(); vars.gcEpoch = g_pserver->garbageCollector.startEpoch(); while (listLength(listJobs)) { std::unique_lock ulPause(pqueue->m_lockPause); - rdbInsertJob &job = *((rdbInsertJob*)listNodeValue(listFirst(listJobs))); + JobBase *pjobBase = ((JobBase*)listNodeValue(listFirst(listJobs))); - pqueue->processJob(job); + switch (pjobBase->type) + { + case JobBase::JobType::Insert: + pqueue->processJob(*static_cast(pjobBase)); + break; + case JobBase::JobType::Function: + static_cast(pjobBase)->m_fn(); + break; + } // Pop from the list listDelNode(listJobs, listFirst(listJobs)); } listRelease(listJobs); - - for (auto &fn : queuefn) { - std::unique_lock ulPause(pqueue->m_lockPause); - fn(); - } - g_pserver->garbageCollector.endEpoch(vars.gcEpoch); } @@ -2875,7 +2907,6 @@ public: queue.workerThreadDone = true; std::unique_lock lock(queue.mutex); - serverAssert(queue.queuefn.empty()); serverAssert(listLength(queue.listJobs) == 0); ProcessPendingAsyncWrites(); listRelease(vars.clients_pending_asyncwrite); From 663f0bca3e28bb0eb064f4b93f0ee0c1d50dccfe Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 28 Sep 2021 18:08:15 +0000 Subject: [PATCH 80/99] Revert "Don't zero out potentially long buffers" - It has no benefit above the noise floor and potentially is the cause of a multithread slowdown This reverts commit 9a9841afb3ffac4ebece0ec911b35ce1fe1c7e35 [formerly 91e76ab4a00546278100baf6dfe61c13e802b40e]. Former-commit-id: f651cf67d731a10c3a52e30261c8d8b9d291576c --- src/object.cpp | 3 +-- src/server.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/object.cpp b/src/object.cpp index bde1bc302..6ecaf3ba5 100644 --- a/src/object.cpp +++ b/src/object.cpp @@ -102,11 +102,10 @@ robj *createEmbeddedStringObject(const char *ptr, size_t len) { allocsize = sizeof(void*); size_t mvccExtraBytes = g_pserver->fActiveReplica ? 
sizeof(redisObjectExtended) : 0; - char *oB = (char*)zmalloc(sizeof(robj)+allocsize-sizeof(redisObject::m_ptr)+mvccExtraBytes, MALLOC_SHARED); + char *oB = (char*)zcalloc(sizeof(robj)+allocsize-sizeof(redisObject::m_ptr)+mvccExtraBytes, MALLOC_SHARED); robj *o = reinterpret_cast(oB + mvccExtraBytes); struct sdshdr8 *sh = (sdshdr8*)(&o->m_ptr); - new (o) redisObject; o->type = OBJ_STRING; o->encoding = OBJ_ENCODING_EMBSTR; o->setrefcount(1); diff --git a/src/server.h b/src/server.h index fa2183149..5aea41b23 100644 --- a/src/server.h +++ b/src/server.h @@ -950,7 +950,6 @@ struct redisObjectExtended { }; typedef struct redisObject { - friend redisObject *createEmbeddedStringObject(const char *ptr, size_t len); protected: redisObject() {} From 8a2f2bcb911e1e6639361a14eb19d2c44602beed Mon Sep 17 00:00:00 2001 From: John Sully Date: Sat, 2 Oct 2021 18:27:21 +0000 Subject: [PATCH 81/99] Add in the concurrentqueue Former-commit-id: 319cad462be502b0b7a8c45b634d578b2c1c4e9d --- .../concurrentqueue/blockingconcurrentqueue.h | 583 +++ deps/concurrentqueue/concurrentqueue.h | 3743 +++++++++++++++++ deps/concurrentqueue/lightweightsemaphore.h | 412 ++ src/Makefile | 2 +- 4 files changed, 4739 insertions(+), 1 deletion(-) create mode 100644 deps/concurrentqueue/blockingconcurrentqueue.h create mode 100644 deps/concurrentqueue/concurrentqueue.h create mode 100644 deps/concurrentqueue/lightweightsemaphore.h diff --git a/deps/concurrentqueue/blockingconcurrentqueue.h b/deps/concurrentqueue/blockingconcurrentqueue.h new file mode 100644 index 000000000..6002cd82f --- /dev/null +++ b/deps/concurrentqueue/blockingconcurrentqueue.h @@ -0,0 +1,583 @@ +// Provides an efficient blocking version of moodycamel::ConcurrentQueue. +// ©2015-2020 Cameron Desrochers. Distributed under the terms of the simplified +// BSD license, available at the top of concurrentqueue.h. +// Also dual-licensed under the Boost Software License (see LICENSE.md) +// Uses Jeff Preshing's semaphore implementation (under the terms of its +// separate zlib license, see lightweightsemaphore.h). + +#pragma once + +#include "concurrentqueue.h" +#include "lightweightsemaphore.h" + +#include +#include +#include +#include +#include + +namespace moodycamel +{ +// This is a blocking version of the queue. It has an almost identical interface to +// the normal non-blocking version, with the addition of various wait_dequeue() methods +// and the removal of producer-specific dequeue methods. 
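+//
+// A minimal usage sketch (illustrative only; the element type `int` and the
+// variable names below are assumptions, not part of the upstream API). It shows
+// the producer/consumer pattern this header is built for: enqueue() never blocks
+// (it may allocate), wait_dequeue() blocks the consumer until an element arrives,
+// and try_dequeue() is the non-blocking variant that returns false when empty.
+//
+//     #include "blockingconcurrentqueue.h"
+//
+//     moodycamel::BlockingConcurrentQueue<int> q;
+//     q.enqueue(42);                // producer thread: does not block, may allocate
+//     int item;
+//     q.wait_dequeue(item);         // consumer thread: blocks until data is available
+//     if (q.try_dequeue(item)) {    // non-blocking check, false if the queue looks empty
+//         /* use item */
+//     }
+//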
+template +class BlockingConcurrentQueue +{ +private: + typedef ::moodycamel::ConcurrentQueue ConcurrentQueue; + typedef ::moodycamel::LightweightSemaphore LightweightSemaphore; + +public: + typedef typename ConcurrentQueue::producer_token_t producer_token_t; + typedef typename ConcurrentQueue::consumer_token_t consumer_token_t; + + typedef typename ConcurrentQueue::index_t index_t; + typedef typename ConcurrentQueue::size_t size_t; + typedef typename std::make_signed::type ssize_t; + + static const size_t BLOCK_SIZE = ConcurrentQueue::BLOCK_SIZE; + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = ConcurrentQueue::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD; + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::EXPLICIT_INITIAL_INDEX_SIZE; + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::IMPLICIT_INITIAL_INDEX_SIZE; + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = ConcurrentQueue::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = ConcurrentQueue::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE; + static const size_t MAX_SUBQUEUE_SIZE = ConcurrentQueue::MAX_SUBQUEUE_SIZE; + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit BlockingConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : inner(capacity), sema(create(0, (int)Traits::MAX_SEMA_SPINS), &BlockingConcurrentQueue::template destroy) + { + assert(reinterpret_cast((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member"); + if (!sema) { + MOODYCAMEL_THROW(std::bad_alloc()); + } + } + + BlockingConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : inner(minCapacity, maxExplicitProducers, maxImplicitProducers), sema(create(0, (int)Traits::MAX_SEMA_SPINS), &BlockingConcurrentQueue::template destroy) + { + assert(reinterpret_cast((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member"); + if (!sema) { + MOODYCAMEL_THROW(std::bad_alloc()); + } + } + + // Disable copying and copy assignment + BlockingConcurrentQueue(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + BlockingConcurrentQueue& operator=(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). 
+ BlockingConcurrentQueue(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : inner(std::move(other.inner)), sema(std::move(other.sema)) + { } + + inline BlockingConcurrentQueue& operator=(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(BlockingConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + BlockingConcurrentQueue& swap_internal(BlockingConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + inner.swap(other.inner); + sema.swap(other.sema); + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const& item) + { + if ((details::likely)(inner.enqueue(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + if ((details::likely)(inner.enqueue(std::move(item)))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + if ((details::likely)(inner.enqueue(token, item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + if ((details::likely)(inner.enqueue(token, std::move(item)))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. + // Thread-safe. + template + inline bool enqueue_bulk(It itemFirst, size_t count) + { + if ((details::likely)(inner.enqueue_bulk(std::forward(itemFirst), count))) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. 
Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + if ((details::likely)(inner.enqueue_bulk(token, std::forward(itemFirst), count))) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue(T const& item) + { + if (inner.try_enqueue(item)) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + if (inner.try_enqueue(std::move(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T const& item) + { + if (inner.try_enqueue(token, item)) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + if (inner.try_enqueue(token, std::move(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool try_enqueue_bulk(It itemFirst, size_t count) + { + if (inner.try_enqueue_bulk(std::forward(itemFirst), count)) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + if (inner.try_enqueue_bulk(token, std::forward(itemFirst), count)) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
+ template + inline bool try_dequeue(U& item) + { + if (sema->tryWait()) { + while (!inner.try_dequeue(item)) { + continue; + } + return true; + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue(consumer_token_t& token, U& item) + { + if (sema->tryWait()) { + while (!inner.try_dequeue(token, item)) { + continue; + } + return true; + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + + + // Blocks the current thread until there's something to dequeue, then + // dequeues it. + // Never allocates. Thread-safe. + template + inline void wait_dequeue(U& item) + { + while (!sema->wait()) { + continue; + } + while (!inner.try_dequeue(item)) { + continue; + } + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout (specified in microseconds) expires. Returns false + // without setting `item` if the timeout expires, otherwise assigns + // to `item` and returns true. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(U& item, std::int64_t timeout_usecs) + { + if (!sema->wait(timeout_usecs)) { + return false; + } + while (!inner.try_dequeue(item)) { + continue; + } + return true; + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout expires. Returns false without setting `item` if the + // timeout expires, otherwise assigns to `item` and returns true. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(U& item, std::chrono::duration const& timeout) + { + return wait_dequeue_timed(item, std::chrono::duration_cast(timeout).count()); + } + + // Blocks the current thread until there's something to dequeue, then + // dequeues it using an explicit consumer token. + // Never allocates. Thread-safe. 
+ template + inline void wait_dequeue(consumer_token_t& token, U& item) + { + while (!sema->wait()) { + continue; + } + while (!inner.try_dequeue(token, item)) { + continue; + } + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout (specified in microseconds) expires. Returns false + // without setting `item` if the timeout expires, otherwise assigns + // to `item` and returns true. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::int64_t timeout_usecs) + { + if (!sema->wait(timeout_usecs)) { + return false; + } + while (!inner.try_dequeue(token, item)) { + continue; + } + return true; + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout expires. Returns false without setting `item` if the + // timeout expires, otherwise assigns to `item` and returns true. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::chrono::duration const& timeout) + { + return wait_dequeue_timed(token, item, std::chrono::duration_cast(timeout).count()); + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which will + // always be at least one (this method blocks until the queue + // is non-empty) and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue_bulk. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::int64_t timeout_usecs) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::chrono::duration const& timeout) + { + return wait_dequeue_bulk_timed(itemFirst, max, std::chrono::duration_cast(timeout).count()); + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which will + // always be at least one (this method blocks until the queue + // is non-empty) and at most max. + // Never allocates. Thread-safe. 
+ template + inline size_t wait_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue_bulk. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::int64_t timeout_usecs) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::chrono::duration const& timeout) + { + return wait_dequeue_bulk_timed(token, itemFirst, max, std::chrono::duration_cast(timeout).count()); + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + inline size_t size_approx() const + { + return (size_t)sema->availableApprox(); + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. + static bool is_lock_free() + { + return ConcurrentQueue::is_lock_free(); + } + + +private: + template + static inline U* create(A1&& a1, A2&& a2) + { + void* p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1), std::forward(a2)) : nullptr; + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) { + p->~U(); + } + (Traits::free)(p); + } + +private: + ConcurrentQueue inner; + std::unique_ptr sema; +}; + + +template +inline void swap(BlockingConcurrentQueue& a, BlockingConcurrentQueue& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} // end namespace moodycamel + diff --git a/deps/concurrentqueue/concurrentqueue.h b/deps/concurrentqueue/concurrentqueue.h new file mode 100644 index 000000000..121383eab --- /dev/null +++ b/deps/concurrentqueue/concurrentqueue.h @@ -0,0 +1,3743 @@ +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue. +// An overview, including benchmark results, is provided here: +// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ +// The full design is also described in excruciating detail at: +// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue + +// Simplified BSD license: +// Copyright (c) 2013-2020, Cameron Desrochers. 
+// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this list of +// conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, this list of +// conditions and the following disclaimer in the documentation and/or other materials +// provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL +// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Also dual-licensed under the Boost Software License (see LICENSE.md) + +#pragma once + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) +// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and +// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings +// upon assigning any computed values) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" + +#ifdef MCDBGQ_USE_RELACY +#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" +#endif +#endif + +#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) +// VS2019 with /W4 warns about constant conditional expressions but unless /std=c++17 or higher +// does not support `if constexpr`, so we have no choice but to simply disable the warning +#pragma warning(push) +#pragma warning(disable: 4127) // conditional expression is constant +#endif + +#if defined(__APPLE__) +#include "TargetConditionals.h" +#endif + +#ifdef MCDBGQ_USE_RELACY +#include "relacy/relacy_std.hpp" +#include "relacy_shims.h" +// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations. +// We'll override the default trait malloc ourselves without a macro. +#undef new +#undef delete +#undef malloc +#undef free +#else +#include // Requires C++11. Sorry VS2010. 
+#include +#endif +#include // for max_align_t +#include +#include +#include +#include +#include +#include +#include // for CHAR_BIT +#include +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading + +// Platform-specific definitions of a numeric thread ID type and an invalid value +namespace moodycamel { namespace details { + template struct thread_id_converter { + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const& x) { return x; } + }; +} } +#if defined(MCDBGQ_USE_RELACY) +namespace moodycamel { namespace details { + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; + static inline thread_id_t thread_id() { return rl::thread_index(); } +} } +#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) +// No sense pulling in windows.h in a header, we'll manually declare the function +// we use and rely on backwards-compatibility for this not to break +extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); +namespace moodycamel { namespace details { + static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. + static inline thread_id_t thread_id() { return static_cast(::GetCurrentThreadId()); } +} } +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(MOODYCAMEL_NO_THREAD_LOCAL) +namespace moodycamel { namespace details { + static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); + + typedef std::thread::id thread_id_t; + static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + + // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's + // only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't + // be. 
+ static inline thread_id_t thread_id() { return std::this_thread::get_id(); } + + template struct thread_id_size { }; + template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; + template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; + + template<> struct thread_id_converter { + typedef thread_id_size::numeric_t thread_id_numeric_size_t; +#ifndef __APPLE__ + typedef std::size_t thread_id_hash_t; +#else + typedef thread_id_numeric_size_t thread_id_hash_t; +#endif + + static thread_id_hash_t prehash(thread_id_t const& x) + { +#ifndef __APPLE__ + return std::hash()(x); +#else + return *reinterpret_cast(&x); +#endif + } + }; +} } +#else +// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 +// In order to get a numeric thread ID in a platform-independent way, we use a thread-local +// static variable's address as a thread identifier :-) +#if defined(__GNUC__) || defined(__INTEL_COMPILER) +#define MOODYCAMEL_THREADLOCAL __thread +#elif defined(_MSC_VER) +#define MOODYCAMEL_THREADLOCAL __declspec(thread) +#else +// Assume C++11 compliant compiler +#define MOODYCAMEL_THREADLOCAL thread_local +#endif +namespace moodycamel { namespace details { + typedef std::uintptr_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr + static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. + inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast(&x); } +} } +#endif + +// Constexpr if +#ifndef MOODYCAMEL_CONSTEXPR_IF +#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || __cplusplus > 201402L +#define MOODYCAMEL_CONSTEXPR_IF if constexpr +#define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]] +#else +#define MOODYCAMEL_CONSTEXPR_IF if +#define MOODYCAMEL_MAYBE_UNUSED +#endif +#endif + +// Exceptions +#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) +#define MOODYCAMEL_EXCEPTIONS_ENABLED +#endif +#endif +#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED +#define MOODYCAMEL_TRY try +#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) +#define MOODYCAMEL_RETHROW throw +#define MOODYCAMEL_THROW(expr) throw (expr) +#else +#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF (true) +#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF (false) +#define MOODYCAMEL_RETHROW +#define MOODYCAMEL_THROW(expr) +#endif + +#ifndef MOODYCAMEL_NOEXCEPT +#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) +#define MOODYCAMEL_NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 +// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( +// We have to assume *all* non-trivial constructors may throw on VS2012! +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value : std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? 
std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#else +#define MOODYCAMEL_NOEXCEPT noexcept +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) +#endif +#endif + +#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#else +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 +// g++ <=4.7 doesn't support thread_local either. +// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) +// Assume `thread_local` is fully supported in all other C++11 compilers/platforms +//#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // always disabled for now since several users report having problems with it on +#endif +#endif +#endif + +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. 
+#ifndef MOODYCAMEL_DELETE_FUNCTION +#if defined(_MSC_VER) && _MSC_VER < 1800 +#define MOODYCAMEL_DELETE_FUNCTION +#else +#define MOODYCAMEL_DELETE_FUNCTION = delete +#endif +#endif + +namespace moodycamel { namespace details { +#ifndef MOODYCAMEL_ALIGNAS +// VS2013 doesn't support alignas or alignof, and align() requires a constant literal +#if defined(_MSC_VER) && _MSC_VER <= 1800 +#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) +#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) typename details::Vs2013Aligned::value, T>::type + template struct Vs2013Aligned { }; // default, unsupported alignment + template struct Vs2013Aligned<1, T> { typedef __declspec(align(1)) T type; }; + template struct Vs2013Aligned<2, T> { typedef __declspec(align(2)) T type; }; + template struct Vs2013Aligned<4, T> { typedef __declspec(align(4)) T type; }; + template struct Vs2013Aligned<8, T> { typedef __declspec(align(8)) T type; }; + template struct Vs2013Aligned<16, T> { typedef __declspec(align(16)) T type; }; + template struct Vs2013Aligned<32, T> { typedef __declspec(align(32)) T type; }; + template struct Vs2013Aligned<64, T> { typedef __declspec(align(64)) T type; }; + template struct Vs2013Aligned<128, T> { typedef __declspec(align(128)) T type; }; + template struct Vs2013Aligned<256, T> { typedef __declspec(align(256)) T type; }; +#else + template struct identity { typedef T type; }; +#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) +#define MOODYCAMEL_ALIGNOF(obj) alignof(obj) +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) alignas(alignof(obj)) typename details::identity::type +#endif +#endif +} } + + +// TSAN can false report races in lock-free code. To enable TSAN to be used from projects that use this one, +// we can apply per-function compile-time suppression. +// See https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer +#define MOODYCAMEL_NO_TSAN +#if defined(__has_feature) + #if __has_feature(thread_sanitizer) + #undef MOODYCAMEL_NO_TSAN + #define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) + #endif // TSAN +#endif // TSAN + +// Compiler-specific likely/unlikely hints +namespace moodycamel { namespace details { +#if defined(__GNUC__) + static inline bool (likely)(bool x) { return __builtin_expect((x), true); } + static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); } +#else + static inline bool (likely)(bool x) { return x; } + static inline bool (unlikely)(bool x) { return x; } +#endif +} } + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG +#include "internal/concurrentqueue_internal_debug.h" +#endif + +namespace moodycamel { +namespace details { + template + struct const_numeric_max { + static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); + static const T value = std::numeric_limits::is_signed + ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) + : static_cast(-1); + }; + +#if defined(__GLIBCXX__) + typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +#else + typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +#endif + + // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting + // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. 
+ typedef union { + std_max_align_t x; + long long y; + void* z; + } max_align_t; +} + +// Default traits for the ConcurrentQueue. To change some of the +// traits without re-implementing all of them, inherit from this +// struct and shadow the declarations you wish to be different; +// since the traits are used as a template type parameter, the +// shadowed declarations will be used where defined, and the defaults +// otherwise. +struct ConcurrentQueueDefaultTraits +{ + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few producers + // and/or many elements, a larger block size is preferred. A sane default + // is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per element. + // For large block sizes, this is too inefficient, and switching to an atomic + // counter-based approach is faster. The switch is made for block sizes strictly + // larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit producers. + // Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit production + // (using the enqueue methods without an explicit producer token) is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a token) + // must consume before it causes all consumers to rotate and move on to the next + // internal queue. + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. + // Enqueue operations that would cause this limit to be surpassed will fail. Note + // that this limit is enforced at the block level (for performance reasons), i.e. 
+ // it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; + + // The number of times to spin before sleeping when waiting on a semaphore. + // Recommended values are on the order of 1000-10000 unless the number of + // consumer threads exceeds the number of idle cores (in which case try 0-100). + // Only affects instances of the BlockingConcurrentQueue. + static const int MAX_SEMA_SPINS = 10000; + + +#ifndef MCDBGQ_USE_RELACY + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like std::malloc. +#if defined(malloc) || defined(free) + // Gah, this is 2015, stop defining macros that break standard code already! + // Work around malloc/free being special macros: + static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void* ptr) { return free(ptr); } + static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } +#else + static inline void* malloc(size_t size) { return std::malloc(size); } + static inline void free(void* ptr) { return std::free(ptr); } +#endif +#else + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } + static inline void free(void* ptr) { return rl::rl_free(ptr, $); } +#endif +}; + + +// When producing or consuming many elements, the most efficient way is to: +// 1) Use one of the bulk-operation methods of the queue with a token +// 2) Failing that, use the bulk-operation methods without a token +// 3) Failing that, create a token and use that with the single-item methods +// 4) Failing that, use the single-parameter methods of the queue +// Having said that, don't create tokens willy-nilly -- ideally there should be +// a maximum of one token per thread (of each kind). 
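+// As a concrete illustration of the points above (a minimal sketch only -- the
+// queue name `q`, the traits type `MyTraits`, and the element type `int` are
+// arbitrary choices for the example, not anything this header defines):
+//
+//     // Shadow just the traits you want to change, as described above.
+//     struct MyTraits : public moodycamel::ConcurrentQueueDefaultTraits {
+//         static const size_t BLOCK_SIZE = 256;   // must remain a power of 2
+//     };
+//
+//     moodycamel::ConcurrentQueue<int, MyTraits> q;
+//
+//     // 1) Bulk operations with tokens (fastest)
+//     moodycamel::ProducerToken ptok(q);
+//     moodycamel::ConsumerToken ctok(q);
+//     int in[8] = {0, 1, 2, 3, 4, 5, 6, 7}, out[8];
+//     q.enqueue_bulk(ptok, in, 8);
+//     size_t n = q.try_dequeue_bulk(ctok, out, 8);
+//
+//     // 2) Bulk operations without a token
+//     q.enqueue_bulk(in, 8);
+//
+//     // 3) Single-item operations with a token
+//     q.enqueue(ptok, 42);
+//
+//     // 4) Single-item operations without a token
+//     int item;
+//     q.enqueue(42);
+//     q.try_dequeue(item);
+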
+struct ProducerToken; +struct ConsumerToken; + +template class ConcurrentQueue; +template class BlockingConcurrentQueue; +class ConcurrentQueueTests; + + +namespace details +{ + struct ConcurrentQueueProducerTypelessBase + { + ConcurrentQueueProducerTypelessBase* next; + std::atomic inactive; + ProducerToken* token; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr) + { + } + }; + + template struct _hash_32_or_64 { + static inline std::uint32_t hash(std::uint32_t h) + { + // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is propagate that + // uniqueness evenly across all the bits, so that we can use a subset of the bits while + // reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } + }; + template<> struct _hash_32_or_64<1> { + static inline std::uint64_t hash(std::uint64_t h) + { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } + }; + template struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> { }; + + static inline size_t hash_thread_id(thread_id_t id) + { + static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( + thread_id_converter::prehash(id))); + } + + template + static inline bool circular_less_than(T a, T b) + { +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4554) +#endif + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); + return static_cast(a - b) > static_cast(static_cast(1) << static_cast(sizeof(T) * CHAR_BIT - 1)); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + } + + template + static inline char* align_for(char* ptr) + { + const std::size_t alignment = std::alignment_of::value; + return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; + } + + template + static inline T ceil_to_pow_2(T x) + { + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; + } + + template + static inline void swap_relaxed(std::atomic& left, std::atomic& right) + { + T temp = std::move(left.load(std::memory_order_relaxed)); + left.store(std::move(right.load(std::memory_order_relaxed)), std::memory_order_relaxed); + right.store(std::move(temp), std::memory_order_relaxed); + } + + template + static inline T const& nomove(T const& x) + { + return x; + } + + template + struct nomove_if + { + template + static inline T const& eval(T const& x) + { + return x; + } + }; + + template<> + struct nomove_if + { + template + static inline auto eval(U&& x) + -> decltype(std::forward(x)) + { + return std::forward(x); + } + }; + + template + static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) + { + return *it; + } + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + template struct is_trivially_destructible : std::is_trivially_destructible { }; 
+#else + template struct is_trivially_destructible : std::has_trivial_destructor { }; +#endif + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY + typedef RelacyThreadExitListener ThreadExitListener; + typedef RelacyThreadExitNotifier ThreadExitNotifier; +#else + struct ThreadExitListener + { + typedef void (*callback_t)(void*); + callback_t callback; + void* userData; + + ThreadExitListener* next; // reserved for use by the ThreadExitNotifier + }; + + + class ThreadExitNotifier + { + public: + static void subscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + listener->next = tlsInst.tail; + tlsInst.tail = listener; + } + + static void unsubscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + ThreadExitListener** prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { + if (ptr == listener) { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } + + private: + ThreadExitNotifier() : tail(nullptr) { } + ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() + { + // This thread is about to exit, let everyone know! + assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier& instance() + { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + + private: + ThreadExitListener* tail; + }; +#endif +#endif + + template struct static_is_lock_free_num { enum { value = 0 }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; + template struct static_is_lock_free : static_is_lock_free_num::type> { }; + template<> struct static_is_lock_free { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; + template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; +} + + +struct ProducerToken +{ + template + explicit ProducerToken(ConcurrentQueue& queue); + + template + explicit ProducerToken(BlockingConcurrentQueue& queue); + + ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // 
that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. + inline bool valid() const { return producer != nullptr; } + + ~ProducerToken() + { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +protected: + details::ConcurrentQueueProducerTypelessBase* producer; +}; + + +struct ConsumerToken +{ + template + explicit ConsumerToken(ConcurrentQueue& q); + + template + explicit ConsumerToken(BlockingConcurrentQueue& q); + + ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase* currentProducer; + details::ConcurrentQueueProducerTypelessBase* desiredProducer; +}; + +// Need to forward-declare this swap because it's in a namespace. 
+// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; + + +template +class ConcurrentQueue +{ +public: + typedef ::moodycamel::ProducerToken producer_token_t; + typedef ::moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) +#pragma warning(disable: 4309) // static_cast: Truncation of constant value +#endif + static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max::value : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); + static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)"); + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size 
(e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. 
+ ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) { + auto prev = hash->prev; + if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically + for (size_t i = 0; i != hash->capacity; ++i) { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). + ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + // Move the other one into this, and leave the other one as an empty queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + { + return 
swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + ConcurrentQueue& swap_internal(ConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); +#endif + + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. + // Thread-safe. 
+ template + bool enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(U& item) + { + // Instead of simply trying each producer in turn (which could cause needless contention on the first + // producer), we score them heuristically. 
+ size_t nonEmptyCount = 0; + ProducerBase* best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { + auto size = ptr->size_approx(); + if (size > 0) { + if (size > bestSize) { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (nonEmptyCount > 0) { + if ((details::likely)(best->dequeue(item))) { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr != best && ptr->dequeue(item)) { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // This differs from the try_dequeue(item) method in that this one does + // not attempt to reduce contention by interleaving the order that producer + // streams are dequeued from. So, using this method can reduce overall throughput + // under contention, but will give more predictable results in single-threaded + // consumer scenarios. This is mostly only useful for internal unit tests. + // Never allocates. Thread-safe. + template + bool try_dequeue_non_interleaved(U& item) + { + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->dequeue(item)) { + return true; + } + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
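+	// Illustrative usage sketch (not part of the upstream header); `process`
+	// stands in for whatever the consumer does with each item:
+	//
+	//     moodycamel::ConcurrentQueue<int> q;
+	//     moodycamel::ConsumerToken ctok(q);
+	//     int item;
+	//     while (q.try_dequeue(ctok, item)) {
+	//         process(item);
+	//     }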
+ template + bool try_dequeue(consumer_token_t& token, U& item) + { + // The idea is roughly as follows: + // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less + // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place + // If there's no items where you're supposed to be, keep moving until you find a producer with some items + // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it + + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return false; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (static_cast(token.currentProducer)->dequeue(item)) { + if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return true; + } + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + if (ptr->dequeue(item)) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 1; + return true; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + count += ptr->dequeue_bulk(itemFirst, max - count); + if (count == max) { + break; + } + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
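+	// Illustrative usage sketch (not part of the upstream header), reusing the
+	// consumer token from the example above; `process` is again hypothetical:
+	//
+	//     int buffer[64];
+	//     size_t n = q.try_dequeue_bulk(ctok, buffer, 64);
+	//     for (size_t i = 0; i != n; ++i) {
+	//         process(buffer[i]);
+	//     }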
+ template + size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) + { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner queue. + // Returns the number of items actually dequeued. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns 0 if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) + { + return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
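+	// Illustrative usage sketch (not part of the upstream header): since the
+	// check is static, it can be made once at startup to decide whether to fall
+	// back to a lock-based queue on platforms without lock-free atomics:
+	//
+	//     if (!moodycamel::ConcurrentQueue<int>::is_lock_free()) {
+	//         // e.g. log a warning or switch to a mutex-protected std::queue
+	//     }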
+ static bool is_lock_free() + { + return + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; + } + + +private: + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode { CanAlloc, CannotAlloc }; + + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const& token, U&& element) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue(U&& element) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t& token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. + // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode + { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. 
Not the fastest thing in the world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly + // speedy under low contention. + template // N must inherit FreeListNode or have the same fields (and initialization of them) + struct FreeList + { + FreeList() : freeListHead(nullptr) { } + FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } + void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } + + FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N* node) + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! + add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which means we can read the + // next and not worry about it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) + N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run + // only one copy of this method per node at a time, i.e. 
the single thread case), then we know + // we can safely change the next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy contention, when the refcount + // goes to zero in between a load and a refcount increment of a node in try_get, then back up to + // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount and leave the add operation to + // the next thread who puts the refcount back at zero (which could be us, hence the loop). + auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugMutex mutex; +#endif + }; + + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; + + struct Block + { + Block() + : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), shouldBeOnFreeList(false), dynamicallyAllocated(true) + { +#ifdef MCDBGQ_TRACKMEM + owner = nullptr; +#endif + } + + template + inline bool is_empty() const + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit context) + template + inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). 
+ // Returns true if the block is now empty (does not apply in explicit context). + template + inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + template + inline void reset_empty() + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + + private: + static_assert(std::alignment_of::value <= sizeof(T), "The queue does not support types with an alignment greater than their size at this time"); + MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; + public: + Block* next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; + public: + std::atomic freeListRefs; + std::atomic freeListNext; + std::atomic shouldBeOnFreeList; + bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' + +#ifdef MCDBGQ_TRACKMEM + void* owner; +#endif + }; + static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + + +#ifdef MCDBGQ_TRACKMEM +public: + struct MemStats; +private: +#endif + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : + tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + isExplicit(isExplicit_), + parent(parent_) + { + } + + virtual ~ProducerBase() { } + + template + inline bool dequeue(U& element) + { + if (isExplicit) { + return static_cast(this)->dequeue(element); + } + else { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It& itemFirst, size_t max) + { + if (isExplicit) { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + else { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + } + + inline ProducerBase* next_prod() const { return static_cast(next); } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) ? static_cast(tail - head) : 0; + } + + inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block* tailBlock; + + public: + bool isExplicit; + ConcurrentQueue* parent; + + protected: +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue* parent_) : + ProducerBase(parent_, true), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
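+			// Added clarifying note (not in the upstream source): "no halfways" means that
+			// by the time the destructor runs, headIndex and tailIndex are final and every
+			// slot in [headIndex, tailIndex) still holds a fully constructed element, so a
+			// single walk over the owned blocks invoking ~T() on that range is sufficient.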
+ if (this->tailBlock != nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is partially dequeued + // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); + while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index + auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + if (block->dynamicallyAllocated) { + destroy(block); + } + else { + this->parent->add_block_to_free_list(block); + } + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::template reset_empty(); + + // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the + // last block from it first -- except instead of removing then adding, we can just overwrite). + // Note that there must be a valid block index here, since even if allocation failed in the ctor, + // it would have been re-attempted when adding the first block to the queue; since there is such + // a block, a block index must have been successfully allocated. 
+ } + else { + // Whatever head value we see here is >= the last value we saw here (relatively), + // and <= its current value. Since we have the most recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) + || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + // We can't enqueue in another block because there's not enough leeway -- the + // tail could surpass the head by the time the block fills up! (Or we'll exceed + // the size limit, if the second part of the condition was true.) + return false; + } + // We're going to need a new block; check that the block index has room + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if + // the initial allocation failed in the constructor. + + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index(pr_blockIndexSlotsUsed)) { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + // The constructor may throw. We want the element not to appear in the queue in + // that case (without corrupting the queue): + MOODYCAMEL_TRY { + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) { + // Revert change to the current block, but leave the new block available + // for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
this->tailBlock : startBlock; + MOODYCAMEL_RETHROW; + } + } + else { + (void)startBlock; + (void)originalBlockIndexSlotsUsed; + } + + // Add block to block index + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + // Might be something to dequeue, let's give it a try + + // Note that this if is purely for performance purposes in the common case when the queue is + // empty and the values are eventually consistent -- we may enter here spuriously. + + // Note that whatever the values of overcommit and tail are, they are not going to change (unless we + // change them) and must be the same value at this point (inside the if) as when the if condition was + // evaluated. + + // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. + // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in + // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). + // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all + // read-modify-write operations are guaranteed to work on the latest value in the modification order), but + // unfortunately that can't be shown to be correct using only the C++11 standard. + // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case + std::atomic_thread_fence(std::memory_order_acquire); + + // Increment optimistic counter, then check if it went over the boundary + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + + // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever + // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now + // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon + // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. + // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) + // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. + + // Note that we reload tail here in case it changed; it will be the same value as before or greater, since + // this load is sequenced after (happens after) the earlier load above. 
This is supported by read-read + // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + // Guaranteed to be at least one element to dequeue! + + // Get the index. Note that since there's guaranteed to be at least one element, this + // will never exceed tail. We need to do an acquire-release fence here since it's possible + // that whatever condition got us to this point was for an earlier enqueued element (that + // we already see the memory effects for), but that by the time we increment somebody else + // has incremented it, and we need to see the memory effects for *that* element, which is + // in such a case is necessarily visible on the thread that incremented it in the first + // place with the more current condition (they must have acquired a tail that is at least + // as recent). + auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + + // Determine which block the element is in + + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + // We need to be careful here about subtracting and dividing because of index wrap-around. + // When an index wraps, we need to preserve the sign of the offset when dividing it by the + // block size (in order to get a correct signed block count offset in all cases): + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(blockBaseIndex - headBase) / BLOCK_SIZE); + auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; + + // Dequeue + auto& el = *((*block)[index]); + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { + // Make sure the element is still fully dequeued and destroyed even if the assignment + // throws + struct Guard { + Block* block; + index_t index; + + ~Guard() + { + (*block)[index]->~T(); + block->ConcurrentQueue::Block::template set_empty(index); + } + } guard = { block, index }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + block->ConcurrentQueue::Block::template set_empty(index); + } + + return true; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write + } + } + + return false; + } + + template + bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). 
+ index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block* firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, so we need to + // update our fallback value too (since we keep the new index even if we + // later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and + // publish the new block index front + auto block = firstAllocatedBlock; + while (true) { + block->ConcurrentQueue::Block::template reset_empty(); + if (block == this->tailBlock) { + break; + } + block = block->next; + } + + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + // Must use copy constructor even if move constructor is available + // because we may have to revert if there's an exception. + // Sorry about the horrible templated next line, but it was the only way + // to disable moving *at compile time*, which is important because a type + // may only define a (noexcept) move constructor, and so calls to the + // cctor will not compile, even if they are in an if branch that will never + // be executed + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + // Oh dear, an exception's been thrown -- destroy the elements that + // were enqueued so far and revert the entire bulk operation (we'll keep + // any allocated blocks in our linked list for later, though). + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + if (firstAllocatedBlock != nullptr) + blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / BLOCK_SIZE); + auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) { + // It's too late to revert the dequeue, but we can make sure that all + // the dequeued objects are properly destroyed and the block index + // (and empty count) are properly updated before we propagate the exception + do { + block = localBlockIndex->entries[indexIndex].block; + while (index != endIndex) { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block* block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer 
only -- consumer must use the ones in referenced by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry* pr_blockIndexEntries; + void* pr_blockIndexRaw; + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ExplicitProducer* nextExplicitProducer; + private: +#endif + +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase + { + ImplicitProducer(ConcurrentQueue* parent_) : + ProducerBase(parent_, false), + nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), + blockIndex(nullptr) + { + new_block_index(); + } + + ~ImplicitProducer() + { + // Note that since we're in the destructor we can assume that all enqueue/dequeue operations + // completed already; this means that all undequeued elements are placed contiguously across + // contiguous blocks, and that only the first and last remaining blocks can be only partially + // empty (all other remaining blocks must be completely full). + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block* block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = index != tail; // If we enter the loop, then the last (tail) block will not be freed + while (index != tail) { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { + if (block != nullptr) { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on the free list + // (unless the head index reached the end of it, in which case the tail will be poised + // to create a new block). 
+ if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + return false; + } +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry; + if (!insert_block_index_entry(idxEntry, currentTailIndex)) { + return false; + } + + // Get ahold of a new block + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + // May throw, try to insert now before we publish the fact that we have this new block + MOODYCAMEL_TRY { + new ((*newBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) 
{ + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto& el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + // Note: Acquiring the mutex with every dequeue instead of only when a block + // is released is very sub-optimal, but it is, after all, purely debug code. + debug::DebugLock lock(producer->mutex); +#endif + struct Guard { + Block* block; + index_t index; + BlockIndexEntry* entry; + ConcurrentQueue* parent; + + ~Guard() + { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block::template set_empty(index)) { + entry->value.store(nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = { block, index, entry, this->parent }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty(index)) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool (and remove from block index) + entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + } + + return true; + } + else { + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); + } + } + + return false; + } + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4706) // assignment within conditional expression +#endif + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). 
+ + // Note that the tailBlock we start off with may not be owned by us any more; + // this happens if it was filled up exactly to the top (setting tailIndex to + // the first index of the next block which is not yet allocated), then dequeued + // completely (putting it on the free list) before we enqueue again. + + index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block* firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + do { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't always tell + Block* newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + + if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == nullptr) { + // Index allocation or block allocation failed; revert any other allocations + // and index insertions done so far for this operation + if (indexInserted) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + } + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later allocations fail, + // and so that we can find the blocks when we do the actual enqueueing + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
newBlock : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? 
desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader* localBlockIndex; + auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); + do { + auto blockStartIndex = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) { + do { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load(std::memory_order_relaxed); + while (index != endIndex) { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + entry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(block); + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a release, meaning that anybody who acquires the block + // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
+ entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry + { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader + { + size_t capacity; + std::atomic tail; + BlockIndexEntry* entries; + BlockIndexEntry** index; + BlockIndexHeader* prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK + if (localBlockIndex == nullptr) { + return false; // this can happen if new_block_index failed in the constructor + } + size_t newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) { + + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index()) { + return false; + } + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + inline void rewind_block_index_tail() + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed); + } + + inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const + { + BlockIndexHeader* localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may wrap around, causing a negative + // offset, whose negativity we want to preserve + auto offset = static_cast(static_cast::type>(index - tailBase) / BLOCK_SIZE); + size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); + 
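The note above about using division rather than a shift deserves a concrete illustration: when the requested element lives in a block registered before the current block-index tail, index - tailBase wraps to a large unsigned value, and only reinterpreting it as signed and dividing recovers the intended negative block offset. A small self-contained check with made-up index values (not part of the header):

#include <cassert>
#include <cstdint>
#include <type_traits>

int main() {
    typedef std::uint32_t index_t;                 // plays the role of the queue's index_t
    typedef std::make_signed<index_t>::type sindex_t;
    const index_t BLOCK_SIZE = 32;

    index_t tailBase = 96;                         // base index stored at the block-index tail
    index_t index    = 32;                         // element living two blocks behind the tail

    // index - tailBase wraps to a large unsigned value; dividing it as a signed
    // quantity recovers the intended negative offset, while a plain shift does not.
    sindex_t offsetDiv = static_cast<sindex_t>(index - tailBase) / static_cast<sindex_t>(BLOCK_SIZE);
    index_t  offsetShr = (index - tailBase) >> 5;  // >> 5 == / 32, but stays unsigned

    assert(offsetDiv == -2);                       // walks backwards from the tail, as intended
    assert(offsetShr == 134217726u);               // a huge bogus forward offset
    (void)offsetDiv; (void)offsetShr;
    return 0;
}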
assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() + { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); + if (raw == nullptr) { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast(details::align_for(reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + public: + details::ThreadExitListener threadExitListener; + private: +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ImplicitProducer* nextImplicitProducer; + private: +#endif + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + mutable debug::DebugMutex mutex; +#endif +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { +#ifdef MCDBGQ_TRACKMEM + block->owner = nullptr; +#endif + freeList.add(block); + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() + { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one (if applicable) + template + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + MOODYCAMEL_CONSTEXPR_IF (canAlloc == CanAlloc) { + return create(); + } + else { + return nullptr; + } + } + + +#ifdef MCDBGQ_TRACKMEM + public: + struct MemStats { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue* q) + { + MemStats stats = { 0 }; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 
0 : 1; + + if (implicit) { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) { + for (size_t i = 0; i != hash->capacity; ++i) { + if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) { + stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); + } + } + for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { + //auto block = prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } + else { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) { + auto block = tailBlock; + do { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) { + stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); + index = static_cast(index->prev); + } + } + } + + auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats() + { + return MemStats::getFor(this); + } + private: + friend struct MemStats; +#endif + + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer(bool isExplicit) + { + bool recycled; + return recycle_or_create_producer(isExplicit, recycled); + } + + ProducerBase* recycle_or_create_producer(bool isExplicit, bool& recycled) + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have it + recycled = true; + return ptr; + } + } + } + + recycled = false; + return add_producer(isExplicit ? 
static_cast(create(this)) : create(this)); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + if (producer->isExplicit) { + auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextExplicitProducer = prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } + else { + auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextImplicitProducer = prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! + for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP + { + std::atomic key; + ImplicitProducer* value; // No need for atomicity since it's only read by the thread that sets it in the first place + + ImplicitProducerKVP() : value(nullptr) { } + + ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT + { + if (this != &other) { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void moodycamel::swap(typename ConcurrentQueue::ImplicitProducerKVP&, typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash + { + size_t capacity; + ImplicitProducerKVP* entries; + ImplicitProducerHash* prev; + }; + + inline void populate_initial_implicit_producer_hash() + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { + initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store(hash, std::memory_order_relaxed); + } + } + + void swap_implicit_producer_hashes(ConcurrentQueue& other) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + // Swap (assumes our implicit producer hash is 
initialized) + initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { + implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); + } + else { + ImplicitProducerHash* hash; + for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { + other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); + } + else { + ImplicitProducerHash* hash; + for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer* get_or_add_implicit_producer() + { + // Note that since the data is essentially thread-local (key is thread ID), + // there's a reduced need for fences (memory ordering is already consistent + // for any individual thread), except for the current table itself. + + // Start by looking for the thread ID in the current and all previous hash tables. + // If it's not found, it must not be in there yet, since this same thread would + // have added it previously to one of the tables that we traversed. + + // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { + // Look for the id in this hash + auto index = hashedId; + while (true) { // Not an infinite loop because at least one slot is free in the hash table + index &= hash->capacity - 1; + + auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + // Found it! If we had to search several hashes deep, though, we should lazily add it + // to the current main hash table to avoid the extended search next time. + // Note there's guaranteed to be room in the current hash table since every subsequent + // table implicitly reserves space for all previous tables (there's only one + // implicitProducerHashCount). 
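The probing scheme spelled out in the comments above — hash the thread id, probe linearly, claim an empty slot with a compare-exchange, and keep the value non-atomic because only the owning thread ever reads it — is the same core idea as the Preshing lock-free hash table the code credits. A stripped-down sketch of just that mechanism, with invented names and a fixed capacity (this is not the queue's actual implementation):

#include <atomic>
#include <cstddef>
#include <cstdint>

// Fixed-capacity id -> pointer map with linear probing; capacity is a power of
// two and the table is assumed never to fill. Id 0 is reserved as the "empty"
// marker, much as the queue reserves details::invalid_thread_id. As in the
// queue's implicit-producer hash, a given id is only inserted and later looked
// up by the thread that owns it, so the non-atomic values[] array never races.
struct TinyIdMap {
    static const std::size_t kCapacity = 64;
    static const std::uint64_t kEmpty = 0;
    std::atomic<std::uint64_t> keys[kCapacity];
    void* values[kCapacity];

    TinyIdMap() {
        for (std::size_t i = 0; i != kCapacity; ++i) {
            keys[i].store(kEmpty, std::memory_order_relaxed);
            values[i] = nullptr;
        }
    }

    void insert(std::uint64_t id, void* value) {
        std::size_t index = static_cast<std::size_t>(id * 0x9E3779B97F4A7C15ull); // cheap mix
        for (;;) {
            index &= kCapacity - 1;
            std::uint64_t probed = keys[index].load(std::memory_order_relaxed);
            if (probed == id) { values[index] = value; return; }    // slot already ours
            if (probed == kEmpty) {
                std::uint64_t expected = kEmpty;
                if (keys[index].compare_exchange_strong(expected, id, std::memory_order_relaxed)) {
                    values[index] = value;                           // claimed the slot
                    return;
                }
                if (expected == id) { values[index] = value; return; }
                // otherwise another thread claimed it for a different id; keep probing
            }
            ++index;
        }
    }

    void* find(std::uint64_t id) {
        std::size_t index = static_cast<std::size_t>(id * 0x9E3779B97F4A7C15ull);
        for (;;) {
            index &= kCapacity - 1;
            std::uint64_t probed = keys[index].load(std::memory_order_relaxed);
            if (probed == id) return values[index];
            if (probed == kEmpty) return nullptr;                    // our id was never inserted
            ++index;
        }
    }
};

int main() {
    TinyIdMap map;
    int producerState = 42;
    map.insert(12345 /* pretend thread id */, &producerState);
    return map.find(12345) == &producerState ? 0 : 1;
}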
+ auto value = hash->entries[index].value; + if (hash != mainHash) { + index = hashedId; + while (true) { + index &= mainHash->capacity - 1; + probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed); + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed)) || + (probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) { +#else + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed))) { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); + while (true) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { + // We've acquired the resize lock, try to allocate a bigger hash table. + // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when + // we reload implicitProducerHash it must be the most recent version (it only gets changed within this + // locked block). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) { + auto newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) { + newCapacity <<= 1; + } + auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) { + // Allocation failed + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = static_cast(newCapacity); + newHash->entries = reinterpret_cast(details::align_for(raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, std::memory_order_release); + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + mainHash = newHash; + } + else { + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table + // to finish being allocated by another thread (and if we just finished allocating above, the condition will + // always be true) + if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { + bool recycled; + auto producer = static_cast(recycle_or_create_producer(false, recycled)); + if (producer == nullptr) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + return nullptr; + } + if (recycled) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + } + +#ifdef 
MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + details::ThreadExitNotifier::subscribe(&producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) { + index &= mainHash->capacity - 1; + auto probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed); + + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed)) || + (probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) { +#else + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed))) { +#endif + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy allocating a new one. + // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, + // we try to allocate ourselves). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + void implicit_producer_thread_exited(ImplicitProducer* producer) + { + // Remove from thread exit listeners + details::ThreadExitNotifier::unsubscribe(&producer->threadExitListener); + + // Remove from hash +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + auto hash = implicitProducerHash.load(std::memory_order_acquire); + assert(hash != nullptr); // The thread exit listener is only registered if we were added to a hash in the first place + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + details::thread_id_t probedKey; + + // We need to traverse all the hashes just in case other threads aren't on the current one yet and are + // trying to add an entry thinking there's a free slot (because they reused a producer) + for (; hash != nullptr; hash = hash->prev) { + auto index = hashedId; + do { + index &= hash->capacity - 1; + probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + hash->entries[index].key.store(details::invalid_thread_id2, std::memory_order_release); + break; + } + ++index; + } while (probedKey != details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place + } + + // Mark the queue as being recyclable + producer->inactive.store(true, std::memory_order_release); + } + + static void implicit_producer_thread_exited_callback(void* userData) + { + auto producer = static_cast(userData); + auto queue = producer->parent; + queue->implicit_producer_thread_exited(producer); + } +#endif + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template + static inline void* aligned_malloc(size_t size) + { + MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) + return (Traits::malloc)(size); + else { + size_t alignment = std::alignment_of::value; + void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*)); + if (!raw) + return nullptr; + char* 
ptr = details::align_for(reinterpret_cast(raw) + sizeof(void*)); + *(reinterpret_cast(ptr) - 1) = raw; + return ptr; + } + } + + template + static inline void aligned_free(void* ptr) + { + MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) + return (Traits::free)(ptr); + else + (Traits::free)(ptr ? *(reinterpret_cast(ptr) - 1) : nullptr); + } + + template + static inline U* create_array(size_t count) + { + assert(count > 0); + U* p = static_cast(aligned_malloc(sizeof(U) * count)); + if (p == nullptr) + return nullptr; + + for (size_t i = 0; i != count; ++i) + new (p + i) U(); + return p; + } + + template + static inline void destroy_array(U* p, size_t count) + { + if (p != nullptr) { + assert(count > 0); + for (size_t i = count; i != 0; ) + (p + --i)->~U(); + } + aligned_free(p); + } + + template + static inline U* create() + { + void* p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template + static inline U* create(A1&& a1) + { + void* p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) + p->~U(); + aligned_free(p); + } + +private: + std::atomic producerListTail; + std::atomic producerCount; + + std::atomic initialBlockPoolIndex; + Block* initialBlockPool; + size_t initialBlockPoolSize; + +#ifndef MCDBGQ_USEDEBUGFREELIST + FreeList freeList; +#else + debug::DebugFreeList freeList; +#endif + + std::atomic implicitProducerHash; + std::atomic implicitProducerHashCount; // Number of slots logically used + ImplicitProducerHash initialImplicitProducerHash; + std::array initialImplicitProducerHashEntries; + std::atomic_flag implicitProducerHashResizeInProgress; + + std::atomic nextExplicitConsumerId; + std::atomic globalExplicitConsumerOffset; + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugMutex implicitProdMutex; +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + std::atomic explicitProducers; + std::atomic implicitProducers; +#endif +}; + + +template +ProducerToken::ProducerToken(ConcurrentQueue& queue) + : producer(queue.recycle_or_create_producer(true)) +{ + if (producer != nullptr) { + producer->token = this; + } +} + +template +ProducerToken::ProducerToken(BlockingConcurrentQueue& queue) + : producer(reinterpret_cast*>(&queue)->recycle_or_create_producer(true)) +{ + if (producer != nullptr) { + producer->token = this; + } +} + +template +ConsumerToken::ConsumerToken(ConcurrentQueue& queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) +{ + initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); +} + +template +ConsumerToken::ConsumerToken(BlockingConcurrentQueue& queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) +{ + initialOffset = reinterpret_cast*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); +} + +template +inline void swap(ConcurrentQueue& a, ConcurrentQueue& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} + +#if 
defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) +#pragma warning(pop) +#endif + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) +#pragma GCC diagnostic pop +#endif + diff --git a/deps/concurrentqueue/lightweightsemaphore.h b/deps/concurrentqueue/lightweightsemaphore.h new file mode 100644 index 000000000..bb7c7a403 --- /dev/null +++ b/deps/concurrentqueue/lightweightsemaphore.h @@ -0,0 +1,412 @@ +// Provides an efficient implementation of a semaphore (LightweightSemaphore). +// This is an extension of Jeff Preshing's sempahore implementation (licensed +// under the terms of its separate zlib license) that has been adapted and +// extended by Cameron Desrochers. + +#pragma once + +#include // For std::size_t +#include +#include // For std::make_signed + +#if defined(_WIN32) +// Avoid including windows.h in a header; we only need a handful of +// items, so we'll redeclare them here (this is relatively safe since +// the API generally has to remain stable between Windows versions). +// I know this is an ugly hack but it still beats polluting the global +// namespace with thousands of generic names or adding a .cpp for nothing. +extern "C" { + struct _SECURITY_ATTRIBUTES; + __declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName); + __declspec(dllimport) int __stdcall CloseHandle(void* hObject); + __declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds); + __declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount); +} +#elif defined(__MACH__) +#include +#elif defined(__unix__) +#include +#endif + +namespace moodycamel +{ +namespace details +{ + +// Code in the mpmc_sema namespace below is an adaptation of Jeff Preshing's +// portable + lightweight semaphore implementations, originally from +// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h +// LICENSE: +// Copyright (c) 2015 Jeff Preshing +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgement in the product documentation would be +// appreciated but is not required. +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. 
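The classes defined below layer a user-space counter over a kernel primitive: Semaphore wraps the platform semaphore (Win32, Mach, or POSIX), and LightweightSemaphore spins on an atomic count before falling back to it. A minimal usage sketch of the public interface visible in this patch — signal(), wait(), and wait(timeout_usecs) — assuming the header is pulled in the way blockingconcurrentqueue.h includes it and that -I../deps/concurrentqueue is on the include path, as the accompanying Makefile change arranges:

#include <blockingconcurrentqueue.h>  // also pulls in lightweightsemaphore.h
#include <cstdio>
#include <thread>

int main() {
    moodycamel::LightweightSemaphore itemsReady;   // count starts at 0
    int payload[100];
    std::thread producer([&] {
        for (int i = 0; i < 100; ++i) {
            payload[i] = i * i;                    // publish the data first...
            itemsReady.signal();                   // ...then credit the semaphore
        }
    });
    long long total = 0;
    for (int i = 0; i < 100; ) {
        if (itemsReady.wait(1000)) {               // spin briefly, then block up to 1000 usecs
            total += payload[i];                   // safe: the matching signal() followed the write
            ++i;
        }
    }
    producer.join();
    std::printf("total=%lld\n", total);
    return 0;
}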
+#if defined(_WIN32) +class Semaphore +{ +private: + void* m_hSema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + const long maxLong = 0x7fffffff; + m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr); + assert(m_hSema); + } + + ~Semaphore() + { + CloseHandle(m_hSema); + } + + bool wait() + { + const unsigned long infinite = 0xffffffff; + return WaitForSingleObject(m_hSema, infinite) == 0; + } + + bool try_wait() + { + return WaitForSingleObject(m_hSema, 0) == 0; + } + + bool timed_wait(std::uint64_t usecs) + { + return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) == 0; + } + + void signal(int count = 1) + { + while (!ReleaseSemaphore(m_hSema, count, nullptr)); + } +}; +#elif defined(__MACH__) +//--------------------------------------------------------- +// Semaphore (Apple iOS and OSX) +// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html +//--------------------------------------------------------- +class Semaphore +{ +private: + semaphore_t m_sema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + kern_return_t rc = semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount); + assert(rc == KERN_SUCCESS); + (void)rc; + } + + ~Semaphore() + { + semaphore_destroy(mach_task_self(), m_sema); + } + + bool wait() + { + return semaphore_wait(m_sema) == KERN_SUCCESS; + } + + bool try_wait() + { + return timed_wait(0); + } + + bool timed_wait(std::uint64_t timeout_usecs) + { + mach_timespec_t ts; + ts.tv_sec = static_cast(timeout_usecs / 1000000); + ts.tv_nsec = static_cast((timeout_usecs % 1000000) * 1000); + + // added in OSX 10.10: https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html + kern_return_t rc = semaphore_timedwait(m_sema, ts); + return rc == KERN_SUCCESS; + } + + void signal() + { + while (semaphore_signal(m_sema) != KERN_SUCCESS); + } + + void signal(int count) + { + while (count-- > 0) + { + while (semaphore_signal(m_sema) != KERN_SUCCESS); + } + } +}; +#elif defined(__unix__) +//--------------------------------------------------------- +// Semaphore (POSIX, Linux) +//--------------------------------------------------------- +class Semaphore +{ +private: + sem_t m_sema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + int rc = sem_init(&m_sema, 0, static_cast(initialCount)); + assert(rc == 0); + (void)rc; + } + + ~Semaphore() + { + sem_destroy(&m_sema); + } + + bool wait() + { + // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error + int rc; + do { + rc = sem_wait(&m_sema); + } while (rc == -1 && errno == EINTR); + return rc == 0; + } + + bool try_wait() + { + int rc; + do { + rc = sem_trywait(&m_sema); + } while (rc == -1 && errno == EINTR); + return rc == 0; + } + + bool timed_wait(std::uint64_t usecs) + { + struct timespec ts; + const int usecs_in_1_sec = 1000000; + const int nsecs_in_1_sec = 1000000000; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_sec += 
(time_t)(usecs / usecs_in_1_sec); + ts.tv_nsec += (long)(usecs % usecs_in_1_sec) * 1000; + // sem_timedwait bombs if you have more than 1e9 in tv_nsec + // so we have to clean things up before passing it in + if (ts.tv_nsec >= nsecs_in_1_sec) { + ts.tv_nsec -= nsecs_in_1_sec; + ++ts.tv_sec; + } + + int rc; + do { + rc = sem_timedwait(&m_sema, &ts); + } while (rc == -1 && errno == EINTR); + return rc == 0; + } + + void signal() + { + while (sem_post(&m_sema) == -1); + } + + void signal(int count) + { + while (count-- > 0) + { + while (sem_post(&m_sema) == -1); + } + } +}; +#else +#error Unsupported platform! (No semaphore wrapper available) +#endif + +} // end namespace details + + +//--------------------------------------------------------- +// LightweightSemaphore +//--------------------------------------------------------- +class LightweightSemaphore +{ +public: + typedef std::make_signed::type ssize_t; + +private: + std::atomic m_count; + details::Semaphore m_sema; + int m_maxSpins; + + bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1) + { + ssize_t oldCount; + int spin = m_maxSpins; + while (--spin >= 0) + { + oldCount = m_count.load(std::memory_order_relaxed); + if ((oldCount > 0) && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed)) + return true; + std::atomic_signal_fence(std::memory_order_acquire); // Prevent the compiler from collapsing the loop. + } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount > 0) + return true; + if (timeout_usecs < 0) + { + if (m_sema.wait()) + return true; + } + if (timeout_usecs > 0 && m_sema.timed_wait((std::uint64_t)timeout_usecs)) + return true; + // At this point, we've timed out waiting for the semaphore, but the + // count is still decremented indicating we may still be waiting on + // it. So we have to re-adjust the count, but only if the semaphore + // wasn't signaled enough times for us too since then. If it was, we + // need to release the semaphore too. + while (true) + { + oldCount = m_count.load(std::memory_order_acquire); + if (oldCount >= 0 && m_sema.try_wait()) + return true; + if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed)) + return false; + } + } + + ssize_t waitManyWithPartialSpinning(ssize_t max, std::int64_t timeout_usecs = -1) + { + assert(max > 0); + ssize_t oldCount; + int spin = m_maxSpins; + while (--spin >= 0) + { + oldCount = m_count.load(std::memory_order_relaxed); + if (oldCount > 0) + { + ssize_t newCount = oldCount > max ? 
oldCount - max : 0; + if (m_count.compare_exchange_strong(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed)) + return oldCount - newCount; + } + std::atomic_signal_fence(std::memory_order_acquire); + } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount <= 0) + { + if ((timeout_usecs == 0) || (timeout_usecs < 0 && !m_sema.wait()) || (timeout_usecs > 0 && !m_sema.timed_wait((std::uint64_t)timeout_usecs))) + { + while (true) + { + oldCount = m_count.load(std::memory_order_acquire); + if (oldCount >= 0 && m_sema.try_wait()) + break; + if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed)) + return 0; + } + } + } + if (max > 1) + return 1 + tryWaitMany(max - 1); + return 1; + } + +public: + LightweightSemaphore(ssize_t initialCount = 0, int maxSpins = 10000) : m_count(initialCount), m_maxSpins(maxSpins) + { + assert(initialCount >= 0); + assert(maxSpins >= 0); + } + + bool tryWait() + { + ssize_t oldCount = m_count.load(std::memory_order_relaxed); + while (oldCount > 0) + { + if (m_count.compare_exchange_weak(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed)) + return true; + } + return false; + } + + bool wait() + { + return tryWait() || waitWithPartialSpinning(); + } + + bool wait(std::int64_t timeout_usecs) + { + return tryWait() || waitWithPartialSpinning(timeout_usecs); + } + + // Acquires between 0 and (greedily) max, inclusive + ssize_t tryWaitMany(ssize_t max) + { + assert(max >= 0); + ssize_t oldCount = m_count.load(std::memory_order_relaxed); + while (oldCount > 0) + { + ssize_t newCount = oldCount > max ? oldCount - max : 0; + if (m_count.compare_exchange_weak(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed)) + return oldCount - newCount; + } + return 0; + } + + // Acquires at least one, and (greedily) at most max + ssize_t waitMany(ssize_t max, std::int64_t timeout_usecs) + { + assert(max >= 0); + ssize_t result = tryWaitMany(max); + if (result == 0 && max > 0) + result = waitManyWithPartialSpinning(max, timeout_usecs); + return result; + } + + ssize_t waitMany(ssize_t max) + { + ssize_t result = waitMany(max, -1); + assert(result > 0); + return result; + } + + void signal(ssize_t count = 1) + { + assert(count >= 0); + ssize_t oldCount = m_count.fetch_add(count, std::memory_order_release); + ssize_t toRelease = -oldCount < count ? -oldCount : count; + if (toRelease > 0) + { + m_sema.signal((int)toRelease); + } + } + + std::size_t availableApprox() const + { + ssize_t count = m_count.load(std::memory_order_relaxed); + return count > 0 ? 
static_cast(count) : 0; + } +}; + +} // end namespace moodycamel + diff --git a/src/Makefile b/src/Makefile index f6b8ae93d..13f5209f7 100644 --- a/src/Makefile +++ b/src/Makefile @@ -246,7 +246,7 @@ endif endif # Include paths to dependencies FINAL_CFLAGS+= -I../deps/hiredis -I../deps/linenoise -I../deps/lua/src -I../deps/hdr_histogram -I../deps/license/ -FINAL_CXXFLAGS+= -I../deps/hiredis -I../deps/linenoise -I../deps/lua/src -I../deps/hdr_histogram -I../deps/rocksdb/include/ -I../deps/license +FINAL_CXXFLAGS+= -I../deps/hiredis -I../deps/linenoise -I../deps/lua/src -I../deps/hdr_histogram -I../deps/rocksdb/include/ -I../deps/license -I../deps/concurrentqueue # Determine systemd support and/or build preference (defaulting to auto-detection) BUILD_WITH_SYSTEMD=no From d29df021b11ccd5c8ce9281c085f8b62c9889f03 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 4 Oct 2021 07:33:03 +0000 Subject: [PATCH 82/99] Use the concurrentqueue for multithread load instead of mutex and cvs Former-commit-id: d5a59113dbfedaf7b62a650cff58a2e8ec01826f --- src/rdb.cpp | 99 ++++++++++++++++++++++------------------------------ src/server.h | 3 ++ 2 files changed, 45 insertions(+), 57 deletions(-) diff --git a/src/rdb.cpp b/src/rdb.cpp index edbf1ccaa..e5ec4804b 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2639,17 +2639,15 @@ class rdbAsyncWorkThread { rdbSaveInfo *rsi; int rdbflags; - list *listJobs; - std::mutex mutex; - std::condition_variable cv; - std::condition_variable cvThrottle; + moodycamel::BlockingConcurrentQueue queueJobs; fastlock m_lockPause { "rdbAsyncWork-Pause"}; bool fLaunched = false; - bool fExit = false; + std::atomic fExit {false}; std::atomic ckeysLoaded; std::atomic cstorageWritesInFlight; std::atomic workerThreadDone; std::thread m_thread; + std::vector vecbatch; long long now; long long lastPing = -1; @@ -2664,14 +2662,11 @@ public: { ckeysLoaded = 0; cstorageWritesInFlight = 0; - listJobs = listCreate(); - listSetFreeMethod(listJobs, listFreeMethod); } ~rdbAsyncWorkThread() { if (m_thread.joinable()) endWork(); - listRelease(listJobs); } void start() { @@ -2680,26 +2675,24 @@ public: fLaunched = true; } - void throttle(std::unique_lock &l) { - if (listLength(listJobs) > 0 && (listLength(listJobs) % 1024 == 0) && (getMaxmemoryState(NULL,NULL,NULL,NULL) != C_OK)) { - cvThrottle.wait(l); - while (cstorageWritesInFlight.load(std::memory_order_relaxed) && (getMaxmemoryState(NULL,NULL,NULL,NULL) != C_OK)) { - l.unlock(); + void throttle() { + if (g_pserver->m_pstorageFactory && (getMaxmemoryState(NULL,NULL,NULL,NULL) != C_OK)) { + while ((cstorageWritesInFlight.load(std::memory_order_relaxed) || queueJobs.size_approx()) && (getMaxmemoryState(NULL,NULL,NULL,NULL) != C_OK)) { usleep(1); pauseExecution(); ProcessWhileBlocked(); resumeExecution(); - l.lock(); } } } - void enqueue(std::unique_ptr &spjob) { - std::unique_lock l(mutex); - throttle(l); - listAddNodeTail(listJobs, spjob.release()); - if (listLength(listJobs) == 1) - cv.notify_one(); + void enqueue(std::unique_ptr &spjob) { + vecbatch.push_back(spjob.release()); + if (vecbatch.size() >= 64) { + queueJobs.enqueue_bulk(vecbatch.data(), vecbatch.size()); + vecbatch.clear(); + throttle(); + } } void pauseExecution() { @@ -2711,12 +2704,9 @@ public: } void enqueue(std::function &&fn) { - JobBase *pjob = new rdbFunctionJob(std::move(fn)); - std::unique_lock l(mutex); - throttle(l); - listAddNodeTail(listJobs, pjob); - if (listLength(listJobs) == 1) - cv.notify_one(); + std::unique_ptr spjob = std::make_unique(std::move(fn)); + 
queueJobs.enqueue(spjob.release()); + throttle(); } void ProcessWhileBlocked() { @@ -2739,11 +2729,13 @@ public: size_t ckeys() { return ckeysLoaded; } size_t endWork() { - std::unique_lock l(mutex); + if (!vecbatch.empty()) { + queueJobs.enqueue_bulk(vecbatch.data(), vecbatch.size()); + vecbatch.clear(); + } + std::atomic_thread_fence(std::memory_order_seq_cst); // The queue must have transferred to the consumer before we call fExit serverAssert(fLaunched); fExit = true; - cv.notify_one(); - l.unlock(); if (g_pserver->m_pstorageFactory) { // If we have a storage provider it can take some time to complete and we want to process events in the meantime while (!workerThreadDone) { @@ -2760,7 +2752,7 @@ public: } fLaunched = false; fExit = false; - serverAssert(listLength(listJobs) == 0); + serverAssert(queueJobs.size_approx() == 0); return ckeysLoaded; } @@ -2863,40 +2855,35 @@ public: } for (;;) { - std::unique_lock lock(queue.mutex); - if (listLength(queue.listJobs) == 0) { - if (queue.fExit) - break; - queue.cv.wait(lock); - if (listLength(queue.listJobs) == 0 && queue.fExit) + if (queue.queueJobs.size_approx() == 0) { + if (queue.fExit.load(std::memory_order_relaxed)) break; } - pqueue->cvThrottle.notify_one(); - - list *listJobs = queue.listJobs; - queue.listJobs = listCreate(); - listSetFreeMethod(queue.listJobs, listFreeMethod); - lock.unlock(); + + if (queue.fExit.load(std::memory_order_seq_cst) && queue.queueJobs.size_approx() == 0) + break; vars.gcEpoch = g_pserver->garbageCollector.startEpoch(); - while (listLength(listJobs)) { + JobBase *rgjob[64]; + int cjobs = 0; + while ((cjobs = pqueue->queueJobs.wait_dequeue_bulk_timed(rgjob, 64, std::chrono::milliseconds(5))) > 0) { std::unique_lock ulPause(pqueue->m_lockPause); - JobBase *pjobBase = ((JobBase*)listNodeValue(listFirst(listJobs))); - switch (pjobBase->type) - { - case JobBase::JobType::Insert: - pqueue->processJob(*static_cast(pjobBase)); - break; + for (int ijob = 0; ijob < cjobs; ++ijob) { + JobBase *pjob = rgjob[ijob]; + switch (pjob->type) + { + case JobBase::JobType::Insert: + pqueue->processJob(*static_cast(pjob)); + break; - case JobBase::JobType::Function: - static_cast(pjobBase)->m_fn(); - break; + case JobBase::JobType::Function: + static_cast(pjob)->m_fn(); + break; + } + delete pjob; } - // Pop from the list - listDelNode(listJobs, listFirst(listJobs)); } - listRelease(listJobs); g_pserver->garbageCollector.endEpoch(vars.gcEpoch); } @@ -2906,8 +2893,6 @@ public: } queue.workerThreadDone = true; - std::unique_lock lock(queue.mutex); - serverAssert(listLength(queue.listJobs) == 0); ProcessPendingAsyncWrites(); listRelease(vars.clients_pending_asyncwrite); aeSetThreadOwnsLockOverride(false); diff --git a/src/server.h b/src/server.h index 197a7a7be..1153fde23 100644 --- a/src/server.h +++ b/src/server.h @@ -39,6 +39,9 @@ #include "rio.h" #include "atomicvar.h" +#include +#include + #include #include #include From 86ec032e2c0ef3fa6f05bdf68f0a3ad1ec87620d Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 4 Oct 2021 07:33:38 +0000 Subject: [PATCH 83/99] Don't expand the dictionary if a storage provider is set as we won't use the whole thing Former-commit-id: 1f07b01144397cec59ec2d94f41c85eceb7248e2 --- src/rdb.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rdb.cpp b/src/rdb.cpp index e5ec4804b..44fff3244 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -3047,7 +3047,7 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { goto eoferr; if ((expires_size = rdbLoadLen(rdb,NULL)) == 
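The load-path change above replaces the mutex/condition-variable job list with a moodycamel::BlockingConcurrentQueue: the loader batches jobs locally and flushes them with enqueue_bulk, while the worker drains up to 64 at a time with wait_dequeue_bulk_timed so it can periodically re-check its exit flag. A reduced standalone sketch of that pattern follows; it is not the KeyDB code itself — the Job type, batch size, and shutdown handshake are invented for illustration, with variable names kept close to the patch for readability.

#include <blockingconcurrentqueue.h>  // bundled under deps/concurrentqueue (see the Makefile change above)
#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>
#include <vector>

struct Job { int id; };                // stand-in for the loader's insert/function jobs

int main() {
    moodycamel::BlockingConcurrentQueue<Job*> queueJobs;
    std::atomic<bool> fExit{false};
    long long processed = 0;

    std::thread worker([&] {
        Job* rgjob[64];
        for (;;) {
            size_t cjobs = queueJobs.wait_dequeue_bulk_timed(rgjob, 64, std::chrono::milliseconds(5));
            for (size_t i = 0; i < cjobs; ++i) { ++processed; delete rgjob[i]; }
            if (cjobs == 0 && fExit.load(std::memory_order_acquire)) {
                // Producer is done and a timed dequeue came back empty: drain any
                // stragglers flushed just before fExit was set, then stop.
                while ((cjobs = queueJobs.try_dequeue_bulk(rgjob, 64)) > 0)
                    for (size_t i = 0; i < cjobs; ++i) { ++processed; delete rgjob[i]; }
                break;
            }
        }
    });

    std::vector<Job*> vecbatch;        // local batch, flushed in groups of 64
    for (int i = 0; i < 10000; ++i) {
        vecbatch.push_back(new Job{i});
        if (vecbatch.size() >= 64) {
            queueJobs.enqueue_bulk(vecbatch.data(), vecbatch.size());
            vecbatch.clear();
        }
    }
    if (!vecbatch.empty())             // flush the tail before signalling completion
        queueJobs.enqueue_bulk(vecbatch.data(), vecbatch.size());
    fExit.store(true, std::memory_order_release);

    worker.join();
    std::printf("processed %lld jobs\n", processed);
    return 0;
}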
RDB_LENERR) goto eoferr; - if (g_pserver->allowRdbResizeOp) { + if (g_pserver->allowRdbResizeOp && !g_pserver->m_pstorageFactory) { wqueue.enqueue([dbCur, db_size]{ dbCur->expand(db_size); }); From 9d78b8bb080a06bc7d6fba5fb70d9e786400528b Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 4 Oct 2021 07:34:05 +0000 Subject: [PATCH 84/99] Handle the case where the key cache exceeds maxmemory Former-commit-id: 01febf902267fec7fe87e6437b0b81fd08b50963 --- src/StorageCache.cpp | 10 ++++++++++ src/StorageCache.h | 2 ++ src/db.cpp | 14 ++++++++++++++ src/evict.cpp | 13 +++++++++---- src/server.h | 4 ++++ 5 files changed, 39 insertions(+), 4 deletions(-) diff --git a/src/StorageCache.cpp b/src/StorageCache.cpp index ad56a253a..af2ac12b7 100644 --- a/src/StorageCache.cpp +++ b/src/StorageCache.cpp @@ -148,4 +148,14 @@ size_t StorageCache::count() const void StorageCache::beginWriteBatch() { serverAssert(GlobalLocksAcquired()); // Otherwise we deadlock m_spstorage->beginWriteBatch(); +} + +void StorageCache::emergencyFreeCache() { + dict *d = m_pdict; + m_pdict = nullptr; + if (d != nullptr) { + g_pserver->asyncworkqueue->AddWorkFunction([d]{ + dictRelease(d); + }); + } } \ No newline at end of file diff --git a/src/StorageCache.h b/src/StorageCache.h index 9f92f75c0..184eb60bd 100644 --- a/src/StorageCache.h +++ b/src/StorageCache.h @@ -43,6 +43,8 @@ public: void bulkInsert(sds *rgkeys, sds *rgvals, size_t celem); void retrieve(sds key, IStorage::callbackSingle fn) const; bool erase(sds key); + void emergencyFreeCache(); + bool keycacheIsEnabled() const { return m_pdict != nullptr; } bool enumerate(IStorage::callback fn) const { return m_spstorage->enumerate(fn); } diff --git a/src/db.cpp b/src/db.cpp index c883bddba..446657040 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3065,6 +3065,20 @@ void redisDbPersistentData::removeAllCachedValues() } } +void redisDbPersistentData::disableKeyCache() +{ + if (m_spstorage == nullptr) + return; + m_spstorage->emergencyFreeCache(); +} + +bool redisDbPersistentData::keycacheIsEnabled() +{ + if (m_spstorage == nullptr) + return false; + return m_spstorage->keycacheIsEnabled(); +} + void redisDbPersistentData::trackkey(const char *key, bool fUpdate) { if (m_fTrackingChanges && !m_fAllChanged && m_spstorage) { diff --git a/src/evict.cpp b/src/evict.cpp index 84bf21c36..20ebc9058 100644 --- a/src/evict.cpp +++ b/src/evict.cpp @@ -868,10 +868,15 @@ cant_free: redisDb *db = g_pserver->db[idb]; if (db->FStorageProvider()) { - serverLog(LL_WARNING, "Failed to evict keys, falling back to flushing entire cache. Consider increasing maxmemory-samples."); - db->removeAllCachedValues(); - if (((mem_reported - zmalloc_used_memory()) + mem_freed) >= mem_tofree) - result = EVICT_OK; + if (db->size() != 0 && db->size(true /*fcachedOnly*/) == 0 && db->keycacheIsEnabled()) { + serverLog(LL_WARNING, "Key cache exceeds maxmemory, freeing - performance may be affected increase maxmemory if possible"); + db->disableKeyCache(); + } else if (db->size(true /*fCachedOnly*/)) { + serverLog(LL_WARNING, "Failed to evict keys, falling back to flushing entire cache. 
Consider increasing maxmemory-samples."); + db->removeAllCachedValues(); + if (((mem_reported - zmalloc_used_memory()) + mem_freed) >= mem_tofree) + result = EVICT_OK; + } } } } diff --git a/src/server.h b/src/server.h index 1153fde23..f76d73beb 100644 --- a/src/server.h +++ b/src/server.h @@ -1182,6 +1182,8 @@ public: bool FStorageProvider() { return m_spstorage != nullptr; } bool removeCachedValue(const char *key, dictEntry **ppde = nullptr); void removeAllCachedValues(); + void disableKeyCache(); + bool keycacheIsEnabled(); bool prefetchKeysAsync(client *c, struct parsed_command &command, bool fExecOK); @@ -1337,6 +1339,8 @@ struct redisDb : public redisDbPersistentDataSnapshot using redisDbPersistentData::endSnapshot; using redisDbPersistentData::restoreSnapshot; using redisDbPersistentData::removeAllCachedValues; + using redisDbPersistentData::disableKeyCache; + using redisDbPersistentData::keycacheIsEnabled; using redisDbPersistentData::dictUnsafeKeyOnly; using redisDbPersistentData::resortExpire; using redisDbPersistentData::prefetchKeysAsync; From ac22f3c60fb046c68be6cee4cce1af1cbb63ba41 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 4 Oct 2021 07:35:36 +0000 Subject: [PATCH 85/99] disable key cache during load if necessary Former-commit-id: 68dcf66909e2138da4902bdec98985f4fcd737cf --- src/rdb.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/rdb.cpp b/src/rdb.cpp index 44fff3244..ee66ef93e 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2683,6 +2683,16 @@ public: ProcessWhileBlocked(); resumeExecution(); } + + if ((getMaxmemoryState(NULL,NULL,NULL,NULL) != C_OK)) { + for (int idb = 0; idb < cserver.dbnum; ++idb) { + redisDb *db = g_pserver->db[idb]; + if (db->size() > 0 && db->keycacheIsEnabled()) { + serverLog(LL_WARNING, "Key cache %d exceeds maxmemory during load, freeing - performance may be affected increase maxmemory if possible", idb); + db->disableKeyCache(); + } + } + } } } From d57883bf6443b679d22b6942b05af49dab188d22 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 15 Oct 2021 16:22:42 +0000 Subject: [PATCH 86/99] Permit prefetch for FLASH scenarios in single thread mode Former-commit-id: 6d0b90ed43cc9d1196903ddbc7d50cd40e439e42 --- src/networking.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/networking.cpp b/src/networking.cpp index d8b3e9a5d..1638328c0 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -2547,7 +2547,7 @@ void parseClientCommandBuffer(client *c) { } /* Prefetch outside the lock for better perf */ - if (g_pserver->prefetch_enabled && cserver.cthreads > 1 && cqueriesStart < c->vecqueuedcmd.size() && + if (g_pserver->prefetch_enabled && (cserver.cthreads > 1 || g_pserver->m_pstorageFactory) && cqueriesStart < c->vecqueuedcmd.size() && (g_pserver->m_pstorageFactory || aeLockContested(cserver.cthreads/2) || cserver.cthreads == 1) && !GlobalLocksAcquired()) { auto &query = c->vecqueuedcmd.back(); if (query.argc > 0 && query.argc == query.argcMax) { From 43a62493f872874e965b3edbbfaba199c292d675 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 20 Oct 2021 03:13:36 +0000 Subject: [PATCH 87/99] Additional change to ensure FLASH storage goes through the multithread path Former-commit-id: 422ea0723f0b8718f28ef9c1cc4d5f56d374af46 --- src/networking.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/networking.cpp b/src/networking.cpp index 1638328c0..72b2ec59e 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -2694,7 +2694,7 @@ void readQueryFromClient(connection 
*conn) {
         return;
     }
 
-    if (cserver.cthreads > 1) {
+    if (cserver.cthreads > 1 || g_pserver->m_pstorageFactory) {
         parseClientCommandBuffer(c);
         serverTL->vecclientsProcess.push_back(c);
     } else {

From 7b3337d244bf718f0fb15b0db8a194a3ed9dd974 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Thu, 21 Oct 2021 22:46:17 +0000
Subject: [PATCH 88/99] Ensure async rehash completes before we start a new one. Degrade to sync rehash if necessary to ensure this

Former-commit-id: 0f830facc7c6bc6668af9bb2e10b6e13a13227aa
---
 src/server.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/server.cpp b/src/server.cpp
index ef1039cea..f257883e0 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -2112,12 +2112,11 @@ void databasesCron(bool fMainThread) {
     if (g_pserver->activerehashing) {
         for (j = 0; j < dbs_per_call; j++) {
             if (serverTL->rehashCtl != nullptr) {
-                if (dictRehashSomeAsync(serverTL->rehashCtl, 5)) {
+                if (dictRehashSomeAsync(serverTL->rehashCtl, rehashes_per_ms)) {
                     break;
-                } else {
-                    dictCompleteRehashAsync(serverTL->rehashCtl, true /*fFree*/);
-                    serverTL->rehashCtl = nullptr;
-                }
+                }
+                dictCompleteRehashAsync(serverTL->rehashCtl, true /*fFree*/);
+                serverTL->rehashCtl = nullptr;
             }
 
             serverAssert(serverTL->rehashCtl == nullptr);

From 3f089054080d98e8a87d8bdbf2c5f4deba1595a1 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Thu, 21 Oct 2021 23:45:46 +0000
Subject: [PATCH 89/99] Do not dereference a nullptr if there are too many files open

Former-commit-id: 4674eb29a261e8b046953398c94354fc3e550c2a
---
 src/server.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/server.cpp b/src/server.cpp
index f257883e0..1794f51e8 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -3793,8 +3793,6 @@ static void initServerThread(struct redisServerThreadVars *pvar, int fMain)
     pvar->in_eval = 0;
     pvar->in_exec = 0;
     pvar->el = aeCreateEventLoop(g_pserver->maxclients+CONFIG_FDSET_INCR);
-    aeSetBeforeSleepProc(pvar->el, beforeSleep, AE_SLEEP_THREADSAFE);
-    aeSetAfterSleepProc(pvar->el, afterSleep, AE_SLEEP_THREADSAFE);
     pvar->current_client = nullptr;
     pvar->fRetrySetAofEvent = false;
     if (pvar->el == NULL) {
@@ -3803,6 +3801,8 @@ static void initServerThread(struct redisServerThreadVars *pvar, int fMain)
             strerror(errno));
         exit(1);
     }
+    aeSetBeforeSleepProc(pvar->el, beforeSleep, AE_SLEEP_THREADSAFE);
+    aeSetAfterSleepProc(pvar->el, afterSleep, AE_SLEEP_THREADSAFE);
 
     fastlock_init(&pvar->lockPendingWrite, "lockPendingWrite");

From b7b14ef19f28b2e29081cb7695800863ed53fbed Mon Sep 17 00:00:00 2001
From: malavan
Date: Thu, 21 Oct 2021 23:50:50 +0000
Subject: [PATCH 90/99] null check for delete override

Former-commit-id: a48b4cbdf24bfa5a1d13295c0202d1ed09115374
---
 src/new.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/new.cpp b/src/new.cpp
index 4775e207a..4e6b07dfd 100644
--- a/src/new.cpp
+++ b/src/new.cpp
@@ -27,14 +27,17 @@ void *operator new(std::size_t size, const std::nothrow_t &) noexcept
     return zmalloc(size, MALLOC_LOCAL);
 }
 
+//need to do null checks for delete since the compiler can optimize out null checks in zfree
 void operator delete(void * p) noexcept
 {
-    zfree(p);
+    if (p != nullptr)
+        zfree(p);
 }
 
 void operator delete(void *p, std::size_t) noexcept
 {
-    zfree(p);
+    if (p != nullptr)
+        zfree(p);
 }
 #endif

From fbe9ff66804cc8d48e3919ab1e18bea9949b6327 Mon Sep 17 00:00:00 2001
From: malavan
Date: Thu, 21 Oct 2021 23:50:50 +0000
Subject: [PATCH 91/99] null check for delete override

Former-commit-id:
f5f2f5e200a5ff1b0306998624b758d5a4c10825 --- src/new.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/new.cpp b/src/new.cpp index 4775e207a..4e6b07dfd 100644 --- a/src/new.cpp +++ b/src/new.cpp @@ -27,14 +27,17 @@ void *operator new(std::size_t size, const std::nothrow_t &) noexcept return zmalloc(size, MALLOC_LOCAL); } +//need to do null checks for delete since the compiler can optimize out null checks in zfree void operator delete(void * p) noexcept { - zfree(p); + if (p != nullptr) + zfree(p); } void operator delete(void *p, std::size_t) noexcept { - zfree(p); + if (p != nullptr) + zfree(p); } #endif From d41aa34ba38595d8e6e6c2bafd5e2b0a6d7447b9 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 22 Oct 2021 03:16:33 +0000 Subject: [PATCH 92/99] Fix slower performance during replication by better balancing client load on threads servicing a replica Former-commit-id: 496f91d3f169fcfe6d94c2ea69cee402f8eb60ca --- src/networking.cpp | 65 ++++++++++++++++++++++++++------------------- src/replication.cpp | 3 +++ src/server.h | 1 + 3 files changed, 41 insertions(+), 28 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 72b2ec59e..d1ccf28a9 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1144,6 +1144,7 @@ int chooseBestThreadForAccept() int cclientsThread; atomicGet(g_pserver->rgthreadvar[iel].cclients, cclientsThread); cclientsThread += rgacceptsInFlight[iel].load(std::memory_order_relaxed); + cclientsThread *= (g_pserver->rgthreadvar[iel].cclientsReplica+1); if (cclientsThread < cserver.thread_min_client_threshold) return iel; if (cclientsThread < cclientsMin) @@ -1668,6 +1669,7 @@ bool freeClient(client *c) { ln = listSearchKey(l,c); serverAssert(ln != NULL); listDelNode(l,ln); + g_pserver->rgthreadvar[c->iel].cclientsReplica--; /* We need to remember the time when we started to have zero * attached slaves, as after some time we'll free the replication * backlog. */ @@ -1790,36 +1792,43 @@ int writeToClient(client *c, int handler_installed) { is a replica, so only attempt to do so if that's the case. 
*/ if (c->flags & CLIENT_SLAVE && !(c->flags & CLIENT_MONITOR)) { std::unique_lock repl_backlog_lock (g_pserver->repl_backlog_lock); - long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); - serverAssert(c->repl_curr_off != -1); - if (c->repl_curr_off != c->repl_end_off){ - long long repl_curr_idx = getReplIndexFromOffset(c->repl_curr_off); - long long nwritten2ndStage = 0; /* How much was written from the start of the replication backlog - * in the event of a wrap around write */ - /* normal case with no wrap around */ - if (repl_end_idx >= repl_curr_idx){ - nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, repl_end_idx - repl_curr_idx); - /* wrap around case */ + while (clientHasPendingReplies(c)) { + long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); + serverAssert(c->repl_curr_off != -1); + + if (c->repl_curr_off != c->repl_end_off){ + long long repl_curr_idx = getReplIndexFromOffset(c->repl_curr_off); + long long nwritten2ndStage = 0; /* How much was written from the start of the replication backlog + * in the event of a wrap around write */ + /* normal case with no wrap around */ + if (repl_end_idx >= repl_curr_idx){ + nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, repl_end_idx - repl_curr_idx); + /* wrap around case */ + } else { + nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, g_pserver->repl_backlog_size - repl_curr_idx); + /* only attempt wrapping if we write the correct number of bytes */ + if (nwritten == g_pserver->repl_backlog_size - repl_curr_idx){ + nwritten2ndStage = connWrite(c->conn, g_pserver->repl_backlog, repl_end_idx); + if (nwritten2ndStage != -1) + nwritten += nwritten2ndStage; + } + } + + /* only increment bytes if an error didn't occur */ + if (nwritten > 0){ + totwritten += nwritten; + c->repl_curr_off += nwritten; + serverAssert(c->repl_curr_off <= c->repl_end_off); + } + + /* If the second part of a write didn't go through, we still need to register that */ + if (nwritten2ndStage == -1) nwritten = -1; + if (nwritten == -1) + break; } else { - nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, g_pserver->repl_backlog_size - repl_curr_idx); - /* only attempt wrapping if we write the correct number of bytes */ - if (nwritten == g_pserver->repl_backlog_size - repl_curr_idx){ - nwritten2ndStage = connWrite(c->conn, g_pserver->repl_backlog, repl_end_idx); - if (nwritten2ndStage != -1) - nwritten += nwritten2ndStage; - } + break; } - - /* only increment bytes if an error didn't occur */ - if (nwritten > 0){ - totwritten += nwritten; - c->repl_curr_off += nwritten; - serverAssert(c->repl_curr_off <= c->repl_end_off); - } - - /* If the second part of a write didn't go through, we still need to register that */ - if (nwritten2ndStage == -1) nwritten = -1; } } else { while(clientHasPendingReplies(c)) { @@ -2060,7 +2069,7 @@ int handleClientsWithPendingWrites(int iel, int aof_state) { /* Don't write to clients that are going to be closed anyway. */ if (c->flags & CLIENT_CLOSE_ASAP) continue; - /* Try to write buffers to the client socket. 
*/
+            /* Try to write buffers to the client socket, unless it's a replica in multithread mode */
             if (writeToClient(c,0) == C_ERR)
             {
                 if (c->flags & CLIENT_CLOSE_ASAP)
diff --git a/src/replication.cpp b/src/replication.cpp
index 2725a9ce4..91036723a 100644
--- a/src/replication.cpp
+++ b/src/replication.cpp
@@ -889,6 +889,7 @@ int masterTryPartialResynchronization(client *c) {
     c->repl_ack_time = g_pserver->unixtime;
     c->repl_put_online_on_ack = 0;
     listAddNodeTail(g_pserver->slaves,c);
+    g_pserver->rgthreadvar[c->iel].cclientsReplica++;
 
     /* We can't use the connection buffers since they are used to accumulate
      * new commands at this stage. But we are sure the socket send buffer is
@@ -992,6 +993,7 @@ int startBgsaveForReplication(int mincapa) {
             replica->replstate = REPL_STATE_NONE;
             replica->flags &= ~CLIENT_SLAVE;
             listDelNode(g_pserver->slaves,ln);
+            g_pserver->rgthreadvar[replica->iel].cclientsReplica--;
             addReplyError(replica,
                 "BGSAVE failed, replication can't continue");
             replica->flags |= CLIENT_CLOSE_AFTER_REPLY;
@@ -1121,6 +1123,7 @@ void syncCommand(client *c) {
     c->repldbfd = -1;
     c->flags |= CLIENT_SLAVE;
     listAddNodeTail(g_pserver->slaves,c);
+    g_pserver->rgthreadvar[c->iel].cclientsReplica++;
 
     /* Create the replication backlog if needed. */
     if (listLength(g_pserver->slaves) == 1 && g_pserver->repl_backlog == NULL) {
diff --git a/src/server.h b/src/server.h
index 45f44df8f..1b2c02f04 100644
--- a/src/server.h
+++ b/src/server.h
@@ -2022,6 +2022,7 @@ struct redisServerThreadVars {
     list *unblocked_clients; /* list of clients to unblock before next loop NOT THREADSAFE */
     list *clients_pending_asyncwrite;
     int cclients;
+    int cclientsReplica = 0;
     client *current_client; /* Current client */
     long fixed_time_expire = 0; /* If > 0, expire keys against server.mstime. */
     client *lua_client = nullptr; /* The "fake client" to query Redis from Lua */

From 73215b2eebb38e682caca6eeb1e7d7a1f0114a07 Mon Sep 17 00:00:00 2001
From: John Sully
Date: Fri, 29 Oct 2021 17:59:46 +0000
Subject: [PATCH 93/99] Make the replica weighting configurable

Former-commit-id: be6a8a7e68acb5cfbe950f13b903e6f7b98c5a39
---
 keydb.conf                   | 11 +++++++++++
 src/config.cpp               |  1 +
 src/networking.cpp           |  3 ++-
 src/server.h                 |  2 ++
 tests/unit/introspection.tcl |  5 +++++
 5 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/keydb.conf b/keydb.conf
index 50f9784a1..352f07663 100644
--- a/keydb.conf
+++ b/keydb.conf
@@ -1834,3 +1834,14 @@ server-threads 2
 
 # Enable FLASH support? (Enterprise Only)
 # storage-provider flash /path/to/flash/db
+
+# KeyDB will attempt to balance clients across threads evenly; however, replica clients
+# are usually much more expensive than a normal client, and so KeyDB will try to assign
+# fewer clients to threads with a replica. The weighting factor below is intended to help tune
+# this behavior. A replica weighting factor of 2 means we treat a replica as the equivalent
+# of two normal clients. Adjusting this value may improve performance when replication is
+# used. The best weighting is workload specific - e.g. read heavy workloads should set
+# this to 1. Very write heavy workloads may benefit from higher numbers.
+#
+# By default KeyDB sets this to 2.
+replica-weighting-factor 2 \ No newline at end of file diff --git a/src/config.cpp b/src/config.cpp index 701565e1e..e2028f4cc 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -2803,6 +2803,7 @@ standardConfig configs[] = { createIntConfig("min-clients-per-thread", NULL, MODIFIABLE_CONFIG, 0, 400, cserver.thread_min_client_threshold, 20, INTEGER_CONFIG, NULL, NULL), createIntConfig("storage-flush-period", NULL, MODIFIABLE_CONFIG, 1, 10000, g_pserver->storage_flush_period, 500, INTEGER_CONFIG, NULL, NULL), createIntConfig("replica-quorum", NULL, MODIFIABLE_CONFIG, -1, INT_MAX, g_pserver->repl_quorum, -1, INTEGER_CONFIG, NULL, NULL), + createIntConfig("replica-weighting-factor", NULL, MODIFIABLE_CONFIG, 1, INT_MAX, g_pserver->replicaIsolationFactor, 2, INTEGER_CONFIG, NULL, NULL), /* Unsigned int configs */ createUIntConfig("maxclients", NULL, MODIFIABLE_CONFIG, 1, UINT_MAX, g_pserver->maxclients, 10000, INTEGER_CONFIG, NULL, updateMaxclients), createUIntConfig("loading-process-events-interval-keys", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, g_pserver->loading_process_events_interval_keys, 8192, MEMORY_CONFIG, NULL, NULL), diff --git a/src/networking.cpp b/src/networking.cpp index d1ccf28a9..29c4ac206 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1144,7 +1144,8 @@ int chooseBestThreadForAccept() int cclientsThread; atomicGet(g_pserver->rgthreadvar[iel].cclients, cclientsThread); cclientsThread += rgacceptsInFlight[iel].load(std::memory_order_relaxed); - cclientsThread *= (g_pserver->rgthreadvar[iel].cclientsReplica+1); + // Note: Its repl factor less one because cclients also includes replicas, so we don't want to double count + cclientsThread += (g_pserver->rgthreadvar[iel].cclientsReplica) * (g_pserver->replicaIsolationFactor-1); if (cclientsThread < cserver.thread_min_client_threshold) return iel; if (cclientsThread < cclientsMin) diff --git a/src/server.h b/src/server.h index 1b2c02f04..8997c3e73 100644 --- a/src/server.h +++ b/src/server.h @@ -2216,6 +2216,8 @@ struct redisServer { int active_expire_enabled; /* Can be disabled for testing purposes. 
*/ + int replicaIsolationFactor = 1; + /* Fields used only for stats */ long long stat_numcommands; /* Number of processed commands */ long long stat_numconnections; /* Number of connections received */ diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index 2a0474ed8..b562530f6 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -44,6 +44,11 @@ start_server {tags {"introspection"}} { set e } {ERR*} + test {replica-weighting-factor does not accept values less than 1} { + catch {r config set replica-weighting-factor 0} e + set e + } {ERR*} + test {CLIENT SETNAME can assign a name to this connection} { assert_equal [r client setname myname] {OK} r client list From 755e9788bae8e4bc0fa352f9620d87e470e68099 Mon Sep 17 00:00:00 2001 From: malavan Date: Fri, 29 Oct 2021 15:42:01 +0000 Subject: [PATCH 94/99] don't delete db or snapshots on shutdown, still delete storage provider Former-commit-id: edb840ce10ea77ce654ba27c9eadbf98bbc13403 --- src/db.cpp | 14 ++++++++++++++ src/server.cpp | 10 +--------- src/server.h | 2 ++ 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index f93c2fd45..cc6092831 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -2571,6 +2571,12 @@ void redisDbPersistentData::setStorageProvider(StorageCache *pstorage) m_spstorage = std::unique_ptr(pstorage); } +void redisDbPersistentData::endStorageProvider() +{ + serverAssert(m_spstorage != nullptr); + m_spstorage.reset(); +} + void clusterStorageLoadCallback(const char *rgchkey, size_t cch, void *) { slotToKeyUpdateKeyCore(rgchkey, cch, true /*add*/); @@ -2599,6 +2605,14 @@ void redisDb::storageProviderInitialize() } } +void redisDb::storageProviderDelete() +{ + if (g_pserver->m_pstorageFactory != nullptr) + { + this->endStorageProvider(); + } +} + bool redisDbPersistentData::insert(char *key, robj *o, bool fAssumeNew, dict_iter *piterExisting) { if (!fAssumeNew && (g_pserver->m_pstorageFactory != nullptr || m_pdbSnapshot != nullptr)) diff --git a/src/server.cpp b/src/server.cpp index d6cc589d9..16a9dc378 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -7079,13 +7079,6 @@ void *workerThreadMain(void *parg) serverAssert(!GlobalLocksAcquired()); aeDeleteEventLoop(el); - aeAcquireLock(); - for (int idb = 0; idb < cserver.dbnum; ++idb) { - if (g_pserver->rgthreadvar[iel].rgdbSnapshot[idb] != nullptr) - g_pserver->db[idb]->endSnapshot(g_pserver->rgthreadvar[iel].rgdbSnapshot[idb]); - } - aeReleaseLock(); - return NULL; } @@ -7507,8 +7500,7 @@ int main(int argc, char **argv) { if (!fLockAcquired) g_fInCrash = true; // We don't actually crash right away, because we want to sync any storage providers for (int idb = 0; idb < cserver.dbnum; ++idb) { - delete g_pserver->db[idb]; - g_pserver->db[idb] = nullptr; + g_pserver->db[idb]->storageProviderDelete(); } // If we couldn't acquire the global lock it means something wasn't shutdown and we'll probably deadlock serverAssert(fLockAcquired); diff --git a/src/server.h b/src/server.h index 45f44df8f..890db48b5 100644 --- a/src/server.h +++ b/src/server.h @@ -1158,6 +1158,7 @@ public: bool FRehashing() const { return dictIsRehashing(m_pdict) || dictIsRehashing(m_pdictTombstone); } void setStorageProvider(StorageCache *pstorage); + void endStorageProvider(); void trackChanges(bool fBulk, size_t sizeHint = 0); bool FTrackingChanges() const { return !!m_fTrackingChanges; } @@ -1301,6 +1302,7 @@ struct redisDb : public redisDbPersistentDataSnapshot void initialize(int id); void storageProviderInitialize(); + void 
storageProviderDelete(); virtual ~redisDb(); void dbOverwriteCore(redisDb::iter itr, sds keySds, robj *val, bool fUpdateMvcc, bool fRemoveExpire); From 9bd7f63ab135563fea6f377deafe9f0d1d5b1060 Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 2 Nov 2021 19:31:53 +0000 Subject: [PATCH 95/99] This change fixes out of file descriptor issues with FLASH. The change does 3 things: 1. It limits RocksDB to 256 file descriptors instead of unlimited 2. It includes the fd limit in its estimation for total file descriptors needed 3. It raises the system fd limit if possible before we open rocksdb but accounting for the 256 limit we added Former-commit-id: 1447288209c5e7daf8a1203511fc262500ebe5e1 --- src/IStorage.h | 1 + src/config.cpp | 1 + src/server.cpp | 2 ++ 3 files changed, 4 insertions(+) diff --git a/src/IStorage.h b/src/IStorage.h index d5808c4c1..d1f316022 100644 --- a/src/IStorage.h +++ b/src/IStorage.h @@ -12,6 +12,7 @@ public: virtual const char *name() const = 0; virtual size_t totalDiskspaceUsed() const = 0; virtual bool FSlow() const = 0; + virtual size_t filedsRequired() const { return 0; } }; class IStorage diff --git a/src/config.cpp b/src/config.cpp index e2028f4cc..7ef995bbd 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -357,6 +357,7 @@ bool initializeStorageProvider(const char **err) { // Create The Storage Factory (if necessary) serverLog(LL_NOTICE, "Initializing FLASH storage provider (this may take a long time)"); + adjustOpenFilesLimit(); g_pserver->m_pstorageFactory = CreateRocksDBStorageFactory(g_sdsArgs, cserver.dbnum, cserver.storage_conf, cserver.storage_conf ? strlen(cserver.storage_conf) : 0); } else if (!strcasecmp(g_sdsProvider, "test") && g_sdsArgs == nullptr) diff --git a/src/server.cpp b/src/server.cpp index 6540371d5..76a142efe 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -3476,6 +3476,8 @@ int setOOMScoreAdj(int process_class) { * g_pserver->maxclients to the value that we can actually handle. */ void adjustOpenFilesLimit(void) { rlim_t maxfiles = g_pserver->maxclients+CONFIG_MIN_RESERVED_FDS; + if (g_pserver->m_pstorageFactory) + maxfiles += g_pserver->m_pstorageFactory->filedsRequired(); struct rlimit limit; if (getrlimit(RLIMIT_NOFILE,&limit) == -1) { From 553f49507dd10d2e717365334d249fb3e890c845 Mon Sep 17 00:00:00 2001 From: John Sully Date: Sat, 6 Nov 2021 15:49:58 +0000 Subject: [PATCH 96/99] Smooth out performance fluctuations caused by variance in the rehash calibration Former-commit-id: 09580dedfef09deace7863bf68ba7e0f9edf3eb3 --- src/dict.cpp | 2 +- src/server.cpp | 55 +++++++++++++++++++++++--------------------------- 2 files changed, 26 insertions(+), 31 deletions(-) diff --git a/src/dict.cpp b/src/dict.cpp index 8f8262751..e10567c8d 100644 --- a/src/dict.cpp +++ b/src/dict.cpp @@ -408,7 +408,7 @@ dictAsyncRehashCtl *dictRehashAsyncStart(dict *d, int buckets) { d->asyncdata = new dictAsyncRehashCtl(d, d->asyncdata); - int empty_visits = buckets; + int empty_visits = buckets*10; while (d->asyncdata->queue.size() < (size_t)buckets && (size_t)d->rehashidx < d->ht[0].size) { dictEntry *de; diff --git a/src/server.cpp b/src/server.cpp index 9fb92939f..0dcae55b0 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1705,12 +1705,12 @@ void tryResizeHashTables(int dbid) { * is returned. 
*/ int redisDbPersistentData::incrementallyRehash() { /* Keys dictionary */ - if (dictIsRehashing(m_pdict) || dictIsRehashing(m_pdictTombstone)) { - int result = dictRehashMilliseconds(m_pdict,1); - result += dictRehashMilliseconds(m_pdictTombstone,1); - return result; /* already used our millisecond for this loop... */ - } - return 0; + int result = 0; + if (dictIsRehashing(m_pdict)) + result += dictRehashMilliseconds(m_pdict,1); + if (dictIsRehashing(m_pdictTombstone)) + dictRehashMilliseconds(m_pdictTombstone,1); // don't count this + return result; /* already used our millisecond for this loop... */ } /* This function is called once a background process of some kind terminates, @@ -2112,9 +2112,15 @@ void databasesCron(bool fMainThread) { if (g_pserver->activerehashing) { for (j = 0; j < dbs_per_call; j++) { if (serverTL->rehashCtl != nullptr) { - if (dictRehashSomeAsync(serverTL->rehashCtl, rehashes_per_ms)) { - break; - } + if (!serverTL->rehashCtl->done.load(std::memory_order_relaxed)) { + aeReleaseLock(); + if (dictRehashSomeAsync(serverTL->rehashCtl, rehashes_per_ms)) { + aeAcquireLock(); + break; + } + aeAcquireLock(); + } + dictCompleteRehashAsync(serverTL->rehashCtl, true /*fFree*/); serverTL->rehashCtl = nullptr; } @@ -2124,22 +2130,27 @@ void databasesCron(bool fMainThread) { /* Are we async rehashing? And if so is it time to re-calibrate? */ /* The recalibration limit is a prime number to ensure balancing across threads */ if (rehashes_per_ms > 0 && async_rehashes < 131 && !cserver.active_defrag_enabled && cserver.cthreads > 1 && dictSize(dict) > 2048 && dictIsRehashing(dict) && !g_pserver->loading) { - serverTL->rehashCtl = dictRehashAsyncStart(dict, rehashes_per_ms); - ++async_rehashes; + serverTL->rehashCtl = dictRehashAsyncStart(dict, rehashes_per_ms * ((1000 / g_pserver->hz) / 10)); // Estimate 10% CPU time spent in lock contention + if (serverTL->rehashCtl) + ++async_rehashes; } if (serverTL->rehashCtl) break; - + // Before starting anything new, can we end the rehash of a blocked thread? - if (dict->asyncdata != nullptr) { + while (dict->asyncdata != nullptr) { auto asyncdata = dict->asyncdata; if (asyncdata->done) { dictCompleteRehashAsync(asyncdata, false /*fFree*/); // Don't free because we don't own the pointer serverAssert(dict->asyncdata != asyncdata); - break; // completion can be expensive, don't do anything else + } else { + break; } } + if (dict->asyncdata) + break; + rehashes_per_ms = g_pserver->db[rehash_db]->incrementallyRehash(); async_rehashes = 0; if (rehashes_per_ms > 0) { @@ -2360,14 +2371,6 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { UNUSED(id); UNUSED(clientData); - if (serverTL->rehashCtl != nullptr && !serverTL->rehashCtl->done) { - aeReleaseLock(); - // If there is not enough lock contention we may not have made enough progress on the async - // rehash. Ensure we finish it outside the lock. - dictRehashSomeAsync(serverTL->rehashCtl, serverTL->rehashCtl->queue.size()); - aeAcquireLock(); - } - if (g_pserver->maxmemory && g_pserver->m_pstorageFactory) performEvictions(false); @@ -2657,14 +2660,6 @@ int serverCronLite(struct aeEventLoop *eventLoop, long long id, void *clientData UNUSED(id); UNUSED(clientData); - if (serverTL->rehashCtl != nullptr && !serverTL->rehashCtl->done) { - aeReleaseLock(); - // If there is not enough lock contention we may not have made enough progress on the async - // rehash. Ensure we finish it outside the lock. 
- dictRehashSomeAsync(serverTL->rehashCtl, serverTL->rehashCtl->queue.size()); - aeAcquireLock(); - } - if (g_pserver->maxmemory && g_pserver->m_pstorageFactory) performEvictions(false); From 6eae02919a68890ca6b93593fcfdf591133dfb39 Mon Sep 17 00:00:00 2001 From: John Sully Date: Sat, 13 Nov 2021 18:39:03 +0000 Subject: [PATCH 97/99] Allow load at boot to be cancelled with a shutdown command Former-commit-id: 2897a59e59bed14a67d1d0abcec5cb6a71bbb15b --- src/rdb.cpp | 3 +++ src/server.cpp | 8 +++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/rdb.cpp b/src/rdb.cpp index ee66ef93e..bbd320b6b 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2665,6 +2665,9 @@ public: } ~rdbAsyncWorkThread() { + fExit = true; + while (m_lockPause.fOwnLock()) + m_lockPause.unlock(); if (m_thread.joinable()) endWork(); } diff --git a/src/server.cpp b/src/server.cpp index 0dcae55b0..d9c4b52d6 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -7394,7 +7394,13 @@ int main(int argc, char **argv) { } InitServerLast(); - loadDataFromDisk(); + + try { + loadDataFromDisk(); + } catch (ShutdownException) { + exit(EXIT_SUCCESS); + } + if (g_pserver->cluster_enabled) { if (verifyClusterConfigWithData() == C_ERR) { serverLog(LL_WARNING, From f9d36fb6bc042b2ebbfd541e44645a4577785425 Mon Sep 17 00:00:00 2001 From: John Sully Date: Sat, 13 Nov 2021 19:44:13 +0000 Subject: [PATCH 98/99] Prevent GC related crash on force-exit Former-commit-id: e4d95f83b02786b5a170673709780700def160cb --- src/server.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/server.cpp b/src/server.cpp index d9c4b52d6..4eb0f5a75 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -6572,6 +6572,7 @@ static void sigShutdownHandler(int sig) { if (g_pserver->shutdown_asap && sig == SIGINT) { serverLogFromHandler(LL_WARNING, "You insist... exiting now."); rdbRemoveTempFile(g_pserver->rdbThreadVars.tmpfileNum, 1); + g_pserver->garbageCollector.shutdown(); exit(1); /* Exit with an error since this was not a clean shutdown. */ } else if (g_pserver->loading) { serverLogFromHandler(LL_WARNING, "Received shutdown signal during loading, exiting now."); From 8bff491912aad778b1795e0ab3c4cb0816e2e53f Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 26 Nov 2021 03:05:23 +0000 Subject: [PATCH 99/99] Implement the disk backed backlog functionality Former-commit-id: 759cc01c6ba05f9a865c11580cc4975b5f1bd1d6 --- src/config.cpp | 6 +++- src/replication.cpp | 61 +++++++++++++++++++++++++++++++++--- src/server.cpp | 3 ++ src/server.h | 2 ++ tests/unit/introspection.tcl | 1 + 5 files changed, 68 insertions(+), 5 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index 7ef995bbd..53d45cabb 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -2469,7 +2469,10 @@ static int updateJemallocBgThread(int val, int prev, const char **err) { static int updateReplBacklogSize(long long val, long long prev, const char **err) { /* resizeReplicationBacklog sets g_pserver->repl_backlog_size, and relies on * being able to tell when the size changes, so restore prev before calling it. 
*/ - UNUSED(err); + if (cserver.repl_backlog_disk_size) { + *err = "Unable to dynamically resize the backlog because disk backlog is enabled"; + return 0; + } g_pserver->repl_backlog_size = prev; g_pserver->repl_backlog_config_size = val; resizeReplicationBacklog(val); @@ -2822,6 +2825,7 @@ standardConfig configs[] = { createLongLongConfig("proto-max-bulk-len", NULL, MODIFIABLE_CONFIG, 1024*1024, LLONG_MAX, g_pserver->proto_max_bulk_len, 512ll*1024*1024, MEMORY_CONFIG, NULL, NULL), /* Bulk request max size */ createLongLongConfig("stream-node-max-entries", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, g_pserver->stream_node_max_entries, 100, INTEGER_CONFIG, NULL, NULL), createLongLongConfig("repl-backlog-size", NULL, MODIFIABLE_CONFIG, 1, LLONG_MAX, g_pserver->repl_backlog_size, 1024*1024, MEMORY_CONFIG, NULL, updateReplBacklogSize), /* Default: 1mb */ + createLongLongConfig("repl-backlog-disk-reserve", NULL, IMMUTABLE_CONFIG, 0, LLONG_MAX, cserver.repl_backlog_disk_size, 0, MEMORY_CONFIG, NULL, NULL), /* Unsigned Long Long configs */ createULongLongConfig("maxmemory", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, g_pserver->maxmemory, 0, MEMORY_CONFIG, NULL, updateMaxmemory), diff --git a/src/replication.cpp b/src/replication.cpp index 91036723a..1f2c9043a 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -46,6 +46,7 @@ #include #include #include +#include void replicationDiscardCachedMaster(redisMaster *mi); void replicationResurrectCachedMaster(redisMaster *mi, connection *conn); @@ -184,8 +185,39 @@ int bg_unlink(const char *filename) { /* ---------------------------------- MASTER -------------------------------- */ +bool createDiskBacklog() { + // Lets create some disk backed pages and add them here + std::string path = "./repl-backlog-temp" + std::to_string(gettid()); + int fd = open(path.c_str(), O_CREAT | O_RDWR | O_LARGEFILE, S_IRUSR | S_IWUSR); + if (fd < 0) { + return false; + } + size_t alloc = cserver.repl_backlog_disk_size; + int result = truncate(path.c_str(), alloc); + unlink(path.c_str()); // ensure the fd is the only ref + if (result == -1) { + close (fd); + return false; + } + + g_pserver->repl_backlog_disk = (char*)mmap(nullptr, alloc, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + if (g_pserver->repl_backlog_disk == MAP_FAILED) { + g_pserver->repl_backlog_disk = nullptr; + return false; + } + + serverLog(LL_VERBOSE, "Disk Backed Replication Allocated"); + return true; +} + void createReplicationBacklog(void) { serverAssert(g_pserver->repl_backlog == NULL); + if (cserver.repl_backlog_disk_size) { + if (!createDiskBacklog()) { + serverLog(LL_WARNING, "Failed to create disk backlog, will use memory only"); + } + } g_pserver->repl_backlog = (char*)zmalloc(g_pserver->repl_backlog_size, MALLOC_LOCAL); g_pserver->repl_backlog_histlen = 0; g_pserver->repl_backlog_idx = 0; @@ -234,9 +266,22 @@ void resizeReplicationBacklog(long long newsize) { long long earliest_off = g_pserver->repl_lowest_off.load(); if (earliest_off != -1) { - // We need to keep critical data so we can't shrink less than the hot data in the buffer + char *backlog = nullptr; newsize = std::max(newsize, g_pserver->master_repl_offset - earliest_off); - char *backlog = (char*)zmalloc(newsize); + + if (cserver.repl_backlog_disk_size != 0) { + if (newsize > g_pserver->repl_backlog_config_size) { + if (g_pserver->repl_backlog == g_pserver->repl_backlog_disk) + return; // Can't do anything more + serverLog(LL_NOTICE, "Switching to disk backed replication backlog due to exceeding memory limits"); + backlog 
= g_pserver->repl_backlog_disk; + newsize = cserver.repl_backlog_disk_size; + } + } + + // We need to keep critical data so we can't shrink less than the hot data in the buffer + if (backlog == nullptr) + backlog = (char*)zmalloc(newsize); g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - earliest_off; long long earliest_idx = getReplIndexFromOffset(earliest_off); @@ -251,7 +296,10 @@ void resizeReplicationBacklog(long long newsize) { auto cbActiveBacklog = cbPhase1 + g_pserver->repl_backlog_idx; serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); } - zfree(g_pserver->repl_backlog); + if (g_pserver->repl_backlog != g_pserver->repl_backlog_disk) + zfree(g_pserver->repl_backlog); + else + serverLog(LL_NOTICE, "Returning to memory backed replication backlog"); g_pserver->repl_backlog = backlog; g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; if (g_pserver->repl_batch_idxStart >= 0) { @@ -261,7 +309,10 @@ void resizeReplicationBacklog(long long newsize) { } g_pserver->repl_backlog_start = earliest_off; } else { - zfree(g_pserver->repl_backlog); + if (g_pserver->repl_backlog != g_pserver->repl_backlog_disk) + zfree(g_pserver->repl_backlog); + else + serverLog(LL_NOTICE, "Returning to memory backed replication backlog"); g_pserver->repl_backlog = (char*)zmalloc(newsize); g_pserver->repl_backlog_histlen = 0; g_pserver->repl_backlog_idx = 0; @@ -311,6 +362,8 @@ void feedReplicationBacklog(const void *ptr, size_t len) { long long maxClientBuffer = (long long)cserver.client_obuf_limits[CLIENT_TYPE_SLAVE].hard_limit_bytes; if (maxClientBuffer <= 0) maxClientBuffer = LLONG_MAX; // infinite essentially + if (cserver.repl_backlog_disk_size) + maxClientBuffer = std::max(g_pserver->repl_backlog_size, cserver.repl_backlog_disk_size); long long min_offset = LLONG_MAX; int listening_replicas = 0; while ((ln = listNext(&li))) { diff --git a/src/server.cpp b/src/server.cpp index 4eb0f5a75..6827ad675 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -4005,6 +4005,9 @@ void InitServerLast() { g_pserver->initial_memory_usage = zmalloc_used_memory(); g_pserver->asyncworkqueue = new (MALLOC_LOCAL) AsyncWorkQueue(cserver.cthreads); + + // Allocate the repl backlog + } /* Parse the flags string description 'strflags' and set them to the diff --git a/src/server.h b/src/server.h index 5a067090b..a93d4008d 100644 --- a/src/server.h +++ b/src/server.h @@ -2142,6 +2142,7 @@ struct redisServerConst { char *storage_conf = nullptr; int fForkBgSave = false; int time_thread_priority = false; + long long repl_backlog_disk_size = 0; }; struct redisServer { @@ -2381,6 +2382,7 @@ struct redisServer { int replicaseldb; /* Last SELECTed DB in replication output */ int repl_ping_slave_period; /* Master pings the replica every N seconds */ char *repl_backlog; /* Replication backlog for partial syncs */ + char *repl_backlog_disk = nullptr; long long repl_backlog_size; /* Backlog circular buffer size */ long long repl_backlog_config_size; /* The repl backlog may grow but we want to know what the user set it to */ long long repl_backlog_histlen; /* Backlog actual data length */ diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index b562530f6..45dbd9838 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -123,6 +123,7 @@ start_server {tags {"introspection"}} { active-replica bind set-proc-title + repl-backlog-disk-reserve } if {!$::tls} {
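
The createDiskBacklog() routine added in PATCH 99 relies on a standard POSIX idiom: create a scratch file, size it to the reserved backlog length, unlink it so the descriptor (and later the mapping) is the only reference to it, then mmap it MAP_SHARED so the backlog pages are file-backed rather than held as anonymous memory. The sketch below is a minimal standalone illustration of that idiom only, not KeyDB code; the function name is hypothetical, and it uses ftruncate() on the open descriptor where the patch uses truncate() on the path, which is equivalent at this point.

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <cstddef>
#include <string>

// Allocate cb bytes whose pages are backed by an unlinked scratch file on disk.
char *allocateDiskBackedBuffer(size_t cb) {
    std::string path = "./disk-backed-buffer-" + std::to_string(::getpid());
    int fd = ::open(path.c_str(), O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
    if (fd < 0)
        return nullptr;
    ::unlink(path.c_str());                  // the fd is now the only reference to the file
    if (::ftruncate(fd, (off_t)cb) == -1) {  // reserve the full size up front
        ::close(fd);
        return nullptr;
    }
    void *p = ::mmap(nullptr, cb, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    ::close(fd);                             // the mapping keeps the backing storage alive
    return (p == MAP_FAILED) ? nullptr : (char*)p;
}

Release with munmap(p, cb); since the name was already unlinked, the backing file's space is reclaimed as soon as the mapping goes away. Because the mapping is MAP_SHARED and file-backed, the kernel can write cold backlog pages out to disk instead of pinning them in RAM, which is what lets the region reserved by repl-backlog-disk-reserve exceed the in-memory backlog budget.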