Fix issue where SCAN misses elements while snapshot is in flight

Former-commit-id: ce005d748ebf0e116d674a96f74d698d17394010
John Sully 2020-07-10 01:43:51 +00:00
parent 1fef6c42b7
commit 3692771457
3 changed files with 45 additions and 40 deletions
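For context, the user-visible contract this change protects is the standard SCAN cursor loop: a key that exists for the entire iteration must be returned at least once, even while the server is answering from an in-flight snapshot. Below is a minimal client-side sketch of that loop; it assumes hiredis and a local KeyDB/Redis instance, which are illustrative assumptions and not part of this commit.

```cpp
// Sketch only: drive a full SCAN iteration from a client. Assumes hiredis and
// a KeyDB/Redis instance on 127.0.0.1:6379 (both are assumptions for the
// example, not part of the commit).
#include <hiredis/hiredis.h>
#include <cstdio>
#include <string>

int main() {
    redisContext *ctx = redisConnect("127.0.0.1", 6379);
    if (ctx == nullptr || ctx->err) return 1;

    std::string cursor = "0";
    do {
        redisReply *reply = (redisReply *)redisCommand(ctx, "SCAN %s COUNT 100", cursor.c_str());
        if (reply == nullptr) break;
        if (reply->type != REDIS_REPLY_ARRAY || reply->elements != 2) { freeReplyObject(reply); break; }

        cursor = reply->element[0]->str;            // cursor to feed back into the next call
        redisReply *keys = reply->element[1];       // this batch of keys
        for (size_t i = 0; i < keys->elements; ++i)
            printf("%s\n", keys->element[i]->str);
        freeReplyObject(reply);
    } while (cursor != "0");                        // "0" means the iteration is complete

    redisFree(ctx);
    return 0;
}
```

Note that after this commit the async/snapshot path also triggers at COUNT 100 exactly, since the check below changed from `count > 100` to `count >= 100`.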

View File

@@ -1032,7 +1032,7 @@ void scanGenericCommand(client *c, robj_roptr o, unsigned long cursor) {
         }
     }
-    if (o == nullptr && count > 100)
+    if (o == nullptr && count >= 100)
     {
         // Do an async version
         const redisDbPersistentDataSnapshot *snapshot = nullptr;
@@ -1046,7 +1046,7 @@ void scanGenericCommand(client *c, robj_roptr o, unsigned long cursor) {
             sds patCopy = pat ? sdsdup(pat) : nullptr;
             sds typeCopy = type ? sdsdup(type) : nullptr;
             g_pserver->asyncworkqueue->AddWorkFunction([c, snapshot, cursor, count, keys, el, db, patCopy, typeCopy, use_pattern]{
-                auto cursorResult = snapshot->scan_threadsafe(cursor, count, keys, nullptr);
+                auto cursorResult = snapshot->scan_threadsafe(cursor, count, keys);
                 if (use_pattern) {
                     listNode *ln = listFirst(keys);
                     int patlen = sdslen(patCopy);
@@ -1109,23 +1109,30 @@ void scanGenericCommand(client *c, robj_roptr o, unsigned long cursor) {
     }
     if (ht) {
-        void *privdata[2];
-        /* We set the max number of iterations to ten times the specified
-         * COUNT, so if the hash table is in a pathological state (very
-         * sparsely populated) we avoid to block too much time at the cost
-         * of returning no or very few elements. */
-        long maxiterations = count*10;
-        /* We pass two pointers to the callback: the list to which it will
-         * add new elements, and the object containing the dictionary so that
-         * it is possible to fetch more data in a type-dependent way. */
-        privdata[0] = keys;
-        privdata[1] = o.unsafe_robjcast();
-        do {
-            cursor = dictScan(ht, cursor, scanCallback, NULL, privdata);
-        } while (cursor &&
-                maxiterations-- &&
-                listLength(keys) < (unsigned long)count);
+        if (ht == c->db->dictUnsafeKeyOnly())
+        {
+            cursor = c->db->scan_threadsafe(cursor, count, keys);
+        }
+        else
+        {
+            void *privdata[2];
+            /* We set the max number of iterations to ten times the specified
+             * COUNT, so if the hash table is in a pathological state (very
+             * sparsely populated) we avoid to block too much time at the cost
+             * of returning no or very few elements. */
+            long maxiterations = count*10;
+            /* We pass two pointers to the callback: the list to which it will
+             * add new elements, and the object containing the dictionary so that
+             * it is possible to fetch more data in a type-dependent way. */
+            privdata[0] = keys;
+            privdata[1] = o.unsafe_robjcast();
+            do {
+                cursor = dictScan(ht, cursor, scanCallback, NULL, privdata);
+            } while (cursor &&
+                maxiterations-- &&
+                listLength(keys) < (unsigned long)count);
+        }
     } else if (o->type == OBJ_SET) {
         int pos = 0;
         int64_t ll;
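One detail worth noting in the async branch earlier in this file: the MATCH pattern and TYPE filter are duplicated with sdsdup before the lambda is queued, so the worker thread never dereferences client-owned memory that may be freed once the command frame returns. A simplified sketch of that copy-before-dispatch pattern follows; std::string and std::thread stand in for sds and the async work queue, and all names below are invented for illustration.

```cpp
// Sketch: copy client-owned data on the calling thread before handing work to
// another thread. std::string/std::thread stand in for sds and
// g_pserver->asyncworkqueue; FakeClient and the match rule are invented for
// the example.
#include <cstdio>
#include <string>
#include <thread>
#include <vector>

struct FakeClient {
    std::string pattern;   // stand-in for the client's sds MATCH pattern
};

void queueScanWork(const FakeClient &client, std::vector<std::string> keys) {
    std::string patCopy = client.pattern;   // copy taken while client data is still valid

    std::thread worker([patCopy, keys]() {  // the lambda owns its copies
        for (const auto &k : keys)
            if (k.find(patCopy) != std::string::npos)   // crude stand-in for the real pattern match
                printf("matched: %s\n", k.c_str());
    });
    worker.join();   // a real work queue would run this asynchronously and reply later
}

int main() {
    FakeClient c{"user:"};
    queueScanWork(c, {"user:1", "session:9", "user:2"});
    return 0;
}
```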

View File

@@ -1395,7 +1395,7 @@ public:
     bool FWillFreeChildDebug() const { return m_spdbSnapshotHOLDER != nullptr; }
     bool iterate_threadsafe(std::function<bool(const char*, robj_roptr o)> fn, bool fKeyOnly = false, bool fCacheOnly = false) const;
-    unsigned long scan_threadsafe(unsigned long iterator, long count, list *keys, struct scan_callback_data *pdata = nullptr) const;
+    unsigned long scan_threadsafe(unsigned long iterator, long count, list *keys) const;
     using redisDbPersistentData::createSnapshot;
     using redisDbPersistentData::endSnapshot;
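Dropping the defaulted pdata parameter means callers can no longer inject scan state: scan_threadsafe now builds a fresh scan_callback_data at every level (see the snapshot.cpp change below), so each nesting level points the callback at its own tombstone dict instead of whatever the outermost caller passed down. A structural sketch of that difference, using simplified stand-in types; the filtering rule shown is illustrative only, the real handling lives in snapshot_scan_callback.

```cpp
// Sketch: per-level scan state, as enforced by the simplified signature above.
// ToyLevel, the std::set tombstones, and the skip rule are stand-ins invented
// for illustration, not KeyDB's dict-based structures.
#include <cstdio>
#include <set>
#include <string>
#include <vector>

struct ToyLevel {
    std::vector<std::string> keys;      // keys stored at this level
    std::set<std::string> tombstones;   // this level's own tombstone table
    const ToyLevel *older = nullptr;    // nested (older) snapshot, if any
};

// New-style scan: no state parameter, so every level builds its own state,
// mirroring "scan_callback_data data; data.dictTombstone = m_pdictTombstone;".
void toyScan(const ToyLevel *lvl, std::vector<std::string> &out) {
    if (lvl == nullptr) return;
    toyScan(lvl->older, out);                    // older level first
    for (const auto &k : lvl->keys)
        if (lvl->tombstones.count(k) == 0)       // consult *this* level's table
            out.push_back(k);
}

int main() {
    ToyLevel older{{"a", "b"}, {}, nullptr};
    ToyLevel newer{{"c"}, {"stale"}, &older};
    std::vector<std::string> out;
    toyScan(&newer, out);
    for (const auto &k : out) printf("%s\n", k.c_str());   // a b c
    return 0;
}
```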

View File

@@ -20,6 +20,7 @@ const redisDbPersistentDataSnapshot *redisDbPersistentData::createSnapshot(uint6
     if (m_spdbSnapshotHOLDER != nullptr)
     {
+        serverLog(LL_DEBUG, "Attempting reuse of snapshot, client tstamp: %llu snapshot tstamp: %llu", mvccCheckpoint, m_spdbSnapshotHOLDER->m_mvccCheckpoint);
         // If possible reuse an existing snapshot (we want to minimize nesting)
         if (mvccCheckpoint <= m_spdbSnapshotHOLDER->m_mvccCheckpoint)
         {
@@ -59,8 +60,11 @@ const redisDbPersistentDataSnapshot *redisDbPersistentData::createSnapshot(uint6
     }
     // See if we have too many levels and can bail out of this to reduce load
-    if (fOptional && (levels >= 4))
+    if (fOptional && (levels >= 6))
+    {
+        serverLog(LL_DEBUG, "Snapshot nesting too deep, abandoning");
         return nullptr;
+    }
     auto spdb = std::unique_ptr<redisDbPersistentDataSnapshot>(new (MALLOC_LOCAL) redisDbPersistentDataSnapshot());
@@ -402,44 +406,38 @@ void snapshot_scan_callback(void *privdata, const dictEntry *de)
     sds sdskey = (sds)dictGetKey(de);
     listAddNodeHead(data->keys, createStringObject(sdskey, sdslen(sdskey)));
 }
-unsigned long redisDbPersistentDataSnapshot::scan_threadsafe(unsigned long iterator, long count, list *keys, scan_callback_data *pdata) const
+unsigned long redisDbPersistentDataSnapshot::scan_threadsafe(unsigned long iterator, long count, list *keys) const
 {
     unsigned long iteratorReturn = 0;
-    scan_callback_data dataT;
-    if (pdata == nullptr)
-    {
-        dataT.dictTombstone = m_pdictTombstone;
-        dataT.keys = keys;
-        pdata = &dataT;
-    }
+    scan_callback_data data;
+    data.dictTombstone = m_pdictTombstone;
+    data.keys = keys;
     const redisDbPersistentDataSnapshot *psnapshot;
     __atomic_load(&m_pdbSnapshot, &psnapshot, __ATOMIC_ACQUIRE);
     if (psnapshot != nullptr)
     {
         // Always process the snapshot first as we assume its bigger than we are
-        iteratorReturn = psnapshot->scan_threadsafe(iterator, count, keys, pdata);
-    }
-    if (psnapshot == nullptr)
+        iteratorReturn = psnapshot->scan_threadsafe(iterator, count, keys);
+        // Just catch up with our snapshot
+        do
+        {
+            iterator = dictScan(m_pdict, iterator, snapshot_scan_callback, nullptr, &data);
+        } while (iterator != 0 && (iterator < iteratorReturn || iteratorReturn == 0));
+    }
+    else
     {
         long maxiterations = count * 10; // allow more iterations than keys for sparse tables
         iteratorReturn = iterator;
         do {
-            iteratorReturn = dictScan(m_pdict, iteratorReturn, snapshot_scan_callback, NULL, pdata);
+            iteratorReturn = dictScan(m_pdict, iteratorReturn, snapshot_scan_callback, NULL, &data);
         } while (iteratorReturn &&
                 maxiterations-- &&
                 listLength(keys) < (unsigned long)count);
     }
-    else
-    {
-        // Just catch up with our snapshot
-        do
-        {
-            iterator = dictScan(m_pdict, iterator, snapshot_scan_callback, nullptr, pdata);
-        } while (iterator != 0 && (iterator < iteratorReturn || iteratorReturn == 0));
-    }
     return iteratorReturn;
 }
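The restructuring above is the heart of the fix: when a nested (older, assumed larger) snapshot exists, it is scanned first, and then this level's own dict is advanced from the caller's cursor until it catches up with the cursor the snapshot returned, so a single cursor can be handed back without skipping anything in between. Below is a toy model of that control flow; sequential bucket cursors stand in for dictScan's reverse-binary cursors, and all types and the bucket layout are invented for illustration.

```cpp
// Toy model of the restructured loop: scan the nested (older) snapshot first,
// then catch this level's own table up to the cursor the snapshot returned.
// The bucket-array "dict" and sequential cursors are stand-ins for dictScan's
// real reverse-binary cursors; this is a sketch of the control flow only.
#include <cstdio>
#include <string>
#include <vector>

using Buckets = std::vector<std::vector<std::string>>;

// Stand-in for dictScan: emit one bucket, return the next cursor (0 = wrapped).
unsigned long toyDictScan(const Buckets &d, unsigned long cursor, std::vector<std::string> &out) {
    if (cursor >= d.size()) return 0;
    for (const auto &k : d[cursor]) out.push_back(k);
    unsigned long next = cursor + 1;
    return next == d.size() ? 0 : next;
}

struct ToySnapshot {
    Buckets buckets;
    const ToySnapshot *older = nullptr;

    unsigned long scan(unsigned long cursor, std::vector<std::string> &out) const {
        unsigned long cursorReturn = 0;
        if (older != nullptr) {
            // Process the older snapshot first, as the real code assumes it is bigger.
            cursorReturn = older->scan(cursor, out);
            // Catch our own table up to the cursor the snapshot handed back, mirroring:
            // while (iterator != 0 && (iterator < iteratorReturn || iteratorReturn == 0))
            unsigned long it = cursor;
            do {
                it = toyDictScan(buckets, it, out);
            } while (it != 0 && (it < cursorReturn || cursorReturn == 0));
        } else {
            cursorReturn = toyDictScan(buckets, cursor, out);
        }
        return cursorReturn;
    }
};

int main() {
    ToySnapshot older{{{"a"}, {"b"}, {"c"}, {"d"}}};
    ToySnapshot newer{{{"x"}, {"y"}, {"z"}, {"w"}}, &older};
    std::vector<std::string> out;
    unsigned long cur = 0;
    do {
        cur = newer.scan(cur, out);   // one SCAN step per loop iteration
    } while (cur != 0);
    for (const auto &k : out) printf("%s\n", k.c_str());
    return 0;
}
```

Running the toy yields every key from both levels exactly once across the whole iteration, which is the property the commit title refers to.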