futriix/src/semiorderedset.h
John Sully 1610f4211f Start off storage cache with a larger size
Former-commit-id: 5f6fb970a81cc73586ba595b35564e7865e7262d
2020-08-09 00:57:56 +00:00

395 lines
12 KiB
C++

#pragma once
#include <assert.h>
#include "compactvector.h"
#include "cowptr.h"
/****************************************
* semiorderedset.h:
*
* The semi-ordered set is a hash set that maintains a partial ordering: you can iterate over the set in sub-linear time while comparing against a value.
* It has a few other useful properties compared to a traditional set:
*   1. The key need not be the underlying type; the only requirement is that the value type is castable to the key.
*   2. The key need not have a total ordering.  The set will iterate until it finds an exact match with operator== on the value.
*      This provides additional flexibility on insert, allowing us to optimize that case.
*
*/
extern uint64_t dictGenHashFunction(const void *key, int len);
namespace keydbutils
{
    // Generic hash: feeds the raw byte representation of the key to the
    // dictionary hash function.  Only suitable for trivially-comparable
    // key types (padding bytes would perturb the hash otherwise).
    template<typename T>
    size_t hash(const T& key)
    {
        return static_cast<size_t>(dictGenHashFunction(&key, sizeof(key)));
    }

    // sds string views must hash their character data rather than the view
    // object itself; the definition lives out-of-line.
    template<>
    size_t hash(const sdsview &);
}
template<typename T, typename T_KEY = T, bool MEMMOVE_SAFE = false>
class semiorderedset
{
typedef compactvector<T, MEMMOVE_SAFE> vector_type;
friend struct setiter;
std::vector<CowPtr<vector_type>> m_data;
size_t celem = 0;
static const size_t bits_min = 8;
size_t bits = bits_min;
size_t idxRehash = (1ULL << bits_min);
int cfPauseRehash = 0;
constexpr size_t targetElementsPerBucket()
{
// Aim for roughly 4 cache lines per bucket (determined by imperical testing)
// lower values are faster but use more memory
return std::max((64/sizeof(T))*8, (size_t)2);
}
public:
semiorderedset(size_t bitsStart = 0)
{
if (bitsStart < bits_min)
bitsStart = bits_min;
bits = bitsStart;
m_data.resize((1ULL << bits));
}
struct setiter
{
semiorderedset *set;
size_t idxPrimary = 0;
size_t idxSecondary = 0;
setiter(const semiorderedset *set)
{
this->set = (semiorderedset*)set;
}
bool operator==(const setiter &other) const
{
return (idxPrimary == other.idxPrimary) && (idxSecondary == other.idxSecondary);
}
bool operator!=(const setiter &other) const { return !operator==(other); }
inline T &operator*() { return set->m_data[idxPrimary]->operator[](idxSecondary); }
inline const T &operator*() const { return set->m_data[idxPrimary]->operator[](idxSecondary); }
inline T *operator->() { return &set->m_data[idxPrimary]->operator[](idxSecondary); }
inline const T *operator->() const { return &set->m_data[idxPrimary]->operator[](idxSecondary); }
};
setiter find(const T_KEY &key)
{
RehashStep();
return const_cast<const semiorderedset*>(this)->find(key);
}
setiter find(const T_KEY &key) const
{
setiter itr(this);
itr.idxPrimary = idxFromObj(key);
for (int hashset = 0; hashset < 2; ++hashset) // rehashing may only be 1 resize behind, so we check up to two slots
{
if (m_data[itr.idxPrimary] != nullptr)
{
const auto &vecBucket = *m_data[itr.idxPrimary];
auto itrFind = std::find(vecBucket.begin(), vecBucket.end(), key);
if (itrFind != vecBucket.end())
{
itr.idxSecondary = itrFind - vecBucket.begin();
return itr;
}
}
// See if we have to check the older slot
size_t mask = (hashmask() >> 1);
if (itr.idxPrimary == (itr.idxPrimary & mask))
break; // same bucket we just checked
itr.idxPrimary &= mask;
if (FRehashedRow(itr.idxPrimary))
break;
}
return end();
}
bool exists(const T_KEY &key) const
{
auto itr = const_cast<semiorderedset<T,T_KEY,MEMMOVE_SAFE>*>(this)->find(key);
return itr != this->end();
}
setiter end() const
{
setiter itr(const_cast<semiorderedset<T,T_KEY,MEMMOVE_SAFE>*>(this));
itr.idxPrimary = m_data.size();
return itr;
}
void insert(const T &e, bool fRehash = false)
{
if (!fRehash)
RehashStep();
auto idx = idxFromObj(static_cast<T_KEY>(e));
if (!fRehash)
++celem;
if (m_data[idx] == nullptr)
m_data[idx] = std::make_shared<vector_type>();
typename vector_type::iterator itrInsert;
if (!m_data[idx]->empty() && !(e < m_data[idx]->back()))
itrInsert = m_data[idx]->end();
else
itrInsert = std::upper_bound(m_data[idx]->begin(), m_data[idx]->end(), e);
itrInsert = m_data[idx]->insert(itrInsert, e);
if (celem > ((1ULL << bits)*targetElementsPerBucket()))
grow();
}
// enumeration starting from the 'itrStart'th key. Note that the iter is a hint, and need no be valid anymore
template<typename T_VISITOR, typename T_MAX>
setiter enumerate(const setiter &itrStart, const T_MAX &max, T_VISITOR fn, long long *pccheck)
{
setiter itr(itrStart);
if (itrStart.set != this) // really if this case isn't true its probably a bug
itr.set = this; // but why crash the program when we can easily fix this?
cfPauseRehash++;
if (itr.idxPrimary >= m_data.size())
itr.idxPrimary = 0;
for (size_t ibucket = 0; ibucket < m_data.size(); ++ibucket)
{
if (!enumerate_bucket(itr, max, fn, pccheck))
break;
itr.idxSecondary = 0;
++itr.idxPrimary;
if (itr.idxPrimary >= m_data.size())
itr.idxPrimary = 0;
}
cfPauseRehash--;
return itr;
}
// This will "randomly" visit nodes biased towards lower values first
template<typename T_VISITOR>
size_t random_visit(T_VISITOR &fn)
{
bool fSawAny = true;
size_t visited = 0;
size_t basePrimary = rand() % m_data.size();
for (size_t idxSecondary = 0; fSawAny; ++idxSecondary)
{
fSawAny = false;
for (size_t idxPrimaryCount = 0; idxPrimaryCount < m_data.size(); ++idxPrimaryCount)
{
size_t idxPrimary = (basePrimary + idxPrimaryCount) % m_data.size();
if (m_data[idxPrimary] != nullptr && idxSecondary < m_data[idxPrimary]->size())
{
++visited;
fSawAny = true;
if (!fn(m_data[idxPrimary]->operator[](idxSecondary)))
return visited;
}
}
}
return visited;
}
const T& random_value() const
{
assert(!empty());
for (;;)
{
size_t idxPrimary = rand() % m_data.size();
if (m_data[idxPrimary] == nullptr || m_data[idxPrimary]->empty())
continue;
return (*m_data[idxPrimary])[rand() % m_data[idxPrimary]->size()];
}
}
void erase(const setiter &itr)
{
auto &vecRow = *m_data[itr.idxPrimary];
vecRow.erase(vecRow.begin() + itr.idxSecondary);
--celem;
RehashStep();
}
void clear()
{
m_data = decltype(m_data)();
bits = bits_min;
m_data.resize(1ULL << bits);
celem = 0;
idxRehash = m_data.size();
}
bool empty() const noexcept { return celem == 0; }
size_t size() const noexcept { return celem; }
size_t bytes_used() const
{
size_t cb = sizeof(this) + (m_data.capacity()-m_data.size())*sizeof(T);
for (auto &vec : m_data)
{
if (vec != nullptr)
cb += vec->bytes_used();
}
return cb;
}
#define DICT_STATS_VECTLEN 50
size_t getstats(char *buf, size_t bufsize) const
{
unsigned long i, slots = 0, chainlen, maxchainlen = 0;
unsigned long totchainlen = 0;
unsigned long clvector[DICT_STATS_VECTLEN] = {0};
size_t l = 0;
if (empty()) {
return snprintf(buf,bufsize,
"No stats available for empty dictionaries\n");
}
/* Compute stats. */
for (const auto &spvec : m_data) {
if (spvec == nullptr)
continue;
const auto &vec = *spvec;
if (vec.empty()) {
clvector[0]++;
continue;
}
slots++;
/* For each hash entry on this slot... */
chainlen = vec.size();
clvector[(chainlen < DICT_STATS_VECTLEN) ? chainlen : (DICT_STATS_VECTLEN-1)]++;
if (chainlen > maxchainlen) maxchainlen = chainlen;
totchainlen += chainlen;
}
size_t used = m_data.size()-clvector[0];
/* Generate human readable stats. */
l += snprintf(buf+l,bufsize-l,
"semiordered set stats:\n"
" table size: %ld\n"
" number of slots: %ld\n"
" used slots: %ld\n"
" max chain length: %ld\n"
" avg chain length (counted): %.02f\n"
" avg chain length (computed): %.02f\n"
" Chain length distribution:\n",
size(), used, slots, maxchainlen,
(float)totchainlen/slots, (float)size()/m_data.size());
for (i = 0; i < DICT_STATS_VECTLEN; i++) {
if (clvector[i] == 0) continue;
if (l >= bufsize) break;
l += snprintf(buf+l,bufsize-l,
" %s%ld: %ld (%.02f%%)\n",
(i == DICT_STATS_VECTLEN-1)?">= ":"",
i, clvector[i], ((float)clvector[i]/m_data.size())*100);
}
/* Unlike snprintf(), teturn the number of characters actually written. */
if (bufsize) buf[bufsize-1] = '\0';
return strlen(buf);
}
void pause_rehash() { ++cfPauseRehash; }
void unpause_rehash() { --cfPauseRehash; RehashStep(); }
private:
inline size_t hashmask() const { return (1ULL << bits) - 1; }
size_t idxFromObj(const T_KEY &key) const
{
size_t v = keydbutils::hash(key);
return v & hashmask();
}
bool FRehashedRow(size_t idx) const
{
return (idx >= (m_data.size()/2)) || (idx < idxRehash);
}
void RehashStep()
{
if (cfPauseRehash)
return;
int steps = 0;
for (; idxRehash < (m_data.size()/2); ++idxRehash)
{
if (m_data[idxRehash] == nullptr)
continue;
CowPtr<vector_type> spvecT;
std::swap(m_data[idxRehash], spvecT);
for (const auto &v : *spvecT)
insert(v, true);
if (++steps > 1024)
break;
}
}
void grow()
{
assert(idxRehash >= (m_data.size()/2)); // we should have finished rehashing by the time we need to grow again
++bits;
m_data.resize(1ULL << bits);
idxRehash = 0;
RehashStep();
}
template<typename T_VISITOR, typename T_MAX>
inline bool enumerate_bucket(setiter &itr, const T_MAX &max, T_VISITOR &fn, long long *pcheckLimit)
{
if (m_data[itr.idxPrimary] == nullptr)
return true;
auto &vec = *m_data[itr.idxPrimary];
for (; itr.idxSecondary < vec.size(); ++itr.idxSecondary)
{
// Assert we're ordered by T_MAX
assert((itr.idxSecondary+1) >= vec.size()
|| static_cast<T_MAX>(vec[itr.idxSecondary]) <= static_cast<T_MAX>(vec[itr.idxSecondary+1]));
(*pcheckLimit)--;
if (max < static_cast<T_MAX>(*itr))
return *pcheckLimit > 0;
size_t sizeBefore = vec.size();
if (!fn(*itr))
{
itr.idxSecondary++; // we still visited this node
return false;
}
if (vec.size() != sizeBefore)
{
assert(vec.size() == (sizeBefore-1)); // they may only remove the element passed to them
--itr.idxSecondary; // they deleted the element
}
}
vec.shrink_to_fit();
return *pcheckLimit > 0;
}
};