Merge branch 'keydbpro' into keydbpro_collab

Former-commit-id: ecc69952dfd1f145e1aff12bca56a4b4e102d669
This commit is contained in:
John Sully 2021-06-25 06:21:58 +00:00
commit d55bcf23bd
9 changed files with 251 additions and 99 deletions

View File

@ -15,7 +15,7 @@
release_hdr := $(shell sh -c './mkreleasehdr.sh')
uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
OPTIMIZATION?=-O2
OPTIMIZATION?=-O2 -flto
DEPENDENCY_TARGETS=hiredis linenoise lua hdr_histogram rocksdb
NODEPS:=clean distclean
@ -349,9 +349,9 @@ endif
REDIS_SERVER_NAME=keydb-server$(PROG_SUFFIX)
REDIS_SENTINEL_NAME=keydb-sentinel$(PROG_SUFFIX)
REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o t_nhash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o acl.o storage.o rdb-s3.o fastlock.o new.o tracking.o cron.o connection.o tls.o sha256.o motd.o timeout.o setcpuaffinity.o AsyncWorkQueue.o snapshot.o storage/rocksdb.o storage/rocksdbfactory.o storage/teststorageprovider.o keydbutils.o StorageCache.o monotonic.o cli_common.o mt19937-64.o $(ASM_OBJ)
REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o t_nhash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o acl.o storage.o rdb-s3.o fastlock.o new.o tracking.o cron.o connection.o tls.o sha256.o motd_server.o timeout.o setcpuaffinity.o AsyncWorkQueue.o snapshot.o storage/rocksdb.o storage/rocksdbfactory.o storage/teststorageprovider.o keydbutils.o StorageCache.o monotonic.o cli_common.o mt19937-64.o $(ASM_OBJ)
REDIS_CLI_NAME=keydb-cli$(PROG_SUFFIX)
REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o redis-cli-cpphelper.o zmalloc.o release.o anet.o ae.o crcspeed.o crc64.o siphash.o crc16.o storage-lite.o fastlock.o motd.o monotonic.o cli_common.o mt19937-64.o $(ASM_OBJ)
REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o redis-cli-cpphelper.o zmalloc.o release.o anet.o ae.o crcspeed.o crc64.o siphash.o crc16.o storage-lite.o fastlock.o motd_client.o monotonic.o cli_common.o mt19937-64.o $(ASM_OBJ)
REDIS_BENCHMARK_NAME=keydb-benchmark$(PROG_SUFFIX)
REDIS_BENCHMARK_OBJ=ae.o anet.o redis-benchmark.o adlist.o dict.o zmalloc.o release.o crcspeed.o crc64.o siphash.o redis-benchmark.o storage-lite.o fastlock.o new.o monotonic.o cli_common.o mt19937-64.o $(ASM_OBJ)
REDIS_CHECK_RDB_NAME=keydb-check-rdb$(PROG_SUFFIX)
@ -435,6 +435,12 @@ DEP = $(REDIS_SERVER_OBJ:%.o=%.d) $(REDIS_CLI_OBJ:%.o=%.d) $(REDIS_BENCHMARK_OBJ
# Because the jemalloc.h header is generated as a part of the jemalloc build,
# building it should complete before building any other object. Instead of
# depending on a single artifact, build all dependencies first.

# motd.cpp is built twice from the same source file:
#   - motd_client.o: compiled with -DCLIENT and with LTO turned off
#     (-fno-lto); this is the object listed in REDIS_CLI_OBJ.
#   - motd_server.o: compiled with -DSERVER; this is the object listed in
#     REDIS_SERVER_OBJ.
# NOTE(review): presumably LTO is disabled for the client object because of
# the weak hi_sds* wrappers motd.cpp defines under CLIENT -- confirm.
motd_client.o: motd.cpp .make-prerequisites
$(REDIS_CXX) -MMD -o motd_client.o -c $< -DCLIENT -fno-lto
motd_server.o: motd.cpp .make-prerequisites
$(REDIS_CXX) -MMD -o motd_server.o -c $< -DSERVER
%.o: %.c .make-prerequisites
$(REDIS_CC) -MMD -o $@ -c $<

View File

@ -2726,6 +2726,8 @@ void redisDbPersistentData::ensure(const char *sdsKey, dictEntry **pde)
serverAssert(sdsKey != nullptr);
serverAssert(FImplies(*pde != nullptr, dictGetVal(*pde) != nullptr)); // early versions set a NULL object, this is no longer valid
serverAssert(m_refCount == 0);
if (m_pdbSnapshot == nullptr && g_pserver->m_pstorageFactory == nullptr)
return;
std::unique_lock<fastlock> ul(g_expireLock);
// First see if the key can be obtained from a snapshot

View File

@ -355,6 +355,8 @@ unsigned long LFUDecrAndReturn(robj_roptr o) {
return counter;
}
unsigned long getClientReplicationBacklogSharedUsage(client *c);
/* We don't want to count AOF buffers and slaves output buffers as
* used memory: the eviction should use mostly data size. This function
* returns the sum of AOF and slaves buffer. */
@ -371,9 +373,15 @@ size_t freeMemoryGetNotCountedMemory(void) {
while((ln = listNext(&li))) {
client *replica = (client*)listNodeValue(ln);
std::unique_lock<fastlock>(replica->lock);
overhead += getClientOutputBufferMemoryUsage(replica);
/* we don't want to count the replication backlog shared usage multiple times */
overhead += (getClientOutputBufferMemoryUsage(replica) - getClientReplicationBacklogSharedUsage(replica));
}
}
/* also don't count the replication backlog memory
* that's where the replication clients get their memory from */
overhead += g_pserver->repl_backlog_size;
if (g_pserver->aof_state != AOF_OFF) {
overhead += sdsalloc(g_pserver->aof_buf)+aofRewriteBufferSize();
}

View File

@ -1,7 +1,11 @@
#ifdef CLIENT
extern "C" {
#include <sdscompat.h>
#include <sds.h>
}
#else
#include "sds.h"
#endif
#include <cstring>
#include <unistd.h>
#include <sys/types.h>
@ -15,6 +19,7 @@ extern "C" {
#ifdef MOTD
#include <curl/curl.h>
#ifdef CLIENT
extern "C" {
__attribute__ ((weak)) hisds hi_sdscatlen(hisds s, const void *t, size_t len) {
return sdscatlen(s, t, len);
@ -23,6 +28,7 @@ __attribute__ ((weak)) hisds hi_sdscat(hisds s, const char *t) {
return sdscat(s, t);
}
}
#endif
static const char *szMotdCachePath()
{

View File

@ -136,6 +136,7 @@ client *createClient(connection *conn, int iel) {
client_id = g_pserver->next_client_id.fetch_add(1);
c->iel = iel;
c->id = client_id;
sprintf(c->lock.szName, "client %lu", client_id);
c->resp = 2;
c->conn = conn;
c->name = NULL;
@ -157,6 +158,7 @@ client *createClient(connection *conn, int iel) {
c->flags = 0;
c->fPendingAsyncWrite = FALSE;
c->fPendingAsyncWriteHandler = FALSE;
c->fPendingReplicaWrite = FALSE;
c->ctime = c->lastinteraction = g_pserver->unixtime;
/* If the default user does not require authentication, the user is
* directly authenticated. */
@ -234,6 +236,7 @@ void clientInstallWriteHandler(client *c) {
/* Schedule the client to write the output buffers to the socket only
* if not already done and, for slaves, if the replica can actually receive
* writes at this stage. */
if (!(c->flags & CLIENT_PENDING_WRITE) &&
(c->replstate == REPL_STATE_NONE ||
(c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack)))
@ -315,7 +318,7 @@ int prepareClientToWrite(client *c) {
/* Schedule the client to write the output buffers to the socket, unless
* it should already be setup to do so (it has already pending data). */
if (!fAsync && !clientHasPendingReplies(c)) clientInstallWriteHandler(c);
if (!fAsync && !clientHasPendingReplies(c) && !c->fPendingReplicaWrite) clientInstallWriteHandler(c);
if (fAsync && !(c->fPendingAsyncWrite)) clientInstallAsyncWriteHandler(c);
/* Authorize the caller to queue in the output buffer of this client. */
@ -1764,6 +1767,8 @@ client *lookupClientByID(uint64_t id) {
return (c == raxNotFound) ? NULL : c;
}
long long getReplIndexFromOffset(long long offset);
/* Write data in output buffers to client. Return C_OK if the client
* is still valid after the call, C_ERR if it was freed because of some
* error. If handler_installed is set, it will attempt to clear the
@ -1783,6 +1788,7 @@ int writeToClient(client *c, int handler_installed) {
std::unique_lock<decltype(c->lock)> lock(c->lock);
while(clientHasPendingReplies(c)) {
serverAssert(!(c->flags & CLIENT_SLAVE) || c->flags & CLIENT_MONITOR);
if (c->bufpos > 0) {
nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen);
if (nwritten <= 0) break;
@ -1790,7 +1796,7 @@ int writeToClient(client *c, int handler_installed) {
totwritten += nwritten;
/* If the buffer was sent, set bufpos to zero to continue with
* the remainder of the reply. */
* the remainder of the reply. */
if ((int)c->sentlen == c->bufpos) {
c->bufpos = 0;
c->sentlen = 0;
@ -1837,6 +1843,48 @@ int writeToClient(client *c, int handler_installed) {
!(c->flags & CLIENT_SLAVE)) break;
}
/* We can only directly read from the replication backlog if the client
is a replica, so only attempt to do so if that's the case. */
if (c->flags & CLIENT_SLAVE && !(c->flags & CLIENT_MONITOR)) {
std::unique_lock<fastlock> repl_backlog_lock (g_pserver->repl_backlog_lock);
long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off);
serverAssert(c->repl_curr_off != -1);
if (c->repl_curr_off != c->repl_end_off){
long long repl_curr_idx = getReplIndexFromOffset(c->repl_curr_off);
long long nwritten2ndStage = 0; /* How much was written from the start of the replication backlog
* in the event of a wrap around write */
/* normal case with no wrap around */
if (repl_end_idx >= repl_curr_idx){
nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, repl_end_idx - repl_curr_idx);
/* wrap around case */
} else {
nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, g_pserver->repl_backlog_size - repl_curr_idx);
/* only attempt wrapping if we write the correct number of bytes */
if (nwritten == g_pserver->repl_backlog_size - repl_curr_idx){
nwritten2ndStage = connWrite(c->conn, g_pserver->repl_backlog, repl_end_idx);
if (nwritten2ndStage != -1)
nwritten += nwritten2ndStage;
}
}
/* only increment bytes if an error didn't occur */
if (nwritten > 0){
totwritten += nwritten;
c->repl_curr_off += nwritten;
serverAssert(c->repl_curr_off <= c->repl_end_off);
/* If the client's current offset matches the last offset it can read from, there is no pending write */
if (c->repl_curr_off == c->repl_end_off){
c->fPendingReplicaWrite = false;
}
}
/* If the second part of a write didn't go through, we still need to register that */
if (nwritten2ndStage == -1) nwritten = -1;
}
}
g_pserver->stat_net_output_bytes += totwritten;
if (nwritten == -1) {
if (connGetState(c->conn) != CONN_STATE_CONNECTED) {
@ -1854,7 +1902,7 @@ int writeToClient(client *c, int handler_installed) {
* We just rely on data / pings received for timeout detection. */
if (!(c->flags & CLIENT_MASTER)) c->lastinteraction = g_pserver->unixtime;
}
if (!clientHasPendingReplies(c)) {
if (!clientHasPendingReplies(c) && !c->fPendingReplicaWrite) {
c->sentlen = 0;
if (handler_installed) connSetWriteHandler(c->conn, NULL);
@ -1898,27 +1946,37 @@ void ProcessPendingAsyncWrites()
serverAssert(c->fPendingAsyncWrite);
if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_CLOSE_AFTER_REPLY))
{
zfree(c->replyAsync);
c->replyAsync = nullptr;
if (c->replyAsync != nullptr){
zfree(c->replyAsync);
c->replyAsync = nullptr;
}
c->fPendingAsyncWrite = FALSE;
continue;
}
int size = c->replyAsync->used;
/* since writes from master to replica can come directly from the replication backlog,
* writes may have been signalled without having been copied to the replyAsync buffer,
* thus causing the buffer to be NULL */
if (c->replyAsync != nullptr){
int size = c->replyAsync->used;
if (listLength(c->reply) == 0 && size <= (PROTO_REPLY_CHUNK_BYTES - c->bufpos)) {
memcpy(c->buf + c->bufpos, c->replyAsync->buf(), size);
c->bufpos += size;
} else {
c->reply_bytes += c->replyAsync->size;
listAddNodeTail(c->reply, c->replyAsync);
if (listLength(c->reply) == 0 && size <= (PROTO_REPLY_CHUNK_BYTES - c->bufpos)) {
memcpy(c->buf + c->bufpos, c->replyAsync->buf(), size);
c->bufpos += size;
} else {
c->reply_bytes += c->replyAsync->size;
listAddNodeTail(c->reply, c->replyAsync);
c->replyAsync = nullptr;
}
zfree(c->replyAsync);
c->replyAsync = nullptr;
} else {
/* Only replicas should have empty async reply buffers */
serverAssert(c->flags & CLIENT_SLAVE);
}
zfree(c->replyAsync);
c->replyAsync = nullptr;
c->fPendingAsyncWrite = FALSE;
// Now install the write event handler
int ae_flags = AE_WRITABLE|AE_WRITE_THREADSAFE;
/* For the fsync=always policy, we want that a given FD is never
@ -2024,9 +2082,10 @@ int handleClientsWithPendingWrites(int iel, int aof_state) {
/* If after the synchronous writes above we still have data to
* output to the client, we need to install the writable handler. */
if (clientHasPendingReplies(c)) {
if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR)
if (clientHasPendingReplies(c) || c->fPendingReplicaWrite) {
if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_flags, true) == C_ERR) {
freeClientAsync(c);
}
}
}
@ -3647,6 +3706,12 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) {
}
}
/* Replica clients are fed directly from the shared replication backlog
 * instead of from their own internal reply buffer. This returns how many
 * backlog bytes are attributed to client 'c':
 *   - 0 when 'c' does not have CLIENT_SLAVE set, or has no replica write
 *     pending (fPendingReplicaWrite is false);
 *   - otherwise the distance from the client's repl_curr_off to the global
 *     master_repl_offset. */
unsigned long getClientReplicationBacklogSharedUsage(client *c) {
    const bool fReplica = (c->flags & CLIENT_SLAVE) != 0;
    if (!fReplica || !c->fPendingReplicaWrite)
        return 0;
    return g_pserver->master_repl_offset - c->repl_curr_off;
}
/* This function returns the number of bytes that Redis is
* using to store the reply still not read by the client.
*
@ -3655,9 +3720,11 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) {
* enforcing the client output length limits. */
unsigned long getClientOutputBufferMemoryUsage(client *c) {
unsigned long list_item_size = sizeof(listNode) + sizeof(clientReplyBlock);
return c->reply_bytes + (list_item_size*listLength(c->reply)) + (c->replyAsync ? c->replyAsync->size : 0);
return c->reply_bytes + (list_item_size*listLength(c->reply)) + (c->replyAsync ? c->replyAsync->size : 0) + getClientReplicationBacklogSharedUsage(c);
}
/* Get the class of a client, used in order to enforce limits to different
* classes of clients.
*

View File

@ -189,6 +189,7 @@ void createReplicationBacklog(void) {
g_pserver->repl_backlog = (char*)zmalloc(g_pserver->repl_backlog_size, MALLOC_LOCAL);
g_pserver->repl_backlog_histlen = 0;
g_pserver->repl_backlog_idx = 0;
g_pserver->repl_backlog_start = g_pserver->master_repl_offset;
/* We don't have any data inside our buffer, but virtually the first
* byte we have is the next byte that will be generated for the
@ -200,6 +201,15 @@ void createReplicationBacklog(void) {
g_pserver->repl_batch_offStart = g_pserver->master_repl_offset;
}
/* Compute the index inside the circular replication backlog buffer that
 * corresponds to a replication offset:
 *
 *     index = (offset - repl_backlog_start) mod repl_backlog_size
 *
 * Since this computation needs the size of the replication backlog, the
 * caller must hold repl_backlog_lock (asserted below).
 *
 * The C++ '%' operator truncates toward zero, so an offset that lies below
 * repl_backlog_start would yield a negative remainder. Callers add the
 * result to the repl_backlog pointer, so the remainder is wrapped back into
 * [0, repl_backlog_size) to guarantee an in-bounds index. */
long long getReplIndexFromOffset(long long offset){
    serverAssert(g_pserver->repl_backlog_lock.fOwnLock());
    long long index = (offset - g_pserver->repl_backlog_start) % g_pserver->repl_backlog_size;
    if (index < 0)
        index += g_pserver->repl_backlog_size;
    return index;
}
/* This function is called when the user modifies the replication backlog
* size at runtime. It is up to the function to both update the
* g_pserver->repl_backlog_size and to resize the buffer and setup it so that
@ -211,6 +221,8 @@ void resizeReplicationBacklog(long long newsize) {
newsize = CONFIG_REPL_BACKLOG_MIN_SIZE;
if (g_pserver->repl_backlog_size == newsize) return;
std::unique_lock<fastlock> repl_backlog_lock (g_pserver->repl_backlog_lock);
if (g_pserver->repl_backlog != NULL) {
/* What we actually do is to flush the old buffer and realloc a new
* empty one. It will refill with new data incrementally.
@ -218,19 +230,23 @@ void resizeReplicationBacklog(long long newsize) {
* worse often we need to alloc additional space before freeing the
* old buffer. */
if (g_pserver->repl_batch_idxStart >= 0) {
// We need to keep critical data so we can't shrink less than the hot data in the buffer
newsize = std::max(newsize, g_pserver->master_repl_offset - g_pserver->repl_batch_offStart);
char *backlog = (char*)zmalloc(newsize);
g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - g_pserver->repl_batch_offStart;
/* get the critical client size, i.e. the size of the data unflushed to clients */
long long earliest_off = g_pserver->repl_lowest_off.load();
if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) {
auto cbActiveBacklog = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart;
memcpy(backlog, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbActiveBacklog);
if (earliest_off != -1) {
// We need to keep critical data so we can't shrink less than the hot data in the buffer
newsize = std::max(newsize, g_pserver->master_repl_offset - earliest_off);
char *backlog = (char*)zmalloc(newsize);
g_pserver->repl_backlog_histlen = g_pserver->master_repl_offset - earliest_off;
long long earliest_idx = getReplIndexFromOffset(earliest_off);
if (g_pserver->repl_backlog_idx >= earliest_idx) {
auto cbActiveBacklog = g_pserver->repl_backlog_idx - earliest_idx;
memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbActiveBacklog);
serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog);
} else {
auto cbPhase1 = g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart;
memcpy(backlog, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1);
auto cbPhase1 = g_pserver->repl_backlog_size - earliest_idx;
memcpy(backlog, g_pserver->repl_backlog + earliest_idx, cbPhase1);
memcpy(backlog + cbPhase1, g_pserver->repl_backlog, g_pserver->repl_backlog_idx);
auto cbActiveBacklog = cbPhase1 + g_pserver->repl_backlog_idx;
serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog);
@ -238,7 +254,10 @@ void resizeReplicationBacklog(long long newsize) {
zfree(g_pserver->repl_backlog);
g_pserver->repl_backlog = backlog;
g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen;
g_pserver->repl_batch_idxStart = 0;
g_pserver->repl_batch_idxStart -= earliest_idx;
if (g_pserver->repl_batch_idxStart < 0)
g_pserver->repl_batch_idxStart += g_pserver->repl_backlog_size;
g_pserver->repl_backlog_start = earliest_off;
} else {
zfree(g_pserver->repl_backlog);
g_pserver->repl_backlog = (char*)zmalloc(newsize);
@ -246,11 +265,13 @@ void resizeReplicationBacklog(long long newsize) {
g_pserver->repl_backlog_idx = 0;
/* Next byte we have is... the next since the buffer is empty. */
g_pserver->repl_backlog_off = g_pserver->master_repl_offset+1;
g_pserver->repl_backlog_start = g_pserver->master_repl_offset;
}
}
g_pserver->repl_backlog_size = newsize;
}
void freeReplicationBacklog(void) {
serverAssert(GlobalLocksAcquired());
listIter li;
@ -273,12 +294,20 @@ void feedReplicationBacklog(const void *ptr, size_t len) {
serverAssert(GlobalLocksAcquired());
const unsigned char *p = (const unsigned char*)ptr;
if (g_pserver->repl_batch_idxStart >= 0) {
long long minimumsize = g_pserver->master_repl_offset + len - g_pserver->repl_batch_offStart+1;
/* We are lower bounded by the lowest replica offset, or the batch offset start if not applicable */
long long lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst);
if (lower_bound == -1)
lower_bound = g_pserver->repl_batch_offStart;
long long minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1;
if (minimumsize > g_pserver->repl_backlog_size) {
flushReplBacklogToClients();
serverAssert(g_pserver->master_repl_offset == g_pserver->repl_batch_offStart);
minimumsize = g_pserver->master_repl_offset + len - g_pserver->repl_batch_offStart+1;
lower_bound = g_pserver->repl_lowest_off.load(std::memory_order_seq_cst);
if (lower_bound == -1)
lower_bound = g_pserver->repl_batch_offStart;
minimumsize = g_pserver->master_repl_offset + len - lower_bound + 1;
if (minimumsize > g_pserver->repl_backlog_size && minimumsize < (long long)cserver.client_obuf_limits[CLIENT_TYPE_SLAVE].hard_limit_bytes) {
// This is an emergency overflow, we better resize to fit
@ -293,6 +322,7 @@ void feedReplicationBacklog(const void *ptr, size_t len) {
/* This is a circular buffer, so write as much data we can at every
* iteration and rewind the "idx" index if we reach the limit. */
while(len) {
size_t thislen = g_pserver->repl_backlog_size - g_pserver->repl_backlog_idx;
if (thislen > len) thislen = len;
@ -479,7 +509,6 @@ void replicationFeedSlavesCore(list *slaves, int dictid, robj **argv, int argc)
if (fSendRaw)
{
char aux[LONG_STR_SIZE+3];
/* Add the multi bulk reply length. */
aux[0] = '*';
int multilen = ll2string(aux+1,sizeof(aux)-1,argc);
@ -653,15 +682,19 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv,
decrRefCount(cmdobj);
}
int prepareClientToWrite(client *c);
/* Feed the replica 'c' with the replication backlog starting from the
* specified 'offset' up to the end of the backlog. */
long long addReplyReplicationBacklog(client *c, long long offset) {
long long j, skip, len;
long long skip, len;
serverLog(LL_DEBUG, "[PSYNC] Replica request offset: %lld", offset);
if (g_pserver->repl_backlog_histlen == 0) {
serverLog(LL_DEBUG, "[PSYNC] Backlog history len is zero");
c->repl_curr_off = g_pserver->master_repl_offset;
c->repl_end_off = g_pserver->master_repl_offset;
return 0;
}
@ -678,31 +711,20 @@ long long addReplyReplicationBacklog(client *c, long long offset) {
skip = offset - g_pserver->repl_backlog_off;
serverLog(LL_DEBUG, "[PSYNC] Skipping: %lld", skip);
/* Point j to the oldest byte, that is actually our
* g_pserver->repl_backlog_off byte. */
j = (g_pserver->repl_backlog_idx +
(g_pserver->repl_backlog_size-g_pserver->repl_backlog_histlen)) %
g_pserver->repl_backlog_size;
serverLog(LL_DEBUG, "[PSYNC] Index of first byte: %lld", j);
/* Discard the amount of data to seek to the specified 'offset'. */
j = (j + skip) % g_pserver->repl_backlog_size;
/* Feed replica with data. Since it is a circular buffer we have to
* split the reply in two parts if we are cross-boundary. */
len = g_pserver->repl_backlog_histlen - skip;
serverLog(LL_DEBUG, "[PSYNC] Reply total length: %lld", len);
while(len) {
long long thislen =
((g_pserver->repl_backlog_size - j) < len) ?
(g_pserver->repl_backlog_size - j) : len;
serverLog(LL_DEBUG, "[PSYNC] addReply() length: %lld", thislen);
addReplySds(c,sdsnewlen(g_pserver->repl_backlog + j, thislen));
len -= thislen;
j = 0;
}
return g_pserver->repl_backlog_histlen - skip;
/* Set the start and end offsets for the replica so that a future
* writeToClient will send the backlog from the given offset to
* the current end of the backlog to said replica */
c->repl_curr_off = offset - 1;
c->repl_end_off = g_pserver->master_repl_offset;
/* Force the partial sync to be queued */
prepareClientToWrite(c);
c->fPendingReplicaWrite = true;
return len;
}
/* Return the offset to provide as reply to the PSYNC command received
@ -735,6 +757,10 @@ int replicationSetupSlaveForFullResync(client *replica, long long offset) {
replica->psync_initial_offset = offset;
replica->replstate = SLAVE_STATE_WAIT_BGSAVE_END;
replica->repl_curr_off = offset;
replica->repl_end_off = g_pserver->master_repl_offset;
/* We are going to accumulate the incremental changes for this
* replica as well. Set replicaseldb to -1 in order to force to re-emit
* a SELECT statement in the replication stream. */
@ -1357,6 +1383,7 @@ void replconfCommand(client *c) {
* 4) Update the count of "good replicas". */
void putSlaveOnline(client *replica) {
replica->replstate = SLAVE_STATE_ONLINE;
replica->repl_put_online_on_ack = 0;
replica->repl_ack_time = g_pserver->unixtime; /* Prevent false timeout. */
@ -3059,6 +3086,11 @@ void syncWithMaster(connection *conn) {
if (psync_result == PSYNC_CONTINUE) {
serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Master accepted a Partial Resynchronization.");
/* Reset the bulklen information in case it is lingering from the last connection.
* The partial sync will start from the beginning of a command, so these should be reset. */
mi->master->reqtype = 0;
mi->master->multibulklen = 0;
mi->master->bulklen = -1;
if (cserver.supervised_mode == SUPERVISED_SYSTEMD) {
redisCommunicateSystemd("STATUS=MASTER <-> REPLICA sync: Partial Resynchronization accepted. Ready to accept connections in read-write mode.\n");
}
@ -4898,15 +4930,19 @@ void replicateSubkeyExpire(redisDb *db, robj_roptr key, robj_roptr subkey, long
}
void _clientAsyncReplyBufferReserve(client *c, size_t len);
void flushReplBacklogToClients()
{
serverAssert(GlobalLocksAcquired());
/* If we have the repl backlog lock, we will deadlock */
serverAssert(!g_pserver->repl_backlog_lock.fOwnLock());
if (g_pserver->repl_batch_offStart < 0)
return;
if (g_pserver->repl_batch_offStart != g_pserver->master_repl_offset) {
bool fAsyncWrite = false;
long long min_offset = LONG_LONG_MAX;
// Ensure no overflow
serverAssert(g_pserver->repl_batch_offStart < g_pserver->master_repl_offset);
if (g_pserver->master_repl_offset - g_pserver->repl_batch_offStart > g_pserver->repl_backlog_size) {
// We overflowed
@ -4930,33 +4966,36 @@ void flushReplBacklogToClients()
listIter li;
listNode *ln;
listRewind(g_pserver->slaves, &li);
/* We don't actually write any data in this function since we send data
* directly from the replication backlog to replicas in writeToClient.
*
* What we do however, is set the end offset of each replica here. This way,
* future calls to writeToClient will know up to where in the replication
* backlog is valid for writing. */
while ((ln = listNext(&li))) {
client *replica = (client*)listNodeValue(ln);
if (!canFeedReplicaReplBuffer(replica)) continue;
if (replica->flags & CLIENT_CLOSE_ASAP) continue;
std::unique_lock<fastlock> ul(replica->lock, std::defer_lock);
if (FCorrectThread(replica))
ul.lock();
else
std::unique_lock<fastlock> ul(replica->lock);
if (!FCorrectThread(replica))
fAsyncWrite = true;
if (g_pserver->repl_backlog_idx >= g_pserver->repl_batch_idxStart) {
long long cbCopy = g_pserver->repl_backlog_idx - g_pserver->repl_batch_idxStart;
serverAssert((g_pserver->master_repl_offset - g_pserver->repl_batch_offStart) == cbCopy);
serverAssert((g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart) >= (cbCopy));
serverAssert((g_pserver->repl_batch_idxStart + cbCopy) <= g_pserver->repl_backlog_size);
/* We should have set the repl_curr_off when synchronizing, so it shouldn't be -1 here */
serverAssert(replica->repl_curr_off != -1);
addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbCopy);
} else {
auto cbPhase1 = g_pserver->repl_backlog_size - g_pserver->repl_batch_idxStart;
if (fAsyncWrite)
_clientAsyncReplyBufferReserve(replica, cbPhase1 + g_pserver->repl_backlog_idx);
addReplyProto(replica, g_pserver->repl_backlog + g_pserver->repl_batch_idxStart, cbPhase1);
addReplyProto(replica, g_pserver->repl_backlog, g_pserver->repl_backlog_idx);
serverAssert((cbPhase1 + g_pserver->repl_backlog_idx) == (g_pserver->master_repl_offset - g_pserver->repl_batch_offStart));
min_offset = std::min(min_offset, replica->repl_curr_off);
replica->repl_end_off = g_pserver->master_repl_offset;
/* Only prepare the client to write if there isn't already a pending write */
if (!replica->fPendingReplicaWrite){
serverAssert(replica->repl_curr_off != g_pserver->master_repl_offset);
prepareClientToWrite(replica);
replica->fPendingReplicaWrite = true;
}
}
if (fAsyncWrite)
ProcessPendingAsyncWrites();
@ -4965,6 +5004,7 @@ LDone:
// This may be called multiple times per "frame" so update with our progress flushing to clients
g_pserver->repl_batch_idxStart = g_pserver->repl_backlog_idx;
g_pserver->repl_batch_offStart = g_pserver->master_repl_offset;
g_pserver->repl_lowest_off.store(min_offset == LONG_LONG_MAX ? -1 : min_offset, std::memory_order_seq_cst);
}
}

View File

@ -2021,7 +2021,6 @@ void clientsCron(int iel) {
while(listLength(g_pserver->clients) && iterations--) {
client *c;
listNode *head;
/* Rotate the list, take the current head, process.
* This way if the client must be removed from the list it's the
* first element and we don't incur into O(N) computation. */
@ -3261,6 +3260,7 @@ void initServerConfig(void) {
g_pserver->enable_multimaster = CONFIG_DEFAULT_ENABLE_MULTIMASTER;
g_pserver->repl_syncio_timeout = CONFIG_REPL_SYNCIO_TIMEOUT;
g_pserver->master_repl_offset = 0;
g_pserver->repl_lowest_off.store(-1, std::memory_order_seq_cst);
/* Replication partial resync backlog */
g_pserver->repl_backlog = NULL;
@ -7012,9 +7012,10 @@ void OnTerminate()
void wakeTimeThread() {
updateCachedTime();
std::lock_guard<std::mutex> lock(time_thread_mutex);
if (sleeping_threads >= cserver.cthreads)
time_thread_cv.notify_one();
sleeping_threads--;
serverAssert(sleeping_threads >= 0);
time_thread_cv.notify_one();
}
void *timeThreadMain(void*) {

View File

@ -1589,6 +1589,13 @@ struct client {
long long psync_initial_offset; /* FULLRESYNC reply offset other slaves
copying this replica output buffer
should use. */
long long repl_curr_off = -1;/* Replication offset of the replica, also where in the backlog we need to start from
* when sending data to this replica. */
long long repl_end_off = -1; /* Replication offset to write to, stored in the replica, as opposed to using the global offset
* to prevent needing the global lock */
int fPendingReplicaWrite; /* Is there a write queued for this replica? */
char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */
int slave_listening_port; /* As configured with: REPLCONF listening-port */
char *slave_addr; /* Optionally given by REPLCONF ip-address */
@ -2357,6 +2364,9 @@ struct redisServer {
that is the next byte we'll write to.*/
long long repl_backlog_off; /* Replication "master offset" of first
byte in the replication backlog buffer.*/
long long repl_backlog_start; /* Used to compute indicies from offsets
basically, index = (offset - start) % size */
fastlock repl_backlog_lock {"replication backlog"};
time_t repl_backlog_time_limit; /* Time without slaves after the backlog
gets released. */
time_t repl_no_slaves_since; /* We have no slaves since that time.
@ -2368,6 +2378,8 @@ struct redisServer {
int repl_diskless_load; /* Slave parse RDB directly from the socket.
* see REPL_DISKLESS_LOAD_* enum */
int repl_diskless_sync_delay; /* Delay to start a diskless repl BGSAVE. */
std::atomic <long long> repl_lowest_off; /* The lowest offset amongst all replicas
-1 if there are no replicas */
/* Replication (replica) */
list *masters;
int enable_multimaster;
@ -3713,6 +3725,8 @@ void mixDigest(unsigned char *digest, const void *ptr, size_t len);
void xorDigest(unsigned char *digest, const void *ptr, size_t len);
int populateCommandTableParseFlags(struct redisCommand *c, const char *strflags);
int moduleGILAcquiredByModule(void);
extern int g_fInCrash;
static inline int GlobalLocksAcquired(void) // Used in asserts to verify all global locks are correctly acquired for a server-thread to operate
@ -3780,6 +3794,7 @@ void tlsCleanup(void);
int tlsConfigure(redisTLSContextConfig *ctx_config);
class ShutdownException
{};
@ -3791,3 +3806,5 @@ class ShutdownException
int iAmMaster(void);
#endif

View File

@ -33,7 +33,8 @@ start_server {tags {"maxmemory"}} {
# Get the current memory limit and calculate a new limit.
# We just add 100k to the current memory size so that it is
# fast for us to reach that limit.
set used [s used_memory]
set overhead [s mem_not_counted_for_evict]
set used [expr [s used_memory] - $overhead]
set limit [expr {$used+100*1024}]
r config set maxmemory $limit
r config set maxmemory-policy $policy
@ -42,7 +43,7 @@ start_server {tags {"maxmemory"}} {
while 1 {
r setex [randomKey] 10000 x
incr numkeys
if {[s used_memory]+4096 > $limit} {
if {[expr {[s used_memory] - $overhead + 4096}] > $limit} {
assert {$numkeys > 10}
break
}
@ -52,7 +53,8 @@ start_server {tags {"maxmemory"}} {
for {set j 0} {$j < $numkeys} {incr j} {
r setex [randomKey] 10000 x
}
assert {[s used_memory] < ($limit+4096)}
set used_amt [expr [s used_memory] - $overhead]
assert {$used_amt < ($limit+4096)}
}
}
@ -65,7 +67,8 @@ start_server {tags {"maxmemory"}} {
# Get the current memory limit and calculate a new limit.
# We just add 100k to the current memory size so that it is
# fast for us to reach that limit.
set used [s used_memory]
set overhead [s mem_not_counted_for_evict]
set used [expr [s used_memory] - $overhead]
set limit [expr {$used+100*1024}]
r config set maxmemory $limit
r config set maxmemory-policy $policy
@ -74,7 +77,7 @@ start_server {tags {"maxmemory"}} {
while 1 {
r set [randomKey] x
incr numkeys
if {[s used_memory]+4096 > $limit} {
if {[expr [s used_memory] - $overhead]+4096 > $limit} {
assert {$numkeys > 10}
break
}
@ -91,7 +94,7 @@ start_server {tags {"maxmemory"}} {
}
}
if {[string match allkeys-* $policy]} {
assert {[s used_memory] < ($limit+4096)}
assert {[expr [s used_memory] - $overhead] < ($limit+4096)}
} else {
assert {$err == 1}
}
@ -107,7 +110,8 @@ start_server {tags {"maxmemory"}} {
# Get the current memory limit and calculate a new limit.
# We just add 100k to the current memory size so that it is
# fast for us to reach that limit.
set used [s used_memory]
set overhead [s mem_not_counted_for_evict]
set used [expr [s used_memory] - $overhead]
set limit [expr {$used+100*1024}]
r config set maxmemory $limit
r config set maxmemory-policy $policy
@ -121,7 +125,7 @@ start_server {tags {"maxmemory"}} {
} else {
r set "key:$numkeys" x
}
if {[s used_memory]+4096 > $limit} {
if {[expr [s used_memory] - $overhead]+4096 > $limit} {
assert {$numkeys > 10}
break
}
@ -135,7 +139,7 @@ start_server {tags {"maxmemory"}} {
catch {r setex "foo:$j" 10000 x}
}
# We should still be under the limit.
assert {[s used_memory] < ($limit+4096)}
assert {[expr [s used_memory] - $overhead] < ($limit+4096)}
# However all our non volatile keys should be here.
for {set j 0} {$j < $numkeys} {incr j 2} {
assert {[r exists "key:$j"]}
@ -305,7 +309,8 @@ start_server {tags {"maxmemory"} overrides {server-threads 1}} {
# we need to make sure to evict keynames of a total size of more than
# 16kb since the (PROTO_REPLY_CHUNK_BYTES), only after that the
# invalidation messages have a chance to trigger further eviction.
set used [s used_memory]
set overhead [s mem_not_counted_for_evict]
set used [expr [s used_memory] - $overhead]
set limit [expr {$used - 40000}]
r config set maxmemory $limit