/* Object implementation.
 *
 * Copyright (c) 2009-2012, Salvatore Sanfilippo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "server.h"
#include "functions.h"
#include "intset.h" /* Compact integer set structure */
#include <math.h>
#include <ctype.h>

#ifdef __CYGWIN__
#define strtold(a, b) ((long double)strtod((a), (b)))
#endif

/* ===================== Creation and parsing of objects ==================== */

robj *createObject(int type, void *ptr) {
    robj *o = zmalloc(sizeof(*o));
    o->type = type;
    o->encoding = OBJ_ENCODING_RAW;
    o->ptr = ptr;
    o->refcount = 1;
    o->lru = 0;
    return o;
}

void initObjectLRUOrLFU(robj *o) {
    if (o->refcount == OBJ_SHARED_REFCOUNT) return;
    /* Set the LRU to the current lruclock (minutes resolution), or
     * alternatively the LFU counter. */
    if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
        o->lru = (LFUGetTimeInMinutes() << 8) | LFU_INIT_VAL;
    } else {
        o->lru = LRU_CLOCK();
    }
    return;
}

/* Set a special refcount in the object to make it "shared":
 * incrRefCount and decrRefCount() will test for this special refcount
 * and will not touch the object. This way it is free to access shared
 * objects such as small integers from different threads without any
 * mutex.
 *
 * A common pattern to create shared objects:
 *
 * robj *myobject = makeObjectShared(createObject(...));
 *
 */
robj *makeObjectShared(robj *o) {
    serverAssert(o->refcount == 1);
    o->refcount = OBJ_SHARED_REFCOUNT;
    return o;
}

/* Create a string object with encoding OBJ_ENCODING_RAW, that is a plain
 * string object where o->ptr points to a proper sds string. */
robj *createRawStringObject(const char *ptr, size_t len) {
    return createObject(OBJ_STRING, sdsnewlen(ptr, len));
}

/* Create a string object with encoding OBJ_ENCODING_EMBSTR, that is
 * an object where the sds string is actually an unmodifiable string
 * allocated in the same chunk as the object itself.
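 *
 * Rough layout sketch (editor's illustration, assuming the usual 64-bit build
 * where sizeof(robj) == 16 and sizeof(struct sdshdr8) == 3):
 *
 *   +------------+--------------+--------------+
 *   | robj (16B) | sdshdr8 (3B) | buf[len + 1] |
 *   +------------+--------------+--------------+
 *                                 ^ o->ptr
 *
 * o->ptr points at buf, so the regular sds functions keep working on it, while
 * the whole object is created and released with a single allocation. With the
 * 44 byte EMBSTR limit below, this adds up to 16 + 3 + 44 + 1 = 64 bytes, i.e.
 * exactly one 64 byte jemalloc arena slot.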
*/ robj *createEmbeddedStringObject(const char *ptr, size_t len) { robj *o = zmalloc(sizeof(robj) + sizeof(struct sdshdr8) + len + 1); struct sdshdr8 *sh = (void *)(o + 1); o->type = OBJ_STRING; o->encoding = OBJ_ENCODING_EMBSTR; o->ptr = sh + 1; o->refcount = 1; o->lru = 0; sh->len = len; sh->alloc = len; sh->flags = SDS_TYPE_8; if (ptr == SDS_NOINIT) sh->buf[len] = '\0'; else if (ptr) { memcpy(sh->buf, ptr, len); sh->buf[len] = '\0'; } else { memset(sh->buf, 0, len + 1); } return o; } /* Create a string object with EMBSTR encoding if it is smaller than * OBJ_ENCODING_EMBSTR_SIZE_LIMIT, otherwise the RAW encoding is * used. * * The current limit of 44 is chosen so that the biggest string object * we allocate as EMBSTR will still fit into the 64 byte arena of jemalloc. */ #define OBJ_ENCODING_EMBSTR_SIZE_LIMIT 44 robj *createStringObject(const char *ptr, size_t len) { if (len <= OBJ_ENCODING_EMBSTR_SIZE_LIMIT) return createEmbeddedStringObject(ptr, len); else return createRawStringObject(ptr, len); } /* Same as CreateRawStringObject, can return NULL if allocation fails */ robj *tryCreateRawStringObject(const char *ptr, size_t len) { sds str = sdstrynewlen(ptr, len); if (!str) return NULL; return createObject(OBJ_STRING, str); } /* Same as createStringObject, can return NULL if allocation fails */ robj *tryCreateStringObject(const char *ptr, size_t len) { if (len <= OBJ_ENCODING_EMBSTR_SIZE_LIMIT) return createEmbeddedStringObject(ptr, len); else return tryCreateRawStringObject(ptr, len); } /* Create a string object from a long long value according to the specified flag. */ #define LL2STROBJ_AUTO 0 /* automatically create the optimal string object */ #define LL2STROBJ_NO_SHARED 1 /* disallow shared objects */ #define LL2STROBJ_NO_INT_ENC 2 /* disallow integer encoded objects. */ robj *createStringObjectFromLongLongWithOptions(long long value, int flag) { robj *o; if (value >= 0 && value < OBJ_SHARED_INTEGERS && flag == LL2STROBJ_AUTO) { o = shared.integers[value]; } else { if ((value >= LONG_MIN && value <= LONG_MAX) && flag != LL2STROBJ_NO_INT_ENC) { o = createObject(OBJ_STRING, NULL); o->encoding = OBJ_ENCODING_INT; o->ptr = (void *)((long)value); } else { char buf[LONG_STR_SIZE]; int len = ll2string(buf, sizeof(buf), value); o = createStringObject(buf, len); } } return o; } /* Wrapper for createStringObjectFromLongLongWithOptions() always demanding * to create a shared object if possible. */ robj *createStringObjectFromLongLong(long long value) { return createStringObjectFromLongLongWithOptions(value, LL2STROBJ_AUTO); } /* The function avoids returning a shared integer when LFU/LRU info * are needed, that is, when the object is used as a value in the key * space(for instance when the INCR command is used), and the server is * configured to evict based on LFU/LRU, so we want LFU/LRU values * specific for each key. */ robj *createStringObjectFromLongLongForValue(long long value) { if (server.maxmemory == 0 || !(server.maxmemory_policy & MAXMEMORY_FLAG_NO_SHARED_INTEGERS)) { /* If the maxmemory policy permits, we can still return shared integers */ return createStringObjectFromLongLongWithOptions(value, LL2STROBJ_AUTO); } else { return createStringObjectFromLongLongWithOptions(value, LL2STROBJ_NO_SHARED); } } /* Create a string object that contains an sds inside it. That means it can't be * integer encoded (OBJ_ENCODING_INT), and it'll always be an EMBSTR type. 
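 *
 * For example (editor's note, illustrative only):
 *
 *     robj *o = createStringObjectFromLongLongWithSds(123);
 *
 * yields a fresh, unshared object with encoding OBJ_ENCODING_EMBSTR whose
 * o->ptr is the sds string "123", never a shared integer nor a pointer-packed
 * OBJ_ENCODING_INT value, so callers can treat o->ptr as a normal sds string.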
*/ robj *createStringObjectFromLongLongWithSds(long long value) { return createStringObjectFromLongLongWithOptions(value, LL2STROBJ_NO_INT_ENC); } /* Create a string object from a long double. If humanfriendly is non-zero * it does not use exponential format and trims trailing zeroes at the end, * however this results in loss of precision. Otherwise exp format is used * and the output of snprintf() is not modified. * * The 'humanfriendly' option is used for INCRBYFLOAT and HINCRBYFLOAT. */ robj *createStringObjectFromLongDouble(long double value, int humanfriendly) { char buf[MAX_LONG_DOUBLE_CHARS]; int len = ld2string(buf, sizeof(buf), value, humanfriendly ? LD_STR_HUMAN : LD_STR_AUTO); return createStringObject(buf, len); } /* Duplicate a string object, with the guarantee that the returned object * has the same encoding as the original one. * * This function also guarantees that duplicating a small integer object * (or a string object that contains a representation of a small integer) * will always result in a fresh object that is unshared (refcount == 1). * * The resulting object always has refcount set to 1. */ robj *dupStringObject(const robj *o) { robj *d; serverAssert(o->type == OBJ_STRING); switch (o->encoding) { case OBJ_ENCODING_RAW: return createRawStringObject(o->ptr, sdslen(o->ptr)); case OBJ_ENCODING_EMBSTR: return createEmbeddedStringObject(o->ptr, sdslen(o->ptr)); case OBJ_ENCODING_INT: d = createObject(OBJ_STRING, NULL); d->encoding = OBJ_ENCODING_INT; d->ptr = o->ptr; return d; default: serverPanic("Wrong encoding."); break; } } robj *createQuicklistObject(int fill, int compress) { quicklist *l = quicklistNew(fill, compress); robj *o = createObject(OBJ_LIST, l); o->encoding = OBJ_ENCODING_QUICKLIST; return o; } robj *createListListpackObject(void) { unsigned char *lp = lpNew(0); robj *o = createObject(OBJ_LIST, lp); o->encoding = OBJ_ENCODING_LISTPACK; return o; } robj *createSetObject(void) { dict *d = dictCreate(&setDictType); robj *o = createObject(OBJ_SET, d); o->encoding = OBJ_ENCODING_HT; return o; } robj *createIntsetObject(void) { intset *is = intsetNew(); robj *o = createObject(OBJ_SET, is); o->encoding = OBJ_ENCODING_INTSET; return o; } robj *createSetListpackObject(void) { unsigned char *lp = lpNew(0); robj *o = createObject(OBJ_SET, lp); o->encoding = OBJ_ENCODING_LISTPACK; return o; } robj *createHashObject(void) { unsigned char *zl = lpNew(0); robj *o = createObject(OBJ_HASH, zl); o->encoding = OBJ_ENCODING_LISTPACK; return o; } robj *createZsetObject(void) { zset *zs = zmalloc(sizeof(*zs)); robj *o; zs->dict = dictCreate(&zsetDictType); zs->zsl = zslCreate(); o = createObject(OBJ_ZSET, zs); o->encoding = OBJ_ENCODING_SKIPLIST; return o; } robj *createZsetListpackObject(void) { unsigned char *lp = lpNew(0); robj *o = createObject(OBJ_ZSET, lp); o->encoding = OBJ_ENCODING_LISTPACK; return o; } robj *createStreamObject(void) { stream *s = streamNew(); robj *o = createObject(OBJ_STREAM, s); o->encoding = OBJ_ENCODING_STREAM; return o; } robj *createModuleObject(moduleType *mt, void *value) { moduleValue *mv = zmalloc(sizeof(*mv)); mv->type = mt; mv->value = value; return createObject(OBJ_MODULE, mv); } void freeStringObject(robj *o) { if (o->encoding == OBJ_ENCODING_RAW) { sdsfree(o->ptr); } } void freeListObject(robj *o) { if (o->encoding == OBJ_ENCODING_QUICKLIST) { quicklistRelease(o->ptr); } else if (o->encoding == OBJ_ENCODING_LISTPACK) { lpFree(o->ptr); } else { serverPanic("Unknown list encoding type"); } } void freeSetObject(robj *o) { switch 
(o->encoding) { case OBJ_ENCODING_HT: dictRelease((dict *)o->ptr); break; case OBJ_ENCODING_INTSET: case OBJ_ENCODING_LISTPACK: zfree(o->ptr); break; default: serverPanic("Unknown set encoding type"); } } void freeZsetObject(robj *o) { zset *zs; switch (o->encoding) { case OBJ_ENCODING_SKIPLIST: zs = o->ptr; dictRelease(zs->dict); zslFree(zs->zsl); zfree(zs); break; case OBJ_ENCODING_LISTPACK: zfree(o->ptr); break; default: serverPanic("Unknown sorted set encoding"); } } void freeHashObject(robj *o) { switch (o->encoding) { case OBJ_ENCODING_HT: dictRelease((dict *)o->ptr); break; case OBJ_ENCODING_LISTPACK: lpFree(o->ptr); break; default: serverPanic("Unknown hash encoding type"); break; } } void freeModuleObject(robj *o) { moduleValue *mv = o->ptr; mv->type->free(mv->value); zfree(mv); } void freeStreamObject(robj *o) { freeStream(o->ptr); } void incrRefCount(robj *o) { if (o->refcount < OBJ_FIRST_SPECIAL_REFCOUNT) { o->refcount++; } else { if (o->refcount == OBJ_SHARED_REFCOUNT) { /* Nothing to do: this refcount is immutable. */ } else if (o->refcount == OBJ_STATIC_REFCOUNT) { serverPanic("You tried to retain an object allocated in the stack"); } } } void decrRefCount(robj *o) { if (o->refcount == 1) { switch (o->type) { case OBJ_STRING: freeStringObject(o); break; case OBJ_LIST: freeListObject(o); break; case OBJ_SET: freeSetObject(o); break; case OBJ_ZSET: freeZsetObject(o); break; case OBJ_HASH: freeHashObject(o); break; case OBJ_MODULE: freeModuleObject(o); break; case OBJ_STREAM: freeStreamObject(o); break; default: serverPanic("Unknown object type"); break; } zfree(o); } else { if (o->refcount <= 0) serverPanic("decrRefCount against refcount <= 0"); if (o->refcount != OBJ_SHARED_REFCOUNT) o->refcount--; } } /* See dismissObject() */ void dismissSds(sds s) { dismissMemory(sdsAllocPtr(s), sdsAllocSize(s)); } /* See dismissObject() */ void dismissStringObject(robj *o) { if (o->encoding == OBJ_ENCODING_RAW) { dismissSds(o->ptr); } } /* See dismissObject() */ void dismissListObject(robj *o, size_t size_hint) { if (o->encoding == OBJ_ENCODING_QUICKLIST) { quicklist *ql = o->ptr; serverAssert(ql->len != 0); /* We iterate all nodes only when average node size is bigger than a * page size, and there's a high chance we'll actually dismiss something. */ if (size_hint / ql->len >= server.page_size) { quicklistNode *node = ql->head; while (node) { if (quicklistNodeIsCompressed(node)) { dismissMemory(node->entry, ((quicklistLZF *)node->entry)->sz); } else { dismissMemory(node->entry, node->sz); } node = node->next; } } } else if (o->encoding == OBJ_ENCODING_LISTPACK) { dismissMemory(o->ptr, lpBytes((unsigned char *)o->ptr)); } else { serverPanic("Unknown list encoding type"); } } /* See dismissObject() */ void dismissSetObject(robj *o, size_t size_hint) { if (o->encoding == OBJ_ENCODING_HT) { dict *set = o->ptr; serverAssert(dictSize(set) != 0); /* We iterate all nodes only when average member size is bigger than a * page size, and there's a high chance we'll actually dismiss something. */ if (size_hint / dictSize(set) >= server.page_size) { dictEntry *de; dictIterator *di = dictGetIterator(set); while ((de = dictNext(di)) != NULL) { dismissSds(dictGetKey(de)); } dictReleaseIterator(di); } /* Dismiss hash table memory. 
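 * Both the main and the rehash tables are passed on purpose: when the dict is
 * not rehashing, ht_table[1] is expected to be NULL with a zero DICTHT_SIZE(),
 * so the second call is assumed to be a no-op (editor's note).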
*/
        dismissMemory(set->ht_table[0], DICTHT_SIZE(set->ht_size_exp[0]) * sizeof(dictEntry *));
        dismissMemory(set->ht_table[1], DICTHT_SIZE(set->ht_size_exp[1]) * sizeof(dictEntry *));
    } else if (o->encoding == OBJ_ENCODING_INTSET) {
        dismissMemory(o->ptr, intsetBlobLen((intset *)o->ptr));
    } else if (o->encoding == OBJ_ENCODING_LISTPACK) {
        dismissMemory(o->ptr, lpBytes((unsigned char *)o->ptr));
    } else {
        serverPanic("Unknown set encoding type");
    }
}

/* See dismissObject() */
void dismissZsetObject(robj *o, size_t size_hint) {
    if (o->encoding == OBJ_ENCODING_SKIPLIST) {
        zset *zs = o->ptr;
        zskiplist *zsl = zs->zsl;
        serverAssert(zsl->length != 0);
        /* We iterate all nodes only when average member size is bigger than a
         * page size, and there's a high chance we'll actually dismiss something. */
        if (size_hint / zsl->length >= server.page_size) {
            zskiplistNode *zn = zsl->tail;
            while (zn != NULL) {
                dismissSds(zn->ele);
                zn = zn->backward;
            }
        }

        /* Dismiss hash table memory. */
        dict *d = zs->dict;
        dismissMemory(d->ht_table[0], DICTHT_SIZE(d->ht_size_exp[0]) * sizeof(dictEntry *));
        dismissMemory(d->ht_table[1], DICTHT_SIZE(d->ht_size_exp[1]) * sizeof(dictEntry *));
    } else if (o->encoding == OBJ_ENCODING_LISTPACK) {
        dismissMemory(o->ptr, lpBytes((unsigned char *)o->ptr));
    } else {
        serverPanic("Unknown zset encoding type");
    }
}

/* See dismissObject() */
void dismissHashObject(robj *o, size_t size_hint) {
    if (o->encoding == OBJ_ENCODING_HT) {
        dict *d = o->ptr;
        serverAssert(dictSize(d) != 0);
        /* We iterate all fields only when average field/value size is bigger than
         * a page size, and there's a high chance we'll actually dismiss something. */
        if (size_hint / dictSize(d) >= server.page_size) {
            dictEntry *de;
            dictIterator *di = dictGetIterator(d);
            while ((de = dictNext(di)) != NULL) {
                /* Only dismiss values memory since the field size
                 * usually is small. */
                dismissSds(dictGetVal(de));
            }
            dictReleaseIterator(di);
        }

        /* Dismiss hash table memory. */
        dismissMemory(d->ht_table[0], DICTHT_SIZE(d->ht_size_exp[0]) * sizeof(dictEntry *));
        dismissMemory(d->ht_table[1], DICTHT_SIZE(d->ht_size_exp[1]) * sizeof(dictEntry *));
    } else if (o->encoding == OBJ_ENCODING_LISTPACK) {
        dismissMemory(o->ptr, lpBytes((unsigned char *)o->ptr));
    } else {
        serverPanic("Unknown hash encoding type");
    }
}

/* See dismissObject() */
void dismissStreamObject(robj *o, size_t size_hint) {
    stream *s = o->ptr;
    rax *rax = s->rax;
    if (raxSize(rax) == 0) return;

    /* Iterate only over stream entries; size_hint may also include serialized
     * consumer group info, but stream entries usually take up most of the space. */
    if (size_hint / raxSize(rax) >= server.page_size) {
        raxIterator ri;
        raxStart(&ri, rax);
        raxSeek(&ri, "^", NULL, 0);
        while (raxNext(&ri)) {
            dismissMemory(ri.data, lpBytes(ri.data));
        }
        raxStop(&ri);
    }
}

/* When creating a snapshot in a fork child process, the main process and child
 * process share the same physical memory pages, and if / when the parent
 * modifies any keys due to write traffic, it'll cause CoW, which consumes
 * physical memory. In the child process, after serializing the key and value,
 * the data is definitely not accessed again, so to avoid unnecessary CoW, we
 * try to release their memory back to the OS. See dismissMemory().
 *
 * Because of the cost of iterating all nodes/fields/members/entries of complex
 * data types, we iterate and dismiss them only when the estimated average size
 * of an individual allocation is more than a page size of the OS.
 * 'size_hint' is the size of the serialized value.
This method is not accurate, but * it can reduce unnecessary iteration for complex data types that are probably * not going to release any memory. */ void dismissObject(robj *o, size_t size_hint) { /* madvise(MADV_DONTNEED) may not work if Transparent Huge Pages is enabled. */ if (server.thp_enabled) return; /* Currently we use zmadvise_dontneed only when we use jemalloc with Linux. * so we avoid these pointless loops when they're not going to do anything. */ #if defined(USE_JEMALLOC) && defined(__linux__) if (o->refcount != 1) return; switch (o->type) { case OBJ_STRING: dismissStringObject(o); break; case OBJ_LIST: dismissListObject(o, size_hint); break; case OBJ_SET: dismissSetObject(o, size_hint); break; case OBJ_ZSET: dismissZsetObject(o, size_hint); break; case OBJ_HASH: dismissHashObject(o, size_hint); break; case OBJ_STREAM: dismissStreamObject(o, size_hint); break; default: break; } #else UNUSED(o); UNUSED(size_hint); #endif } /* This variant of decrRefCount() gets its argument as void, and is useful * as free method in data structures that expect a 'void free_object(void*)' * prototype for the free method. */ void decrRefCountVoid(void *o) { decrRefCount(o); } int checkType(client *c, robj *o, int type) { /* A NULL is considered an empty key */ if (o && o->type != type) { addReplyErrorObject(c, shared.wrongtypeerr); return 1; } return 0; } int isSdsRepresentableAsLongLong(sds s, long long *llval) { return string2ll(s, sdslen(s), llval) ? C_OK : C_ERR; } int isObjectRepresentableAsLongLong(robj *o, long long *llval) { serverAssertWithInfo(NULL, o, o->type == OBJ_STRING); if (o->encoding == OBJ_ENCODING_INT) { if (llval) *llval = (long)o->ptr; return C_OK; } else { return isSdsRepresentableAsLongLong(o->ptr, llval); } } /* Optimize the SDS string inside the string object to require little space, * in case there is more than 10% of free space at the end of the SDS. */ void trimStringObjectIfNeeded(robj *o, int trim_small_values) { if (o->encoding != OBJ_ENCODING_RAW) return; /* A string may have free space in the following cases: * 1. When an arg len is greater than PROTO_MBULK_BIG_ARG the query buffer may be used directly as the SDS string. * 2. When utilizing the argument caching mechanism in Lua. * 3. When calling from RM_TrimStringAllocation (trim_small_values is true). */ size_t len = sdslen(o->ptr); if (len >= PROTO_MBULK_BIG_ARG || trim_small_values || (server.executing_client && server.executing_client->flag.script && len < LUA_CMD_OBJCACHE_MAX_LEN)) { if (sdsavail(o->ptr) > len / 10) { o->ptr = sdsRemoveFreeSpace(o->ptr, 0); } } } /* Try to encode a string object in order to save space */ robj *tryObjectEncodingEx(robj *o, int try_trim) { long value; sds s = o->ptr; size_t len; /* Make sure this is a string object, the only type we encode * in this function. Other types use encoded memory efficient * representations but are handled by the commands implementing * the type. */ serverAssertWithInfo(NULL, o, o->type == OBJ_STRING); /* We try some specialized encoding only for objects that are * RAW or EMBSTR encoded, in other words objects that are still * in represented by an actually array of chars. */ if (!sdsEncodedObject(o)) return o; /* It's not safe to encode shared objects: shared objects can be shared * everywhere in the "object space" of the server and may end in places where * they are not handled. We handle them only as values in the keyspace. */ if (o->refcount > 1) return o; /* Check if we can represent this string as a long integer. 
* Note that we are sure that a string larger than 20 chars is not * representable as a 32 nor 64 bit integer. */ len = sdslen(s); if (len <= 20 && string2l(s, len, &value)) { /* This object is encodable as a long. Try to use a shared object. * Note that we avoid using shared integers when maxmemory is used * because every object needs to have a private LRU field for the LRU * algorithm to work well. */ if (canUseSharedObject() && value >= 0 && value < OBJ_SHARED_INTEGERS) { decrRefCount(o); return shared.integers[value]; } else { if (o->encoding == OBJ_ENCODING_RAW) { sdsfree(o->ptr); o->encoding = OBJ_ENCODING_INT; o->ptr = (void *)value; return o; } else if (o->encoding == OBJ_ENCODING_EMBSTR) { decrRefCount(o); return createStringObjectFromLongLongForValue(value); } } } /* If the string is small and is still RAW encoded, * try the EMBSTR encoding which is more efficient. * In this representation the object and the SDS string are allocated * in the same chunk of memory to save space and cache misses. */ if (len <= OBJ_ENCODING_EMBSTR_SIZE_LIMIT) { robj *emb; if (o->encoding == OBJ_ENCODING_EMBSTR) return o; emb = createEmbeddedStringObject(s, sdslen(s)); decrRefCount(o); return emb; } /* We can't encode the object... * Do the last try, and at least optimize the SDS string inside */ if (try_trim) trimStringObjectIfNeeded(o, 0); /* Return the original object. */ return o; } robj *tryObjectEncoding(robj *o) { return tryObjectEncodingEx(o, 1); } /* Get a decoded version of an encoded object (returned as a new object). * If the object is already raw-encoded just increment the ref count. */ robj *getDecodedObject(robj *o) { robj *dec; if (sdsEncodedObject(o)) { incrRefCount(o); return o; } if (o->type == OBJ_STRING && o->encoding == OBJ_ENCODING_INT) { char buf[32]; ll2string(buf, 32, (long)o->ptr); dec = createStringObject(buf, strlen(buf)); return dec; } else { serverPanic("Unknown encoding type"); } } /* Compare two string objects via strcmp() or strcoll() depending on flags. * Note that the objects may be integer-encoded. In such a case we * use ll2string() to get a string representation of the numbers on the stack * and compare the strings, it's much faster than calling getDecodedObject(). * * Important note: when STRING_COMPARE_BINARY is used a binary-safe comparison * is used. */ #define STRING_COMPARE_BINARY (1 << 0) #define STRING_COMPARE_COLL (1 << 1) int compareStringObjectsWithFlags(const robj *a, const robj *b, int flags) { serverAssertWithInfo(NULL, a, a->type == OBJ_STRING && b->type == OBJ_STRING); char bufa[128], bufb[128], *astr, *bstr; size_t alen, blen, minlen; if (a == b) return 0; if (sdsEncodedObject(a)) { astr = a->ptr; alen = sdslen(astr); } else { alen = ll2string(bufa, sizeof(bufa), (long)a->ptr); astr = bufa; } if (sdsEncodedObject(b)) { bstr = b->ptr; blen = sdslen(bstr); } else { blen = ll2string(bufb, sizeof(bufb), (long)b->ptr); bstr = bufb; } if (flags & STRING_COMPARE_COLL) { return strcoll(astr, bstr); } else { int cmp; minlen = (alen < blen) ? alen : blen; cmp = memcmp(astr, bstr, minlen); if (cmp == 0) return alen - blen; return cmp; } } /* Wrapper for compareStringObjectsWithFlags() using binary comparison. */ int compareStringObjects(const robj *a, const robj *b) { return compareStringObjectsWithFlags(a, b, STRING_COMPARE_BINARY); } /* Wrapper for compareStringObjectsWithFlags() using collation. 
*/ int collateStringObjects(const robj *a, const robj *b) { return compareStringObjectsWithFlags(a, b, STRING_COMPARE_COLL); } /* Equal string objects return 1 if the two objects are the same from the * point of view of a string comparison, otherwise 0 is returned. Note that * this function is faster then checking for (compareStringObject(a,b) == 0) * because it can perform some more optimization. */ int equalStringObjects(robj *a, robj *b) { if (a->encoding == OBJ_ENCODING_INT && b->encoding == OBJ_ENCODING_INT) { /* If both strings are integer encoded just check if the stored * long is the same. */ return a->ptr == b->ptr; } else { return compareStringObjects(a, b) == 0; } } size_t stringObjectLen(robj *o) { serverAssertWithInfo(NULL, o, o->type == OBJ_STRING); if (sdsEncodedObject(o)) { return sdslen(o->ptr); } else { return sdigits10((long)o->ptr); } } int getDoubleFromObject(const robj *o, double *target) { double value; if (o == NULL) { value = 0; } else { serverAssertWithInfo(NULL, o, o->type == OBJ_STRING); if (sdsEncodedObject(o)) { if (!string2d(o->ptr, sdslen(o->ptr), &value)) return C_ERR; } else if (o->encoding == OBJ_ENCODING_INT) { value = (long)o->ptr; } else { serverPanic("Unknown string encoding"); } } *target = value; return C_OK; } int getDoubleFromObjectOrReply(client *c, robj *o, double *target, const char *msg) { double value; if (getDoubleFromObject(o, &value) != C_OK) { if (msg != NULL) { addReplyError(c, (char *)msg); } else { addReplyError(c, "value is not a valid float"); } return C_ERR; } *target = value; return C_OK; } int getLongDoubleFromObject(robj *o, long double *target) { long double value; if (o == NULL) { value = 0; } else { serverAssertWithInfo(NULL, o, o->type == OBJ_STRING); if (sdsEncodedObject(o)) { if (!string2ld(o->ptr, sdslen(o->ptr), &value)) return C_ERR; } else if (o->encoding == OBJ_ENCODING_INT) { value = (long)o->ptr; } else { serverPanic("Unknown string encoding"); } } *target = value; return C_OK; } int getLongDoubleFromObjectOrReply(client *c, robj *o, long double *target, const char *msg) { long double value; if (getLongDoubleFromObject(o, &value) != C_OK) { if (msg != NULL) { addReplyError(c, (char *)msg); } else { addReplyError(c, "value is not a valid float"); } return C_ERR; } *target = value; return C_OK; } int getLongLongFromObject(robj *o, long long *target) { long long value; if (o == NULL) { value = 0; } else { serverAssertWithInfo(NULL, o, o->type == OBJ_STRING); if (sdsEncodedObject(o)) { if (string2ll(o->ptr, sdslen(o->ptr), &value) == 0) return C_ERR; } else if (o->encoding == OBJ_ENCODING_INT) { value = (long)o->ptr; } else { serverPanic("Unknown string encoding"); } } if (target) *target = value; return C_OK; } int getLongLongFromObjectOrReply(client *c, robj *o, long long *target, const char *msg) { long long value; if (getLongLongFromObject(o, &value) != C_OK) { if (msg != NULL) { addReplyError(c, (char *)msg); } else { addReplyError(c, "value is not an integer or out of range"); } return C_ERR; } *target = value; return C_OK; } int getLongFromObjectOrReply(client *c, robj *o, long *target, const char *msg) { long long value; if (getLongLongFromObjectOrReply(c, o, &value, msg) != C_OK) return C_ERR; if (value < LONG_MIN || value > LONG_MAX) { if (msg != NULL) { addReplyError(c, (char *)msg); } else { addReplyError(c, "value is out of range"); } return C_ERR; } *target = value; return C_OK; } int getRangeLongFromObjectOrReply(client *c, robj *o, long min, long max, long *target, const char *msg) { if 
(getLongFromObjectOrReply(c, o, target, msg) != C_OK) return C_ERR;
    if (*target < min || *target > max) {
        if (msg != NULL) {
            addReplyError(c, (char *)msg);
        } else {
            addReplyErrorFormat(c, "value is out of range, value must be between %ld and %ld", min, max);
        }
        return C_ERR;
    }
    return C_OK;
}

int getPositiveLongFromObjectOrReply(client *c, robj *o, long *target, const char *msg) {
    if (msg) {
        return getRangeLongFromObjectOrReply(c, o, 0, LONG_MAX, target, msg);
    } else {
        return getRangeLongFromObjectOrReply(c, o, 0, LONG_MAX, target, "value is out of range, must be positive");
    }
}

int getIntFromObjectOrReply(client *c, robj *o, int *target, const char *msg) {
    long value;

    if (getRangeLongFromObjectOrReply(c, o, INT_MIN, INT_MAX, &value, msg) != C_OK) return C_ERR;
    *target = value;
    return C_OK;
}

char *strEncoding(int encoding) {
    switch (encoding) {
    case OBJ_ENCODING_RAW: return "raw";
    case OBJ_ENCODING_INT: return "int";
    case OBJ_ENCODING_HT: return "hashtable";
    case OBJ_ENCODING_QUICKLIST: return "quicklist";
    case OBJ_ENCODING_LISTPACK: return "listpack";
    case OBJ_ENCODING_INTSET: return "intset";
    case OBJ_ENCODING_SKIPLIST: return "skiplist";
    case OBJ_ENCODING_EMBSTR: return "embstr";
    case OBJ_ENCODING_STREAM: return "stream";
    default: return "unknown";
    }
}

/* =========================== Memory introspection ========================= */

/* This is a helper function with the goal of estimating the memory
 * size of a radix tree that is used to store Stream IDs.
 *
 * Note: to guess the size of the radix tree is not trivial, so we
 * approximate it considering 16 bytes of data overhead for each
 * key (the ID), and then adding the number of bare nodes, plus some
 * overhead due to the data and child pointers. This secret recipe
 * was obtained by checking the average radix tree created by real
 * workloads, and then adjusting the constants to get numbers that
 * more or less match the real memory usage.
 *
 * Actually the number of nodes and keys may be different depending
 * on the insertion speed and thus the ability of the radix tree
 * to compress prefixes. */
size_t streamRadixTreeMemoryUsage(rax *rax) {
    size_t size = sizeof(*rax);
    size = rax->numele * sizeof(streamID);
    size += rax->numnodes * sizeof(raxNode);
    /* Add a fixed overhead due to the aux data pointer, children, ... */
    size += rax->numnodes * sizeof(long) * 30;
    return size;
}

/* Returns the size in bytes consumed by the key's value in RAM.
 * Note that the returned value is just an approximation, especially in the
 * case of aggregated data types where only "sample_size" elements
 * are checked and averaged to estimate the total size. */
#define OBJ_COMPUTE_SIZE_DEF_SAMPLES 5 /* Default sample size.
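 * For aggregate types, objectComputeSize() below averages the footprint of up
 * to this many sampled elements and extrapolates to the whole collection.
 * (Editor's illustration) If 5 sampled entries of a 10,000 element hash
 * account for 400 bytes, the elements are estimated at 400 / 5 * 10000 =
 * 800,000 bytes, on top of the fixed robj and dict overhead.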
*/ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { sds ele, ele2; dict *d; dictIterator *di; struct dictEntry *de; size_t asize = 0, elesize = 0, samples = 0; if (o->type == OBJ_STRING) { if (o->encoding == OBJ_ENCODING_INT) { asize = sizeof(*o); } else if (o->encoding == OBJ_ENCODING_RAW) { asize = sdsZmallocSize(o->ptr) + sizeof(*o); } else if (o->encoding == OBJ_ENCODING_EMBSTR) { asize = zmalloc_size((void *)o); } else { serverPanic("Unknown string encoding"); } } else if (o->type == OBJ_LIST) { if (o->encoding == OBJ_ENCODING_QUICKLIST) { quicklist *ql = o->ptr; quicklistNode *node = ql->head; asize = sizeof(*o) + sizeof(quicklist); do { elesize += sizeof(quicklistNode) + zmalloc_size(node->entry); samples++; } while ((node = node->next) && samples < sample_size); asize += (double)elesize / samples * ql->len; } else if (o->encoding == OBJ_ENCODING_LISTPACK) { asize = sizeof(*o) + zmalloc_size(o->ptr); } else { serverPanic("Unknown list encoding"); } } else if (o->type == OBJ_SET) { if (o->encoding == OBJ_ENCODING_HT) { d = o->ptr; di = dictGetIterator(d); asize = sizeof(*o) + sizeof(dict) + (sizeof(struct dictEntry *) * dictBuckets(d)); while ((de = dictNext(di)) != NULL && samples < sample_size) { ele = dictGetKey(de); elesize += dictEntryMemUsage() + sdsZmallocSize(ele); samples++; } dictReleaseIterator(di); if (samples) asize += (double)elesize / samples * dictSize(d); } else if (o->encoding == OBJ_ENCODING_INTSET) { asize = sizeof(*o) + zmalloc_size(o->ptr); } else if (o->encoding == OBJ_ENCODING_LISTPACK) { asize = sizeof(*o) + zmalloc_size(o->ptr); } else { serverPanic("Unknown set encoding"); } } else if (o->type == OBJ_ZSET) { if (o->encoding == OBJ_ENCODING_LISTPACK) { asize = sizeof(*o) + zmalloc_size(o->ptr); } else if (o->encoding == OBJ_ENCODING_SKIPLIST) { d = ((zset *)o->ptr)->dict; zskiplist *zsl = ((zset *)o->ptr)->zsl; zskiplistNode *znode = zsl->header->level[0].forward; asize = sizeof(*o) + sizeof(zset) + sizeof(zskiplist) + sizeof(dict) + (sizeof(struct dictEntry *) * dictBuckets(d)) + zmalloc_size(zsl->header); while (znode != NULL && samples < sample_size) { elesize += sdsZmallocSize(znode->ele); elesize += dictEntryMemUsage() + zmalloc_size(znode); samples++; znode = znode->level[0].forward; } if (samples) asize += (double)elesize / samples * dictSize(d); } else { serverPanic("Unknown sorted set encoding"); } } else if (o->type == OBJ_HASH) { if (o->encoding == OBJ_ENCODING_LISTPACK) { asize = sizeof(*o) + zmalloc_size(o->ptr); } else if (o->encoding == OBJ_ENCODING_HT) { d = o->ptr; di = dictGetIterator(d); asize = sizeof(*o) + sizeof(dict) + (sizeof(struct dictEntry *) * dictBuckets(d)); while ((de = dictNext(di)) != NULL && samples < sample_size) { ele = dictGetKey(de); ele2 = dictGetVal(de); elesize += sdsZmallocSize(ele) + sdsZmallocSize(ele2); elesize += dictEntryMemUsage(); samples++; } dictReleaseIterator(di); if (samples) asize += (double)elesize / samples * dictSize(d); } else { serverPanic("Unknown hash encoding"); } } else if (o->type == OBJ_STREAM) { stream *s = o->ptr; asize = sizeof(*o) + sizeof(*s); asize += streamRadixTreeMemoryUsage(s->rax); /* Now we have to add the listpacks. The last listpack is often non * complete, so we estimate the size of the first N listpacks, and * use the average to compute the size of the first N-1 listpacks, and * finally add the real size of the last node. 
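     *
     * (Editor's illustration) With sample_size 5 and a stream whose rax holds
     * 100 listpack nodes, the allocated sizes of the first 5 nodes are
     * averaged, that average is charged for the first 99 nodes, and the tail
     * node, the one most likely to be only partially filled, is then added at
     * its real allocated size.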
*/ raxIterator ri; raxStart(&ri, s->rax); raxSeek(&ri, "^", NULL, 0); size_t lpsize = 0, samples = 0; while (samples < sample_size && raxNext(&ri)) { unsigned char *lp = ri.data; /* Use the allocated size, since we overprovision the node initially. */ lpsize += zmalloc_size(lp); samples++; } if (s->rax->numele <= samples) { asize += lpsize; } else { if (samples) lpsize /= samples; /* Compute the average. */ asize += lpsize * (s->rax->numele - 1); /* No need to check if seek succeeded, we enter this branch only * if there are a few elements in the radix tree. */ raxSeek(&ri, "$", NULL, 0); raxNext(&ri); /* Use the allocated size, since we overprovision the node initially. */ asize += zmalloc_size(ri.data); } raxStop(&ri); /* Consumer groups also have a non trivial memory overhead if there * are many consumers and many groups, let's count at least the * overhead of the pending entries in the groups and consumers * PELs. */ if (s->cgroups) { raxStart(&ri, s->cgroups); raxSeek(&ri, "^", NULL, 0); while (raxNext(&ri)) { streamCG *cg = ri.data; asize += sizeof(*cg); asize += streamRadixTreeMemoryUsage(cg->pel); asize += sizeof(streamNACK) * raxSize(cg->pel); /* For each consumer we also need to add the basic data * structures and the PEL memory usage. */ raxIterator cri; raxStart(&cri, cg->consumers); raxSeek(&cri, "^", NULL, 0); while (raxNext(&cri)) { streamConsumer *consumer = cri.data; asize += sizeof(*consumer); asize += sdslen(consumer->name); asize += streamRadixTreeMemoryUsage(consumer->pel); /* Don't count NACKs again, they are shared with the * consumer group PEL. */ } raxStop(&cri); } raxStop(&ri); } } else if (o->type == OBJ_MODULE) { asize = moduleGetMemUsage(key, o, sample_size, dbid); } else { serverPanic("Unknown object type"); } return asize; } /* Release data obtained with getMemoryOverheadData(). */ void freeMemoryOverheadData(struct serverMemOverhead *mh) { zfree(mh->db); zfree(mh); } /* Return a struct serverMemOverhead filled with memory overhead * information used for the MEMORY OVERHEAD and INFO command. The returned * structure pointer should be freed calling freeMemoryOverheadData(). 
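 *
 * Typical call pattern (editor's illustration):
 *
 *     struct serverMemOverhead *mh = getMemoryOverheadData();
 *     ... read mh->total_allocated, mh->dataset, mh->db[i].overhead_ht_main ...
 *     freeMemoryOverheadData(mh);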
*/ struct serverMemOverhead *getMemoryOverheadData(void) { int j; size_t mem_total = 0; size_t mem = 0; size_t zmalloc_used = zmalloc_used_memory(); struct serverMemOverhead *mh = zcalloc(sizeof(*mh)); mh->total_allocated = zmalloc_used; mh->startup_allocated = server.initial_memory_usage; mh->peak_allocated = server.stat_peak_memory; mh->total_frag = (float)server.cron_malloc_stats.process_rss / server.cron_malloc_stats.zmalloc_used; mh->total_frag_bytes = server.cron_malloc_stats.process_rss - server.cron_malloc_stats.zmalloc_used; mh->allocator_frag = (float)server.cron_malloc_stats.allocator_frag_smallbins_bytes / server.cron_malloc_stats.allocator_allocated + 1; mh->allocator_frag_bytes = server.cron_malloc_stats.allocator_frag_smallbins_bytes; mh->allocator_rss = (float)server.cron_malloc_stats.allocator_resident / server.cron_malloc_stats.allocator_active; mh->allocator_rss_bytes = server.cron_malloc_stats.allocator_resident - server.cron_malloc_stats.allocator_active; mh->rss_extra = (float)server.cron_malloc_stats.process_rss / server.cron_malloc_stats.allocator_resident; mh->rss_extra_bytes = server.cron_malloc_stats.process_rss - server.cron_malloc_stats.allocator_resident; mem_total += server.initial_memory_usage; /* Replication backlog and replicas share one global replication buffer, * only if replication buffer memory is more than the repl backlog setting, * we consider the excess as replicas' memory. Otherwise, replication buffer * memory is the consumption of repl backlog. */ if (listLength(server.replicas) && (long long)server.repl_buffer_mem > server.repl_backlog_size) { mh->clients_replicas = server.repl_buffer_mem - server.repl_backlog_size; mh->repl_backlog = server.repl_backlog_size; } else { mh->clients_replicas = 0; mh->repl_backlog = server.repl_buffer_mem; } if (server.repl_backlog) { /* The approximate memory of rax tree for indexed blocks. */ mh->repl_backlog += server.repl_backlog->blocks_index->numnodes * sizeof(raxNode) + raxSize(server.repl_backlog->blocks_index) * sizeof(void *); } mem_total += mh->repl_backlog; mem_total += mh->clients_replicas; /* Computing the memory used by the clients would be O(N) if done * here online. We use our values computed incrementally by * updateClientMemoryUsage(). 
*/ mh->clients_normal = server.stat_clients_type_memory[CLIENT_TYPE_PRIMARY] + server.stat_clients_type_memory[CLIENT_TYPE_PUBSUB] + server.stat_clients_type_memory[CLIENT_TYPE_NORMAL]; mem_total += mh->clients_normal; mh->cluster_links = server.stat_cluster_links_memory; mem_total += mh->cluster_links; mem = 0; if (server.aof_state != AOF_OFF) { mem += sdsZmallocSize(server.aof_buf); } mh->aof_buffer = mem; mem_total += mem; mem = evalScriptsMemory(); mh->lua_caches = mem; mem_total += mem; mh->functions_caches = functionsMemoryOverhead(); mem_total += mh->functions_caches; for (j = 0; j < server.dbnum; j++) { serverDb *db = server.db + j; if (!kvstoreNumAllocatedDicts(db->keys)) continue; unsigned long long keyscount = kvstoreSize(db->keys); mh->total_keys += keyscount; mh->db = zrealloc(mh->db, sizeof(mh->db[0]) * (mh->num_dbs + 1)); mh->db[mh->num_dbs].dbid = j; mem = kvstoreMemUsage(db->keys) + keyscount * sizeof(robj); mh->db[mh->num_dbs].overhead_ht_main = mem; mem_total += mem; mem = kvstoreMemUsage(db->expires); mh->db[mh->num_dbs].overhead_ht_expires = mem; mem_total += mem; mh->num_dbs++; mh->overhead_db_hashtable_lut += kvstoreOverheadHashtableLut(db->keys); mh->overhead_db_hashtable_lut += kvstoreOverheadHashtableLut(db->expires); mh->overhead_db_hashtable_rehashing += kvstoreOverheadHashtableRehashing(db->keys); mh->overhead_db_hashtable_rehashing += kvstoreOverheadHashtableRehashing(db->expires); mh->db_dict_rehashing_count += kvstoreDictRehashingCount(db->keys); mh->db_dict_rehashing_count += kvstoreDictRehashingCount(db->expires); } mh->overhead_total = mem_total; mh->dataset = zmalloc_used - mem_total; mh->peak_perc = (float)zmalloc_used * 100 / mh->peak_allocated; /* Metrics computed after subtracting the startup memory from * the total memory. */ size_t net_usage = 1; if (zmalloc_used > mh->startup_allocated) net_usage = zmalloc_used - mh->startup_allocated; mh->dataset_perc = (float)mh->dataset * 100 / net_usage; mh->bytes_per_key = mh->total_keys ? (mh->dataset / mh->total_keys) : 0; return mh; } /* Helper for "MEMORY allocator-stats", used as a callback for the jemalloc * stats output. */ void inputCatSds(void *result, const char *str) { /* result is actually a (sds *), so re-cast it here */ sds *info = (sds *)result; *info = sdscat(*info, str); } /* This implements MEMORY DOCTOR. An human readable analysis of the server * memory condition. */ sds getMemoryDoctorReport(void) { int empty = 0; /* Instance is empty or almost empty. */ int big_peak = 0; /* Memory peak is much larger than used mem. */ int high_frag = 0; /* High fragmentation. */ int high_alloc_frag = 0; /* High allocator fragmentation. */ int high_proc_rss = 0; /* High process rss overhead. */ int high_alloc_rss = 0; /* High rss overhead. */ int big_replica_buf = 0; /* Replica buffers are too big. */ int big_client_buf = 0; /* Client buffers are too big. */ int many_scripts = 0; /* Script cache has too many scripts. */ int num_reports = 0; struct serverMemOverhead *mh = getMemoryOverheadData(); if (mh->total_allocated < (1024 * 1024 * 5)) { empty = 1; num_reports++; } else { /* Peak is > 150% of current used memory? */ if (((float)mh->peak_allocated / mh->total_allocated) > 1.5) { big_peak = 1; num_reports++; } /* Fragmentation is higher than 1.4 and 10MB ?*/ if (mh->total_frag > 1.4 && mh->total_frag_bytes > 10 << 20) { high_frag = 1; num_reports++; } /* External fragmentation is higher than 1.1 and 10MB? 
*/
        if (mh->allocator_frag > 1.1 && mh->allocator_frag_bytes > 10 << 20) {
            high_alloc_frag = 1;
            num_reports++;
        }
        /* Allocator rss is higher than 1.1 and 10MB ? */
        if (mh->allocator_rss > 1.1 && mh->allocator_rss_bytes > 10 << 20) {
            high_alloc_rss = 1;
            num_reports++;
        }
        /* Non-Allocator rss is higher than 1.1 and 10MB ? */
        if (mh->rss_extra > 1.1 && mh->rss_extra_bytes > 10 << 20) {
            high_proc_rss = 1;
            num_reports++;
        }
        /* Clients using more than 200k each on average? */
        long num_replicas = listLength(server.replicas);
        long numclients = listLength(server.clients) - num_replicas;
        if (mh->clients_normal / numclients > (1024 * 200)) {
            big_client_buf = 1;
            num_reports++;
        }
        /* Replicas using more than 10 MB each? */
        if (num_replicas > 0 && mh->clients_replicas > (1024 * 1024 * 10)) {
            big_replica_buf = 1;
            num_reports++;
        }
        /* Too many scripts are cached? */
        if (dictSize(evalScriptsDict()) > 1000) {
            many_scripts = 1;
            num_reports++;
        }
    }

    sds s;
    if (num_reports == 0) {
        s = sdsnew("Hi Sam, I can't find any memory issue in your instance. "
                   "I can only account for what occurs on this base.\n");
    } else if (empty == 1) {
        s = sdsnew("Hi Sam, this instance is empty or is using very little memory, "
                   "my issues detector can't be used in these conditions. "
                   "Please, leave for your mission on Earth and fill it with some data. "
                   "The new Sam and I will be back to our programming as soon as I "
                   "finished rebooting.\n");
    } else {
        s = sdsnew("Sam, I detected a few issues in this Valkey instance memory implants:\n\n");

        if (big_peak) {
            s = sdscat(s, " * Peak memory: In the past this instance used more than 150% of the memory that it is "
                          "currently using. The allocator is normally not able to release memory after a peak, so "
                          "you can expect to see a big fragmentation ratio; however, this is actually harmless and "
                          "is only due to the memory peak, and if the Valkey instance Resident Set Size (RSS) is "
                          "currently bigger than expected, the memory will be used as soon as you fill the Valkey "
                          "instance with more data. If the memory peak was only occasional and you want to try to "
                          "reclaim memory, please try the MEMORY PURGE command, otherwise the only other option is "
                          "to shut down and restart the instance.\n\n");
        }
        if (high_frag) {
            s = sdscatprintf(
                s,
                " * High total RSS: This instance has a memory fragmentation and RSS overhead greater than 1.4 (this "
                "means that the Resident Set Size of the Valkey process is much larger than the sum of the logical "
                "allocations Valkey performed). This problem is usually due either to a large peak memory (check if "
                "there is a peak memory entry above in the report) or may result from a workload that causes the "
                "allocator to fragment memory a lot. If the problem is a large peak memory, then there is no issue. "
                "Otherwise, make sure you are using the Jemalloc allocator and not the default libc malloc. Note: The "
                "currently used allocator is \"%s\".\n\n",
                ZMALLOC_LIB);
        }
        if (high_alloc_frag) {
            s = sdscatprintf(
                s,
                " * High allocator fragmentation: This instance has an allocator external fragmentation greater "
                "than 1.1. This problem is usually due either to a large peak memory (check if there is a peak "
                "memory entry above in the report) or may result from a workload that causes the allocator to "
                "fragment memory a lot. You can try enabling the 'activedefrag' config option.\n\n");
        }
        if (high_alloc_rss) {
            s = sdscatprintf(
                s,
                " * High allocator RSS overhead: This instance has an RSS memory overhead greater than 1.1 (this "
                "means that the Resident Set Size of the allocator is much larger than the sum of what the allocator "
                "actually holds). This problem is usually due to a large peak memory (check if there is a peak "
                "memory entry above in the report), you can try the MEMORY PURGE command to reclaim it.\n\n");
        }
        if (high_proc_rss) {
            s = sdscatprintf(
                s,
                " * High process RSS overhead: This instance has a non-allocator RSS memory overhead greater than "
                "1.1 (this means that the Resident Set Size of the Valkey process is much larger than the RSS the "
                "allocator holds). This problem may be due to Lua scripts or Modules.\n\n");
        }
        if (big_replica_buf) {
            s = sdscat(s, " * Big replica buffers: The replica output buffers in this instance are greater than 10MB "
                          "for each replica (on average). This likely means that there is some replica instance that "
                          "is struggling to receive data, either because it is too slow or because of networking "
                          "issues. As a result, data piles up on the primary output buffers. Please try to identify "
                          "what replica is not receiving data correctly and why. You can use the INFO output in order "
                          "to check the replicas delays and the CLIENT LIST command to check the output buffers of "
                          "each replica.\n\n");
        }
        if (big_client_buf) {
            s = sdscat(s, " * Big client buffers: The client output buffers in this instance are greater than 200K "
                          "per client (on average). This may result from different causes, like Pub/Sub clients "
                          "subscribed to channels but not receiving data fast enough, so that data piles up on the "
                          "Valkey instance output buffer, or clients sending commands with large replies or very "
                          "large sequences of commands in the same pipeline. Please use the CLIENT LIST command in "
                          "order to investigate the issue if it causes problems in your instance, or to understand "
                          "better why certain clients are using a big amount of memory.\n\n");
        }
        if (many_scripts) {
            s = sdscat(s, " * Many scripts: There seem to be many cached scripts in this instance (more than 1000). "
                          "This may be because scripts are generated and `EVAL`ed, instead of being parameterized "
                          "(with KEYS and ARGV), `SCRIPT LOAD`ed and `EVALSHA`ed. Unless `SCRIPT FLUSH` is called "
                          "periodically, the scripts' caches may end up consuming most of your memory.\n\n");
        }
        s = sdscat(s, "I'm here to keep you safe, Sam. I want to help you.\n");
    }
    freeMemoryOverheadData(mh);
    return s;
}

/* Set the object LRU/LFU depending on server.maxmemory_policy.
 * The lfu_freq arg is only relevant if policy is MAXMEMORY_FLAG_LFU.
 * The lru_idle and lru_clock args are only relevant if policy
 * is MAXMEMORY_FLAG_LRU.
 * Either or both of them may be <0, in which case nothing is set. */
int objectSetLRUOrLFU(robj *val, long long lfu_freq, long long lru_idle, long long lru_clock, int lru_multiplier) {
    if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
        if (lfu_freq >= 0) {
            serverAssert(lfu_freq <= 255);
            val->lru = (LFUGetTimeInMinutes() << 8) | lfu_freq;
            return 1;
        }
    } else if (lru_idle >= 0) {
        /* Provided LRU idle time is in seconds. Scale
         * according to the LRU clock resolution this
         * instance was compiled with (normally 1000 ms, so the
         * below statement will expand to lru_idle*1000/1000.) */
        lru_idle = lru_idle * lru_multiplier / LRU_CLOCK_RESOLUTION;
        long lru_abs = lru_clock - lru_idle; /* Absolute access time.
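         *
         * (Editor's illustration) With the default LRU_CLOCK_RESOLUTION of
         * 1000 ms and an lru_multiplier of 1000, an lru_idle of 120 seconds
         * simply subtracts 120 ticks from lru_clock, i.e. the object looks as
         * if it was last touched two minutes before the given clock value.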
         */

        /* If the LRU field underflows (since lru_clock is a wrapping clock),
         * we need to make it positive again. This is handled by the unwrapping
         * code in estimateObjectIdleTime. I.e. imagine a day when lru_clock
         * wraps around (this happens once in about 6 months) and becomes a low
         * value, like 10: an lru_idle of 1000 should then yield an lru_abs
         * near LRU_CLOCK_MAX. */
        if (lru_abs < 0) lru_abs += LRU_CLOCK_MAX;
        val->lru = lru_abs;
        return 1;
    }
    return 0;
}

/* ======================= The OBJECT and MEMORY commands =================== */

/* This is a helper function for the OBJECT command. We need to look up keys
 * without any modification of LRU or other parameters. */
robj *objectCommandLookup(client *c, robj *key) {
    return lookupKeyReadWithFlags(c->db, key, LOOKUP_NOTOUCH | LOOKUP_NONOTIFY);
}

robj *objectCommandLookupOrReply(client *c, robj *key, robj *reply) {
    robj *o = objectCommandLookup(c, key);
    if (!o) addReplyOrErrorObject(c, reply);
    return o;
}

/* The OBJECT command allows inspecting the internals of an object.
 * Usage: OBJECT <refcount|encoding|idletime|freq> <key> */
void objectCommand(client *c) {
    robj *o;

    if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr, "help")) {
        const char *help[] = {"ENCODING <key>",
                              "    Return the kind of internal representation used in order to store the value",
                              "    associated with a <key>.",
                              "FREQ <key>",
                              "    Return the access frequency index of the <key>. The returned integer is",
                              "    proportional to the logarithm of the recent access frequency of the key.",
                              "IDLETIME <key>",
                              "    Return the idle time of the <key>, that is the approximated number of",
                              "    seconds elapsed since the last access to the key.",
                              "REFCOUNT <key>",
                              "    Return the number of references of the value associated with the specified",
                              "    <key>.",
                              NULL};
        addReplyHelp(c, help);
    } else if (!strcasecmp(c->argv[1]->ptr, "refcount") && c->argc == 3) {
        if ((o = objectCommandLookupOrReply(c, c->argv[2], shared.null[c->resp])) == NULL) return;
        addReplyLongLong(c, o->refcount);
    } else if (!strcasecmp(c->argv[1]->ptr, "encoding") && c->argc == 3) {
        if ((o = objectCommandLookupOrReply(c, c->argv[2], shared.null[c->resp])) == NULL) return;
        addReplyBulkCString(c, strEncoding(o->encoding));
    } else if (!strcasecmp(c->argv[1]->ptr, "idletime") && c->argc == 3) {
        if ((o = objectCommandLookupOrReply(c, c->argv[2], shared.null[c->resp])) == NULL) return;
        if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
            addReplyError(c, "An LFU maxmemory policy is selected, idle time not tracked. Please note that when "
                             "switching between policies at runtime LRU and LFU data will take some time to adjust.");
            return;
        }
        addReplyLongLong(c, estimateObjectIdleTime(o) / 1000);
    } else if (!strcasecmp(c->argv[1]->ptr, "freq") && c->argc == 3) {
        if ((o = objectCommandLookupOrReply(c, c->argv[2], shared.null[c->resp])) == NULL) return;
        if (!(server.maxmemory_policy & MAXMEMORY_FLAG_LFU)) {
            addReplyError(c, "An LFU maxmemory policy is not selected, access frequency not tracked. Please note that "
                             "when switching between policies at runtime LRU and LFU data will take some time to adjust.");
            return;
        }
        /* LFUDecrAndReturn should be called
         * in case the key has not been accessed for a long time,
         * because we update the access time only
         * when the key is read or overwritten. */
        addReplyLongLong(c, LFUDecrAndReturn(o));
    } else {
        addReplySubcommandSyntaxError(c);
    }
}

/* The memory command will eventually be a complete interface for the
 * memory introspection capabilities of the server.
 *
 * Usage: MEMORY usage <key> */
void memoryCommand(client *c) {
    if (!strcasecmp(c->argv[1]->ptr, "help") && c->argc == 2) {
        /* clang-format off */
        const char *help[] = {
"DOCTOR",
"    Return memory problems reports.",
"MALLOC-STATS",
"    Return internal statistics report from the memory allocator.",
"PURGE",
"    Attempt to purge dirty pages for reclamation by the allocator.",
"STATS",
"    Return information about the memory usage of the server.",
"USAGE <key> [SAMPLES <count>]",
"    Return memory in bytes used by <key> and its value. Nested values are",
"    sampled up to <count> times (default: 5, 0 means sample all).",
NULL
        };
        /* clang-format on */
        addReplyHelp(c, help);
    } else if (!strcasecmp(c->argv[1]->ptr, "usage") && c->argc >= 3) {
        dictEntry *de;
        long long samples = OBJ_COMPUTE_SIZE_DEF_SAMPLES;
        for (int j = 3; j < c->argc; j++) {
            if (!strcasecmp(c->argv[j]->ptr, "samples") && j + 1 < c->argc) {
                if (getLongLongFromObjectOrReply(c, c->argv[j + 1], &samples, NULL) == C_ERR) return;
                if (samples < 0) {
                    addReplyErrorObject(c, shared.syntaxerr);
                    return;
                }
                if (samples == 0) samples = LLONG_MAX;
                j++; /* skip option argument. */
            } else {
                addReplyErrorObject(c, shared.syntaxerr);
                return;
            }
        }
        if ((de = dbFind(c->db, c->argv[2]->ptr)) == NULL) {
            addReplyNull(c);
            return;
        }
        size_t usage = objectComputeSize(c->argv[2], dictGetVal(de), samples, c->db->id);
        usage += sdsZmallocSize(dictGetKey(de));
        usage += dictEntryMemUsage();
        addReplyLongLong(c, usage);
    } else if (!strcasecmp(c->argv[1]->ptr, "stats") && c->argc == 2) {
        struct serverMemOverhead *mh = getMemoryOverheadData();

        addReplyMapLen(c, 31 + mh->num_dbs);

        addReplyBulkCString(c, "peak.allocated");
        addReplyLongLong(c, mh->peak_allocated);

        addReplyBulkCString(c, "total.allocated");
        addReplyLongLong(c, mh->total_allocated);

        addReplyBulkCString(c, "startup.allocated");
        addReplyLongLong(c, mh->startup_allocated);

        addReplyBulkCString(c, "replication.backlog");
        addReplyLongLong(c, mh->repl_backlog);

        addReplyBulkCString(c, "clients.slaves");
        addReplyLongLong(c, mh->clients_replicas);

        addReplyBulkCString(c, "clients.normal");
        addReplyLongLong(c, mh->clients_normal);

        addReplyBulkCString(c, "cluster.links");
        addReplyLongLong(c, mh->cluster_links);

        addReplyBulkCString(c, "aof.buffer");
        addReplyLongLong(c, mh->aof_buffer);

        addReplyBulkCString(c, "lua.caches");
        addReplyLongLong(c, mh->lua_caches);

        addReplyBulkCString(c, "functions.caches");
        addReplyLongLong(c, mh->functions_caches);

        for (size_t j = 0; j < mh->num_dbs; j++) {
            char dbname[32];
            snprintf(dbname, sizeof(dbname), "db.%zd", mh->db[j].dbid);
            addReplyBulkCString(c, dbname);
            addReplyMapLen(c, 2);

            addReplyBulkCString(c, "overhead.hashtable.main");
            addReplyLongLong(c, mh->db[j].overhead_ht_main);

            addReplyBulkCString(c, "overhead.hashtable.expires");
            addReplyLongLong(c, mh->db[j].overhead_ht_expires);
        }

        addReplyBulkCString(c, "overhead.db.hashtable.lut");
        addReplyLongLong(c, mh->overhead_db_hashtable_lut);

        addReplyBulkCString(c, "overhead.db.hashtable.rehashing");
        addReplyLongLong(c, mh->overhead_db_hashtable_rehashing);

        addReplyBulkCString(c, "overhead.total");
        addReplyLongLong(c, mh->overhead_total);

        addReplyBulkCString(c, "db.dict.rehashing.count");
        addReplyLongLong(c, mh->db_dict_rehashing_count);

        addReplyBulkCString(c, "keys.count");
        addReplyLongLong(c, mh->total_keys);

        addReplyBulkCString(c, "keys.bytes-per-key");
        addReplyLongLong(c, mh->bytes_per_key);

        addReplyBulkCString(c, "dataset.bytes");
        addReplyLongLong(c, mh->dataset);

        addReplyBulkCString(c, "dataset.percentage");
        addReplyDouble(c, mh->dataset_perc);

        addReplyBulkCString(c, "peak.percentage");
        addReplyDouble(c, mh->peak_perc);

        addReplyBulkCString(c, "allocator.allocated");
        addReplyLongLong(c, server.cron_malloc_stats.allocator_allocated);

        addReplyBulkCString(c, "allocator.active");
        addReplyLongLong(c, server.cron_malloc_stats.allocator_active);

        addReplyBulkCString(c, "allocator.resident");
        addReplyLongLong(c, server.cron_malloc_stats.allocator_resident);

        addReplyBulkCString(c, "allocator.muzzy");
        addReplyLongLong(c, server.cron_malloc_stats.allocator_muzzy);

        addReplyBulkCString(c, "allocator-fragmentation.ratio");
        addReplyDouble(c, mh->allocator_frag);

        addReplyBulkCString(c, "allocator-fragmentation.bytes");
        addReplyLongLong(c, mh->allocator_frag_bytes);

        addReplyBulkCString(c, "allocator-rss.ratio");
        addReplyDouble(c, mh->allocator_rss);

        addReplyBulkCString(c, "allocator-rss.bytes");
        addReplyLongLong(c, mh->allocator_rss_bytes);

        addReplyBulkCString(c, "rss-overhead.ratio");
        addReplyDouble(c, mh->rss_extra);

        addReplyBulkCString(c, "rss-overhead.bytes");
        addReplyLongLong(c, mh->rss_extra_bytes);

        addReplyBulkCString(c, "fragmentation"); /* this is the total RSS overhead, including fragmentation */
        addReplyDouble(c, mh->total_frag);       /* it is kept here for backwards compatibility */

        addReplyBulkCString(c, "fragmentation.bytes");
        addReplyLongLong(c, mh->total_frag_bytes);

        freeMemoryOverheadData(mh);
    } else if (!strcasecmp(c->argv[1]->ptr, "malloc-stats") && c->argc == 2) {
#if defined(USE_JEMALLOC)
        sds info = sdsempty();
        je_malloc_stats_print(inputCatSds, &info, NULL);
        addReplyVerbatim(c, info, sdslen(info), "txt");
        sdsfree(info);
#else
        addReplyBulkCString(c, "Stats not supported for the current allocator");
#endif
    } else if (!strcasecmp(c->argv[1]->ptr, "doctor") && c->argc == 2) {
        sds report = getMemoryDoctorReport();
        addReplyVerbatim(c, report, sdslen(report), "txt");
        sdsfree(report);
    } else if (!strcasecmp(c->argv[1]->ptr, "purge") && c->argc == 2) {
        if (jemalloc_purge() == 0)
            addReply(c, shared.ok);
        else
            addReplyError(c, "Error purging dirty pages");
    } else {
        addReplySubcommandSyntaxError(c);
    }
}