diff --git a/README.md b/README.md index d8f8f7880..46b89c392 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,18 @@ To compile against jemalloc on Mac OS X systems, use: % make MALLOC=jemalloc +Monotonic clock +--------------- + +By default, Redis will build using the POSIX clock_gettime function as the +monotonic clock source. On most modern systems, the internal processor clock +can be used to improve performance. Cautions can be found here: + http://oliveryang.net/2015/09/pitfalls-of-TSC-usage/ + +To build with support for the processor's internal instruction clock, use: + + % make CFLAGS="-DUSE_PROCESSOR_CLOCK" + Verbose build ------------- diff --git a/src/Makefile b/src/Makefile index a8d2aa518..9ea4b654f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -228,11 +228,11 @@ endif REDIS_SERVER_NAME=redis-server REDIS_SENTINEL_NAME=redis-sentinel -REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o gopher.o tracking.o connection.o tls.o sha256.o timeout.o setcpuaffinity.o +REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o 
slowlog.o scripting.o bio.o rio.o rand.o memtest.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o gopher.o tracking.o connection.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o REDIS_CLI_NAME=redis-cli -REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o crcspeed.o crc64.o siphash.o crc16.o +REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o REDIS_BENCHMARK_NAME=redis-benchmark -REDIS_BENCHMARK_OBJ=ae.o anet.o redis-benchmark.o adlist.o dict.o zmalloc.o siphash.o +REDIS_BENCHMARK_OBJ=ae.o anet.o redis-benchmark.o adlist.o dict.o zmalloc.o siphash.o monotonic.o REDIS_CHECK_RDB_NAME=redis-check-rdb REDIS_CHECK_AOF_NAME=redis-check-aof diff --git a/src/ae.c b/src/ae.c index 689a27d16..94459acff 100644 --- a/src/ae.c +++ b/src/ae.c @@ -30,6 +30,8 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "ae.h" + #include #include #include @@ -40,7 +42,6 @@ #include #include -#include "ae.h" #include "zmalloc.h" #include "config.h" @@ -60,16 +61,18 @@ #endif #endif + aeEventLoop *aeCreateEventLoop(int setsize) { aeEventLoop *eventLoop; int i; + monotonicInit(); /* just in case the calling app didn't initialize */ + if ((eventLoop = zmalloc(sizeof(*eventLoop))) == NULL) goto err; eventLoop->events = zmalloc(sizeof(aeFileEvent)*setsize); eventLoop->fired = zmalloc(sizeof(aeFiredEvent)*setsize); if (eventLoop->events == NULL || eventLoop->fired == NULL) goto err; eventLoop->setsize = setsize; - eventLoop->lastTime = time(NULL); eventLoop->timeEventHead = NULL; eventLoop->timeEventNextId = 0; eventLoop->stop = 0; @@ -199,29 +202,6 @@ int aeGetFileEvents(aeEventLoop *eventLoop, int fd) { return fe->mask; } -static void aeGetTime(long *seconds, long *milliseconds) -{ - struct timeval tv; - - gettimeofday(&tv, NULL); - *seconds = tv.tv_sec; - *milliseconds = tv.tv_usec/1000; -} - -static void aeAddMillisecondsToNow(long long milliseconds, long *sec, long *ms) { - long cur_sec, cur_ms, when_sec, when_ms; - - aeGetTime(&cur_sec, &cur_ms); - when_sec = cur_sec + milliseconds/1000; - when_ms = cur_ms + milliseconds%1000; - if (when_ms >= 1000) { - when_sec ++; - when_ms -= 1000; - } - *sec = when_sec; - *ms = when_ms; -} - long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds, aeTimeProc *proc, void *clientData, aeEventFinalizerProc *finalizerProc) @@ -232,7 +212,7 @@ long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds, te = zmalloc(sizeof(*te)); if (te == NULL) return AE_ERR; te->id = id; - aeAddMillisecondsToNow(milliseconds,&te->when_sec,&te->when_ms); + te->when = getMonotonicUs() + milliseconds * 1000; te->timeProc = proc; te->finalizerProc = finalizerProc; te->clientData = clientData; @@ -258,10 +238,8 @@ int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id) return AE_ERR; /* NO event with the specified ID 
found */ } -/* Search the first timer to fire. - * This operation is useful to know how many time the select can be - * put in sleep without to delay any event. - * If there are no timers NULL is returned. +/* How many milliseconds until the first timer should fire. + * If there are no timers, -1 is returned. * * Note that's O(N) since time events are unsorted. * Possible optimizations (not needed by Redis so far, but...): @@ -269,19 +247,20 @@ int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id) * Much better but still insertion or deletion of timers is O(N). * 2) Use a skiplist to have this operation as O(1) and insertion as O(log(N)). */ -static aeTimeEvent *aeSearchNearestTimer(aeEventLoop *eventLoop) -{ +static long msUntilEarliestTimer(aeEventLoop *eventLoop) { aeTimeEvent *te = eventLoop->timeEventHead; - aeTimeEvent *nearest = NULL; + if (te == NULL) return -1; - while(te) { - if (!nearest || te->when_sec < nearest->when_sec || - (te->when_sec == nearest->when_sec && - te->when_ms < nearest->when_ms)) - nearest = te; + aeTimeEvent *earliest = NULL; + while (te) { + if (!earliest || te->when < earliest->when) + earliest = te; te = te->next; } - return nearest; + + monotime now = getMonotonicUs(); + return (now >= earliest->when) + ? 0 : (long)((earliest->when - now) / 1000); } /* Process time events */ @@ -289,29 +268,11 @@ static int processTimeEvents(aeEventLoop *eventLoop) { int processed = 0; aeTimeEvent *te; long long maxId; - time_t now = time(NULL); - - /* If the system clock is moved to the future, and then set back to the - * right value, time events may be delayed in a random way. Often this - * means that scheduled operations will not be performed soon enough. - * - * Here we try to detect system clock skews, and force all the time - * events to be processed ASAP when this happens: the idea is that - * processing events earlier is less dangerous than delaying them - * indefinitely, and practice suggests it is. 
*/ - if (now < eventLoop->lastTime) { - te = eventLoop->timeEventHead; - while(te) { - te->when_sec = 0; - te = te->next; - } - } - eventLoop->lastTime = now; te = eventLoop->timeEventHead; maxId = eventLoop->timeEventNextId-1; + monotime now = getMonotonicUs(); while(te) { - long now_sec, now_ms; long long id; /* Remove events scheduled for deletion. */ @@ -330,8 +291,10 @@ static int processTimeEvents(aeEventLoop *eventLoop) { eventLoop->timeEventHead = te->next; if (te->next) te->next->prev = te->prev; - if (te->finalizerProc) + if (te->finalizerProc) { te->finalizerProc(eventLoop, te->clientData); + now = getMonotonicUs(); + } zfree(te); te = next; continue; @@ -346,10 +309,8 @@ static int processTimeEvents(aeEventLoop *eventLoop) { te = te->next; continue; } - aeGetTime(&now_sec, &now_ms); - if (now_sec > te->when_sec || - (now_sec == te->when_sec && now_ms >= te->when_ms)) - { + + if (te->when <= now) { int retval; id = te->id; @@ -357,8 +318,9 @@ static int processTimeEvents(aeEventLoop *eventLoop) { retval = te->timeProc(eventLoop, id, te->clientData); te->refcount--; processed++; + now = getMonotonicUs(); if (retval != AE_NOMORE) { - aeAddMillisecondsToNow(retval,&te->when_sec,&te->when_ms); + te->when = now + (monotime)retval * 1000; /* widen first: int retval * 1000 overflows for periods over ~35 minutes */ } else { te->id = AE_DELETED_EVENT_ID; } @@ -397,30 +359,16 @@ int aeProcessEvents(aeEventLoop *eventLoop, int flags) if (eventLoop->maxfd != -1 || ((flags & AE_TIME_EVENTS) && !(flags & AE_DONT_WAIT))) { int j; - aeTimeEvent *shortest = NULL; struct timeval tv, *tvp; + long msUntilTimer = -1; if (flags & AE_TIME_EVENTS && !(flags & AE_DONT_WAIT)) - shortest = aeSearchNearestTimer(eventLoop); - if (shortest) { - long now_sec, now_ms; + msUntilTimer = msUntilEarliestTimer(eventLoop); - aeGetTime(&now_sec, &now_ms); + if (msUntilTimer >= 0) { + tv.tv_sec = msUntilTimer / 1000; + tv.tv_usec = (msUntilTimer % 1000) * 1000; tvp = &tv; - - /* How many milliseconds we need to wait for the next - * time event to fire? 
*/ - long long ms = - (shortest->when_sec - now_sec)*1000 + - shortest->when_ms - now_ms; - - if (ms > 0) { - tvp->tv_sec = ms/1000; - tvp->tv_usec = (ms % 1000)*1000; - } else { - tvp->tv_sec = 0; - tvp->tv_usec = 0; - } } else { /* If we have to check for events but need to return * ASAP because of AE_DONT_WAIT we need to set the timeout diff --git a/src/ae.h b/src/ae.h index d1b7f34bf..4a1edb68f 100644 --- a/src/ae.h +++ b/src/ae.h @@ -33,7 +33,7 @@ #ifndef __AE_H__ #define __AE_H__ -#include +#include "monotonic.h" #define AE_OK 0 #define AE_ERR -1 @@ -79,8 +79,7 @@ typedef struct aeFileEvent { /* Time event structure */ typedef struct aeTimeEvent { long long id; /* time event identifier. */ - long when_sec; /* seconds */ - long when_ms; /* milliseconds */ + monotime when; aeTimeProc *timeProc; aeEventFinalizerProc *finalizerProc; void *clientData; @@ -101,7 +100,6 @@ typedef struct aeEventLoop { int maxfd; /* highest file descriptor currently registered */ int setsize; /* max number of file descriptors tracked */ long long timeEventNextId; - time_t lastTime; /* Used to detect system clock skew */ aeFileEvent *events; /* Registered events */ aeFiredEvent *fired; /* Fired events */ aeTimeEvent *timeEventHead; diff --git a/src/fmacros.h b/src/fmacros.h index 6e56c759d..089dc8de7 100644 --- a/src/fmacros.h +++ b/src/fmacros.h @@ -58,4 +58,10 @@ #define _LARGEFILE_SOURCE #define _FILE_OFFSET_BITS 64 +#ifdef __linux__ +/* features.h uses the defines above to set feature specific defines. */ +#include +#include +#endif + #endif diff --git a/src/monotonic.c b/src/monotonic.c new file mode 100644 index 000000000..5bb4f03bf --- /dev/null +++ b/src/monotonic.c @@ -0,0 +1,170 @@ +#include "monotonic.h" +#include +#include +#include +#include + +#undef NDEBUG +#include + + +/* The function pointer for clock retrieval. 
*/ +monotime (*getMonotonicUs)(void) = NULL; + +static char monotonic_info_string[32]; + + +/* Using the processor clock (aka TSC on x86) can provide improved performance + * throughout Redis wherever the monotonic clock is used. The processor clock + * is significantly faster than calling 'clock_gettime' (POSIX). While this is + * generally safe on modern systems, this link provides additional information + * about use of the x86 TSC: http://oliveryang.net/2015/09/pitfalls-of-TSC-usage + * + * To use the processor clock, either uncomment this line, or build with + * CFLAGS="-DUSE_PROCESSOR_CLOCK" +#define USE_PROCESSOR_CLOCK + */ + + +#if defined(USE_PROCESSOR_CLOCK) && defined(__x86_64__) && defined(__linux__) +#include <regex.h> +#include <x86intrin.h> + +static long mono_ticksPerMicrosecond = 0; + +static monotime getMonotonicUs_x86(void) { + return __rdtsc() / mono_ticksPerMicrosecond; +} + +static void monotonicInit_x86linux(void) { + const int bufflen = 256; + char buf[bufflen]; + regex_t cpuGhzRegex, constTscRegex; + const size_t nmatch = 2; + regmatch_t pmatch[nmatch]; + int constantTsc = 0; + int rc; + + /* Determine the number of TSC ticks in a micro-second. This is + * a constant value matching the standard speed of the processor. + * On modern processors, this speed remains constant even though + * the actual clock speed varies dynamically for each core. */ + rc = regcomp(&cpuGhzRegex, "^model name\\s+:.*@ ([0-9.]+)GHz", REG_EXTENDED); + assert(rc == 0); + + /* Also check that the constant_tsc flag is present. (It should be + * unless this is a really old CPU.) 
*/ + rc = regcomp(&constTscRegex, "^flags\\s+:.* constant_tsc", REG_EXTENDED); + assert(rc == 0); + + FILE *cpuinfo = fopen("/proc/cpuinfo", "r"); + if (cpuinfo != NULL) { + while (fgets(buf, bufflen, cpuinfo) != NULL) { + if (regexec(&cpuGhzRegex, buf, nmatch, pmatch, 0) == 0) { + buf[pmatch[1].rm_eo] = '\0'; + double ghz = atof(&buf[pmatch[1].rm_so]); + mono_ticksPerMicrosecond = (long)(ghz * 1000); + break; + } + } + while (fgets(buf, bufflen, cpuinfo) != NULL) { + if (regexec(&constTscRegex, buf, nmatch, pmatch, 0) == 0) { + constantTsc = 1; + break; + } + } + + fclose(cpuinfo); + } + regfree(&cpuGhzRegex); + regfree(&constTscRegex); + + if (mono_ticksPerMicrosecond == 0) { + fprintf(stderr, "monotonic: x86 linux, unable to determine clock rate\n"); + return; + } + if (!constantTsc) { + fprintf(stderr, "monotonic: x86 linux, 'constant_tsc' flag not present\n"); + return; + } + + snprintf(monotonic_info_string, sizeof(monotonic_info_string), + "X86 TSC @ %ld ticks/us", mono_ticksPerMicrosecond); + getMonotonicUs = getMonotonicUs_x86; +} +#endif + + +#if defined(USE_PROCESSOR_CLOCK) && defined(__aarch64__) +static long mono_ticksPerMicrosecond = 0; + +/* Read the clock value. */ +static inline uint64_t __cntvct(void) { + uint64_t virtual_timer_value; + __asm__ volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value)); + return virtual_timer_value; +} + +/* Read the Count-timer Frequency. 
*/ +static inline uint32_t cntfrq_hz(void) { + uint64_t virtual_freq_value; + __asm__ volatile("mrs %0, cntfrq_el0" : "=r"(virtual_freq_value)); + return (uint32_t)virtual_freq_value; /* top 32 bits are reserved */ +} + +static monotime getMonotonicUs_aarch64(void) { + return __cntvct() / mono_ticksPerMicrosecond; +} + +static void monotonicInit_aarch64(void) { + mono_ticksPerMicrosecond = (long)cntfrq_hz() / 1000L / 1000L; + if (mono_ticksPerMicrosecond == 0) { + fprintf(stderr, "monotonic: aarch64, unable to determine clock rate\n"); + return; + } + + snprintf(monotonic_info_string, sizeof(monotonic_info_string), + "ARM CNTVCT @ %ld ticks/us", mono_ticksPerMicrosecond); + getMonotonicUs = getMonotonicUs_aarch64; +} +#endif + + +static monotime getMonotonicUs_posix(void) { + /* clock_gettime() is specified in POSIX.1b (1993). Even so, some systems + * did not support this until much later. CLOCK_MONOTONIC is technically + * optional and may not be supported - but it appears to be universal. + * If this is not supported, provide a system-specific alternate version. */ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ((uint64_t)ts.tv_sec) * 1000000 + ts.tv_nsec / 1000; +} + +static void monotonicInit_posix(void) { + /* Ensure that CLOCK_MONOTONIC is supported. This should be supported + * on any reasonably current OS. If the assertion below fails, provide + * an appropriate alternate implementation. 
*/ + struct timespec ts; + int rc = clock_gettime(CLOCK_MONOTONIC, &ts); + assert(rc == 0); + + snprintf(monotonic_info_string, sizeof(monotonic_info_string), + "POSIX clock_gettime"); + getMonotonicUs = getMonotonicUs_posix; +} + + + +const char * monotonicInit(void) { + #if defined(USE_PROCESSOR_CLOCK) && defined(__x86_64__) && defined(__linux__) + if (getMonotonicUs == NULL) monotonicInit_x86linux(); + #endif + + #if defined(USE_PROCESSOR_CLOCK) && defined(__aarch64__) + if (getMonotonicUs == NULL) monotonicInit_aarch64(); + #endif + + if (getMonotonicUs == NULL) monotonicInit_posix(); /* fallback when no processor clock was selected above */ + + return monotonic_info_string; +} diff --git a/src/monotonic.h b/src/monotonic.h new file mode 100644 index 000000000..4e82f9d53 --- /dev/null +++ b/src/monotonic.h @@ -0,0 +1,52 @@ +#ifndef __MONOTONIC_H +#define __MONOTONIC_H +/* The monotonic clock is an always increasing clock source. It is unrelated to + * the actual time of day and should only be used for relative timings. The + * monotonic clock is also not guaranteed to be chronologically precise; there + * may be slight skew/shift from a precise clock. + * + * Depending on system architecture, the monotonic time may be able to be + * retrieved much faster than a normal clock source by using an instruction + * counter on the CPU. On x86 architectures (for example), the RDTSC + * instruction is a very fast clock source for this purpose. + */ + +#include "fmacros.h" +#include +#include + +/* A counter in micro-seconds. The 'monotime' type is provided for variables + * holding a monotonic time. This will help distinguish & document that the + * variable is associated with the monotonic clock and should not be confused + * with other types of time.*/ +typedef uint64_t monotime; + +/* Retrieve counter of micro-seconds relative to an arbitrary point in time. */ +extern monotime (*getMonotonicUs)(void); + + +/* Call once at startup to initialize the monotonic clock. 
Though this only + * needs to be called once, it may be called additional times without impact. + * Returns a printable string indicating the type of clock initialized. + * (The returned string is static and doesn't need to be freed.) */ +const char * monotonicInit(void); + + +/* Functions to measure elapsed time. Example: + * monotime myTimer; + * elapsedStart(&myTimer); + * while (elapsedMs(myTimer) < 10) {} // loops for 10ms + */ +static inline void elapsedStart(monotime *start_time) { + *start_time = getMonotonicUs(); /* getMonotonicUs is NULL until monotonicInit() has run */ +} + +static inline uint64_t elapsedUs(monotime start_time) { + return getMonotonicUs() - start_time; +} + +static inline uint64_t elapsedMs(monotime start_time) { + return elapsedUs(start_time) / 1000; +} + +#endif diff --git a/src/server.c b/src/server.c index 4da4aeeec..d3f5c34f8 100644 --- a/src/server.c +++ b/src/server.c @@ -28,6 +28,7 @@ */ #include "server.h" +#include "monotonic.h" #include "cluster.h" #include "slowlog.h" #include "bio.h" @@ -2874,6 +2875,8 @@ void initServer(void) { createSharedObjects(); adjustOpenFilesLimit(); + const char *clk_msg = monotonicInit(); + serverLog(LL_NOTICE, "monotonic clock: %s", clk_msg); server.el = aeCreateEventLoop(server.maxclients+CONFIG_FDSET_INCR); if (server.el == NULL) { serverLog(LL_WARNING,