From 7a4227915aee1a46cec7c808826ea93e2b7c12af Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 25 Feb 2019 18:21:27 -0500 Subject: [PATCH] Rewrite our spinlock in assembly, pretty big performance improvement Former-commit-id: 40d7a701feefd36e9e3fdb6d516228c4a70fcf3d --- src/Makefile | 19 ++++++-- src/fastlock.cpp | 7 ++- src/fastlock_x64.asm | 106 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 128 insertions(+), 4 deletions(-) create mode 100644 src/fastlock_x64.asm diff --git a/src/Makefile b/src/Makefile index 4258f47a5..0379b1b2d 100644 --- a/src/Makefile +++ b/src/Makefile @@ -44,6 +44,15 @@ endif endif endif +# Do we use our assembly spinlock? +ifeq ($(uname_S),Linux) +ifneq ($(@@),32bit) + ASM_OBJ+= fastlock_x64.o + CFLAGS+= -DASM_SPINLOCK + CXXFLAGS+= -DASM_SPINLOCK +endif +endif + # To get ARM stack traces if Redis crashes we need a special C flag. ifneq (,$(filter aarch64 armv,$(uname_M))) CFLAGS+=-funwind-tables @@ -160,6 +169,7 @@ endif REDIS_CC=$(QUIET_CC)$(CC) $(FINAL_CFLAGS) REDIS_CXX=$(QUIET_CC)$(CC) $(FINAL_CXXFLAGS) +REDIS_NASM=$(QUIET_CC)nasm -felf64 REDIS_LD=$(QUIET_LINK)$(CXX) $(FINAL_LDFLAGS) REDIS_INSTALL=$(QUIET_INSTALL)$(INSTALL) @@ -178,11 +188,11 @@ endif REDIS_SERVER_NAME=keydb-server REDIS_SENTINEL_NAME=keydb-sentinel -REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o acl.o storage.o rdb-s3.o fastlock.o +REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o acl.o storage.o rdb-s3.o fastlock.o $(ASM_OBJ) REDIS_CLI_NAME=keydb-cli -REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o anet.o ae.o crc64.o siphash.o crc16.o storage-lite.o fastlock.o +REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o anet.o ae.o crc64.o siphash.o crc16.o storage-lite.o fastlock.o $(ASM_OBJ) REDIS_BENCHMARK_NAME=keydb-benchmark -REDIS_BENCHMARK_OBJ=ae.o anet.o redis-benchmark.o adlist.o zmalloc.o redis-benchmark.o storage-lite.o fastlock.o +REDIS_BENCHMARK_OBJ=ae.o anet.o redis-benchmark.o adlist.o zmalloc.o redis-benchmark.o storage-lite.o fastlock.o $(ASM_OBJ) REDIS_CHECK_RDB_NAME=keydb-check-rdb REDIS_CHECK_AOF_NAME=keydb-check-aof @@ -264,6 +274,9 @@ dict-benchmark: dict.c zmalloc.c sds.c siphash.c %.o: %.cpp .make-prerequisites $(REDIS_CXX) -c $< +%.o: %.asm .make-prerequisites + $(REDIS_NASM) $< + clean: rm -rf $(REDIS_SERVER_NAME) $(REDIS_SENTINEL_NAME) $(REDIS_CLI_NAME) $(REDIS_BENCHMARK_NAME) $(REDIS_CHECK_RDB_NAME) $(REDIS_CHECK_AOF_NAME) *.o *.gcda *.gcno *.gcov redis.info lcov-html Makefile.dep dict-benchmark diff --git a/src/fastlock.cpp b/src/fastlock.cpp index 4a4fb2962..a1499d77b 100644 --- a/src/fastlock.cpp +++ b/src/fastlock.cpp @@ -44,7 +44,7 @@ static_assert(sizeof(pid_t) <= sizeof(fastlock::m_pidOwner), "fastlock::m_pidOwner not large enough"); -static pid_t gettid() +extern "C" pid_t gettid() { static thread_local int pidCache = -1; if (pidCache == -1) @@ -60,6 +60,7 @@ extern "C" void fastlock_init(struct fastlock *lock) lock->m_pidOwner = -1; } +#ifndef ASM_SPINLOCK extern "C" void fastlock_lock(struct fastlock *lock) { if ((int)__atomic_load_4(&lock->m_pidOwner, __ATOMIC_ACQUIRE) == gettid()) @@ -75,6 +76,9 @@ extern "C" void fastlock_lock(struct fastlock *lock) { if ((++cloops % 1024*1024) == 0) sched_yield(); +#if defined(__i386__) || defined(__amd64__) + __asm__ ("pause"); +#endif } lock->m_depth = 1; @@ -107,6 +111,7 @@ extern "C" int fastlock_trylock(struct fastlock *lock) } return false; } +#endif extern "C" void fastlock_unlock(struct fastlock *lock) { diff --git a/src/fastlock_x64.asm b/src/fastlock_x64.asm new file mode 100644 index 000000000..a483678a2 --- /dev/null +++ b/src/fastlock_x64.asm @@ -0,0 +1,106 @@ +section .text + +extern gettid +extern sched_yield + +; This is the first use of assembly in this codebase, a valid question is WHY? +; The spinlock we implement here is performance critical, and simply put GCC +; emits awful code. The original C code is left in fastlock.cpp for reference +; and x-plat. The code generated for the unlock case is reasonable and left in +; C++. + +ALIGN 16 +global fastlock_lock +fastlock_lock: + ; RDI points to the struct: + ; uint16_t active + ; uint16_t avail + ; int32_t m_pidOwner + ; int32_t m_depth + + ; First get our TID and put it in ecx + push rdi ; we need our struct pointer (also balance the stack for the call) + call gettid ; get our thread ID (TLS is nasty in ASM so don't bother inlining) + mov esi, eax ; back it up in esi + mov rdi, [rsp] ; get our pointer back + + cmp [rdi+4], esi ; Is the TID we got back the owner of the lock? + je .LRecursive ; Don't spin in that case + + xor eax, eax ; eliminate partial register dependency + mov ax, 1 ; we want to add one + lock xadd [rdi+2], ax ; do the xadd, ax contains the value before the addition + ; eax now contains the ticket + xor ecx, ecx +ALIGN 16 +.Loop: + cmp [rdi], ax ; is our ticket up? + je .LDone ; leave the loop + add ecx, 1000h ; Have we been waiting a long time? (oflow if we have) + ; 1000h is set so we overflow on the 1024*1024'th iteration (like the C code) + jc .LYield ; If so, give up our timeslice to someone who's doing real work + pause ; be nice to other hyperthreads + jmp .Loop ; maybe next time we'll get our turn +.LDone: + mov [rdi+4], esi ; lock->m_pidOwner = gettid() + mov dword [rdi+8], 1 ; lock->m_depth = 1 + add rsp, 8 ; fix stack + ret +.LYield: + ; Like the compiler, you're probably thinking: "Hey! I should take these pushs out of the loop" + ; But the compiler doesn't know that we rarely hit this, and when we do we know the lock is + ; taking a long time to be released anyways. We optimize for the common case of short + ; lock intervals. That's why we're using a spinlock in the first place + push rsi + push rax + mov rax, 24 ; sys_sched_yield + syscall ; give up our timeslice we'll be here a while + pop rax + pop rsi + mov rdi, [rsp] ; our struct pointer is on the stack already + xor ecx, ecx ; Reset our loop counter + jmp .Loop ; Get back in the game +.LRecursive: + add dword [rdi+8], 1 ; increment the depth counter + add rsp, 8 ; fix the stack + ret + +ALIGN 16 +global fastlock_trylock +fastlock_trylock: + ; RDI points to the struct: + ; uint16_t active + ; uint16_t avail + ; int32_t m_pidOwner + ; int32_t m_depth + + ; First get our TID and put it in ecx + push rdi ; we need our struct pointer (also balance the stack for the call) + call gettid ; get our thread ID (TLS is nasty in ASM so don't bother inlining) + mov esi, eax ; back it up in esi + pop rdi ; get our pointer back + + cmp [rdi+4], esi ; Is the TID we got back the owner of the lock? + je .LRecursive ; Don't spin in that case + + mov eax, [rdi] ; get both active and avail counters + mov ecx, eax ; duplicate in ecx + ror ecx, 16 ; swap upper and lower 16-bits + cmp eax, ecx ; are the upper and lower 16-bits the same? + jnz .LAlreadyLocked ; If not return failure + + ; at this point we know eax+ecx have [avail][active] and they are both the same + add ecx, 10000h ; increment avail, ecx is now our wanted value + lock cmpxchg [rdi], ecx ; If rdi still contains the value in eax, put in ecx (inc avail) + jnz .LAlreadyLocked ; If Z is not set then someone locked it while we were preparing + mov eax, 1 ; return SUCCESS! + mov [rdi+4], esi ; lock->m_pidOwner = gettid() + mov dword [rdi+8], eax ; lock->m_depth = 1 + ret +.LAlreadyLocked: + xor eax, eax ; return 0 for failure + ret +.LRecursive: + add dword [rdi+8], 1 ; increment the depth counter + mov eax, 1 ; we successfully got the lock + ret \ No newline at end of file