Rewrite our spinlock in assembly, pretty big performance improvement
Former-commit-id: 40d7a701feefd36e9e3fdb6d516228c4a70fcf3d
This commit is contained in:
parent
db8cdcafe2
commit
7a4227915a
19
src/Makefile
19
src/Makefile
@ -44,6 +44,15 @@ endif
|
|||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# Do we use our assembly spinlock?
|
||||||
|
ifeq ($(uname_S),Linux)
|
||||||
|
ifneq ($(@@),32bit)
|
||||||
|
ASM_OBJ+= fastlock_x64.o
|
||||||
|
CFLAGS+= -DASM_SPINLOCK
|
||||||
|
CXXFLAGS+= -DASM_SPINLOCK
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
# To get ARM stack traces if Redis crashes we need a special C flag.
|
# To get ARM stack traces if Redis crashes we need a special C flag.
|
||||||
ifneq (,$(filter aarch64 armv,$(uname_M)))
|
ifneq (,$(filter aarch64 armv,$(uname_M)))
|
||||||
CFLAGS+=-funwind-tables
|
CFLAGS+=-funwind-tables
|
||||||
@ -160,6 +169,7 @@ endif
|
|||||||
|
|
||||||
REDIS_CC=$(QUIET_CC)$(CC) $(FINAL_CFLAGS)
|
REDIS_CC=$(QUIET_CC)$(CC) $(FINAL_CFLAGS)
|
||||||
REDIS_CXX=$(QUIET_CC)$(CC) $(FINAL_CXXFLAGS)
|
REDIS_CXX=$(QUIET_CC)$(CC) $(FINAL_CXXFLAGS)
|
||||||
|
REDIS_NASM=$(QUIET_CC)nasm -felf64
|
||||||
REDIS_LD=$(QUIET_LINK)$(CXX) $(FINAL_LDFLAGS)
|
REDIS_LD=$(QUIET_LINK)$(CXX) $(FINAL_LDFLAGS)
|
||||||
REDIS_INSTALL=$(QUIET_INSTALL)$(INSTALL)
|
REDIS_INSTALL=$(QUIET_INSTALL)$(INSTALL)
|
||||||
|
|
||||||
@ -178,11 +188,11 @@ endif
|
|||||||
|
|
||||||
REDIS_SERVER_NAME=keydb-server
|
REDIS_SERVER_NAME=keydb-server
|
||||||
REDIS_SENTINEL_NAME=keydb-sentinel
|
REDIS_SENTINEL_NAME=keydb-sentinel
|
||||||
REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o acl.o storage.o rdb-s3.o fastlock.o
|
REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o acl.o storage.o rdb-s3.o fastlock.o $(ASM_OBJ)
|
||||||
REDIS_CLI_NAME=keydb-cli
|
REDIS_CLI_NAME=keydb-cli
|
||||||
REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o anet.o ae.o crc64.o siphash.o crc16.o storage-lite.o fastlock.o
|
REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o anet.o ae.o crc64.o siphash.o crc16.o storage-lite.o fastlock.o $(ASM_OBJ)
|
||||||
REDIS_BENCHMARK_NAME=keydb-benchmark
|
REDIS_BENCHMARK_NAME=keydb-benchmark
|
||||||
REDIS_BENCHMARK_OBJ=ae.o anet.o redis-benchmark.o adlist.o zmalloc.o redis-benchmark.o storage-lite.o fastlock.o
|
REDIS_BENCHMARK_OBJ=ae.o anet.o redis-benchmark.o adlist.o zmalloc.o redis-benchmark.o storage-lite.o fastlock.o $(ASM_OBJ)
|
||||||
REDIS_CHECK_RDB_NAME=keydb-check-rdb
|
REDIS_CHECK_RDB_NAME=keydb-check-rdb
|
||||||
REDIS_CHECK_AOF_NAME=keydb-check-aof
|
REDIS_CHECK_AOF_NAME=keydb-check-aof
|
||||||
|
|
||||||
@ -264,6 +274,9 @@ dict-benchmark: dict.c zmalloc.c sds.c siphash.c
|
|||||||
%.o: %.cpp .make-prerequisites
|
%.o: %.cpp .make-prerequisites
|
||||||
$(REDIS_CXX) -c $<
|
$(REDIS_CXX) -c $<
|
||||||
|
|
||||||
|
%.o: %.asm .make-prerequisites
|
||||||
|
$(REDIS_NASM) $<
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -rf $(REDIS_SERVER_NAME) $(REDIS_SENTINEL_NAME) $(REDIS_CLI_NAME) $(REDIS_BENCHMARK_NAME) $(REDIS_CHECK_RDB_NAME) $(REDIS_CHECK_AOF_NAME) *.o *.gcda *.gcno *.gcov redis.info lcov-html Makefile.dep dict-benchmark
|
rm -rf $(REDIS_SERVER_NAME) $(REDIS_SENTINEL_NAME) $(REDIS_CLI_NAME) $(REDIS_BENCHMARK_NAME) $(REDIS_CHECK_RDB_NAME) $(REDIS_CHECK_AOF_NAME) *.o *.gcda *.gcno *.gcov redis.info lcov-html Makefile.dep dict-benchmark
|
||||||
|
|
||||||
|
@ -44,7 +44,7 @@
|
|||||||
|
|
||||||
static_assert(sizeof(pid_t) <= sizeof(fastlock::m_pidOwner), "fastlock::m_pidOwner not large enough");
|
static_assert(sizeof(pid_t) <= sizeof(fastlock::m_pidOwner), "fastlock::m_pidOwner not large enough");
|
||||||
|
|
||||||
static pid_t gettid()
|
extern "C" pid_t gettid()
|
||||||
{
|
{
|
||||||
static thread_local int pidCache = -1;
|
static thread_local int pidCache = -1;
|
||||||
if (pidCache == -1)
|
if (pidCache == -1)
|
||||||
@ -60,6 +60,7 @@ extern "C" void fastlock_init(struct fastlock *lock)
|
|||||||
lock->m_pidOwner = -1;
|
lock->m_pidOwner = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifndef ASM_SPINLOCK
|
||||||
extern "C" void fastlock_lock(struct fastlock *lock)
|
extern "C" void fastlock_lock(struct fastlock *lock)
|
||||||
{
|
{
|
||||||
if ((int)__atomic_load_4(&lock->m_pidOwner, __ATOMIC_ACQUIRE) == gettid())
|
if ((int)__atomic_load_4(&lock->m_pidOwner, __ATOMIC_ACQUIRE) == gettid())
|
||||||
@ -75,6 +76,9 @@ extern "C" void fastlock_lock(struct fastlock *lock)
|
|||||||
{
|
{
|
||||||
if ((++cloops % 1024*1024) == 0)
|
if ((++cloops % 1024*1024) == 0)
|
||||||
sched_yield();
|
sched_yield();
|
||||||
|
#if defined(__i386__) || defined(__amd64__)
|
||||||
|
__asm__ ("pause");
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
lock->m_depth = 1;
|
lock->m_depth = 1;
|
||||||
@ -107,6 +111,7 @@ extern "C" int fastlock_trylock(struct fastlock *lock)
|
|||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
extern "C" void fastlock_unlock(struct fastlock *lock)
|
extern "C" void fastlock_unlock(struct fastlock *lock)
|
||||||
{
|
{
|
||||||
|
106
src/fastlock_x64.asm
Normal file
106
src/fastlock_x64.asm
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
section .text
|
||||||
|
|
||||||
|
extern gettid
|
||||||
|
extern sched_yield
|
||||||
|
|
||||||
|
; This is the first use of assembly in this codebase, a valid question is WHY?
|
||||||
|
; The spinlock we implement here is performance critical, and simply put GCC
|
||||||
|
; emits awful code. The original C code is left in fastlock.cpp for reference
|
||||||
|
; and x-plat. The code generated for the unlock case is reasonable and left in
|
||||||
|
; C++.
|
||||||
|
|
||||||
|
ALIGN 16
|
||||||
|
global fastlock_lock
|
||||||
|
fastlock_lock:
|
||||||
|
; RDI points to the struct:
|
||||||
|
; uint16_t active
|
||||||
|
; uint16_t avail
|
||||||
|
; int32_t m_pidOwner
|
||||||
|
; int32_t m_depth
|
||||||
|
|
||||||
|
; First get our TID and put it in ecx
|
||||||
|
push rdi ; we need our struct pointer (also balance the stack for the call)
|
||||||
|
call gettid ; get our thread ID (TLS is nasty in ASM so don't bother inlining)
|
||||||
|
mov esi, eax ; back it up in esi
|
||||||
|
mov rdi, [rsp] ; get our pointer back
|
||||||
|
|
||||||
|
cmp [rdi+4], esi ; Is the TID we got back the owner of the lock?
|
||||||
|
je .LRecursive ; Don't spin in that case
|
||||||
|
|
||||||
|
xor eax, eax ; eliminate partial register dependency
|
||||||
|
mov ax, 1 ; we want to add one
|
||||||
|
lock xadd [rdi+2], ax ; do the xadd, ax contains the value before the addition
|
||||||
|
; eax now contains the ticket
|
||||||
|
xor ecx, ecx
|
||||||
|
ALIGN 16
|
||||||
|
.Loop:
|
||||||
|
cmp [rdi], ax ; is our ticket up?
|
||||||
|
je .LDone ; leave the loop
|
||||||
|
add ecx, 1000h ; Have we been waiting a long time? (oflow if we have)
|
||||||
|
; 1000h is set so we overflow on the 1024*1024'th iteration (like the C code)
|
||||||
|
jc .LYield ; If so, give up our timeslice to someone who's doing real work
|
||||||
|
pause ; be nice to other hyperthreads
|
||||||
|
jmp .Loop ; maybe next time we'll get our turn
|
||||||
|
.LDone:
|
||||||
|
mov [rdi+4], esi ; lock->m_pidOwner = gettid()
|
||||||
|
mov dword [rdi+8], 1 ; lock->m_depth = 1
|
||||||
|
add rsp, 8 ; fix stack
|
||||||
|
ret
|
||||||
|
.LYield:
|
||||||
|
; Like the compiler, you're probably thinking: "Hey! I should take these pushs out of the loop"
|
||||||
|
; But the compiler doesn't know that we rarely hit this, and when we do we know the lock is
|
||||||
|
; taking a long time to be released anyways. We optimize for the common case of short
|
||||||
|
; lock intervals. That's why we're using a spinlock in the first place
|
||||||
|
push rsi
|
||||||
|
push rax
|
||||||
|
mov rax, 24 ; sys_sched_yield
|
||||||
|
syscall ; give up our timeslice we'll be here a while
|
||||||
|
pop rax
|
||||||
|
pop rsi
|
||||||
|
mov rdi, [rsp] ; our struct pointer is on the stack already
|
||||||
|
xor ecx, ecx ; Reset our loop counter
|
||||||
|
jmp .Loop ; Get back in the game
|
||||||
|
.LRecursive:
|
||||||
|
add dword [rdi+8], 1 ; increment the depth counter
|
||||||
|
add rsp, 8 ; fix the stack
|
||||||
|
ret
|
||||||
|
|
||||||
|
ALIGN 16
|
||||||
|
global fastlock_trylock
|
||||||
|
fastlock_trylock:
|
||||||
|
; RDI points to the struct:
|
||||||
|
; uint16_t active
|
||||||
|
; uint16_t avail
|
||||||
|
; int32_t m_pidOwner
|
||||||
|
; int32_t m_depth
|
||||||
|
|
||||||
|
; First get our TID and put it in ecx
|
||||||
|
push rdi ; we need our struct pointer (also balance the stack for the call)
|
||||||
|
call gettid ; get our thread ID (TLS is nasty in ASM so don't bother inlining)
|
||||||
|
mov esi, eax ; back it up in esi
|
||||||
|
pop rdi ; get our pointer back
|
||||||
|
|
||||||
|
cmp [rdi+4], esi ; Is the TID we got back the owner of the lock?
|
||||||
|
je .LRecursive ; Don't spin in that case
|
||||||
|
|
||||||
|
mov eax, [rdi] ; get both active and avail counters
|
||||||
|
mov ecx, eax ; duplicate in ecx
|
||||||
|
ror ecx, 16 ; swap upper and lower 16-bits
|
||||||
|
cmp eax, ecx ; are the upper and lower 16-bits the same?
|
||||||
|
jnz .LAlreadyLocked ; If not return failure
|
||||||
|
|
||||||
|
; at this point we know eax+ecx have [avail][active] and they are both the same
|
||||||
|
add ecx, 10000h ; increment avail, ecx is now our wanted value
|
||||||
|
lock cmpxchg [rdi], ecx ; If rdi still contains the value in eax, put in ecx (inc avail)
|
||||||
|
jnz .LAlreadyLocked ; If Z is not set then someone locked it while we were preparing
|
||||||
|
mov eax, 1 ; return SUCCESS!
|
||||||
|
mov [rdi+4], esi ; lock->m_pidOwner = gettid()
|
||||||
|
mov dword [rdi+8], eax ; lock->m_depth = 1
|
||||||
|
ret
|
||||||
|
.LAlreadyLocked:
|
||||||
|
xor eax, eax ; return 0 for failure
|
||||||
|
ret
|
||||||
|
.LRecursive:
|
||||||
|
add dword [rdi+8], 1 ; increment the depth counter
|
||||||
|
mov eax, 1 ; we successfully got the lock
|
||||||
|
ret
|
Loading…
x
Reference in New Issue
Block a user