futriix/src/fastlock_x64.asm

.intel_syntax noprefix
.text
.extern gettid
.extern fastlock_sleep
.extern g_fHighCpuPressure
# This is the first use of assembly in this codebase, so a fair question is: why?
# The spinlock implemented here is performance critical, and, simply put, GCC
# emits awful code for it. The original C code is kept in fastlock.cpp for
# reference and for cross-platform builds.
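#
# For orientation, here is a C-level sketch of the lock layout implied by the
# offsets used throughout this file. Only m_pidOwner, m_depth, active and avail
# are named in the comments below; the 56-byte gap and the futex-mask field are
# inferred from the offsets ([rdi+64], [rdi+66], [rdi+68]) and are illustrative:
#
#   struct fastlock {
#       volatile int32_t  m_pidOwner;   // [rdi+0]  TID of the owner, -1 when unowned
#       volatile int32_t  m_depth;      // [rdi+4]  recursion depth
#       char              other[56];    //          other fields/padding, not touched here
#       volatile uint16_t active;       // [rdi+64] ticket currently being served
#       volatile uint16_t avail;        // [rdi+66] next ticket to hand out
#       volatile uint32_t futexmask;    // [rdi+68] bitmask of tickets sleeping on a futex
#   };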
.ALIGN 16
.global fastlock_lock
.type fastlock_lock,@function
fastlock_lock:
.cfi_startproc
.cfi_def_cfa rsp, 8
# RDI points to the struct:
# int32_t m_pidOwner
# int32_t m_depth
# [rdi+64] ...
# uint16_t active
# uint16_t avail
#
# RSI points to a spin function to call, or NULL
# First get our TID (gettid returns it in eax; we keep it in esi)
sub rsp, 24 # We only use 16 bytes, but the extra 8 keep rsp 16-byte aligned for the call (the return address left it at 8 mod 16)
.cfi_adjust_cfa_offset 24
mov [rsp], rdi # save our struct pointer across the call
mov [rsp+8], rsi # backup the spin function
call gettid # get our thread ID (TLS is nasty in ASM so don't bother inlining)
mov esi, eax # back it up in esi
mov rdi, [rsp] # Restore spin struct
mov r8, [rsp+8] # restore the function (in a different register)
add rsp, 24
.cfi_adjust_cfa_offset -24
cmp [rdi], esi # Is the TID we got back the owner of the lock?
je .LLocked # Don't spin in that case
mov r9d, 0x1000 # a step of 0x1000 makes the 32-bit counter overflow on the 1024*1024'th iteration (like the C code)
mov eax, [rip+g_fHighCpuPressure]
test eax, eax
jz .LNoTestMode
mov r9d, 0x10000 # under high CPU pressure use a bigger step so we give up our timeslice much sooner
.LNoTestMode:
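# Worked out: ecx is 32 bits, so a step of 0x1000 wraps it after 2^32 / 2^12 = 2^20
# = 1,048,576 spins, while a step of 0x10000 wraps it after 2^32 / 2^16 = 65,536 spins.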
xor eax, eax # eliminate partial register dependency
inc eax # we want to add one
lock xadd [rdi+66], ax # do the xadd, ax contains the value before the addition
# ax now contains the ticket
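# For reference, the xadd above is a plain atomic post-increment of avail; a C-level
# equivalent, assuming the layout sketched at the top of the file (the real C code
# lives in fastlock.cpp), would be roughly:
#
#   uint16_t myticket = __atomic_fetch_add(&lock->avail, 1, __ATOMIC_SEQ_CST);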
# OK Start the wait loop
xor ecx, ecx
test r8, r8
jnz .LLoopFunction
.ALIGN 16
.LLoop:
mov edx, [rdi+64]
cmp dx, ax # is our ticket up?
je .LLocked # leave the loop
pause
add ecx, r9d # Have we been waiting a long time? (the counter overflows if we have)
jnc .LLoop # no overflow: keep spinning; on overflow, fall through and give up our timeslice to someone doing real work
# Like the compiler, you're probably thinking: "Hey! I should hoist these pushes out of the loop."
# But the compiler doesn't know that we rarely get here, and when we do the lock is
# already taking a long time to be released. We optimize for the common case of short
# lock intervals; that's why we're using a spinlock in the first place.
# If we get here we're going to sleep in the kernel with a futex
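# The sleep is done by calling fastlock_sleep with the SysV argument registers set
# up below. Its prototype, inferred from this call site (check fastlock.cpp for the
# real declaration), is presumably something like:
#
#   void fastlock_sleep(fastlock *lock /*rdi*/, pid_t tid /*rsi*/,
#                       unsigned ticket_word /*rdx*/, unsigned myticket /*rcx*/);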
push rdi
push rsi
push rax
.cfi_adjust_cfa_offset 24
# Set up the arguments for fastlock_sleep (which does the futex wait)
# rdi ARG1 futex (already in rdi)
# rsi ARG2 tid (already in esi)
# rdx ARG3 ticketT.u (already in edx)
mov ecx, eax # rcx ARG4 myticket
call fastlock_sleep
# cleanup and continue
pop rax
pop rsi
pop rdi
.cfi_adjust_cfa_offset -24
xor ecx, ecx # Reset our loop counter
jmp .LLoop # Get back in the game
.ALIGN 16
.LLocked:
mov [rdi], esi # lock->m_pidOwner = gettid()
inc dword ptr [rdi+4] # lock->m_depth++
ret
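# The alternate wait loop below is used when the caller passed a spin function in
# RSI (held in r8). A rough C-level sketch, with the spin function's exact signature
# left to the C++ callers (only its int return value is relied on here):
#
#   while (lock->active != myticket) {
#       if (spinfn() == 0)            // returning 0 means "stop calling me"
#           goto pause_futex_loop;    // i.e. the .LLoop path above
#   }
#   /* our ticket is up: fall through to .LLocked */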
.LLoopFunction:
sub rsp, 40 # spill area for our locals; also keeps rsp 16-byte aligned for the call below
.cfi_adjust_cfa_offset 40
xor ecx, ecx
mov [rsp], rcx # zero the scratch slot
mov [rsp+8], r8 # save the spin function pointer
mov [rsp+16], rdi # save the lock struct pointer
mov [rsp+24], rsi # save our TID
mov [rsp+32], eax # save our ticket
.LLoopFunctionCore:
mov edx, [rdi+64]
cmp dx, ax # is our ticket up?
je .LExitLoopFunction
mov r8, [rsp+8] # reload the spin function (r8 is caller-saved and may be clobbered by the call)
call r8
test eax, eax # did the spin function return 0 ("stop calling me")?
jz .LExitLoopFunctionForNormal # yes: fall back to the normal pause/futex loop
mov eax, [rsp+32] # restore our ticket (eax was clobbered by the call)
mov rdi, [rsp+16] # restore the lock struct pointer
jmp .LLoopFunctionCore
.LExitLoopFunction:
mov rsi, [rsp+24] # restore our TID (needed by .LLocked)
add rsp, 40
.cfi_adjust_cfa_offset -40
jmp .LLocked
.LExitLoopFunctionForNormal:
xor ecx, ecx # reset the spin counter
mov rdi, [rsp+16] # restore the lock struct pointer
mov rsi, [rsp+24] # restore our TID
mov eax, [rsp+32] # restore our ticket
add rsp, 40
.cfi_adjust_cfa_offset -40
jmp .LLoop
.cfi_endproc
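# fastlock_trylock attempts a single compare-exchange on the packed {active, avail}
# word. Ignoring the recursive-ownership fast path it handles first, a C-level sketch
# of the idea (field names follow the layout sketch at the top of this file; the
# builtin is used only for illustration):
#
#   unsigned snapshot = *(volatile unsigned *)&lock->active;   // [avail:active]
#   if ((snapshot >> 16) != (snapshot & 0xFFFF))
#       return 0;                                // someone holds or is queued for the lock
#   unsigned want = snapshot + 0x10000;          // same word with avail incremented
#   if (!__atomic_compare_exchange_n((unsigned *)&lock->active, &snapshot, want,
#                                    false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
#       return 0;                                // lost the race
#   lock->m_pidOwner = gettid();
#   lock->m_depth = 1;
#   return 1;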
.ALIGN 16
.global fastlock_trylock
.type fastlock_trylock,@function
fastlock_trylock:
# RDI points to the struct:
# int32_t m_pidOwner
# int32_t m_depth
# [rdi+64] ...
# uint16_t active
# uint16_t avail
# First get our TID (gettid returns it in eax; we keep it in esi)
push rdi # save our struct pointer (the push also keeps the stack 16-byte aligned for the call)
call gettid # get our thread ID (TLS is nasty in ASM so don't bother inlining)
mov esi, eax # back it up in esi
pop rdi # get our pointer back
cmp [rdi], esi # Is the TID we got back the owner of the lock?
je .LRecursive # Don't spin in that case
mov eax, [rdi+64] # get both active and avail counters
mov ecx, eax # duplicate in ecx
ror ecx, 16 # swap upper and lower 16-bits
cmp eax, ecx # are the upper and lower 16-bits the same?
jnz .LAlreadyLocked # if not, the lock is held or contended; return failure
# at this point eax and ecx both hold [avail:active] and the two halves are equal
add ecx, 0x10000 # increment avail; ecx is now the value we want to store
lock cmpxchg [rdi+64], ecx # if [rdi+64] still equals eax, store ecx (incrementing avail)
jnz .LAlreadyLocked # if ZF is clear someone took the lock while we were preparing
xor eax, eax
inc eax # return SUCCESS! (eax=1)
mov [rdi], esi # lock->m_pidOwner = gettid()
mov dword ptr [rdi+4], eax # lock->m_depth = 1
ret
.ALIGN 16
.LRecursive:
xor eax, eax
inc eax # return SUCCESS! (eax=1)
inc dword ptr [rdi+4] # lock->m_depth++
ret
.ALIGN 16
.LAlreadyLocked:
xor eax, eax # return 0
ret
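# fastlock_unlock releases one level of the recursive lock; when the outermost level
# is released it hands the lock to the next ticket and wakes any sleeper. A rough
# C-level sketch, with field names as in the layout sketch at the top of the file
# (unlock_futex does the actual futex wake on the C++ side; its signature is assumed):
#
#   if (--lock->m_depth == 0) {
#       lock->m_pidOwner = -1;
#       uint16_t next = lock->active + 1;
#       lock->active = next;                     // hand the lock to the next ticket
#       /* full memory barrier, then: */
#       if (lock->futexmask & (1u << (next & 31)))
#           unlock_futex(lock, next);            // wake the waiter sleeping on that ticket
#   }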
.ALIGN 16
.global fastlock_unlock
.type fastlock_unlock,@function
fastlock_unlock:
# RDI points to the struct:
# int32_t m_pidOwner
# int32_t m_depth
# [rdi+64] ...
# uint16_t active
# uint16_t avail
sub dword ptr [rdi+4], 1 # decrement m_depth; sub is used instead of dec because dec does a partial flag-register write (it leaves CF untouched) and we don't know the prior flag state
jnz .LDone # non-zero depth means this was a recursive unlock and we still hold the lock
mov dword ptr [rdi], -1 # pidOwner = -1 (we don't own it anymore)
mov esi, [rdi+64] # get current active (this one)
inc esi # bump it to the next thread
mov word ptr [rdi+64], si # give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
mfence # full barrier: make the ticket handoff above globally visible before we read the futex mask below
# At this point the lock is removed, however we must wake up any pending futexs
mov edx, [rdi+64+4] # load the futex mask
bt edx, esi # is the next ticket waiting on a futex? (bt on a register takes the bit index mod 32)
jc unlock_futex # unlock the futex if necessary
ret # if not we're done.
.ALIGN 16
.LDone:
js fastlock_panic # panic if we made m_depth negative (flags are still those of the sub above; jnz doesn't modify them)
ret