.intel_syntax noprefix
.text
.extern gettid
.extern fastlock_sleep
# This is the first use of assembly in this codebase, a valid question is WHY?
# The spinlock we implement here is performance critical, and simply put GCC
# emits awful code. The original C code is left in fastlock.cpp for reference
# and x-plat.
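
# For orientation, the offsets used below imply a layout roughly like this
# (a sketch only; the authoritative C++ definition lives alongside fastlock.cpp):
#   struct fastlock {
#       volatile int m_pidOwner;   // [rdi+0]
#       volatile int m_depth;      // [rdi+4]
#       // ... padding up to [rdi+64] ...
#       uint16_t active;           // [rdi+64] \ read together as the 32-bit
#       uint16_t avail;            // [rdi+66] / ticket word ("ticketT.u" below)
#   };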
.ALIGN 16
.global fastlock_lock
.type fastlock_lock,@function
fastlock_lock:
.cfi_startproc
.cfi_def_cfa rsp, 8
# RDI points to the struct:
# int32_t m_pidOwner
# int32_t m_depth
# [rdi+64] ...
# uint16_t active
# uint16_t avail
# First get our TID and put it in esi
push rdi # we need our struct pointer (also balance the stack for the call)
.cfi_adjust_cfa_offset 8
call gettid # get our thread ID (TLS is nasty in ASM so don't bother inlining)
mov esi, eax # back it up in esi
pop rdi # get our pointer back
.cfi_adjust_cfa_offset -8
cmp [rdi], esi # Is the TID we got back the owner of the lock?
je .LLocked # Don't spin in that case
xor eax, eax # eliminate partial register dependency
inc eax # we want to add one
lock xadd [rdi+66], ax # do the xadd, ax contains the value before the addition
# ax now contains the ticket
# OK Start the wait loop
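# In rough C terms the fast path here is (a sketch; fastlock.cpp has the real
# cross-platform version):
#   uint16_t ticket = atomic_fetch_add(&avail, 1);   // the xadd above
#   while (active != ticket)
#       _mm_pause();                                  // plus a futex sleep after ~1M spins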
xor ecx, ecx
.ALIGN 16
.LLoop:
mov edx, [rdi+64]
cmp dx, ax # is our ticket up?
je .LLocked # leave the loop
pause
add ecx, 0x1000 # Have we been waiting a long time? (oflow if we have)
# 1000h is set so we overflow on the 1024*1024'th iteration (like the C code)
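# (0x1000 = 2^12, so 2^32 / 2^12 = 2^20 = 1024*1024 adds before the 32-bit add carries)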
jnc .LLoop # If so, give up our timeslice to someone who's doing real work
# Like the compiler, you're probably thinking: "Hey! I should take these pushes out of the loop"
# But the compiler doesn't know that we rarely hit this, and when we do we know the lock is
# taking a long time to be released anyways. We optimize for the common case of short
# lock intervals. That's why we're using a spinlock in the first place
# If we get here we're going to sleep in the kernel with a futex
push rdi
push rsi
push rax
.cfi_adjust_cfa_offset 24
# Setup the args for the call to fastlock_sleep (it issues the futex wait for us)
# rdi ARG1 futex (already in rdi)
# rsi ARG2 tid (already in esi)
# rdx ARG3 ticketT.u (already in edx)
bts ecx, eax # rcx ARG4 mask (bts sets bit eax%32, i.e. 1 << (ticket % 32))
call fastlock_sleep
# cleanup and continue
pop rax
pop rsi
pop rdi
.cfi_adjust_cfa_offset -24
xor ecx, ecx # Reset our loop counter
jmp .LLoop # Get back in the game
.ALIGN 16
.LLocked:
mov [rdi], esi # lock->m_pidOwner = gettid()
inc dword ptr [rdi+4] # lock->m_depth++
ret
.cfi_endproc
.ALIGN 16
.global fastlock_trylock
.type fastlock_trylock,@function
fastlock_trylock:
# RDI points to the struct:
# int32_t m_pidOwner
# int32_t m_depth
# [rdi+64] ...
# uint16_t active
# uint16_t avail
# First get our TID and put it in esi
push rdi # we need our struct pointer (also balance the stack for the call)
call gettid # get our thread ID (TLS is nasty in ASM so don't bother inlining)
mov esi, eax # back it up in esi
pop rdi # get our pointer back
cmp [rdi], esi # Is the TID we got back the owner of the lock?
je .LRecursive # Don't spin in that case
mov eax, [rdi+64] # get both active and avail counters
mov ecx, eax # duplicate in ecx
ror ecx, 16 # swap upper and lower 16-bits
cmp eax, ecx # are the upper and lower 16-bits the same?
jnz .LAlreadyLocked # If not return failure
# at this point we know eax+ecx have [avail][active] and they are both the same
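# The next two instructions are, in rough C terms (a sketch, treating [rdi+64] as the
# 32-bit word ticketT.u = (avail << 16) | active):
#   expected = ticketT.u;              // known here to have avail == active (no waiters)
#   desired  = expected + 0x10000;     // bump avail, i.e. claim the next ticket ourselves
#   if (!CAS(&ticketT.u, expected, desired)) goto AlreadyLocked;   // pseudo-CAS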
add ecx, 0x10000 # increment avail, ecx is now our wanted value
lock cmpxchg [rdi+64], ecx # If rdi still contains the value in eax, put in ecx (inc avail)
jnz .LAlreadyLocked # If Z is not set then someone locked it while we were preparing
xor eax, eax
inc eax # return SUCCESS! (eax=1)
mov [rdi], esi # lock->m_pidOwner = gettid()
mov dword ptr [rdi+4], eax # lock->m_depth = 1
ret
.ALIGN 16
.LRecursive:
xor eax, eax
inc eax # return SUCCESS! (eax=1)
inc dword ptr [rdi+4] # lock->m_depth++
ret
.ALIGN 16
.LAlreadyLocked:
xor eax, eax # return 0
ret
.ALIGN 16
.global fastlock_unlock
fastlock_unlock:
# RDI points to the struct:
# int32_t m_pidOwner
# int32_t m_depth
# [rdi+64] ...
# uint16_t active
# uint16_t avail
push r11
sub dword ptr [rdi+4], 1 # decrement m_depth, don't use dec because it partially writes the flag register and we don't know its state
jnz .LDone # if depth is non-zero this is a recursive unlock, and we still hold it
mov dword ptr [rdi], -1 # pidOwner = -1 (we don't own it anymore)
mov ecx, [rdi+64] # get current active (this one)
inc ecx # bump it to the next thread
mov [rdi+64], cx # give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
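# (note the 16-bit store: only active at [rdi+64] changes, avail at [rdi+66] is untouched)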
mfence # sync other threads
# At this point the lock is removed, however we must wake up any pending futex waiters
mov r9d, 1 # r9d is the bitmask for 2 threads
rol r9d, cl # place the mask in the right spot for the next 2 threads
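# (i.e. r9d = 1 << (new active % 32), matching the bts-computed wait mask in fastlock_lock)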
add rdi, 64 # rdi now points to the token
.ALIGN 16
.LRetryWake:
mov r11d, [rdi+4] # load the futex mask
and r11d, r9d # are any threads waiting on a futex?
jz .LDone # if not we're done.
# we have to wake the futex waiters
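# Roughly equivalent C (a sketch; 10|128 is FUTEX_WAKE_BITSET|FUTEX_PRIVATE_FLAG and
# 202 is __NR_futex on x86-64):
#   syscall(__NR_futex, &ticketT, FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG,
#           INT_MAX, NULL, &ticketT, mask);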
# rdi ARG1 futex (already in rdi)
mov esi, (10 | 128) # rsi ARG2 FUTEX_WAKE_BITSET_PRIVATE
mov edx, 0x7fffffff # rdx ARG3 INT_MAX (number of threads to wake)
xor r10d, r10d # r10 ARG4 NULL
mov r8, rdi # r8 ARG5 dup rdi
# r9 ARG6 mask (already set above)
mov eax, 202 # sys_futex
syscall
cmp eax, 1 # did we wake as many as we expected?
jnz .LRetryWake
.LDone:
pop r11
ret