section .text

extern gettid
extern sched_yield
extern g_longwaits

; This is the first use of assembly in this codebase, a valid question is WHY?
;   The spinlock we implement here is performance critical, and simply put GCC
;   emits awful code.  The original C code is left in fastlock.cpp for reference
;   and x-plat.

ALIGN 16
global fastlock_lock
fastlock_lock:
        ; RDI points to the struct:
        ;   uint16_t active
        ;   uint16_t avail
        ;   int32_t m_pidOwner
        ;   int32_t m_depth

        ; First get our TID and stash it in esi
        push rdi                ; we need our struct pointer (also balance the stack for the call)
        call gettid             ; get our thread ID (TLS is nasty in ASM so don't bother inlining)
        mov esi, eax            ; back it up in esi
        pop rdi                 ; get our pointer back

        cmp [rdi+4], esi        ; Is the TID we got back the owner of the lock?
        je .LLocked             ; Don't spin in that case

        xor ecx, ecx            ; zero the spin counter so the overflow check below starts from a known state
        xor eax, eax            ; eliminate partial register dependency
        inc eax                 ; we want to add one
        lock xadd [rdi+2], ax   ; do the xadd, ax contains the value before the addition
        ; ax now contains the ticket
ALIGN 16
.LLoop:
        mov edx, [rdi]
        cmp dx, ax              ; is our ticket up?
        je .LLocked             ; leave the loop
        pause
        add ecx, 1000h          ; Have we been waiting a long time? (oflow if we have)
                                ;   1000h is set so we overflow on the 1024*1024'th iteration (like the C code)
        jnc .LLoop              ; no overflow yet, keep spinning
        ; If we get here we've been waiting a long time: give up our timeslice
        ; and sleep on a futex until the unlocking thread wakes us.
        ; Like the compiler, you're probably thinking: "Hey! I should take these pushs out of the loop"
        ;   But the compiler doesn't know that we rarely hit this, and when we do we know the lock is
        ;   taking a long time to be released anyways.  We optimize for the common case of short
        ;   lock intervals.  That's why we're using a spinlock in the first place
        inc edx                 ; compute the next ticket to be served
        cmp dx, ax              ; is our ticket the very next in line?
        je .LLoop               ; if so don't sleep, the wait should be short - keep spinning
        dec edx                 ; restore the current ticket (the futex compare value)
.LFutexWait:
        push rsi
        push rax
        ; Setup the syscall args
        ;   rdi ARG1 futex (already in rdi)
        mov esi, (9 | 128)      ; rsi ARG2 FUTEX_WAIT_BITSET_PRIVATE
                                ; rdx ARG3 ticketT.u (already in edx)
        xor r10d, r10d          ; r10 ARG4 NULL
        mov r8, rdi             ; r8 ARG5 dup rdi
        xor r9d, r9d
        bts r9d, eax            ; r9 ARG6 mask (our ticket's bit, mod 32)
        mov eax, 202            ; sys_futex
        ; Do the syscall
        lock or [rdi+12], r9d   ; inform the unlocking thread we're waiting
        syscall                 ; wait for the futex
        not r9d                 ; convert our flag into a mask of bits not to touch
        lock and [rdi+12], r9d  ; clear the flag in the futex control mask
        ; cleanup and continue
        mov rcx, g_longwaits
        inc qword [rcx]         ; increment our long wait counter
        pop rax
        pop rsi
        xor ecx, ecx            ; Reset our loop counter
        jmp .LLoop              ; Get back in the game
ALIGN 16
.LLocked:
        mov [rdi+4], esi        ; lock->m_pidOwner = gettid()
        inc dword [rdi+8]       ; lock->m_depth++
        ret
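; For readers cross-referencing fastlock.cpp: the acquire path above is roughly
; the following C (a paraphrase of this file's logic for illustration, not a
; copy of the real C code; "futexwaiters" names the 32-bit field at offset 12,
; which the struct comment above omits, and futex_wait_bitset() stands in for
; the raw sys_futex call):
;
;   uint16_t ticket = atomic_fetch_add(&lock->avail, 1);   // lock xadd [rdi+2]
;   unsigned spins = 0;
;   while (atomic_load(&lock->active) != ticket) {
;       _mm_pause();
;       if (++spins == 1024*1024) {                        // the 1000h overflow trick
;           uint32_t mask = 1u << (ticket % 32);
;           atomic_fetch_or(&lock->futexwaiters, mask);    // lock or [rdi+12]
;           futex_wait_bitset(lock, mask);                 // syscall 202
;           atomic_fetch_and(&lock->futexwaiters, ~mask);  // lock and [rdi+12]
;           g_longwaits++;
;           spins = 0;
;       }
;   }
;   lock->m_pidOwner = gettid();
;   lock->m_depth++;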
ALIGN 16
global fastlock_trylock
fastlock_trylock:
        ; RDI points to the struct:
        ;   uint16_t active
        ;   uint16_t avail
        ;   int32_t m_pidOwner
        ;   int32_t m_depth

        ; First get our TID and stash it in esi
        push rdi                ; we need our struct pointer (also balance the stack for the call)
        call gettid             ; get our thread ID (TLS is nasty in ASM so don't bother inlining)
        mov esi, eax            ; back it up in esi
        pop rdi                 ; get our pointer back

        cmp [rdi+4], esi        ; Is the TID we got back the owner of the lock?
        je .LRecursive          ; Don't spin in that case

        mov eax, [rdi]          ; get both active and avail counters
        mov ecx, eax            ; duplicate in ecx
        ror ecx, 16             ; swap upper and lower 16-bits
        cmp eax, ecx            ; are the upper and lower 16-bits the same?
        jnz .LAlreadyLocked     ; If not return failure
        ; at this point eax and ecx both hold [avail][active] with active == avail (the lock is free)
        add ecx, 10000h         ; increment avail, ecx is now our wanted value
        lock cmpxchg [rdi], ecx ; If rdi still contains the value in eax, put in ecx (inc avail)
        jnz .LAlreadyLocked     ; If Z is not set then someone locked it while we were preparing
        xor eax, eax
        inc eax                 ; return SUCCESS! (eax=1)
        mov [rdi+4], esi        ; lock->m_pidOwner = gettid()
        mov dword [rdi+8], eax  ; lock->m_depth = 1
        ret
ALIGN 16
.LRecursive:
        xor eax, eax
        inc eax                 ; return SUCCESS! (eax=1)
        inc dword [rdi+8]       ; lock->m_depth++
        ret
ALIGN 16
.LAlreadyLocked:
        xor eax, eax            ; return 0;
        ret

ALIGN 16
global fastlock_unlock
fastlock_unlock:
        ; RDI points to the struct:
        ;   uint16_t active
        ;   uint16_t avail
        ;   int32_t m_pidOwner
        ;   int32_t m_depth
        push r11
        sub dword [rdi+8], 1    ; decrement m_depth, don't use dec because it partially writes the flag register and we don't know its state
        jnz .LDone              ; if depth is non-zero this is a recursive unlock, and we still hold it
        mov dword [rdi+4], -1   ; pidOwner = -1 (we don't own it anymore)
        mov ecx, [rdi]          ; get current active (this one)
        inc ecx                 ; bump it to the next thread
        mov [rdi], cx           ; give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
        mfence                  ; make the ticket store visible before we read the futex mask below (StoreLoad barrier, avoids a missed wakeup)
        ; At this point the lock is released, however we must wake up any pending futex waiters
        mov r9d, 1              ; r9d is the bitmask for the next thread
        rol r9d, cl             ; place the mask in the right spot for the next thread (bit = ticket mod 32)
ALIGN 16
.LRetryWake:
        mov r11d, [rdi+12]      ; load the futex mask
        and r11d, r9d           ; are any threads waiting on a futex?
        jz .LDone               ; if not we're done.
        ; we have to wake the futex waiters
        ;   rdi ARG1 futex (already in rdi)
        mov esi, (10 | 128)     ; rsi ARG2 FUTEX_WAKE_BITSET_PRIVATE
        mov edx, 0x7fffffff     ; rdx ARG3 INT_MAX (number of threads to wake)
        xor r10d, r10d          ; r10 ARG4 NULL
        mov r8, rdi             ; r8 ARG5 dup rdi
                                ; r9 ARG6 mask (already set above)
        mov eax, 202            ; sys_futex
        syscall
        cmp eax, 1              ; did we wake as many as we expected?
        jnz .LRetryWake
.LDone:
        pop r11
        ret
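; For completeness, a sketch of how these entry points are typically declared
; on the C++ side (assumed for illustration; the authoritative declarations
; live alongside fastlock.cpp in this codebase):
;
;   extern "C" void fastlock_lock(struct fastlock *lock);
;   extern "C" int  fastlock_trylock(struct fastlock *lock);  // 1 on success, 0 if contended
;   extern "C" void fastlock_unlock(struct fastlock *lock);
;
; All three follow the System V AMD64 calling convention: the struct pointer
; arrives in RDI and integer results return in EAX.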