section .text

extern gettid
extern sched_yield
extern g_longwaits

; This is the first use of assembly in this codebase; a valid question is WHY?
;   The spinlock we implement here is performance critical, and simply put GCC
;   emits awful code. The original C code is left in fastlock.cpp for reference
;   and x-plat.

ALIGN 16
global fastlock_lock
fastlock_lock:
    ; RDI points to the struct:
    ;   uint16_t active
    ;   uint16_t avail
    ;   int32_t m_pidOwner
    ;   int32_t m_depth

    ; First get our TID and put it in esi
    push rdi                ; we need our struct pointer (also balance the stack for the call)
    call gettid             ; get our thread ID (TLS is nasty in ASM so don't bother inlining)
    mov esi, eax            ; back it up in esi
    mov rdi, [rsp]          ; get our pointer back

    cmp [rdi+4], esi        ; Is the TID we got back the owner of the lock?
    je .LLocked             ; Don't spin in that case

    xor eax, eax            ; eliminate partial register dependency
    inc eax                 ; we want to add one
    lock xadd [rdi+2], ax   ; do the xadd, ax contains the value before the addition
    ; eax now contains the ticket
    xor ecx, ecx
ALIGN 16
.LLoop:
    cmp [rdi], ax           ; is our ticket up?
    je .LLocked             ; leave the loop
    pause
    add ecx, 1000h          ; Have we been waiting a long time? (oflow if we have)
    ;   1000h is set so we overflow on the 1024*1024'th iteration (like the C code)
    jnc .LLoop
    ; If we overflowed, give up our timeslice to someone who's doing real work.
    ; Like the compiler, you're probably thinking: "Hey! I should take these pushes out of the loop"
    ;   But the compiler doesn't know that we rarely hit this, and when we do we know the lock is
    ;   taking a long time to be released anyways. We optimize for the common case of short
    ;   lock intervals. That's why we're using a spinlock in the first place
    push rsi
    push rax
    mov rax, 24             ; sys_sched_yield
    syscall                 ; give up our timeslice, we'll be here a while
    pop rax
    pop rsi
    mov rcx, g_longwaits
    inc qword [rcx]         ; increment our long wait counter
    mov rdi, [rsp]          ; our struct pointer is on the stack already
    xor ecx, ecx            ; Reset our loop counter
    jmp .LLoop              ; Get back in the game
ALIGN 16
.LLocked:
    mov [rdi+4], esi        ; lock->m_pidOwner = gettid()
    inc dword [rdi+8]       ; lock->m_depth++
    add rsp, 8              ; fix stack
    ret
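; For readers who would rather check the logic than the opcodes, here is a rough
; C sketch of the acquire path above. This is an illustration only, not the
; authoritative code (that lives in fastlock.cpp); the struct definition and the
; GCC atomic builtins are assumptions mirroring the layout comment above.
;
;   struct fastlock { uint16_t active; uint16_t avail;
;                     int32_t m_pidOwner; int32_t m_depth; };
;
;   void fastlock_lock(struct fastlock *lock) {
;       int tid = gettid();
;       if (lock->m_pidOwner != tid) {
;           /* take a ticket, then spin until it comes up */
;           uint16_t ticket = __atomic_fetch_add(&lock->avail, 1, __ATOMIC_ACQUIRE);
;           unsigned cloops = 0;
;           while (__atomic_load_n(&lock->active, __ATOMIC_ACQUIRE) != ticket) {
;               if ((++cloops % (1024*1024)) == 0) {  /* same threshold as the 1000h overflow trick */
;                   sched_yield();
;                   ++g_longwaits;
;               }
;           }
;       }
;       lock->m_pidOwner = tid;   /* we hold the lock now, plain stores are fine */
;       ++lock->m_depth;
;   }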
ALIGN 16
global fastlock_trylock
fastlock_trylock:
    ; RDI points to the struct:
    ;   uint16_t active
    ;   uint16_t avail
    ;   int32_t m_pidOwner
    ;   int32_t m_depth

    ; First get our TID and put it in esi
    push rdi                ; we need our struct pointer (also balance the stack for the call)
    call gettid             ; get our thread ID (TLS is nasty in ASM so don't bother inlining)
    mov esi, eax            ; back it up in esi
    pop rdi                 ; get our pointer back

    cmp [rdi+4], esi        ; Is the TID we got back the owner of the lock?
    je .LRecursive          ; Don't spin in that case

    mov eax, [rdi]          ; get both active and avail counters
    mov ecx, eax            ; duplicate in ecx
    ror ecx, 16             ; swap upper and lower 16-bits
    cmp eax, ecx            ; are the upper and lower 16-bits the same?
    jnz .LAlreadyLocked     ; If not return failure

    ; at this point we know eax and ecx have [avail][active] and they are both the same
    add ecx, 10000h         ; increment avail, ecx is now our wanted value
    lock cmpxchg [rdi], ecx ; If [rdi] still contains the value in eax, put in ecx (inc avail)
    jnz .LAlreadyLocked     ; If Z is not set then someone locked it while we were preparing
    xor eax, eax
    inc eax                 ; return SUCCESS! (eax=1)
    mov [rdi+4], esi        ; lock->m_pidOwner = gettid()
    mov dword [rdi+8], eax  ; lock->m_depth = 1
    ret
ALIGN 16
.LRecursive:
    xor eax, eax
    inc eax                 ; return SUCCESS! (eax=1)
    inc dword [rdi+8]       ; lock->m_depth++
    ret
ALIGN 16
.LAlreadyLocked:
    xor eax, eax            ; return 0;
    ret

ALIGN 16
global fastlock_unlock
fastlock_unlock:
    ; RDI points to the struct:
    ;   uint16_t active
    ;   uint16_t avail
    ;   int32_t m_pidOwner
    ;   int32_t m_depth
    sub dword [rdi+8], 1    ; decrement m_depth, don't use dec because it partially writes the flag register and we don't know its state
    jnz .LDone              ; if depth is non-zero this is a recursive unlock, and we still hold it
    mov dword [rdi+4], -1   ; pidOwner = -1 (we don't own it anymore)
    inc word [rdi]          ; give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
.LDone:
    ret
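; Again for reference only: a hedged C sketch of trylock and unlock. The struct
; definition and atomic builtins are the same assumptions as in the sketch after
; fastlock_lock; the authoritative code is in fastlock.cpp.
;
;   int fastlock_trylock(struct fastlock *lock) {
;       int tid = gettid();
;       if (lock->m_pidOwner == tid) { ++lock->m_depth; return 1; }
;       /* read [avail:active] as one 32-bit value, like the mov eax, [rdi] above */
;       uint32_t both = __atomic_load_n((uint32_t *)lock, __ATOMIC_ACQUIRE);
;       if ((both >> 16) != (both & 0xFFFF))
;           return 0;                      /* tickets differ: someone holds the lock */
;       uint32_t wanted = both + 0x10000;  /* bump avail, leave active untouched */
;       if (!__atomic_compare_exchange_n((uint32_t *)lock, &both, wanted, 0,
;                                        __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
;           return 0;                      /* someone locked it while we were preparing */
;       lock->m_pidOwner = tid;
;       lock->m_depth = 1;
;       return 1;
;   }
;
;   void fastlock_unlock(struct fastlock *lock) {
;       if (--lock->m_depth == 0) {
;           lock->m_pidOwner = -1;
;           /* release: publish active+1 so the next ticket holder may enter */
;           __atomic_fetch_add(&lock->active, 1, __ATOMIC_RELEASE);
;       }
;   }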