diff --git a/src/fastlock.cpp b/src/fastlock.cpp
index a1499d77b..f265f3908 100644
--- a/src/fastlock.cpp
+++ b/src/fastlock.cpp
@@ -111,7 +111,6 @@ extern "C" int fastlock_trylock(struct fastlock *lock)
     }
     return false;
 }
-#endif
 
 extern "C" void fastlock_unlock(struct fastlock *lock)
 {
@@ -121,9 +120,10 @@ extern "C" void fastlock_unlock(struct fastlock *lock)
         assert((int)__atomic_load_4(&lock->m_pidOwner, __ATOMIC_RELAXED) >= 0);    // unlock after free
         lock->m_pidOwner = -1;
         std::atomic_thread_fence(std::memory_order_acquire);
-        __atomic_fetch_add(&lock->m_ticket.m_active, 1, __ATOMIC_ACQ_REL);
+        __atomic_fetch_add(&lock->m_ticket.m_active, 1, __ATOMIC_ACQ_REL);         // on x86 the atomic is not required here, but ASM handles that case
     }
 }
+#endif
 
 extern "C" void fastlock_free(struct fastlock *lock)
 {
diff --git a/src/fastlock_x64.asm b/src/fastlock_x64.asm
index d2a1a90b3..1b876350f 100644
--- a/src/fastlock_x64.asm
+++ b/src/fastlock_x64.asm
@@ -6,8 +6,7 @@ extern sched_yield
 ; This is the first use of assembly in this codebase, a valid question is WHY?
 ; The spinlock we implement here is performance critical, and simply put GCC
 ; emits awful code. The original C code is left in fastlock.cpp for reference
-; and x-plat. The code generated for the unlock case is reasonable and left in
-; C++.
+; and x-plat.
 
 ALIGN 16
 global fastlock_lock
@@ -103,3 +102,18 @@ ALIGN 16
 .LAlreadyLocked:
         xor eax, eax                ; return 0;
         ret
+
+ALIGN 16
+global fastlock_unlock
+fastlock_unlock:
+        ; RDI points to the struct:
+        ;       uint16_t active
+        ;       uint16_t avail
+        ;       int32_t m_pidOwner
+        ;       int32_t m_depth
+        sub dword [rdi+8], 1        ; decrement m_depth, don't use dec because it partially writes the flag register and we don't know its state
+        jnz .LDone                  ; if depth is non-zero this is a recursive unlock, and we still hold it
+        mov dword [rdi+4], -1       ; pidOwner = -1 (we don't own it anymore)
+        inc word [rdi]              ; give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
+.LDone:
+        ret
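
For anyone comparing the new assembly against the C++ reference path above, here is a minimal C++ sketch of the same unlock sequence. It is an illustration only: the struct and function names below (fastlock_sketch, fastlock_unlock_sketch) are hypothetical, the layout is taken from the comments in the asm (active at +0, avail at +2, m_pidOwner at +4, m_depth at +8), and the real code reaches the ticket through lock->m_ticket.m_active rather than a flattened field.

// Illustrative sketch only -- layout and names are assumed from the asm
// comments, not copied from the repo's headers.
#include <cstdint>

struct fastlock_sketch
{
    uint16_t active;        // ticket currently being served   (offset 0)
    uint16_t avail;         // next ticket to hand out         (offset 2)
    int32_t  m_pidOwner;    // owner thread id, -1 when unowned (offset 4)
    int32_t  m_depth;       // recursion depth of the owner     (offset 8)
};

void fastlock_unlock_sketch(fastlock_sketch *lock)
{
    // Mirrors: sub dword [rdi+8], 1 / jnz .LDone
    if (--lock->m_depth != 0)
        return;                     // recursive unlock, we still hold the lock

    // Mirrors: mov dword [rdi+4], -1
    lock->m_pidOwner = -1;          // we no longer own the lock

    // Mirrors: inc word [rdi]. The ACQ_REL RMW (same GCC builtin the C++
    // reference path uses) supplies the release ordering that hands the lock
    // to the next ticket holder; the asm gets away with a plain inc because
    // of x86 store ordering and because only the owner ever writes 'active'.
    __atomic_fetch_add(&lock->active, 1, __ATOMIC_ACQ_REL);
}

The point of the asm version is that none of these three writes needs a locked instruction: m_depth and m_pidOwner are only ever touched by the current owner, and the final increment of active is what actually publishes the unlock to spinning waiters.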