From dd7cbbe730f28c4e8db5f6f855e980cf4bd76eef Mon Sep 17 00:00:00 2001
From: John Sully
Date: Sat, 15 Jun 2019 23:53:34 -0400
Subject: [PATCH] Fallback to a futex if we spin for a long time

Former-commit-id: ec57b4b0248bba671e388a2257b1bd65ed8d0f44
---
 src/fastlock.cpp     | 49 +++++++++++++++++++++++++++++------
 src/fastlock.h       | 16 ++++++++++--
 src/fastlock_x64.asm | 61 ++++++++++++++++++++++++++++++++++++--------
 3 files changed, 105 insertions(+), 21 deletions(-)

diff --git a/src/fastlock.cpp b/src/fastlock.cpp
index f1e13a279..4d3327318 100644
--- a/src/fastlock.cpp
+++ b/src/fastlock.cpp
@@ -36,6 +36,8 @@
 #include
 #include
 #include
+#include <linux/futex.h>
+#include <limits.h>
 
 #ifdef __APPLE__
 #include
@@ -64,6 +66,14 @@ uint64_t fastlock_getlongwaitcount()
     return g_longwaits;
 }
 
+#ifndef ASM_SPINLOCK
+static int futex(volatile unsigned *uaddr, int futex_op, int val,
+    const struct timespec *timeout, int val3)
+{
+    return syscall(SYS_futex, uaddr, futex_op, val,
+        timeout, uaddr, val3);
+}
+#endif
 
 extern "C" pid_t gettid()
 {
@@ -88,6 +98,7 @@ extern "C" void fastlock_init(struct fastlock *lock)
     lock->m_ticket.m_avail = 0;
     lock->m_depth = 0;
     lock->m_pidOwner = -1;
+    lock->futex = 0;
 }
 
@@ -100,18 +111,24 @@ extern "C" void fastlock_lock(struct fastlock *lock)
     }
 
     unsigned myticket = __atomic_fetch_add(&lock->m_ticket.m_avail, 1, __ATOMIC_RELEASE);
-
+    unsigned mask = (1U << (myticket % 32));
     int cloops = 0;
-    while (__atomic_load_2(&lock->m_ticket.m_active, __ATOMIC_ACQUIRE) != myticket)
+    ticket ticketT;
+    while (((ticketT.u = __atomic_load_4(&lock->m_ticket.m_active, __ATOMIC_ACQUIRE)) & 0xffff) != myticket)
     {
-        if ((++cloops % 1024*1024) == 0)
-        {
-            sched_yield();
-            ++g_longwaits;
-        }
 #if defined(__i386__) || defined(__amd64__)
         __asm__ ("pause");
 #endif
+        if ((++cloops % 1024*1024) == 0)
+        {
+            if (static_cast<uint16_t>(ticketT.m_active+1U) != myticket)
+            {
+                __atomic_fetch_or(&lock->futex, mask, __ATOMIC_ACQUIRE);
+                futex(&lock->m_ticket.u, FUTEX_WAIT_BITSET_PRIVATE, ticketT.u, nullptr, mask);
+                __atomic_fetch_and(&lock->futex, ~mask, __ATOMIC_RELEASE);
+            }
+            ++g_longwaits;
+        }
     }
 
     lock->m_depth = 1;
@@ -145,6 +162,21 @@ extern "C" int fastlock_trylock(struct fastlock *lock)
     return false;
 }
 
+#define ROL32(v, shift) ((v << shift) | (v >> (32-shift)))
+void unlock_futex(struct fastlock *lock, uint16_t ifutex)
+{
+    unsigned mask = (1U << (ifutex % 32));
+    unsigned futexT = __atomic_load_4(&lock->futex, __ATOMIC_RELAXED) & mask;
+
+    if (futexT == 0)
+        return;
+
+    while (__atomic_load_4(&lock->futex, __ATOMIC_ACQUIRE) & mask)
+    {
+        if (futex(&lock->m_ticket.u, FUTEX_WAKE_BITSET_PRIVATE, INT_MAX, nullptr, mask) == 1)
+            break;
+    }
+}
 
 extern "C" void fastlock_unlock(struct fastlock *lock)
 {
     --lock->m_depth;
@@ -153,7 +185,8 @@ extern "C" void fastlock_unlock(struct fastlock *lock)
         assert((int)__atomic_load_4(&lock->m_pidOwner, __ATOMIC_RELAXED) >= 0);  // unlock after free
         lock->m_pidOwner = -1;
         std::atomic_thread_fence(std::memory_order_acquire);
-        __atomic_fetch_add(&lock->m_ticket.m_active, 1, __ATOMIC_ACQ_REL); // on x86 the atomic is not required here, but ASM handles that case
+        uint16_t activeNew = __atomic_add_fetch(&lock->m_ticket.m_active, 1, __ATOMIC_ACQ_REL); // on x86 the atomic is not required here, but ASM handles that case
+        unlock_futex(lock, activeNew);
     }
 }
 #endif
diff --git a/src/fastlock.h b/src/fastlock.h
index b8027de54..81c5807cd 100644
--- a/src/fastlock.h
+++ b/src/fastlock.h
@@ -20,17 +20,29 @@ uint64_t fastlock_getlongwaitcount(); // this is a global value
 }
 #endif
 
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpedantic"
 struct ticket
 {
-    uint16_t m_active;
-    uint16_t m_avail;
+    union
+    {
+        struct
+        {
+            uint16_t m_active;
+            uint16_t m_avail;
+        };
+        unsigned u;
+    };
 };
+#pragma GCC diagnostic pop
+
 
 struct fastlock
 {
     volatile struct ticket m_ticket;
     volatile int m_pidOwner;
     volatile int m_depth;
+    unsigned futex;
 
 #ifdef __cplusplus
     fastlock()
diff --git a/src/fastlock_x64.asm b/src/fastlock_x64.asm
index 7645d3baa..f7d1a3093 100644
--- a/src/fastlock_x64.asm
+++ b/src/fastlock_x64.asm
@@ -22,7 +22,7 @@ fastlock_lock:
         push rdi                ; we need our struct pointer (also balance the stack for the call)
         call gettid             ; get our thread ID (TLS is nasty in ASM so don't bother inlining)
         mov esi, eax            ; back it up in esi
-        mov rdi, [rsp]          ; get our pointer back
+        pop rdi                 ; get our pointer back
         cmp [rdi+4], esi        ; Is the TID we got back the owner of the lock?
         je .LLocked             ; Don't spin in that case
 
@@ -30,11 +30,11 @@ fastlock_lock:
         xor eax, eax            ; eliminate partial register dependency
         inc eax                 ; we want to add one
         lock xadd [rdi+2], ax   ; do the xadd, ax contains the value before the addition
-                                ; eax now contains the ticket
-        xor ecx, ecx
+                                ; ax now contains the ticket
 ALIGN 16
 .LLoop:
-        cmp [rdi], ax           ; is our ticket up?
+        mov edx, [rdi]
+        cmp dx, ax              ; is our ticket up?
         je .LLocked             ; leave the loop
         pause
         add ecx, 1000h          ; Have we been waiting a long time? (oflow if we have)
@@ -44,22 +44,38 @@
                                 ; But the compiler doesn't know that we rarely hit this, and when we do we know the lock is
                                 ; taking a long time to be released anyways. We optimize for the common case of short
                                 ; lock intervals. That's why we're using a spinlock in the first place
+        inc edx                 ; is the ticket after the current one ours?
+        cmp dx, ax
+        je .LLoop               ; we're next in line, keep spinning instead of sleeping
+        dec edx                 ; restore the current ticket
+.LFutexWait:
         push rsi
         push rax
-        mov rax, 24             ; sys_sched_yield
-        syscall                 ; give up our timeslice we'll be here a while
-        pop rax
-        pop rsi
+        ; Set up the syscall args
+        ; rdi ARG1 futex (already in rdi)
+        mov esi, (9 | 128)      ; rsi ARG2 FUTEX_WAIT_BITSET_PRIVATE
+        ; rdx ARG3 ticketT.u (already in edx)
+        xor r10d, r10d          ; r10 ARG4 NULL
+        mov r8, rdi             ; r8 ARG5 dup rdi
+        xor r9d, r9d
+        bts r9d, eax            ; r9 ARG6 mask
+        mov eax, 202            ; sys_futex
+        ; Do the syscall
+        lock or [rdi+12], r9d   ; inform the unlocking thread we're waiting
+        syscall                 ; wait for the futex
+        not r9d                 ; convert our flag into a mask of bits not to touch
+        lock and [rdi+12], r9d  ; clear the flag in the futex control mask
+        ; cleanup and continue
         mov rcx, g_longwaits
         inc qword [rcx]         ; increment our long wait counter
-        mov rdi, [rsp]          ; our struct pointer is on the stack already
+        pop rax
+        pop rsi
         xor ecx, ecx            ; Reset our loop counter
         jmp .LLoop              ; Get back in the game
 ALIGN 16
 .LLocked:
         mov [rdi+4], esi        ; lock->m_pidOwner = gettid()
         inc dword [rdi+8]       ; lock->m_depth++
-        add rsp, 8              ; fix stack
         ret
 
 ALIGN 16
@@ -114,9 +130,32 @@ fastlock_unlock:
         ; uint16_t avail
         ; int32_t m_pidOwner
         ; int32_t m_depth
+        push r11
         sub dword [rdi+8], 1    ; decrement m_depth, don't use dec because it partially writes the flag register and we don't know its state
         jnz .LDone              ; if depth is non-zero this is a recursive unlock, and we still hold it
         mov dword [rdi+4], -1   ; pidOwner = -1 (we don't own it anymore)
-        inc word [rdi]          ; give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
+        mov ecx, [rdi]          ; get current active (this one)
+        inc ecx                 ; bump it to the next thread
+        mov [rdi], cx           ; give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
+        ; At this point the lock is released, but we still have to wake any pending futex waiters
+        mov r9d, 1              ; build the wake bitmask in r9d: a single bit for the next waiter
+        rol r9d, cl             ; rotate that bit into the next ticket's slot (cl mod 32)
+ALIGN 16
+.LRetryWake:
+        mov r11d, [rdi+12]      ; load the futex mask
+        and r11d, r9d           ; are any threads waiting on a futex?
+        jz .LDone               ; if not we're done
+        ; we have to wake the futex waiters
+        ; rdi ARG1 futex (already in rdi)
+        mov esi, (10 | 128)     ; rsi ARG2 FUTEX_WAKE_BITSET_PRIVATE
+        mov edx, 0x7fffffff     ; rdx ARG3 INT_MAX (number of threads to wake)
+        xor r10d, r10d          ; r10 ARG4 NULL
+        mov r8, rdi             ; r8 ARG5 dup rdi
+        ; r9 ARG6 mask (already set above)
+        mov eax, 202            ; sys_futex
+        syscall
+        cmp eax, 1              ; did we wake as many as we expected?
+        jnz .LRetryWake
 .LDone:
+        pop r11
         ret
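
Note: the C++ sketch below shows the control flow of the same spin-then-futex idea without the diff markers or the assembly. It is a minimal illustration under simplifying assumptions, not the KeyDB code: it waits on the whole 32-bit ticket word with plain FUTEX_WAIT_PRIVATE and a waiter counter, rather than the per-ticket FUTEX_WAIT_BITSET/FUTEX_WAKE_BITSET mask the patch keeps in the lock's futex field, and the names (demolock, m_futexwaiters, snap) are invented for the example.

// Illustrative only -- NOT the implementation in the patch above.
#include <climits>
#include <cstdint>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static long futex(volatile uint32_t *uaddr, int op, uint32_t val)
{
    // futex(2) has no libc wrapper here; timeout/uaddr2/val3 are unused.
    return syscall(SYS_futex, uaddr, op, val, nullptr, nullptr, 0);
}

struct demolock
{
    union ticketword
    {
        struct { uint16_t m_active; uint16_t m_avail; };  // little-endian layout, as on x86
        uint32_t u;
    };
    volatile ticketword m_ticket { };
    volatile unsigned m_futexwaiters = 0;   // simplification of the patch's per-ticket bitmask

    void lock()
    {
        uint16_t myticket = __atomic_fetch_add(&m_ticket.m_avail, 1, __ATOMIC_RELEASE);
        int cloops = 0;
        ticketword snap;
        while (((snap.u = __atomic_load_n(&m_ticket.u, __ATOMIC_ACQUIRE)) & 0xffff) != myticket)
        {
            if ((++cloops % (1024*1024)) == 0)
            {
                // Spun for a long time: sleep in the kernel until the ticket
                // word changes from the value we just observed. If it already
                // changed, FUTEX_WAIT returns immediately (EAGAIN), so a wakeup
                // between the load and the syscall cannot be lost.
                __atomic_fetch_add(&m_futexwaiters, 1, __ATOMIC_ACQUIRE);
                futex(&m_ticket.u, FUTEX_WAIT_PRIVATE, snap.u);
                __atomic_fetch_sub(&m_futexwaiters, 1, __ATOMIC_RELEASE);
            }
        }
    }

    void unlock()
    {
        __atomic_add_fetch(&m_ticket.m_active, 1, __ATOMIC_ACQ_REL);
        // Only pay for the syscall when somebody actually went to sleep.
        if (__atomic_load_n(&m_futexwaiters, __ATOMIC_ACQUIRE) != 0)
            futex(&m_ticket.u, FUTEX_WAKE_PRIVATE, INT_MAX);
    }
};

The reason the fallback stays correct is visible in the sketch: FUTEX_WAIT compares the ticket word against the value the spinner last observed, so if the unlocker bumps m_active between that load and the syscall the wait returns immediately instead of sleeping through the wakeup. The bitset variant in the patch refines this by waking only the thread whose ticket just came up instead of every sleeper.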