Fallback to a futex if we spin for a long time

Former-commit-id: ec57b4b0248bba671e388a2257b1bd65ed8d0f44
John Sully 2019-06-15 23:53:34 -04:00
parent 7fc6f7f607
commit dd7cbbe730
3 changed files with 105 additions and 21 deletions

fastlock.cpp

@@ -36,6 +36,8 @@
 #include <assert.h>
 #include <pthread.h>
 #include <limits.h>
+#include <linux/futex.h>
+#include <string.h>
 #ifdef __APPLE__
 #include <TargetConditionals.h>
@@ -64,6 +66,14 @@ uint64_t fastlock_getlongwaitcount()
     return g_longwaits;
 }
+#ifndef ASM_SPINLOCK
+static int futex(volatile unsigned *uaddr, int futex_op, int val,
+    const struct timespec *timeout, int val3)
+{
+    return syscall(SYS_futex, uaddr, futex_op, val,
+                    timeout, uaddr, val3);
+}
+#endif
 extern "C" pid_t gettid()
 {
@@ -88,6 +98,7 @@ extern "C" void fastlock_init(struct fastlock *lock)
     lock->m_ticket.m_avail = 0;
     lock->m_depth = 0;
     lock->m_pidOwner = -1;
+    lock->futex = 0;
 }
 #ifndef ASM_SPINLOCK
@@ -100,18 +111,24 @@ extern "C" void fastlock_lock(struct fastlock *lock)
     }
     unsigned myticket = __atomic_fetch_add(&lock->m_ticket.m_avail, 1, __ATOMIC_RELEASE);
+    unsigned mask = (1U << (myticket % 32));
     int cloops = 0;
-    while (__atomic_load_2(&lock->m_ticket.m_active, __ATOMIC_ACQUIRE) != myticket)
+    ticket ticketT;
+    while (((ticketT.u = __atomic_load_4(&lock->m_ticket.m_active, __ATOMIC_ACQUIRE)) & 0xffff) != myticket)
     {
-        if ((++cloops % 1024*1024) == 0)
-        {
-            sched_yield();
-            ++g_longwaits;
-        }
 #if defined(__i386__) || defined(__amd64__)
         __asm__ ("pause");
 #endif
+        if ((++cloops % 1024*1024) == 0)
+        {
+            if (static_cast<uint16_t>(ticketT.m_active+1U) != myticket)
+            {
+                __atomic_fetch_or(&lock->futex, mask, __ATOMIC_ACQUIRE);
+                futex(&lock->m_ticket.u, FUTEX_WAIT_BITSET_PRIVATE, ticketT.u, nullptr, mask);
+                __atomic_fetch_and(&lock->futex, ~mask, __ATOMIC_RELEASE);
+            }
+            ++g_longwaits;
+        }
     }
     lock->m_depth = 1;
@@ -145,6 +162,21 @@ extern "C" int fastlock_trylock(struct fastlock *lock)
     return false;
 }
+#define ROL32(v, shift) ((v << shift) | (v >> (32-shift)))
+void unlock_futex(struct fastlock *lock, uint16_t ifutex)
+{
+    unsigned mask = (1U << (ifutex % 32));
+    unsigned futexT = __atomic_load_4(&lock->futex, __ATOMIC_RELAXED) & mask;
+    if (futexT == 0)
+        return;
+    while (__atomic_load_4(&lock->futex, __ATOMIC_ACQUIRE) & mask)
+    {
+        if (futex(&lock->m_ticket.u, FUTEX_WAKE_BITSET_PRIVATE, INT_MAX, nullptr, mask) == 1)
+            break;
+    }
+}
 extern "C" void fastlock_unlock(struct fastlock *lock)
 {
     --lock->m_depth;
@@ -153,7 +185,8 @@ extern "C" void fastlock_unlock(struct fastlock *lock)
         assert((int)__atomic_load_4(&lock->m_pidOwner, __ATOMIC_RELAXED) >= 0);   // unlock after free
         lock->m_pidOwner = -1;
         std::atomic_thread_fence(std::memory_order_acquire);
-        __atomic_fetch_add(&lock->m_ticket.m_active, 1, __ATOMIC_ACQ_REL); // on x86 the atomic is not required here, but ASM handles that case
+        uint16_t activeNew = __atomic_add_fetch(&lock->m_ticket.m_active, 1, __ATOMIC_ACQ_REL); // on x86 the atomic is not required here, but ASM handles that case
+        unlock_futex(lock, activeNew);
     }
 }
 #endif
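
The C++ change boils down to: spin on the ticket word for a while, and only then ask the kernel to sleep on it, using one bitset bit per waiter slot (ticket % 32) so the unlocker can wake just the next ticket holder. Below is a minimal standalone sketch of that spin-then-sleep pattern, assuming Linux/x86-64 with GCC or Clang; the helper names spin_then_wait and wake_waiters are illustrative and not part of this commit.

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <climits>
    #include <ctime>
    #include <thread>

    // Thin syscall wrapper; glibc does not export a futex() function of its own.
    static long futex(volatile unsigned *uaddr, int op, unsigned val,
                      const struct timespec *timeout, unsigned mask)
    {
        return syscall(SYS_futex, uaddr, op, val, timeout, uaddr, mask);
    }

    // Block until *word changes away from 'observed': spin (with pause) first,
    // then park in the kernel roughly once per million iterations.
    static void spin_then_wait(volatile unsigned *word, unsigned observed, unsigned mask)
    {
        int cloops = 0;
        while (__atomic_load_n(word, __ATOMIC_ACQUIRE) == observed)
        {
    #if defined(__i386__) || defined(__amd64__)
            __asm__ __volatile__("pause");
    #endif
            if ((++cloops % (1024 * 1024)) == 0)
            {
                // FUTEX_WAIT_BITSET returns immediately if *word no longer
                // equals 'observed', so a wake racing with this call is not lost.
                futex(word, FUTEX_WAIT_BITSET_PRIVATE, observed, nullptr, mask);
            }
        }
    }

    // Wake every sleeper whose wait mask overlaps 'mask'.
    static void wake_waiters(volatile unsigned *word, unsigned mask)
    {
        futex(word, FUTEX_WAKE_BITSET_PRIVATE, INT_MAX, nullptr, mask);
    }

    int main()
    {
        volatile unsigned word = 0;
        const unsigned mask = 1u;   // one bit per waiter slot, like (1 << (ticket % 32))

        std::thread waiter([&] { spin_then_wait(&word, 0, mask); });

        __atomic_store_n(&word, 1u, __ATOMIC_RELEASE);  // "advance the ticket"
        wake_waiters(&word, mask);                      // nudge anyone already asleep
        waiter.join();
        return 0;
    }

The bitset variants are what keep the wake cheap: the unlocker passes only the bit belonging to the next ticket, so unrelated sleepers on the same word stay asleep.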

fastlock.h

@@ -20,17 +20,29 @@ uint64_t fastlock_getlongwaitcount(); // this is a global value
 }
 #endif
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpedantic"
 struct ticket
 {
-    uint16_t m_active;
-    uint16_t m_avail;
+    union
+    {
+        struct
+        {
+            uint16_t m_active;
+            uint16_t m_avail;
+        };
+        unsigned u;
+    };
 };
+#pragma GCC diagnostic pop
 struct fastlock
 {
     volatile struct ticket m_ticket;
     volatile int m_pidOwner;
     volatile int m_depth;
+    unsigned futex;
 #ifdef __cplusplus
     fastlock()
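
The header change folds the two 16-bit ticket counters into an anonymous union with a 32-bit member u (anonymous structs are a GNU extension in C++, which is why the -Wpedantic pragmas appear), so the lock path can snapshot m_active and m_avail with one 32-bit load and hand exactly that value to the futex wait. A small sketch of the idea, assuming a little-endian target where m_active is the low half of u and relying on union type-punning as GCC defines it:

    #include <cstdint>
    #include <cstdio>

    // Mirror of the new ticket layout from the diff above.
    struct ticket
    {
        union
        {
            struct
            {
                uint16_t m_active;   // low 16 bits of u on little-endian x86-64
                uint16_t m_avail;    // high 16 bits of u
            };
            unsigned u;
        };
    };

    int main()
    {
        static_assert(sizeof(ticket) == 4, "both counters share one 32-bit word");

        ticket t{};
        t.m_active = 5;
        t.m_avail  = 7;

        // One 32-bit read observes both halves at once; this is the snapshot the
        // new spin loop keeps in ticketT.u and passes to FUTEX_WAIT_BITSET_PRIVATE.
        std::printf("u = 0x%08x  active = %u  avail = %u\n",
                    t.u, t.u & 0xffffu, t.u >> 16);
        return 0;
    }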

fastlock_x64.asm

@@ -22,7 +22,7 @@ fastlock_lock:
     push rdi                ; we need our struct pointer (also balance the stack for the call)
     call gettid             ; get our thread ID (TLS is nasty in ASM so don't bother inlining)
     mov esi, eax            ; back it up in esi
-    mov rdi, [rsp]          ; get our pointer back
+    pop rdi                 ; get our pointer back
     cmp [rdi+4], esi        ; Is the TID we got back the owner of the lock?
     je .LLocked             ; Don't spin in that case
@@ -30,11 +30,11 @@ fastlock_lock:
     xor eax, eax            ; eliminate partial register dependency
     inc eax                 ; we want to add one
     lock xadd [rdi+2], ax   ; do the xadd, ax contains the value before the addition
-    ; eax now contains the ticket
+    ; ax now contains the ticket
+    xor ecx, ecx
 ALIGN 16
 .LLoop:
-    cmp [rdi], ax           ; is our ticket up?
+    mov edx, [rdi]
+    cmp dx, ax              ; is our ticket up?
     je .LLocked             ; leave the loop
     pause
     add ecx, 1000h          ; Have we been waiting a long time? (oflow if we have)
@@ -44,22 +44,38 @@ ALIGN 16
     ;   But the compiler doesn't know that we rarely hit this, and when we do we know the lock is
     ;   taking a long time to be released anyways. We optimize for the common case of short
     ;   lock intervals. That's why we're using a spinlock in the first place
+    inc edx
+    cmp dx, ax
+    je .LLoop
+    dec edx                 ; restore the current ticket
+.LFutexWait:
     push rsi
     push rax
-    mov rax, 24             ; sys_sched_yield
-    syscall                 ; give up our timeslice we'll be here a while
-    pop rax
-    pop rsi
+    ; Setup the syscall args
+    ; rdi ARG1 futex (already in rdi)
+    mov esi, (9 | 128)      ; rsi ARG2 FUTEX_WAIT_BITSET_PRIVATE
+    ; rdx ARG3 ticketT.u (already in edx)
+    xor r10d, r10d          ; r10 ARG4 NULL
+    mov r8, rdi             ; r8 ARG5 dup rdi
+    xor r9d, r9d
+    bts r9d, eax            ; r9 ARG6 mask
+    mov eax, 202            ; sys_futex
+    ; Do the syscall
+    lock or [rdi+12], r9d   ; inform the unlocking thread we're waiting
+    syscall                 ; wait for the futex
+    not r9d                 ; convert our flag into a mask of bits not to touch
+    lock and [rdi+12], r9d  ; clear the flag in the futex control mask
+    ; cleanup and continue
     mov rcx, g_longwaits
     inc qword [rcx]         ; increment our long wait counter
-    mov rdi, [rsp]          ; our struct pointer is on the stack already
+    pop rax
+    pop rsi
     xor ecx, ecx            ; Reset our loop counter
     jmp .LLoop              ; Get back in the game
 ALIGN 16
 .LLocked:
     mov [rdi+4], esi        ; lock->m_pidOwner = gettid()
     inc dword [rdi+8]       ; lock->m_depth++
-    add rsp, 8              ; fix stack
     ret
 ALIGN 16
@@ -114,9 +130,32 @@ fastlock_unlock:
     ; uint16_t avail
     ; int32_t m_pidOwner
     ; int32_t m_depth
+    push r11
     sub dword [rdi+8], 1    ; decrement m_depth, don't use dec because it partially writes the flag register and we don't know its state
     jnz .LDone              ; if depth is non-zero this is a recursive unlock, and we still hold it
     mov dword [rdi+4], -1   ; pidOwner = -1 (we don't own it anymore)
-    inc word [rdi]          ; give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
+    mov ecx, [rdi]          ; get current active (this one)
+    inc ecx                 ; bump it to the next thread
+    mov [rdi], cx           ; give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
+    ; At this point the lock is removed, however we must wake up any pending futexs
+    mov r9d, 1              ; eax is the bitmask for 2 threads
+    rol r9d, cl             ; place the mask in the right spot for the next 2 threads
+ALIGN 16
+.LRetryWake:
+    mov r11d, [rdi+12]      ; load the futex mask
+    and r11d, r9d           ; are any threads waiting on a futex?
+    jz .LDone               ; if not we're done.
+    ; we have to wake the futexs
+    ; rdi ARG1 futex (already in rdi)
+    mov esi, (10 | 128)     ; rsi ARG2 FUTEX_WAKE_BITSET_PRIVATE
+    mov edx, 0x7fffffff     ; rdx ARG3 INT_MAX (number of threads to wake)
+    xor r10d, r10d          ; r10 ARG4 NULL
+    mov r8, rdi             ; r8 ARG5 dup rdi
+                            ; r9 ARG6 mask (already set above)
+    mov eax, 202            ; sys_futex
+    syscall
+    cmp eax, 1              ; did we wake as many as we expected?
+    jnz .LRetryWake
 .LDone:
+    pop r11
     ret
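
The assembly hard-codes the structure layout: the 32-bit ticket word at [rdi] (m_avail at [rdi+2] for the xadd), the owner TID at [rdi+4], the recursion depth at [rdi+8], and the new waiter bitmask at [rdi+12], which the lock or / lock and and the wake loop manipulate. A compile-time sanity check of those offsets, written against a local mirror of the struct (an assumption standing in for the real fastlock.h) on x86-64:

    #include <cstddef>
    #include <cstdint>

    // Local mirror of the layout shown in the fastlock.h hunk; the real header is
    // not included so this stays self-contained.
    struct ticket_mirror
    {
        union
        {
            struct
            {
                uint16_t m_active;       // [rdi+0]
                uint16_t m_avail;        // [rdi+2], target of the lock xadd
            };
            unsigned u;                  // the 32-bit word the futex syscalls use
        };
    };

    struct fastlock_mirror
    {
        volatile ticket_mirror m_ticket; // [rdi+0]
        volatile int m_pidOwner;         // [rdi+4]
        volatile int m_depth;            // [rdi+8]
        unsigned futex;                  // [rdi+12], waiter bitmask (lock or / lock and)
    };

    static_assert(offsetof(fastlock_mirror, m_ticket)   == 0,  "ticket word at [rdi]");
    static_assert(offsetof(fastlock_mirror, m_pidOwner) == 4,  "owner TID at [rdi+4]");
    static_assert(offsetof(fastlock_mirror, m_depth)    == 8,  "recursion depth at [rdi+8]");
    static_assert(offsetof(fastlock_mirror, futex)      == 12, "futex bitmask at [rdi+12]");

    int main() { return 0; }

If the header ever reorders or pads these members, the asserts fail at build time, which is the same condition under which the hand-written offsets in the assembly would silently break.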