Fall back to a futex if we spin for a long time
Former-commit-id: ec57b4b0248bba671e388a2257b1bd65ed8d0f44
parent 7fc6f7f607
commit dd7cbbe730
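The spin lock now parks in the kernel instead of yielding forever: once a ticket waiter has spun for roughly a million iterations, it publishes a per-ticket bit in a shared mask and sleeps on the ticket word with FUTEX_WAIT_BITSET_PRIVATE; the unlocker checks the mask and issues a targeted FUTEX_WAKE_BITSET_PRIVATE. For orientation, here is a minimal standalone sketch of the general spin-then-futex pattern (illustrative only, not this commit's ticket lock; all names and the spin budget are made up):

    // Minimal spin-then-futex sketch (illustrative, not this commit's code).
    // A plain flag lock: spin briefly, then sleep in the kernel until woken.
    #include <atomic>
    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static long futex_syscall(void *uaddr, int op, unsigned val)
    {
        return syscall(SYS_futex, uaddr, op, val, nullptr, nullptr, 0);
    }

    struct spin_futex_lock
    {
        std::atomic<unsigned> state{0};   // 0 = free, 1 = held, 2 = held with waiter(s)

        void lock()
        {
            for (int i = 0; i < 1024*1024; ++i)   // fast path: bounded spin
            {
                unsigned expected = 0;
                if (state.compare_exchange_weak(expected, 1, std::memory_order_acquire))
                    return;
    #if defined(__i386__) || defined(__amd64__)
                __asm__ __volatile__("pause");
    #endif
            }
            // slow path: advertise a waiter, then sleep while the lock is still held
            while (state.exchange(2, std::memory_order_acquire) != 0)
                futex_syscall(&state, FUTEX_WAIT_PRIVATE, 2);   // kernel rechecks state == 2
        }

        void unlock()
        {
            if (state.exchange(0, std::memory_order_release) == 2)
                futex_syscall(&state, FUTEX_WAKE_PRIVATE, 1);   // a waiter may sleep: wake one
        }
    };

The commit refines this pattern: the ticket keeps FIFO fairness, and the bitset lets the unlocker wake only the thread whose ticket is next rather than every sleeper.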
src/fastlock.cpp
@@ -36,6 +36,8 @@
 #include <assert.h>
 #include <pthread.h>
+#include <limits.h>
+#include <linux/futex.h>
 #include <string.h>

 #ifdef __APPLE__
 #include <TargetConditionals.h>
@@ -64,6 +66,14 @@ uint64_t fastlock_getlongwaitcount()
     return g_longwaits;
 }

+#ifndef ASM_SPINLOCK
+static int futex(volatile unsigned *uaddr, int futex_op, int val,
+    const struct timespec *timeout, int val3)
+{
+    return syscall(SYS_futex, uaddr, futex_op, val,
+        timeout, uaddr, val3);
+}
+#endif

 extern "C" pid_t gettid()
 {
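glibc exposes no futex() wrapper, hence the raw syscall above. The kernel's full argument list is (uaddr, op, val, timeout, uaddr2, val3); for the bitset ops used here val3 carries the wait/wake mask and the uaddr2 slot is ignored, so the wrapper simply passes uaddr again. Hypothetical usage ('seen' and 'mask' are stand-in locals, not names from the patch):

    // sleep while the 32-bit ticket word still reads 'seen'
    futex(&lock->m_ticket.u, FUTEX_WAIT_BITSET_PRIVATE, seen, nullptr, mask);
    // wake up to INT_MAX sleepers whose wait mask overlaps 'mask'
    futex(&lock->m_ticket.u, FUTEX_WAKE_BITSET_PRIVATE, INT_MAX, nullptr, mask);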
@@ -88,6 +98,7 @@ extern "C" void fastlock_init(struct fastlock *lock)
     lock->m_ticket.m_avail = 0;
     lock->m_depth = 0;
     lock->m_pidOwner = -1;
+    lock->futex = 0;
 }

 #ifndef ASM_SPINLOCK
@@ -100,18 +111,24 @@ extern "C" void fastlock_lock(struct fastlock *lock)
     }

     unsigned myticket = __atomic_fetch_add(&lock->m_ticket.m_avail, 1, __ATOMIC_RELEASE);
-
+    unsigned mask = (1U << (myticket % 32));
     int cloops = 0;
-    while (__atomic_load_2(&lock->m_ticket.m_active, __ATOMIC_ACQUIRE) != myticket)
+    ticket ticketT;
+    while (((ticketT.u = __atomic_load_4(&lock->m_ticket.m_active, __ATOMIC_ACQUIRE)) & 0xffff) != myticket)
     {
-        if ((++cloops % (1024*1024)) == 0)
-        {
-            sched_yield();
-            ++g_longwaits;
-        }
 #if defined(__i386__) || defined(__amd64__)
         __asm__ ("pause");
 #endif
+        if ((++cloops % (1024*1024)) == 0)
+        {
+            if (static_cast<uint16_t>(ticketT.m_active+1U) != myticket)
+            {
+                __atomic_fetch_or(&lock->futex, mask, __ATOMIC_ACQUIRE);
+                futex(&lock->m_ticket.u, FUTEX_WAIT_BITSET_PRIVATE, ticketT.u, nullptr, mask);
+                __atomic_fetch_and(&lock->futex, ~mask, __ATOMIC_RELEASE);
+            }
+            ++g_longwaits;
+        }
     }

     lock->m_depth = 1;
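In the new slow path each waiter derives a bit from its ticket, advertises it in lock->futex, and sleeps on the 32-bit ticket word. FUTEX_WAIT_BITSET only puts the thread to sleep if the word still equals ticketT.u, so a ticket bump that races with the sleep is caught by the kernel. The extra static_cast<uint16_t>(ticketT.m_active+1U) != myticket check keeps the next-in-line thread spinning instead of paying for a syscall. A small illustration of the ticket-to-bit mapping (hypothetical helper, not in the patch):

    // With only 32 bits, tickets 32 apart share a bit; a collision just causes a
    // spurious wake, after which the extra thread re-checks its ticket and re-sleeps.
    unsigned waiter_mask(uint16_t ticket) { return 1U << (ticket % 32); }
    // waiter_mask(5)  == 0x20
    // waiter_mask(37) == 0x20   // shares the bit with ticket 5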
@@ -145,6 +162,21 @@ extern "C" int fastlock_trylock(struct fastlock *lock)
     return false;
 }

+#define ROL32(v, shift) ((v << shift) | (v >> (32-shift)))
+void unlock_futex(struct fastlock *lock, uint16_t ifutex)
+{
+    unsigned mask = (1U << (ifutex % 32));
+    unsigned futexT = __atomic_load_4(&lock->futex, __ATOMIC_RELAXED) & mask;
+
+    if (futexT == 0)
+        return;
+
+    while (__atomic_load_4(&lock->futex, __ATOMIC_ACQUIRE) & mask)
+    {
+        if (futex(&lock->m_ticket.u, FUTEX_WAKE_BITSET_PRIVATE, INT_MAX, nullptr, mask) == 1)
+            break;
+    }
+}
 extern "C" void fastlock_unlock(struct fastlock *lock)
 {
     --lock->m_depth;
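unlock_futex wakes only the bit belonging to the ticket now being served. The relaxed pre-read skips the syscall entirely when no matching waiter advertised itself, keeping the uncontended unlock cheap. The loop exists because publishing the bit and actually sleeping are two separate steps on the wait side; FUTEX_WAKE_BITSET returns the number of threads woken, so the unlocker retries until a wake lands or the waiter clears its bit. Roughly (illustrative interleaving):

    // waiter:   fetch_or(&lock->futex, mask)      bit becomes visible
    // unlocker: sees bit, FUTEX_WAKE_BITSET       returns 0, nobody asleep yet
    // waiter:   FUTEX_WAIT_BITSET(...)            now asleep
    // unlocker: retries the wake                  returns 1, loop exits
    // waiter:   fetch_and(&lock->futex, ~mask)    bit cleared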
@@ -153,7 +185,8 @@ extern "C" void fastlock_unlock(struct fastlock *lock)
         assert((int)__atomic_load_4(&lock->m_pidOwner, __ATOMIC_RELAXED) >= 0); // unlock after free
         lock->m_pidOwner = -1;
         std::atomic_thread_fence(std::memory_order_acquire);
-        __atomic_fetch_add(&lock->m_ticket.m_active, 1, __ATOMIC_ACQ_REL); // on x86 the atomic is not required here, but ASM handles that case
+        uint16_t activeNew = __atomic_add_fetch(&lock->m_ticket.m_active, 1, __ATOMIC_ACQ_REL); // on x86 the atomic is not required here, but ASM handles that case
+        unlock_futex(lock, activeNew);
     }
 }
 #endif
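The switch from __atomic_fetch_add to __atomic_add_fetch is what makes the wake targeted: the former returns the ticket we just retired, the latter returns the ticket now being served, which is exactly the bit unlock_futex must hit. For instance:

    // with m_active == 4 before the unlock:
    //   __atomic_fetch_add(&m_active, 1, ...) == 4   (old value; m_active becomes 5)
    //   __atomic_add_fetch(&m_active, 1, ...) == 5   (new value; the ticket now served)
    // unlock_futex(lock, 5) then wakes the waiter whose bit is 1U << (5 % 32).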
src/fastlock.h
@@ -20,17 +20,29 @@ uint64_t fastlock_getlongwaitcount(); // this is a global value
 }
 #endif

+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpedantic"
 struct ticket
 {
-    uint16_t m_active;
-    uint16_t m_avail;
+    union
+    {
+        struct
+        {
+            uint16_t m_active;
+            uint16_t m_avail;
+        };
+        unsigned u;
+    };
 };
+#pragma GCC diagnostic pop
+
 struct fastlock
 {
     volatile struct ticket m_ticket;

     volatile int m_pidOwner;
     volatile int m_depth;
+    unsigned futex;

 #ifdef __cplusplus
     fastlock()
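The union lets the two 16-bit halves of the ticket be read as one 32-bit word: futexes operate on 32-bit words, and the lock path loads m_active and m_avail together in a single __atomic_load_4. The anonymous struct inside the union is a GNU extension in C++, which is why the -Wpedantic suppression surrounds it. A layout check (illustrative; assumes a little-endian target such as x86):

    ticket t;
    t.u = 0;
    t.m_active = 1;   // ticket currently served
    t.m_avail  = 2;   // next ticket to hand out
    // t.u == 0x00020001 on x86: m_active is the low half, so the C++ side's
    // (ticketT.u & 0xffff) recovers m_active.
    static_assert(sizeof(ticket) == 4, "futex needs a 32-bit word");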
src/fastlock_x64.asm
@@ -22,7 +22,7 @@ fastlock_lock:
     push rdi                ; we need our struct pointer (also balance the stack for the call)
     call gettid             ; get our thread ID (TLS is nasty in ASM so don't bother inlining)
     mov esi, eax            ; back it up in esi
-    mov rdi, [rsp]          ; get our pointer back
+    pop rdi                 ; get our pointer back

     cmp [rdi+4], esi        ; Is the TID we got back the owner of the lock?
     je .LLocked             ; Don't spin in that case
@@ -30,11 +30,11 @@ fastlock_lock:
     xor eax, eax            ; eliminate partial register dependency
     inc eax                 ; we want to add one
     lock xadd [rdi+2], ax   ; do the xadd, ax contains the value before the addition
-    ; eax now contains the ticket
-    xor ecx, ecx
+    ; ax now contains the ticket
 ALIGN 16
 .LLoop:
-    cmp [rdi], ax           ; is our ticket up?
+    mov edx, [rdi]
+    cmp dx, ax              ; is our ticket up?
     je .LLocked             ; leave the loop
     pause
     add ecx, 1000h          ; Have we been waiting a long time? (oflow if we have)
@@ -44,22 +44,38 @@ ALIGN 16
 ;   But the compiler doesn't know that we rarely hit this, and when we do we know the lock is
 ;   taking a long time to be released anyway. We optimize for the common case of short
 ;   lock intervals. That's why we're using a spinlock in the first place
+    inc edx
+    cmp dx, ax
+    je .LLoop
+    dec edx                 ; restore the current ticket
+.LFutexWait:
     push rsi
     push rax
-    mov rax, 24             ; sys_sched_yield
-    syscall                 ; give up our timeslice, we'll be here a while
-    pop rax
-    pop rsi
+    ; Setup the syscall args
+    ; rdi ARG1 futex (already in rdi)
+    mov esi, (9 | 128)      ; rsi ARG2 FUTEX_WAIT_BITSET_PRIVATE
+    ; rdx ARG3 ticketT.u (already in edx)
+    xor r10d, r10d          ; r10 ARG4 NULL
+    mov r8, rdi             ; r8 ARG5 dup rdi
+    xor r9d, r9d
+    bts r9d, eax            ; r9 ARG6 mask
+    mov eax, 202            ; sys_futex
+    ; Do the syscall
+    lock or [rdi+12], r9d   ; inform the unlocking thread we're waiting
+    syscall                 ; wait for the futex
+    not r9d                 ; convert our flag into a mask of bits not to touch
+    lock and [rdi+12], r9d  ; clear the flag in the futex control mask
+    ; cleanup and continue
     mov rcx, g_longwaits
     inc qword [rcx]         ; increment our long wait counter
-    mov rdi, [rsp]          ; our struct pointer is on the stack already
+    pop rax
+    pop rsi
     xor ecx, ecx            ; Reset our loop counter
     jmp .LLoop              ; Get back in the game
 ALIGN 16
 .LLocked:
     mov [rdi+4], esi        ; lock->m_pidOwner = gettid()
     inc dword [rdi+8]       ; lock->m_depth++
-    add rsp, 8              ; fix stack
     ret

 ALIGN 16
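Two details of the asm wait path are worth spelling out. The overflow trick stands in for the C++ modulo: adding 1000h to ecx wraps a 32-bit register after exactly 1024*1024 additions, so the carry flag fires on the same cadence as (++cloops % (1024*1024)) == 0. And before sleeping, the inc edx / cmp dx, ax pair checks whether our ticket is next; if so we keep spinning rather than pay for a syscall, mirroring the C++ m_active+1 check. The syscall block maps onto the C path roughly like this (sketch, with the ticket in ax and the last-read ticket word in edx):

    ; mov esi, (9 | 128)      -> FUTEX_WAIT_BITSET (9) | FUTEX_PRIVATE_FLAG (128)
    ; bts r9d, eax            -> r9d = 1 << (ax % 32), same as the C++ mask
    ; lock or [rdi+12], r9d   -> __atomic_fetch_or(&lock->futex, mask, ...)
    ; mov eax, 202; syscall   -> syscall(SYS_futex, &lock->m_ticket.u,
    ;                                    FUTEX_WAIT_BITSET_PRIVATE, edx, NULL, ..., r9d)
    ; not r9d; lock and ...   -> __atomic_fetch_and(&lock->futex, ~mask, ...)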
@@ -114,9 +130,32 @@ fastlock_unlock:
 ;   uint16_t avail
 ;   int32_t m_pidOwner
 ;   int32_t m_depth
+    push r11
     sub dword [rdi+8], 1    ; decrement m_depth, don't use dec because it partially writes the flag register and we don't know its state
     jnz .LDone              ; if depth is non-zero this is a recursive unlock, and we still hold it
     mov dword [rdi+4], -1   ; pidOwner = -1 (we don't own it anymore)
-    inc word [rdi]          ; give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
+    mov ecx, [rdi]          ; get current active (this one)
+    inc ecx                 ; bump it to the next thread
+    mov [rdi], cx           ; give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
+    ; At this point the lock is released, however we must wake up any pending futexes
+    mov r9d, 1              ; r9d is the bitmask for the next thread
+    rol r9d, cl             ; place the mask in the right spot for the next thread
+ALIGN 16
+.LRetryWake:
+    mov r11d, [rdi+12]      ; load the futex mask
+    and r11d, r9d           ; are any threads waiting on a futex?
+    jz .LDone               ; if not we're done
+    ; we have to wake the futexes
+    ; rdi ARG1 futex (already in rdi)
+    mov esi, (10 | 128)     ; rsi ARG2 FUTEX_WAKE_BITSET_PRIVATE
+    mov edx, 0x7fffffff     ; rdx ARG3 INT_MAX (number of threads to wake)
+    xor r10d, r10d          ; r10 ARG4 NULL
+    mov r8, rdi             ; r8 ARG5 dup rdi
+    ; r9 ARG6 mask (already set above)
+    mov eax, 202            ; sys_futex
+    syscall
+    cmp eax, 1              ; did we wake as many as we expected?
+    jnz .LRetryWake
 .LDone:
+    pop r11
     ret
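The unlock path uses the same constants as the wait path: 202 is SYS_futex on x86-64, 10 | 128 is FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG, and the offsets follow the struct layout ([rdi] the 32-bit ticket word, [rdi+4] m_pidOwner, [rdi+8] m_depth, [rdi+12] the futex waiter mask). mov r9d, 1 / rol r9d, cl builds 1 << (next_ticket % 32), since a 32-bit rotate count is itself taken modulo 32, and r11 is saved around the loop because it serves as scratch for testing the mask. In C terms the wake loop is roughly:

    // while (lock->futex & mask)
    //     if (syscall(SYS_futex, &lock->m_ticket.u, FUTEX_WAKE_BITSET_PRIVATE,
    //                 INT_MAX, NULL, &lock->m_ticket.u, mask) == 1)
    //         break;   // same retry logic as unlock_futex() in fastlock.cpp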