Fallback to a futex if we spin for a long time

Former-commit-id: ec57b4b0248bba671e388a2257b1bd65ed8d0f44
parent  7fc6f7f607
commit  dd7cbbe730
@@ -36,6 +36,8 @@
 #include <assert.h>
 #include <pthread.h>
 #include <limits.h>
+#include <linux/futex.h>
+#include <string.h>
 
 #ifdef __APPLE__
 #include <TargetConditionals.h>
@@ -64,6 +66,14 @@ uint64_t fastlock_getlongwaitcount()
     return g_longwaits;
 }
 
+#ifndef ASM_SPINLOCK
+static int futex(volatile unsigned *uaddr, int futex_op, int val,
+    const struct timespec *timeout, int val3)
+{
+    return syscall(SYS_futex, uaddr, futex_op, val,
+        timeout, uaddr, val3);
+}
+#endif
 
 extern "C" pid_t gettid()
 {
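These hunks add the C fallback path. For readers unfamiliar with futex(2): FUTEX_WAIT_BITSET_PRIVATE puts the caller to sleep only if the watched word still holds the expected value, and FUTEX_WAKE_BITSET_PRIVATE wakes only waiters whose bitset intersects the waker's, which is what lets the unlock side target the next ticket holder. A standalone sketch of the two calls (assumes Linux; `futex_call` and `g_word` are illustrative names, not from the commit):

```cpp
// Minimal sketch of the futex(2) calls the new slow path relies on.
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <climits>
#include <cstdio>
#include <thread>

static volatile unsigned g_word = 0;

static long futex_call(volatile unsigned *uaddr, int op, unsigned val, unsigned mask)
{
    // The last argument is the bitset: a wake only matches waiters whose
    // bitset intersects it, which is how fastlock targets one ticket holder.
    return syscall(SYS_futex, uaddr, op, val, nullptr, nullptr, mask);
}

int main()
{
    std::thread waiter([] {
        // Sleep only while g_word still holds the value we pass in (0);
        // the kernel validates it under its own lock, so no wake is lost.
        while (__atomic_load_n(&g_word, __ATOMIC_ACQUIRE) == 0)
            futex_call(&g_word, FUTEX_WAIT_BITSET_PRIVATE, 0, 1u << 0);
        std::puts("woken");
    });
    __atomic_store_n(&g_word, 1u, __ATOMIC_RELEASE);
    futex_call(&g_word, FUTEX_WAKE_BITSET_PRIVATE, INT_MAX, 1u << 0);
    waiter.join();
    return 0;
}
```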
@@ -88,6 +98,7 @@ extern "C" void fastlock_init(struct fastlock *lock)
     lock->m_ticket.m_avail = 0;
     lock->m_depth = 0;
     lock->m_pidOwner = -1;
+    lock->futex = 0;
 }
 
 #ifndef ASM_SPINLOCK
@@ -100,18 +111,24 @@ extern "C" void fastlock_lock(struct fastlock *lock)
     }
 
     unsigned myticket = __atomic_fetch_add(&lock->m_ticket.m_avail, 1, __ATOMIC_RELEASE);
+    unsigned mask = (1U << (myticket % 32));
     int cloops = 0;
-    while (__atomic_load_2(&lock->m_ticket.m_active, __ATOMIC_ACQUIRE) != myticket)
+    ticket ticketT;
+    while (((ticketT.u = __atomic_load_4(&lock->m_ticket.m_active, __ATOMIC_ACQUIRE)) & 0xffff) != myticket)
     {
-        if ((++cloops % 1024*1024) == 0)
-        {
-            sched_yield();
-            ++g_longwaits;
-        }
 #if defined(__i386__) || defined(__amd64__)
         __asm__ ("pause");
 #endif
+        if ((++cloops % 1024*1024) == 0)
+        {
+            if (static_cast<uint16_t>(ticketT.m_active+1U) != myticket)
+            {
+                __atomic_fetch_or(&lock->futex, mask, __ATOMIC_ACQUIRE);
+                futex(&lock->m_ticket.u, FUTEX_WAIT_BITSET_PRIVATE, ticketT.u, nullptr, mask);
+                __atomic_fetch_and(&lock->futex, ~mask, __ATOMIC_RELEASE);
+            }
+            ++g_longwaits;
+        }
     }
 
     lock->m_depth = 1;
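Two details here are easy to miss. The guard `static_cast<uint16_t>(ticketT.m_active+1U) != myticket` means the thread that is next in line keeps spinning and never pays for a syscall; only threads further back sleep. And since the bitset has 32 bits, tickets alias modulo 32, so a wake can be spurious; the outer while absorbs that by re-checking the ticket. Note also that `++cloops % 1024*1024` parses as `(++cloops % 1024) * 1024` (`%` and `*` bind equally, left to right), so the slow path is actually entered every 1,024 spins rather than every 2^20. A condensed sketch of the waiter protocol, with the presumably intended parenthesization (`wait_turn` and `snap` are illustrative names, not the commit's code):

```cpp
// Condensed waiter path mirroring the committed logic (a sketch).
static void wait_turn(struct fastlock *lock, uint16_t myticket)
{
    unsigned mask = 1U << (myticket % 32);    // tickets alias mod 32
    int cloops = 0;
    ticket snap;
    while (((snap.u = __atomic_load_n(&lock->m_ticket.u, __ATOMIC_ACQUIRE)) & 0xffff) != myticket)
    {
        if ((++cloops % (1024 * 1024)) != 0)  // spin cheaply most of the time
            continue;
        if (static_cast<uint16_t>(snap.m_active + 1U) == myticket)
            continue;                         // we're next: a syscall isn't worth it
        // 1) advertise that we'll sleep on this bit,
        __atomic_fetch_or(&lock->futex, mask, __ATOMIC_ACQUIRE);
        // 2) sleep only if the ticket word is unchanged (no lost wakeups),
        futex(&lock->m_ticket.u, FUTEX_WAIT_BITSET_PRIVATE, snap.u, nullptr, mask);
        // 3) withdraw the advertisement before re-checking.
        __atomic_fetch_and(&lock->futex, ~mask, __ATOMIC_RELEASE);
    }
}
```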
@@ -145,6 +162,21 @@ extern "C" int fastlock_trylock(struct fastlock *lock)
     return false;
 }
 
+#define ROL32(v, shift) ((v << shift) | (v >> (32-shift)))
+void unlock_futex(struct fastlock *lock, uint16_t ifutex)
+{
+    unsigned mask = (1U << (ifutex % 32));
+    unsigned futexT = __atomic_load_4(&lock->futex, __ATOMIC_RELAXED) & mask;
+
+    if (futexT == 0)
+        return;
+
+    while (__atomic_load_4(&lock->futex, __ATOMIC_ACQUIRE) & mask)
+    {
+        if (futex(&lock->m_ticket.u, FUTEX_WAKE_BITSET_PRIVATE, INT_MAX, nullptr, mask) == 1)
+            break;
+    }
+}
 extern "C" void fastlock_unlock(struct fastlock *lock)
 {
     --lock->m_depth;
@@ -153,7 +185,8 @@ extern "C" void fastlock_unlock(struct fastlock *lock)
         assert((int)__atomic_load_4(&lock->m_pidOwner, __ATOMIC_RELAXED) >= 0); // unlock after free
         lock->m_pidOwner = -1;
         std::atomic_thread_fence(std::memory_order_acquire);
-        __atomic_fetch_add(&lock->m_ticket.m_active, 1, __ATOMIC_ACQ_REL); // on x86 the atomic is not required here, but ASM handles that case
+        uint16_t activeNew = __atomic_add_fetch(&lock->m_ticket.m_active, 1, __ATOMIC_ACQ_REL); // on x86 the atomic is not required here, but ASM handles that case
+        unlock_futex(lock, activeNew);
     }
 }
 #endif
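With both halves in place the slow path is: advertise, sleep on the ticket word, re-check; the releasing side bumps m_active and wakes exactly the bit for the next ticket. Nothing futex-specific leaks into the public API. A hedged usage sketch (`g_dblock` and `example` are illustrative names):

```cpp
// Usage sketch of the exported API; the futex is purely internal.
#include "fastlock.h"

struct fastlock g_dblock;

void example()
{
    fastlock_init(&g_dblock);
    fastlock_lock(&g_dblock);    // spins, then futex-waits if held too long
    fastlock_lock(&g_dblock);    // re-entrant for the owning thread via m_depth
    fastlock_unlock(&g_dblock);
    fastlock_unlock(&g_dblock);  // final unlock bumps m_active and wakes a sleeper
}
```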
@@ -20,17 +20,29 @@ uint64_t fastlock_getlongwaitcount(); // this is a global value
 }
 #endif
 
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpedantic"
 struct ticket
+{
+    union
+    {
+        struct
         {
             uint16_t m_active;
             uint16_t m_avail;
         };
+        unsigned u;
+    };
+};
+#pragma GCC diagnostic pop
+
 struct fastlock
 {
     volatile struct ticket m_ticket;
 
     volatile int m_pidOwner;
     volatile int m_depth;
+    unsigned futex;
 
 #ifdef __cplusplus
     fastlock()
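The ticket struct is wrapped in a union so the active/avail pair can be read as one 32-bit word (`ticket.u`), which both the C and assembly paths rely on; the pragmas silence -Wpedantic for the anonymous members. The layout is load-bearing because the hand-written x86-64 path in the remaining hunks addresses fields by fixed offsets: [rdi] m_active, [rdi+2] m_avail, [rdi+4] m_pidOwner, [rdi+8] m_depth, [rdi+12] futex. A sketch of compile-time checks that would pin this down (not part of the commit):

```cpp
// Layout checks matching the offsets hard-coded in the assembly (a sketch).
#include <cstddef>
#include "fastlock.h"

static_assert(sizeof(ticket) == 4, "m_active/m_avail pack into one dword");
static_assert(offsetof(fastlock, m_ticket) == 0, "ticket word read at [rdi]");
static_assert(offsetof(fastlock, m_pidOwner) == 4, "owner TID compared at [rdi+4]");
static_assert(offsetof(fastlock, m_depth) == 8, "recursion depth bumped at [rdi+8]");
static_assert(offsetof(fastlock, futex) == 12, "wait bitmask or'd/and'd at [rdi+12]");
```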
@@ -22,7 +22,7 @@ fastlock_lock:
     push rdi              ; we need our struct pointer (also balance the stack for the call)
     call gettid           ; get our thread ID (TLS is nasty in ASM so don't bother inlining)
     mov esi, eax          ; back it up in esi
-    mov rdi, [rsp]        ; get our pointer back
+    pop rdi               ; get our pointer back
 
     cmp [rdi+4], esi      ; Is the TID we got back the owner of the lock?
     je .LLocked           ; Don't spin in that case
@@ -30,11 +30,11 @@ fastlock_lock:
     xor eax, eax          ; eliminate partial register dependency
     inc eax               ; we want to add one
     lock xadd [rdi+2], ax ; do the xadd, ax contains the value before the addition
-                          ; eax now contains the ticket
-    xor ecx, ecx
+                          ; ax now contains the ticket
 ALIGN 16
 .LLoop:
-    cmp [rdi], ax         ; is our ticket up?
+    mov edx, [rdi]
+    cmp dx, ax            ; is our ticket up?
     je .LLocked           ; leave the loop
     pause
     add ecx, 1000h        ; Have we been waiting a long time? (oflow if we have)
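The widened load is deliberate: edx now carries a snapshot of the whole 32-bit ticket word, which the futex path below reuses as the syscall's expected-value argument, exactly as the C path passes `ticketT.u`. In C terms (`my_turn` and `snap` are illustrative, not from the commit):

```cpp
// What the new spin check does: one 32-bit load of the ticket word,
// then a 16-bit compare against our ticket.
static inline bool my_turn(volatile ticket *t, uint16_t myticket, unsigned *snap)
{
    *snap = __atomic_load_n(&t->u, __ATOMIC_ACQUIRE);  // mov edx, [rdi]
    return static_cast<uint16_t>(*snap) == myticket;   // cmp dx, ax
}
```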
@@ -44,22 +44,38 @@ ALIGN 16
 ; But the compiler doesn't know that we rarely hit this, and when we do we know the lock is
 ; taking a long time to be released anyways.  We optimize for the common case of short
 ; lock intervals.  That's why we're using a spinlock in the first place
+    inc edx
+    cmp dx, ax
+    je .LLoop
+    dec edx               ; restore the current ticket
+.LFutexWait:
     push rsi
     push rax
-    mov rax, 24           ; sys_sched_yield
-    syscall               ; give up our timeslice we'll be here a while
-    pop rax
-    pop rsi
+    ; Setup the syscall args
+    ; rdi ARG1 futex (already in rdi)
+    mov esi, (9 | 128)    ; rsi ARG2 FUTEX_WAIT_BITSET_PRIVATE
+    ; rdx ARG3 ticketT.u (already in edx)
+    xor r10d, r10d        ; r10 ARG4 NULL
+    mov r8, rdi           ; r8 ARG5 dup rdi
+    xor r9d, r9d
+    bts r9d, eax          ; r9 ARG6 mask
+    mov eax, 202          ; sys_futex
+    ; Do the syscall
+    lock or [rdi+12], r9d ; inform the unlocking thread we're waiting
+    syscall               ; wait for the futex
+    not r9d               ; convert our flag into a mask of bits not to touch
+    lock and [rdi+12], r9d ; clear the flag in the futex control mask
+    ; cleanup and continue
     mov rcx, g_longwaits
     inc qword [rcx]       ; increment our long wait counter
-    mov rdi, [rsp]        ; our struct pointer is on the stack already
+    pop rax
+    pop rsi
     xor ecx, ecx          ; Reset our loop counter
     jmp .LLoop            ; Get back in the game
 ALIGN 16
 .LLocked:
     mov [rdi+4], esi      ; lock->m_pidOwner = gettid()
     inc dword [rdi+8]     ; lock->m_depth++
-    add rsp, 8            ; fix stack
     ret
 
 ALIGN 16
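Decoding the immediates: 202 is __NR_futex on x86-64; FUTEX_WAIT_BITSET is 9, FUTEX_WAKE_BITSET is 10, and FUTEX_PRIVATE_FLAG is 128, so (9 | 128) and (10 | 128) select the private bitset wait and wake ops. `bts r9d, eax` sets bit (ticket mod 32), the same mask the C path computes as `1U << (myticket % 32)`. A quick check against the kernel headers (a sketch, x86-64 Linux assumed; not part of the commit):

```cpp
// Sanity checks for the constants hard-coded in the ASM path.
#include <linux/futex.h>
#include <sys/syscall.h>

static_assert(SYS_futex == 202, "mov eax, 202");
static_assert((FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG) == (9 | 128), "wait op");
static_assert((FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG) == (10 | 128), "wake op");
```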
@@ -114,9 +130,32 @@ fastlock_unlock:
 ; uint16_t avail
 ; int32_t m_pidOwner
 ; int32_t m_depth
+    push r11
     sub dword [rdi+8], 1  ; decrement m_depth, don't use dec because it partially writes the flag register and we don't know its state
     jnz .LDone            ; if depth is non-zero this is a recursive unlock, and we still hold it
     mov dword [rdi+4], -1 ; pidOwner = -1 (we don't own it anymore)
-    inc word [rdi]        ; give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
+    mov ecx, [rdi]        ; get current active (this one)
+    inc ecx               ; bump it to the next thread
+    mov [rdi], cx         ; give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
+    ; At this point the lock is released, however we must wake up any pending futexes
+    mov r9d, 1            ; r9d is the wake bitmask
+    rol r9d, cl           ; place the mask bit in the right spot for the next ticket (mod 32)
+ALIGN 16
+.LRetryWake:
+    mov r11d, [rdi+12]    ; load the futex mask
+    and r11d, r9d         ; are any threads waiting on a futex?
+    jz .LDone             ; if not we're done
+    ; we have to wake the futexes
+    ; rdi ARG1 futex (already in rdi)
+    mov esi, (10 | 128)   ; rsi ARG2 FUTEX_WAKE_BITSET_PRIVATE
+    mov edx, 0x7fffffff   ; rdx ARG3 INT_MAX (number of threads to wake)
+    xor r10d, r10d        ; r10 ARG4 NULL
+    mov r8, rdi           ; r8 ARG5 dup rdi
+    ; r9 ARG6 mask (already set above)
+    mov eax, 202          ; sys_futex
+    syscall
+    cmp eax, 1            ; did we wake as many as we expected?
+    jnz .LRetryWake
 .LDone:
+    pop r11
     ret
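The `.LRetryWake` loop is the assembly counterpart of unlock_futex above and closes the same race: a waiter may have advertised its bit in [rdi+12] without having reached futex_wait yet, in which case FUTEX_WAKE returns 0 and the wake must be retried until either the bit clears or one thread is woken (`cmp eax, 1`). In C terms (`wake_next` and `next_active` are illustrative names):

```cpp
// C counterpart of the .LRetryWake sequence (mirrors unlock_futex).
static void wake_next(struct fastlock *lock, uint16_t next_active)
{
    unsigned mask = 1U << (next_active % 32);  // mov r9d, 1 / rol r9d, cl
    while (__atomic_load_n(&lock->futex, __ATOMIC_ACQUIRE) & mask)  // test [rdi+12]
    {
        // A waiter may have set its bit but not yet reached futex_wait;
        // if the wake reports one thread woken we are done, else re-check.
        if (futex(&lock->m_ticket.u, FUTEX_WAKE_BITSET_PRIVATE, INT_MAX, nullptr, mask) == 1)
            break;
    }
}
```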