Fix failure to wake up from futex sleep caused by fastlock_unlock reading the wrong offset in the asm version. Also fix false sharing in the spinlock

Former-commit-id: 4c8603815cf525c75dcc360fddeab9ca6fe70ae6
John Sully 2019-11-17 16:06:49 -05:00
parent a27c1c9f0a
commit 72bbf16c2f
3 changed files with 40 additions and 29 deletions

src/fastlock.cpp

@@ -258,7 +258,10 @@ extern "C" void fastlock_init(struct fastlock *lock, const char *name)
     lock->m_depth = 0;
     lock->m_pidOwner = -1;
     lock->futex = 0;
-    lock->szName = name;
+    int cch = strlen(name);
+    cch = std::min<int>(cch, sizeof(lock->szName)-1);
+    memcpy(lock->szName, name, cch);
+    lock->szName[cch] = '\0';
     ANNOTATE_RWLOCK_CREATE(lock);
 }
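
The new fastlock_init stops storing the caller's const char * and instead copies the name into a fixed 56-byte buffer inside the struct, truncating long names and always NUL-terminating; as the header diff below shows, that inline array also serves as the padding that keeps the name on the owner's cache line. A minimal usage sketch (assuming, as the header suggests, that the C++ constructor forwards to fastlock_init):

#include <cassert>
#include <cstring>
#include "fastlock.h"

int main()
{
    // a name longer than the 56-byte inline buffer is truncated, never overrun
    fastlock lock("a-lock-name-longer-than-the-56-byte-inline-buffer-gets-truncated-safely");
    assert(strlen(lock.szName) == sizeof(lock.szName) - 1); // 55 bytes kept + NUL
    return 0;
}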

src/fastlock.h

@@ -1,5 +1,6 @@
 #pragma once
 #include <inttypes.h>
+#include <stddef.h>
 #ifdef __cplusplus
 extern "C" {
@@ -40,12 +41,13 @@ struct ticket
 struct fastlock
 {
-    volatile struct ticket m_ticket;
     volatile int m_pidOwner;
     volatile int m_depth;
+    char szName[56];
+    /* Volatile data on separate cache line */
+    volatile struct ticket m_ticket;
     unsigned futex;
-    const char *szName;
+    char padding[56]; // ensure ticket and futex are on their own independent cache line
 #ifdef __cplusplus
     fastlock(const char *name)
@@ -81,3 +83,5 @@ struct fastlock
     bool fOwnLock(); // true if this thread owns the lock, NOTE: not 100% reliable, use for debugging only
 #endif
 };
+
+static_assert(offsetof(struct fastlock, m_ticket) == 64, "ensure padding is correct");
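
The reordered layout puts the owner-written fields (m_pidOwner, m_depth) and the name on the first 64-byte cache line and gives the contended m_ticket and futex words their own line at offset 64, which is the false-sharing fix from the commit message: spinning waiters no longer bounce the owner's cache line. A sketch, not part of the commit, extending the static_assert above to every offset the x64 assembly below depends on:

#include <cstddef>
#include "fastlock.h"

static_assert(offsetof(struct fastlock, m_pidOwner) ==  0, "asm: owner TID at [rdi]");
static_assert(offsetof(struct fastlock, m_depth)    ==  4, "asm: depth at [rdi+4]");
static_assert(offsetof(struct fastlock, m_ticket)   == 64, "asm: ticket at [rdi+64]");
static_assert(offsetof(struct fastlock, futex)      == 68, "asm: futex mask at [rdi+4] after add rdi, 64");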

src/fastlock_x64.asm

@@ -16,10 +16,11 @@ fastlock_lock:
        .cfi_startproc
        .cfi_def_cfa rsp, 8
        # RDI points to the struct:
-       #     uint16_t active
-       #     uint16_t avail
        #     int32_t m_pidOwner
        #     int32_t m_depth
+       # [rdi+64] ...
+       #     uint16_t active
+       #     uint16_t avail
        # First get our TID and put it in ecx
        push rdi                   # we need our struct pointer (also balance the stack for the call)
@@ -29,18 +30,18 @@ fastlock_lock:
        pop rdi                    # get our pointer back
        .cfi_adjust_cfa_offset -8
-       cmp [rdi+4], esi           # Is the TID we got back the owner of the lock?
+       cmp [rdi], esi             # Is the TID we got back the owner of the lock?
        je .LLocked                # Don't spin in that case
        xor eax, eax               # eliminate partial register dependency
        inc eax                    # we want to add one
-       lock xadd [rdi+2], ax      # do the xadd, ax contains the value before the addition
+       lock xadd [rdi+66], ax     # do the xadd, ax contains the value before the addition
        # ax now contains the ticket
        # OK Start the wait loop
        xor ecx, ecx
        .ALIGN 16
.LLoop:
-       mov edx, [rdi]
+       mov edx, [rdi+64]
        cmp dx, ax                 # is our ticket up?
        je .LLocked                # leave the loop
        pause
@@ -72,8 +73,8 @@ fastlock_lock:
        jmp .LLoop                 # Get back in the game
        .ALIGN 16
.LLocked:
-       mov [rdi+4], esi           # lock->m_pidOwner = gettid()
-       inc dword ptr [rdi+8]      # lock->m_depth++
+       mov [rdi], esi             # lock->m_pidOwner = gettid()
+       inc dword ptr [rdi+4]      # lock->m_depth++
        ret
        .cfi_endproc
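
fastlock_lock is a ticket lock: the lock xadd on the avail half of m_ticket hands out the next ticket, and the thread spins on the active half until its number comes up (the futex sleep taken after spinning too long sits outside the hunks shown). A rough C++ equivalent of the spin path, offered as a sketch rather than the shipped implementation:

#include <atomic>
#include <cstdint>
#include <immintrin.h> // _mm_pause

struct ticket_lock_sketch {
    std::atomic<uint16_t> active{0}; // [rdi+64]
    std::atomic<uint16_t> avail{0};  // [rdi+66]

    void lock() {
        // lock xadd [rdi+66], ax: atomically take the next ticket
        uint16_t ticket = avail.fetch_add(1, std::memory_order_relaxed);
        // .LLoop: wait until our ticket is the active one
        while (active.load(std::memory_order_acquire) != ticket)
            _mm_pause(); // pause: yield pipeline resources while spinning
    }
};
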
@@ -82,10 +83,11 @@ fastlock_lock:
        .type fastlock_trylock,@function
fastlock_trylock:
        # RDI points to the struct:
-       #     uint16_t active
-       #     uint16_t avail
        #     int32_t m_pidOwner
        #     int32_t m_depth
+       # [rdi+64] ...
+       #     uint16_t active
+       #     uint16_t avail
        # First get our TID and put it in ecx
        push rdi                   # we need our struct pointer (also balance the stack for the call)
@@ -93,10 +95,10 @@ fastlock_trylock:
        mov esi, eax               # back it up in esi
        pop rdi                    # get our pointer back
-       cmp [rdi+4], esi           # Is the TID we got back the owner of the lock?
+       cmp [rdi], esi             # Is the TID we got back the owner of the lock?
        je .LRecursive             # Don't spin in that case
-       mov eax, [rdi]             # get both active and avail counters
+       mov eax, [rdi+64]          # get both active and avail counters
        mov ecx, eax               # duplicate in ecx
        ror ecx, 16                # swap upper and lower 16-bits
        cmp eax, ecx               # are the upper and lower 16-bits the same?
@@ -104,18 +106,18 @@ fastlock_trylock:
        # at this point we know eax+ecx have [avail][active] and they are both the same
        add ecx, 0x10000           # increment avail, ecx is now our wanted value
-       lock cmpxchg [rdi], ecx    # If rdi still contains the value in eax, put in ecx (inc avail)
+       lock cmpxchg [rdi+64], ecx # If rdi still contains the value in eax, put in ecx (inc avail)
        jnz .LAlreadyLocked        # If Z is not set then someone locked it while we were preparing
        xor eax, eax
        inc eax                    # return SUCCESS! (eax=1)
-       mov [rdi+4], esi           # lock->m_pidOwner = gettid()
-       mov dword ptr [rdi+8], eax # lock->m_depth = 1
+       mov [rdi], esi             # lock->m_pidOwner = gettid()
+       mov dword ptr [rdi+4], eax # lock->m_depth = 1
        ret
        .ALIGN 16
.LRecursive:
        xor eax, eax
        inc eax                    # return SUCCESS! (eax=1)
-       inc dword ptr [rdi+8]      # lock->m_depth++
+       inc dword ptr [rdi+4]      # lock->m_depth++
        ret
        .ALIGN 16
.LAlreadyLocked:
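
fastlock_trylock reads active and avail as one 32-bit word: the lock is free exactly when the two halves are equal, so a single lock cmpxchg that increments avail (the high half, hence add ecx, 0x10000) claims the lock without spinning. The same logic as a hedged C++ sketch:

#include <atomic>
#include <cstdint>

struct ticket_lock_sketch32 {
    // low 16 bits = active, high 16 bits = avail, matching the asm's 32-bit view
    std::atomic<uint32_t> ticket{0};

    bool try_lock() {
        uint32_t snap = ticket.load(std::memory_order_relaxed);
        if ((snap & 0xffffu) != (snap >> 16))
            return false; // tickets outstanding: held or contended, don't wait
        // add ecx, 0x10000 / lock cmpxchg: claim by bumping avail
        return ticket.compare_exchange_strong(snap, snap + 0x10000,
                                              std::memory_order_acquire);
    }
};
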
@@ -126,23 +128,25 @@
        .global fastlock_unlock
fastlock_unlock:
        # RDI points to the struct:
-       #     uint16_t active
-       #     uint16_t avail
        #     int32_t m_pidOwner
        #     int32_t m_depth
+       # [rdi+64] ...
+       #     uint16_t active
+       #     uint16_t avail
        push r11
-       sub dword ptr [rdi+8], 1   # decrement m_depth, don't use dec because it partially writes the flag register and we don't know its state
+       sub dword ptr [rdi+4], 1   # decrement m_depth, don't use dec because it partially writes the flag register and we don't know its state
        jnz .LDone                 # if depth is non-zero this is a recursive unlock, and we still hold it
-       mov dword ptr [rdi+4], -1  # pidOwner = -1 (we don't own it anymore)
-       mov ecx, [rdi]             # get current active (this one)
+       mov dword ptr [rdi], -1    # pidOwner = -1 (we don't own it anymore)
+       mov ecx, [rdi+64]          # get current active (this one)
        inc ecx                    # bump it to the next thread
-       mov [rdi], cx              # give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
+       mov [rdi+64], cx           # give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
        # At this point the lock is removed; however, we must wake up any pending futexes
        mov r9d, 1                 # r9d is the bitmask for 2 threads
        rol r9d, cl                # place the mask in the right spot for the next 2 threads
+       add rdi, 64                # rdi now points to the token
        .ALIGN 16
.LRetryWake:
-       mov r11d, [rdi+12]         # load the futex mask
+       mov r11d, [rdi+4]          # load the futex mask
        and r11d, r9d              # are any threads waiting on a futex?
        jz .LDone                  # if not we're done.
        # we have to wake the futexes
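
This wake path is where the missed-wakeup bug lived: sleepers wait on the futex word with a per-ticket bit (the mask built by rol r9d, cl), and after the ticket data moved to offset 64 the mask must be read from [rdi+4] once rdi has been advanced by add rdi, 64; per the commit message, the old code read the wrong offset, so waiters were never woken. A hedged sketch of the wake call itself, assuming the syscall the asm ultimately issues is Linux's FUTEX_WAKE_BITSET:

#include <climits>
#include <cstdint>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

// Wake every sleeper whose registered bitset overlaps mask
// (e.g. mask = 1u << (next_ticket % 32), mirroring rol r9d, cl above).
static long futex_wake_bitset(volatile unsigned *uaddr, uint32_t mask)
{
    return syscall(SYS_futex, (const void *)uaddr, FUTEX_WAKE_BITSET,
                   INT_MAX, nullptr, nullptr, mask);
}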