futriix/src/fastlock_x64.asm

section .text
extern gettid
extern sched_yield
extern g_longwaits
extern registerwait
extern clearwait
; This is the first use of assembly in this codebase, so a fair question is: why?
; The spinlock we implement here is performance critical, and, simply put, GCC
; emits awful code for it. The original C code is left in fastlock.cpp for
; reference and for cross-platform builds.
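;
; Overview: this is a ticket spinlock with a futex fallback for long waits.
; Per the offsets used below, the lock struct looks roughly like the following
; C sketch (a reading aid only; the name of the offset-12 field is an assumption,
; the code below relies only on the offsets):
;   struct fastlock {
;       volatile uint16_t active;     // offset 0:  ticket currently being served
;       volatile uint16_t avail;      // offset 2:  next ticket to hand out
;       volatile int32_t  m_pidOwner; // offset 4:  TID of the holder, -1 if unowned
;       volatile int32_t  m_depth;    // offset 8:  recursion depth
;       volatile uint32_t futexbits;  // offset 12: one wait bit per sleeper (bit = ticket % 32)
;   };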
ALIGN 16
global fastlock_lock
fastlock_lock:
; RDI points to the struct:
; uint16_t active
; uint16_t avail
; int32_t m_pidOwner
; int32_t m_depth
; First get our TID and stash it in esi
push rdi ; we need our struct pointer (also balance the stack for the call)
call gettid ; get our thread ID (TLS is nasty in ASM so don't bother inlining)
mov esi, eax ; back it up in esi
pop rdi ; get our pointer back
cmp [rdi+4], esi ; Is the TID we got back the owner of the lock?
je .LLocked ; we already own it: recursive acquire, no need to spin
xor eax, eax ; eliminate partial register dependency
inc eax ; we want to add one
lock xadd [rdi+2], ax ; do the xadd, ax contains the value before the addition
; ax now contains the ticket
mov edx, [rdi]
cmp dx, ax ; is our ticket up?
je .LLocked ; no need to loop
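; (In C terms the fast path above is roughly: ticket = avail++; if (active == ticket) we hold
;  the lock. This is a sketch only, not the exact code in fastlock.cpp.)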
; Lock is contended, so inform the deadlock detector
push rax
push rdi
push rsi
call registerwait
pop rsi
pop rdi
pop rax
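; (registerwait, like clearwait further down, receives the lock pointer in rdi and our TID
;  in esi per the SysV calling convention; rax, rdi and rsi are saved and restored around
;  the call because they are caller-saved registers.)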
; OK Start the wait loop
xor ecx, ecx
ALIGN 16
.LLoop:
mov edx, [rdi]
cmp dx, ax ; is our ticket up?
je .LExitLoop ; leave the loop
pause
add ecx, 1000h ; Have we been waiting a long time? (the add sets carry if we have)
; 1000h is chosen so carry is set on the 1024*1024'th iteration (like the C code)
jnc .LLoop ; not yet: keep spinning. On carry fall through and sleep in the kernel,
; giving our timeslice to someone who's doing real work
; Like the compiler, you're probably thinking: "Hey! I should take these pushes out of the loop"
; But the compiler doesn't know that we rarely hit this path, and when we do we know the lock
; is taking a long time to be released anyway. We optimize for the common case of short
; lock intervals. That's why we're using a spinlock in the first place.
; If we get here we're going to sleep in the kernel with a futex
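; futex(2) via raw syscall on x86-64: rax = 202 (SYS_futex) and the arguments go in
; rdi, rsi, rdx, r10, r8, r9, i.e. futex(uaddr, op, val, timeout, uaddr2, val3).
; Here uaddr is the ticket word (rdi), op is FUTEX_WAIT_BITSET_PRIVATE (9|128), val is
; the ticket word we last observed (edx), timeout is NULL, uaddr2 is ignored for this op,
; and val3 is our wait bit, 1 << (ticket % 32). The kernel only puts us to sleep if
; *uaddr still equals val, which closes the race with an unlock that slips in between
; our last check and the syscall.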
push rsi
push rax
; Setup the syscall args
; rdi ARG1 futex (already in rdi)
mov esi, (9 | 128) ; rsi ARG2 FUTEX_WAIT_BITSET_PRIVATE
; rdx ARG3 ticketT.u (already in edx)
xor r10d, r10d ; r10 ARG4 NULL
mov r8, rdi ; r8 ARG5 dup rdi
xor r9d, r9d
bts r9d, eax ; r9 ARG6 mask
mov eax, 202 ; sys_futex
; Do the syscall
lock or [rdi+12], r9d ; inform the unlocking thread we're waiting
syscall ; wait for the futex
not r9d ; convert our flag into a mask of bits not to touch
lock and [rdi+12], r9d ; clear the flag in the futex control mask
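; The syscall may return because of a genuine wake, a spurious wake, or an immediate
; EAGAIN (the ticket word changed before we slept); all cases are handled the same way:
; clear our bit (above) and re-check our ticket back in .LLoop.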
; cleanup and continue
mov rcx, g_longwaits
inc qword [rcx] ; increment our long wait counter
pop rax
pop rsi
xor ecx, ecx ; Reset our loop counter
jmp .LLoop ; Get back in the game
ALIGN 16
.LExitLoop:
push rsi
push rdi
call clearwait
pop rdi
pop rsi
ALIGN 16
.LLocked:
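; Reached both on a fresh acquire (depth was 0 and becomes 1) and on a recursive
; acquire by the current owner (depth just increments); rewriting m_pidOwner with
; our own TID is harmless in the recursive case.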
mov [rdi+4], esi ; lock->m_pidOwner = gettid()
inc dword [rdi+8] ; lock->m_depth++
ret
ALIGN 16
global fastlock_trylock
fastlock_trylock:
; RDI points to the struct:
; uint16_t active
; uint16_t avail
; int32_t m_pidOwner
; int32_t m_depth
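; In C-like pseudocode the non-recursive path below is roughly (a sketch, not the exact
; code in fastlock.cpp):
;   uint32_t snap = *(uint32_t *)lock;     // one load gets [avail:active]
;   if ((snap >> 16) != (snap & 0xffff))   // avail != active: somebody holds the lock
;       return 0;
;   uint32_t want = snap + 0x10000;        // take a ticket (avail++), active unchanged
;   if (!cas(lock, snap, want)) return 0;  // lost the race: report failure
;   /* record ownership, set depth = 1, return 1 */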
; First get our TID and stash it in esi
push rdi ; we need our struct pointer (also balance the stack for the call)
call gettid ; get our thread ID (TLS is nasty in ASM so don't bother inlining)
mov esi, eax ; back it up in esi
pop rdi ; get our pointer back
cmp [rdi+4], esi ; Is the TID we got back the owner of the lock?
je .LRecursive ; we already own it: just bump the depth
mov eax, [rdi] ; get both active and avail counters
mov ecx, eax ; duplicate in ecx
ror ecx, 16 ; swap upper and lower 16-bits
cmp eax, ecx ; are the upper and lower 16-bits the same?
jnz .LAlreadyLocked ; If not return failure
; at this point we know active == avail (the lock is free), so eax and ecx hold the same value
add ecx, 10000h ; increment avail; ecx is now the value we want to write
lock cmpxchg [rdi], ecx ; if [rdi] still equals eax, store ecx (bump avail; active stays equal to our ticket)
jnz .LAlreadyLocked ; if Z is not set someone locked it while we were preparing
xor eax, eax
inc eax ; return SUCCESS! (eax=1)
mov [rdi+4], esi ; lock->m_pidOwner = gettid()
mov dword [rdi+8], eax ; lock->m_depth = 1
ret
ALIGN 16
.LRecursive:
xor eax, eax
inc eax ; return SUCCESS! (eax=1)
inc dword [rdi+8] ; lock->m_depth++
ret
ALIGN 16
.LAlreadyLocked:
xor eax, eax ; return 0;
ret
ALIGN 16
global fastlock_unlock
fastlock_unlock:
; RDI points to the struct:
; uint16_t active
; uint16_t avail
; int32_t m_pidOwner
; int32_t m_depth
push r11
sub dword [rdi+8], 1 ; decrement m_depth; don't use dec, it leaves CF untouched, a partial flags update that depends on the previous (unknown) flags state
jnz .LDone ; if depth is non-zero this is a recursive unlock, and we still hold it
mov dword [rdi+4], -1 ; pidOwner = -1 (we don't own it anymore)
mov ecx, [rdi] ; load the ticket word; cx holds active, i.e. the ticket we are serving
inc ecx ; bump it to the next thread
mov [rdi], cx ; give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
; At this point the lock is released; however, we must wake any threads sleeping on the futex
mov r9d, 1 ; r9d will be the wait-bit mask for the next ticket holder
rol r9d, cl ; rotate it into position: bit (new active mod 32)
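; This is the same bit each waiter computes with 'bts r9d, eax' on the lock path, so it
; selects the thread holding the new active ticket. (Tickets 32 apart alias onto one bit;
; at worst that causes an extra wakeup and another spin, never a missed wakeup.)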
ALIGN 16
.LRetryWake:
mov r11d, [rdi+12] ; load the futex mask
and r11d, r9d ; are any threads waiting on a futex?
jz .LDone ; if not we're done.
; we have to wake the futex waiters
; rdi ARG1 futex (already in rdi)
mov esi, (10 | 128) ; rsi ARG2 FUTEX_WAKE_BITSET_PRIVATE
mov edx, 0x7fffffff ; rdx ARG3 INT_MAX (number of threads to wake)
xor r10d, r10d ; r10 ARG4 NULL
mov r8, rdi ; r8 ARG5 dup rdi
; r9 ARG6 mask (already set above)
mov eax, 202 ; sys_futex
syscall
cmp eax, 1 ; did we wake as many as we expected?
jnz .LRetryWake
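; A return value of 0 means the next holder has published its bit but hasn't entered
; futex_wait yet (or has already noticed active changed and is on its way out); keep
; retrying until we either wake it or its bit disappears from the mask at [rdi+12].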
.LDone:
pop r11
ret