diff --git a/src/fastlock.cpp b/src/fastlock.cpp
index a1499d77b..f265f3908 100644
--- a/src/fastlock.cpp
+++ b/src/fastlock.cpp
@@ -111,7 +111,6 @@ extern "C" int fastlock_trylock(struct fastlock *lock)
     }
     return false;
 }
-#endif
 
 extern "C" void fastlock_unlock(struct fastlock *lock)
 {
@@ -121,9 +120,10 @@ extern "C" void fastlock_unlock(struct fastlock *lock)
         assert((int)__atomic_load_4(&lock->m_pidOwner, __ATOMIC_RELAXED) >= 0);    // unlock after free
         lock->m_pidOwner = -1;
         std::atomic_thread_fence(std::memory_order_acquire);
-        __atomic_fetch_add(&lock->m_ticket.m_active, 1, __ATOMIC_ACQ_REL);
+        __atomic_fetch_add(&lock->m_ticket.m_active, 1, __ATOMIC_ACQ_REL);         // on x86 the atomic is not required here, but ASM handles that case
     }
 }
+#endif
 
 extern "C" void fastlock_free(struct fastlock *lock)
 {
diff --git a/src/fastlock_x64.asm b/src/fastlock_x64.asm
index d2a1a90b3..1b876350f 100644
--- a/src/fastlock_x64.asm
+++ b/src/fastlock_x64.asm
@@ -6,8 +6,7 @@ extern sched_yield
 ; This is the first use of assembly in this codebase, a valid question is WHY?
 ; The spinlock we implement here is performance critical, and simply put GCC
 ; emits awful code. The original C code is left in fastlock.cpp for reference
-; and x-plat. The code generated for the unlock case is reasonable and left in
-; C++.
+; and x-plat.
 
 ALIGN 16
 global fastlock_lock
@@ -103,3 +102,18 @@ ALIGN 16
 .LAlreadyLocked:
         xor eax, eax                ; return 0;
         ret
+
+ALIGN 16
+global fastlock_unlock
+fastlock_unlock:
+        ; RDI points to the struct:
+        ;       uint16_t active
+        ;       uint16_t avail
+        ;       int32_t m_pidOwner
+        ;       int32_t m_depth
+        sub dword [rdi+8], 1        ; decrement m_depth, don't use dec because it partially writes the flag register and we don't know its state
+        jnz .LDone                  ; if depth is non-zero this is a recursive unlock, and we still hold it
+        mov dword [rdi+4], -1       ; pidOwner = -1 (we don't own it anymore)
+        inc word [rdi]              ; give up our ticket (note: lock is not required here because the spinlock itself guards this variable)
+.LDone:
+        ret
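
For anyone comparing the new assembly against the C++ reference path above, here is a minimal C++ sketch of the same unlock sequence. It is an illustration only: the struct and function names below (fastlock_sketch, fastlock_unlock_sketch) are hypothetical, the layout is taken from the comments in the asm (active at +0, avail at +2, m_pidOwner at +4, m_depth at +8), and the real code reaches the ticket through lock->m_ticket.m_active rather than a flattened field.

// Illustrative sketch only -- layout and names are assumed from the asm
// comments, not copied from the repo's headers.
#include <cstdint>

struct fastlock_sketch
{
    uint16_t active;        // ticket currently being served   (offset 0)
    uint16_t avail;         // next ticket to hand out         (offset 2)
    int32_t  m_pidOwner;    // owner thread id, -1 when unowned (offset 4)
    int32_t  m_depth;       // recursion depth of the owner     (offset 8)
};

void fastlock_unlock_sketch(fastlock_sketch *lock)
{
    // Mirrors: sub dword [rdi+8], 1 / jnz .LDone
    if (--lock->m_depth != 0)
        return;                     // recursive unlock, we still hold the lock

    // Mirrors: mov dword [rdi+4], -1
    lock->m_pidOwner = -1;          // we no longer own the lock

    // Mirrors: inc word [rdi]. The ACQ_REL RMW (same GCC builtin the C++
    // reference path uses) supplies the release ordering that hands the lock
    // to the next ticket holder; the asm gets away with a plain inc because
    // of x86 store ordering and because only the owner ever writes 'active'.
    __atomic_fetch_add(&lock->active, 1, __ATOMIC_ACQ_REL);
}

The point of the asm version is that none of these three writes needs a locked instruction: m_depth and m_pidOwner are only ever touched by the current owner, and the final increment of active is what actually publishes the unlock to spinning waiters.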