* [PATCH] riscv: locks: introduce ticket-based spinlock implementation
From: guoren @ 2021-03-24 10:14 UTC (permalink / raw)
  To: guoren
  Cc: linux-riscv, linux-kernel, Guo Ren, Catalin Marinas, Will Deacon,
	Peter Zijlstra, Palmer Dabbelt, Anup Patel, Arnd Bergmann

From: Guo Ren <guoren@linux.alibaba.com>

This patch introduces a ticket lock implementation for riscv, along the
same lines as the implementation for arch/arm & arch/csky.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Palmer Dabbelt <palmerdabbelt@google.com>
Cc: Anup Patel <anup@brainfault.org>
Cc: Arnd Bergmann <arnd@arndb.de>
---
 arch/riscv/Kconfig                      |   1 +
 arch/riscv/include/asm/Kbuild           |   1 +
 arch/riscv/include/asm/spinlock.h       | 158 ++++++++++++--------------------
 arch/riscv/include/asm/spinlock_types.h |  19 ++--
 4 files changed, 74 insertions(+), 105 deletions(-)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 87d7b52..7c56a20 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -30,6 +30,7 @@ config RISCV
 	select ARCH_HAS_STRICT_KERNEL_RWX if MMU
 	select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT
+	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index 445ccc9..e57ef80 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -3,5 +3,6 @@ generic-y += early_ioremap.h
 generic-y += extable.h
 generic-y += flat.h
 generic-y += kvm_para.h
+generic-y += qrwlock.h
 generic-y += user.h
 generic-y += vmlinux.lds.h
diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
index f4f7fa1..2c81764 100644
--- a/arch/riscv/include/asm/spinlock.h
+++ b/arch/riscv/include/asm/spinlock.h
@@ -7,129 +7,91 @@
 #ifndef _ASM_RISCV_SPINLOCK_H
 #define _ASM_RISCV_SPINLOCK_H
 
-#include <linux/kernel.h>
-#include <asm/current.h>
-#include <asm/fence.h>
-
 /*
- * Simple spin lock operations.  These provide no fairness guarantees.
+ * Ticket-based spin-locking.
  */
+static inline void arch_spin_lock(arch_spinlock_t *lock)
+{
+	arch_spinlock_t lockval;
+	u32 tmp;
+
+	asm volatile (
+		"1:	lr.w	%0, %2		\n"
+		"	mv	%1, %0		\n"
+		"	addw	%0, %0, %3	\n"
+		"	sc.w	%0, %0, %2	\n"
+		"	bnez	%0, 1b		\n"
+		: "=&r" (tmp), "=&r" (lockval), "+A" (lock->lock)
+		: "r" (1 << TICKET_NEXT)
+		: "memory");
 
-/* FIXME: Replace this with a ticket lock, like MIPS. */
-
-#define arch_spin_is_locked(x)	(READ_ONCE((x)->lock) != 0)
+	while (lockval.tickets.next != lockval.tickets.owner) {
+		/*
+		 * FIXME - we need wfi/wfe here to prevent:
+		 *  - cache line bouncing
+		 *  - saving cpu pipeline in multi-harts-per-core
+		 *    processor
+		 */
+		lockval.tickets.owner = READ_ONCE(lock->tickets.owner);
+	}
 
-static inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
-	smp_store_release(&lock->lock, 0);
+	__atomic_acquire_fence();
 }
 
 static inline int arch_spin_trylock(arch_spinlock_t *lock)
 {
-	int tmp = 1, busy;
-
-	__asm__ __volatile__ (
-		"	amoswap.w %0, %2, %1\n"
-		RISCV_ACQUIRE_BARRIER
-		: "=r" (busy), "+A" (lock->lock)
-		: "r" (tmp)
+	u32 tmp, contended, res;
+
+	do {
+		asm volatile (
+		"	lr.w	%0, %3		\n"
+		"	srliw	%1, %0, %5	\n"
+		"	slliw	%2, %0, %5	\n"
+		"	or	%1, %2, %1	\n"
+		"	li	%2, 0		\n"
+		"	sub	%1, %1, %0	\n"
+		"	bnez	%1, 1f		\n"
+		"	addw	%0, %0, %4	\n"
+		"	sc.w	%2, %0, %3	\n"
+		"1:				\n"
+		: "=&r" (tmp), "=&r" (contended), "=&r" (res),
+		  "+A" (lock->lock)
+		: "r" (1 << TICKET_NEXT), "I" (TICKET_NEXT)
 		: "memory");
+	} while (res);
 
-	return !busy;
-}
-
-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-	while (1) {
-		if (arch_spin_is_locked(lock))
-			continue;
-
-		if (arch_spin_trylock(lock))
-			break;
+	if (!contended) {
+		__atomic_acquire_fence();
+		return 1;
+	} else {
+		return 0;
 	}
 }
 
-/***********************************************************/
-
-static inline void arch_read_lock(arch_rwlock_t *lock)
+static inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
-	int tmp;
-
-	__asm__ __volatile__(
-		"1:	lr.w	%1, %0\n"
-		"	bltz	%1, 1b\n"
-		"	addi	%1, %1, 1\n"
-		"	sc.w	%1, %1, %0\n"
-		"	bnez	%1, 1b\n"
-		RISCV_ACQUIRE_BARRIER
-		: "+A" (lock->lock), "=&r" (tmp)
-		:: "memory");
+	smp_store_release(&lock->tickets.owner, lock->tickets.owner + 1);
+	/* FIXME - we need ipi/sev here to notify above */
 }
 
-static inline void arch_write_lock(arch_rwlock_t *lock)
+static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
-	int tmp;
-
-	__asm__ __volatile__(
-		"1:	lr.w	%1, %0\n"
-		"	bnez	%1, 1b\n"
-		"	li	%1, -1\n"
-		"	sc.w	%1, %1, %0\n"
-		"	bnez	%1, 1b\n"
-		RISCV_ACQUIRE_BARRIER
-		: "+A" (lock->lock), "=&r" (tmp)
-		:: "memory");
+	return lock.tickets.owner == lock.tickets.next;
 }
 
-static inline int arch_read_trylock(arch_rwlock_t *lock)
+static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
-	int busy;
-
-	__asm__ __volatile__(
-		"1:	lr.w	%1, %0\n"
-		"	bltz	%1, 1f\n"
-		"	addi	%1, %1, 1\n"
-		"	sc.w	%1, %1, %0\n"
-		"	bnez	%1, 1b\n"
-		RISCV_ACQUIRE_BARRIER
-		"1:\n"
-		: "+A" (lock->lock), "=&r" (busy)
-		:: "memory");
-
-	return !busy;
+	return !arch_spin_value_unlocked(READ_ONCE(*lock));
 }
 
-static inline int arch_write_trylock(arch_rwlock_t *lock)
+static inline int arch_spin_is_contended(arch_spinlock_t *lock)
 {
-	int busy;
-
-	__asm__ __volatile__(
-		"1:	lr.w	%1, %0\n"
-		"	bnez	%1, 1f\n"
-		"	li	%1, -1\n"
-		"	sc.w	%1, %1, %0\n"
-		"	bnez	%1, 1b\n"
-		RISCV_ACQUIRE_BARRIER
-		"1:\n"
-		: "+A" (lock->lock), "=&r" (busy)
-		:: "memory");
+	struct __raw_tickets tickets = READ_ONCE(lock->tickets);
 
-	return !busy;
+	return (tickets.next - tickets.owner) > 1;
 }
+#define arch_spin_is_contended	arch_spin_is_contended
 
-static inline void arch_read_unlock(arch_rwlock_t *lock)
-{
-	__asm__ __volatile__(
-		RISCV_RELEASE_BARRIER
-		"	amoadd.w x0, %1, %0\n"
-		: "+A" (lock->lock)
-		: "r" (-1)
-		: "memory");
-}
-
-static inline void arch_write_unlock(arch_rwlock_t *lock)
-{
-	smp_store_release(&lock->lock, 0);
-}
+#include <asm/qrwlock.h>
 
 #endif /* _ASM_RISCV_SPINLOCK_H */
diff --git a/arch/riscv/include/asm/spinlock_types.h b/arch/riscv/include/asm/spinlock_types.h
index f398e76..d7b38bf 100644
--- a/arch/riscv/include/asm/spinlock_types.h
+++ b/arch/riscv/include/asm/spinlock_types.h
@@ -10,16 +10,21 @@
 # error "please don't include this file directly"
 #endif
 
+#define TICKET_NEXT	16
+
 typedef struct {
-	volatile unsigned int lock;
+	union {
+		u32 lock;
+		struct __raw_tickets {
+			/* little endian */
+			u16 owner;
+			u16 next;
+		} tickets;
+	};
 } arch_spinlock_t;
 
-#define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
-
-typedef struct {
-	volatile unsigned int lock;
-} arch_rwlock_t;
+#define __ARCH_SPIN_LOCK_UNLOCKED	{ { 0 } }
 
-#define __ARCH_RW_LOCK_UNLOCKED		{ 0 }
+#include <asm-generic/qrwlock_types.h>
 
 #endif /* _ASM_RISCV_SPINLOCK_TYPES_H */
-- 
2.7.4
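
For readers following along: the patch packs two 16-bit counters into one
32-bit word, and the lr.w/sc.w loop atomically takes a ticket by adding
1 << TICKET_NEXT. Below is a minimal userspace C sketch of the same idea,
using GCC __atomic builtins instead of inline assembly; the type and
function names are illustrative, not from the patch, and a little-endian
layout is assumed (as on RISC-V).

#include <stdint.h>

/* Mirrors arch_spinlock_t from the patch. */
typedef union {
	uint32_t lock;
	struct {
		uint16_t owner;	/* low half: ticket being served */
		uint16_t next;	/* high half: next ticket to hand out */
	} tickets;
} ticket_lock_t;

static void ticket_lock(ticket_lock_t *l)
{
	ticket_lock_t old;

	/* Atomically take a ticket: bump "next", keep the old word. */
	old.lock = __atomic_fetch_add(&l->lock, 1 << 16, __ATOMIC_RELAXED);

	/*
	 * Wait until the owner counter reaches our ticket (FIFO order).
	 * This re-read loop is the cache-line traffic the FIXME above
	 * wants to avoid with a wfi/wfe-style instruction.
	 */
	while (__atomic_load_n(&l->tickets.owner, __ATOMIC_ACQUIRE) !=
	       old.tickets.next)
		;
}

static void ticket_unlock(ticket_lock_t *l)
{
	/* Hand over to the next ticket; u16 wrap-around is harmless. */
	__atomic_store_n(&l->tickets.owner, l->tickets.owner + 1,
			 __ATOMIC_RELEASE);
}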


* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
From: Peter Zijlstra @ 2021-03-24 11:09 UTC (permalink / raw)
  To: guoren
  Cc: linux-riscv, linux-kernel, Guo Ren, Catalin Marinas, Will Deacon,
	Palmer Dabbelt, Anup Patel, Arnd Bergmann

On Wed, Mar 24, 2021 at 10:14:52AM +0000, guoren@kernel.org wrote:
> +static inline void arch_spin_lock(arch_spinlock_t *lock)
> +{
> +	arch_spinlock_t lockval;
> +	u32 tmp;
> +
> +	asm volatile (
> +		"1:	lr.w	%0, %2		\n"
> +		"	mv	%1, %0		\n"
> +		"	addw	%0, %0, %3	\n"
> +		"	sc.w	%0, %0, %2	\n"
> +		"	bnez	%0, 1b		\n"
> +		: "=&r" (tmp), "=&r" (lockval), "+A" (lock->lock)
> +		: "r" (1 << TICKET_NEXT)
> +		: "memory");
>  
> +	while (lockval.tickets.next != lockval.tickets.owner) {
> +		/*
> +		 * FIXME - we need wfi/wfe here to prevent:
> +		 *  - cache line bouncing
> +		 *  - saving cpu pipeline in multi-harts-per-core
> +		 *    processor
> +		 */
> +		lockval.tickets.owner = READ_ONCE(lock->tickets.owner);
> +	}
>  
> +	__atomic_acquire_fence();
>  }

> +static inline void arch_spin_unlock(arch_spinlock_t *lock)
>  {
> +	smp_store_release(&lock->tickets.owner, lock->tickets.owner + 1);
> +	/* FIXME - we need ipi/sev here to notify above */
>  }

Urgh, are you saying your WFE requires an explicit SEV like on ARM? The
ARM64 model is far superior IMO, and then you can use
smp_cond_load_acquire() in arch_spin_lock() and call it a day.
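
For reference, smp_cond_load_acquire(ptr, cond) is the generic kernel
helper that keeps loading *ptr (the loaded value is available as VAL
inside cond) until the condition holds, with acquire ordering on the
final load; arm64 backs it with WFE so waiters doze instead of hammering
the cache line. An untested sketch of the lock path rewritten along
those lines, reusing the ticket-take asm from the patch:

static inline void arch_spin_lock(arch_spinlock_t *lock)
{
	arch_spinlock_t lockval;
	u32 tmp;

	/* Atomically take a ticket, same sequence as the patch. */
	asm volatile (
		"1:	lr.w	%0, %2		\n"
		"	mv	%1, %0		\n"
		"	addw	%0, %0, %3	\n"
		"	sc.w	%0, %0, %2	\n"
		"	bnez	%0, 1b		\n"
		: "=&r" (tmp), "=&r" (lockval), "+A" (lock->lock)
		: "r" (1 << TICKET_NEXT)
		: "memory");

	if (lockval.tickets.next == lockval.tickets.owner) {
		__atomic_acquire_fence();	/* uncontended fast path */
		return;
	}

	/*
	 * Replaces the open-coded READ_ONCE() loop plus trailing fence:
	 * wait for owner to reach our ticket, with acquire semantics.
	 */
	smp_cond_load_acquire(&lock->tickets.owner,
			      VAL == lockval.tickets.next);
}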

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
From: Guo Ren @ 2021-03-24 12:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-riscv, Linux Kernel Mailing List, Guo Ren, Catalin Marinas,
	Will Deacon, Palmer Dabbelt, Anup Patel, Arnd Bergmann

Thx Peter,

On Wed, Mar 24, 2021 at 7:09 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Mar 24, 2021 at 10:14:52AM +0000, guoren@kernel.org wrote:
> > +static inline void arch_spin_lock(arch_spinlock_t *lock)
> > +{
> > +     arch_spinlock_t lockval;
> > +     u32 tmp;
> > +
> > +     asm volatile (
> > +             "1:     lr.w    %0, %2          \n"
> > +             "       mv      %1, %0          \n"
> > +             "       addw    %0, %0, %3      \n"
> > +             "       sc.w    %0, %0, %2      \n"
> > +             "       bnez    %0, 1b          \n"
> > +             : "=&r" (tmp), "=&r" (lockval), "+A" (lock->lock)
> > +             : "r" (1 << TICKET_NEXT)
> > +             : "memory");
> >
> > +     while (lockval.tickets.next != lockval.tickets.owner) {
> > +             /*
> > +              * FIXME - we need wfi/wfe here to prevent:
> > +              *  - cache line bouncing
> > +              *  - saving cpu pipeline in multi-harts-per-core
> > +              *    processor
> > +              */
> > +             lockval.tickets.owner = READ_ONCE(lock->tickets.owner);
> > +     }
> >
> > +     __atomic_acquire_fence();
> >  }
>
> > +static inline void arch_spin_unlock(arch_spinlock_t *lock)
> >  {
> > +     smp_store_release(&lock->tickets.owner, lock->tickets.owner + 1);
> > +     /* FIXME - we need ipi/sev here to notify above */
> >  }
>
> Urgh, are you saying your WFE requires an explicit SEV like on ARM? The
Yes, I'm considering that kind of code.

> ARM64 model is far superior IMO, and then you can use
> smp_cond_load_acquire() in arch_spin_lock() and call it a day.
Great tip, thx. I'll follow that.

-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
From: Peter Zijlstra @ 2021-03-24 12:23 UTC (permalink / raw)
  To: Vitaly Wool
  Cc: guoren, linux-riscv, LKML, Guo Ren, Catalin Marinas, Will Deacon,
	Palmer Dabbelt, Anup Patel, Arnd Bergmann

On Wed, Mar 24, 2021 at 12:15:47PM +0100, Vitaly Wool wrote:
> On Wed, Mar 24, 2021, 11:16 AM <guoren@kernel.org> wrote:
> 
> > From: Guo Ren <guoren@linux.alibaba.com>
> >
> > This patch introduces a ticket lock implementation for riscv, along the
> > same lines as the implementation for arch/arm & arch/csky.
> >
> 
> Could you please provide a rationale for this? Like, what is wrong with the
> current implementation?

Test-and-set spinlocks have terrible worst-case behaviour.
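
Concretely: a test-and-set lock is a bare atomic swap on one word, so
there is no queue and no ordering among waiters; under contention a hart
close to the contended cache line can keep re-winning while others
starve. A minimal sketch for contrast (plain C, illustrative names):

#include <stdint.h>

static void tas_lock(uint32_t *lock)
{
	/*
	 * Whichever waiter's swap lands first wins; nothing guarantees
	 * the longest-waiting hart ever wins.
	 */
	while (__atomic_exchange_n(lock, 1, __ATOMIC_ACQUIRE))
		;
}

static void tas_unlock(uint32_t *lock)
{
	__atomic_store_n(lock, 0, __ATOMIC_RELEASE);
}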

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
From: Guo Ren @ 2021-03-24 12:24 UTC (permalink / raw)
  To: Vitaly Wool
  Cc: linux-riscv, LKML, Guo Ren, Catalin Marinas, Will Deacon,
	Peter Zijlstra, Palmer Dabbelt, Anup Patel, Arnd Bergmann

On Wed, Mar 24, 2021 at 7:16 PM Vitaly Wool <vitaly.wool@konsulko.com> wrote:
>
>
>
> On Wed, Mar 24, 2021, 11:16 AM <guoren@kernel.org> wrote:
>>
>> From: Guo Ren <guoren@linux.alibaba.com>
>>
>> This patch introduces a ticket lock implementation for riscv, along the
>> same lines as the implementation for arch/arm & arch/csky.
>
>
> Could you please provide a rationale for this? Like, what is wrong with the current implementation?
The ticket-based spinlock principle is described here:
https://lwn.net/Articles/267968/

The current implementation causes cache line bouncing when many harts
are acquiring the same spinlock. I'm still looking for a better
solution; it may not fit within the current RISC-V base ISA.

I'll add more comments in the next version of the patch.
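
A hypothetical trace of the FIFO behaviour the article describes, with
three harts A, B, C hitting a free lock:

	initial state   owner = 0, next = 0   (unlocked)
	A locks         ticket 0, next -> 1   owner == 0, A enters
	B locks         ticket 1, next -> 2   B spins on owner
	C locks         ticket 2, next -> 3   C spins on owner
	A unlocks       owner -> 1            B (and only B) proceeds
	B unlocks       owner -> 2            C proceeds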

>
> Thanks in advance,
>
> Best regards,
>    Vitaly
>> [... rest of the patch quoted verbatim; snipped ...]



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
From: Anup Patel @ 2021-03-24 12:28 UTC (permalink / raw)
  To: Guo Ren
  Cc: linux-riscv, linux-kernel@vger.kernel.org List, Guo Ren,
	Catalin Marinas, Will Deacon, Peter Zijlstra, Palmer Dabbelt,
	Arnd Bergmann

On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote:
>
> From: Guo Ren <guoren@linux.alibaba.com>
>
> This patch introduces a ticket lock implementation for riscv, along the
> same lines as the implementation for arch/arm & arch/csky.
>
> Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will.deacon@arm.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Palmer Dabbelt <palmerdabbelt@google.com>
> Cc: Anup Patel <anup@brainfault.org>
> Cc: Arnd Bergmann <arnd@arndb.de>
> ---
>  arch/riscv/Kconfig                      |   1 +
>  arch/riscv/include/asm/Kbuild           |   1 +
>  arch/riscv/include/asm/spinlock.h       | 158 ++++++++++++--------------------
>  arch/riscv/include/asm/spinlock_types.h |  19 ++--

NACK from my side.

Linux ARM64 has moved away from ticket spinlocks to qspinlock.

We should go directly for qspinlock.

Regards,
Anup
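
For reference, wiring an architecture up to the generic qspinlock looks
much like the qrwlock hunks already in this patch. A rough, untested
sketch of what that might mean here (note that qspinlock's fast path
wants a cheap 16-bit xchg, which the RISC-V base ISA would have to
emulate with lr.w/sc.w):

--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ config RISCV
 	select ARCH_USE_QUEUED_RWLOCKS
+	select ARCH_USE_QUEUED_SPINLOCKS

--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
 generic-y += qrwlock.h
+generic-y += qspinlock.h

arch_spinlock_t would then come from asm-generic/qspinlock_types.h
instead of the ticket union.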

> [... rest of the patch quoted verbatim; snipped ...]

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-03-24 12:24     ` Guo Ren
@ 2021-03-24 12:31       ` Peter Zijlstra
  -1 siblings, 0 replies; 92+ messages in thread
From: Peter Zijlstra @ 2021-03-24 12:31 UTC (permalink / raw)
  To: Guo Ren
  Cc: Vitaly Wool, linux-riscv, LKML, Guo Ren, Catalin Marinas,
	Will Deacon, Palmer Dabbelt, Anup Patel, Arnd Bergmann

On Wed, Mar 24, 2021 at 08:24:34PM +0800, Guo Ren wrote:
> On Wed, Mar 24, 2021 at 7:16 PM Vitaly Wool <vitaly.wool@konsulko.com> wrote:
> >
> >
> >
> > On Wed, Mar 24, 2021, 11:16 AM <guoren@kernel.org> wrote:
> >>
> >> From: Guo Ren <guoren@linux.alibaba.com>
> >>
> >> This patch introduces a ticket lock implementation for riscv, along the
> >> same lines as the implementation for arch/arm & arch/csky.
> >
> >
> > Could you please provide a rationale for this? Like, what is wrong with the current implementation.
> Ticket based spinlock's principle is here:
> https://lwn.net/Articles/267968/
> 
> Current implementation will cause cache line bouncing when many harts
> are acquiring the same spinlock.
> I'm seeking a solution, maybe not fitting the current RISC-V base ISA.

Ticket locks as such don't solve the cacheline bouncing part, since
they're all still spinning on the same line. The big improvement ticket
locks bring is that the lock acquisition time becomes a function of the
longest hold time, instead of being unbounded.

However, combine it with the WFE (preferably the ARM64 variant) and you
can avoid the worst of the bouncing.

If you really want to get rid of the bouncing, go with qspinlock, which
will spin on a cpu local line. That said, qspinlock is quite gnarly
code, and only really wins from ticket when you have NUMA.
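
For reference, the whole ticket-lock idea fits in a few lines. Below is an
illustrative standalone sketch in C11 atomics -- not the patch's code, and
not the kernel's own primitives -- just to make the mechanism concrete:

#include <stdatomic.h>

typedef struct {
        atomic_uint next;       /* ticket dispenser */
        atomic_uint owner;      /* ticket currently being served */
} ticket_lock_t;

static void ticket_lock(ticket_lock_t *lock)
{
        /* Take a ticket; FIFO order bounds the wait by the sum of the
         * hold times ahead of us. */
        unsigned int ticket = atomic_fetch_add_explicit(&lock->next, 1,
                                                        memory_order_relaxed);

        /* All waiters still spin on the shared 'owner' word -- the cache
         * line bouncing discussed above. A WFE-like primitive would let
         * the hart sleep here instead of busy-polling. */
        while (atomic_load_explicit(&lock->owner, memory_order_acquire) != ticket)
                ;
}

static void ticket_unlock(ticket_lock_t *lock)
{
        /* Only the lock holder writes 'owner', so a plain read-modify-
         * write with a release store is sufficient here. */
        unsigned int owner = atomic_load_explicit(&lock->owner,
                                                  memory_order_relaxed);

        atomic_store_explicit(&lock->owner, owner + 1, memory_order_release);
}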

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-03-24 12:28   ` Anup Patel
@ 2021-03-24 12:37     ` Peter Zijlstra
  -1 siblings, 0 replies; 92+ messages in thread
From: Peter Zijlstra @ 2021-03-24 12:37 UTC (permalink / raw)
  To: Anup Patel
  Cc: Guo Ren, linux-riscv, linux-kernel@vger.kernel.org List, Guo Ren,
	Catalin Marinas, Will Deacon, Palmer Dabbelt, Arnd Bergmann

On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote:
> On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote:
> >
> > From: Guo Ren <guoren@linux.alibaba.com>
> >
> > This patch introduces a ticket lock implementation for riscv, along the
> > same lines as the implementation for arch/arm & arch/csky.
> >
> > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > Cc: Will Deacon <will.deacon@arm.com>
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Palmer Dabbelt <palmerdabbelt@google.com>
> > Cc: Anup Patel <anup@brainfault.org>
> > Cc: Arnd Bergmann <arnd@arndb.de>
> > ---
> >  arch/riscv/Kconfig                      |   1 +
> >  arch/riscv/include/asm/Kbuild           |   1 +
> >  arch/riscv/include/asm/spinlock.h       | 158 ++++++++++++--------------------
> >  arch/riscv/include/asm/spinlock_types.h |  19 ++--
> 
> NACK from myside.
> 
> Linux ARM64 has moved away from ticket spinlock to qspinlock.
> 
> We should directly go for qspinlock.

I think it is a sensible intermediate step, even if you want to go
qspinlock. Ticket locks are more or less trivial and get you fairness
and all that goodness without the mind bending complexity of qspinlock.

Once you have the ticket lock implementation solid (and qrwlock) and
everything, *then* start to carefully look at qspinlock.

Now, arguably arm64 did the heavy lifting of making qspinlock good on
weak architectures, but if you want to do it right, you still have to
analyze the whole thing for your own architecture.


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-03-24 12:37     ` Peter Zijlstra
@ 2021-03-24 12:53       ` Anup Patel
  -1 siblings, 0 replies; 92+ messages in thread
From: Anup Patel @ 2021-03-24 12:53 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Guo Ren, linux-riscv, linux-kernel@vger.kernel.org List, Guo Ren,
	Catalin Marinas, Will Deacon, Palmer Dabbelt, Arnd Bergmann

On Wed, Mar 24, 2021 at 6:08 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote:
> > On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote:
> > >
> > > From: Guo Ren <guoren@linux.alibaba.com>
> > >
> > > This patch introduces a ticket lock implementation for riscv, along the
> > > same lines as the implementation for arch/arm & arch/csky.
> > >
> > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > > Cc: Will Deacon <will.deacon@arm.com>
> > > Cc: Peter Zijlstra <peterz@infradead.org>
> > > Cc: Palmer Dabbelt <palmerdabbelt@google.com>
> > > Cc: Anup Patel <anup@brainfault.org>
> > > Cc: Arnd Bergmann <arnd@arndb.de>
> > > ---
> > >  arch/riscv/Kconfig                      |   1 +
> > >  arch/riscv/include/asm/Kbuild           |   1 +
> > >  arch/riscv/include/asm/spinlock.h       | 158 ++++++++++++--------------------
> > >  arch/riscv/include/asm/spinlock_types.h |  19 ++--
> >
> > NACK from myside.
> >
> > Linux ARM64 has moved away from ticket spinlock to qspinlock.
> >
> > We should directly go for qspinlock.
>
> I think it is a sensible intermediate step, even if you want to go
> qspinlock. Ticket locks are more or less trivial and get you fairness
> and all that goodness without the mind bending complexity of qspinlock.
>
> Once you have the ticket lock implementation solid (and qrwlock) and
> everything, *then* start to carefully look at qspinlock.

I do understand qspinlock is relatively complex, but the best thing
about qspinlock is that it tries to ensure each CPU spins on its own
location.
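
(That "own location" property is the MCS idea underneath qspinlock. As an
illustrative sketch only -- a simplified MCS queue lock in C11 atomics, not
qspinlock itself, which packs this state into a single 32-bit word:)

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct mcs_node {
        _Atomic(struct mcs_node *) next;
        atomic_bool locked;
};

struct mcs_lock {
        _Atomic(struct mcs_node *) tail;
};

static void mcs_lock_acquire(struct mcs_lock *lock, struct mcs_node *self)
{
        struct mcs_node *prev;

        atomic_store_explicit(&self->next, NULL, memory_order_relaxed);
        atomic_store_explicit(&self->locked, true, memory_order_relaxed);

        /* Atomically append ourselves to the waiter queue. */
        prev = atomic_exchange_explicit(&lock->tail, self, memory_order_acq_rel);
        if (!prev)
                return;         /* queue was empty: lock acquired */

        /* Link in behind our predecessor... */
        atomic_store_explicit(&prev->next, self, memory_order_release);

        /* ...and spin on our *own* node, not on a shared lock word. */
        while (atomic_load_explicit(&self->locked, memory_order_acquire))
                ;
}

static void mcs_lock_release(struct mcs_lock *lock, struct mcs_node *self)
{
        struct mcs_node *next = atomic_load_explicit(&self->next,
                                                     memory_order_acquire);

        if (!next) {
                struct mcs_node *expected = self;

                /* No successor visible: try to swing the tail back to empty. */
                if (atomic_compare_exchange_strong_explicit(&lock->tail,
                                &expected, NULL, memory_order_acq_rel,
                                memory_order_relaxed))
                        return;

                /* A successor is mid-enqueue; wait for it to link itself. */
                while (!(next = atomic_load_explicit(&self->next,
                                                     memory_order_acquire)))
                        ;
        }
        atomic_store_explicit(&next->locked, false, memory_order_release);
}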

Instead of adding ticket spinlocks now and later replacing them with
qspinlock, it is better to explore qspinlock straight away, hence my NACK.

>
> Now, arguably arm64 did the heavy lifting of making qspinlock good on
> weak architectures, but if you want to do it right, you still have to
> analyze the whole thing for your own architecture.

Most of the RISC-V implementations have weak memory ordering, so it
makes more sense to explore qspinlock first.

Regards,
Anup

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-03-24 12:53       ` Anup Patel
@ 2021-04-11 21:11         ` Palmer Dabbelt
  -1 siblings, 0 replies; 92+ messages in thread
From: Palmer Dabbelt @ 2021-04-11 21:11 UTC (permalink / raw)
  To: anup
  Cc: peterz, guoren, linux-riscv, linux-kernel, guoren,
	catalin.marinas, will.deacon, Arnd Bergmann

On Wed, 24 Mar 2021 05:53:51 PDT (-0700), anup@brainfault.org wrote:
> On Wed, Mar 24, 2021 at 6:08 PM Peter Zijlstra <peterz@infradead.org> wrote:
>>
>> On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote:
>> > On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote:
>> > >
>> > > From: Guo Ren <guoren@linux.alibaba.com>
>> > >
>> > > This patch introduces a ticket lock implementation for riscv, along the
>> > > same lines as the implementation for arch/arm & arch/csky.
>> > >
>> > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
>> > > Cc: Catalin Marinas <catalin.marinas@arm.com>
>> > > Cc: Will Deacon <will.deacon@arm.com>
>> > > Cc: Peter Zijlstra <peterz@infradead.org>
>> > > Cc: Palmer Dabbelt <palmerdabbelt@google.com>
>> > > Cc: Anup Patel <anup@brainfault.org>
>> > > Cc: Arnd Bergmann <arnd@arndb.de>
>> > > ---
>> > >  arch/riscv/Kconfig                      |   1 +
>> > >  arch/riscv/include/asm/Kbuild           |   1 +
>> > >  arch/riscv/include/asm/spinlock.h       | 158 ++++++++++++--------------------
>> > >  arch/riscv/include/asm/spinlock_types.h |  19 ++--
>> >
>> > NACK from myside.
>> >
>> > Linux ARM64 has moved away from ticket spinlock to qspinlock.
>> >
>> > We should directly go for qspinlock.
>>
>> I think it is a sensible intermediate step, even if you want to go
>> qspinlock. Ticket locks are more or less trivial and get you fairness
>> and all that goodness without the mind bending complexity of qspinlock.
>>
>> Once you have the ticket lock implementation solid (and qrwlock) and
>> everything, *then* start to carefully look at qspinlock.
>
> I do understand qspinlock are relatively complex but the best thing
> about qspinlock is it tries to ensure each CPU spins on it's own location.
>
> Instead of adding ticket spinlock now and later replacing it with qspinlock,
> it is better to straight away explore qspinlock hence my NACK.
>
>>
>> Now, arguably arm64 did the heavy lifting of making qspinlock good on
>> weak architectures, but if you want to do it right, you still have to
>> analyze the whole thing for your own architecture.
>
> Most of the RISC-V implementations are weak memory ordering so it
> makes more sense to explore qspinlock first.

I know I'm somewhat late to the party here.  I talked with Will (and 
to a lesser extent Peter) about this a week or two ago and it seems the 
best way to go here is to start with ticket locks.  They're simpler, and 
in Arm land they performed better until we got to the larger systems.  
Given that we don't have any high performance implementations of the 
RISC-V memory model (and likely won't any time soon) it's hard to reason 
about the performance of anything like this, but at a bare minimum 
having fair locks is a pretty big positive and ticket locks should have 
very little overhead while providing fairness.

IMO the decision between ticket and queueing locks is really more of a 
property of the hardware/workload than the ISA, though there are of 
course some pretty deep ISA dependencies than can make one saner than 
the other.  It seems best to me to just allow users to pick their own 
flavor of locks, and at least PPC is already doing that.  I threw 
together a quick asm-generic ticket lock that can be selected at compile 
time, but I want to spend some more time playing with the other 
architectures before sending anything out.

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-11 21:11         ` Palmer Dabbelt
@ 2021-04-12 13:32           ` Christoph Müllner
  -1 siblings, 0 replies; 92+ messages in thread
From: Christoph Müllner @ 2021-04-12 13:32 UTC (permalink / raw)
  To: Palmer Dabbelt
  Cc: Anup Patel, Peter Zijlstra, Guo Ren, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, catalin.marinas, will.deacon,
	Arnd Bergmann

On Sun, Apr 11, 2021 at 11:11 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> On Wed, 24 Mar 2021 05:53:51 PDT (-0700), anup@brainfault.org wrote:
> > On Wed, Mar 24, 2021 at 6:08 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >>
> >> On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote:
> >> > On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote:
> >> > >
> >> > > From: Guo Ren <guoren@linux.alibaba.com>
> >> > >
> >> > > This patch introduces a ticket lock implementation for riscv, along the
> >> > > same lines as the implementation for arch/arm & arch/csky.
> >> > >
> >> > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> >> > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> >> > > Cc: Will Deacon <will.deacon@arm.com>
> >> > > Cc: Peter Zijlstra <peterz@infradead.org>
> >> > > Cc: Palmer Dabbelt <palmerdabbelt@google.com>
> >> > > Cc: Anup Patel <anup@brainfault.org>
> >> > > Cc: Arnd Bergmann <arnd@arndb.de>
> >> > > ---
> >> > >  arch/riscv/Kconfig                      |   1 +
> >> > >  arch/riscv/include/asm/Kbuild           |   1 +
> >> > >  arch/riscv/include/asm/spinlock.h       | 158 ++++++++++++--------------------
> >> > >  arch/riscv/include/asm/spinlock_types.h |  19 ++--
> >> >
> >> > NACK from myside.
> >> >
> >> > Linux ARM64 has moved away from ticket spinlock to qspinlock.
> >> >
> >> > We should directly go for qspinlock.
> >>
> >> I think it is a sensible intermediate step, even if you want to go
> >> qspinlock. Ticket locks are more or less trivial and get you fairness
> >> and all that goodness without the mind bending complexity of qspinlock.
> >>
> >> Once you have the ticket lock implementation solid (and qrwlock) and
> >> everything, *then* start to carefully look at qspinlock.
> >
> > I do understand qspinlock are relatively complex but the best thing
> > about qspinlock is it tries to ensure each CPU spins on it's own location.
> >
> > Instead of adding ticket spinlock now and later replacing it with qspinlock,
> > it is better to straight away explore qspinlock hence my NACK.
> >
> >>
> >> Now, arguably arm64 did the heavy lifting of making qspinlock good on
> >> weak architectures, but if you want to do it right, you still have to
> >> analyze the whole thing for your own architecture.
> >
> > Most of the RISC-V implementations are weak memory ordering so it
> > makes more sense to explore qspinlock first.
>
> I know I'm somewhat late to the party here.  I talked with Will (and
> to a lesser extent Peter) about this a week or two ago and it seems the
> best way to go here is to start with ticket locks.  They're simpler, and
> in Arm land they performed better until we got to the larger systems.
> Given that we don't have any high performance implementations of the
> RISC-V memory model (and likely won't any time soon) it's hard to reason
> about the performance of anything like this, but at a bare minimum
> having fair locks is a pretty big positive and ticket locks should have
> very little overhead while providing fairness.
>
> IMO the decision between ticket and queueing locks is really more of a
> property of the hardware/workload than the ISA, though there are of
> course some pretty deep ISA dependencies than can make one saner than
> the other.  It seems best to me to just allow users to pick their own
> flavor of locks, and at least PPC is already doing that.  I threw
> together a quick asm-generic ticket lock that can be selected at compile
> time, but I want to spend some more time playing with the other
> architectures before sending anything out.

This discussion came up again a few weeks ago because I've been stumbling
over the test-and-set implementation and was wondering if nobody had cared
to improve that yet. Then I saw that there have been a few attempts so far,
but they did not land. So I brought this up in RVI's platform group meeting,
and the attendees showed big interest in getting at least fairness. I assume
Guo sent out his new patchset as a reaction to this call (1 or 2 days
later).

We have the same situation on OpenSBI, where we've agreed (with Anup) to go
for a ticket lock implementation. A series for that can be found here (the
implementation was tested in the kernel):
http://lists.infradead.org/pipermail/opensbi/2021-April/000789.html

In the mentioned RVI call, I asked the question whether ticket locks or MCF
locks are preferred. And the feedback was to go for qspinlock/qrwlock. One
good argument to do so would be to not have to maintain an RV-specific
implementation, but to use a well-tested in-kernel mechanism.

The feedback in the call is also aligned with the previous attempts to
enable MCF-locks on RV. However, the kernel's implementation requires
sub-word atomics. And here is where we are. The discussion last week was
about changing the generic kernel code to loosen its requirements (not
accepted because of performance regressions on e.g. x86) and whether RV
can actually provide sub-word atomics in the form of LL/SC loops (yes,
that's possible). Providing a sub-word xchg() can be done within a couple
of hours (some solutions are already on the list), but that was not enough
from the initial patchset from Michael on (e.g. Christoph Hellwig asked
back then for moving arch-independent parts into lib, which is a good idea
given other archs do the same). So I expect this might require some more
time until there is a solution that's accepted by a broad range of
maintainers.

I've been working on a new MCF-lock series last week. It is working, but I
did not publish it yet, because I wanted to go through the 130+ emails on
the riscv-linux list, check for overlooked review comments, and validate
the patch authors. You can find the current state here:
https://github.com/cmuellner/linux/pull/new/riscv-spinlocks
So, if you insist on ticket locks, then let's coordinate who will do that
and how it will be tested (RV32/RV64, qemu vs real hw).

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-12 13:32           ` Christoph Müllner
@ 2021-04-12 14:51             ` Peter Zijlstra
  -1 siblings, 0 replies; 92+ messages in thread
From: Peter Zijlstra @ 2021-04-12 14:51 UTC (permalink / raw)
  To: Christoph Müllner
  Cc: Palmer Dabbelt, Anup Patel, Guo Ren, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, catalin.marinas, will.deacon,
	Arnd Bergmann


Please fix your mailer to properly flow text. Reflowed it for you.

On Mon, Apr 12, 2021 at 03:32:27PM +0200, Christoph Müllner wrote:

> This discussion came up again a few weeks ago because I've been
> stumbling over the test-and-set implementation and was wondering if
> nobody cared to improve that yet.

> Then I saw, that there have been a few attempts so far, but they did
> not land.  So I brought this up in RVI's platform group meeting and
> the attendees showed big interest to get at least fairness. I assume
> Guo sent out his new patchset as a reaction to this call (1 or 2 days
> later).
> 
> We have the same situation on OpenSBI, where we've agreed (with Anup)
> to go for a ticket lock implementation.  A series for that can be
> found here (the implementation was tested in the kernel):
> http://lists.infradead.org/pipermail/opensbi/2021-April/000789.html
> 
> In the mentioned RVI call, I've asked the question if ticket locks or
> MCF locks are preferred.  And the feedback was to go for
> qspinlock/qrwlock. One good argument to do so would be, to not have to
> maintain an RV-specific implementation, but to use a well-tested
> in-kernel mechanism.

qrwlock does not depend on qspinlock; any fair spinlock implementation
works, including ticket.

> The feedback in the call is also aligned with the previous attempts to
> enable MCF-locks on RV.  However, the kernel's implementation requires
> sub-word atomics. And here is, where we are.  The discussion last week
> was about changing the generic kernel code to loosen its requirements
> (not accepted because of performance regressions on e.g. x86) and if
> RV actually can provide sub-word atomics in form of LL/SC loops (yes,
> that's possible).

So qspinlock is a complex and fickle beast. Yes it works on x86 and
arm64 (Will and Catalin put a _lot_ of effort into that), but IMO using
such a complex thing needs to be provably better than the trivial and
simple thing (tickets, test-and-set).

Part of that is fwd progress, if you don't have that, stay with
test-and-set. Will fixed a number of issues there, and -RT actually hit
at least one.

Debugging non-deterministic lock behaviour is not on the fun list.
Having something simple (ticket) to fall back to to prove it's your lock
going 'funneh' is very useful.

> Providing sub-word xchg() can be done within a couple of hours (some
> solutions are already on the list), but that was not enough from the
> initial patchset from Michael on (e.g. Christoph Hellwig asked back
> then for moving arch-independent parts into lib, which is a good idea
> given other archs do the same).  So I expect this might require some
> more time until there is a solution, that's accepted by a broad range
> of maintainers.

Using a lib/ cmpxchg based xchg16 is daft. Per the very same arguments I
made elsewhere in the thread. cmpxchg() based loops have very difficult
fwd progress guarantees, esp. so on LL/SC architectures.

What I would do is create a common inline helper to compute that {addr,
mask, val} setup with a comment on how to use it.

(As is, we have at least 2 different ways of dealing with ENDIAN-ness)
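
Something along these lines, as an illustrative sketch (little-endian u16
case; a big-endian variant flips the shift, which is exactly the
ENDIAN-ness wart above):

#include <stdint.h>

struct subword {
        uint32_t *addr; /* aligned 32-bit word containing the u16 */
        uint32_t mask;  /* bits of that word belonging to the u16 */
        uint32_t val;   /* new value, shifted into position */
};

/*
 * Compute the {addr, mask, val} triple for a 16-bit exchange built on
 * 32-bit LL/SC: the loop LRs the word at ->addr, clears ->mask, ORs in
 * ->val and SCs it back, retrying on SC failure.
 */
static inline struct subword subword_prep16(uint16_t *ptr, uint16_t newval)
{
        uintptr_t a = (uintptr_t)ptr;
        unsigned int shift = (a & 2) * 8;       /* little-endian: 0 or 16 */

        return (struct subword){
                .addr = (uint32_t *)(a & ~(uintptr_t)3),
                .mask = (uint32_t)0xffff << shift,
                .val  = (uint32_t)newval << shift,
        };
}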

> I've been working on a new MCF-lock series last week.  It is working,
> but I did not publish it yet, because I wanted to go through the 130+
> emails on the riscv-linux list and check for overseen review comments
> and validate the patch authors.

> You can find the current state here:
> https://github.com/cmuellner/linux/pull/new/riscv-spinlocks 

That's not public. But if that's not qspinlock, how are you justifying a
complex spinlock implementation? Does it perform better than ticket?

> So, if you insist on ticket locks, then let's coordinate who will do
> that and how it will be tested (RV32/RV64, qemu vs real hw).

Real hardware is all that counts :-)


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-12 13:32           ` Christoph Müllner
@ 2021-04-12 17:33             ` Palmer Dabbelt
  -1 siblings, 0 replies; 92+ messages in thread
From: Palmer Dabbelt @ 2021-04-12 17:33 UTC (permalink / raw)
  To: christophm30
  Cc: anup, peterz, guoren, linux-riscv, linux-kernel, guoren,
	catalin.marinas, will.deacon, Arnd Bergmann

On Mon, 12 Apr 2021 06:32:27 PDT (-0700), christophm30@gmail.com wrote:
> On Sun, Apr 11, 2021 at 11:11 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>>
>> On Wed, 24 Mar 2021 05:53:51 PDT (-0700), anup@brainfault.org wrote:
>> > On Wed, Mar 24, 2021 at 6:08 PM Peter Zijlstra <peterz@infradead.org> wrote:
>> >>
>> >> On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote:
>> >> > On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote:
>> >> > >
>> >> > > From: Guo Ren <guoren@linux.alibaba.com>
>> >> > >
>> >> > > This patch introduces a ticket lock implementation for riscv, along the
>> >> > > same lines as the implementation for arch/arm & arch/csky.
>> >> > >
>> >> > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
>> >> > > Cc: Catalin Marinas <catalin.marinas@arm.com>
>> >> > > Cc: Will Deacon <will.deacon@arm.com>
>> >> > > Cc: Peter Zijlstra <peterz@infradead.org>
>> >> > > Cc: Palmer Dabbelt <palmerdabbelt@google.com>
>> >> > > Cc: Anup Patel <anup@brainfault.org>
>> >> > > Cc: Arnd Bergmann <arnd@arndb.de>
>> >> > > ---
>> >> > >  arch/riscv/Kconfig                      |   1 +
>> >> > >  arch/riscv/include/asm/Kbuild           |   1 +
>> >> > >  arch/riscv/include/asm/spinlock.h       | 158 ++++++++++++--------------------
>> >> > >  arch/riscv/include/asm/spinlock_types.h |  19 ++--
>> >> >
>> >> > NACK from myside.
>> >> >
>> >> > Linux ARM64 has moved away from ticket spinlock to qspinlock.
>> >> >
>> >> > We should directly go for qspinlock.
>> >>
>> >> I think it is a sensible intermediate step, even if you want to go
>> >> qspinlock. Ticket locks are more or less trivial and get you fairness
>> >> and all that goodness without the mind bending complexity of qspinlock.
>> >>
>> >> Once you have the ticket lock implementation solid (and qrwlock) and
>> >> everything, *then* start to carefully look at qspinlock.
>> >
>> > I do understand qspinlock are relatively complex but the best thing
>> > about qspinlock is it tries to ensure each CPU spins on it's own location.
>> >
>> > Instead of adding ticket spinlock now and later replacing it with qspinlock,
>> > it is better to straight away explore qspinlock hence my NACK.
>> >
>> >>
>> >> Now, arguably arm64 did the heavy lifting of making qspinlock good on
>> >> weak architectures, but if you want to do it right, you still have to
>> >> analyze the whole thing for your own architecture.
>> >
>> > Most of the RISC-V implementations are weak memory ordering so it
>> > makes more sense to explore qspinlock first.
>>
>> I know I'm somewhat late to the party here.  I talked with Will (and
>> to a lesser extent Peter) about this a week or two ago and it seems the
>> best way to go here is to start with ticket locks.  They're simpler, and
>> in Arm land they performed better until we got to the larger systems.
>> Given that we don't have any high performance implementations of the
>> RISC-V memory model (and likely won't any time soon) it's hard to reason
>> about the performance of anything like this, but at a bare minimum
>> having fair locks is a pretty big positive and ticket locks should have
>> very little overhead while providing fairness.
>>
>> IMO the decision between ticket and queueing locks is really more of a
>> property of the hardware/workload than the ISA, though there are of
>> course some pretty deep ISA dependencies than can make one saner than
>> the other.  It seems best to me to just allow users to pick their own
>> flavor of locks, and at least PPC is already doing that.  I threw
>> together a quick asm-generic ticket lock that can be selected at compile
>> time, but I want to spend some more time playing with the other
>> architectures before sending anything out.
>
> This discussion came up again a few weeks ago because I've been stumbling
> over the test-and-set implementation and was wondering if nobody cared to
> improve that yet. Then I saw that there have been a few attempts so far,
> but they did not land. So I brought this up in RVI's platform group
> meeting and the attendees showed big interest in getting at least
> fairness. I assume Guo sent out his new patchset as a reaction to this
> call (1 or 2 days later).
>
> We have the same situation on OpenSBI, where we've agreed (with Anup)
> to go for a ticket lock implementation.
> A series for that can be found here (the implementation was tested in
> the kernel):
> http://lists.infradead.org/pipermail/opensbi/2021-April/000789.html
>
> In the mentioned RVI call, I asked whether ticket locks or MCF locks are
> preferred, and the feedback was to go for qspinlock/qrwlock. One good
> argument to do so would be to not have to maintain an RV-specific
> implementation, but to use a well-tested in-kernel mechanism.
>
> The feedback in the call is also aligned with the previous attempts to
> enable MCF-locks on RV. However, the kernel's implementation requires
> sub-word atomics. And that is where we are. The discussion last week was
> about changing the generic kernel code to loosen its requirements (not
> accepted because of performance regressions on e.g. x86) and if RV
> actually can provide sub-word atomics in the form of LL/SC loops (yes,
> that's possible). Providing sub-word xchg() can be done within a couple
> of hours (some solutions are already on the list), but that was not
> enough for the initial patchset from Michael onwards (e.g. Christoph
> Hellwig asked back then for moving arch-independent parts into lib,
> which is a good idea given other archs do the same). So I expect this
> might require some more time until there is a solution that's accepted
> by a broad range of maintainers.
>
> I've been working on a new MCF-lock series last week. It is working,
> but I did not publish it yet, because I wanted to go through the 130+
> emails on the riscv-linux list and check for overlooked review comments
> and validate the patch authors.
> You can find the current state here:
> https://github.com/cmuellner/linux/pull/new/riscv-spinlocks
> So, if you insist on ticket locks, then let's coordinate who will do
> that and how it will be tested (RV32/RV64, qemu vs real hw).

My plan is to add a generic ticket-based lock, which can be selected at 
compile time.  It'll have no architecture dependencies (though it'll 
likely have some hooks for architectures that can make this go faster).  
Users can then just pick which spinlock flavor they want, with the idea 
being that smaller systems will perform better with ticket locks and 
larger systems will perform better with queued locks.  The main goal 
here is to give the less widely used architectures an easy way to have 
fair locks, as right now we've got a lot of code duplication because any 
architecture that wants ticket locks has to do it themselves.

I'm not really convinced we have the ability to discuss the performance 
of locks on RISC-V right now, at least in any meaningful capacity.  The 
set of systems we have right now are just so far off from having a 
competitive memory system that optimizing for them doesn't seem to be 
worth the time, and as there really aren't any users we don't know what 
workloads people care about.  I'm mostly interested in just keeping the 
implementation simple, and ticket locks are the simplest spinlock flavor 
I know of that's fair (I think we can all agree that unfair locks are 
going to be a disaster).

There are certainly classes of systems for which ticket locks will be 
the wrong choice, and for those it makes sense to use the generic 
qspinlock implementation.  We'll likely need some changes to make that 
go fast on RISC-V, but we won't know what those are until we have 
hardware.  For now just having something that works (while staying away 
from anything that's obviously horribly inefficient) is as good as we 
can do, so I'm perfectly happy to take whatever we need to enable 
qspinlock on RISC-V.

I'll likely default to ticket locks on RISC-V as they're simpler, but 
my main goal is to just get rid of the code duplication.  IMO the 
correct lock flavor is really something that's tightly coupled to both 
the microarchitecture and workload, but given how poorly the current 
crop of implementations performs on anything parallel I'm more swayed by 
simplicity.  When we end up with real hardware I'll be happy to 
re-evaluate that choice, but I don't think it's all that important today 
because we're going to need a whole new generation of implementations 
(and likely at least some ISA clarifications, as whether anything based 
on cmpxchg can be fair right now is still an open question) before we 
have anything competitive.

If you guys want to go throw a "thou shalt make queued spinlocks better 
than ticket spinlocks" (or the other way around, I don't really 
personally care all that much about spinlock flavors) in the platform 
spec that's fine.  Performance knobs are always a headache so getting 
rid of one would be great, but ultimately this is a microarchitecture 
thing so we'll have to see what gets implemented.

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-12 14:51             ` Peter Zijlstra
@ 2021-04-12 21:21               ` Christoph Müllner
  -1 siblings, 0 replies; 92+ messages in thread
From: Christoph Müllner @ 2021-04-12 21:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Palmer Dabbelt, Anup Patel, Guo Ren, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, catalin.marinas, will.deacon,
	Arnd Bergmann

On Mon, Apr 12, 2021 at 4:52 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
>
> Please fix your mailer to properly flow text. Reflowed it for you.
>
> On Mon, Apr 12, 2021 at 03:32:27PM +0200, Christoph Müllner wrote:
>
> > This discussion came up again a few weeks ago because I've been
> > stumbling over the test-and-set implementation and was wondering if
> > nobody cared to improve that yet.
>
> > Then I saw that there have been a few attempts so far, but they did
> > not land.  So I brought this up in RVI's platform group meeting and
> > the attendees showed big interest in getting at least fairness. I assume
> > Guo sent out his new patchset as a reaction to this call (1 or 2 days
> > later).
> >
> > We have the same situation on OpenSBI, where we've agreed (with Anup)
> > to go for a ticket lock implementation.  A series for that can be
> > found here (the implementation was tested in the kernel):
> > http://lists.infradead.org/pipermail/opensbi/2021-April/000789.html
> >
> > In the mentioned RVI call, I asked whether ticket locks or MCF locks
> > are preferred.  And the feedback was to go for qspinlock/qrwlock. One
> > good argument to do so would be to not have to maintain an RV-specific
> > implementation, but to use a well-tested in-kernel mechanism.
>
> qrwlock does not depend on qspinlock; any fair spinlock implementation
> works, including ticket.
>
> > The feedback in the call is also aligned with the previous attempts to
> > enable MCF-locks on RV.  However, the kernel's implementation requires
> > sub-word atomics. And that is where we are.  The discussion last week
> > was about changing the generic kernel code to loosen its requirements
> > (not accepted because of performance regressions on e.g. x86) and if
> > RV actually can provide sub-word atomics in the form of LL/SC loops (yes,
> > that's possible).
>
> So qspinlock is a complex and fickle beast. Yes it works on x86 and
> arm64 (Will and Catalin put a _lot_ of effort into that), but IMO using
> such a complex thing needs to be provably better than the trivial and
> simple thing (tickets, test-and-set).
>
> Part of that is fwd progress: if you don't have that, stay with
> test-and-set. Will fixed a number of issues there, and -RT actually hit
> at least one.
>
> Debugging non-deterministic lock behaviour is not on the fun list.
> Having something simple (ticket) to fall back to to prove it's your lock
> going 'funneh' is very useful.
>
> > Providing sub-word xchg() can be done within a couple of hours (some
> > solutions are already on the list), but that was not enough for the
> > initial patchset from Michael onwards (e.g. Christoph Hellwig asked back
> > then for moving arch-independent parts into lib, which is a good idea
> > given other archs do the same).  So I expect this might require some
> > more time until there is a solution that's accepted by a broad range
> > of maintainers.
>
> Using a lib/ cmpxchg based xchg16 is daft, per the very same arguments I
> made elsewhere in the thread. cmpxchg() based loops have very difficult
> fwd progress guarantees, esp. so on LL/SC architectures.
>
> What I would do is create a common inline helper to compute that {addr,
> mask, val} setup with a comment on how to use it.
>
> (As is, we have at least 2 different ways of dealing with ENDIAN-ness)

Well, that's what I have here:
https://github.com/cmuellner/linux/commit/9d2f6449dd848b5723326ae8be34a3d2d41dcff1
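
Conceptually, and glossing over the details of that commit, such a
sub-word xchg() boils down to something like the following rough sketch
(little-endian layout assumed; illustrative only, not the exact code
from the branch):

static inline u16 xchg16(volatile u16 *ptr, u16 new)
{
	/* the aligned 32-bit word that contains our halfword */
	volatile u32 *aligned = (volatile u32 *)((unsigned long)ptr & ~0x3UL);
	/* bit offset of the halfword within that word (little-endian) */
	unsigned int shift = ((unsigned long)ptr & 0x3) * 8;
	u32 mask = 0xffffU << shift;
	u32 newval = (u32)new << shift;
	u32 old, tmp;

	__asm__ __volatile__ (
		"0:	lr.w	%0, %2\n"
		"	and	%1, %0, %3\n"	/* clear the old halfword */
		"	or	%1, %1, %4\n"	/* merge in the new one */
		"	sc.w	%1, %1, %2\n"
		"	bnez	%1, 0b\n"
		: "=&r" (old), "=&r" (tmp), "+A" (*aligned)
		: "r" (~mask), "r" (newval)
		: "memory");

	return (u16)(old >> shift);
}

The {addr, mask, val} computation at the top is exactly the part your
common inline helper would factor out.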

> > I've been working on a new MCF-lock series last week.  It is working,
> > but I did not publish it yet, because I wanted to go through the 130+
> > emails on the riscv-linux list and check for overlooked review comments
> > and validate the patch authors.
>
> > You can find the current state here:
> > https://github.com/cmuellner/linux/pull/new/riscv-spinlocks
>
> That's not public. But if that's not qspinlock, how are you justifying a
> complex spinlock implementation? Does it perform better than ticket?

I pasted the wrong link. Here is a working one:
https://github.com/cmuellner/linux/tree/riscv-spinlocks
It is basically Guo's v4 with mask-and-set within an LL/SC loop,
commits split up, non-riscv commits dropped, and commit messages rewritten.

I fully understand your reservations about using MCF locks.
I also agree that debugging locking issues is no fun.

FWIW:
I've debugged quite a few embedded Linux boards over the last few years
where essential basic building blocks showed unreliable/erratic behavior
(caused e.g. by an unstable voltage supply), and every attempt to monitor
the bug caused it to disappear.

Ticket locks are also fine with me. Still, it would be nice to get the
16-bit xchg() merged, so advanced users can try qspinlocks without much
effort. Otherwise, we just suspend the discussion now and restart it from
the beginning in a year (as is happening right now).

> > So, if you insist on ticket locks, then let's coordinate who will do
> > that and how it will be tested (RV32/RV64, qemu vs real hw).
>
> Real hardware is all that counts :-)

Fully agree; that's why I wrote that.

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-12 17:33             ` Palmer Dabbelt
@ 2021-04-12 21:54               ` Christoph Müllner
  -1 siblings, 0 replies; 92+ messages in thread
From: Christoph Müllner @ 2021-04-12 21:54 UTC (permalink / raw)
  To: Palmer Dabbelt
  Cc: Anup Patel, Peter Zijlstra, Guo Ren, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, catalin.marinas, will.deacon,
	Arnd Bergmann

On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> On Mon, 12 Apr 2021 06:32:27 PDT (-0700), christophm30@gmail.com wrote:
> > On Sun, Apr 11, 2021 at 11:11 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
> >>
> >> On Wed, 24 Mar 2021 05:53:51 PDT (-0700), anup@brainfault.org wrote:
> >> > On Wed, Mar 24, 2021 at 6:08 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >> >>
> >> >> On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote:
> >> >> > On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote:
> >> >> > >
> >> >> > > From: Guo Ren <guoren@linux.alibaba.com>
> >> >> > >
> >> >> > > This patch introduces a ticket lock implementation for riscv, along the
> >> >> > > same lines as the implementation for arch/arm & arch/csky.
> >> >> > >
> >> >> > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> >> >> > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> >> >> > > Cc: Will Deacon <will.deacon@arm.com>
> >> >> > > Cc: Peter Zijlstra <peterz@infradead.org>
> >> >> > > Cc: Palmer Dabbelt <palmerdabbelt@google.com>
> >> >> > > Cc: Anup Patel <anup@brainfault.org>
> >> >> > > Cc: Arnd Bergmann <arnd@arndb.de>
> >> >> > > ---
> >> >> > >  arch/riscv/Kconfig                      |   1 +
> >> >> > >  arch/riscv/include/asm/Kbuild           |   1 +
> >> >> > >  arch/riscv/include/asm/spinlock.h       | 158 ++++++++++++--------------------
> >> >> > >  arch/riscv/include/asm/spinlock_types.h |  19 ++--
> >> >> >
> >> >> > NACK from my side.
> >> >> >
> >> >> > Linux ARM64 has moved away from ticket spinlocks to qspinlock.
> >> >> >
> >> >> > We should directly go for qspinlock.
> >> >>
> >> >> I think it is a sensible intermediate step, even if you want to go
> >> >> qspinlock. Ticket locks are more or less trivial and get you fairness
> >> >> and all that goodness without the mind bending complexity of qspinlock.
> >> >>
> >> >> Once you have the ticket lock implementation solid (and qrwlock) and
> >> >> everything, *then* start to carefully look at qspinlock.
> >> >
> >> > I do understand qspinlocks are relatively complex, but the best thing
> >> > about qspinlock is that it tries to ensure each CPU spins on its own
> >> > location.
> >> >
> >> > Instead of adding ticket spinlock now and later replacing it with qspinlock,
> >> > it is better to straight away explore qspinlock, hence my NACK.
> >> >
> >> >>
> >> >> Now, arguably arm64 did the heavy lifting of making qspinlock good on
> >> >> weak architectures, but if you want to do it right, you still have to
> >> >> analyze the whole thing for your own architecture.
> >> >
> >> > Most RISC-V implementations use weak memory ordering, so it makes
> >> > more sense to explore qspinlock first.
> >>
> >> I know I'm somewhat late to the party here.  I talked with Will (and
> >> to a lesser extent Peter) about this a week or two ago and it seems the
> >> best way to go here is to start with ticket locks.  They're simpler, and
> >> in Arm land they performed better until we got to the larger systems.
> >> Given that we don't have any high performance implementations of the
> >> RISC-V memory model (and likely won't any time soon) it's hard to reason
> >> about the performance of anything like this, but at a bare minimum
> >> having fair locks is a pretty big positive and ticket locks should have
> >> very little overhead while providing fairness.
> >>
> >> IMO the decision between ticket and queueing locks is really more of a
> >> property of the hardware/workload than the ISA, though there are of
> >> course some pretty deep ISA dependencies that can make one saner than
> >> the other.  It seems best to me to just allow users to pick their own
> >> flavor of locks, and at least PPC is already doing that.  I threw
> >> together a quick asm-generic ticket lock that can be selected at compile
> >> time, but I want to spend some more time playing with the other
> >> architectures before sending anything out.
> >
> > This discussion came up again a few weeks ago because I've been stumbling
> > over the test-and-set implementation and was wondering if nobody cared to
> > improve that yet. Then I saw that there have been a few attempts so far,
> > but they did not land. So I brought this up in RVI's platform group
> > meeting and the attendees showed big interest in getting at least
> > fairness. I assume Guo sent out his new patchset as a reaction to this
> > call (1 or 2 days later).
> >
> > We have the same situation on OpenSBI, where we've agreed (with Anup)
> > to go for a ticket lock implementation.
> > A series for that can be found here (the implementation was tested in
> > the kernel):
> > http://lists.infradead.org/pipermail/opensbi/2021-April/000789.html
> >
> > In the mentioned RVI call, I asked whether ticket locks or MCF locks are
> > preferred, and the feedback was to go for qspinlock/qrwlock. One good
> > argument to do so would be to not have to maintain an RV-specific
> > implementation, but to use a well-tested in-kernel mechanism.
> >
> > The feedback in the call is also aligned with the previous attempts to
> > enable MCF-locks on RV. However, the kernel's implementation requires
> > sub-word atomics. And that is where we are. The discussion last week was
> > about changing the generic kernel code to loosen its requirements (not
> > accepted because of performance regressions on e.g. x86) and if RV
> > actually can provide sub-word atomics in the form of LL/SC loops (yes,
> > that's possible). Providing sub-word xchg() can be done within a couple
> > of hours (some solutions are already on the list), but that was not
> > enough for the initial patchset from Michael onwards (e.g. Christoph
> > Hellwig asked back then for moving arch-independent parts into lib,
> > which is a good idea given other archs do the same). So I expect this
> > might require some more time until there is a solution that's accepted
> > by a broad range of maintainers.
> >
> > I've been working on a new MCF-lock series last week. It is working,
> > but I did not publish it yet, because I wanted to go through the 130+
> > emails on the riscv-linux list and check for overlooked review comments
> > and validate the patch authors.
> > You can find the current state here:
> > https://github.com/cmuellner/linux/pull/new/riscv-spinlocks
> > So, if you insist on ticket locks, then let's coordinate who will do
> > that and how it will be tested (RV32/RV64, qemu vs real hw).
>
> My plan is to add a generic ticket-based lock, which can be selected at
> compile time.  It'll have no architecture dependencies (though it'll
> likely have some hooks for architectures that can make this go faster).
> Users can then just pick which spinlock flavor they want, with the idea
> being that smaller systems will perform better with ticket locks and
> larger systems will perform better with queued locks.  The main goal
> here is to give the less widely used architectures an easy way to have
> fair locks, as right now we've got a lot of code duplication because any
> architecture that wants ticket locks has to do it themselves.

In the case of LL/SC sequences, we have a maximum of 16 instructions
on RISC-V. My concern with a pure-C implementation would be that
we cannot guarantee this (e.g. somebody wants to compile with -O0),
and I don't know of a way to abort the build if this limit is exceeded.
Therefore I preferred inline assembly for OpenSBI (my initial idea
was to use closure-like LL/SC macros, where you can write the loop
in the form of C code).
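
To illustrate what I mean (and the pitfall), a hypothetical closure-like
interface could look roughly like this (all names made up, and note that
nothing stops the compiler from placing arbitrary code between the LR
and the SC, which is exactly the property we cannot guarantee from C):

static inline u32 __lr_w(volatile u32 *p)
{
	u32 v;

	__asm__ __volatile__ ("lr.w %0, %1" : "=r" (v) : "A" (*p) : "memory");
	return v;
}

static inline u32 __sc_w(volatile u32 *p, u32 v)
{
	u32 rc;

	__asm__ __volatile__ ("sc.w %0, %2, %1"
			      : "=&r" (rc), "+A" (*p)
			      : "r" (v)
			      : "memory");
	return rc;	/* non-zero on failure */
}

/* retry new_expr (an expression over old) until the SC succeeds */
#define llsc_loop_w(ptr, old, new_expr)		\
do {						\
	do {					\
		(old) = __lr_w(ptr);		\
	} while (__sc_w(ptr, (new_expr)));	\
} while (0)

With that, e.g. an atomic increment becomes llsc_loop_w(p, old, old + 1),
but whether the generated loop stays within the architectural limits
depends entirely on the compiler.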

In case you care, here is the ticket lock code from the OpenSBI patchset
ported over to Linux:
https://github.com/cmuellner/linux/commit/40a41d561df71fbe247016b303a1ef91bf9702f3

> I'm not really convinced we have the ability to discuss the performance
> of locks on RISC-V right now, at least in any meaningful capacity.  The
> set of systems we have right now are just so far off from having a
> competitive memory system that optimizing for them doesn't seem to be
> worth the time, and as there really aren't any users we don't know what
> workloads people care about.  I'm mostly interested in just keeping the
> implementation simple, and ticket locks are the simplest spinlock flavor
> I know of that's fair (I think we can all agree that unfair locks are
> going to be a disaster).
>
> There are certainly classes of systems for which ticket locks will be
> the wrong choice, and for those it makes sense to use the generic
> qspinlock implementation.  We'll likely need some changes to make that
> go fast on RISC-V, but we won't know what those are until we have
> hardware.  For now just having something that works (while staying away
> from anything that's obviously horribly inefficient) is as good as we
> can do, so I'm perfectly happy to take whatever we need to enable
> qspinlock on RISC-V.
>
> I'll likely default to ticket locks on RISC-V as they're simpler, but
> my main goal is to just get rid of the code duplication.  IMO the
> correct lock flavor is really something that's tightly coupled to both
> the microarchitecture and workload, but given how poorly the current
> crop of implementations performs on anything parallel I'm more swayed by
> simplicity.  When we end up with real hardware I'll be happy to
> re-evaluate that choice, but I don't think it's all that important today
> because we're going to need a whole new generation of implementations
> (and likely at least some ISA clarifications, as whether anything based
> on cmpxchg can be fair right now is still an open question) before we
> have anything competitive.
>
> If you guys want to go throw a "thou shalt make queued spinlocks better
> than ticket spinlocks" (or the other way around, I don't really
> personally care all that much about spinlock flavors) in the platform
> spec that's fine.  Performance knobs are always a headache so getting
> rid of one would be great, but ultimately this is a microarchitecture
> thing so we'll have to see what gets implemented.

No, the implementation details of the spinlock algorithm are not part of
any platform specification. It was discussed in this forum because I did
not know of a better place to ask this question. We all expect the RISC-V
kernel maintainer(s) to make a proper choice in the end. And as you said,
this can be re-evaluated in the future (based on results on real HW).

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-12 21:54               ` Christoph Müllner
@ 2021-04-13  8:03                 ` Peter Zijlstra
  -1 siblings, 0 replies; 92+ messages in thread
From: Peter Zijlstra @ 2021-04-13  8:03 UTC (permalink / raw)
  To: Christoph Müllner
  Cc: Palmer Dabbelt, Anup Patel, Guo Ren, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, catalin.marinas, will.deacon,
	Arnd Bergmann

On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote:
> On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:

> > My plan is to add a generic ticket-based lock, which can be selected at
> > compile time.  It'll have no architecture dependencies (though it'll
> > likely have some hooks for architectures that can make this go faster).
> > Users can then just pick which spinlock flavor they want, with the idea
> > being that smaller systems will perform better with ticket locks and
> > larger systems will perform better with queued locks.  The main goal
> > here is to give the less widely used architectures an easy way to have
> > fair locks, as right now we've got a lot of code duplication because any
> > architecture that wants ticket locks has to do it themselves.
> 
> In the case of LL/SC sequences, we have a maximum of 16 instructions
> on RISC-V. My concern with a pure-C implementation would be that
> we cannot guarantee this (e.g. somebody wants to compile with -O0),
> and I don't know of a way to abort the build if this limit is exceeded.
> Therefore I preferred inline assembly for OpenSBI (my initial idea
> was to use closure-like LL/SC macros, where you can write the loop
> in the form of C code).

For ticket locks you really only need atomic_fetch_add() and
smp_store_release(), plus architectural guarantees that the
atomic_fetch_add() makes fwd progress under contention and that a sub-word
store (through smp_store_release()) will fail the SC.

Then you can do something like:

void lock(atomic_t *lock)
{
	u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
	u16 ticket = val >> 16;

	for (;;) {
		if (ticket == (u16)val)
			break;
		cpu_relax();
		val = atomic_read_acquire(lock);
	}
}

void unlock(atomic_t *lock)
{
	u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
	u32 val = atomic_read(lock);

	smp_store_release(ptr, (u16)val + 1);
}

That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
on x86 for not being allowed to use a memop on unlock, since it's being
forced into a load-store because of all the volatile, but whatever.

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-13  8:03                 ` Peter Zijlstra
@ 2021-04-13  8:17                   ` Peter Zijlstra
  -1 siblings, 0 replies; 92+ messages in thread
From: Peter Zijlstra @ 2021-04-13  8:17 UTC (permalink / raw)
  To: Christoph Müllner
  Cc: Palmer Dabbelt, Anup Patel, Guo Ren, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, catalin.marinas, will.deacon,
	Arnd Bergmann

On Tue, Apr 13, 2021 at 10:03:01AM +0200, Peter Zijlstra wrote:

> For ticket locks you really only need atomic_fetch_add() and
> smp_store_release(), plus architectural guarantees that the
> atomic_fetch_add() makes fwd progress under contention and that a sub-word
> store (through smp_store_release()) will fail the SC.
> 
> Then you can do something like:
> 
> void lock(atomic_t *lock)
> {
> 	u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> 	u16 ticket = val >> 16;
> 
> 	for (;;) {
> 		if (ticket == (u16)val)
> 			break;
> 		cpu_relax();
> 		val = atomic_read_acquire(lock);
> 	}

A possibly better option might be:

	if (ticket == (u16)val)
		return;

	atomic_cond_read_acquire(lock, ticket == (u16)VAL);

Since that allows architectures to use WFE-like constructs.

> }
> 
> void unlock(atomic_t *lock)
> {
> 	u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> 	u32 val = atomic_read(lock);
> 
> 	smp_store_release(ptr, (u16)val + 1);
> }
> 
> That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> on x86 for not being allowed to use a memop on unlock, since it's being
> forced into a load-store because of all the volatile, but whatever.
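
Folding the two fragments together, the complete lock() would then read
(same assumptions as above):

void lock(atomic_t *lock)
{
	u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
	u16 ticket = val >> 16;

	if (ticket == (u16)val)
		return;		/* uncontended fast path */

	atomic_cond_read_acquire(lock, ticket == (u16)VAL);
}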

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-13  8:03                 ` Peter Zijlstra
@ 2021-04-13  9:22                   ` Christoph Müllner
  -1 siblings, 0 replies; 92+ messages in thread
From: Christoph Müllner @ 2021-04-13  9:22 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Palmer Dabbelt, Anup Patel, Guo Ren, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, catalin.marinas, will.deacon,
	Arnd Bergmann

On Tue, Apr 13, 2021 at 10:03 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote:
> > On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> > > My plan is to add a generic ticket-based lock, which can be selected at
> > > compile time.  It'll have no architecture dependencies (though it'll
> > > likely have some hooks for architectures that can make this go faster).
> > > Users can then just pick which spinlock flavor they want, with the idea
> > > being that smaller systems will perform better with ticket locks and
> > > larger systems will perform better with queued locks.  The main goal
> > > here is to give the less widely used architectures an easy way to have
> > > fair locks, as right now we've got a lot of code duplication because any
> > > architecture that wants ticket locks has to do it themselves.
> >
> > In the case of LL/SC sequences, we have a maximum of 16 instructions
> > on RISC-V. My concern with a pure-C implementation would be that
> > we cannot guarantee this (e.g. somebody wants to compile with -O0)
> > and I don't know of a way to abort the build in case this limit is exceeded.
> > Therefore I have preferred inline assembly for OpenSBI (my initial idea
> > was to use closure-like LL/SC macros, where you can write the loop
> > in the form of C code).
>
> For ticket locks you really only need atomic_fetch_add() and
> smp_store_release(), and architectural guarantees that the
> atomic_fetch_add() has fwd progress under contention and that a sub-word
> store (through smp_store_release()) will fail the SC.
>
> Then you can do something like:
>
> void lock(atomic_t *lock)
> {
>         u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
>         u16 ticket = val >> 16;
>
>         for (;;) {
>                 if (ticket == (u16)val)
>                         break;
>                 cpu_relax();
>                 val = atomic_read_acquire(lock);
>         }
> }
>
> void unlock(atomic_t *lock)
> {
>         u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
>         u32 val = atomic_read(lock);
>
>         smp_store_release(ptr, (u16)val + 1);
> }
>
> That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> on x86 for not being allowed to use a memop on unlock, since it's being
> forced into a load-store because of all the volatile, but whatever.

What about trylock()?
I.e. one could implement trylock() without a loop, by letting
trylock() fail if the SC fails.
That looks safe at first glance, but nobody does this right now.

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-13  9:22                   ` Christoph Müllner
@ 2021-04-13  9:30                     ` Catalin Marinas
  -1 siblings, 0 replies; 92+ messages in thread
From: Catalin Marinas @ 2021-04-13  9:30 UTC (permalink / raw)
  To: Christoph Müllner
  Cc: Peter Zijlstra, Palmer Dabbelt, Anup Patel, Guo Ren, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, will.deacon, Arnd Bergmann

On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:
> On Tue, Apr 13, 2021 at 10:03 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote:
> > > On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
> > > > My plan is to add a generic ticket-based lock, which can be selected at
> > > > compile time.  It'll have no architecture dependencies (though it'll
> > > > likely have some hooks for architectures that can make this go faster).
> > > > Users can then just pick which spinlock flavor they want, with the idea
> > > > being that smaller systems will perform better with ticket locks and
> > > > larger systems will perform better with queued locks.  The main goal
> > > > here is to give the less widely used architectures an easy way to have
> > > > fair locks, as right now we've got a lot of code duplication because any
> > > > architecture that wants ticket locks has to do it themselves.
> > >
> > > In the case of LL/SC sequences, we have a maximum of 16 instructions
> > > on RISC-V. My concern with a pure-C implementation would be that
> > > we cannot guarantee this (e.g. somebody wants to compile with -O0)
> > > and I don't know of a way to abort the build in case this limit is exceeded.
> > > Therefore I have preferred inline assembly for OpenSBI (my initial idea
> > > was to use closure-like LL/SC macros, where you can write the loop
> > > in the form of C code).
> >
> > For ticket locks you really only need atomic_fetch_add() and
> > smp_store_release(), and architectural guarantees that the
> > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > store (through smp_store_release()) will fail the SC.
> >
> > Then you can do something like:
> >
> > void lock(atomic_t *lock)
> > {
> >         u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> >         u16 ticket = val >> 16;
> >
> >         for (;;) {
> >                 if (ticket == (u16)val)
> >                         break;
> >                 cpu_relax();
> >                 val = atomic_read_acquire(lock);
> >         }
> > }
> >
> > void unlock(atomic_t *lock)
> > {
> >         u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> >         u32 val = atomic_read(lock);
> >
> >         smp_store_release(ptr, (u16)val + 1);
> > }
> >
> > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> > on x86 for not being allowed to use a memop on unlock, since it's being
> > forced into a load-store because of all the volatile, but whatever.
> 
> What about trylock()?
> I.e. one could implement trylock() without a loop, by letting
> trylock() fail if the SC fails.
> That looks safe at first glance, but nobody does this right now.

Not familiar with RISC-V but I'd recommend that a trylock only fails if
the lock is locked (after LR). An SC may fail for other reasons
(cacheline eviction; depending on the microarchitecture) even if the
lock is unlocked. At least on arm64 we had this issue with an
implementation having a tendency to always fail the first STXR.

-- 
Catalin
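
To make the failure mode concrete, the loop-free variant under
discussion would look roughly like this (a sketch only, using
hypothetical __lr_acquire()/__sc() wrappers and the next/owner union
that appears later in this thread):

int trylock_loopfree(arch_spinlock_t *lock)
{
	arch_spinlock_t l;

	l.v32 = __lr_acquire(lock);	/* hypothetical LR-acquire wrapper */
	if (l.owner != l.next)
		return 0;		/* genuinely locked: safe to fail */
	l.next++;
	return __sc(lock, l.v32);	/* hypothetical SC wrapper: can fail
					 * spuriously (e.g. cacheline
					 * eviction) even when the lock
					 * was free */
}

Per the recommendation above, only the locked check after the LR should
report failure; an SC failure would have to retry the LR/SC sequence
instead.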

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-13  9:22                   ` Christoph Müllner
@ 2021-04-13  9:35                     ` Peter Zijlstra
  -1 siblings, 0 replies; 92+ messages in thread
From: Peter Zijlstra @ 2021-04-13  9:35 UTC (permalink / raw)
  To: Christoph Müllner
  Cc: Palmer Dabbelt, Anup Patel, Guo Ren, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, catalin.marinas, will.deacon,
	Arnd Bergmann

On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:

> > For ticket locks you really only need atomic_fetch_add() and
> > smp_store_release(), and architectural guarantees that the
> > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > store (through smp_store_release()) will fail the SC.
> >
> > Then you can do something like:
> >
> > void lock(atomic_t *lock)
> > {
> >         u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> >         u16 ticket = val >> 16;
> >
> >         for (;;) {
> >                 if (ticket == (u16)val)
> >                         break;
> >                 cpu_relax();
> >                 val = atomic_read_acquire(lock);
> >         }
> > }
> >
> > void unlock(atomic_t *lock)
> > {
> >         u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> >         u32 val = atomic_read(lock);
> >
> >         smp_store_release(ptr, (u16)val + 1);
> > }
> >
> > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> > on x86 for not being allowed to use a memop on unlock, since it's being
> > forced into a load-store because of all the volatile, but whatever.
> 
> What about trylock()?
> I.e. one could implement trylock() without a loop, by letting
> trylock() fail if the SC fails.
> That looks safe at first glance, but nobody does this right now.

Generic code has to use cmpxchg(), and then you get something like this:

bool trylock(atomic_t *lock)
{
	u32 old = atomic_read(lock);

	if ((old >> 16) != (old & 0xffff))
		return false;

	return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */
}

That will try to do the full LL/SC loop, because it wants to complete
the cmpxchg, but in generic code we have no other option.

(Is this what C11's weak cmpxchg is for?)
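
As a side note on that parenthetical: C11's weak compare-exchange is
indeed allowed to fail spuriously, so it can map to a single LR/SC
attempt instead of a full retry loop. A standalone sketch in plain C11
(illustration only, not kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

bool trylock(_Atomic uint32_t *lock)
{
	uint32_t old = atomic_load_explicit(lock, memory_order_relaxed);

	if ((old >> 16) != (old & 0xffff))
		return false;	/* next != owner: lock is held */

	/* Weak form: may fail spuriously, i.e. a single SC attempt. */
	return atomic_compare_exchange_weak_explicit(lock, &old,
			old + (1 << 16),
			memory_order_acquire, memory_order_relaxed);
}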

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-13  9:30                     ` Catalin Marinas
@ 2021-04-13  9:55                       ` Christoph Müllner
  -1 siblings, 0 replies; 92+ messages in thread
From: Christoph Müllner @ 2021-04-13  9:55 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Peter Zijlstra, Palmer Dabbelt, Anup Patel, Guo Ren, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, will.deacon, Arnd Bergmann

On Tue, Apr 13, 2021 at 11:31 AM Catalin Marinas
<catalin.marinas@arm.com> wrote:
>
> On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:
> > On Tue, Apr 13, 2021 at 10:03 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote:
> > > > On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
> > > > > My plan is to add a generic ticket-based lock, which can be selected at
> > > > > compile time.  It'll have no architecture dependencies (though it'll
> > > > > likely have some hooks for architectures that can make this go faster).
> > > > > Users can then just pick which spinlock flavor they want, with the idea
> > > > > being that smaller systems will perform better with ticket locks and
> > > > > larger systems will perform better with queued locks.  The main goal
> > > > > here is to give the less widely used architectures an easy way to have
> > > > > fair locks, as right now we've got a lot of code duplication because any
> > > > > architecture that wants ticket locks has to do it themselves.
> > > >
> > > > In the case of LL/SC sequences, we have a maximum of 16 instructions
> > > > on RISC-V. My concern with a pure-C implementation would be that
> > > > we cannot guarantee this (e.g. somebody wants to compile with -O0)
> > > > and I don't know of a way to abort the build in case this limit is exceeded.
> > > > Therefore I have preferred inline assembly for OpenSBI (my initial idea
> > > > was to use closure-like LL/SC macros, where you can write the loop
> > > > in the form of C code).
> > >
> > > For ticket locks you really only need atomic_fetch_add() and
> > > smp_store_release(), and architectural guarantees that the
> > > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > > store (through smp_store_release()) will fail the SC.
> > >
> > > Then you can do something like:
> > >
> > > void lock(atomic_t *lock)
> > > {
> > >         u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> > >         u16 ticket = val >> 16;
> > >
> > >         for (;;) {
> > >                 if (ticket == (u16)val)
> > >                         break;
> > >                 cpu_relax();
> > >                 val = atomic_read_acquire(lock);
> > >         }
> > > }
> > >
> > > void unlock(atomic_t *lock)
> > > {
> > >         u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> > >         u32 val = atomic_read(lock);
> > >
> > >         smp_store_release(ptr, (u16)val + 1);
> > > }
> > >
> > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> > > on x86 for not being allowed to use a memop on unlock, since it's being
> > > forced into a load-store because of all the volatile, but whatever.
> >
> > What about trylock()?
> > I.e. one could implement trylock() without a loop, by letting
> > trylock() fail if the SC fails.
> > That looks safe at first glance, but nobody does this right now.
>
> Not familiar with RISC-V but I'd recommend that a trylock only fails if
> the lock is locked (after LR). An SC may fail for other reasons
> (cacheline eviction; depending on the microarchitecture) even if the
> lock is unlocked. At least on arm64 we had this issue with an
> implementation having a tendency to always fail the first STXR.

Interesting data point.
Thanks!

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-13  9:35                     ` Peter Zijlstra
@ 2021-04-13 10:25                       ` Christoph Müllner
  -1 siblings, 0 replies; 92+ messages in thread
From: Christoph Müllner @ 2021-04-13 10:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Palmer Dabbelt, Anup Patel, Guo Ren, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, catalin.marinas, will.deacon,
	Arnd Bergmann

On Tue, Apr 13, 2021 at 11:37 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:
>
> > > For ticket locks you really only need atomic_fetch_add() and
> > > smp_store_release(), and architectural guarantees that the
> > > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > > store (through smp_store_release()) will fail the SC.
> > >
> > > Then you can do something like:
> > >
> > > void lock(atomic_t *lock)
> > > {
> > >         u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> > >         u16 ticket = val >> 16;
> > >
> > >         for (;;) {
> > >                 if (ticket == (u16)val)
> > >                         break;
> > >                 cpu_relax();
> > >                 val = atomic_read_acquire(lock);
> > >         }
> > > }
> > >
> > > void unlock(atomic_t *lock)
> > > {
> > >         u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> > >         u32 val = atomic_read(lock);
> > >
> > >         smp_store_release(ptr, (u16)val + 1);
> > > }
> > >
> > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> > > on x86 for not being allowed to use a memop on unlock, since it's being
> > > forced into a load-store because of all the volatile, but whatever.
> >
> > What about trylock()?
> > I.e. one could implement trylock() without a loop, by letting
> > trylock() fail if the SC fails.
> > That looks safe at first glance, but nobody does this right now.
>
> Generic code has to use cmpxchg(), and then you get something like this:
>
> bool trylock(atomic_t *lock)
> {
>         u32 old = atomic_read(lock);
>
>         if ((old >> 16) != (old & 0xffff))
>                 return false;
>
>         return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */
> }

This approach requires two loads (atomic_read() and cmpxchg()), which
should not be necessary.
It is quite unlikely that a compiler would detect this pattern and
optimize it.

A somewhat less generic solution would be to wrap the LL/SC
instructions (which would be mandatory in this case)
and do something like this:

uint32_t __smp_load_acquire_reserved(void*);
int __smp_store_release_conditional(void*, uint32_t);

typedef union {
    uint32_t v32;
    struct {
        uint16_t owner;
        uint16_t next;
    };
} arch_spinlock_t;

int trylock(arch_spinlock_t *lock)
{
    arch_spinlock_t l;
    int success;
    do {
        l.v32 = __smp_load_acquire_reserved(lock);
        if (l.owner != l.next)
            return 0;
        l.next++;
        success = __smp_store_release_conditional(lock, l.v32);
    } while (!success);
    return success;
}

But here we can't tell the compiler to optimize the code between LL and SC...

>
> That will try to do the full LL/SC loop, because it wants to complete
> the cmpxchg, but in generic code we have no other option.
>
> (Is this what C11's weak cmpxchg is for?)

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-13 10:25                       ` Christoph Müllner
@ 2021-04-13 10:45                         ` Catalin Marinas
  -1 siblings, 0 replies; 92+ messages in thread
From: Catalin Marinas @ 2021-04-13 10:45 UTC (permalink / raw)
  To: Christoph Müllner
  Cc: Peter Zijlstra, Palmer Dabbelt, Anup Patel, Guo Ren, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, will.deacon, Arnd Bergmann

On Tue, Apr 13, 2021 at 12:25:00PM +0200, Christoph Müllner wrote:
> On Tue, Apr 13, 2021 at 11:37 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:
> > > What about trylock()?
> > > I.e. one could implement trylock() without a loop, by letting
> > > trylock() fail if the SC fails.
> > > That looks safe at first glance, but nobody does this right now.
> >
> > Generic code has to use cmpxchg(), and then you get something like this:
> >
> > bool trylock(atomic_t *lock)
> > {
> >         u32 old = atomic_read(lock);
> >
> >         if ((old >> 16) != (old & 0xffff))
> >                 return false;
> >
> >         return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */
> > }
> 
> This approach requires two loads (atomic_read() and cmpxchg()), which
> should not be necessary.
> It is quite unlikely that a compiler would detect this pattern and
> optimize it.
> 
> A somewhat less generic solution would be to wrap the LL/SC
> instructions (which would be mandatory in this case)
> and do something like this:
> 
> uint32_t __smp_load_acquire_reserved(void*);
> int __smp_store_release_conditional(void*, uint32_t);
> 
> typedef union {
>     uint32_t v32;
>     struct {
>         uint16_t owner;
>         uint16_t next;
>     };
> } arch_spinlock_t;
> 
> int trylock(arch_spinlock_t *lock)
> {
>     arch_spinlock_t l;
>     int success;
>     do {
>         l.v32 = __smp_load_acquire_reserved(lock);
>         if (l.owner != l.next)
>             return 0;
>         l.next++;
>         success = __smp_store_release_conditional(lock, l.v32);
>     } while (!success);
>     return success;
> }
> 
> But here we can't tell the compiler to optimize the code between LL and SC...

This indeed needs some care. IIUC RISC-V has similar restrictions to arm
here, no load/store instructions are allowed between LR and SC. You
can't guarantee that the compiler won't spill some variable onto the
stack.

BTW, I think the SC doesn't need release semantics above, only the LR
needs acquire semantics.

-- 
Catalin

^ permalink raw reply	[flat|nested] 92+ messages in thread

* RE: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-13 10:45                         ` Catalin Marinas
@ 2021-04-13 10:54                           ` David Laight
  -1 siblings, 0 replies; 92+ messages in thread
From: David Laight @ 2021-04-13 10:54 UTC (permalink / raw)
  To: 'Catalin Marinas', Christoph Müllner
  Cc: Peter Zijlstra, Palmer Dabbelt, Anup Patel, Guo Ren, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, will.deacon, Arnd Bergmann

From: Catalin Marinas
> Sent: 13 April 2021 11:45
...
> This indeed needs some care. IIUC RISC-V has similar restrictions to arm
> here, no load/store instructions are allowed between LR and SC. You
> can't guarantee that the compiler won't spill some variable onto the
> stack.

You can probably never guarantee that the compiler won't spill to the
stack, especially if someone compiles with -O0.

Which probably means that anything using LR/SC must be written in
asm and the C wrappers disabled; a sketch of what such an asm-only
trylock() might look like follows below.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
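
Following that thought, an asm-only, loop-free trylock() in the style
of the patch's lr.w/sc.w sequences might look like the sketch below
(illustrative only, RV64 assumed; it uses the 16-bit next/owner split
discussed in this thread and treats any SC failure as a trylock
failure):

static inline int arch_spin_trylock(arch_spinlock_t *lock)
{
	u32 tmp, next, owner;

	asm volatile (
		"	lr.w	%0, %3		\n"	/* tmp = next:owner */
		"	srliw	%1, %0, 16	\n"	/* next */
		"	slliw	%2, %0, 16	\n"
		"	srliw	%2, %2, 16	\n"	/* owner */
		"	bne	%1, %2, 1f	\n"	/* held: bail out */
		"	addw	%0, %0, %4	\n"	/* next++ */
		"	sc.w	%0, %0, %3	\n"	/* 0 on success */
		"	j	2f		\n"
		"1:	li	%0, 1		\n"
		"2:				\n"
		: "=&r" (tmp), "=&r" (next), "=&r" (owner), "+A" (lock->lock)
		: "r" (1 << 16)
		: "memory");

	if (tmp)
		return 0;	/* held, or the SC failed (possibly spuriously) */

	__atomic_acquire_fence();
	return 1;
}

Only base-I shifts, an addw and a forward branch sit between the lr.w
and the sc.w, so the sequence stays within the constrained-LR/SC rules
quoted elsewhere in this thread.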


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-13 10:45                         ` Catalin Marinas
@ 2021-04-13 11:04                           ` Christoph Müllner
  -1 siblings, 0 replies; 92+ messages in thread
From: Christoph Müllner @ 2021-04-13 11:04 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Peter Zijlstra, Palmer Dabbelt, Anup Patel, Guo Ren, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, will.deacon, Arnd Bergmann

On Tue, Apr 13, 2021 at 12:45 PM Catalin Marinas
<catalin.marinas@arm.com> wrote:
>
> On Tue, Apr 13, 2021 at 12:25:00PM +0200, Christoph Müllner wrote:
> > On Tue, Apr 13, 2021 at 11:37 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:
> > > > What about trylock()?
> > > > I.e. one could implement trylock() without a loop, by letting
> > > > trylock() fail if the SC fails.
> > > > That looks safe at first glance, but nobody does this right now.
> > >
> > > Generic code has to use cmpxchg(), and then you get something like this:
> > >
> > > bool trylock(atomic_t *lock)
> > > {
> > >         u32 old = atomic_read(lock);
> > >
> > >         if ((old >> 16) != (old & 0xffff))
> > >                 return false;
> > >
> > >         return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */
> > > }
> >
> > This approach requires two loads (atomic_read() and cmpxchg()), which
> > should not be necessary.
> > It is quite unlikely that a compiler would detect this pattern and
> > optimize it.
> >
> > A somewhat less generic solution would be to wrap the LL/SC
> > instructions (which would be mandatory in this case)
> > and do something like this:
> >
> > uint32_t __smp_load_acquire_reserved(void*);
> > int __smp_store_release_conditional(void*, uint32_t);
> >
> > typedef union {
> >     uint32_t v32;
> >     struct {
> >         uint16_t owner;
> >         uint16_t next;
> >     };
> > } arch_spinlock_t;
> >
> > int trylock(arch_spinlock_t *lock)
> > {
> >     arch_spinlock_t l;
> >     int success;
> >     do {
> >         l.v32 = __smp_load_acquire_reserved(lock);
> >         if (l.owner != l.next)
> >             return 0;
> >         l.next++;
> >         success = __smp_store_release_conditional(lock, l.v32);
> >     } while (!success);
> >     return success;
> > }
> >
> > But here we can't tell the compiler to optimize the code between LL and SC...
>
> This indeed needs some care. IIUC RISC-V has similar restrictions to arm
> here, no load/store instructions are allowed between LR and SC. You
> can't guarantee that the compiler won't spill some variable onto the
> stack.

RISC-V behaves similarly, but the specification is a bit more precise:
To guarantee forward progress, the ("constrained") LL/SC sequence has to
consist of <=16 instructions. Further, the "dynamic code executed between
the LR and SC instructions can only contain instructions from the base “I”
instruction set, excluding loads, stores, backward jumps, taken backward
branches, JALR, FENCE, and SYSTEM instructions".

And GCC generates a backward jump in between LL and SC, so we have more
than enough reasons not to do it this way (a conforming sequence is
sketched below for reference).
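
For reference, a conforming constrained sequence is the LR/SC
compare-and-swap example from the spec, where the only backward branch
sits after the SC (sketched from the spec's example):

	# a0 = address, a1 = expected old value, a2 = desired new value
	cas:
		lr.w	t0, (a0)	# load-reserve the old value
		bne	t0, a1, fail	# forward branch between LR and SC: allowed
		sc.w	t1, a2, (a0)	# try the store; t1 = 0 on success
		bnez	t1, cas		# backward branch taken only after the SC
		li	a0, 0		# success
		ret
	fail:
		li	a0, 1		# value mismatch
		ret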

>
> BTW, I think the SC doesn't need release semantics above, only the LR
> needs acquire semantics.
>
> --
> Catalin

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-13 10:25                       ` Christoph Müllner
@ 2021-04-13 13:19                         ` Guo Ren
  -1 siblings, 0 replies; 92+ messages in thread
From: Guo Ren @ 2021-04-13 13:19 UTC (permalink / raw)
  To: Christoph Müllner
  Cc: Peter Zijlstra, Palmer Dabbelt, Anup Patel, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, Catalin Marinas, Will Deacon,
	Arnd Bergmann

On Tue, Apr 13, 2021 at 6:25 PM Christoph Müllner
<christophm30@gmail.com> wrote:
>
> On Tue, Apr 13, 2021 at 11:37 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:
> >
> > > > For ticket locks you really only need atomic_fetch_add() and
> > > > smp_store_release(), and architectural guarantees that the
> > > > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > > > store (through smp_store_release()) will fail the SC.
> > > >
> > > > Then you can do something like:
> > > >
> > > > void lock(atomic_t *lock)
> > > > {
> > > >         u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> > > >         u16 ticket = val >> 16;
> > > >
> > > >         for (;;) {
> > > >                 if (ticket == (u16)val)
> > > >                         break;
> > > >                 cpu_relax();
> > > >                 val = atomic_read_acquire(lock);
> > > >         }
> > > > }
> > > >
> > > > void unlock(atomic_t *lock)
> > > > {
> > > >         u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> > > >         u32 val = atomic_read(lock);
> > > >
> > > >         smp_store_release(ptr, (u16)val + 1);
> > > > }
> > > >
> > > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> > > > on x86 for not being allowed to use a memop on unlock, since it's being
> > > > forced into a load-store because of all the volatile, but whatever.
> > >
> > > What about trylock()?
> > > I.e. one could implement trylock() without a loop, by letting
> > > trylock() fail if the SC fails.
> > > That looks safe at first glance, but nobody does this right now.
> >
> > Generic code has to use cmpxchg(), and then you get something like this:
> >
> > bool trylock(atomic_t *lock)
> > {
> >         u32 old = atomic_read(lock);
> >
> >         if ((old >> 16) != (old & 0xffff))
> >                 return false;
> >
> >         return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */
> > }
>
> This approach requires two loads (atomic_read() and cmpxchg()), which
> should not be necessary.
> It is quite unlikely that a compiler would detect this pattern and
> optimize it.
>
> A somewhat less generic solution would be to wrap the LL/SC
> instructions (which would be mandatory in this case)
> and do something like this:
>
> uint32_t __smp_load_acquire_reserved(void*);
> int __smp_store_release_conditional(void*, uint32_t);
>
> typedef union {
>     uint32_t v32;
>     struct {
>         uint16_t owner;
>         uint16_t next;
>     };
> } arch_spinlock_t;
>
> int trylock(arch_spinlock_t *lock)
> {
>     arch_spinlock_t l;
>     int success;
>     do {
>         l.v32 = __smp_load_acquire_reserved(lock);
>         if (l.owner != l.next)
>             return 0;
>         l.next++;
>         success = __smp_store_release_conditional(lock, l.v32);
That introduces new semantics compared with cmpxchg, and cmpxchg comes
from the CAS instruction, which was designed to solve such complex
scenarios.

The cmpxchg primitive has been widely used in Linux, so I don't
think introducing new semantics (reserved/conditional) is a good
idea.

Also, from this point of view, the CAS instruction seems more suitable
for software compatibility, although the riscv privileged spec chose
LR/SC and lists some downsides of CAS.

>     } while (!success);
>     return success;
> }
>
> But here we can't tell the compiler to optimize the code between LL and SC...
>
> >
> > That will try to do the full LL/SC loop, because it wants to complete
> > the cmpxchg, but in generic code we have no other option.
> >
> > (Is this what C11's weak cmpxchg is for?)



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-13  9:30                     ` Catalin Marinas
@ 2021-04-14  0:23                       ` Guo Ren
  -1 siblings, 0 replies; 92+ messages in thread
From: Guo Ren @ 2021-04-14  0:23 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Christoph Müllner, Peter Zijlstra, Palmer Dabbelt,
	Anup Patel, linux-riscv, Linux Kernel Mailing List, Guo Ren,
	Will Deacon, Arnd Bergmann

On Tue, Apr 13, 2021 at 5:31 PM Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:
> > On Tue, Apr 13, 2021 at 10:03 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote:
> > > > On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
> > > > > My plan is to add a generic ticket-based lock, which can be selected at
> > > > > compile time.  It'll have no architecture dependencies (though it'll
> > > > > likely have some hooks for architectures that can make this go faster).
> > > > > Users can then just pick which spinlock flavor they want, with the idea
> > > > > being that smaller systems will perform better with ticket locks and
> > > > > larger systems will perform better with queued locks.  The main goal
> > > > > here is to give the less widely used architectures an easy way to have
> > > > > fair locks, as right now we've got a lot of code duplication because any
> > > > > architecture that wants ticket locks has to do it themselves.
> > > >
> > > > In the case of LL/SC sequences, we have a maximum of 16 instructions
> > > > on RISC-V. My concern with a pure-C implementation would be that
> > > > we cannot guarantee this (e.g. somebody wants to compile with -O0)
> > > > and I don't know of a way to abort the build in case this limit exceeds.
> > > > Therefore I have preferred inline assembly for OpenSBI (my initial idea
> > > > was to use closure-like LL/SC macros, where you can write the loop
> > > > in form of C code).
> > >
> > > For ticket locks you really only needs atomic_fetch_add() and
> > > smp_store_release() and an architectural guarantees that the
> > > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > > store (through smp_store_release()) will fail the SC.
> > >
> > > Then you can do something like:
> > >
> > > void lock(atomic_t *lock)
> > > {
> > >         u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> > >         u16 ticket = val >> 16;
> > >
> > >         for (;;) {
> > >                 if (ticket == (u16)val)
> > >                         break;
> > >                 cpu_relax();
> > >                 val = atomic_read_acquire(lock);
> > >         }
> > > }
> > >
> > > void unlock(atomic_t *lock)
> > > {
> > >         u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> > >         u32 val = atomic_read(lock);
> > >
> > >         smp_store_release(ptr, (u16)val + 1);
> > > }
> > >
> > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> > > on x86 for not being allowed to use a memop on unlock, since its being
> > > forced into a load-store because of all the volatile, but whatever.
> >
> > What about trylock()?
> > I.e. one could implement trylock() without a loop, by letting
> > trylock() fail if the SC fails.
> > That looks safe on first view, but nobody does this right now.
I think it's safe for riscv LR/SC, because the A extension spec says
in section 8.3:
"As a consequence of the eventuality guarantee, if some harts in an
execution environment are executing constrained LR/SC loops, and no
other harts or devices in the execution environment execute an
unconditional store or AMO to that reservation set, then at least one
hart will eventually exit its constrained LR/SC loop."

So it guarantees forward progress for an LR/SC pair:

CPU0                    CPU1
=======                 =======
LR addr1
                        LR addr1
                        SC addr1 // guaranteed success
SC addr1

But it does not guarantee progress against another hart's
unconditional store (which I mentioned before):

u32 a = 0x55aa66bb;
u16 *ptr = &a;

CPU0                    CPU1
=======                 =======
xchg16(ptr, new)        while (1)
                                WRITE_ONCE(*(ptr + 1), x);



>
> Not familiar with RISC-V but I'd recommend that a trylock only fails if
> the lock is locked (after LR). A SC may fail for other reasons
> (cacheline eviction; depending on the microarchitecture) even if the
> lock is unlocked. At least on arm64 we had this issue with an
> implementation having a tendency to always fail the first STXR.

I think that would be a broken implementation for riscv. SC shouldn't
fail because of cache line bouncing; it should only fail because of
another real write. That means the HW implementation should use a
per-hart address monitor, not just grab the cache line into the
exclusive state without locking down the snoop channel.
I think the LR/SC implementation you mentioned is a gamble, and it
breaks the riscv spec.

Would Will's patch fix the problem you mentioned?
----
commit 9bb17be062de6f5a9c9643258951aa0935652ec3
Author: Will Deacon <will.deacon@arm.com>
Date:   Tue Jul 2 14:54:33 2013 +0100

    ARM: locks: prefetch the destination word for write prior to strex

    The cost of changing a cacheline from shared to exclusive state can be
    significant, especially when this is triggered by an exclusive store,
    since it may result in having to retry the transaction.

    This patch prefixes our {spin,read,write}_[try]lock implementations with
    pldw instructions (on CPUs which support them) to try and grab the line
    in exclusive state from the start. arch_rwlock_t is changed to avoid
    using a volatile member, since this generates compiler warnings when
    falling back on the __builtin_prefetch intrinsic which expects a const
    void * argument.

    Acked-by: Nicolas Pitre <nico@linaro.org>
    Signed-off-by: Will Deacon <will.deacon@arm.com>
----

In the end, I want to conclude my suggestions here:
 - Use ticket-lock as the default
 - Use ARCH_USE_QUEUED_SPINLOCKS_XCHG32 for riscv qspinlock
 - Disable xchg16/cmpxchg16 and any sub-word atomic primitives in riscv

--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-13  8:17                   ` Peter Zijlstra
@ 2021-04-14  2:26                     ` Guo Ren
  -1 siblings, 0 replies; 92+ messages in thread
From: Guo Ren @ 2021-04-14  2:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Müllner, Palmer Dabbelt, Anup Patel, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, Catalin Marinas, Will Deacon,
	Arnd Bergmann

Thx Peter,

On Tue, Apr 13, 2021 at 4:17 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Apr 13, 2021 at 10:03:01AM +0200, Peter Zijlstra wrote:
>
> > For ticket locks you really only needs atomic_fetch_add() and
> > smp_store_release() and an architectural guarantees that the
> > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > store (through smp_store_release()) will fail the SC.
> >
> > Then you can do something like:
> >
> > void lock(atomic_t *lock)
> > {
> >       u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> >       u16 ticket = val >> 16;
> >
> >       for (;;) {
> >               if (ticket == (u16)val)
> >                       break;
> >               cpu_relax();
> >               val = atomic_read_acquire(lock);
> >       }
Should it be?
       for (;;) {
               if (ticket == (u16)val) {
                       __atomic_acquire_fence();
                       break;
               }

>
> A possibly better might be:
>
>         if (ticket == (u16)val)
>                 return;
Should it be?
         if (ticket == (u16)val) {
                 __atomic_acquire_fence();
                 return;
         }

>
>         atomic_cond_read_acquire(lock, ticket == (u16)VAL);
>
> Since that allows architectures to use WFE like constructs.
>
> > }
> >
> > void unlock(atomic_t *lock)
> > {
> >       u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> >       u32 val = atomic_read(lock);
> >
> >       smp_store_release(ptr, (u16)val + 1);
> > }
> >
> > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> > on x86 for not being allowed to use a memop on unlock, since its being
> > forced into a load-store because of all the volatile, but whatever.
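
Putting Peter's fragments together, the suggested lock() would read
as below (a sketch, not merged code). The fast path needs no extra
acquire fence because atomic_fetch_add() is fully ordered (as Peter
confirms in his reply), and the slow path takes its acquire ordering
from the conditional read:

void lock(atomic_t *lock)
{
	u32 val = atomic_fetch_add(1<<16, lock); /* fully ordered, RCsc */
	u16 ticket = val >> 16;

	if (ticket == (u16)val)
		return;			/* got it, uncontended */

	/* spin with acquire; lets archs use WFE-like constructs */
	atomic_cond_read_acquire(lock, ticket == (u16)VAL);
}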



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-13 10:54                           ` David Laight
@ 2021-04-14  5:54                             ` Guo Ren
  -1 siblings, 0 replies; 92+ messages in thread
From: Guo Ren @ 2021-04-14  5:54 UTC (permalink / raw)
  To: David Laight
  Cc: Catalin Marinas, Christoph Müllner, Peter Zijlstra,
	Palmer Dabbelt, Anup Patel, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, will.deacon, Arnd Bergmann

On Tue, Apr 13, 2021 at 6:54 PM David Laight <David.Laight@aculab.com> wrote:
>
> From: Catalin Marinas
> > Sent: 13 April 2021 11:45
> ...
> > This indeed needs some care. IIUC RISC-V has similar restrictions as arm
> > here, no load/store instructions are allowed between LR and SC. You
> > can't guarantee that the compiler won't spill some variable onto the
> > stack.
>
> You can probably never guarantee the compiler won't spill to stack.
> Especially if someone compiles with -O0.
>
> Which probably means that anything using LR/SC must be written in
> asm and the C wrappers disabled.
Agreed, and cmpxchg is widely used in Linux. I think it's the last
remaining requirement for a complex atomic API, although cmpxchg has
the ABA problem:

CPU0                                     CPU1
=======                                  =======
load32 = READ_ONCE(*ptr32);
do {
  old32 = load32;
                                         *ptr32 = new32_tmp;
                                         *ptr32 = old32;
  load32 = cmpxchg(ptr32, old32, new32); // still succeeds
} while (load32 != old32);

That means cmpxchg only cares about the final value, not what
happened in between; that's different from LR/SC or AMO instructions.
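
To make the window concrete, here is a standalone sketch (not from
this thread) using C11 atomics in place of the kernel's cmpxchg():

#include <stdatomic.h>

/* Returns 1 on success. During the marked window another CPU can
 * store new32_tmp and then old32 back; the compare-exchange still
 * succeeds because it only compares values, whereas an LR/SC pair
 * would have lost its reservation. */
int try_update(_Atomic unsigned int *ptr32, unsigned int new32)
{
	unsigned int old32 = atomic_load(ptr32);

	/* <-- ABA window: *ptr32 = new32_tmp; *ptr32 = old32; */

	return atomic_compare_exchange_strong(ptr32, &old32, new32);
}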

>
>         David
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
>


-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-14  2:26                     ` Guo Ren
@ 2021-04-14  7:08                       ` Peter Zijlstra
  -1 siblings, 0 replies; 92+ messages in thread
From: Peter Zijlstra @ 2021-04-14  7:08 UTC (permalink / raw)
  To: Guo Ren
  Cc: Christoph Müllner, Palmer Dabbelt, Anup Patel, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, Catalin Marinas, Will Deacon,
	Arnd Bergmann

On Wed, Apr 14, 2021 at 10:26:57AM +0800, Guo Ren wrote:
> Thx Peter,
> 
> On Tue, Apr 13, 2021 at 4:17 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Tue, Apr 13, 2021 at 10:03:01AM +0200, Peter Zijlstra wrote:
> >
> > > For ticket locks you really only needs atomic_fetch_add() and
> > > smp_store_release() and an architectural guarantees that the
> > > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > > store (through smp_store_release()) will fail the SC.
> > >
> > > Then you can do something like:
> > >
> > > void lock(atomic_t *lock)
> > > {
> > >       u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> > >       u16 ticket = val >> 16;
> > >
> > >       for (;;) {
> > >               if (ticket == (u16)val)
> > >                       break;
> > >               cpu_relax();
> > >               val = atomic_read_acquire(lock);
> > >       }
> Should it be?
>        for (;;) {
>                if (ticket == (u16)val) {
>                        __atomic_acquire_fence();
>                        break;
>                }

No, atomic_fetch_add() is a full smp_mb(); it even has a comment that
says so.

Also, __atomic_acquire_fence() is an implementation detail of atomic,
and architectures need not provide it. On top of that, IIRC the atomic
_acquire/_release have RCpc ordering, where we want our locks to have
RCsc ordering (and very much not weaker than RCtso). Even more so,
adding barriers to atomics should really not be conditional.
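
In kernel terms, a fully ordered RMW behaves as if bracketed by full
barriers; the generic fallback in linux/atomic.h expands to roughly
the sketch below, which is why the fast path needs no extra acquire
fence:

static inline int fetch_add_fully_ordered(int i, atomic_t *v)
{
	int ret;

	smp_mb();	/* order all earlier accesses before the RMW */
	ret = atomic_fetch_add_relaxed(i, v);
	smp_mb();	/* order the RMW before all later accesses */
	return ret;
}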

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-14  7:08                       ` Peter Zijlstra
@ 2021-04-14  9:05                         ` Peter Zijlstra
  -1 siblings, 0 replies; 92+ messages in thread
From: Peter Zijlstra @ 2021-04-14  9:05 UTC (permalink / raw)
  To: Guo Ren
  Cc: Christoph Müllner, Palmer Dabbelt, Anup Patel, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, Catalin Marinas, Will Deacon,
	Arnd Bergmann

On Wed, Apr 14, 2021 at 09:08:18AM +0200, Peter Zijlstra wrote:
> On Wed, Apr 14, 2021 at 10:26:57AM +0800, Guo Ren wrote:
> > Thx Peter,
> > 
> > On Tue, Apr 13, 2021 at 4:17 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Tue, Apr 13, 2021 at 10:03:01AM +0200, Peter Zijlstra wrote:
> > >
> > > > For ticket locks you really only needs atomic_fetch_add() and
> > > > smp_store_release() and an architectural guarantees that the
> > > > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > > > store (through smp_store_release()) will fail the SC.
> > > >
> > > > Then you can do something like:
> > > >
> > > > void lock(atomic_t *lock)
> > > > {
> > > >       u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> > > >       u16 ticket = val >> 16;
> > > >
> > > >       for (;;) {
> > > >               if (ticket == (u16)val)
> > > >                       break;
> > > >               cpu_relax();
> > > >               val = atomic_read_acquire(lock);
> > > >       }
> > Should it be?
> >        for (;;) {
> >                if (ticket == (u16)val) {
> >                        __atomic_acquire_fence();
> >                        break;
> >                }
> 
> No, atomic_fetch_add() is full smp_mb(), it even has a comment on that
> says so.
> 
> Also, __atomic_acquire_fence() is an implementation detail of atomic,
> and architectures need not provide it. On top of that, IIRC the atomic
> _acquire/_release have RCpc ordering, where we want our locks to have
> RCsc ordering (and very much not weaker than RCtso). Even more so,
> adding barriers to atomics should really not be conditional.

That made me look at the qspinlock code, and queued_spin_*lock() uses
atomic_try_cmpxchg_acquire(), which means any arch that uses qspinlock
and has RCpc atomics will give us massive pain.

Current archs using qspinlock are: x86, arm64, power, sparc64, mips and
openrisc (WTF?!).

Of those, x86 and sparc are TSO archs with SC atomics, arm64 has RCsc
atomics, power has RCtso atomics (and is the arch we all hate for having
RCtso locks).

Now MIPS has all sorts of ill-specified barriers, but last time I
looked at it, it didn't actually use any of that and stuck to
smp_mb(), so it will have RCsc atomics.

/me goes look at wth openrisc is..  doesn't even appear to have
asm/barrier.h :-/ Looking at wikipedia it also doesn't appear to
actually have hardware ...

I'm thinking openrisc is a prime candidate for this ticket_lock.h we're
all talking about.

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [PATCH] riscv: locks: introduce ticket-based spinlock implementation
  2021-04-14  0:23                       ` Guo Ren
@ 2021-04-14  9:17                         ` Catalin Marinas
  -1 siblings, 0 replies; 92+ messages in thread
From: Catalin Marinas @ 2021-04-14  9:17 UTC (permalink / raw)
  To: Guo Ren
  Cc: Christoph Müllner, Peter Zijlstra, Palmer Dabbelt,
	Anup Patel, linux-riscv, Linux Kernel Mailing List, Guo Ren,
	Will Deacon, Arnd Bergmann

On Wed, Apr 14, 2021 at 08:23:51AM +0800, Guo Ren wrote:
> On Tue, Apr 13, 2021 at 5:31 PM Catalin Marinas <catalin.marinas@arm.com> wrote:
> > On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote:
> > > On Tue, Apr 13, 2021 at 10:03 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote:
> > > > > On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
> > > > > > My plan is to add a generic ticket-based lock, which can be selected at
> > > > > > compile time.  It'll have no architecture dependencies (though it'll
> > > > > > likely have some hooks for architectures that can make this go faster).
> > > > > > Users can then just pick which spinlock flavor they want, with the idea
> > > > > > being that smaller systems will perform better with ticket locks and
> > > > > > larger systems will perform better with queued locks.  The main goal
> > > > > > here is to give the less widely used architectures an easy way to have
> > > > > > fair locks, as right now we've got a lot of code duplication because any
> > > > > > architecture that wants ticket locks has to do it themselves.
> > > > >
> > > > > In the case of LL/SC sequences, we have a maximum of 16 instructions
> > > > > on RISC-V. My concern with a pure-C implementation would be that
> > > > > we cannot guarantee this (e.g. somebody wants to compile with -O0)
> > > > > and I don't know of a way to abort the build in case this limit exceeds.
> > > > > Therefore I have preferred inline assembly for OpenSBI (my initial idea
> > > > > was to use closure-like LL/SC macros, where you can write the loop
> > > > > in form of C code).
> > > >
> > > > For ticket locks you really only needs atomic_fetch_add() and
> > > > smp_store_release() and an architectural guarantees that the
> > > > atomic_fetch_add() has fwd progress under contention and that a sub-word
> > > > store (through smp_store_release()) will fail the SC.
> > > >
> > > > Then you can do something like:
> > > >
> > > > void lock(atomic_t *lock)
> > > > {
> > > >         u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> > > >         u16 ticket = val >> 16;
> > > >
> > > >         for (;;) {
> > > >                 if (ticket == (u16)val)
> > > >                         break;
> > > >                 cpu_relax();
> > > >                 val = atomic_read_acquire(lock);
> > > >         }
> > > > }
> > > >
> > > > void unlock(atomic_t *lock)
> > > > {
> > > >         u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__);
> > > >         u32 val = atomic_read(lock);
> > > >
> > > >         smp_store_release(ptr, (u16)val + 1);
> > > > }
> > > >
> > > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal
> > > > on x86 for not being allowed to use a memop on unlock, since its being
> > > > forced into a load-store because of all the volatile, but whatever.
> > >
> > > What about trylock()?
> > > I.e. one could implement trylock() without a loop, by letting
> > > trylock() fail if the SC fails.
> > > That looks safe on first view, but nobody does this right now.
> 
> I think it's safe for riscv LR/SC, because in spec A 8.3 section:
> "As a consequence of the eventuality guarantee, if some harts in an
> execution environment are executing constrained LR/SC loops, and no
> other harts or devices in the execution environment execute an
> unconditional store or AMO to that reservation set, then at least one
> hart will eventually exit its constrained LR/SC loop."

This is clearly talking about _loops_ and that one hart will
_eventually_ exit the loop. It does not say that there is a guaranteed
LR/SC successful sequence or single iteration of the loop.

> So it guarantees LR/SC pair:
> 
> CPU0                   CPU1
> =======             =======
> LR addr1
>                             LR addr1
>                             SC addr1 // guarantee success.
> SC addr1

I don't see the RISC-V spec guaranteeing the (eventual) success of the
SC on CPU1 _without_ a loop.

> But not guarantee, another hart unconditional store (which I mentioned before):
> u32 a = 0x55aa66bb;
> u16 *ptr = &a;
> 
> CPU0                       CPU1
> =========             =========
> xchg16(ptr, new)     while(1)
>                                     WRITE_ONCE(*(ptr + 1), x);

If xchg16() is implemented with LR/SC, that's not guaranteed either. If
it is implemented as some form of swap, the architecture may guarantee
its success (or, more precisely, that it won't deadlock).

> > Not familiar with RISC-V but I'd recommend that a trylock only fails if
> > the lock is locked (after LR). A SC may fail for other reasons
> > (cacheline eviction; depending on the microarchitecture) even if the
> > lock is unlocked. At least on arm64 we had this issue with an
> > implementation having a tendency to always fail the first STXR.
> 
> I think it's a broken implementation for riscv. SC couldn't fail by
> cache line bouncing and only could fail by another real write.
> That means the HW implementation should use a per-hart address monitor
> not just grab the cache line into the exclusive state without lockdown
> the SNOOP channel.
> I think the implementation of LR/SC you mentioned is a gambling style
> but broke the riscv spec.

Arm has the notion of exclusive monitors (local, global), but an
implementation may fake them by using cacheline states. And that's
allowed, since the monitor doesn't need to guarantee success on the
first try, only the eventual success of an ldxr/stxr loop.

> Is the patch of Will's would fix up the problem you mentioned?
> ----
> commit 9bb17be062de6f5a9c9643258951aa0935652ec3
> Author: Will Deacon <will.deacon@arm.com>
> Date:   Tue Jul 2 14:54:33 2013 +0100
> 
>     ARM: locks: prefetch the destination word for write prior to strex

No, that's only an optimisation. A prefetch would not guarantee that
the cacheline stays in a certain state. There can be a long time
between the LR and the SC, especially if interrupts are enabled or if
you add virtualisation into the mix.

The commit I was referring to is 4ecf7ccb1973 ("arm64: spinlock: retry
trylock operation if strex fails on free lock").
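
The idea of that commit, as a sketch (not the actual arm64 code):
retry the trylock only while the lock still reads as free, so a
spurious SC failure cannot turn into a false "locked" answer. The
__lr()/__sc() wrappers are hypothetical stand-ins for one LR/SC pair,
using the 16/16 ticket layout from earlier in the thread:

u32 __lr(u32 *addr);		/* hypothetical LR wrapper */
int __sc(u32 *addr, u32 val);	/* hypothetical SC wrapper, 0 on success */

int trylock_retry_if_free(u32 *lock)
{
	u32 old;

	do {
		old = __lr(lock);
		if ((old >> 16) != (old & 0xffff))
			return 0;	/* genuinely held: give up */
	} while (__sc(lock, old + (1 << 16)));	/* SC failed on a free lock: retry */

	return 1;
}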

-- 
Catalin

^ permalink raw reply	[flat|nested] 92+ messages in thread

* [RFC][PATCH] locking: Generic ticket-lock
  2021-04-14  9:05                         ` Peter Zijlstra
@ 2021-04-14 10:16                           ` Peter Zijlstra
  -1 siblings, 0 replies; 92+ messages in thread
From: Peter Zijlstra @ 2021-04-14 10:16 UTC (permalink / raw)
  To: Guo Ren
  Cc: Christoph Müllner, Palmer Dabbelt, Anup Patel, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, Catalin Marinas, Will Deacon,
	Arnd Bergmann, jonas, stefan.kristiansson, shorne

On Wed, Apr 14, 2021 at 11:05:24AM +0200, Peter Zijlstra wrote:

> That made me look at the qspinlock code, and queued_spin_*lock() uses
> atomic_try_cmpxchg_acquire(), which means any arch that uses qspinlock
> and has RCpc atomics will give us massive pain.
> 
> Current archs using qspinlock are: x86, arm64, power, sparc64, mips and
> openrisc (WTF?!).
> 
> Of those, x86 and sparc are TSO archs with SC atomics, arm64 has RCsc
> atomics, power has RCtso atomics (and is the arch we all hate for having
> RCtso locks).
> 
> Now MIPS has all sorts of ill specified barriers, but last time looked
> at it it didn't actually use any of that and stuck to using smp_mb(), so
> it will have RCsc atomics.
> 
> /me goes look at wth openrisc is..  doesn't even appear to have
> asm/barrier.h :-/ Looking at wikipedia it also doesn't appear to
> actually have hardware ...

FWIW this is broken, anything SMP *MUST* define mb(), at the very least.

> I'm thinking openrisc is a prime candidate for this ticket_lock.h we're
> all talking about.

How's this then? Compile tested only on openrisc/simple_smp_defconfig.

---
 arch/openrisc/Kconfig                      |  1 -
 arch/openrisc/include/asm/Kbuild           |  5 +-
 arch/openrisc/include/asm/spinlock.h       |  3 +-
 arch/openrisc/include/asm/spinlock_types.h |  2 +-
 include/asm-generic/qspinlock.h            | 30 +++++++++++
 include/asm-generic/ticket-lock-types.h    | 11 ++++
 include/asm-generic/ticket-lock.h          | 86 ++++++++++++++++++++++++++++++
 7 files changed, 131 insertions(+), 7 deletions(-)

diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index 591acc5990dc..1858cf309f1f 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -32,7 +32,6 @@ config OPENRISC
 	select HAVE_DEBUG_STACKOVERFLOW
 	select OR1K_PIC
 	select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
-	select ARCH_USE_QUEUED_SPINLOCKS
 	select ARCH_USE_QUEUED_RWLOCKS
 	select OMPIC if SMP
 	select ARCH_WANT_FRAME_POINTERS
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index ca5987e11053..cb260e7d73db 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -1,9 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 generic-y += extable.h
 generic-y += kvm_para.h
-generic-y += mcs_spinlock.h
-generic-y += qspinlock_types.h
-generic-y += qspinlock.h
+generic-y += ticket-lock.h
+generic-y += ticket-lock-types.h
 generic-y += qrwlock_types.h
 generic-y += qrwlock.h
 generic-y += user.h
diff --git a/arch/openrisc/include/asm/spinlock.h b/arch/openrisc/include/asm/spinlock.h
index a8940bdfcb7e..0b839ed1f3a0 100644
--- a/arch/openrisc/include/asm/spinlock.h
+++ b/arch/openrisc/include/asm/spinlock.h
@@ -15,8 +15,7 @@
 #ifndef __ASM_OPENRISC_SPINLOCK_H
 #define __ASM_OPENRISC_SPINLOCK_H
 
-#include <asm/qspinlock.h>
-
+#include <asm/ticket-lock.h>
 #include <asm/qrwlock.h>
 
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
diff --git a/arch/openrisc/include/asm/spinlock_types.h b/arch/openrisc/include/asm/spinlock_types.h
index 7c6fb1208c88..58ea31fa65ce 100644
--- a/arch/openrisc/include/asm/spinlock_types.h
+++ b/arch/openrisc/include/asm/spinlock_types.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_OPENRISC_SPINLOCK_TYPES_H
 #define _ASM_OPENRISC_SPINLOCK_TYPES_H
 
-#include <asm/qspinlock_types.h>
+#include <asm/ticket-lock-types.h>
 #include <asm/qrwlock_types.h>
 
 #endif /* _ASM_OPENRISC_SPINLOCK_TYPES_H */
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index d74b13825501..a7a1296b0b4d 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -2,6 +2,36 @@
 /*
  * Queued spinlock
  *
+ * A 'generic' spinlock implementation that is based on MCS locks. An
+ * architecture that's looking for a 'generic' spinlock, please first consider
+ * ticket-lock.h and only come looking here when you've considered all the
+ * constraints below and can show your hardware does actually perform better
+ * with qspinlock.
+ *
+ *
+ * It relies on atomic_*_release()/atomic_*_acquire() to be RCsc (or no weaker
+ * than RCtso if you're power), where regular code only expects atomic_t to be
+ * RCpc.
+ *
+ * It relies on a far greater (compared to ticket-lock.h) set of atomic
+ * operations to behave well together, please audit them carefully to ensure
+ * they all have forward progress. Many atomic operations may default to
+ * cmpxchg() loops which will not have good forward progress properties on
+ * LL/SC architectures.
+ *
+ * One notable example is atomic_fetch_or_acquire(), which x86 cannot (cheaply)
+ * do. Carefully read the patches that introduced queued_fetch_set_pending_acquire().
+ *
+ * It also heavily relies on mixed size atomic operations, in specific it
+ * requires architectures to have xchg16; something which many LL/SC
+ * architectures need to implement as a 32bit and+or in order to satisfy the
+ * forward progress guarantees mentioned above.
+ *
+ * Further reading on mixed size atomics that might be relevant:
+ *
+ *   http://www.cl.cam.ac.uk/~pes20/popl17/mixed-size.pdf
+ *
+ *
  * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
  * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
  *
diff --git a/include/asm-generic/ticket-lock-types.h b/include/asm-generic/ticket-lock-types.h
new file mode 100644
index 000000000000..829759aedda8
--- /dev/null
+++ b/include/asm-generic/ticket-lock-types.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_GENERIC_TICKET_LOCK_TYPES_H
+#define __ASM_GENERIC_TICKET_LOCK_TYPES_H
+
+#include <linux/types.h>
+typedef atomic_t arch_spinlock_t;
+
+#define __ARCH_SPIN_LOCK_UNLOCKED	ATOMIC_INIT(0)
+
+#endif /* __ASM_GENERIC_TICKET_LOCK_TYPES_H */
diff --git a/include/asm-generic/ticket-lock.h b/include/asm-generic/ticket-lock.h
new file mode 100644
index 000000000000..3f0d53e21a37
--- /dev/null
+++ b/include/asm-generic/ticket-lock.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * 'Generic' ticket-lock implementation.
+ *
+ * It relies on atomic_fetch_add() having well defined forward progress
+ * guarantees under contention. If your architecture cannot provide this, stick
+ * to a test-and-set lock.
+ *
+ * It also relies on atomic_fetch_add() being safe vs smp_store_release() on a
+ * sub-word of the value. This is generally true for anything LL/SC although
+ * you'd be hard pressed to find anything useful in architecture specifications
+ * about this. If your architecture cannot do this you might be better off with
+ * a test-and-set.
+ *
+ * It further assumes atomic_*_release() + atomic_*_acquire() is RCpc and hence
+ * uses atomic_fetch_add() which is SC to create an RCsc lock.
+ *
+ * The implementation uses smp_cond_load_acquire() to spin, so if the
+ * architecture has WFE like instructions to sleep instead of poll for word
+ * modifications be sure to implement that (see ARM64 for example).
+ *
+ */
+
+#ifndef __ASM_GENERIC_TICKET_LOCK_H
+#define __ASM_GENERIC_TICKET_LOCK_H
+
+#include <linux/atomic.h>
+#include <asm/ticket-lock-types.h>
+
+static __always_inline void ticket_lock(arch_spinlock_t *lock)
+{
+	u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
+	u16 ticket = val >> 16;
+
+	if (ticket == (u16)val)
+		return;
+
+	atomic_cond_read_acquire(lock, ticket == (u16)VAL);
+}
+
+static __always_inline bool ticket_trylock(arch_spinlock_t *lock)
+{
+	u32 old = atomic_read(lock);
+
+	if ((old >> 16) != (old & 0xffff))
+		return false;
+
+	return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */
+}
+
+static __always_inline void ticket_unlock(arch_spinlock_t *lock)
+{
+	u16 *ptr = (u16 *)lock + __is_defined(__BIG_ENDIAN);
+	u32 val = atomic_read(lock);
+
+	smp_store_release(ptr, (u16)val + 1);
+}
+
+static __always_inline int ticket_is_locked(arch_spinlock_t *lock)
+{
+	u32 val = atomic_read(lock);
+
+	return ((val >> 16) != (val & 0xffff));
+}
+
+static __always_inline int ticket_is_contended(arch_spinlock_t *lock)
+{
+	u32 val = atomic_read(lock);
+
+	return (s16)((val >> 16) - (val & 0xffff)) > 1;
+}
+
+static __always_inline int ticket_value_unlocked(arch_spinlock_t lock)
+{
+	return !ticket_is_locked(&lock);
+}
+
+#define arch_spin_lock(l)		ticket_lock(l)
+#define arch_spin_trylock(l)		ticket_trylock(l)
+#define arch_spin_unlock(l)		ticket_unlock(l)
+#define arch_spin_is_locked(l)		ticket_is_locked(l)
+#define arch_spin_is_contended(l)	ticket_is_contended(l)
+#define arch_spin_value_unlocked(l)	ticket_value_unlocked(l)
+
+#endif /* __ASM_GENERIC_TICKET_LOCK_H */

^ permalink raw reply related	[flat|nested] 92+ messages in thread

* Re: [RFC][PATCH] locking: Generic ticket-lock
  2021-04-14 10:16                           ` Peter Zijlstra
@ 2021-04-14 12:39                             ` Guo Ren
  -1 siblings, 0 replies; 92+ messages in thread
From: Guo Ren @ 2021-04-14 12:39 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Müllner, Palmer Dabbelt, Anup Patel, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, Catalin Marinas, Will Deacon,
	Arnd Bergmann, Jonas Bonn, Stefan Kristiansson, Stafford Horne

On Wed, Apr 14, 2021 at 6:16 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Apr 14, 2021 at 11:05:24AM +0200, Peter Zijlstra wrote:
>
> > That made me look at the qspinlock code, and queued_spin_*lock() uses
> > atomic_try_cmpxchg_acquire(), which means any arch that uses qspinlock
> > and has RCpc atomics will give us massive pain.
> >
> > Current archs using qspinlock are: x86, arm64, power, sparc64, mips and
> > openrisc (WTF?!).
> >
> > Of those, x86 and sparc are TSO archs with SC atomics, arm64 has RCsc
> > atomics, power has RCtso atomics (and is the arch we all hate for having
> > RCtso locks).
> >
> > Now MIPS has all sorts of ill specified barriers, but last time looked
> > at it it didn't actually use any of that and stuck to using smp_mb(), so
> > it will have RCsc atomics.
> >
> > /me goes look at wth openrisc is..  doesn't even appear to have
> > asm/barrier.h :-/ Looking at wikipedia it also doesn't appear to
> > actually have hardware ...
>
> FWIW this is broken, anything SMP *MUST* define mb(), at the very least.
>
> > I'm thinking openrisc is a prime candidate for this ticket_lock.h we're
> > all talking about.
>
> How's this then? Compile tested only on openrisc/simple_smp_defconfig.
I've tested it on csky SMP*4 hw (860) & riscv SMP*4 hw (c910) and it's okay.

Hope you can keep
typedef struct {
        union {
                atomic_t lock;
                struct __raw_tickets {
#ifdef __BIG_ENDIAN
                        u16 next;
                        u16 owner;
#else
                        u16 owner;
                        u16 next;
#endif
                } tickets;
        };
} arch_spinlock_t;

Using owner & next is much more readable.
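
For illustration, with that union the helpers can be written against the
named fields instead of open-coded shifts. A sketch only, assuming the
layout above (similar in spirit to what arch/arm and arch/csky already
do), not what the posted patch uses:

static __always_inline void ticket_unlock(arch_spinlock_t *lock)
{
	/* hand the lock to the next ticket holder */
	smp_store_release(&lock->tickets.owner, lock->tickets.owner + 1);
}

static __always_inline int ticket_is_contended(arch_spinlock_t *lock)
{
	struct __raw_tickets tickets = READ_ONCE(lock->tickets);

	return (tickets.next - tickets.owner) > 1;
}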

>
> ---
>  arch/openrisc/Kconfig                      |  1 -
>  arch/openrisc/include/asm/Kbuild           |  5 +-
>  arch/openrisc/include/asm/spinlock.h       |  3 +-
>  arch/openrisc/include/asm/spinlock_types.h |  2 +-
>  include/asm-generic/qspinlock.h            | 30 +++++++++++
>  include/asm-generic/ticket-lock-types.h    | 11 ++++
>  include/asm-generic/ticket-lock.h          | 86 ++++++++++++++++++++++++++++++
>  7 files changed, 131 insertions(+), 7 deletions(-)
>
> diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
> index 591acc5990dc..1858cf309f1f 100644
> --- a/arch/openrisc/Kconfig
> +++ b/arch/openrisc/Kconfig
> @@ -32,7 +32,6 @@ config OPENRISC
>         select HAVE_DEBUG_STACKOVERFLOW
>         select OR1K_PIC
>         select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
> -       select ARCH_USE_QUEUED_SPINLOCKS
>         select ARCH_USE_QUEUED_RWLOCKS
>         select OMPIC if SMP
>         select ARCH_WANT_FRAME_POINTERS
> diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
> index ca5987e11053..cb260e7d73db 100644
> --- a/arch/openrisc/include/asm/Kbuild
> +++ b/arch/openrisc/include/asm/Kbuild
> @@ -1,9 +1,8 @@
>  # SPDX-License-Identifier: GPL-2.0
>  generic-y += extable.h
>  generic-y += kvm_para.h
> -generic-y += mcs_spinlock.h
> -generic-y += qspinlock_types.h
> -generic-y += qspinlock.h
> +generic-y += ticket-lock.h
> +generic-y += ticket-lock-types.h
>  generic-y += qrwlock_types.h
>  generic-y += qrwlock.h
>  generic-y += user.h
> diff --git a/arch/openrisc/include/asm/spinlock.h b/arch/openrisc/include/asm/spinlock.h
> index a8940bdfcb7e..0b839ed1f3a0 100644
> --- a/arch/openrisc/include/asm/spinlock.h
> +++ b/arch/openrisc/include/asm/spinlock.h
> @@ -15,8 +15,7 @@
>  #ifndef __ASM_OPENRISC_SPINLOCK_H
>  #define __ASM_OPENRISC_SPINLOCK_H
>
> -#include <asm/qspinlock.h>
> -
> +#include <asm/ticket-lock.h>
>  #include <asm/qrwlock.h>
>
>  #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
> diff --git a/arch/openrisc/include/asm/spinlock_types.h b/arch/openrisc/include/asm/spinlock_types.h
> index 7c6fb1208c88..58ea31fa65ce 100644
> --- a/arch/openrisc/include/asm/spinlock_types.h
> +++ b/arch/openrisc/include/asm/spinlock_types.h
> @@ -1,7 +1,7 @@
>  #ifndef _ASM_OPENRISC_SPINLOCK_TYPES_H
>  #define _ASM_OPENRISC_SPINLOCK_TYPES_H
>
> -#include <asm/qspinlock_types.h>
> +#include <asm/ticket-lock-types.h>
>  #include <asm/qrwlock_types.h>
>
>  #endif /* _ASM_OPENRISC_SPINLOCK_TYPES_H */
> diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
> index d74b13825501..a7a1296b0b4d 100644
> --- a/include/asm-generic/qspinlock.h
> +++ b/include/asm-generic/qspinlock.h
> @@ -2,6 +2,36 @@
>  /*
>   * Queued spinlock
>   *
> + * A 'generic' spinlock implementation that is based on MCS locks. An
> + * architecture that's looking for a 'generic' spinlock, please first consider
> + * ticket-lock.h and only come looking here when you've considered all the
> + * constraints below and can show your hardware does actually perform better
> + * with qspinlock.
> + *
> + *
> + * It relies on atomic_*_release()/atomic_*_acquire() to be RCsc (or no weaker
> + * than RCtso if you're power), where regular code only expects atomic_t to be
> + * RCpc.
> + *
> + * It relies on a far greater (compared to ticket-lock.h) set of atomic
> + * operations to behave well together, please audit them carefully to ensure
> + * they all have forward progress. Many atomic operations may default to
> + * cmpxchg() loops which will not have good forward progress properties on
> + * LL/SC architectures.
> + *
> + * One notable example is atomic_fetch_or_acquire(), which x86 cannot (cheaply)
> + * do. Carefully read the patches that introduced queued_fetch_set_pending_acquire().
> + *
> + * It also heavily relies on mixed size atomic operations, in specific it
> + * requires architectures to have xchg16; something which many LL/SC
> + * architectures need to implement as a 32bit and+or in order to satisfy the
> + * forward progress guarantees mentioned above.
> + *
> + * Further reading on mixed size atomics that might be relevant:
> + *
> + *   http://www.cl.cam.ac.uk/~pes20/popl17/mixed-size.pdf
> + *
> + *
>   * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
>   * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
>   *
> diff --git a/include/asm-generic/ticket-lock-types.h b/include/asm-generic/ticket-lock-types.h
> new file mode 100644
> index 000000000000..829759aedda8
> --- /dev/null
> +++ b/include/asm-generic/ticket-lock-types.h
> @@ -0,0 +1,11 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef __ASM_GENERIC_TICKET_LOCK_TYPES_H
> +#define __ASM_GENERIC_TICKET_LOCK_TYPES_H
> +
> +#include <linux/types.h>
> +typedef atomic_t arch_spinlock_t;
> +
> +#define __ARCH_SPIN_LOCK_UNLOCKED      ATOMIC_INIT(0)
> +
> +#endif /* __ASM_GENERIC_TICKET_LOCK_TYPES_H */
> diff --git a/include/asm-generic/ticket-lock.h b/include/asm-generic/ticket-lock.h
> new file mode 100644
> index 000000000000..3f0d53e21a37
> --- /dev/null
> +++ b/include/asm-generic/ticket-lock.h
> @@ -0,0 +1,86 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * 'Generic' ticket-lock implementation.
> + *
> + * It relies on atomic_fetch_add() having well defined forward progress
> + * guarantees under contention. If your architecture cannot provide this, stick
> + * to a test-and-set lock.
> + *
> + * It also relies on atomic_fetch_add() being safe vs smp_store_release() on a
> + * sub-word of the value. This is generally true for anything LL/SC although
> + * you'd be hard pressed to find anything useful in architecture specifications
> + * about this. If your architecture cannot do this you might be better off with
> + * a test-and-set.
> + *
> + * It further assumes atomic_*_release() + atomic_*_acquire() is RCpc and hence
> + * uses atomic_fetch_add() which is SC to create an RCsc lock.
> + *
> + * The implementation uses smp_cond_load_acquire() to spin, so if the
> + * architecture has WFE like instructions to sleep instead of poll for word
> + * modifications be sure to implement that (see ARM64 for example).
> + *
> + */
> +
> +#ifndef __ASM_GENERIC_TICKET_LOCK_H
> +#define __ASM_GENERIC_TICKET_LOCK_H
> +
> +#include <linux/atomic.h>
> +#include <asm/ticket-lock-types.h>
> +
> +static __always_inline void ticket_lock(arch_spinlock_t *lock)
> +{
> +       u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
atomic_fetch_add_acquire ?

> +       u16 ticket = val >> 16;
> +
> +       if (ticket == (u16)val)
> +               return;
> +
> +       atomic_cond_read_acquire(lock, ticket == (u16)VAL);
> +}
> +
> +static __always_inline bool ticket_trylock(arch_spinlock_t *lock)
> +{
> +       u32 old = atomic_read(lock);
> +
> +       if ((old >> 16) != (old & 0xffff))
> +               return false;
> +
> +       return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */
> +}
> +
> +static __always_inline void ticket_unlock(arch_spinlock_t *lock)
> +{
> +       u16 *ptr = (u16 *)lock + __is_defined(__BIG_ENDIAN);
> +       u32 val = atomic_read(lock);
> +
> +       smp_store_release(ptr, (u16)val + 1);
> +}
> +
> +static __always_inline int ticket_is_locked(arch_spinlock_t *lock)
> +{
> +       u32 val = atomic_read(lock);
> +
> +       return ((val >> 16) != (val & 0xffff));
I prefer:
return !arch_spin_value_unlocked(READ_ONCE(*lock));
> +}
> +
> +static __always_inline int ticket_is_contended(arch_spinlock_t *lock)
> +{
> +       u32 val = atomic_read(lock);
> +
> +       return (s16)((val >> 16) - (val & 0xffff)) > 1;
What about big-endian?

return (tickets.next - tickets.owner) > 1;

> +}
> +
> +static __always_inline int ticket_value_unlocked(arch_spinlock_t lock)
> +{
> +       return !ticket_is_locked(&lock);
Are you sure you want ticket_is_locked() to do atomic_read(lock) again? The
lock value already contains all the information:

return lock.tickets.owner == lock.tickets.next;

> +}
> +
> +#define arch_spin_lock(l)              ticket_lock(l)
> +#define arch_spin_trylock(l)           ticket_trylock(l)
> +#define arch_spin_unlock(l)            ticket_unlock(l)
> +#define arch_spin_is_locked(l)         ticket_is_locked(l)
> +#define arch_spin_is_contended(l)      ticket_is_contended(l)
> +#define arch_spin_value_unlocked(l)    ticket_value_unlocked(l)
> +
> +#endif /* __ASM_GENERIC_TICKET_LOCK_H */



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [RFC][PATCH] locking: Generic ticket-lock
  2021-04-14 10:16                           ` Peter Zijlstra
@ 2021-04-14 12:45                             ` Peter Zijlstra
  -1 siblings, 0 replies; 92+ messages in thread
From: Peter Zijlstra @ 2021-04-14 12:45 UTC (permalink / raw)
  To: Guo Ren
  Cc: Christoph Müllner, Palmer Dabbelt, Anup Patel, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, Catalin Marinas, Will Deacon,
	Arnd Bergmann, jonas, stefan.kristiansson, shorne

On Wed, Apr 14, 2021 at 12:16:38PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 14, 2021 at 11:05:24AM +0200, Peter Zijlstra wrote:
> 
> > That made me look at the qspinlock code, and queued_spin_*lock() uses
> > atomic_try_cmpxchg_acquire(), which means any arch that uses qspinlock
> > and has RCpc atomics will give us massive pain.
> > 
> > Current archs using qspinlock are: x86, arm64, power, sparc64, mips and
> > openrisc (WTF?!).
> > 
> > Of those, x86 and sparc are TSO archs with SC atomics, arm64 has RCsc
> > atomics, power has RCtso atomics (and is the arch we all hate for having
> > RCtso locks).
> > 
> > Now MIPS has all sorts of ill specified barriers, but last time looked
> > at it it didn't actually use any of that and stuck to using smp_mb(), so
> > it will have RCsc atomics.
> > 
> > /me goes look at wth openrisc is..  doesn't even appear to have
> > asm/barrier.h :-/ Looking at wikipedia it also doesn't appear to
> > actually have hardware ...
> 
> FWIW this is broken, anything SMP *MUST* define mb(), at the very least.

As near as I can tell this should do. The arch spec only lists this one
instruction and the text makes it sound like a completion barrier.

---
 arch/openrisc/include/asm/barrier.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/openrisc/include/asm/barrier.h b/arch/openrisc/include/asm/barrier.h
new file mode 100644
index 000000000000..7538294721be
--- /dev/null
+++ b/arch/openrisc/include/asm/barrier.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_BARRIER_H
+#define __ASM_BARRIER_H
+
+#define mb() asm volatile ("l.msync" ::: "memory")
+
+#include <asm-generic/barrier.h>
+
+#endif /* __ASM_BARRIER_H */
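
Assuming include/asm-generic/barrier.h still works the way mainline's
does, mb() is the only hook the arch needs to provide here; the generic
header derives the SMP variants from it, roughly like this (simplified
sketch, not the actual header):

#ifndef __smp_mb
#define __smp_mb()	mb()
#endif

#ifdef CONFIG_SMP
#ifndef smp_mb
#define smp_mb()	__smp_mb()
#endif
#else
#ifndef smp_mb
#define smp_mb()	barrier()
#endif
#endif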

^ permalink raw reply related	[flat|nested] 92+ messages in thread

* Re: [RFC][PATCH] locking: Generic ticket-lock
  2021-04-14 12:39                             ` Guo Ren
@ 2021-04-14 12:55                               ` Peter Zijlstra
  -1 siblings, 0 replies; 92+ messages in thread
From: Peter Zijlstra @ 2021-04-14 12:55 UTC (permalink / raw)
  To: Guo Ren
  Cc: Christoph Müllner, Palmer Dabbelt, Anup Patel, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, Catalin Marinas, Will Deacon,
	Arnd Bergmann, Jonas Bonn, Stefan Kristiansson, Stafford Horne

On Wed, Apr 14, 2021 at 08:39:33PM +0800, Guo Ren wrote:

> I've tested it on csky SMP*4 hw (860) & riscv SMP*4 hw (c910) and it's okay.

W00t :-)

> Hope you can keep
> typedef struct {
>         union {
>                 atomic_t lock;
>                 struct __raw_tickets {
> #ifdef __BIG_ENDIAN
>                         u16 next;
>                         u16 owner;
> #else
>                         u16 owner;
>                         u16 next;
> #endif
>                 } tickets;
>         };
> } arch_spinlock_t;
> 
> Using owner & next is much more readable.

That almost doubles the line-count of the thing ;-)


> > + * It further assumes atomic_*_release() + atomic_*_acquire() is RCpc and hence
> > + * uses atomic_fetch_add() which is SC to create an RCsc lock.

This ^^^ then vvv

> > +static __always_inline void ticket_lock(arch_spinlock_t *lock)
> > +{
> > +       u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> atomic_fetch_add_acquire ?

Then we must rely on the arch to implement RCsc atomics. And I for one
can never tell wth RISC-V actually does.

> > +static __always_inline int ticket_is_locked(arch_spinlock_t *lock)
> > +{
> > +       u32 val = atomic_read(lock);
> > +
> > +       return ((val >> 16) != (val & 0xffff));
> I prefer:
> return !arch_spin_value_unlocked(READ_ONCE(*lock));
> > +}

> > +}
> > +
> > +static __always_inline int ticket_value_unlocked(arch_spinlock_t lock)
> > +{
> > +       return !ticket_is_locked(&lock);
> Are you sure you want ticket_is_locked() to do atomic_read(lock) again? The
> lock value already contains all the information:
> 
> return lock.tickets.owner == lock.tickets.next;

Yeah, I wrote them the wrong way around. Couldn't be bothered to go back
when I figured it out.

> > +
> > +static __always_inline int ticket_is_contended(arch_spinlock_t *lock)
> > +{
> > +       u32 val = atomic_read(lock);
> > +
> > +       return (s16)((val >> 16) - (val & 0xffff)) > 1;
> What about big-endian?

How not? Endian-ness only matters when you go poke at sub-words, which
the above does not. Only ticket_unlock() does and cares about that.
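
A throwaway userspace check of that layout point (assuming the 16/16
split from the patch; the + __is_defined(__BIG_ENDIAN) in ticket_unlock()
selects the same half-word index at compile time):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t lock = (2u << 16) | 1u;	/* next = 2, owner = 1 */
	uint16_t *half = (uint16_t *)&lock;
	/* first byte in memory is 0x01 on little-endian (owner half
	 * comes first) and 0x00 on big-endian (next half comes first) */
	int big_endian = (*(uint8_t *)&lock == 0);

	printf("owner = %u\n", (unsigned)half[big_endian]);	/* 1 either way */
	return 0;
}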


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [RFC][PATCH] locking: Generic ticket-lock
  2021-04-14 12:55                               ` Peter Zijlstra
@ 2021-04-14 13:08                                 ` Peter Zijlstra
  -1 siblings, 0 replies; 92+ messages in thread
From: Peter Zijlstra @ 2021-04-14 13:08 UTC (permalink / raw)
  To: Guo Ren
  Cc: Christoph Müllner, Palmer Dabbelt, Anup Patel, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, Catalin Marinas, Will Deacon,
	Arnd Bergmann, Jonas Bonn, Stefan Kristiansson, Stafford Horne

On Wed, Apr 14, 2021 at 02:55:57PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 14, 2021 at 08:39:33PM +0800, Guo Ren wrote:

> > > + * It further assumes atomic_*_release() + atomic_*_acquire() is RCpc and hence
> > > + * uses atomic_fetch_add() which is SC to create an RCsc lock.
> 
> This ^^^ then vvv
> 
> > > +static __always_inline void ticket_lock(arch_spinlock_t *lock)
> > > +{
> > > +       u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> > atomic_fetch_add_acquire ?
> 
> Then we must rely on the arch to implement RCsc atomics. And I for one
> can never tell wth Risc-V actually does.

Anyway, if we can mandate that atomic acquire/release is RCsc, then yes
we can do that but then we also need:

#define smp_mb__after_spinlock()	smp_mb__after_atomic()

But currently atomic acquire/release is not RCsc (at the very least
Power does RCtso -- and power gets to deal with any and all pain caused
by that).
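
For reference, a sketch of the pattern callers already use when they need
a full barrier from a lock acquisition (generic, not taken from this
thread):

	spin_lock(&lock);
	smp_mb__after_spinlock();	/* the acquisition now acts as smp_mb() */
	/* accesses before the lock are fully ordered against accesses here */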

^ permalink raw reply	[flat|nested] 92+ messages in thread

* RE: [RFC][PATCH] locking: Generic ticket-lock
  2021-04-14 12:55                               ` Peter Zijlstra
@ 2021-04-14 15:59                                 ` David Laight
  -1 siblings, 0 replies; 92+ messages in thread
From: David Laight @ 2021-04-14 15:59 UTC (permalink / raw)
  To: 'Peter Zijlstra', Guo Ren
  Cc: Christoph Müllner, Palmer Dabbelt, Anup Patel, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, Catalin Marinas, Will Deacon,
	Arnd Bergmann, Jonas Bonn, Stefan Kristiansson, Stafford Horne

From: Peter Zijlstra
> Sent: 14 April 2021 13:56
> 
> > I've tested it on csky SMP*4 hw (860) & riscv SMP*4 hw (c910) and it's okay.
> 
> W00t :-)
> 
> > Hope you can keep
> > typedef struct {
> >         union {
> >                 atomic_t lock;
> >                 struct __raw_tickets {
> > #ifdef __BIG_ENDIAN
> >                         u16 next;
> >                         u16 owner;
> > #else
> >                         u16 owner;
> >                         u16 next;
> > #endif
> >                 } tickets;
> >         };
> > } arch_spinlock_t;
> >
> > Using owner & next is much more readable.
> 
> That almost doubles the line-count of the thing ;-)

And it relies on the compiler never spilling it to the stack, which it
is much more likely to do than for the version that uses shifts.

Have you checked what happens with -O0? I don't think that is banned by
the build system.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [RFC][PATCH] locking: Generic ticket-lock
  2021-04-14 10:16                           ` Peter Zijlstra
@ 2021-04-14 20:47                             ` Stafford Horne
  -1 siblings, 0 replies; 92+ messages in thread
From: Stafford Horne @ 2021-04-14 20:47 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Guo Ren, Christoph Müllner, Palmer Dabbelt, Anup Patel,
	linux-riscv, Linux Kernel Mailing List, Guo Ren, Catalin Marinas,
	Will Deacon, Arnd Bergmann, jonas, stefan.kristiansson

On Wed, Apr 14, 2021 at 12:16:38PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 14, 2021 at 11:05:24AM +0200, Peter Zijlstra wrote:
> 
> > That made me look at the qspinlock code, and queued_spin_*lock() uses
> > atomic_try_cmpxchg_acquire(), which means any arch that uses qspinlock
> > and has RCpc atomics will give us massive pain.
> > 
> > Current archs using qspinlock are: x86, arm64, power, sparc64, mips and
> > openrisc (WTF?!).
> > 
> > Of those, x86 and sparc are TSO archs with SC atomics, arm64 has RCsc
> > atomics, power has RCtso atomics (and is the arch we all hate for having
> > RCtso locks).
> > 
> > Now MIPS has all sorts of ill specified barriers, but last time looked
> > at it it didn't actually use any of that and stuck to using smp_mb(), so
> > it will have RCsc atomics.
> > 
> > /me goes look at wth openrisc is..  doesn't even appear to have
> > asm/barrier.h :-/ Looking at wikipedia it also doesn't appear to
> > actually have hardware ...

Yes, there is no hardware available to consumers directly; my development is
done on FPGAs.

> FWIW this is broken, anything SMP *MUST* define mb(), at the very least.

Oh, that's right, something was missed: when we developed qspinlocks we
discussed this, and my point there was that l.swa/l.lwa implied a memory
flush (l.msync/barrier). But mb() still needs to be added.

> > I'm thinking openrisc is a prime candidate for this ticket_lock.h we're
> > all talking about.
> 
> How's this then? Compile tested only on openrisc/simple_smp_defconfig.

I did my testing with this FPGA build SoC:

 https://github.com/stffrdhrn/de0_nano-multicore

Note, the CPU timer sync logic uses mb() and is a bit flaky, so the missing
mb() might be a reason. I thought we had defined mb() using l.msync, but it
seems to have gotten lost.

With that said, I could test out this ticket-lock implementation. How would I
tell if it's better than qspinlock?

> ---
>  arch/openrisc/Kconfig                      |  1 -
>  arch/openrisc/include/asm/Kbuild           |  5 +-
>  arch/openrisc/include/asm/spinlock.h       |  3 +-
>  arch/openrisc/include/asm/spinlock_types.h |  2 +-
>  include/asm-generic/qspinlock.h            | 30 +++++++++++
>  include/asm-generic/ticket-lock-types.h    | 11 ++++
>  include/asm-generic/ticket-lock.h          | 86 ++++++++++++++++++++++++++++++
>  7 files changed, 131 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
> index 591acc5990dc..1858cf309f1f 100644
> --- a/arch/openrisc/Kconfig
> +++ b/arch/openrisc/Kconfig
> @@ -32,7 +32,6 @@ config OPENRISC
>  	select HAVE_DEBUG_STACKOVERFLOW
>  	select OR1K_PIC
>  	select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
> -	select ARCH_USE_QUEUED_SPINLOCKS
>  	select ARCH_USE_QUEUED_RWLOCKS
>  	select OMPIC if SMP
>  	select ARCH_WANT_FRAME_POINTERS
> diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
> index ca5987e11053..cb260e7d73db 100644
> --- a/arch/openrisc/include/asm/Kbuild
> +++ b/arch/openrisc/include/asm/Kbuild
> @@ -1,9 +1,8 @@
>  # SPDX-License-Identifier: GPL-2.0
>  generic-y += extable.h
>  generic-y += kvm_para.h
> -generic-y += mcs_spinlock.h
> -generic-y += qspinlock_types.h
> -generic-y += qspinlock.h
> +generic-y += ticket-lock.h
> +generic-y += ticket-lock-types.h
>  generic-y += qrwlock_types.h
>  generic-y += qrwlock.h
>  generic-y += user.h
> diff --git a/arch/openrisc/include/asm/spinlock.h b/arch/openrisc/include/asm/spinlock.h
> index a8940bdfcb7e..0b839ed1f3a0 100644
> --- a/arch/openrisc/include/asm/spinlock.h
> +++ b/arch/openrisc/include/asm/spinlock.h
> @@ -15,8 +15,7 @@
>  #ifndef __ASM_OPENRISC_SPINLOCK_H
>  #define __ASM_OPENRISC_SPINLOCK_H
>  
> -#include <asm/qspinlock.h>
> -
> +#include <asm/ticket-lock.h>
>  #include <asm/qrwlock.h>
>  
>  #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
> diff --git a/arch/openrisc/include/asm/spinlock_types.h b/arch/openrisc/include/asm/spinlock_types.h
> index 7c6fb1208c88..58ea31fa65ce 100644
> --- a/arch/openrisc/include/asm/spinlock_types.h
> +++ b/arch/openrisc/include/asm/spinlock_types.h
> @@ -1,7 +1,7 @@
>  #ifndef _ASM_OPENRISC_SPINLOCK_TYPES_H
>  #define _ASM_OPENRISC_SPINLOCK_TYPES_H
>  
> -#include <asm/qspinlock_types.h>
> +#include <asm/ticket-lock-types.h>
>  #include <asm/qrwlock_types.h>
>  
>  #endif /* _ASM_OPENRISC_SPINLOCK_TYPES_H */
> diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
> index d74b13825501..a7a1296b0b4d 100644
> --- a/include/asm-generic/qspinlock.h
> +++ b/include/asm-generic/qspinlock.h
> @@ -2,6 +2,36 @@
>  /*
>   * Queued spinlock
>   *
> + * A 'generic' spinlock implementation that is based on MCS locks. An
> + * architecture that's looking for a 'generic' spinlock, please first consider
> + * ticket-lock.h and only come looking here when you've considered all the
> + * constraints below and can show your hardware does actually perform better
> + * with qspinlock.
> + *
> + *
> + * It relies on atomic_*_release()/atomic_*_acquire() to be RCsc (or no weaker
> + * than RCtso if you're power), where regular code only expects atomic_t to be
> + * RCpc.
> + *
> + * It relies on a far greater (compared to ticket-lock.h) set of atomic
> + * operations to behave well together, please audit them carefully to ensure
> + * they all have forward progress. Many atomic operations may default to
> + * cmpxchg() loops which will not have good forward progress properties on
> + * LL/SC architectures.
> + *
> + * One notable example is atomic_fetch_or_acquire(), which x86 cannot (cheaply)
> + * do. Carefully read the patches that introduced queued_fetch_set_pending_acquire().
> + *
> + * It also heavily relies on mixed size atomic operations, in specific it
> + * requires architectures to have xchg16; something which many LL/SC
> + * architectures need to implement as a 32bit and+or in order to satisfy the
> + * forward progress guarantees mentioned above.
> + *
> + * Further reading on mixed size atomics that might be relevant:
> + *
> + *   http://www.cl.cam.ac.uk/~pes20/popl17/mixed-size.pdf
> + *
> + *
>   * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
>   * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
>   *
> diff --git a/include/asm-generic/ticket-lock-types.h b/include/asm-generic/ticket-lock-types.h
> new file mode 100644
> index 000000000000..829759aedda8
> --- /dev/null
> +++ b/include/asm-generic/ticket-lock-types.h
> @@ -0,0 +1,11 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef __ASM_GENERIC_TICKET_LOCK_TYPES_H
> +#define __ASM_GENERIC_TICKET_LOCK_TYPES_H
> +
> +#include <linux/types.h>
> +typedef atomic_t arch_spinlock_t;
> +
> +#define __ARCH_SPIN_LOCK_UNLOCKED	ATOMIC_INIT(0)
> +
> +#endif /* __ASM_GENERIC_TICKET_LOCK_TYPES_H */
> diff --git a/include/asm-generic/ticket-lock.h b/include/asm-generic/ticket-lock.h
> new file mode 100644
> index 000000000000..3f0d53e21a37
> --- /dev/null
> +++ b/include/asm-generic/ticket-lock.h
> @@ -0,0 +1,86 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * 'Generic' ticket-lock implementation.
> + *
> + * It relies on atomic_fetch_add() having well defined forward progress
> + * guarantees under contention. If your architecture cannot provide this, stick
> + * to a test-and-set lock.
> + *
> + * It also relies on atomic_fetch_add() being safe vs smp_store_release() on a
> + * sub-word of the value. This is generally true for anything LL/SC although
> + * you'd be hard pressed to find anything useful in architecture specifications
> + * about this. If your architecture cannot do this you might be better off with
> + * a test-and-set.
> + *
> + * It further assumes atomic_*_release() + atomic_*_acquire() is RCpc and hence
> + * uses atomic_fetch_add() which is SC to create an RCsc lock.
> + *
> + * The implementation uses smp_cond_load_acquire() to spin, so if the
> + * architecture has WFE like instructions to sleep instead of poll for word
> + * modifications be sure to implement that (see ARM64 for example).
> + *
> + */
> +
> +#ifndef __ASM_GENERIC_TICKET_LOCK_H
> +#define __ASM_GENERIC_TICKET_LOCK_H
> +
> +#include <linux/atomic.h>
> +#include <asm/ticket-lock-types.h>
> +
> +static __always_inline void ticket_lock(arch_spinlock_t *lock)
> +{
> +	u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> +	u16 ticket = val >> 16;
> +
> +	if (ticket == (u16)val)
> +		return;
> +
> +	atomic_cond_read_acquire(lock, ticket == (u16)VAL);
> +}
> +
> +static __always_inline bool ticket_trylock(arch_spinlock_t *lock)
> +{
> +	u32 old = atomic_read(lock);
> +
> +	if ((old >> 16) != (old & 0xffff))
> +		return false;
> +
> +	return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */
> +}
> +
> +static __always_inline void ticket_unlock(arch_spinlock_t *lock)
> +{
> +	u16 *ptr = (u16 *)lock + __is_defined(__BIG_ENDIAN);
> +	u32 val = atomic_read(lock);
> +
> +	smp_store_release(ptr, (u16)val + 1);
> +}
> +
> +static __always_inline int ticket_is_locked(arch_spinlock_t *lock)
> +{
> +	u32 val = atomic_read(lock);
> +
> +	return ((val >> 16) != (val & 0xffff));
> +}
> +
> +static __always_inline int ticket_is_contended(arch_spinlock_t *lock)
> +{
> +	u32 val = atomic_read(lock);
> +
> +	return (s16)((val >> 16) - (val & 0xffff)) > 1;
> +}
> +
> +static __always_inline int ticket_value_unlocked(arch_spinlock_t lock)
> +{
> +	return !ticket_is_locked(&lock);
> +}
> +
> +#define arch_spin_lock(l)		ticket_lock(l)
> +#define arch_spin_trylock(l)		ticket_trylock(l)
> +#define arch_spin_unlock(l)		ticket_unlock(l)
> +#define arch_spin_is_locked(l)		ticket_is_locked(l)
> +#define arch_spin_is_contended(l)	ticket_is_contended(l)
> +#define arch_spin_value_unlocked(l)	ticket_value_unlocked(l)
> +
> +#endif /* __ASM_GENERIC_TICKET_LOCK_H */

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [RFC][PATCH] locking: Generic ticket-lock
@ 2021-04-14 20:47                             ` Stafford Horne
  0 siblings, 0 replies; 92+ messages in thread
From: Stafford Horne @ 2021-04-14 20:47 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Guo Ren, Christoph Müllner, Palmer Dabbelt, Anup Patel,
	linux-riscv, Linux Kernel Mailing List, Guo Ren, Catalin Marinas,
	Will Deacon, Arnd Bergmann, jonas, stefan.kristiansson

On Wed, Apr 14, 2021 at 12:16:38PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 14, 2021 at 11:05:24AM +0200, Peter Zijlstra wrote:
> 
> > That made me look at the qspinlock code, and queued_spin_*lock() uses
> > atomic_try_cmpxchg_acquire(), which means any arch that uses qspinlock
> > and has RCpc atomics will give us massive pain.
> > 
> > Current archs using qspinlock are: x86, arm64, power, sparc64, mips and
> > openrisc (WTF?!).
> > 
> > Of those, x86 and sparc are TSO archs with SC atomics, arm64 has RCsc
> > atomics, power has RCtso atomics (and is the arch we all hate for having
> > RCtso locks).
> > 
> > Now MIPS has all sorts of ill specified barriers, but last time looked
> > at it it didn't actually use any of that and stuck to using smp_mb(), so
> > it will have RCsc atomics.
> > 
> > /me goes look at wth openrisc is..  doesn't even appear to have
> > asm/barrier.h :-/ Looking at wikipedia it also doesn't appear to
> > actually have hardware ...

Yes, there is no hardware available to consumers directly; my development is
done on FPGAs.

> FWIW this is broken, anything SMP *MUST* define mb(), at the very least.

Oh, that's right, something was missed: when we developed qspinlocks we
discussed this, and my point there was that l.swa/l.lwa implied a memory
flush (l.msync/barrier). But mb() still needs to be added.

> > I'm thinking openrisc is a prime candidate for this ticket_lock.h we're
> > all talking about.
> 
> How's this then? Compile tested only on openrisc/simple_smp_defconfig.

I did my testing with this FPGA build SoC:

 https://github.com/stffrdhrn/de0_nano-multicore

Note, the CPU timer sync logic uses mb() and is a bit flaky.  So missing mb()
might be a reason.  I thought we had defined mb() and l.msync, but it seems to
have gotten lost.

With that said I could test out this ticket-lock implementation.  How would I
tell if its better than qspinlock?

> ---
>  arch/openrisc/Kconfig                      |  1 -
>  arch/openrisc/include/asm/Kbuild           |  5 +-
>  arch/openrisc/include/asm/spinlock.h       |  3 +-
>  arch/openrisc/include/asm/spinlock_types.h |  2 +-
>  include/asm-generic/qspinlock.h            | 30 +++++++++++
>  include/asm-generic/ticket-lock-types.h    | 11 ++++
>  include/asm-generic/ticket-lock.h          | 86 ++++++++++++++++++++++++++++++
>  7 files changed, 131 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
> index 591acc5990dc..1858cf309f1f 100644
> --- a/arch/openrisc/Kconfig
> +++ b/arch/openrisc/Kconfig
> @@ -32,7 +32,6 @@ config OPENRISC
>  	select HAVE_DEBUG_STACKOVERFLOW
>  	select OR1K_PIC
>  	select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
> -	select ARCH_USE_QUEUED_SPINLOCKS
>  	select ARCH_USE_QUEUED_RWLOCKS
>  	select OMPIC if SMP
>  	select ARCH_WANT_FRAME_POINTERS
> diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
> index ca5987e11053..cb260e7d73db 100644
> --- a/arch/openrisc/include/asm/Kbuild
> +++ b/arch/openrisc/include/asm/Kbuild
> @@ -1,9 +1,8 @@
>  # SPDX-License-Identifier: GPL-2.0
>  generic-y += extable.h
>  generic-y += kvm_para.h
> -generic-y += mcs_spinlock.h
> -generic-y += qspinlock_types.h
> -generic-y += qspinlock.h
> +generic-y += ticket-lock.h
> +generic-y += ticket-lock-types.h
>  generic-y += qrwlock_types.h
>  generic-y += qrwlock.h
>  generic-y += user.h
> diff --git a/arch/openrisc/include/asm/spinlock.h b/arch/openrisc/include/asm/spinlock.h
> index a8940bdfcb7e..0b839ed1f3a0 100644
> --- a/arch/openrisc/include/asm/spinlock.h
> +++ b/arch/openrisc/include/asm/spinlock.h
> @@ -15,8 +15,7 @@
>  #ifndef __ASM_OPENRISC_SPINLOCK_H
>  #define __ASM_OPENRISC_SPINLOCK_H
>  
> -#include <asm/qspinlock.h>
> -
> +#include <asm/ticket-lock.h>
>  #include <asm/qrwlock.h>
>  
>  #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
> diff --git a/arch/openrisc/include/asm/spinlock_types.h b/arch/openrisc/include/asm/spinlock_types.h
> index 7c6fb1208c88..58ea31fa65ce 100644
> --- a/arch/openrisc/include/asm/spinlock_types.h
> +++ b/arch/openrisc/include/asm/spinlock_types.h
> @@ -1,7 +1,7 @@
>  #ifndef _ASM_OPENRISC_SPINLOCK_TYPES_H
>  #define _ASM_OPENRISC_SPINLOCK_TYPES_H
>  
> -#include <asm/qspinlock_types.h>
> +#include <asm/ticket-lock-types.h>
>  #include <asm/qrwlock_types.h>
>  
>  #endif /* _ASM_OPENRISC_SPINLOCK_TYPES_H */
> diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
> index d74b13825501..a7a1296b0b4d 100644
> --- a/include/asm-generic/qspinlock.h
> +++ b/include/asm-generic/qspinlock.h
> @@ -2,6 +2,36 @@
>  /*
>   * Queued spinlock
>   *
> + * A 'generic' spinlock implementation that is based on MCS locks. An
> + * architecture that's looking for a 'generic' spinlock, please first consider
> + * ticket-lock.h and only come looking here when you've considered all the
> + * constraints below and can show your hardware does actually perform better
> + * with qspinlock.
> + *
> + *
> + * It relies on atomic_*_release()/atomic_*_acquire() to be RCsc (or no weaker
> + * than RCtso if you're power), where regular code only expects atomic_t to be
> + * RCpc.
> + *
> + * It relies on a far greater (compared to ticket-lock.h) set of atomic
> + * operations to behave well together, please audit them carefully to ensure
> + * they all have forward progress. Many atomic operations may default to
> + * cmpxchg() loops which will not have good forward progress properties on
> + * LL/SC architectures.
> + *
> + * One notable example is atomic_fetch_or_acquire(), which x86 cannot (cheaply)
> + * do. Carefully read the patches that introduced queued_fetch_set_pending_acquire().
> + *
> + * It also heavily relies on mixed size atomic operations, in specific it
> + * requires architectures to have xchg16; something which many LL/SC
> + * architectures need to implement as a 32bit and+or in order to satisfy the
> + * forward progress guarantees mentioned above.
> + *
> + * Further reading on mixed size atomics that might be relevant:
> + *
> + *   http://www.cl.cam.ac.uk/~pes20/popl17/mixed-size.pdf
> + *
> + *
>   * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
>   * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
>   *
> diff --git a/include/asm-generic/ticket-lock-types.h b/include/asm-generic/ticket-lock-types.h
> new file mode 100644
> index 000000000000..829759aedda8
> --- /dev/null
> +++ b/include/asm-generic/ticket-lock-types.h
> @@ -0,0 +1,11 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef __ASM_GENERIC_TICKET_LOCK_TYPES_H
> +#define __ASM_GENERIC_TICKET_LOCK_TYPES_H
> +
> +#include <linux/types.h>
> +typedef atomic_t arch_spinlock_t;
> +
> +#define __ARCH_SPIN_LOCK_UNLOCKED	ATOMIC_INIT(0)
> +
> +#endif /* __ASM_GENERIC_TICKET_LOCK_TYPES_H */
> diff --git a/include/asm-generic/ticket-lock.h b/include/asm-generic/ticket-lock.h
> new file mode 100644
> index 000000000000..3f0d53e21a37
> --- /dev/null
> +++ b/include/asm-generic/ticket-lock.h
> @@ -0,0 +1,86 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * 'Generic' ticket-lock implementation.
> + *
> + * It relies on atomic_fetch_add() having well defined forward progress
> + * guarantees under contention. If your architecture cannot provide this, stick
> + * to a test-and-set lock.
> + *
> + * It also relies on atomic_fetch_add() being safe vs smp_store_release() on a
> + * sub-word of the value. This is generally true for anything LL/SC although
> + * you'd be hard pressed to find anything useful in architecture specifications
> + * about this. If your architecture cannot do this you might be better off with
> + * a test-and-set.
> + *
> + * It further assumes atomic_*_release() + atomic_*_acquire() is RCpc and hence
> + * uses atomic_fetch_add() which is SC to create an RCsc lock.
> + *
> + * The implementation uses smp_cond_load_acquire() to spin, so if the
> + * architecture has WFE like instructions to sleep instead of poll for word
> + * modifications be sure to implement that (see ARM64 for example).
> + *
> + */
> +
> +#ifndef __ASM_GENERIC_TICKET_LOCK_H
> +#define __ASM_GENERIC_TICKET_LOCK_H
> +
> +#include <linux/atomic.h>
> +#include <asm/ticket-lock-types.h>
> +
> +static __always_inline void ticket_lock(arch_spinlock_t *lock)
> +{
> +	u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> +	u16 ticket = val >> 16;
> +
> +	if (ticket == (u16)val)
> +		return;
> +
> +	atomic_cond_read_acquire(lock, ticket == (u16)VAL);
> +}
> +
> +static __always_inline bool ticket_trylock(arch_spinlock_t *lock)
> +{
> +	u32 old = atomic_read(lock);
> +
> +	if ((old >> 16) != (old & 0xffff))
> +		return false;
> +
> +	return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */
> +}
> +
> +static __always_inline void ticket_unlock(arch_spinlock_t *lock)
> +{
> +	u16 *ptr = (u16 *)lock + __is_defined(__BIG_ENDIAN);
> +	u32 val = atomic_read(lock);
> +
> +	smp_store_release(ptr, (u16)val + 1);
> +}
> +
> +static __always_inline int ticket_is_locked(arch_spinlock_t *lock)
> +{
> +	u32 val = atomic_read(lock);
> +
> +	return ((val >> 16) != (val & 0xffff));
> +}
> +
> +static __always_inline int ticket_is_contended(arch_spinlock_t *lock)
> +{
> +	u32 val = atomic_read(lock);
> +
> +	return (s16)((val >> 16) - (val & 0xffff)) > 1;
> +}
> +
> +static __always_inline int ticket_value_unlocked(arch_spinlock_t lock)
> +{
> +	return !ticket_is_locked(&lock);
> +}
> +
> +#define arch_spin_lock(l)		ticket_lock(l)
> +#define arch_spin_trylock(l)		ticket_trylock(l)
> +#define arch_spin_unlock(l)		ticket_unlock(l)
> +#define arch_spin_is_locked(l)		ticket_is_locked(l)
> +#define arch_spin_is_contended(l)	ticket_is_contended(l)
> +#define arch_spin_value_unlocked(l)	ticket_value_unlocked(l)
> +
> +#endif /* __ASM_GENERIC_TICKET_LOCK_H */

_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [RFC][PATCH] locking: Generic ticket-lock
  2021-04-14 12:45                             ` Peter Zijlstra
@ 2021-04-14 21:02                               ` Stafford Horne
  -1 siblings, 0 replies; 92+ messages in thread
From: Stafford Horne @ 2021-04-14 21:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Guo Ren, Christoph Müllner, Palmer Dabbelt, Anup Patel,
	linux-riscv, Linux Kernel Mailing List, Guo Ren, Catalin Marinas,
	Will Deacon, Arnd Bergmann, jonas, stefan.kristiansson

On Wed, Apr 14, 2021 at 02:45:43PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 14, 2021 at 12:16:38PM +0200, Peter Zijlstra wrote:
> > On Wed, Apr 14, 2021 at 11:05:24AM +0200, Peter Zijlstra wrote:
> > 
> > > That made me look at the qspinlock code, and queued_spin_*lock() uses
> > > atomic_try_cmpxchg_acquire(), which means any arch that uses qspinlock
> > > and has RCpc atomics will give us massive pain.
> > > 
> > > Current archs using qspinlock are: x86, arm64, power, sparc64, mips and
> > > openrisc (WTF?!).
> > > 
> > > Of those, x86 and sparc are TSO archs with SC atomics, arm64 has RCsc
> > > atomics, power has RCtso atomics (and is the arch we all hate for having
> > > RCtso locks).
> > > 
> > > Now MIPS has all sorts of ill specified barriers, but last time looked
> > > at it it didn't actually use any of that and stuck to using smp_mb(), so
> > > it will have RCsc atomics.
> > > 
> > > /me goes look at wth openrisc is..  doesn't even appear to have
> > > asm/barrier.h :-/ Looking at wikipedia it also doesn't appear to
> > > actually have hardware ...
> > 
> > FWIW this is broken, anything SMP *MUST* define mb(), at the very least.
> 
> As near as I can tell this should do. The arch spec only lists this one
> instruction and the text makes it sound like a completion barrier.

Yes, I will submit this patch.

The l.msync instruction completes all load/store operations.
The l.lwa/l.swa pair (used for xchg/cmpxchg) also implies l.msync.
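
For illustration, the kind of l.lwa/l.swa retry loop this refers to
(simplified from the pattern in arch/openrisc/include/asm/cmpxchg.h;
l.swa sets the flag on success and l.bnf branches while it is clear):

  static inline unsigned long xchg_u32(volatile void *ptr, unsigned long val)
  {
  	__asm__ __volatile__(
  		"1:	l.lwa	%0, 0(%1)	\n"	/* load-linked old value */
  		"	l.swa	0(%1), %2	\n"	/* store-conditional new  */
  		"	l.bnf	1b		\n"	/* retry if the SC failed */
  		"	 l.nop			\n"	/* delay slot             */
  		: "=&r"(val)
  		: "r"(ptr), "r"(val)
  		: "cc", "memory");

  	return val;
  }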

> ---
>  arch/openrisc/include/asm/barrier.h | 9 +++++++++
>  1 file changed, 9 insertions(+)
> 
> diff --git a/arch/openrisc/include/asm/barrier.h b/arch/openrisc/include/asm/barrier.h
> new file mode 100644
> index 000000000000..7538294721be
> --- /dev/null
> +++ b/arch/openrisc/include/asm/barrier.h
> @@ -0,0 +1,9 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __ASM_BARRIER_H
> +#define __ASM_BARRIER_H
> +
> +#define mb() asm volatile ("l.msync" ::: "memory")
> +
> +#include <asm-generic/barrier.h>
> +
> +#endif /* __ASM_BARRIER_H */
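
For context: once mb() is provided, include/asm-generic/barrier.h derives
the remaining barriers from it, roughly along these lines (paraphrased,
not the verbatim header):

  #ifndef rmb
  #define rmb()		mb()
  #endif
  #ifndef wmb
  #define wmb()		mb()
  #endif
  #ifndef __smp_mb
  #define __smp_mb()	mb()
  #endif
  #ifdef CONFIG_SMP
  #ifndef smp_mb
  #define smp_mb()	__smp_mb()
  #endif
  #endif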

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [RFC][PATCH] locking: Generic ticket-lock
  2021-04-14 20:47                             ` Stafford Horne
@ 2021-04-15  8:09                               ` Peter Zijlstra
  -1 siblings, 0 replies; 92+ messages in thread
From: Peter Zijlstra @ 2021-04-15  8:09 UTC (permalink / raw)
  To: Stafford Horne
  Cc: Guo Ren, Christoph Müllner, Palmer Dabbelt, Anup Patel,
	linux-riscv, Linux Kernel Mailing List, Guo Ren, Catalin Marinas,
	Will Deacon, Arnd Bergmann, jonas, stefan.kristiansson

On Thu, Apr 15, 2021 at 05:47:34AM +0900, Stafford Horne wrote:

> > How's this then? Compile tested only on openrisc/simple_smp_defconfig.
> 
> I did my testing with this FPGA build SoC:
> 
>  https://github.com/stffrdhrn/de0_nano-multicore
> 
> Note, the CPU timer sync logic uses mb() and is a bit flaky.  So missing mb()
> might be a reason.  I thought we had defined mb() and l.msync, but it seems to
> have gotten lost.
> 
> With that said I could test out this ticket-lock implementation.  How would I
> tell if it's better than qspinlock?

Mostly, if it isn't worse, it's better for being *much* simpler. As you
can see, the guts of the ticket lock are like 16 lines of C (lock+unlock),
and you only need the behaviour of atomic_fetch_add() to reason about the
behaviour of the whole thing. qspinlock OTOH is mind-bendingly painful to
reason about.

There are some spinlock tests in locktorture; but back in the day I had a
userspace copy of the lot and would measure min/avg/max acquire times
under various contention loads (making sure to only run a single task
per CPU etc. to avoid lock holder preemption and other such 'fun'
things).
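
(A minimal sketch of such a harness, illustrative only -- a real one would
pin each thread to a CPU and swap in the lock implementation under test:)

  #include <pthread.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <time.h>

  #define NTHREADS 4
  #define ITERS    100000

  static pthread_spinlock_t lock;	/* stand-in for the lock under test */

  static uint64_t now_ns(void)
  {
  	struct timespec ts;

  	clock_gettime(CLOCK_MONOTONIC, &ts);
  	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
  }

  static void *worker(void *arg)
  {
  	uint64_t min = UINT64_MAX, max = 0, sum = 0;
  	int i;

  	for (i = 0; i < ITERS; i++) {
  		uint64_t t0 = now_ns(), dt;

  		pthread_spin_lock(&lock);	/* acquire time includes the spin */
  		dt = now_ns() - t0;
  		pthread_spin_unlock(&lock);

  		if (dt < min) min = dt;
  		if (dt > max) max = dt;
  		sum += dt;
  	}
  	printf("min %lu avg %lu max %lu (ns)\n",
  	       (unsigned long)min, (unsigned long)(sum / ITERS),
  	       (unsigned long)max);
  	return arg;
  }

  int main(void)
  {
  	pthread_t t[NTHREADS];
  	int i;

  	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
  	for (i = 0; i < NTHREADS; i++)
  		pthread_create(&t[i], NULL, worker, NULL);
  	for (i = 0; i < NTHREADS; i++)
  		pthread_join(t[i], NULL);
  	return 0;
  }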

It took us a fair amount of work to get qspinlock to compete with ticket
for low contention cases (by far the most common in the kernel), and it
took a fairly large number of CPUs for qspinlock to really win over
ticket on the contended case. Your hardware may vary. In particular the
access to the external cacheline (for queueing, see the queue: label in
queued_spin_lock_slowpath) is a pain-point and the relative cost of
cacheline misses for your arch determines where (and if) low contention
behaviour is competitive.

Also, less variance (the reason for the min/max measure) is better.
Large variance is typically a sign of fwd progress trouble.

That's not saying that qspinlock isn't awesome, but I'm arguing that you
should get there by first trying all the simpler things. By gradually
increasing complexity you can also find the problem spots (for your
architecture) and you have something to fall back to in case of trouble.

Now, the obvious selling point of qspinlock is that due to the MCS style
nature of the thing it doesn't bounce the lock around, but that comes at
a cost of having to use that extra cacheline (due to the kernel liking
sizeof(spinlock_t) == sizeof(u32)). But things like ARM64's WFE (see
smp_cond_load_acquire()) can shift the balance quite a bit on that front
as well (ARM has a similar thing but less useful, see its spinlock.h
and look for wfe() and dsb_sev()).
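
(For reference, the generic fallback for smp_cond_load_acquire() is a
plain cpu_relax() poll followed by an acquire fence, roughly as below --
paraphrased from include/asm-generic/barrier.h, and this is where the VAL
used by the ticket code comes from. arm64 overrides the inner load with
an exclusive-load/WFE wait so the CPU sleeps until the line is touched:)

  #define smp_cond_load_acquire(ptr, cond_expr) ({	\
  	typeof(ptr) __PTR = (ptr);			\
  	typeof(*ptr) VAL;				\
  	for (;;) {					\
  		VAL = READ_ONCE(*__PTR);		\
  		if (cond_expr)				\
  			break;				\
  		cpu_relax();				\
  	}						\
  	smp_acquire__after_ctrl_dep();			\
  	VAL;						\
  })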

Once your arch hits NUMA, qspinlock is probably a win. However, low
contention performance is still king for most workloads. Better high
contention behaviour is nice.

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [RFC][PATCH] locking: Generic ticket-lock
  2021-04-15  8:09                               ` Peter Zijlstra
@ 2021-04-15  9:02                                 ` Catalin Marinas
  -1 siblings, 0 replies; 92+ messages in thread
From: Catalin Marinas @ 2021-04-15  9:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Stafford Horne, Guo Ren, Christoph Müllner, Palmer Dabbelt,
	Anup Patel, linux-riscv, Linux Kernel Mailing List, Guo Ren,
	Will Deacon, Arnd Bergmann, jonas, stefan.kristiansson

(fixed Will's email address)

On Thu, Apr 15, 2021 at 10:09:54AM +0200, Peter Zijlstra wrote:
> On Thu, Apr 15, 2021 at 05:47:34AM +0900, Stafford Horne wrote:
> > > How's this then? Compile tested only on openrisc/simple_smp_defconfig.
> > 
> > I did my testing with this FPGA build SoC:
> > 
> >  https://github.com/stffrdhrn/de0_nano-multicore
> > 
> > Note, the CPU timer sync logic uses mb() and is a bit flaky.  So missing mb()
> > might be a reason.  I thought we had defined mb() and l.msync, but it seems to
> > have gotten lost.
> > 
> > With that said I could test out this ticket-lock implementation.  How would I
> > tell if it's better than qspinlock?
> 
> Mostly, if it isn't worse, it's better for being *much* simpler. As you
> can see, the guts of the ticket lock are like 16 lines of C (lock+unlock),
> and you only need the behaviour of atomic_fetch_add() to reason about the
> behaviour of the whole thing. qspinlock OTOH is mind-bendingly painful to
> reason about.
> 
> There are some spinlock tests in locktorture; but back in the day I had a
> userspace copy of the lot and would measure min/avg/max acquire times
> under various contention loads (making sure to only run a single task
> per CPU etc. to avoid lock holder preemption and other such 'fun'
> things).
> 
> It took us a fair amount of work to get qspinlock to compete with ticket
> for low contention cases (by far the most common in the kernel), and it
> took a fairly large number of CPUs for qspinlock to really win over
> ticket on the contended case. Your hardware may vary. In particular the
> access to the external cacheline (for queueing, see the queue: label in
> queued_spin_lock_slowpath) is a pain-point and the relative cost of
> cacheline misses for your arch determines where (and if) low contention
> behaviour is competitive.
> 
> Also, less variance (the reason for the min/max measure) is better.
> Large variance is typically a sign of fwd progress trouble.

IIRC, one issue we had with ticket spinlocks on arm64 was on big.LITTLE
systems where the little CPUs were always last to get a ticket when
racing with the big cores. That was with load/store exclusives (LR/SC
style) and would have probably got better with atomics but we moved to
qspinlocks eventually (the Juno board didn't have atomics).

(leaving the rest of the text below for Will's convenience)

> That's not saying that qspinlock isn't awesome, but I'm arguing that you
> should get there by first trying all the simpler things. By gradually
> increasing complexity you can also find the problem spots (for your
> architecture) and you have something to fall back to in case of trouble.
> 
> Now, the obvious selling point of qspinlock is that due to the MCS style
> nature of the thing it doesn't bounce the lock around, but that comes at
> a cost of having to use that extra cacheline (due to the kernel liking
> sizeof(spinlock_t) == sizeof(u32)). But things like ARM64's WFE (see
> smp_cond_load_acquire()) can shift the balance quite a bit on that front
> as well (ARM has a similar thing but less useful, see its spinlock.h
> and look for wfe() and dsb_sev()).
> 
> Once your arch hits NUMA, qspinlock is probably a win. However, low
> contention performance is still king for most workloads. Better high
> contention behaviour is nice.

-- 
Catalin

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [RFC][PATCH] locking: Generic ticket-lock
  2021-04-15  9:02                                 ` Catalin Marinas
@ 2021-04-15  9:22                                   ` Will Deacon
  -1 siblings, 0 replies; 92+ messages in thread
From: Will Deacon @ 2021-04-15  9:22 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Peter Zijlstra, Stafford Horne, Guo Ren, Christoph Müllner,
	Palmer Dabbelt, Anup Patel, linux-riscv,
	Linux Kernel Mailing List, Guo Ren, Arnd Bergmann, jonas,
	stefan.kristiansson

On Thu, Apr 15, 2021 at 10:02:18AM +0100, Catalin Marinas wrote:
> (fixed Will's email address)
> 
> On Thu, Apr 15, 2021 at 10:09:54AM +0200, Peter Zijlstra wrote:
> > On Thu, Apr 15, 2021 at 05:47:34AM +0900, Stafford Horne wrote:
> > > > How's this then? Compile tested only on openrisc/simple_smp_defconfig.
> > > 
> > > I did my testing with this FPGA build SoC:
> > > 
> > >  https://github.com/stffrdhrn/de0_nano-multicore
> > > 
> > > Note, the CPU timer sync logic uses mb() and is a bit flaky.  So missing mb()
> > > might be a reason.  I thought we had defined mb() and l.msync, but it seems to
> > > have gotten lost.
> > > 
> > > With that said I could test out this ticket-lock implementation.  How would I
> > > tell if it's better than qspinlock?
> > 
> > Mostly, if it isn't worse, it's better for being *much* simpler. As you
> > can see, the guts of the ticket lock are like 16 lines of C (lock+unlock),
> > and you only need the behaviour of atomic_fetch_add() to reason about the
> > behaviour of the whole thing. qspinlock OTOH is mind-bendingly painful to
> > reason about.
> > 
> > There are some spinlock tests in locktorture; but back in the day I had a
> > userspace copy of the lot and would measure min/avg/max acquire times
> > under various contention loads (making sure to only run a single task
> > per CPU etc. to avoid lock holder preemption and other such 'fun'
> > things).
> > 
> > It took us a fair amount of work to get qspinlock to compete with ticket
> > for low contention cases (by far the most common in the kernel), and it
> > took a fairly large number of CPUs for qspinlock to really win over
> > ticket on the contended case. Your hardware may vary. In particular the
> > access to the external cacheline (for queueing, see the queue: label in
> > queued_spin_lock_slowpath) is a pain-point and the relative cost of
> > cacheline misses for your arch determines where (and if) low contention
> > behaviour is competitive.
> > 
> > Also, less variance (the reason for the min/max measure) is better.
> > Large variance is typically a sign of fwd progress trouble.
> 
> IIRC, one issue we had with ticket spinlocks on arm64 was on big.LITTLE
> systems where the little CPUs were always last to get a ticket when
> racing with the big cores. That was with load/store exclusives (LR/SC
> style) and would have probably got better with atomics but we moved to
> qspinlocks eventually (the Juno board didn't have atomics).
> 
> (leaving the rest of the text below for Will's convenience)

Yes, I think it was this thread:

https://lore.kernel.org/lkml/alpine.DEB.2.20.1707261548560.2186@nanos

but I don't think you can really fix such hardware by changing the locking
algorithm (although my proposed cpu_relax() hack was worryingly effective).

Will

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [RFC][PATCH] locking: Generic ticket-lock
  2021-04-15  9:02                                 ` Catalin Marinas
@ 2021-04-15  9:24                                   ` Peter Zijlstra
  -1 siblings, 0 replies; 92+ messages in thread
From: Peter Zijlstra @ 2021-04-15  9:24 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Stafford Horne, Guo Ren, Christoph Müllner, Palmer Dabbelt,
	Anup Patel, linux-riscv, Linux Kernel Mailing List, Guo Ren,
	Will Deacon, Arnd Bergmann, jonas, stefan.kristiansson

On Thu, Apr 15, 2021 at 10:02:18AM +0100, Catalin Marinas wrote:
> IIRC, one issue we had with ticket spinlocks on arm64 was on big.LITTLE
> systems where the little CPUs were always last to get a ticket when
> racing with the big cores. That was with load/store exclusives (LR/SC
> style) and would have probably got better with atomics but we moved to
> qspinlocks eventually (the Juno board didn't have atomics).

That sounds like a fundamental LL/SC fail, and I'm not sure qspinlock
will help with that at all. The big cores can still hog the lock word
and starve the little ones.

And with those things not having AMOs, there's really not much you can do. You
want the big cores to back off, but they're having success, not failure.
I suppose you can add a delay after a successful LL/SC, but that sucks.

I suppose modern big.little things will have AMOs, so maybe nobody still
cares about those systems.

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [RFC][PATCH] locking: Generic ticket-lock
  2021-04-14 10:16                           ` Peter Zijlstra
@ 2021-04-19 17:35                             ` Will Deacon
  -1 siblings, 0 replies; 92+ messages in thread
From: Will Deacon @ 2021-04-19 17:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Guo Ren, Christoph Müllner, Palmer Dabbelt, Anup Patel,
	linux-riscv, Linux Kernel Mailing List, Guo Ren, Catalin Marinas,
	Will Deacon, Arnd Bergmann, jonas, stefan.kristiansson, shorne

On Wed, Apr 14, 2021 at 12:16:38PM +0200, Peter Zijlstra wrote:
> How's this then? Compile tested only on openrisc/simple_smp_defconfig.
> 
> diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
> index d74b13825501..a7a1296b0b4d 100644
> --- a/include/asm-generic/qspinlock.h
> +++ b/include/asm-generic/qspinlock.h
> @@ -2,6 +2,36 @@
>  /*
>   * Queued spinlock
>   *
> + * A 'generic' spinlock implementation that is based on MCS locks. An
> + * architecture that's looking for a 'generic' spinlock, please first consider
> + * ticket-lock.h and only come looking here when you've considered all the
> + * constraints below and can show your hardware does actually perform better
> + * with qspinlock.
> + *
> + *
> + * It relies on atomic_*_release()/atomic_*_acquire() to be RCsc (or no weaker
> + * than RCtso if you're power), where regular code only expects atomic_t to be
> + * RCpc.

Maybe capitalise "Power" to make it clear this is about the architecture?

> + *
> + * It relies on a far greater (compared to ticket-lock.h) set of atomic
> + * operations to behave well together, please audit them carefully to ensure
> + * they all have forward progress. Many atomic operations may default to
> + * cmpxchg() loops which will not have good forward progress properties on
> + * LL/SC architectures.
> + *
> + * One notable example is atomic_fetch_or_acquire(), which x86 cannot (cheaply)
> + * do. Carefully read the patches that introduced queued_fetch_set_pending_acquire().
> + *
> + * It also heavily relies on mixed size atomic operations; specifically, it
> + * requires architectures to have xchg16; something which many LL/SC
> + * architectures need to implement as a 32bit and+or in order to satisfy the
> + * forward progress guarantees mentioned above.
> + *
> + * Further reading on mixed size atomics that might be relevant:
> + *
> + *   http://www.cl.cam.ac.uk/~pes20/popl17/mixed-size.pdf
> + *
> + *
>   * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
>   * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
>   *
> diff --git a/include/asm-generic/ticket-lock-types.h b/include/asm-generic/ticket-lock-types.h
> new file mode 100644
> index 000000000000..829759aedda8
> --- /dev/null
> +++ b/include/asm-generic/ticket-lock-types.h
> @@ -0,0 +1,11 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef __ASM_GENERIC_TICKET_LOCK_TYPES_H
> +#define __ASM_GENERIC_TICKET_LOCK_TYPES_H
> +
> +#include <linux/types.h>
> +typedef atomic_t arch_spinlock_t;
> +
> +#define __ARCH_SPIN_LOCK_UNLOCKED	ATOMIC_INIT(0)
> +
> +#endif /* __ASM_GENERIC_TICKET_LOCK_TYPES_H */
> diff --git a/include/asm-generic/ticket-lock.h b/include/asm-generic/ticket-lock.h
> new file mode 100644
> index 000000000000..3f0d53e21a37
> --- /dev/null
> +++ b/include/asm-generic/ticket-lock.h
> @@ -0,0 +1,86 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * 'Generic' ticket-lock implementation.
> + *
> + * It relies on atomic_fetch_add() having well defined forward progress
> + * guarantees under contention. If your architecture cannot provide this, stick
> + * to a test-and-set lock.
> + *
> + * It also relies on atomic_fetch_add() being safe vs smp_store_release() on a
> + * sub-word of the value. This is generally true for anything LL/SC although
> + * you'd be hard pressed to find anything useful in architecture specifications
> + * about this. If your architecture cannot do this you might be better off with
> + * a test-and-set.
> + *
> + * It further assumes atomic_*_release() + atomic_*_acquire() is RCpc and hence
> + * uses atomic_fetch_add() which is SC to create an RCsc lock.
> + *
> + * The implementation uses smp_cond_load_acquire() to spin, so if the
> + * architecture has WFE like instructions to sleep instead of poll for word
> + * modifications be sure to implement that (see ARM64 for example).
> + *
> + */
> +
> +#ifndef __ASM_GENERIC_TICKET_LOCK_H
> +#define __ASM_GENERIC_TICKET_LOCK_H
> +
> +#include <linux/atomic.h>
> +#include <asm/ticket-lock-types.h>
> +
> +static __always_inline void ticket_lock(arch_spinlock_t *lock)
> +{
> +	u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */

I hate to say it, but smp_mb__after_unlock_lock() would make the intention
a lot clearer here :( That is, the implementation as you have it gives
stronger than RCsc semantics for all architectures.

Alternatively, we could write the thing RCpc and throw an smp_mb() into
the unlock path if CONFIG_ARCH_WEAK_RELEASE_ACQUIRE.
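
(A sketch of that alternative, purely illustrative and not part of the
posted patch -- RCpc acquire on lock, full barrier in unlock only on the
architectures with weak release/acquire, e.g. Power:)

  static __always_inline void ticket_lock(arch_spinlock_t *lock)
  {
  	u32 val = atomic_fetch_add_acquire(1 << 16, lock);	/* RCpc acquire */
  	u16 ticket = val >> 16;

  	if (ticket == (u16)val)
  		return;

  	atomic_cond_read_acquire(lock, ticket == (u16)VAL);
  }

  static __always_inline void ticket_unlock(arch_spinlock_t *lock)
  {
  	u16 *ptr = (u16 *)lock + __is_defined(__BIG_ENDIAN);
  	u32 val = atomic_read(lock);

  	if (IS_ENABLED(CONFIG_ARCH_WEAK_RELEASE_ACQUIRE))
  		smp_mb();	/* upgrade the RCpc release below */
  	smp_store_release(ptr, (u16)val + 1);
  }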

> +	u16 ticket = val >> 16;
> +
> +	if (ticket == (u16)val)
> +		return;
> +
> +	atomic_cond_read_acquire(lock, ticket == (u16)VAL);
> +}
> +
> +static __always_inline bool ticket_trylock(arch_spinlock_t *lock)
> +{
> +	u32 old = atomic_read(lock);
> +
> +	if ((old >> 16) != (old & 0xffff))
> +		return false;
> +
> +	return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */
> +}
> +
> +static __always_inline void ticket_unlock(arch_spinlock_t *lock)
> +{
> +	u16 *ptr = (u16 *)lock + __is_defined(__BIG_ENDIAN);
> +	u32 val = atomic_read(lock);
> +
> +	smp_store_release(ptr, (u16)val + 1);
> +}
> +
> +static __always_inline int ticket_is_locked(arch_spinlock_t *lock)
> +{
> +	u32 val = atomic_read(lock);
> +
> +	return ((val >> 16) != (val & 0xffff));
> +}
> +
> +static __always_inline int ticket_is_contended(arch_spinlock_t *lock)
> +{
> +	u32 val = atomic_read(lock);
> +
> +	return (s16)((val >> 16) - (val & 0xffff)) > 1;

Does this go wonky if the tickets are in the process of wrapping around?
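
(A quick userspace check suggests it stays sane across the wrap: the u16
difference is taken modulo 2^16 before the signed reinterpretation, so it
is the number of outstanding tickets while fewer than 32768 CPUs hold or
wait. Illustrative snippet:)

  #include <assert.h>
  #include <stdint.h>

  static int is_contended(uint32_t val)	/* mirrors ticket_is_contended() */
  {
  	return (int16_t)((val >> 16) - (val & 0xffff)) > 1;
  }

  int main(void)
  {
  	assert(!is_contended(0xffffffffu));	/* next == owner == 0xffff: free      */
  	assert(!is_contended(0x0000ffffu));	/* next wrapped to 0: held, no queue  */
  	assert( is_contended(0x0001fffeu));	/* owner 0xfffe, next 1: 3 outstanding */
  	return 0;
  }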

Will

^ permalink raw reply	[flat|nested] 92+ messages in thread

* Re: [RFC][PATCH] locking: Generic ticket-lock
  2021-04-14 10:16                           ` Peter Zijlstra
@ 2021-04-23  6:44                             ` Palmer Dabbelt
  -1 siblings, 0 replies; 92+ messages in thread
From: Palmer Dabbelt @ 2021-04-23  6:44 UTC (permalink / raw)
  To: peterz
  Cc: guoren, christophm30, anup, linux-riscv, linux-kernel, guoren,
	catalin.marinas, will.deacon, Arnd Bergmann, jonas,
	stefan.kristiansson, shorne

On Wed, 14 Apr 2021 03:16:38 PDT (-0700), peterz@infradead.org wrote:
> On Wed, Apr 14, 2021 at 11:05:24AM +0200, Peter Zijlstra wrote:
>
>> That made me look at the qspinlock code, and queued_spin_*lock() uses
>> atomic_try_cmpxchg_acquire(), which means any arch that uses qspinlock
>> and has RCpc atomics will give us massive pain.
>>
>> Current archs using qspinlock are: x86, arm64, power, sparc64, mips and
>> openrisc (WTF?!).

We'd been talking about moving to qspinlock on RISC-V as well.  Not sure 
if that's where this thread came from or if there was another one, but 
we did talk about RISC-V qspinlock a bit on IRC (and it looks a bit like 
the prototype I posted there) so I figured it's best to write something 
down here -- at least that way I won't forget what was going on next 
time qspinlock comes around:

It seems premature to move RISC-V to qspinlock.  In the RISC-V qspinlock 
thread (and my first crack at this) I'd opened the door for supporting 
both, but at the time I wasn't really aware of how complicated the 
requirements imposed by qspinlock on the architecture are.  We're 
definitely not there yet on RISC-V, so I don't really see any reason to 
allow flipping on qspinlock yet -- essentially, by allowing users to flip 
it on we'd be giving them some indication that it may work, which would 
couple us to that flavor of lock continuing to work in the 
future.  I don't want to do that until we have something we can point to 
that says qspinlocks will function properly, moving over before that 
just opens a huge can of worms.

Patch sets to move RISC-V over to qspinlock have shown up a handful of 
times over the years.  I'd always considered this just a performance 
thing so I'd been worried about moving over without any benchmarks.  We 
still don't have any locking benchmarks, but I'm happy moving over to 
ticket locks: they're not meaningfully more expensive in the 
non-contended case, having fair locks is a huge win, and it gets code out 
of arch/riscv/ (that's probably the most important one on my end :)).  
IIUC qrwlock should be fine (when backed by a ticket lock like this), 
which will let us get rid of all our lock code.  We will pay a small 
price on the existing microarchitectures (it's a few extra cycles for 
the half-word non-atomic RMW compared to the AMO, oddly enough) but that seems 
like a small price to pay for fairness.
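
(For reference, the half-word release store in question boils down to
something like the following on RV64 little-endian -- illustrative only,
actual register allocation and codegen may differ:)

  # ticket_unlock(), a0 = &lock:
  	lw	a1, 0(a0)	# val = atomic_read(lock)
  	addi	a1, a1, 1	# owner + 1; only the low 16 bits matter
  	fence	rw, w		# smp_store_release() write barrier
  	sh	a1, 0(a0)	# half-word store to the owner field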

Regardless, I'm not going to flip RISC-V over for the upcoming merge 
window -- it's just too late in the cycle for this sort of change, and I 
do want to at least look at the generated code first.  This is a pretty 
invasive change and I want to at least get it a full round of 
linux-next.

Anyway, thanks for doing this -- it's certainly way cleaner than what I 
was coming up with.  I'm assuming you're going to eventually send out a 
non-RFC for this?

>> Of those, x86 and sparc are TSO archs with SC atomics, arm64 has RCsc
>> atomics, power has RCtso atomics (and is the arch we all hate for having
>> RCtso locks).
>>
>> Now MIPS has all sorts of ill specified barriers, but last time looked
>> at it it didn't actually use any of that and stuck to using smp_mb(), so
>> it will have RCsc atomics.
>>
>> /me goes look at wth openrisc is..  doesn't even appear to have
>> asm/barrier.h :-/ Looking at wikipedia it also doesn't appear to
>> actually have hardware ...
>
> FWIW this is broken, anything SMP *MUST* define mb(), at the very least.
>
>> I'm thinking openrisc is a prime candidate for this ticket_lock.h we're
>> all talking about.
>
> How's this then? Compile tested only on openrisc/simple_smp_defconfig.
>
> ---
>  arch/openrisc/Kconfig                      |  1 -
>  arch/openrisc/include/asm/Kbuild           |  5 +-
>  arch/openrisc/include/asm/spinlock.h       |  3 +-
>  arch/openrisc/include/asm/spinlock_types.h |  2 +-
>  include/asm-generic/qspinlock.h            | 30 +++++++++++
>  include/asm-generic/ticket-lock-types.h    | 11 ++++
>  include/asm-generic/ticket-lock.h          | 86 ++++++++++++++++++++++++++++++
>  7 files changed, 131 insertions(+), 7 deletions(-)
>
> diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
> index 591acc5990dc..1858cf309f1f 100644
> --- a/arch/openrisc/Kconfig
> +++ b/arch/openrisc/Kconfig
> @@ -32,7 +32,6 @@ config OPENRISC
>  	select HAVE_DEBUG_STACKOVERFLOW
>  	select OR1K_PIC
>  	select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
> -	select ARCH_USE_QUEUED_SPINLOCKS
>  	select ARCH_USE_QUEUED_RWLOCKS
>  	select OMPIC if SMP
>  	select ARCH_WANT_FRAME_POINTERS
> diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
> index ca5987e11053..cb260e7d73db 100644
> --- a/arch/openrisc/include/asm/Kbuild
> +++ b/arch/openrisc/include/asm/Kbuild
> @@ -1,9 +1,8 @@
>  # SPDX-License-Identifier: GPL-2.0
>  generic-y += extable.h
>  generic-y += kvm_para.h
> -generic-y += mcs_spinlock.h
> -generic-y += qspinlock_types.h
> -generic-y += qspinlock.h
> +generic-y += ticket-lock.h
> +generic-y += ticket-lock-types.h
>  generic-y += qrwlock_types.h
>  generic-y += qrwlock.h
>  generic-y += user.h
> diff --git a/arch/openrisc/include/asm/spinlock.h b/arch/openrisc/include/asm/spinlock.h
> index a8940bdfcb7e..0b839ed1f3a0 100644
> --- a/arch/openrisc/include/asm/spinlock.h
> +++ b/arch/openrisc/include/asm/spinlock.h
> @@ -15,8 +15,7 @@
>  #ifndef __ASM_OPENRISC_SPINLOCK_H
>  #define __ASM_OPENRISC_SPINLOCK_H
>
> -#include <asm/qspinlock.h>
> -
> +#include <asm/ticket-lock.h>
>  #include <asm/qrwlock.h>
>
>  #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
> diff --git a/arch/openrisc/include/asm/spinlock_types.h b/arch/openrisc/include/asm/spinlock_types.h
> index 7c6fb1208c88..58ea31fa65ce 100644
> --- a/arch/openrisc/include/asm/spinlock_types.h
> +++ b/arch/openrisc/include/asm/spinlock_types.h
> @@ -1,7 +1,7 @@
>  #ifndef _ASM_OPENRISC_SPINLOCK_TYPES_H
>  #define _ASM_OPENRISC_SPINLOCK_TYPES_H
>
> -#include <asm/qspinlock_types.h>
> +#include <asm/ticket-lock-types.h>
>  #include <asm/qrwlock_types.h>
>
>  #endif /* _ASM_OPENRISC_SPINLOCK_TYPES_H */
> diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
> index d74b13825501..a7a1296b0b4d 100644
> --- a/include/asm-generic/qspinlock.h
> +++ b/include/asm-generic/qspinlock.h
> @@ -2,6 +2,36 @@
>  /*
>   * Queued spinlock
>   *
> + * A 'generic' spinlock implementation that is based on MCS locks. An
> + * architecture that's looking for a 'generic' spinlock should first consider
> + * ticket-lock.h, and only come looking here once it has considered all the
> + * constraints below and can show its hardware actually performs better
> + * with qspinlock.
> + *
> + *
> + * It relies on atomic_*_release()/atomic_*_acquire() to be RCsc (or no weaker
> + * than RCtso if you're power), where regular code only expects atomic_t to be
> + * RCpc.

Thanks for writing this up.  We likely would have made a mess out of it 
without this.
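
To make the RC taxonomy concrete, here is a sketch in the kernel memory 
model's litmus format (runnable with herd7 against tools/memory-model, 
whose model makes unlock+lock RCtso): RCsc locks forbid the outcome 
below, while RCtso and RCpc locks allow it, because nothing orders P0's 
store against the later load across the unlock/lock pair:

	C SB+unlock-lock

	{}

	P0(int *x, int *y, spinlock_t *s)
	{
		int r0;

		spin_lock(s);
		WRITE_ONCE(*x, 1);
		spin_unlock(s);
		spin_lock(s);
		r0 = READ_ONCE(*y);
		spin_unlock(s);
	}

	P1(int *x, int *y)
	{
		int r1;

		WRITE_ONCE(*y, 1);
		smp_mb();
		r1 = READ_ONCE(*x);
	}

	exists (0:r0=0 /\ 1:r1=0)
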

> + *
> + * It relies on a far greater (compared to ticket-lock.h) set of atomic
> + * operations to behave well together; please audit them carefully to ensure
> + * they all have forward progress. Many atomic operations may default to
> + * cmpxchg() loops, which will not have good forward progress properties on
> + * LL/SC architectures.
> + *
> + * One notable example is atomic_fetch_or_acquire(), which x86 cannot (cheaply)
> + * do. Carefully read the patches that introduced queued_fetch_set_pending_acquire().
> + *
> + * It also heavily relies on mixed-size atomic operations; specifically, it
> + * requires architectures to have xchg16, something which many LL/SC
> + * architectures need to implement as a 32bit and+or (sketched after the
> + * patch) in order to satisfy the forward progress guarantees above.
> + *
> + * Further reading on mixed size atomics that might be relevant:
> + *
> + *   http://www.cl.cam.ac.uk/~pes20/popl17/mixed-size.pdf
> + *
> + *
>   * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
>   * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
>   *
> diff --git a/include/asm-generic/ticket-lock-types.h b/include/asm-generic/ticket-lock-types.h
> new file mode 100644
> index 000000000000..829759aedda8
> --- /dev/null
> +++ b/include/asm-generic/ticket-lock-types.h
> @@ -0,0 +1,11 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef __ASM_GENERIC_TICKET_LOCK_TYPES_H
> +#define __ASM_GENERIC_TICKET_LOCK_TYPES_H
> +
> +#include <linux/types.h>
> +typedef atomic_t arch_spinlock_t;
> +
> +#define __ARCH_SPIN_LOCK_UNLOCKED	ATOMIC_INIT(0)
> +
> +#endif /* __ASM_GENERIC_TICKET_LOCK_TYPES_H */
> diff --git a/include/asm-generic/ticket-lock.h b/include/asm-generic/ticket-lock.h
> new file mode 100644
> index 000000000000..3f0d53e21a37
> --- /dev/null
> +++ b/include/asm-generic/ticket-lock.h
> @@ -0,0 +1,86 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * 'Generic' ticket-lock implementation.
> + *
> + * It relies on atomic_fetch_add() having well defined forward progress
> + * guarantees under contention. If your architecture cannot provide this, stick
> + * to a test-and-set lock.
> + *
> + * It also relies on atomic_fetch_add() being safe vs smp_store_release() on a
> + * sub-word of the value. This is generally true for anything LL/SC although
> + * you'd be hard pressed to find anything useful in architecture specifications
> + * about this. If your architecture cannot do this you might be better off with
> + * a test-and-set.
> + *
> + * It further assumes atomic_*_release() + atomic_*_acquire() is RCpc and hence
> + * uses atomic_fetch_add(), which is SC, to create an RCsc lock.
> + *
> + * The implementation uses smp_cond_load_acquire() to spin, so if the
> + * architecture has WFE-like instructions to sleep instead of polling for word
> + * modifications, be sure to implement that (see ARM64 for example).

That's really neat: I'd missed "lose a load reservation" among the WFE 
wake-up triggers and assumed I had to do some explicit event.  IIUC this should 
all just work on arm64 as-is, and be essentially just as efficient as 
the bespoke ticket lock removed by c11090474d70 ("arm64: locking: 
Replace ticket lock implementation with qspinlock").
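
For reference, the hook is roughly the following shape -- a sketch 
modeled on arm64's __cmpwait()/smp_cond_load_relaxed() pairing, with 
illustrative mnemonics (nothing here is a ratified RISC-V encoding): 
the load-exclusive arms the event monitor, and any write to the 
monitored line -- such as the owner bump in ticket_unlock() -- wakes 
the WFE:

	/* Sketch: sleep until *ptr changes from val (or a spurious event). */
	static inline void __cmpwait_relaxed(volatile u32 *ptr, u32 val)
	{
		u32 tmp;

		asm volatile(
		"	ldxr	%w0, %1		\n"	/* arm the monitor */
		"	eor	%w0, %w0, %w2	\n"
		"	cbnz	%w0, 1f		\n"	/* already changed */
		"	wfe			\n"	/* sleep */
		"1:"
		: "=&r" (tmp), "+Q" (*(u32 *)ptr)
		: "r" (val));
	}

	/* The generic spin loop re-checks the condition after each wakeup,
	 * so spurious WFE returns are harmless. */
	#define smp_cond_load_relaxed(ptr, cond_expr)			\
	({								\
		typeof(ptr) __p = (ptr);				\
		typeof(*__p) VAL;					\
		for (;;) {						\
			VAL = READ_ONCE(*__p);				\
			if (cond_expr)					\
				break;					\
			__cmpwait_relaxed(__p, VAL);			\
		}							\
		VAL;							\
	})

smp_cond_load_acquire() then just adds the acquire fence on top of this.
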

> + *
> + */
> +
> +#ifndef __ASM_GENERIC_TICKET_LOCK_H
> +#define __ASM_GENERIC_TICKET_LOCK_H
> +
> +#include <linux/atomic.h>
> +#include <asm/ticket-lock-types.h>
> +
> +static __always_inline void ticket_lock(arch_spinlock_t *lock)
> +{
> +	u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */
> +	u16 ticket = val >> 16;
> +
> +	if (ticket == (u16)val)
> +		return;
> +
> +	atomic_cond_read_acquire(lock, ticket == (u16)VAL);
> +}
> +
> +static __always_inline bool ticket_trylock(arch_spinlock_t *lock)
> +{
> +	u32 old = atomic_read(lock);
> +
> +	if ((old >> 16) != (old & 0xffff))
> +		return false;
> +
> +	return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */
> +}
> +
> +static __always_inline void ticket_unlock(arch_spinlock_t *lock)
> +{
> +	u16 *ptr = (u16 *)lock + __is_defined(__BIG_ENDIAN);
> +	u32 val = atomic_read(lock);
> +
> +	smp_store_release(ptr, (u16)val + 1);
> +}
> +
> +static __always_inline int ticket_is_locked(arch_spinlock_t *lock)
> +{
> +	u32 val = atomic_read(lock);
> +
> +	return ((val >> 16) != (val & 0xffff));
> +}
> +
> +static __always_inline int ticket_is_contended(arch_spinlock_t *lock)
> +{
> +	u32 val = atomic_read(lock);
> +
> +	return (s16)((val >> 16) - (val & 0xffff)) > 1;
> +}
> +
> +static __always_inline int ticket_value_unlocked(arch_spinlock_t lock)
> +{
> +	return !ticket_is_locked(&lock);
> +}
> +
> +#define arch_spin_lock(l)		ticket_lock(l)
> +#define arch_spin_trylock(l)		ticket_trylock(l)
> +#define arch_spin_unlock(l)		ticket_unlock(l)
> +#define arch_spin_is_locked(l)		ticket_is_locked(l)
> +#define arch_spin_is_contended(l)	ticket_is_contended(l)
> +#define arch_spin_value_unlocked(l)	ticket_value_unlocked(l)
> +
> +#endif /* __ASM_GENERIC_TICKET_LOCK_H */
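
Two quick illustrations of the constraints called out in the comments 
above. First, the xchg16-via-32bit "and+or" shape that qspinlock forces 
on LL/SC architectures -- a hedged sketch (little-endian, low half, 
hypothetical helper name), whose retry loop is exactly where the 
forward progress worry comes from:

	/* Emulate a 16-bit exchange with a 32-bit cmpxchg() loop. */
	static inline u16 xchg16_emulated(u32 *p, u16 new)
	{
		u32 old = READ_ONCE(*p);
		u32 prev;

		for (;;) {
			/* and out the low half, or in the new value */
			prev = cmpxchg(p, old, (old & 0xffff0000u) | new);
			if (prev == old)
				return (u16)old;
			old = prev;	/* lost a race; retry */
		}
	}

Second, a userspace model of the ticket lock itself, using C11 atomics 
in place of the kernel primitives (seq_cst stands in for the fence 
choices above, little-endian layout is assumed, and the mixed-size 
store is formally outside C11 -- the very property ticket-lock.h warns 
about). Illustration only:

	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	typedef _Atomic uint32_t tl_t;	/* owner: low 16 bits, next: high 16 */

	static void tl_lock(tl_t *lock)
	{
		uint32_t val = atomic_fetch_add(lock, 1u << 16); /* take a ticket */
		uint16_t ticket = val >> 16;

		while (ticket != (uint16_t)atomic_load(lock))
			;	/* spin until owner reaches our ticket */
	}

	static void tl_unlock(tl_t *lock)
	{
		uint32_t val = atomic_load_explicit(lock, memory_order_relaxed);

		/* Release-store only the owner half, mirroring the kernel's
		 * smp_store_release() on a u16. */
		atomic_store_explicit((_Atomic uint16_t *)lock,
				      (uint16_t)val + 1, memory_order_release);
	}

	int main(void)
	{
		tl_t lock = 0;

		tl_lock(&lock);		/* next=1, owner=0: lock held */
		tl_unlock(&lock);	/* owner=1: handed to next waiter */
		printf("%#x\n", (unsigned)atomic_load(&lock));	/* 0x10001 */
		return 0;
	}
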

^ permalink raw reply	[flat|nested] 92+ messages in thread

end of thread, other threads:[~2021-04-23  6:45 UTC | newest]

Thread overview: 92+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-24 10:14 [PATCH] riscv: locks: introduce ticket-based spinlock implementation guoren
2021-03-24 10:14 ` guoren
2021-03-24 11:09 ` Peter Zijlstra
2021-03-24 11:09   ` Peter Zijlstra
2021-03-24 12:10   ` Guo Ren
2021-03-24 12:10     ` Guo Ren
     [not found] ` <CAM4kBBK7_s9U2vJbq68yC8WdDEfPQTaCOvn1xds3Si5B-Wpw+A@mail.gmail.com>
2021-03-24 12:23   ` Peter Zijlstra
2021-03-24 12:23     ` Peter Zijlstra
2021-03-24 12:24   ` Guo Ren
2021-03-24 12:24     ` Guo Ren
2021-03-24 12:31     ` Peter Zijlstra
2021-03-24 12:31       ` Peter Zijlstra
2021-03-24 12:28 ` Anup Patel
2021-03-24 12:28   ` Anup Patel
2021-03-24 12:37   ` Peter Zijlstra
2021-03-24 12:37     ` Peter Zijlstra
2021-03-24 12:53     ` Anup Patel
2021-03-24 12:53       ` Anup Patel
2021-04-11 21:11       ` Palmer Dabbelt
2021-04-11 21:11         ` Palmer Dabbelt
2021-04-12 13:32         ` Christoph Müllner
2021-04-12 13:32           ` Christoph Müllner
2021-04-12 14:51           ` Peter Zijlstra
2021-04-12 14:51             ` Peter Zijlstra
2021-04-12 21:21             ` Christoph Müllner
2021-04-12 21:21               ` Christoph Müllner
2021-04-12 17:33           ` Palmer Dabbelt
2021-04-12 17:33             ` Palmer Dabbelt
2021-04-12 21:54             ` Christoph Müllner
2021-04-12 21:54               ` Christoph Müllner
2021-04-13  8:03               ` Peter Zijlstra
2021-04-13  8:03                 ` Peter Zijlstra
2021-04-13  8:17                 ` Peter Zijlstra
2021-04-13  8:17                   ` Peter Zijlstra
2021-04-14  2:26                   ` Guo Ren
2021-04-14  2:26                     ` Guo Ren
2021-04-14  7:08                     ` Peter Zijlstra
2021-04-14  7:08                       ` Peter Zijlstra
2021-04-14  9:05                       ` Peter Zijlstra
2021-04-14  9:05                         ` Peter Zijlstra
2021-04-14 10:16                         ` [RFC][PATCH] locking: Generic ticket-lock Peter Zijlstra
2021-04-14 10:16                           ` Peter Zijlstra
2021-04-14 12:39                           ` Guo Ren
2021-04-14 12:39                             ` Guo Ren
2021-04-14 12:55                             ` Peter Zijlstra
2021-04-14 12:55                               ` Peter Zijlstra
2021-04-14 13:08                               ` Peter Zijlstra
2021-04-14 13:08                                 ` Peter Zijlstra
2021-04-14 15:59                               ` David Laight
2021-04-14 15:59                                 ` David Laight
2021-04-14 12:45                           ` Peter Zijlstra
2021-04-14 12:45                             ` Peter Zijlstra
2021-04-14 21:02                             ` Stafford Horne
2021-04-14 21:02                               ` Stafford Horne
2021-04-14 20:47                           ` Stafford Horne
2021-04-14 20:47                             ` Stafford Horne
2021-04-15  8:09                             ` Peter Zijlstra
2021-04-15  8:09                               ` Peter Zijlstra
2021-04-15  9:02                               ` Catalin Marinas
2021-04-15  9:02                                 ` Catalin Marinas
2021-04-15  9:22                                 ` Will Deacon
2021-04-15  9:22                                   ` Will Deacon
2021-04-15  9:24                                 ` Peter Zijlstra
2021-04-15  9:24                                   ` Peter Zijlstra
2021-04-19 17:35                           ` Will Deacon
2021-04-19 17:35                             ` Will Deacon
2021-04-23  6:44                           ` Palmer Dabbelt
2021-04-23  6:44                             ` Palmer Dabbelt
2021-04-13  9:22                 ` [PATCH] riscv: locks: introduce ticket-based spinlock implementation Christoph Müllner
2021-04-13  9:22                   ` Christoph Müllner
2021-04-13  9:30                   ` Catalin Marinas
2021-04-13  9:30                     ` Catalin Marinas
2021-04-13  9:55                     ` Christoph Müllner
2021-04-13  9:55                       ` Christoph Müllner
2021-04-14  0:23                     ` Guo Ren
2021-04-14  0:23                       ` Guo Ren
2021-04-14  9:17                       ` Catalin Marinas
2021-04-14  9:17                         ` Catalin Marinas
2021-04-13  9:35                   ` Peter Zijlstra
2021-04-13  9:35                     ` Peter Zijlstra
2021-04-13 10:25                     ` Christoph Müllner
2021-04-13 10:25                       ` Christoph Müllner
2021-04-13 10:45                       ` Catalin Marinas
2021-04-13 10:45                         ` Catalin Marinas
2021-04-13 10:54                         ` David Laight
2021-04-13 10:54                           ` David Laight
2021-04-14  5:54                           ` Guo Ren
2021-04-14  5:54                             ` Guo Ren
2021-04-13 11:04                         ` Christoph Müllner
2021-04-13 11:04                           ` Christoph Müllner
2021-04-13 13:19                       ` Guo Ren
2021-04-13 13:19                         ` Guo Ren
