linux-riscv.lists.infradead.org archive mirror
* [PATCH 1/5] riscv: Coding convention for xchg
@ 2020-11-24 13:43 guoren
  2020-11-24 13:43 ` [PATCH 2/5] riscv: Add QUEUED_SPINLOCKS & QUEUED_RWLOCKS supported guoren
                   ` (4 more replies)
  0 siblings, 5 replies; 16+ messages in thread
From: guoren @ 2020-11-24 13:43 UTC (permalink / raw)
  To: peterz, arnd, palmerdabbelt, paul.walmsley, anup
  Cc: Guo Ren, linux-kernel, linux-csky, Michael Clark, guoren, linux-riscv

From: Guo Ren <guoren@linux.alibaba.com>

This is a preparation for QUEUED_SPINLOCKS, which needs xchg to support
short (16-bit) values.
 - Remove unused code (xchg32, xchg64, cmpxchg32 ...)
 - Combine xchg_relaxed, xchg_acquire and xchg_release into one asm implementation
 - Implement the atomic .aq/.rl orderings with separate acquire & release fences
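
The net effect is that only the relaxed AMO keeps an inline asm body;
the ordered variants are composed from it plus the fences in
asm/fence.h (RISCV_ACQUIRE_BARRIER is "fence r, rw" and
RISCV_RELEASE_BARRIER is "fence rw, w" on SMP). A minimal sketch of
the pattern, not the literal hunks below (the xchg32_* names are only
for the sketch; the real code is a type-generic macro that evaluates
its arguments once):

static inline int xchg32_acquire(int *p, int x)
{
        int ret = xchg_relaxed(p, x);

        __acquire_fence();      /* order the AMO before all later accesses */
        return ret;
}

static inline int xchg32_release(int *p, int x)
{
        __release_fence();      /* order all earlier accesses before the AMO */
        return xchg_relaxed(p, x);
}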

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Michael Clark <michaeljclark@mac.com>
---
 arch/riscv/include/asm/atomic.h  |  31 ++++-----
 arch/riscv/include/asm/cmpxchg.h | 135 +++------------------------------------
 arch/riscv/include/asm/fence.h   |   6 ++
 3 files changed, 30 insertions(+), 142 deletions(-)

diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index 400a8c8..632fbe4 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -19,11 +19,8 @@
 #include <asm/cmpxchg.h>
 #include <asm/barrier.h>
 
-#define __atomic_acquire_fence()					\
-	__asm__ __volatile__(RISCV_ACQUIRE_BARRIER "" ::: "memory")
-
-#define __atomic_release_fence()					\
-	__asm__ __volatile__(RISCV_RELEASE_BARRIER "" ::: "memory");
+#define __atomic_acquire_fence() __acquire_fence()
+#define __atomic_release_fence() __release_fence()
 
 static __always_inline int atomic_read(const atomic_t *v)
 {
@@ -242,58 +239,58 @@ static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u
  * atomic_{cmp,}xchg is required to have exactly the same ordering semantics as
  * {cmp,}xchg and the operations that return, so they need a full barrier.
  */
-#define ATOMIC_OP(c_t, prefix, size)					\
+#define ATOMIC_OP(c_t, prefix)						\
 static __always_inline							\
 c_t atomic##prefix##_xchg_relaxed(atomic##prefix##_t *v, c_t n)		\
 {									\
-	return __xchg_relaxed(&(v->counter), n, size);			\
+	return xchg_relaxed(&(v->counter), n);				\
 }									\
 static __always_inline							\
 c_t atomic##prefix##_xchg_acquire(atomic##prefix##_t *v, c_t n)		\
 {									\
-	return __xchg_acquire(&(v->counter), n, size);			\
+	return xchg_acquire(&(v->counter), n);				\
 }									\
 static __always_inline							\
 c_t atomic##prefix##_xchg_release(atomic##prefix##_t *v, c_t n)		\
 {									\
-	return __xchg_release(&(v->counter), n, size);			\
+	return xchg_release(&(v->counter), n);				\
 }									\
 static __always_inline							\
 c_t atomic##prefix##_xchg(atomic##prefix##_t *v, c_t n)			\
 {									\
-	return __xchg(&(v->counter), n, size);				\
+	return xchg(&(v->counter), n);					\
 }									\
 static __always_inline							\
 c_t atomic##prefix##_cmpxchg_relaxed(atomic##prefix##_t *v,		\
 				     c_t o, c_t n)			\
 {									\
-	return __cmpxchg_relaxed(&(v->counter), o, n, size);		\
+	return cmpxchg_relaxed(&(v->counter), o, n);			\
 }									\
 static __always_inline							\
 c_t atomic##prefix##_cmpxchg_acquire(atomic##prefix##_t *v,		\
 				     c_t o, c_t n)			\
 {									\
-	return __cmpxchg_acquire(&(v->counter), o, n, size);		\
+	return cmpxchg_acquire(&(v->counter), o, n);			\
 }									\
 static __always_inline							\
 c_t atomic##prefix##_cmpxchg_release(atomic##prefix##_t *v,		\
 				     c_t o, c_t n)			\
 {									\
-	return __cmpxchg_release(&(v->counter), o, n, size);		\
+	return cmpxchg_release(&(v->counter), o, n);			\
 }									\
 static __always_inline							\
 c_t atomic##prefix##_cmpxchg(atomic##prefix##_t *v, c_t o, c_t n)	\
 {									\
-	return __cmpxchg(&(v->counter), o, n, size);			\
+	return cmpxchg(&(v->counter), o, n);				\
 }
 
 #ifdef CONFIG_GENERIC_ATOMIC64
 #define ATOMIC_OPS()							\
-	ATOMIC_OP(int,   , 4)
+	ATOMIC_OP(int,   )
 #else
 #define ATOMIC_OPS()							\
-	ATOMIC_OP(int,   , 4)						\
-	ATOMIC_OP(s64, 64, 8)
+	ATOMIC_OP(int,   )						\
+	ATOMIC_OP(s64, 64)
 #endif
 
 ATOMIC_OPS()
diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
index 262e5bb..5609185 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
@@ -44,118 +44,31 @@
 					    _x_, sizeof(*(ptr)));	\
 })
 
-#define __xchg_acquire(ptr, new, size)					\
-({									\
-	__typeof__(ptr) __ptr = (ptr);					\
-	__typeof__(new) __new = (new);					\
-	__typeof__(*(ptr)) __ret;					\
-	switch (size) {							\
-	case 4:								\
-		__asm__ __volatile__ (					\
-			"	amoswap.w %0, %2, %1\n"			\
-			RISCV_ACQUIRE_BARRIER				\
-			: "=r" (__ret), "+A" (*__ptr)			\
-			: "r" (__new)					\
-			: "memory");					\
-		break;							\
-	case 8:								\
-		__asm__ __volatile__ (					\
-			"	amoswap.d %0, %2, %1\n"			\
-			RISCV_ACQUIRE_BARRIER				\
-			: "=r" (__ret), "+A" (*__ptr)			\
-			: "r" (__new)					\
-			: "memory");					\
-		break;							\
-	default:							\
-		BUILD_BUG();						\
-	}								\
-	__ret;								\
-})
-
 #define xchg_acquire(ptr, x)						\
 ({									\
-	__typeof__(*(ptr)) _x_ = (x);					\
-	(__typeof__(*(ptr))) __xchg_acquire((ptr),			\
-					    _x_, sizeof(*(ptr)));	\
-})
-
-#define __xchg_release(ptr, new, size)					\
-({									\
-	__typeof__(ptr) __ptr = (ptr);					\
-	__typeof__(new) __new = (new);					\
 	__typeof__(*(ptr)) __ret;					\
-	switch (size) {							\
-	case 4:								\
-		__asm__ __volatile__ (					\
-			RISCV_RELEASE_BARRIER				\
-			"	amoswap.w %0, %2, %1\n"			\
-			: "=r" (__ret), "+A" (*__ptr)			\
-			: "r" (__new)					\
-			: "memory");					\
-		break;							\
-	case 8:								\
-		__asm__ __volatile__ (					\
-			RISCV_RELEASE_BARRIER				\
-			"	amoswap.d %0, %2, %1\n"			\
-			: "=r" (__ret), "+A" (*__ptr)			\
-			: "r" (__new)					\
-			: "memory");					\
-		break;							\
-	default:							\
-		BUILD_BUG();						\
-	}								\
+	__typeof__(*(ptr)) _x_ = (x);					\
+	__ret = __xchg_relaxed((ptr), _x_, sizeof(*(ptr)));		\
+	__acquire_fence();						\
 	__ret;								\
 })
 
 #define xchg_release(ptr, x)						\
 ({									\
 	__typeof__(*(ptr)) _x_ = (x);					\
-	(__typeof__(*(ptr))) __xchg_release((ptr),			\
+	__release_fence();						\
+	(__typeof__(*(ptr))) __xchg_relaxed((ptr),			\
 					    _x_, sizeof(*(ptr)));	\
 })
 
-#define __xchg(ptr, new, size)						\
-({									\
-	__typeof__(ptr) __ptr = (ptr);					\
-	__typeof__(new) __new = (new);					\
-	__typeof__(*(ptr)) __ret;					\
-	switch (size) {							\
-	case 4:								\
-		__asm__ __volatile__ (					\
-			"	amoswap.w.aqrl %0, %2, %1\n"		\
-			: "=r" (__ret), "+A" (*__ptr)			\
-			: "r" (__new)					\
-			: "memory");					\
-		break;							\
-	case 8:								\
-		__asm__ __volatile__ (					\
-			"	amoswap.d.aqrl %0, %2, %1\n"		\
-			: "=r" (__ret), "+A" (*__ptr)			\
-			: "r" (__new)					\
-			: "memory");					\
-		break;							\
-	default:							\
-		BUILD_BUG();						\
-	}								\
-	__ret;								\
-})
-
 #define xchg(ptr, x)							\
 ({									\
+	__typeof__(*(ptr)) __ret;					\
 	__typeof__(*(ptr)) _x_ = (x);					\
-	(__typeof__(*(ptr))) __xchg((ptr), _x_, sizeof(*(ptr)));	\
-})
-
-#define xchg32(ptr, x)							\
-({									\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 4);				\
-	xchg((ptr), (x));						\
-})
-
-#define xchg64(ptr, x)							\
-({									\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
-	xchg((ptr), (x));						\
+	__smp_mb();							\
+	__ret = __xchg_relaxed((ptr), _x_, sizeof(*(ptr)));		\
+	__smp_mb();							\
+	__ret;								\
 })
 
 /*
@@ -344,32 +257,4 @@
 	(__typeof__(*(ptr))) __cmpxchg((ptr),				\
 				       _o_, _n_, sizeof(*(ptr)));	\
 })
-
-#define cmpxchg_local(ptr, o, n)					\
-	(__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
-
-#define cmpxchg32(ptr, o, n)						\
-({									\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 4);				\
-	cmpxchg((ptr), (o), (n));					\
-})
-
-#define cmpxchg32_local(ptr, o, n)					\
-({									\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 4);				\
-	cmpxchg_relaxed((ptr), (o), (n))				\
-})
-
-#define cmpxchg64(ptr, o, n)						\
-({									\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
-	cmpxchg((ptr), (o), (n));					\
-})
-
-#define cmpxchg64_local(ptr, o, n)					\
-({									\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
-	cmpxchg_relaxed((ptr), (o), (n));				\
-})
-
 #endif /* _ASM_RISCV_CMPXCHG_H */
diff --git a/arch/riscv/include/asm/fence.h b/arch/riscv/include/asm/fence.h
index 2b443a3..3832601 100644
--- a/arch/riscv/include/asm/fence.h
+++ b/arch/riscv/include/asm/fence.h
@@ -9,4 +9,10 @@
 #define RISCV_RELEASE_BARRIER
 #endif
 
+#define __acquire_fence() \
+	__asm__ __volatile__(RISCV_ACQUIRE_BARRIER "" ::: "memory")
+
+#define __release_fence() \
+	__asm__ __volatile__(RISCV_RELEASE_BARRIER "" ::: "memory")
+
 #endif	/* _ASM_RISCV_FENCE_H */
-- 
2.7.4



* [PATCH 2/5] riscv: Add QUEUED_SPINLOCKS & QUEUED_RWLOCKS supported
  2020-11-24 13:43 [PATCH 1/5] riscv: Coding convention for xchg guoren
@ 2020-11-24 13:43 ` guoren
  2020-11-24 14:39   ` Peter Zijlstra
  2020-11-24 13:43 ` [PATCH 3/5] csky: Remove simple spinlock implementation guoren
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 16+ messages in thread
From: guoren @ 2020-11-24 13:43 UTC (permalink / raw)
  To: peterz, arnd, palmerdabbelt, paul.walmsley, anup
  Cc: Guo Ren, linux-kernel, linux-csky, Michael Clark, guoren, linux-riscv

From: Guo Ren <guoren@linux.alibaba.com>

riscv only supports lr.w/lr.d and sc.w/sc.d, i.e. word (double-word)
sized, naturally aligned accesses. There are no lr.h/sc.h instructions,
but qspinlock.c needs xchg on a short type variable:

xchg_tail -> xchg_relaxed(&lock->tail, ...

typedef struct qspinlock {
        union {
                atomic_t val;

                /*
                 * By using the whole 2nd least significant byte for the
                 * pending bit, we can allow better optimization of the lock
                 * acquisition for the pending bit holder.
                 */
                struct {
                        u8      locked;
                        u8      pending;
                };
                struct {
                        u16     locked_pending;
                        u16     tail; /* half word*/
                };
        };
} arch_spinlock_t;

So we emulate the short xchg with a word-sized lr.w/sc.w sequence
(sketched below); it only covers qspinlock's requirement.
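
In C terms the emulation boils down to something like the sketch
below (illustrative only: the xchg16_emulated name is made up here,
the real patch open-codes the equivalent lr.w/sc.w loop inside
__xchg_relaxed, and little-endian byte order is assumed):

static inline u16 xchg16_emulated(volatile u16 *p, u16 newval)
{
        /* the aligned 32-bit word containing the u16, and its position */
        u32 *wp = (u32 *)((unsigned long)p & ~0x3UL);
        int shift = ((unsigned long)p & 0x2) * 8;       /* 0 or 16 */
        u32 mask = 0xffffU << shift;
        u32 old, new;

        do {
                old = READ_ONCE(*wp);
                new = (old & ~mask) | ((u32)newval << shift);
                /* the real code uses lr.w/sc.w instead of cmpxchg */
        } while (cmpxchg_relaxed(wp, old, new) != old);

        return (u16)((old & mask) >> shift);
}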

Michael has sent a qspinlock patch before; see the Link below.

Link: https://lore.kernel.org/linux-riscv/20190211043829.30096-1-michaeljclark@mac.com/
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Michael Clark <michaeljclark@mac.com>
---
 arch/riscv/Kconfig                      |   2 +
 arch/riscv/include/asm/Kbuild           |   3 +
 arch/riscv/include/asm/cmpxchg.h        |  36 +++++++++
 arch/riscv/include/asm/spinlock.h       | 126 +-------------------------------
 arch/riscv/include/asm/spinlock_types.h |  15 +---
 5 files changed, 46 insertions(+), 136 deletions(-)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 2d61c4c..d757ba4 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -36,6 +36,8 @@ config RISCV
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
+	select ARCH_USE_QUEUED_RWLOCKS
+	select ARCH_USE_QUEUED_SPINLOCKS
 	select CLONE_BACKWARDS
 	select CLINT_TIMER if !MMU
 	select COMMON_CLK
diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index 59dd7be..6f5f438 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -6,3 +6,6 @@ generic-y += kvm_para.h
 generic-y += local64.h
 generic-y += user.h
 generic-y += vmlinux.lds.h
+generic-y += mcs_spinlock.h
+generic-y += qrwlock.h
+generic-y += qspinlock.h
diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
index 5609185..e178700 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
@@ -16,7 +16,43 @@
 	__typeof__(ptr) __ptr = (ptr);					\
 	__typeof__(new) __new = (new);					\
 	__typeof__(*(ptr)) __ret;					\
+	register unsigned long __rc, tmp, align, addr;			\
 	switch (size) {							\
+	case 2:								\
+		align = ((unsigned long) __ptr & 0x3);			\
+		addr = ((unsigned long) __ptr & ~0x3);			\
+		if (align) {						\
+		__asm__ __volatile__ (					\
+			"0:	lr.w %0, 0(%z4)\n"			\
+			"	move %1, %0\n"				\
+			"	slli %1, %1, 16\n"			\
+			"	srli %1, %1, 16\n"			\
+			"	move %2, %z3\n"				\
+			"	slli %2, %2, 16\n"			\
+			"	or   %1, %2, %1\n"			\
+			"	sc.w %2, %1, 0(%z4)\n"			\
+			"	bnez %2, 0b\n"				\
+			"	srli %0, %0, 16\n"			\
+			: "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)	\
+			: "rJ" (__new), "rJ"(addr)			\
+			: "memory");					\
+		} else {						\
+		__asm__ __volatile__ (					\
+			"0:	lr.w %0, (%z4)\n"			\
+			"	move %1, %0\n"				\
+			"	srli %1, %1, 16\n"			\
+			"	slli %1, %1, 16\n"			\
+			"	move %2, %z3\n"				\
+			"	or   %1, %2, %1\n"			\
+			"	sc.w %2, %1, 0(%z4)\n"			\
+			"	bnez %2, 0b\n"				\
+			"	slli %0, %0, 16\n"			\
+			"	srli %0, %0, 16\n"			\
+			: "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)	\
+			: "rJ" (__new), "rJ"(addr)			\
+			: "memory");					\
+		}							\
+		break;							\
 	case 4:								\
 		__asm__ __volatile__ (					\
 			"	amoswap.w %0, %2, %1\n"			\
diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
index f4f7fa1..b8deb3a 100644
--- a/arch/riscv/include/asm/spinlock.h
+++ b/arch/riscv/include/asm/spinlock.h
@@ -7,129 +7,7 @@
 #ifndef _ASM_RISCV_SPINLOCK_H
 #define _ASM_RISCV_SPINLOCK_H
 
-#include <linux/kernel.h>
-#include <asm/current.h>
-#include <asm/fence.h>
-
-/*
- * Simple spin lock operations.  These provide no fairness guarantees.
- */
-
-/* FIXME: Replace this with a ticket lock, like MIPS. */
-
-#define arch_spin_is_locked(x)	(READ_ONCE((x)->lock) != 0)
-
-static inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
-	smp_store_release(&lock->lock, 0);
-}
-
-static inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
-	int tmp = 1, busy;
-
-	__asm__ __volatile__ (
-		"	amoswap.w %0, %2, %1\n"
-		RISCV_ACQUIRE_BARRIER
-		: "=r" (busy), "+A" (lock->lock)
-		: "r" (tmp)
-		: "memory");
-
-	return !busy;
-}
-
-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-	while (1) {
-		if (arch_spin_is_locked(lock))
-			continue;
-
-		if (arch_spin_trylock(lock))
-			break;
-	}
-}
-
-/***********************************************************/
-
-static inline void arch_read_lock(arch_rwlock_t *lock)
-{
-	int tmp;
-
-	__asm__ __volatile__(
-		"1:	lr.w	%1, %0\n"
-		"	bltz	%1, 1b\n"
-		"	addi	%1, %1, 1\n"
-		"	sc.w	%1, %1, %0\n"
-		"	bnez	%1, 1b\n"
-		RISCV_ACQUIRE_BARRIER
-		: "+A" (lock->lock), "=&r" (tmp)
-		:: "memory");
-}
-
-static inline void arch_write_lock(arch_rwlock_t *lock)
-{
-	int tmp;
-
-	__asm__ __volatile__(
-		"1:	lr.w	%1, %0\n"
-		"	bnez	%1, 1b\n"
-		"	li	%1, -1\n"
-		"	sc.w	%1, %1, %0\n"
-		"	bnez	%1, 1b\n"
-		RISCV_ACQUIRE_BARRIER
-		: "+A" (lock->lock), "=&r" (tmp)
-		:: "memory");
-}
-
-static inline int arch_read_trylock(arch_rwlock_t *lock)
-{
-	int busy;
-
-	__asm__ __volatile__(
-		"1:	lr.w	%1, %0\n"
-		"	bltz	%1, 1f\n"
-		"	addi	%1, %1, 1\n"
-		"	sc.w	%1, %1, %0\n"
-		"	bnez	%1, 1b\n"
-		RISCV_ACQUIRE_BARRIER
-		"1:\n"
-		: "+A" (lock->lock), "=&r" (busy)
-		:: "memory");
-
-	return !busy;
-}
-
-static inline int arch_write_trylock(arch_rwlock_t *lock)
-{
-	int busy;
-
-	__asm__ __volatile__(
-		"1:	lr.w	%1, %0\n"
-		"	bnez	%1, 1f\n"
-		"	li	%1, -1\n"
-		"	sc.w	%1, %1, %0\n"
-		"	bnez	%1, 1b\n"
-		RISCV_ACQUIRE_BARRIER
-		"1:\n"
-		: "+A" (lock->lock), "=&r" (busy)
-		:: "memory");
-
-	return !busy;
-}
-
-static inline void arch_read_unlock(arch_rwlock_t *lock)
-{
-	__asm__ __volatile__(
-		RISCV_RELEASE_BARRIER
-		"	amoadd.w x0, %1, %0\n"
-		: "+A" (lock->lock)
-		: "r" (-1)
-		: "memory");
-}
-
-static inline void arch_write_unlock(arch_rwlock_t *lock)
-{
-	smp_store_release(&lock->lock, 0);
-}
+#include <asm/qrwlock.h>
+#include <asm/qspinlock.h>
 
 #endif /* _ASM_RISCV_SPINLOCK_H */
diff --git a/arch/riscv/include/asm/spinlock_types.h b/arch/riscv/include/asm/spinlock_types.h
index f398e76..d033a97 100644
--- a/arch/riscv/include/asm/spinlock_types.h
+++ b/arch/riscv/include/asm/spinlock_types.h
@@ -6,20 +6,11 @@
 #ifndef _ASM_RISCV_SPINLOCK_TYPES_H
 #define _ASM_RISCV_SPINLOCK_TYPES_H
 
-#ifndef __LINUX_SPINLOCK_TYPES_H
+#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(_ASM_RISCV_SPINLOCK_H)
 # error "please don't include this file directly"
 #endif
 
-typedef struct {
-	volatile unsigned int lock;
-} arch_spinlock_t;
-
-#define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
-
-typedef struct {
-	volatile unsigned int lock;
-} arch_rwlock_t;
-
-#define __ARCH_RW_LOCK_UNLOCKED		{ 0 }
+#include <asm-generic/qspinlock_types.h>
+#include <asm-generic/qrwlock_types.h>
 
 #endif /* _ASM_RISCV_SPINLOCK_TYPES_H */
-- 
2.7.4



* [PATCH 3/5] csky: Remove simple spinlock implementation
  2020-11-24 13:43 [PATCH 1/5] riscv: Coding convention for xchg guoren
  2020-11-24 13:43 ` [PATCH 2/5] riscv: Add QUEUED_SPINLOCKS & QUEUED_RWLOCKS supported guoren
@ 2020-11-24 13:43 ` guoren
  2020-11-24 13:43 ` [PATCH 4/5] csky: Add QUEUED_SPINLOCKS supported guoren
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 16+ messages in thread
From: guoren @ 2020-11-24 13:43 UTC (permalink / raw)
  To: peterz, arnd, palmerdabbelt, paul.walmsley, anup
  Cc: linux-riscv, Guo Ren, guoren, linux-kernel, linux-csky

From: Guo Ren <guoren@linux.alibaba.com>

There are two spinlock implementations in csky:
 - a simple test-and-set one (NR_CPUS = 1,2)
 - a ticket-based one (NR_CPUS = 3,4,5...)

We no longer need the simple one, so remove it.

Link: https://lore.kernel.org/linux-csky/20200807081253.GD2674@hirez.programming.kicks-ass.net/#t
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 arch/csky/Kconfig                      |   2 +-
 arch/csky/include/asm/spinlock.h       | 164 ---------------------------------
 arch/csky/include/asm/spinlock_types.h |  10 --
 3 files changed, 1 insertion(+), 175 deletions(-)

diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 268fad5..14ee229 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -7,7 +7,7 @@ config CSKY
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_USE_BUILTIN_BSWAP
-	select ARCH_USE_QUEUED_RWLOCKS if NR_CPUS>2
+	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_WANT_FRAME_POINTERS if !CPU_CK610
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	select COMMON_CLK
diff --git a/arch/csky/include/asm/spinlock.h b/arch/csky/include/asm/spinlock.h
index 7cf3f2b..9feb0fd 100644
--- a/arch/csky/include/asm/spinlock.h
+++ b/arch/csky/include/asm/spinlock.h
@@ -6,8 +6,6 @@
 #include <linux/spinlock_types.h>
 #include <asm/barrier.h>
 
-#ifdef CONFIG_QUEUED_RWLOCKS
-
 /*
  * Ticket-based spin-locking.
  */
@@ -91,166 +89,4 @@ static inline int arch_spin_is_contended(arch_spinlock_t *lock)
 /* See include/linux/spinlock.h */
 #define smp_mb__after_spinlock()	smp_mb()
 
-#else /* CONFIG_QUEUED_RWLOCKS */
-
-/*
- * Test-and-set spin-locking.
- */
-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-	u32 *p = &lock->lock;
-	u32 tmp;
-
-	asm volatile (
-		"1:	ldex.w		%0, (%1) \n"
-		"	bnez		%0, 1b   \n"
-		"	movi		%0, 1    \n"
-		"	stex.w		%0, (%1) \n"
-		"	bez		%0, 1b   \n"
-		: "=&r" (tmp)
-		: "r"(p)
-		: "cc");
-	smp_mb();
-}
-
-static inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
-	smp_mb();
-	WRITE_ONCE(lock->lock, 0);
-}
-
-static inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
-	u32 *p = &lock->lock;
-	u32 tmp;
-
-	asm volatile (
-		"1:	ldex.w		%0, (%1) \n"
-		"	bnez		%0, 2f   \n"
-		"	movi		%0, 1    \n"
-		"	stex.w		%0, (%1) \n"
-		"	bez		%0, 1b   \n"
-		"	movi		%0, 0    \n"
-		"2:				 \n"
-		: "=&r" (tmp)
-		: "r"(p)
-		: "cc");
-
-	if (!tmp)
-		smp_mb();
-
-	return !tmp;
-}
-
-#define arch_spin_is_locked(x)	(READ_ONCE((x)->lock) != 0)
-
-/*
- * read lock/unlock/trylock
- */
-static inline void arch_read_lock(arch_rwlock_t *lock)
-{
-	u32 *p = &lock->lock;
-	u32 tmp;
-
-	asm volatile (
-		"1:	ldex.w		%0, (%1) \n"
-		"	blz		%0, 1b   \n"
-		"	addi		%0, 1    \n"
-		"	stex.w		%0, (%1) \n"
-		"	bez		%0, 1b   \n"
-		: "=&r" (tmp)
-		: "r"(p)
-		: "cc");
-	smp_mb();
-}
-
-static inline void arch_read_unlock(arch_rwlock_t *lock)
-{
-	u32 *p = &lock->lock;
-	u32 tmp;
-
-	smp_mb();
-	asm volatile (
-		"1:	ldex.w		%0, (%1) \n"
-		"	subi		%0, 1    \n"
-		"	stex.w		%0, (%1) \n"
-		"	bez		%0, 1b   \n"
-		: "=&r" (tmp)
-		: "r"(p)
-		: "cc");
-}
-
-static inline int arch_read_trylock(arch_rwlock_t *lock)
-{
-	u32 *p = &lock->lock;
-	u32 tmp;
-
-	asm volatile (
-		"1:	ldex.w		%0, (%1) \n"
-		"	blz		%0, 2f   \n"
-		"	addi		%0, 1    \n"
-		"	stex.w		%0, (%1) \n"
-		"	bez		%0, 1b   \n"
-		"	movi		%0, 0    \n"
-		"2:				 \n"
-		: "=&r" (tmp)
-		: "r"(p)
-		: "cc");
-
-	if (!tmp)
-		smp_mb();
-
-	return !tmp;
-}
-
-/*
- * write lock/unlock/trylock
- */
-static inline void arch_write_lock(arch_rwlock_t *lock)
-{
-	u32 *p = &lock->lock;
-	u32 tmp;
-
-	asm volatile (
-		"1:	ldex.w		%0, (%1) \n"
-		"	bnez		%0, 1b   \n"
-		"	subi		%0, 1    \n"
-		"	stex.w		%0, (%1) \n"
-		"	bez		%0, 1b   \n"
-		: "=&r" (tmp)
-		: "r"(p)
-		: "cc");
-	smp_mb();
-}
-
-static inline void arch_write_unlock(arch_rwlock_t *lock)
-{
-	smp_mb();
-	WRITE_ONCE(lock->lock, 0);
-}
-
-static inline int arch_write_trylock(arch_rwlock_t *lock)
-{
-	u32 *p = &lock->lock;
-	u32 tmp;
-
-	asm volatile (
-		"1:	ldex.w		%0, (%1) \n"
-		"	bnez		%0, 2f   \n"
-		"	subi		%0, 1    \n"
-		"	stex.w		%0, (%1) \n"
-		"	bez		%0, 1b   \n"
-		"	movi		%0, 0    \n"
-		"2:				 \n"
-		: "=&r" (tmp)
-		: "r"(p)
-		: "cc");
-
-	if (!tmp)
-		smp_mb();
-
-	return !tmp;
-}
-
-#endif /* CONFIG_QUEUED_RWLOCKS */
 #endif /* __ASM_CSKY_SPINLOCK_H */
diff --git a/arch/csky/include/asm/spinlock_types.h b/arch/csky/include/asm/spinlock_types.h
index 88b8243..8ff0f6f 100644
--- a/arch/csky/include/asm/spinlock_types.h
+++ b/arch/csky/include/asm/spinlock_types.h
@@ -22,16 +22,6 @@ typedef struct {
 
 #define __ARCH_SPIN_LOCK_UNLOCKED	{ { 0 } }
 
-#ifdef CONFIG_QUEUED_RWLOCKS
 #include <asm-generic/qrwlock_types.h>
 
-#else /* CONFIG_NR_CPUS > 2 */
-
-typedef struct {
-	u32 lock;
-} arch_rwlock_t;
-
-#define __ARCH_RW_LOCK_UNLOCKED		{ 0 }
-
-#endif /* CONFIG_QUEUED_RWLOCKS */
 #endif /* __ASM_CSKY_SPINLOCK_TYPES_H */
-- 
2.7.4



* [PATCH 4/5] csky: Add QUEUED_SPINLOCKS supported
  2020-11-24 13:43 [PATCH 1/5] riscv: Coding convention for xchg guoren
  2020-11-24 13:43 ` [PATCH 2/5] riscv: Add QUEUED_SPINLOCKS & QUEUED_RWLOCKS supported guoren
  2020-11-24 13:43 ` [PATCH 3/5] csky: Remove simple spinlock implementation guoren
@ 2020-11-24 13:43 ` guoren
  2020-11-24 13:43 ` [PATCH 5/5] csky: Optimize atomic operations with correct barrier usage guoren
  2020-11-24 14:29 ` [PATCH 1/5] riscv: Coding convention for xchg Peter Zijlstra
  4 siblings, 0 replies; 16+ messages in thread
From: guoren @ 2020-11-24 13:43 UTC (permalink / raw)
  To: peterz, arnd, palmerdabbelt, paul.walmsley, anup
  Cc: Guo Ren, Paul E . McKenney, linux-kernel, linux-csky, guoren,
	linux-riscv

From: Guo Ren <guoren@linux.alibaba.com>

Abiv2 only supports ldex.w/stex.w with word (double-word) size and
aligned access. There are no short-type instructions, but qspinlock.c
needs xchg on a short type variable:

xchg_tail -> xchg_relaxed(&lock->tail, ...

typedef struct qspinlock {
        union {
                atomic_t val;

                /*
                 * By using the whole 2nd least significant byte for the
                 * pending bit, we can allow better optimization of the lock
                 * acquisition for the pending bit holder.
                 */
                struct {
                        u8      locked;
                        u8      pending;
                };
                struct {
                        u16     locked_pending;
                        u16     tail; /* half word*/
                };
        };
} arch_spinlock_t;

So we emulate the short xchg with a word-sized ldex.w/stex.w sequence;
it only covers qspinlock's requirement.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
---
 arch/csky/Kconfig                      |  1 +
 arch/csky/include/asm/Kbuild           |  2 +
 arch/csky/include/asm/cmpxchg.h        | 43 ++++++++++++++++--
 arch/csky/include/asm/spinlock.h       | 82 +---------------------------------
 arch/csky/include/asm/spinlock_types.h | 18 +-------
 5 files changed, 46 insertions(+), 100 deletions(-)

diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 14ee229..ac02b17 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -8,6 +8,7 @@ config CSKY
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_QUEUED_RWLOCKS
+	select ARCH_USE_QUEUED_SPINLOCKS
 	select ARCH_WANT_FRAME_POINTERS if !CPU_CK610
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	select COMMON_CLK
diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild
index 64876e59..f814d46 100644
--- a/arch/csky/include/asm/Kbuild
+++ b/arch/csky/include/asm/Kbuild
@@ -3,7 +3,9 @@ generic-y += asm-offsets.h
 generic-y += gpio.h
 generic-y += kvm_para.h
 generic-y += local64.h
+generic-y += mcs_spinlock.h
 generic-y += qrwlock.h
+generic-y += qspinlock.h
 generic-y += seccomp.h
 generic-y += user.h
 generic-y += vmlinux.lds.h
diff --git a/arch/csky/include/asm/cmpxchg.h b/arch/csky/include/asm/cmpxchg.h
index 8922453..ca03e90 100644
--- a/arch/csky/include/asm/cmpxchg.h
+++ b/arch/csky/include/asm/cmpxchg.h
@@ -12,9 +12,46 @@ extern void __bad_xchg(void);
 ({								\
 	__typeof__(ptr) __ptr = (ptr);				\
 	__typeof__(new) __new = (new);				\
-	__typeof__(*(ptr)) __ret;				\
-	unsigned long tmp;					\
+	__typeof__(*(ptr)) __ret = 0;				\
+	unsigned long tmp, tmp2, align, addr;			\
 	switch (size) {						\
+	case 2:							\
+		align = ((unsigned long) __ptr & 0x3);		\
+		addr = ((unsigned long) __ptr & ~0x3);		\
+		smp_mb();					\
+		if (align) {					\
+		asm volatile (					\
+		"1:	ldex.w		%0, (%4) \n"		\
+		"	mov		%1, %0   \n"		\
+		"	lsli		%1, 16   \n"		\
+		"	lsri		%1, 16   \n"		\
+		"	mov		%2, %3   \n"		\
+		"	lsli		%2, 16   \n"		\
+		"	or		%1, %2   \n"		\
+		"	stex.w		%1, (%4) \n"		\
+		"	bez		%1, 1b   \n"		\
+		"	lsri		%0, 16   \n"		\
+			: "=&r" (__ret), "=&r" (tmp),		\
+			  "=&r" (tmp2)				\
+			: "r" (__new), "r"(addr)		\
+			:);					\
+		} else {					\
+		asm volatile (					\
+		"1:	ldex.w		%0, (%3) \n"		\
+		"	mov		%1, %0   \n"		\
+		"	lsri		%1, 16   \n"		\
+		"	lsli		%1, 16   \n"		\
+		"	or		%1, %2   \n"		\
+		"	stex.w		%1, (%3) \n"		\
+		"	bez		%1, 1b   \n"		\
+		"	lsli		%0, 16   \n"		\
+		"	lsri		%0, 16   \n"		\
+			: "=&r" (__ret), "=&r" (tmp)		\
+			: "r" (__new), "r"(addr)		\
+			:);					\
+		}						\
+		smp_mb();					\
+		break;						\
 	case 4:							\
 		smp_mb();					\
 		asm volatile (					\
@@ -41,7 +78,7 @@ extern void __bad_xchg(void);
 	__typeof__(new) __new = (new);				\
 	__typeof__(new) __tmp;					\
 	__typeof__(old) __old = (old);				\
-	__typeof__(*(ptr)) __ret;				\
+	__typeof__(*(ptr)) __ret = 0;				\
 	switch (size) {						\
 	case 4:							\
 		smp_mb();					\
diff --git a/arch/csky/include/asm/spinlock.h b/arch/csky/include/asm/spinlock.h
index 9feb0fd..6d21bdb 100644
--- a/arch/csky/include/asm/spinlock.h
+++ b/arch/csky/include/asm/spinlock.h
@@ -3,88 +3,8 @@
 #ifndef __ASM_CSKY_SPINLOCK_H
 #define __ASM_CSKY_SPINLOCK_H
 
-#include <linux/spinlock_types.h>
-#include <asm/barrier.h>
-
-/*
- * Ticket-based spin-locking.
- */
-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-	arch_spinlock_t lockval;
-	u32 ticket_next = 1 << TICKET_NEXT;
-	u32 *p = &lock->lock;
-	u32 tmp;
-
-	asm volatile (
-		"1:	ldex.w		%0, (%2) \n"
-		"	mov		%1, %0	 \n"
-		"	add		%0, %3	 \n"
-		"	stex.w		%0, (%2) \n"
-		"	bez		%0, 1b   \n"
-		: "=&r" (tmp), "=&r" (lockval)
-		: "r"(p), "r"(ticket_next)
-		: "cc");
-
-	while (lockval.tickets.next != lockval.tickets.owner)
-		lockval.tickets.owner = READ_ONCE(lock->tickets.owner);
-
-	smp_mb();
-}
-
-static inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
-	u32 tmp, contended, res;
-	u32 ticket_next = 1 << TICKET_NEXT;
-	u32 *p = &lock->lock;
-
-	do {
-		asm volatile (
-		"	ldex.w		%0, (%3)   \n"
-		"	movi		%2, 1	   \n"
-		"	rotli		%1, %0, 16 \n"
-		"	cmpne		%1, %0     \n"
-		"	bt		1f         \n"
-		"	movi		%2, 0	   \n"
-		"	add		%0, %0, %4 \n"
-		"	stex.w		%0, (%3)   \n"
-		"1:				   \n"
-		: "=&r" (res), "=&r" (tmp), "=&r" (contended)
-		: "r"(p), "r"(ticket_next)
-		: "cc");
-	} while (!res);
-
-	if (!contended)
-		smp_mb();
-
-	return !contended;
-}
-
-static inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
-	smp_mb();
-	WRITE_ONCE(lock->tickets.owner, lock->tickets.owner + 1);
-}
-
-static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
-{
-	return lock.tickets.owner == lock.tickets.next;
-}
-
-static inline int arch_spin_is_locked(arch_spinlock_t *lock)
-{
-	return !arch_spin_value_unlocked(READ_ONCE(*lock));
-}
-
-static inline int arch_spin_is_contended(arch_spinlock_t *lock)
-{
-	struct __raw_tickets tickets = READ_ONCE(lock->tickets);
-
-	return (tickets.next - tickets.owner) > 1;
-}
-#define arch_spin_is_contended	arch_spin_is_contended
-
 #include <asm/qrwlock.h>
+#include <asm/qspinlock.h>
 
 /* See include/linux/spinlock.h */
 #define smp_mb__after_spinlock()	smp_mb()
diff --git a/arch/csky/include/asm/spinlock_types.h b/arch/csky/include/asm/spinlock_types.h
index 8ff0f6f..82f5fd5 100644
--- a/arch/csky/include/asm/spinlock_types.h
+++ b/arch/csky/include/asm/spinlock_types.h
@@ -3,25 +3,11 @@
 #ifndef __ASM_CSKY_SPINLOCK_TYPES_H
 #define __ASM_CSKY_SPINLOCK_TYPES_H
 
-#ifndef __LINUX_SPINLOCK_TYPES_H
+#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_CSKY_SPINLOCK_H)
 # error "please don't include this file directly"
 #endif
 
-#define TICKET_NEXT	16
-
-typedef struct {
-	union {
-		u32 lock;
-		struct __raw_tickets {
-			/* little endian */
-			u16 owner;
-			u16 next;
-		} tickets;
-	};
-} arch_spinlock_t;
-
-#define __ARCH_SPIN_LOCK_UNLOCKED	{ { 0 } }
-
+#include <asm-generic/qspinlock_types.h>
 #include <asm-generic/qrwlock_types.h>
 
 #endif /* __ASM_CSKY_SPINLOCK_TYPES_H */
-- 
2.7.4



* [PATCH 5/5] csky: Optimize atomic operations with correct barrier usage
  2020-11-24 13:43 [PATCH 1/5] riscv: Coding convention for xchg guoren
                   ` (2 preceding siblings ...)
  2020-11-24 13:43 ` [PATCH 4/5] csky: Add QUEUED_SPINLOCKS supported guoren
@ 2020-11-24 13:43 ` guoren
  2020-11-24 14:29 ` [PATCH 1/5] riscv: Coding convention for xchg Peter Zijlstra
  4 siblings, 0 replies; 16+ messages in thread
From: guoren @ 2020-11-24 13:43 UTC (permalink / raw)
  To: peterz, arnd, palmerdabbelt, paul.walmsley, anup
  Cc: Guo Ren, Paul E . McKenney, linux-kernel, linux-csky, guoren,
	linux-riscv

From: Guo Ren <guoren@linux.alibaba.com>

The past implementation of csky atomic operations was very rough.
Frankly speaking, it was wrong and limited hardware performance.
Optimize the performance of atomic, spinlock and cmpxchg operations
with finer-grained acquire/release memory barriers. Here are the
details of the modification:

 - Add acquire/release barriers to cmpxchg.h.
 - Remove the custom atomic.h implementations.
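
For example, a fully ordered xchg() ends up being the release fence,
the relaxed ldex.w/stex.w sequence, then the acquire fence. A sketch
of the net effect (the xchg32 name is only for the sketch; the real
code is a type-generic macro):

static inline u32 xchg32(u32 *p, u32 x)
{
        u32 ret;

        __smp_release_fence();          /* bar.brwaws: orders prior accesses */
        ret = xchg_relaxed(p, x);       /* plain ldex.w/stex.w loop */
        __smp_acquire_fence();          /* bar.brarws: orders later accesses */
        return ret;
}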

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
---
 arch/csky/include/asm/atomic.h  | 203 +---------------------------------------
 arch/csky/include/asm/barrier.h |  64 ++++++++++---
 arch/csky/include/asm/cmpxchg.h |  85 ++++++++++++++---
 3 files changed, 128 insertions(+), 224 deletions(-)

diff --git a/arch/csky/include/asm/atomic.h b/arch/csky/include/asm/atomic.h
index e369d73..c699c41 100644
--- a/arch/csky/include/asm/atomic.h
+++ b/arch/csky/include/asm/atomic.h
@@ -3,209 +3,10 @@
 #ifndef __ASM_CSKY_ATOMIC_H
 #define __ASM_CSKY_ATOMIC_H
 
-#include <linux/version.h>
-#include <asm/cmpxchg.h>
 #include <asm/barrier.h>
 
-#ifdef CONFIG_CPU_HAS_LDSTEX
-
-#define __atomic_add_unless __atomic_add_unless
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
-{
-	unsigned long tmp, ret;
-
-	smp_mb();
-
-	asm volatile (
-	"1:	ldex.w		%0, (%3) \n"
-	"	mov		%1, %0   \n"
-	"	cmpne		%0, %4   \n"
-	"	bf		2f	 \n"
-	"	add		%0, %2   \n"
-	"	stex.w		%0, (%3) \n"
-	"	bez		%0, 1b   \n"
-	"2:				 \n"
-		: "=&r" (tmp), "=&r" (ret)
-		: "r" (a), "r"(&v->counter), "r"(u)
-		: "memory");
-
-	if (ret != u)
-		smp_mb();
-
-	return ret;
-}
-
-#define ATOMIC_OP(op, c_op)						\
-static inline void atomic_##op(int i, atomic_t *v)			\
-{									\
-	unsigned long tmp;						\
-									\
-	asm volatile (							\
-	"1:	ldex.w		%0, (%2) \n"				\
-	"	" #op "		%0, %1   \n"				\
-	"	stex.w		%0, (%2) \n"				\
-	"	bez		%0, 1b   \n"				\
-		: "=&r" (tmp)						\
-		: "r" (i), "r"(&v->counter)				\
-		: "memory");						\
-}
-
-#define ATOMIC_OP_RETURN(op, c_op)					\
-static inline int atomic_##op##_return(int i, atomic_t *v)		\
-{									\
-	unsigned long tmp, ret;						\
-									\
-	smp_mb();							\
-	asm volatile (							\
-	"1:	ldex.w		%0, (%3) \n"				\
-	"	" #op "		%0, %2   \n"				\
-	"	mov		%1, %0   \n"				\
-	"	stex.w		%0, (%3) \n"				\
-	"	bez		%0, 1b   \n"				\
-		: "=&r" (tmp), "=&r" (ret)				\
-		: "r" (i), "r"(&v->counter)				\
-		: "memory");						\
-	smp_mb();							\
-									\
-	return ret;							\
-}
-
-#define ATOMIC_FETCH_OP(op, c_op)					\
-static inline int atomic_fetch_##op(int i, atomic_t *v)			\
-{									\
-	unsigned long tmp, ret;						\
-									\
-	smp_mb();							\
-	asm volatile (							\
-	"1:	ldex.w		%0, (%3) \n"				\
-	"	mov		%1, %0   \n"				\
-	"	" #op "		%0, %2   \n"				\
-	"	stex.w		%0, (%3) \n"				\
-	"	bez		%0, 1b   \n"				\
-		: "=&r" (tmp), "=&r" (ret)				\
-		: "r" (i), "r"(&v->counter)				\
-		: "memory");						\
-	smp_mb();							\
-									\
-	return ret;							\
-}
-
-#else /* CONFIG_CPU_HAS_LDSTEX */
-
-#include <linux/irqflags.h>
-
-#define __atomic_add_unless __atomic_add_unless
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
-{
-	unsigned long tmp, ret, flags;
-
-	raw_local_irq_save(flags);
-
-	asm volatile (
-	"	ldw		%0, (%3) \n"
-	"	mov		%1, %0   \n"
-	"	cmpne		%0, %4   \n"
-	"	bf		2f	 \n"
-	"	add		%0, %2   \n"
-	"	stw		%0, (%3) \n"
-	"2:				 \n"
-		: "=&r" (tmp), "=&r" (ret)
-		: "r" (a), "r"(&v->counter), "r"(u)
-		: "memory");
-
-	raw_local_irq_restore(flags);
-
-	return ret;
-}
-
-#define ATOMIC_OP(op, c_op)						\
-static inline void atomic_##op(int i, atomic_t *v)			\
-{									\
-	unsigned long tmp, flags;					\
-									\
-	raw_local_irq_save(flags);					\
-									\
-	asm volatile (							\
-	"	ldw		%0, (%2) \n"				\
-	"	" #op "		%0, %1   \n"				\
-	"	stw		%0, (%2) \n"				\
-		: "=&r" (tmp)						\
-		: "r" (i), "r"(&v->counter)				\
-		: "memory");						\
-									\
-	raw_local_irq_restore(flags);					\
-}
-
-#define ATOMIC_OP_RETURN(op, c_op)					\
-static inline int atomic_##op##_return(int i, atomic_t *v)		\
-{									\
-	unsigned long tmp, ret, flags;					\
-									\
-	raw_local_irq_save(flags);					\
-									\
-	asm volatile (							\
-	"	ldw		%0, (%3) \n"				\
-	"	" #op "		%0, %2   \n"				\
-	"	stw		%0, (%3) \n"				\
-	"	mov		%1, %0   \n"				\
-		: "=&r" (tmp), "=&r" (ret)				\
-		: "r" (i), "r"(&v->counter)				\
-		: "memory");						\
-									\
-	raw_local_irq_restore(flags);					\
-									\
-	return ret;							\
-}
-
-#define ATOMIC_FETCH_OP(op, c_op)					\
-static inline int atomic_fetch_##op(int i, atomic_t *v)			\
-{									\
-	unsigned long tmp, ret, flags;					\
-									\
-	raw_local_irq_save(flags);					\
-									\
-	asm volatile (							\
-	"	ldw		%0, (%3) \n"				\
-	"	mov		%1, %0   \n"				\
-	"	" #op "		%0, %2   \n"				\
-	"	stw		%0, (%3) \n"				\
-		: "=&r" (tmp), "=&r" (ret)				\
-		: "r" (i), "r"(&v->counter)				\
-		: "memory");						\
-									\
-	raw_local_irq_restore(flags);					\
-									\
-	return ret;							\
-}
-
-#endif /* CONFIG_CPU_HAS_LDSTEX */
-
-#define atomic_add_return atomic_add_return
-ATOMIC_OP_RETURN(add, +)
-#define atomic_sub_return atomic_sub_return
-ATOMIC_OP_RETURN(sub, -)
-
-#define atomic_fetch_add atomic_fetch_add
-ATOMIC_FETCH_OP(add, +)
-#define atomic_fetch_sub atomic_fetch_sub
-ATOMIC_FETCH_OP(sub, -)
-#define atomic_fetch_and atomic_fetch_and
-ATOMIC_FETCH_OP(and, &)
-#define atomic_fetch_or atomic_fetch_or
-ATOMIC_FETCH_OP(or, |)
-#define atomic_fetch_xor atomic_fetch_xor
-ATOMIC_FETCH_OP(xor, ^)
-
-#define atomic_and atomic_and
-ATOMIC_OP(and, &)
-#define atomic_or atomic_or
-ATOMIC_OP(or, |)
-#define atomic_xor atomic_xor
-ATOMIC_OP(xor, ^)
-
-#undef ATOMIC_FETCH_OP
-#undef ATOMIC_OP_RETURN
-#undef ATOMIC_OP
+#define __atomic_acquire_fence() __smp_acquire_fence()
+#define __atomic_release_fence() __smp_release_fence()
 
 #include <asm-generic/atomic.h>
 
diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h
index a430e7f..6f8269b 100644
--- a/arch/csky/include/asm/barrier.h
+++ b/arch/csky/include/asm/barrier.h
@@ -16,24 +16,64 @@
  * sync.i:      inherit from sync, but also flush cpu pipeline
  * sync.is:     the same with sync.i + sync.s
  *
- * bar.brwarw:  ordering barrier for all load/store instructions before it
- * bar.brwarws: ordering barrier for all load/store instructions before it
- *						and shareable to other cores
- * bar.brar:    ordering barrier for all load       instructions before it
- * bar.brars:   ordering barrier for all load       instructions before it
- *						and shareable to other cores
- * bar.bwaw:    ordering barrier for all store      instructions before it
- * bar.bwaws:   ordering barrier for all store      instructions before it
- *						and shareable to other cores
+ *
+ * bar.brwarws: ordering barrier for all load/store instructions
+ *              before/after it and share to other harts
+ *
+ * |31|30 26|25 21|20 16|15  10|9   5|4           0|
+ *  1  10000 s0000 00000 100001	00001 0 bw br aw ar
+ *
+ * b: before
+ * a: after
+ * r: read
+ * w: write
+ * s: share to other harts
+ *
+ * Here are all combinations:
+ *
+ * bar.brws
+ * bar.brs
+ * bar.bws
+ * bar.arws
+ * bar.ars
+ * bar.aws
+ * bar.brwarws
+ * bar.brarws
+ * bar.bwarws
+ * bar.brwars
+ * bar.brwaws
+ * bar.brars
+ * bar.bwaws
  */
 
 #ifdef CONFIG_CPU_HAS_CACHEV2
 #define mb()		asm volatile ("sync.s\n":::"memory")
 
 #ifdef CONFIG_SMP
-#define __smp_mb()	asm volatile ("bar.brwarws\n":::"memory")
-#define __smp_rmb()	asm volatile ("bar.brars\n":::"memory")
-#define __smp_wmb()	asm volatile ("bar.bwaws\n":::"memory")
+
+#define __bar_brws()	asm volatile (".long 0x842cc200\n":::"memory")
+#define __bar_brs()	asm volatile (".long 0x8424c200\n":::"memory")
+#define __bar_bws()	asm volatile (".long 0x8428c200\n":::"memory")
+#define __bar_arws()	asm volatile (".long 0x8423c200\n":::"memory")
+#define __bar_ars()	asm volatile (".long 0x8421c200\n":::"memory")
+#define __bar_aws()	asm volatile (".long 0x8422c200\n":::"memory")
+#define __bar_brwarws()	asm volatile (".long 0x842fc200\n":::"memory")
+#define __bar_brarws()	asm volatile (".long 0x8427c200\n":::"memory")
+#define __bar_bwarws()	asm volatile (".long 0x842bc200\n":::"memory")
+#define __bar_brwars()	asm volatile (".long 0x842dc200\n":::"memory")
+#define __bar_brwaws()	asm volatile (".long 0x842ec200\n":::"memory")
+#define __bar_brars()	asm volatile (".long 0x8425c200\n":::"memory")
+#define __bar_brars()	asm volatile (".long 0x8425c200\n":::"memory")
+#define __bar_bwaws()	asm volatile (".long 0x842ac200\n":::"memory")
+
+#define __smp_mb()	__bar_brwarws()
+#define __smp_rmb()	__bar_brars()
+#define __smp_wmb()	__bar_bwaws()
+
+#define ACQUIRE_FENCE		".long 0x8427c200\n"
+#define __smp_acquire_fence()	__bar_brarws()
+#define __smp_release_fence()	__bar_brwaws()
+
 #endif /* CONFIG_SMP */
 
 #define sync_is()	asm volatile ("sync.is\n":::"memory")
diff --git a/arch/csky/include/asm/cmpxchg.h b/arch/csky/include/asm/cmpxchg.h
index ca03e90..3030608 100644
--- a/arch/csky/include/asm/cmpxchg.h
+++ b/arch/csky/include/asm/cmpxchg.h
@@ -8,7 +8,7 @@
 
 extern void __bad_xchg(void);
 
-#define __xchg(new, ptr, size)					\
+#define __xchg_relaxed(new, ptr, size)				\
 ({								\
 	__typeof__(ptr) __ptr = (ptr);				\
 	__typeof__(new) __new = (new);				\
@@ -18,7 +18,6 @@ extern void __bad_xchg(void);
 	case 2:							\
 		align = ((unsigned long) __ptr & 0x3);		\
 		addr = ((unsigned long) __ptr & ~0x3);		\
-		smp_mb();					\
 		if (align) {					\
 		asm volatile (					\
 		"1:	ldex.w		%0, (%4) \n"		\
@@ -50,10 +49,8 @@ extern void __bad_xchg(void);
 			: "r" (__new), "r"(addr)		\
 			:);					\
 		}						\
-		smp_mb();					\
 		break;						\
 	case 4:							\
-		smp_mb();					\
 		asm volatile (					\
 		"1:	ldex.w		%0, (%3) \n"		\
 		"	mov		%1, %2   \n"		\
@@ -62,7 +59,6 @@ extern void __bad_xchg(void);
 			: "=&r" (__ret), "=&r" (tmp)		\
 			: "r" (__new), "r"(__ptr)		\
 			:);					\
-		smp_mb();					\
 		break;						\
 	default:						\
 		__bad_xchg();					\
@@ -70,9 +66,32 @@ extern void __bad_xchg(void);
 	__ret;							\
 })
 
-#define xchg(ptr, x)	(__xchg((x), (ptr), sizeof(*(ptr))))
+#define xchg_relaxed(ptr, x)					\
+({								\
+	__xchg_relaxed((x), (ptr), sizeof(*(ptr)));		\
+})
+
+#define xchg_acquire(ptr, x)					\
+({								\
+	__typeof__(*(ptr)) __ret;				\
+	__ret = xchg_relaxed(ptr, x);				\
+	__smp_acquire_fence();					\
+	__ret;							\
+})
+
+#define xchg_release(ptr, x)					\
+({								\
+	__smp_release_fence();					\
+	xchg_relaxed(ptr, x);					\
+})
+
+#define xchg(ptr, x)						\
+({								\
+	__smp_release_fence();					\
+	xchg_acquire(ptr, x);					\
+})
 
-#define __cmpxchg(ptr, old, new, size)				\
+#define __cmpxchg_relaxed(ptr, old, new, size)			\
 ({								\
 	__typeof__(ptr) __ptr = (ptr);				\
 	__typeof__(new) __new = (new);				\
@@ -81,7 +100,6 @@ extern void __bad_xchg(void);
 	__typeof__(*(ptr)) __ret = 0;				\
 	switch (size) {						\
 	case 4:							\
-		smp_mb();					\
 		asm volatile (					\
 		"1:	ldex.w		%0, (%3) \n"		\
 		"	cmpne		%0, %4   \n"		\
@@ -93,7 +111,6 @@ extern void __bad_xchg(void);
 			: "=&r" (__ret), "=&r" (__tmp)		\
 			: "r" (__new), "r"(__ptr), "r"(__old)	\
 			:);					\
-		smp_mb();					\
 		break;						\
 	default:						\
 		__bad_xchg();					\
@@ -101,8 +118,54 @@ extern void __bad_xchg(void);
 	__ret;							\
 })
 
-#define cmpxchg(ptr, o, n) \
-	(__cmpxchg((ptr), (o), (n), sizeof(*(ptr))))
+#define cmpxchg_relaxed(ptr, o, n)				\
+	(__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
+
+#define cmpxchg_release(ptr, o, n)				\
+({								\
+	__smp_release_fence();					\
+	cmpxchg_relaxed(ptr, o, n);				\
+})
+
+#define __cmpxchg_acquire(ptr, old, new, size)			\
+({								\
+	__typeof__(ptr) __ptr = (ptr);				\
+	__typeof__(new) __new = (new);				\
+	__typeof__(new) __tmp;					\
+	__typeof__(old) __old = (old);				\
+	__typeof__(*(ptr)) __ret = 0;				\
+	switch (size) {						\
+	case 4:							\
+		asm volatile (					\
+		"1:	ldex.w		%0, (%3) \n"		\
+		"	cmpne		%0, %4   \n"		\
+		"	bt		2f       \n"		\
+		"	mov		%1, %2   \n"		\
+		"	stex.w		%1, (%3) \n"		\
+		"	bez		%1, 1b   \n"		\
+		ACQUIRE_FENCE					\
+		"2:				 \n"		\
+			: "=&r" (__ret), "=&r" (__tmp)		\
+			: "r" (__new), "r"(__ptr), "r"(__old)	\
+			:);					\
+		break;						\
+	default:						\
+		__bad_xchg();					\
+	}							\
+	__ret;							\
+})
+
+#define cmpxchg_acquire(ptr, o, n)				\
+	(__cmpxchg_acquire((ptr), (o), (n), sizeof(*(ptr))))
+
+#define cmpxchg(ptr, o, n)					\
+({								\
+	__typeof__(*(ptr)) __ret;				\
+	__smp_release_fence();					\
+	__ret = cmpxchg_acquire(ptr, o, n);			\
+	__ret;							\
+})
+
 #else
 #include <asm-generic/cmpxchg.h>
 #endif
-- 
2.7.4



* Re: [PATCH 1/5] riscv: Coding convention for xchg
  2020-11-24 13:43 [PATCH 1/5] riscv: Coding convention for xchg guoren
                   ` (3 preceding siblings ...)
  2020-11-24 13:43 ` [PATCH 5/5] csky: Optimize atomic operations with correct barrier usage guoren
@ 2020-11-24 14:29 ` Peter Zijlstra
  2020-11-25 14:18   ` Guo Ren
  4 siblings, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2020-11-24 14:29 UTC (permalink / raw)
  To: guoren
  Cc: Guo Ren, arnd, anup, palmerdabbelt, linux-kernel, linux-csky,
	Michael Clark, paul.walmsley, linux-riscv

On Tue, Nov 24, 2020 at 01:43:53PM +0000, guoren@kernel.org wrote:
> From: Guo Ren <guoren@linux.alibaba.com>
> 
> This is a preparation for QUEUED_SPINLOCKS, which needs xchg to support
> short (16-bit) values.
>  - Remove unused code (xchg32, xchg64, cmpxchg32 ...)
>  - Combine xchg_relaxed, xchg_acquire and xchg_release into one asm implementation
>  - Implement the atomic .aq/.rl orderings with separate acquire & release fences

Every time you find yourself doing multiple things, make it multiple
patches.

> @@ -242,58 +239,58 @@ static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u
>   * atomic_{cmp,}xchg is required to have exactly the same ordering semantics as
>   * {cmp,}xchg and the operations that return, so they need a full barrier.
>   */
> +#define ATOMIC_OP(c_t, prefix)						\
>  static __always_inline							\
>  c_t atomic##prefix##_xchg_relaxed(atomic##prefix##_t *v, c_t n)		\
>  {									\
> +	return xchg_relaxed(&(v->counter), n);				\
>  }									\
>  static __always_inline							\
>  c_t atomic##prefix##_xchg_acquire(atomic##prefix##_t *v, c_t n)		\
>  {									\
> +	return xchg_acquire(&(v->counter), n);				\
>  }									\
>  static __always_inline							\
>  c_t atomic##prefix##_xchg_release(atomic##prefix##_t *v, c_t n)		\
>  {									\
> +	return xchg_release(&(v->counter), n);				\
>  }									\
>  static __always_inline							\
>  c_t atomic##prefix##_xchg(atomic##prefix##_t *v, c_t n)			\
>  {									\
> +	return xchg(&(v->counter), n);					\
>  }									\
>  static __always_inline							\
>  c_t atomic##prefix##_cmpxchg_relaxed(atomic##prefix##_t *v,		\
>  				     c_t o, c_t n)			\
>  {									\
> +	return cmpxchg_relaxed(&(v->counter), o, n);			\
>  }									\
>  static __always_inline							\
>  c_t atomic##prefix##_cmpxchg_acquire(atomic##prefix##_t *v,		\
>  				     c_t o, c_t n)			\
>  {									\
> +	return cmpxchg_acquire(&(v->counter), o, n);			\
>  }									\
>  static __always_inline							\
>  c_t atomic##prefix##_cmpxchg_release(atomic##prefix##_t *v,		\
>  				     c_t o, c_t n)			\
>  {									\
> +	return cmpxchg_release(&(v->counter), o, n);			\
>  }									\
>  static __always_inline							\
>  c_t atomic##prefix##_cmpxchg(atomic##prefix##_t *v, c_t o, c_t n)	\
>  {									\
> +	return cmpxchg(&(v->counter), o, n);				\
>  }

> diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> index 262e5bb..5609185 100644
> --- a/arch/riscv/include/asm/cmpxchg.h
> +++ b/arch/riscv/include/asm/cmpxchg.h
> @@ -44,118 +44,31 @@
>  					    _x_, sizeof(*(ptr)));	\
>  })
>  
>  #define xchg_acquire(ptr, x)						\
>  ({									\
> +	__typeof__(*(ptr)) _x_ = (x);					\
> +	__ret = __xchg_relaxed((ptr), _x_, sizeof(*(ptr)));		\
> +	__acquire_fence();						\
>  	__ret;								\
>  })
>  
>  #define xchg_release(ptr, x)						\
>  ({									\
>  	__typeof__(*(ptr)) _x_ = (x);					\
> +	__release_fence();						\
> +	(__typeof__(*(ptr))) __xchg_relaxed((ptr),			\
>  					    _x_, sizeof(*(ptr)));	\
>  })
>  
>  #define xchg(ptr, x)							\
>  ({									\
> +	__typeof__(*(ptr)) __ret;					\
>  	__typeof__(*(ptr)) _x_ = (x);					\
> +	__smp_mb();							\
> +	__ret = __xchg_relaxed((ptr), _x_, sizeof(*(ptr)));		\
> +	__smp_mb();							\
> +	__ret;								\
>  })
>  
>  /*

Why are you defining *{,_acquire,_release}() at all, doesn't
atomic-fallback.h DTRT for you?
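
The generated fallbacks already build the ordered variants from the
relaxed one plus __atomic_acquire_fence()/__atomic_release_fence(),
roughly like this (paraphrased, not a verbatim copy of
atomic-fallback.h):

#ifndef atomic_xchg_acquire
static __always_inline int
atomic_xchg_acquire(atomic_t *v, int i)
{
	int ret = atomic_xchg_relaxed(v, i);
	__atomic_acquire_fence();
	return ret;
}
#define atomic_xchg_acquire atomic_xchg_acquire
#endif

#ifndef atomic_xchg_release
static __always_inline int
atomic_xchg_release(atomic_t *v, int i)
{
	__atomic_release_fence();
	return atomic_xchg_relaxed(v, i);
}
#define atomic_xchg_release atomic_xchg_release
#endif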


* Re: [PATCH 2/5] riscv: Add QUEUED_SPINLOCKS & QUEUED_RWLOCKS supported
  2020-11-24 13:43 ` [PATCH 2/5] riscv: Add QUEUED_SPINLOCKS & QUEUED_RWLOCKS supported guoren
@ 2020-11-24 14:39   ` Peter Zijlstra
  2020-11-24 15:00     ` Arnd Bergmann
  2020-11-25  0:52     ` Guo Ren
  0 siblings, 2 replies; 16+ messages in thread
From: Peter Zijlstra @ 2020-11-24 14:39 UTC (permalink / raw)
  To: guoren
  Cc: Guo Ren, arnd, anup, palmerdabbelt, linux-kernel, linux-csky,
	Michael Clark, paul.walmsley, linux-riscv

On Tue, Nov 24, 2020 at 01:43:54PM +0000, guoren@kernel.org wrote:
> diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
> index 59dd7be..6f5f438 100644
> --- a/arch/riscv/include/asm/Kbuild
> +++ b/arch/riscv/include/asm/Kbuild
> @@ -6,3 +6,6 @@ generic-y += kvm_para.h
>  generic-y += local64.h
>  generic-y += user.h
>  generic-y += vmlinux.lds.h
> +generic-y += mcs_spinlock.h
> +generic-y += qrwlock.h
> +generic-y += qspinlock.h
> diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> index 5609185..e178700 100644
> --- a/arch/riscv/include/asm/cmpxchg.h
> +++ b/arch/riscv/include/asm/cmpxchg.h
> @@ -16,7 +16,43 @@
>  	__typeof__(ptr) __ptr = (ptr);					\
>  	__typeof__(new) __new = (new);					\
>  	__typeof__(*(ptr)) __ret;					\
> +	register unsigned long __rc, tmp, align, addr;			\
>  	switch (size) {							\
> +	case 2:								\
> +		align = ((unsigned long) __ptr & 0x3);			\
> +		addr = ((unsigned long) __ptr & ~0x3);			\
> +		if (align) {						\
> +		__asm__ __volatile__ (					\
> +			"0:	lr.w %0, 0(%z4)\n"			\
> +			"	move %1, %0\n"				\
> +			"	slli %1, %1, 16\n"			\
> +			"	srli %1, %1, 16\n"			\
> +			"	move %2, %z3\n"				\
> +			"	slli %2, %2, 16\n"			\
> +			"	or   %1, %2, %1\n"			\
> +			"	sc.w %2, %1, 0(%z4)\n"			\
> +			"	bnez %2, 0b\n"				\
> +			"	srli %0, %0, 16\n"			\
> +			: "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)	\
> +			: "rJ" (__new), "rJ"(addr)			\
> +			: "memory");					\
> +		} else {						\
> +		__asm__ __volatile__ (					\
> +			"0:	lr.w %0, (%z4)\n"			\
> +			"	move %1, %0\n"				\
> +			"	srli %1, %1, 16\n"			\
> +			"	slli %1, %1, 16\n"			\
> +			"	move %2, %z3\n"				\
> +			"	or   %1, %2, %1\n"			\
> +			"	sc.w %2, %1, 0(%z4)\n"			\
> +			"	bnez %2, 0b\n"				\
> +			"	slli %0, %0, 16\n"			\
> +			"	srli %0, %0, 16\n"			\
> +			: "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)	\
> +			: "rJ" (__new), "rJ"(addr)			\
> +			: "memory");					\
> +		}							\
> +		break;							\
>  	case 4:								\
>  		__asm__ __volatile__ (					\
>  			"	amoswap.w %0, %2, %1\n"			\

I'm pretty sure there's a handful of implementations like this out
there... if only we could share.

Anyway, this too should be an independent patch.


* Re: [PATCH 2/5] riscv: Add QUEUED_SPINLOCKS & QUEUED_RWLOCKS supported
  2020-11-24 14:39   ` Peter Zijlstra
@ 2020-11-24 15:00     ` Arnd Bergmann
  2020-11-25 14:09       ` Guo Ren
  2020-11-25 14:16       ` Peter Zijlstra
  2020-11-25  0:52     ` Guo Ren
  1 sibling, 2 replies; 16+ messages in thread
From: Arnd Bergmann @ 2020-11-24 15:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Guo Ren, Arnd Bergmann, Anup Patel, Palmer Dabbelt, linux-kernel,
	linux-csky, Michael Clark, Guo Ren, Paul Walmsley, linux-riscv

On Tue, Nov 24, 2020 at 3:39 PM Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Nov 24, 2020 at 01:43:54PM +0000, guoren@kernel.org wrote:
> > diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild

> > +             if (align) {                                            \
> > +             __asm__ __volatile__ (                                  \
> > +                     "0:     lr.w %0, 0(%z4)\n"                      \
> > +                     "       move %1, %0\n"                          \
> > +                     "       slli %1, %1, 16\n"                      \
> > +                     "       srli %1, %1, 16\n"                      \
> > +                     "       move %2, %z3\n"                         \
> > +                     "       slli %2, %2, 16\n"                      \
> > +                     "       or   %1, %2, %1\n"                      \
> > +                     "       sc.w %2, %1, 0(%z4)\n"                  \
> > +                     "       bnez %2, 0b\n"                          \
> > +                     "       srli %0, %0, 16\n"                      \
> > +                     : "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)      \
> > +                     : "rJ" (__new), "rJ"(addr)                      \
> > +                     : "memory");                                    \
>
> I'm pretty sure there's a handfull of implementations like this out
> there... if only we could share.

Isn't this effectively the same as the "_Q_PENDING_BITS != 8"
version of xchg_tail()?

If nothing else needs xchg() on a 16-bit value, maybe
changing the #ifdef in the qspinlock code is enough.
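
That variant never needs a 16-bit xchg at all; it does a 32-bit
cmpxchg loop on the whole lock word, roughly (a sketch, not the exact
kernel/locking/qspinlock.c code):

static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
	u32 old, new, val = atomic_read(&lock->val);

	for (;;) {
		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
		/* relaxed is fine here; the caller handles the ordering */
		old = atomic_cmpxchg_relaxed(&lock->val, val, new);
		if (old == val)
			break;

		val = old;
	}
	return old;
}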

Only around half the architectures actually implement 8-bit
and 16-bit cmpxchg() and xchg(); it might even be worth trying
to eventually change the interface to not do them at all, but
instead have explicit cmpxchg8() and cmpxchg16() helpers
for the few files that do use them.

     Arnd


* Re: [PATCH 2/5] riscv: Add QUEUED_SPINLOCKS & QUEUED_RWLOCKS supported
  2020-11-24 14:39   ` Peter Zijlstra
  2020-11-24 15:00     ` Arnd Bergmann
@ 2020-11-25  0:52     ` Guo Ren
  2020-11-25 14:18       ` Peter Zijlstra
  1 sibling, 1 reply; 16+ messages in thread
From: Guo Ren @ 2020-11-25  0:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Guo Ren, Arnd Bergmann, Anup Patel, Palmer Dabbelt,
	Linux Kernel Mailing List, linux-csky, Michael Clark,
	Paul Walmsley, linux-riscv

Thx Peter,

On Tue, Nov 24, 2020 at 10:39 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Nov 24, 2020 at 01:43:54PM +0000, guoren@kernel.org wrote:
> > diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
> > index 59dd7be..6f5f438 100644
> > --- a/arch/riscv/include/asm/Kbuild
> > +++ b/arch/riscv/include/asm/Kbuild
> > @@ -6,3 +6,6 @@ generic-y += kvm_para.h
> >  generic-y += local64.h
> >  generic-y += user.h
> >  generic-y += vmlinux.lds.h
> > +generic-y += mcs_spinlock.h
> > +generic-y += qrwlock.h
> > +generic-y += qspinlock.h
> > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > index 5609185..e178700 100644
> > --- a/arch/riscv/include/asm/cmpxchg.h
> > +++ b/arch/riscv/include/asm/cmpxchg.h
> > @@ -16,7 +16,43 @@
> >       __typeof__(ptr) __ptr = (ptr);                                  \
> >       __typeof__(new) __new = (new);                                  \
> >       __typeof__(*(ptr)) __ret;                                       \
> > +     register unsigned long __rc, tmp, align, addr;                  \
> >       switch (size) {                                                 \
> > +     case 2:                                                         \
> > +             align = ((unsigned long) __ptr & 0x3);                  \
> > +             addr = ((unsigned long) __ptr & ~0x3);                  \
> > +             if (align) {                                            \
> > +             __asm__ __volatile__ (                                  \
> > +                     "0:     lr.w %0, 0(%z4)\n"                      \
> > +                     "       move %1, %0\n"                          \
> > +                     "       slli %1, %1, 16\n"                      \
> > +                     "       srli %1, %1, 16\n"                      \
> > +                     "       move %2, %z3\n"                         \
> > +                     "       slli %2, %2, 16\n"                      \
> > +                     "       or   %1, %2, %1\n"                      \
> > +                     "       sc.w %2, %1, 0(%z4)\n"                  \
> > +                     "       bnez %2, 0b\n"                          \
> > +                     "       srli %0, %0, 16\n"                      \
> > +                     : "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)      \
> > +                     : "rJ" (__new), "rJ"(addr)                      \
> > +                     : "memory");                                    \
> > +             } else {                                                \
> > +             __asm__ __volatile__ (                                  \
> > +                     "0:     lr.w %0, (%z4)\n"                       \
> > +                     "       move %1, %0\n"                          \
> > +                     "       srli %1, %1, 16\n"                      \
> > +                     "       slli %1, %1, 16\n"                      \
> > +                     "       move %2, %z3\n"                         \
> > +                     "       or   %1, %2, %1\n"                      \
> > +                     "       sc.w %2, %1, 0(%z4)\n"                  \
> > +                     "       bnez %2, 0b\n"                          \
> > +                     "       slli %0, %0, 16\n"                      \
> > +                     "       srli %0, %0, 16\n"                      \
> > +                     : "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)      \
> > +                     : "rJ" (__new), "rJ"(addr)                      \
> > +                     : "memory");                                    \
> > +             }                                                       \
> > +             break;                                                  \
> >       case 4:                                                         \
> >               __asm__ __volatile__ (                                  \
> >                       "       amoswap.w %0, %2, %1\n"                 \
>
> > I'm pretty sure there's a handful of implementations like this out
> there... if only we could share.
Michael sent a qspinlock series before; see the Link below. He reused the MIPS code.

Link: https://lore.kernel.org/linux-riscv/20190211043829.30096-1-michaeljclark@mac.com/

Which short xchg implementation do you prefer (mine or his)?

>
> Anyway, this too should be an independent patch.
Ok, I'll separate it into two patches:
1. implement short xchg
2. enable qspinlock based on Michael's patch

-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/


* Re: [PATCH 2/5] riscv: Add QUEUED_SPINLOCKS & QUEUED_RWLOCKS supported
  2020-11-24 15:00     ` Arnd Bergmann
@ 2020-11-25 14:09       ` Guo Ren
  2020-11-25 14:16       ` Peter Zijlstra
  1 sibling, 0 replies; 16+ messages in thread
From: Guo Ren @ 2020-11-25 14:09 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Guo Ren, Arnd Bergmann, Peter Zijlstra, Anup Patel,
	Palmer Dabbelt, linux-kernel, linux-csky, Michael Clark,
	Paul Walmsley, linux-riscv

On Tue, Nov 24, 2020 at 11:00 PM Arnd Bergmann <arnd@kernel.org> wrote:
>
> On Tue, Nov 24, 2020 at 3:39 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > On Tue, Nov 24, 2020 at 01:43:54PM +0000, guoren@kernel.org wrote:
> > > diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
>
> > > +             if (align) {                                            \
> > > +             __asm__ __volatile__ (                                  \
> > > +                     "0:     lr.w %0, 0(%z4)\n"                      \
> > > +                     "       move %1, %0\n"                          \
> > > +                     "       slli %1, %1, 16\n"                      \
> > > +                     "       srli %1, %1, 16\n"                      \
> > > +                     "       move %2, %z3\n"                         \
> > > +                     "       slli %2, %2, 16\n"                      \
> > > +                     "       or   %1, %2, %1\n"                      \
> > > +                     "       sc.w %2, %1, 0(%z4)\n"                  \
> > > +                     "       bnez %2, 0b\n"                          \
> > > +                     "       srli %0, %0, 16\n"                      \
> > > +                     : "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)      \
> > > +                     : "rJ" (__new), "rJ"(addr)                      \
> > > +                     : "memory");                                    \
> >
> > I'm pretty sure there's a handful of implementations like this out
> > there... if only we could share.
>
> Isn't this effectively the same as the "_Q_PENDING_BITS != 8"
> version of xchg_tail()?

This comes down to the relative efficiency of cmpxchg versus xchg. For
an arch that only has lr/sc instructions, cmpxchg and xchg end up
looking much the same.

#if _Q_PENDING_BITS == 8

static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
        /*
         * We can use relaxed semantics since the caller ensures that the
         * MCS node is properly initialized before updating the tail.
         */
        return (u32)xchg_relaxed(&lock->tail,
                                 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}

#else /* _Q_PENDING_BITS == 8 */

static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
        u32 old, new, val = atomic_read(&lock->val);

        for (;;) {
                new = (val & _Q_LOCKED_PENDING_MASK) | tail;
                /*
                 * We can use relaxed semantics since the caller ensures that
                 * the MCS node is properly initialized before updating the
                 * tail.
                 */
                old = atomic_cmpxchg_relaxed(&lock->val, val, new);
                if (old == val)
                        break;

                val = old;
        }
        return old;
}
#endif /* _Q_PENDING_BITS == 8 */


>
> If nothing else needs xchg() on a 16-bit value, maybe
> changing the #ifdef in the qspinlock code is enough.
>
> Only around half the architectures actually implement 8-bit
> and 16-bit cmpxchg() and xchg(), it might even be worth trying
> to eventually change the interface to not do it at all, but
> instead have explicit cmpxchg8() and cmpxchg16() helpers
> for the few files that do use them.
>
>      Arnd



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/


* Re: [PATCH 2/5] riscv: Add QUEUED_SPINLOCKS & QUEUED_RWLOCKS supported
  2020-11-24 15:00     ` Arnd Bergmann
  2020-11-25 14:09       ` Guo Ren
@ 2020-11-25 14:16       ` Peter Zijlstra
  2020-11-25 14:31         ` Will Deacon
  1 sibling, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2020-11-25 14:16 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Guo Ren, Arnd Bergmann, Anup Patel, Palmer Dabbelt, linux-kernel,
	linux-csky, Michael Clark, Guo Ren, Paul Walmsley, linux-riscv,
	Will Deacon

On Tue, Nov 24, 2020 at 04:00:14PM +0100, Arnd Bergmann wrote:
> On Tue, Nov 24, 2020 at 3:39 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > On Tue, Nov 24, 2020 at 01:43:54PM +0000, guoren@kernel.org wrote:
> > > diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
> 
> > > +             if (align) {                                            \
> > > +             __asm__ __volatile__ (                                  \
> > > +                     "0:     lr.w %0, 0(%z4)\n"                      \
> > > +                     "       move %1, %0\n"                          \
> > > +                     "       slli %1, %1, 16\n"                      \
> > > +                     "       srli %1, %1, 16\n"                      \
> > > +                     "       move %2, %z3\n"                         \
> > > +                     "       slli %2, %2, 16\n"                      \
> > > +                     "       or   %1, %2, %1\n"                      \
> > > +                     "       sc.w %2, %1, 0(%z4)\n"                  \
> > > +                     "       bnez %2, 0b\n"                          \
> > > +                     "       srli %0, %0, 16\n"                      \
> > > +                     : "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)      \
> > > +                     : "rJ" (__new), "rJ"(addr)                      \
> > > +                     : "memory");                                    \
> >
> > I'm pretty sure there's a handful of implementations like this out
> > there... if only we could share.
> 
> Isn't this effectively the same as the "_Q_PENDING_BITS != 8"
> version of xchg_tail()?

Pretty much I suppose.

> If nothing else needs xchg() on a 16-bit value, maybe
> changing the #ifdef in the qspinlock code is enough.

Like the below? I think the native LL/SC variant is slightly better than
the cmpxchg loop. The problem is that cmpxchg() is itself an ll/sc loop,
and then we loop around that, instead of having only the one ll/sc loop.

> Only around half the architectures actually implement 8-bit
> and 16-bit cmpxchg() and xchg(), it might even be worth trying
> to eventually change the interface to not do it at all, but
> instead have explicit cmpxchg8() and cmpxchg16() helpers
> for the few files that do use them.

Yeah, many RISCs don't have sub-word atomics. Not sure how many other
users there are. qspinlock certainly is the most popular I suppose.

---


diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index cbff6ba53d56..7049fb2af0b2 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -163,26 +163,6 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
 	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
 }
 
-/*
- * xchg_tail - Put in the new queue tail code word & retrieve previous one
- * @lock : Pointer to queued spinlock structure
- * @tail : The new queue tail code word
- * Return: The previous queue tail code word
- *
- * xchg(lock, tail), which heads an address dependency
- *
- * p,*,* -> n,*,* ; prev = xchg(lock, node)
- */
-static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
-{
-	/*
-	 * We can use relaxed semantics since the caller ensures that the
-	 * MCS node is properly initialized before updating the tail.
-	 */
-	return (u32)xchg_relaxed(&lock->tail,
-				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
-}
-
 #else /* _Q_PENDING_BITS == 8 */
 
 /**
@@ -207,6 +187,32 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
 	atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
 }
 
+#endif /* _Q_PENDING_BITS == 8 */
+
+#if _Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16
+
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail), which heads an address dependency
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+	/*
+	 * We can use relaxed semantics since the caller ensures that the
+	 * MCS node is properly initialized before updating the tail.
+	 */
+	return (u32)xchg_relaxed(&lock->tail,
+				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* !(_Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16) */
+
 /**
  * xchg_tail - Put in the new queue tail code word & retrieve previous one
  * @lock : Pointer to queued spinlock structure
@@ -236,7 +242,8 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 	}
 	return old;
 }
-#endif /* _Q_PENDING_BITS == 8 */
+
+#endif /* _Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16 */
 
 /**
  * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
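
For illustration, one way the ARCH_HAS_XCHG16 gate above could be consumed; the opt-in location is an assumption, nothing in this thread settles where an architecture would define it:

/*
 * Hypothetical arch opt-in, e.g. somewhere in the arch headers pulled in
 * before kernel/locking/qspinlock.c evaluates the #if above.  An
 * architecture without a native 16-bit xchg() simply leaves the macro
 * undefined (the preprocessor treats it as 0) and falls back to the
 * cmpxchg-based xchg_tail().
 */
#define ARCH_HAS_XCHG16	1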


* Re: [PATCH 2/5] riscv: Add QUEUED_SPINLOCKS & QUEUED_RWLOCKS supported
  2020-11-25  0:52     ` Guo Ren
@ 2020-11-25 14:18       ` Peter Zijlstra
  0 siblings, 0 replies; 16+ messages in thread
From: Peter Zijlstra @ 2020-11-25 14:18 UTC (permalink / raw)
  To: Guo Ren
  Cc: Guo Ren, Arnd Bergmann, Anup Patel, Palmer Dabbelt,
	Linux Kernel Mailing List, linux-csky, Michael Clark,
	Paul Walmsley, linux-riscv, Will Deacon

On Wed, Nov 25, 2020 at 08:52:23AM +0800, Guo Ren wrote:

> > I'm pretty sure there's a handful of implementations like this out
> > there... if only we could share.
> Michael has sent qspinlock before, ref to Link below. He reused mips' code.
> 
> Link: https://lore.kernel.org/linux-riscv/20190211043829.30096-1-michaeljclark@mac.com/
> 
> Which short xchg implementation do you prefer (Mine or his)?

Well, it would be very nice to have mips/riscv/csky all use the same
header to implement these I suppose.

But then we're back to a cmpxchg-loop, in which case Arnd's suggestion
isn't worse.


* Re: [PATCH 1/5] riscv: Coding convention for xchg
  2020-11-24 14:29 ` [PATCH 1/5] riscv: Coding convention for xchg Peter Zijlstra
@ 2020-11-25 14:18   ` Guo Ren
  0 siblings, 0 replies; 16+ messages in thread
From: Guo Ren @ 2020-11-25 14:18 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Guo Ren, Arnd Bergmann, Anup Patel, Palmer Dabbelt,
	Linux Kernel Mailing List, linux-csky, Michael Clark,
	Paul Walmsley, linux-riscv

On Tue, Nov 24, 2020 at 10:29 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Nov 24, 2020 at 01:43:53PM +0000, guoren@kernel.org wrote:
> > From: Guo Ren <guoren@linux.alibaba.com>
> >
> > This is prepare for QUEUED_SPINLOCKS which need xchg support short
> > type value.
> >  - Remove unused codes (xchg32, xchg64, cmpxchg32 ...)
> >  - Combine xchg_relaxed, xchg_acquire, xchg_release into one asm
> >  - Make atomic.aq/rl with seperated fence acquire & release
>
> Every time you find yourself doing multiple things, make it multiple
> patches.
Ok.

>
> > @@ -242,58 +239,58 @@ static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u
> >   * atomic_{cmp,}xchg is required to have exactly the same ordering semantics as
> >   * {cmp,}xchg and the operations that return, so they need a full barrier.
> >   */
> > +#define ATOMIC_OP(c_t, prefix)                                               \
> >  static __always_inline                                                       \
> >  c_t atomic##prefix##_xchg_relaxed(atomic##prefix##_t *v, c_t n)              \
> >  {                                                                    \
> > +     return xchg_relaxed(&(v->counter), n);                          \
> >  }                                                                    \
> >  static __always_inline                                                       \
> >  c_t atomic##prefix##_xchg_acquire(atomic##prefix##_t *v, c_t n)              \
> >  {                                                                    \
> > +     return xchg_acquire(&(v->counter), n);                          \
> >  }                                                                    \
> >  static __always_inline                                                       \
> >  c_t atomic##prefix##_xchg_release(atomic##prefix##_t *v, c_t n)              \
> >  {                                                                    \
> > +     return xchg_release(&(v->counter), n);                          \
> >  }                                                                    \
> >  static __always_inline                                                       \
> >  c_t atomic##prefix##_xchg(atomic##prefix##_t *v, c_t n)                      \
> >  {                                                                    \
> > +     return xchg(&(v->counter), n);                                  \
> >  }                                                                    \
> >  static __always_inline                                                       \
> >  c_t atomic##prefix##_cmpxchg_relaxed(atomic##prefix##_t *v,          \
> >                                    c_t o, c_t n)                      \
> >  {                                                                    \
> > +     return cmpxchg_relaxed(&(v->counter), o, n);                    \
> >  }                                                                    \
> >  static __always_inline                                                       \
> >  c_t atomic##prefix##_cmpxchg_acquire(atomic##prefix##_t *v,          \
> >                                    c_t o, c_t n)                      \
> >  {                                                                    \
> > +     return cmpxchg_acquire(&(v->counter), o, n);                    \
> >  }                                                                    \
> >  static __always_inline                                                       \
> >  c_t atomic##prefix##_cmpxchg_release(atomic##prefix##_t *v,          \
> >                                    c_t o, c_t n)                      \
> >  {                                                                    \
> > +     return cmpxchg_release(&(v->counter), o, n);                    \
> >  }                                                                    \
> >  static __always_inline                                                       \
> >  c_t atomic##prefix##_cmpxchg(atomic##prefix##_t *v, c_t o, c_t n)    \
> >  {                                                                    \
> > +     return cmpxchg(&(v->counter), o, n);                            \
> >  }
>
> > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > index 262e5bb..5609185 100644
> > --- a/arch/riscv/include/asm/cmpxchg.h
> > +++ b/arch/riscv/include/asm/cmpxchg.h
> > @@ -44,118 +44,31 @@
> >                                           _x_, sizeof(*(ptr)));       \
> >  })
> >
> >  #define xchg_acquire(ptr, x)                                         \
> >  ({                                                                   \
> > +     __typeof__(*(ptr)) _x_ = (x);                                   \
> > +     __ret = __xchg_relaxed((ptr), _x_, sizeof(*(ptr)));             \
> > +     __acquire_fence();                                              \
> >       __ret;                                                          \
> >  })
> >
> >  #define xchg_release(ptr, x)                                         \
> >  ({                                                                   \
> >       __typeof__(*(ptr)) _x_ = (x);                                   \
> > +     __release_fence();                                              \
> > +     (__typeof__(*(ptr))) __xchg_relaxed((ptr),                      \
> >                                           _x_, sizeof(*(ptr)));       \
> >  })
> >
> >  #define xchg(ptr, x)                                                 \
> >  ({                                                                   \
> > +     __typeof__(*(ptr)) __ret;                                       \
> >       __typeof__(*(ptr)) _x_ = (x);                                   \
> > +     __smp_mb();                                                     \
> > +     __ret = __xchg_relaxed((ptr), _x_, sizeof(*(ptr)));             \
> > +     __smp_mb();                                                     \
> > +     __ret;                                                          \
> >  })
> >
> >  /*
>
> Why are you defining *{,_acquire,_release}() at all, doesn't
> atomic-fallback.h DTRT for you?
Yes, you are right. I could reuse that. Thx

#ifndef cmpxchg_acquire
#define cmpxchg_acquire(...) \
        __atomic_op_acquire(cmpxchg, __VA_ARGS__)
#endif

#define __atomic_op_acquire(op, args...)                                \
({                                                                      \
        typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);         \
        __atomic_acquire_fence();                                       \
        __ret;                                                          \
})
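
For the xchg() side that this patch also open-codes, the generated fallbacks follow the same pattern; roughly (paraphrased from the atomic-fallback layer of that era, not a verbatim quote), so the arch header only needs xchg_relaxed() plus the fence macros:

#ifndef xchg_acquire
#define xchg_acquire(...)						\
	__atomic_op_acquire(xchg, __VA_ARGS__)
#endif

#ifndef xchg_release
#define xchg_release(...)						\
	__atomic_op_release(xchg, __VA_ARGS__)
#endif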


-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/


* Re: [PATCH 2/5] riscv: Add QUEUED_SPINLOCKS & QUEUED_RWLOCKS supported
  2020-11-25 14:16       ` Peter Zijlstra
@ 2020-11-25 14:31         ` Will Deacon
  2020-11-26  1:36           ` Guo Ren
  0 siblings, 1 reply; 16+ messages in thread
From: Will Deacon @ 2020-11-25 14:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Arnd Bergmann, Guo Ren, Arnd Bergmann, Anup Patel,
	Palmer Dabbelt, linux-kernel, linux-csky, Michael Clark, Guo Ren,
	Paul Walmsley, linux-riscv

On Wed, Nov 25, 2020 at 03:16:45PM +0100, Peter Zijlstra wrote:
> @@ -207,6 +187,32 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
>  	atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
>  }
>  
> +#endif /* _Q_PENDING_BITS == 8 */
> +
> +#if _Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16
> +
> +/*
> + * xchg_tail - Put in the new queue tail code word & retrieve previous one
> + * @lock : Pointer to queued spinlock structure
> + * @tail : The new queue tail code word
> + * Return: The previous queue tail code word
> + *
> + * xchg(lock, tail), which heads an address dependency
> + *
> + * p,*,* -> n,*,* ; prev = xchg(lock, node)
> + */
> +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> +{
> +	/*
> +	 * We can use relaxed semantics since the caller ensures that the
> +	 * MCS node is properly initialized before updating the tail.
> +	 */
> +	return (u32)xchg_relaxed(&lock->tail,
> +				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
> +}
> +
> +#else /* !(_Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16) */

Why can't architectures just implement this with a 32-bit xchg instruction
if they don't have one that operates on 16 bits? Sure, they'll store more
data, but it's atomic so you shouldn't be able to tell... (ignoring parisc
crazy).

Also, I'm surprised qspinlock benefits riscv. On arm64, there's nothing in
it over tickets for <= 16 CPUs.

Will


* Re: [PATCH 2/5] riscv: Add QUEUED_SPINLOCKS & QUEUED_RWLOCKS supported
  2020-11-25 14:31         ` Will Deacon
@ 2020-11-26  1:36           ` Guo Ren
  2020-11-26  8:53             ` Will Deacon
  0 siblings, 1 reply; 16+ messages in thread
From: Guo Ren @ 2020-11-26  1:36 UTC (permalink / raw)
  To: Will Deacon
  Cc: Arnd Bergmann, Guo Ren, Arnd Bergmann, Peter Zijlstra,
	Anup Patel, Palmer Dabbelt, linux-kernel, linux-csky,
	Michael Clark, Paul Walmsley, linux-riscv

Hi Will,

On Wed, Nov 25, 2020 at 10:31 PM Will Deacon <will@kernel.org> wrote:
>
> On Wed, Nov 25, 2020 at 03:16:45PM +0100, Peter Zijlstra wrote:
> > @@ -207,6 +187,32 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
> >       atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
> >  }
> >
> > +#endif /* _Q_PENDING_BITS == 8 */
> > +
> > +#if _Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16
> > +
> > +/*
> > + * xchg_tail - Put in the new queue tail code word & retrieve previous one
> > + * @lock : Pointer to queued spinlock structure
> > + * @tail : The new queue tail code word
> > + * Return: The previous queue tail code word
> > + *
> > + * xchg(lock, tail), which heads an address dependency
> > + *
> > + * p,*,* -> n,*,* ; prev = xchg(lock, node)
> > + */
> > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> > +{
> > +     /*
> > +      * We can use relaxed semantics since the caller ensures that the
> > +      * MCS node is properly initialized before updating the tail.
> > +      */
> > +     return (u32)xchg_relaxed(&lock->tail,
> > +                              tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
> > +}
> > +
> > +#else /* !(_Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16) */
>
> Why can't architectures just implement this with a 32-bit xchg instruction
> if they don't have one that operates on 16 bits? Sure, they'll store more
> data, but it's atomic so you shouldn't be able to tell... (ignoring parisc
> crazy).
>
> Also, I'm surprised qspinlock benefits riscv. On arm64, there's nothing in
> it over tickets for <= 16 CPUs.
NUMA is on the way:
https://lore.kernel.org/linux-riscv/20201119003829.1282810-1-atish.patra@wdc.com/

Following your advice, I think we could use the ticket lock when <= 16 CPUs
and qspinlock when > 16 CPUs.
Is that right?

The plan for the next patchset is:
 - Use tickets & qspinlock together in riscv. Abandon the 16-bit
xchg/cmpxchg implementation.
 - Abandon qspinlock in csky, because it only supports 4-CPU SMP.

>
> Will



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/


* Re: [PATCH 2/5] riscv: Add QUEUED_SPINLOCKS & QUEUED_RWLOCKS supported
  2020-11-26  1:36           ` Guo Ren
@ 2020-11-26  8:53             ` Will Deacon
  0 siblings, 0 replies; 16+ messages in thread
From: Will Deacon @ 2020-11-26  8:53 UTC (permalink / raw)
  To: Guo Ren
  Cc: Arnd Bergmann, Guo Ren, Arnd Bergmann, Peter Zijlstra,
	Anup Patel, Palmer Dabbelt, linux-kernel, linux-csky,
	Michael Clark, Paul Walmsley, linux-riscv

On Thu, Nov 26, 2020 at 09:36:34AM +0800, Guo Ren wrote:
> On Wed, Nov 25, 2020 at 10:31 PM Will Deacon <will@kernel.org> wrote:
> > On Wed, Nov 25, 2020 at 03:16:45PM +0100, Peter Zijlstra wrote:
> > > @@ -207,6 +187,32 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
> > >       atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
> > >  }
> > >
> > > +#endif /* _Q_PENDING_BITS == 8 */
> > > +
> > > +#if _Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16
> > > +
> > > +/*
> > > + * xchg_tail - Put in the new queue tail code word & retrieve previous one
> > > + * @lock : Pointer to queued spinlock structure
> > > + * @tail : The new queue tail code word
> > > + * Return: The previous queue tail code word
> > > + *
> > > + * xchg(lock, tail), which heads an address dependency
> > > + *
> > > + * p,*,* -> n,*,* ; prev = xchg(lock, node)
> > > + */
> > > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> > > +{
> > > +     /*
> > > +      * We can use relaxed semantics since the caller ensures that the
> > > +      * MCS node is properly initialized before updating the tail.
> > > +      */
> > > +     return (u32)xchg_relaxed(&lock->tail,
> > > +                              tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
> > > +}
> > > +
> > > +#else /* !(_Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16) */
> >
> > Why can't architectures just implement this with a 32-bit xchg instruction
> > if they don't have one that operates on 16 bits? Sure, they'll store more
> > data, but it's atomic so you shouldn't be able to tell... (ignoring parisc
> > crazy).
> >
> > Also, I'm surprised qspinlock benefits riscv. On arm64, there's nothing in
> > it over tickets for <= 16 CPUs.
> NUMA is on the way:
> https://lore.kernel.org/linux-riscv/20201119003829.1282810-1-atish.patra@wdc.com/

Well, they're patches not hardware (and I only see mention of QEMU over
there for the RISCV platform) ;)

> Following your advice, I think we could use the ticket lock when <= 16 CPUs
> and qspinlock when > 16 CPUs.
> Is that right?

No, when I say "there's nothing in it", it means they're interchangeable.
It's just that qspinlock introduces a lot of complexity and I'm not
massively keen on changing the core code (which is used by many other
architectures) just because you don't have a 16-bit xchg() implementation.

So if you need qspinlock on riscv (not sure you do), then go ahead and make
that your one true lock implementation, but implement 16-bit xchg() at the
same time. Bonus points if you implement that in terms of 32-bit xchg() in
generic code (might need to watch out for endianness when shifting the new
value after aligning the base pointer).

Will
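
To make the endianness caveat concrete, here is a rough userspace sketch (GCC/Clang __atomic builtins, little-endian layout assumed; an illustration only, not the eventual kernel code) of a 16-bit exchange emulated with a 32-bit compare-and-swap on the aligned containing word:

#include <stdint.h>
#include <stdio.h>

/* Emulate xchg() on a u16 with a CAS loop on the 32-bit word containing it. */
static uint16_t xchg16_emulated(uint16_t *p, uint16_t newval)
{
	uint32_t *word = (uint32_t *)((uintptr_t)p & ~(uintptr_t)0x3);
	/* Which half of the word holds *p: little-endian assumed here,
	 * a big-endian build would need the opposite shift. */
	unsigned int shift = ((uintptr_t)p & 0x2) * 8;
	uint32_t mask = 0xffffu << shift;
	uint32_t old = __atomic_load_n(word, __ATOMIC_RELAXED);

	for (;;) {
		uint32_t next = (old & ~mask) | ((uint32_t)newval << shift);
		/* On failure 'old' is refreshed with the current value. */
		if (__atomic_compare_exchange_n(word, &old, next, 1,
						__ATOMIC_RELAXED,
						__ATOMIC_RELAXED))
			return (uint16_t)((old & mask) >> shift);
	}
}

int main(void)
{
	/* The union guarantees the halfwords share one 4-byte aligned word. */
	union { uint32_t word; uint16_t half[2]; } v = { .word = 0x22221111u };
	uint16_t prev = xchg16_emulated(&v.half[1], 0xabcd);

	printf("prev=%#x half[0]=%#x half[1]=%#x\n", prev, v.half[0], v.half[1]);
	return 0;
}

An acquire or release variant would only need stronger memory orders on the compare-and-swap; whether something like this belongs in generic code or in each architecture is exactly the question in this thread.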


end of thread, other threads:[~2020-11-26  8:54 UTC | newest]

Thread overview: 16+ messages
2020-11-24 13:43 [PATCH 1/5] riscv: Coding convention for xchg guoren
2020-11-24 13:43 ` [PATCH 2/5] riscv: Add QUEUED_SPINLOCKS & QUEUED_RWLOCKS supported guoren
2020-11-24 14:39   ` Peter Zijlstra
2020-11-24 15:00     ` Arnd Bergmann
2020-11-25 14:09       ` Guo Ren
2020-11-25 14:16       ` Peter Zijlstra
2020-11-25 14:31         ` Will Deacon
2020-11-26  1:36           ` Guo Ren
2020-11-26  8:53             ` Will Deacon
2020-11-25  0:52     ` Guo Ren
2020-11-25 14:18       ` Peter Zijlstra
2020-11-24 13:43 ` [PATCH 3/5] csky: Remove simple spinlock implementation guoren
2020-11-24 13:43 ` [PATCH 4/5] csky: Add QUEUED_SPINLOCKS supported guoren
2020-11-24 13:43 ` [PATCH 5/5] csky: Optimize atomic operations with correct barrier usage guoren
2020-11-24 14:29 ` [PATCH 1/5] riscv: Coding convention for xchg Peter Zijlstra
2020-11-25 14:18   ` Guo Ren
