* [PATCH V3 0/3] csky: Optimize with acquire & release for atomic & cmpxchg
@ 2022-04-17  8:32 guoren
  2022-04-17  8:32 ` [PATCH V3 1/3] csky: cmpxchg: Optimize with acquire & release guoren
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: guoren @ 2022-04-17  8:32 UTC (permalink / raw)
  To: guoren, arnd, mark.rutland, boqun.feng, peterz, will
  Cc: linux-arch, linux-kernel, linux-csky, Guo Ren

From: Guo Ren <guoren@linux.alibaba.com>

Optimize arch_xchg|cmpxchg|cmpxchg_local with ASM acquire|release
instructions instead of the previous C-based implementation.

The generic atomic.h uses cmpxchg to implement the atomic
operations, which causes a dual loop and weakens the forward-progress
guarantee. This patch implements csky custom atomic operations with
ldex/stex instructions for better performance.
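
For illustration, the dual loop with the generic fallback looks roughly
like this (a simplified sketch with a hypothetical helper name, not the
exact generic code):

	/* Outer C retry loop around cmpxchg ... */
	static inline int fetch_add_generic(atomic_t *v, int i)
	{
		int old;

		do {
			old = arch_atomic_read(v);
		} while (arch_cmpxchg(&v->counter, old, old + i) != old);
		/* ... while arch_cmpxchg() itself already retries on
		 * ldex/stex failure, giving two nested loops. A native
		 * ldex/stex implementation needs only the inner loop. */

		return old;
	}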

Important reference commit:
8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
full barrier semantics")

Changes in V3:
 - Add arch_atomic_(fetch_add_unless, inc_unless_negative,
   dec_unless_positive, dec_if_positive)

Changes in V2:
 - Fix up the use of acquire + release for full barrier semantics, as
   pointed out by Mark Rutland.

Guo Ren (3):
  csky: cmpxchg: Optimize with acquire & release
  csky: atomic: Add custom atomic.h implementation
  csky: atomic: Add conditional atomic operations' optimization

 arch/csky/include/asm/atomic.h  | 249 ++++++++++++++++++++++++++++++++
 arch/csky/include/asm/barrier.h |  11 +-
 arch/csky/include/asm/cmpxchg.h |  64 +++++++-
 3 files changed, 316 insertions(+), 8 deletions(-)
 create mode 100644 arch/csky/include/asm/atomic.h

-- 
2.25.1



* [PATCH V3 1/3] csky: cmpxchg: Optimize with acquire & release
  2022-04-17  8:32 [PATCH V3 0/3] csky: Optimize with acquire & release for atomic & cmpxchg guoren
@ 2022-04-17  8:32 ` guoren
  2022-04-22  3:20   ` Boqun Feng
  2022-04-17  8:32 ` [PATCH V3 2/3] csky: atomic: Add custom atomic.h implementation guoren
  2022-04-17  8:32 ` [PATCH V3 3/3] csky: atomic: Add conditional atomic operations' optimization guoren
  2 siblings, 1 reply; 6+ messages in thread
From: guoren @ 2022-04-17  8:32 UTC (permalink / raw)
  To: guoren, arnd, mark.rutland, boqun.feng, peterz, will
  Cc: linux-arch, linux-kernel, linux-csky, Guo Ren

From: Guo Ren <guoren@linux.alibaba.com>

Optimize arch_xchg|cmpxchg|cmpxchg_local with ASM acquire|release
instructions instead of the previous C-based implementation.

Important reference commit, pointed out by Mark Rutland:
8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
full barrier semantics")
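
In short: a load-acquire plus store-release pair is not a full barrier,
because an access before the atomic and an access after it can both
drift in between the exclusive pair and reorder with each other. A
litmus-style sketch (hypothetical illustration, x/y/l are arbitrary
variables):

	WRITE_ONCE(x, 1);		/* A: may sink below the ldex   */
	arch_cmpxchg(&l, 0, 1);		/* must act as a full barrier   */
	r = READ_ONCE(y);		/* B: may hoist above the stex  */

	/* Acquire on the load orders B after the load; release on the
	 * store orders A before the store; nothing orders A against B.
	 * Hence RELEASE_FENCE before stex.w plus FULL_FENCE after the
	 * loop in __cmpxchg() below. */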

Link: https://lore.kernel.org/linux-riscv/CAJF2gTSAxpAi=LbAdu7jntZRUa=-dJwL0VfmDfBV5MHB=rcZ-w@mail.gmail.com/T/#m27a0f1342995deae49ce1d0e1f2683f8a181d6c3
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
---
 arch/csky/include/asm/barrier.h | 11 +++---
 arch/csky/include/asm/cmpxchg.h | 64 ++++++++++++++++++++++++++++++---
 2 files changed, 67 insertions(+), 8 deletions(-)

diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h
index f4045dd53e17..fb63335ffa33 100644
--- a/arch/csky/include/asm/barrier.h
+++ b/arch/csky/include/asm/barrier.h
@@ -37,17 +37,21 @@
  * bar.brar
  * bar.bwaw
  */
+#define ACQUIRE_FENCE		".long 0x8427c000\n"
+#define RELEASE_FENCE		".long 0x842ec000\n"
+#define FULL_FENCE		".long 0x842fc000\n"
+
 #define __bar_brw()	asm volatile (".long 0x842cc000\n":::"memory")
 #define __bar_br()	asm volatile (".long 0x8424c000\n":::"memory")
 #define __bar_bw()	asm volatile (".long 0x8428c000\n":::"memory")
 #define __bar_arw()	asm volatile (".long 0x8423c000\n":::"memory")
 #define __bar_ar()	asm volatile (".long 0x8421c000\n":::"memory")
 #define __bar_aw()	asm volatile (".long 0x8422c000\n":::"memory")
-#define __bar_brwarw()	asm volatile (".long 0x842fc000\n":::"memory")
-#define __bar_brarw()	asm volatile (".long 0x8427c000\n":::"memory")
+#define __bar_brwarw()	asm volatile (FULL_FENCE:::"memory")
+#define __bar_brarw()	asm volatile (ACQUIRE_FENCE:::"memory")
 #define __bar_bwarw()	asm volatile (".long 0x842bc000\n":::"memory")
 #define __bar_brwar()	asm volatile (".long 0x842dc000\n":::"memory")
-#define __bar_brwaw()	asm volatile (".long 0x842ec000\n":::"memory")
+#define __bar_brwaw()	asm volatile (RELEASE_FENCE:::"memory")
 #define __bar_brar()	asm volatile (".long 0x8425c000\n":::"memory")
 #define __bar_brar()	asm volatile (".long 0x8425c000\n":::"memory")
 #define __bar_bwaw()	asm volatile (".long 0x842ac000\n":::"memory")
@@ -56,7 +60,6 @@
 #define __smp_rmb()	__bar_brar()
 #define __smp_wmb()	__bar_bwaw()
 
-#define ACQUIRE_FENCE		".long 0x8427c000\n"
 #define __smp_acquire_fence()	__bar_brarw()
 #define __smp_release_fence()	__bar_brwaw()
 
diff --git a/arch/csky/include/asm/cmpxchg.h b/arch/csky/include/asm/cmpxchg.h
index d1bef11f8dc9..06c550448bf1 100644
--- a/arch/csky/include/asm/cmpxchg.h
+++ b/arch/csky/include/asm/cmpxchg.h
@@ -64,15 +64,71 @@ extern void __bad_xchg(void);
 #define arch_cmpxchg_relaxed(ptr, o, n) \
 	(__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
 
-#define arch_cmpxchg(ptr, o, n) 				\
+#define __cmpxchg_acquire(ptr, old, new, size)			\
 ({								\
+	__typeof__(ptr) __ptr = (ptr);				\
+	__typeof__(new) __new = (new);				\
+	__typeof__(new) __tmp;					\
+	__typeof__(old) __old = (old);				\
+	__typeof__(*(ptr)) __ret;				\
+	switch (size) {						\
+	case 4:							\
+		asm volatile (					\
+		"1:	ldex.w		%0, (%3) \n"		\
+		"	cmpne		%0, %4   \n"		\
+		"	bt		2f       \n"		\
+		"	mov		%1, %2   \n"		\
+		"	stex.w		%1, (%3) \n"		\
+		"	bez		%1, 1b   \n"		\
+		ACQUIRE_FENCE					\
+		"2:				 \n"		\
+			: "=&r" (__ret), "=&r" (__tmp)		\
+			: "r" (__new), "r"(__ptr), "r"(__old)	\
+			:);					\
+		break;						\
+	default:						\
+		__bad_xchg();					\
+	}							\
+	__ret;							\
+})
+
+#define arch_cmpxchg_acquire(ptr, o, n) \
+	(__cmpxchg_acquire((ptr), (o), (n), sizeof(*(ptr))))
+
+#define __cmpxchg(ptr, old, new, size)				\
+({								\
+	__typeof__(ptr) __ptr = (ptr);				\
+	__typeof__(new) __new = (new);				\
+	__typeof__(new) __tmp;					\
+	__typeof__(old) __old = (old);				\
 	__typeof__(*(ptr)) __ret;				\
-	__smp_release_fence();					\
-	__ret = arch_cmpxchg_relaxed(ptr, o, n);		\
-	__smp_acquire_fence();					\
+	switch (size) {						\
+	case 4:							\
+		asm volatile (					\
+		"1:	ldex.w		%0, (%3) \n"		\
+		"	cmpne		%0, %4   \n"		\
+		"	bt		2f       \n"		\
+		"	mov		%1, %2   \n"		\
+		RELEASE_FENCE					\
+		"	stex.w		%1, (%3) \n"		\
+		"	bez		%1, 1b   \n"		\
+		FULL_FENCE					\
+		"2:				 \n"		\
+			: "=&r" (__ret), "=&r" (__tmp)		\
+			: "r" (__new), "r"(__ptr), "r"(__old)	\
+			:);					\
+		break;						\
+	default:						\
+		__bad_xchg();					\
+	}							\
 	__ret;							\
 })
 
+#define arch_cmpxchg(ptr, o, n) \
+	(__cmpxchg((ptr), (o), (n), sizeof(*(ptr))))
+
+#define arch_cmpxchg_local(ptr, o, n)				\
+	(__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
 #else
 #include <asm-generic/cmpxchg.h>
 #endif
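
For context, a hypothetical caller of the new full-barrier primitive
(illustration only, not part of the patch):

	/* Toy test-and-set lock built on arch_cmpxchg(); the full
	 * barrier orders the critical section against the lock word
	 * update on both sides. */
	static inline void toy_lock(int *lock)
	{
		while (arch_cmpxchg(lock, 0, 1) != 0)
			cpu_relax();
	}

	static inline void toy_unlock(int *lock)
	{
		smp_store_release(lock, 0);
	}
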
-- 
2.25.1



* [PATCH V3 2/3] csky: atomic: Add custom atomic.h implementation
  2022-04-17  8:32 [PATCH V3 0/3] csky: Optimize with acquire & release for atomic & cmpxchg guoren
  2022-04-17  8:32 ` [PATCH V3 1/3] csky: cmpxchg: Optimize with acquire & release guoren
@ 2022-04-17  8:32 ` guoren
  2022-04-17  8:32 ` [PATCH V3 3/3] csky: atomic: Add conditional atomic operations' optimization guoren
  2 siblings, 0 replies; 6+ messages in thread
From: guoren @ 2022-04-17  8:32 UTC (permalink / raw)
  To: guoren, arnd, mark.rutland, boqun.feng, peterz, will
  Cc: linux-arch, linux-kernel, linux-csky, Guo Ren

From: Guo Ren <guoren@linux.alibaba.com>

The generic atomic.h uses cmpxchg to implement the atomic
operations, which causes a dual loop and weakens the forward-progress
guarantee. This patch implements csky custom atomic operations with
ldex/stex instructions for better performance.

Important reference commit, pointed out by Mark Rutland:
8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
full barrier semantics")
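
Roughly, ATOMIC_FETCH_OP(add) below expands to the following (a sketch
of one expansion to make the pattern concrete; comments added here):

	static __always_inline
	int arch_atomic_fetch_add_relaxed(int i, atomic_t *v)
	{
		register int ret, tmp;

		__asm__ __volatile__ (
		"1:	ldex.w	%0, (%3) \n"	/* load-exclusive	*/
		"	mov	%1, %0   \n"	/* keep the old value	*/
		"	add	%0, %2   \n"	/* apply the op		*/
		"	stex.w	%0, (%3) \n"	/* store-exclusive	*/
		"	bez	%0, 1b   \n"	/* retry on failure	*/
			: "=&r" (tmp), "=&r" (ret)
			: "r" (i), "r" (&v->counter)
			: "memory");
		return ret;
	}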

Link: https://lore.kernel.org/linux-riscv/CAJF2gTSAxpAi=LbAdu7jntZRUa=-dJwL0VfmDfBV5MHB=rcZ-w@mail.gmail.com/T/#m27a0f1342995deae49ce1d0e1f2683f8a181d6c3
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
---
 arch/csky/include/asm/atomic.h | 154 +++++++++++++++++++++++++++++++++
 1 file changed, 154 insertions(+)
 create mode 100644 arch/csky/include/asm/atomic.h

diff --git a/arch/csky/include/asm/atomic.h b/arch/csky/include/asm/atomic.h
new file mode 100644
index 000000000000..5ecc657a2a66
--- /dev/null
+++ b/arch/csky/include/asm/atomic.h
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_CSKY_ATOMIC_H
+#define __ASM_CSKY_ATOMIC_H
+
+#ifdef CONFIG_SMP
+#include <asm-generic/atomic64.h>
+
+#include <asm/cmpxchg.h>
+#include <asm/barrier.h>
+
+#define __atomic_acquire_fence()	__smp_acquire_fence()
+
+#define __atomic_release_fence()	__smp_release_fence()
+
+static __always_inline int arch_atomic_read(const atomic_t *v)
+{
+	return READ_ONCE(v->counter);
+}
+static __always_inline void arch_atomic_set(atomic_t *v, int i)
+{
+	WRITE_ONCE(v->counter, i);
+}
+
+#define ATOMIC_OP(op)							\
+static __always_inline							\
+void arch_atomic_##op(int i, atomic_t *v)				\
+{									\
+	unsigned long tmp;						\
+	__asm__ __volatile__ (						\
+	"1:	ldex.w		%0, (%2)	\n"			\
+	"	" #op "		%0, %1		\n"			\
+	"	stex.w		%0, (%2)	\n"			\
+	"	bez		%0, 1b		\n"			\
+	: "=&r" (tmp)							\
+	: "r" (i), "r" (&v->counter)					\
+	: "memory");							\
+}
+
+ATOMIC_OP(add)
+ATOMIC_OP(sub)
+ATOMIC_OP(and)
+ATOMIC_OP( or)
+ATOMIC_OP(xor)
+
+#undef ATOMIC_OP
+
+#define ATOMIC_FETCH_OP(op)						\
+static __always_inline							\
+int arch_atomic_fetch_##op##_relaxed(int i, atomic_t *v)		\
+{									\
+	register int ret, tmp;						\
+	__asm__ __volatile__ (						\
+	"1:	ldex.w		%0, (%3) \n"				\
+	"	mov		%1, %0   \n"				\
+	"	" #op "		%0, %2   \n"				\
+	"	stex.w		%0, (%3) \n"				\
+	"	bez		%0, 1b   \n"				\
+		: "=&r" (tmp), "=&r" (ret)				\
+		: "r" (i), "r"(&v->counter) 				\
+		: "memory");						\
+	return ret;							\
+}
+
+#define ATOMIC_OP_RETURN(op, c_op)					\
+static __always_inline							\
+int arch_atomic_##op##_return_relaxed(int i, atomic_t *v)		\
+{									\
+	return arch_atomic_fetch_##op##_relaxed(i, v) c_op i;		\
+}									\
+static __always_inline							\
+int arch_atomic_##op##_return(int i, atomic_t *v)			\
+{									\
+	return arch_atomic_fetch_##op(i, v) c_op i;			\
+}
+
+#define ATOMIC_OPS(op, c_op)						\
+	ATOMIC_FETCH_OP(op)						\
+	ATOMIC_OP_RETURN(op, c_op)
+
+ATOMIC_OPS(add, +)
+ATOMIC_OPS(sub, -)
+
+#define arch_atomic_fetch_add_relaxed	arch_atomic_fetch_add_relaxed
+#define arch_atomic_fetch_sub_relaxed	arch_atomic_fetch_sub_relaxed
+#define arch_atomic_fetch_add		arch_atomic_fetch_add
+#define arch_atomic_fetch_sub		arch_atomic_fetch_sub
+
+#define arch_atomic_add_return_relaxed	arch_atomic_add_return_relaxed
+#define arch_atomic_sub_return_relaxed	arch_atomic_sub_return_relaxed
+#define arch_atomic_add_return		arch_atomic_add_return
+#define arch_atomic_sub_return		arch_atomic_sub_return
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+
+#define ATOMIC_OPS(op)							\
+	ATOMIC_FETCH_OP(op)
+
+ATOMIC_OPS(and)
+ATOMIC_OPS( or)
+ATOMIC_OPS(xor)
+
+#define arch_atomic_fetch_and_relaxed	arch_atomic_fetch_and_relaxed
+#define arch_atomic_fetch_or_relaxed	arch_atomic_fetch_or_relaxed
+#define arch_atomic_fetch_xor_relaxed	arch_atomic_fetch_xor_relaxed
+#define arch_atomic_fetch_and		arch_atomic_fetch_and
+#define arch_atomic_fetch_or		arch_atomic_fetch_or
+#define arch_atomic_fetch_xor		arch_atomic_fetch_xor
+
+#undef ATOMIC_OPS
+
+#undef ATOMIC_FETCH_OP
+
+#define ATOMIC_OP()							\
+static __always_inline							\
+int arch_atomic_xchg_relaxed(atomic_t *v, int n)			\
+{									\
+	return __xchg_relaxed(n, &(v->counter), 4);			\
+}									\
+static __always_inline							\
+int arch_atomic_xchg(atomic_t *v, int n)				\
+{									\
+	return __xchg(n, &(v->counter), 4);				\
+}									\
+static __always_inline							\
+int arch_atomic_cmpxchg_relaxed(atomic_t *v, int o, int n)		\
+{									\
+	return __cmpxchg_relaxed(&(v->counter), o, n, 4);		\
+}									\
+static __always_inline							\
+int arch_atomic_cmpxchg(atomic_t *v, int o, int n)			\
+{									\
+	return __cmpxchg(&(v->counter), o, n, 4);			\
+}
+
+#define ATOMIC_OPS()							\
+	ATOMIC_OP()
+
+ATOMIC_OPS()
+
+#define arch_atomic_xchg_relaxed	arch_atomic_xchg_relaxed
+#define arch_atomic_xchg		arch_atomic_xchg
+#define arch_atomic_cmpxchg_relaxed	arch_atomic_cmpxchg_relaxed
+#define arch_atomic_cmpxchg		arch_atomic_cmpxchg
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP
+
+#else
+#include <asm-generic/atomic.h>
+#endif
+
+#endif /* __ASM_CSKY_ATOMIC_H */
-- 
2.25.1



* [PATCH V3 3/3] csky: atomic: Add conditional atomic operations' optimization
  2022-04-17  8:32 [PATCH V3 0/3] csky: Optimize with acquire & release for atomic & cmpxchg guoren
  2022-04-17  8:32 ` [PATCH V3 1/3] csky: cmpxchg: Optimize with acquire & release guoren
  2022-04-17  8:32 ` [PATCH V3 2/3] csky: atomic: Add custom atomic.h implementation guoren
@ 2022-04-17  8:32 ` guoren
  2 siblings, 0 replies; 6+ messages in thread
From: guoren @ 2022-04-17  8:32 UTC (permalink / raw)
  To: guoren, arnd, mark.rutland, boqun.feng, peterz, will
  Cc: linux-arch, linux-kernel, linux-csky, Guo Ren

From: Guo Ren <guoren@linux.alibaba.com>

Add conditional atomic operations' optimization (a sketch of the
expected semantics follows the list):
 - arch_atomic_fetch_add_unless
 - arch_atomic_inc_unless_negative
 - arch_atomic_dec_unless_positive
 - arch_atomic_dec_if_positive
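
For reference, the semantics the first of these must provide, written
in the style of the generic try_cmpxchg-based fallback (a comparison
sketch, not the csky implementation):

	/* Add @a to @v unless @v currently equals @u; return the old
	 * value of @v either way. */
	static inline int fetch_add_unless_ref(atomic_t *v, int a, int u)
	{
		int c = arch_atomic_read(v);

		do {
			if (c == u)
				break;
		} while (!arch_atomic_try_cmpxchg(v, &c, c + a));

		return c;
	}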

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@kernel.org>
---
 arch/csky/include/asm/atomic.h | 95 ++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)

diff --git a/arch/csky/include/asm/atomic.h b/arch/csky/include/asm/atomic.h
index 5ecc657a2a66..3f2917b748c3 100644
--- a/arch/csky/include/asm/atomic.h
+++ b/arch/csky/include/asm/atomic.h
@@ -112,6 +112,101 @@ ATOMIC_OPS(xor)
 
 #undef ATOMIC_FETCH_OP
 
+static __always_inline int
+arch_atomic_fetch_add_unless(atomic_t *v, int a, int u)
+{
+	int prev, tmp;
+
+	__asm__ __volatile__ (
+		"1:	ldex.w		%0, (%3)	\n"
+		"	cmpne		%0, %4		\n"
+		"	bf		2f		\n"
+		"	mov		%1, %0		\n"
+		"	add		%1, %2		\n"
+		RELEASE_FENCE
+		"	stex.w		%1, (%3)	\n"
+		"	bez		%1, 1b		\n"
+		FULL_FENCE
+		"2:\n"
+		: "=&r" (prev), "=&r" (tmp)
+		: "r" (a), "r" (&v->counter), "r" (u)
+		: "memory");
+
+	return prev;
+}
+#define arch_atomic_fetch_add_unless arch_atomic_fetch_add_unless
+
+static __always_inline bool
+arch_atomic_inc_unless_negative(atomic_t *v)
+{
+	int rc, tmp;
+
+	__asm__ __volatile__ (
+		"1:	ldex.w		%0, (%2)	\n"
+		"	movi		%1, 0		\n"
+		"	blz		%0, 2f		\n"
+		"	movi		%1, 1		\n"
+		"	addi		%0, 1		\n"
+		RELEASE_FENCE
+		"	stex.w		%0, (%2)	\n"
+		"	bez		%0, 1b		\n"
+		FULL_FENCE
+		"2:\n"
+		: "=&r" (tmp), "=&r" (rc)
+		: "r" (&v->counter)
+		: "memory");
+
+	return tmp ? true : false;
+
+}
+#define arch_atomic_inc_unless_negative arch_atomic_inc_unless_negative
+
+static __always_inline bool
+arch_atomic_dec_unless_positive(atomic_t *v)
+{
+	int rc, tmp;
+
+	__asm__ __volatile__ (
+		"1:	ldex.w		%0, (%2)	\n"
+		"	movi		%1, 0		\n"
+		"	bhz		%0, 2f		\n"
+		"	movi		%1, 1		\n"
+		"	subi		%0, 1		\n"
+		RELEASE_FENCE
+		"	stex.w		%0, (%2)	\n"
+		"	bez		%0, 1b		\n"
+		FULL_FENCE
+		"2:\n"
+		: "=&r" (tmp), "=&r" (rc)
+		: "r" (&v->counter)
+		: "memory");
+
+	return tmp ? true : false;
+}
+#define arch_atomic_dec_unless_positive arch_atomic_dec_unless_positive
+
+static __always_inline int
+arch_atomic_dec_if_positive(atomic_t *v)
+{
+	int dec, tmp;
+
+	__asm__ __volatile__ (
+		"1:	ldex.w		%0, (%2)	\n"
+		"	subi		%1, %0, 1	\n"
+		"	blz		%1, 2f		\n"
+		RELEASE_FENCE
+		"	stex.w		%1, (%2)	\n"
+		"	bez		%1, 1b		\n"
+		FULL_FENCE
+		"2:\n"
+		: "=&r" (dec), "=&r" (tmp)
+		: "r" (&v->counter)
+		: "memory");
+
+	return dec - 1;
+}
+#define arch_atomic_dec_if_positive arch_atomic_dec_if_positive
+
 #define ATOMIC_OP()							\
 static __always_inline							\
 int arch_atomic_xchg_relaxed(atomic_t *v, int n)			\
-- 
2.25.1



* Re: [PATCH V3 1/3] csky: cmpxchg: Optimize with acquire & release
  2022-04-17  8:32 ` [PATCH V3 1/3] csky: cmpxchg: Optimize with acquire & release guoren
@ 2022-04-22  3:20   ` Boqun Feng
  2022-04-22  3:45     ` Guo Ren
  0 siblings, 1 reply; 6+ messages in thread
From: Boqun Feng @ 2022-04-22  3:20 UTC (permalink / raw)
  To: guoren
  Cc: arnd, mark.rutland, peterz, will, linux-arch, linux-kernel,
	linux-csky, Guo Ren


On Sun, Apr 17, 2022 at 04:32:02PM +0800, guoren@kernel.org wrote:
> From: Guo Ren <guoren@linux.alibaba.com>
> 
> Optimize arch_xchg|cmpxchg|cmpxchg_local with ASM acquire|release
> instructions instead of the previous C-based implementation.
> 
> Important reference commit, pointed out by Mark Rutland:
> 8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
> full barrier semantics")
> 
> Link: https://lore.kernel.org/linux-riscv/CAJF2gTSAxpAi=LbAdu7jntZRUa=-dJwL0VfmDfBV5MHB=rcZ-w@mail.gmail.com/T/#m27a0f1342995deae49ce1d0e1f2683f8a181d6c3
> Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> Signed-off-by: Guo Ren <guoren@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> ---
>  arch/csky/include/asm/barrier.h | 11 +++---
>  arch/csky/include/asm/cmpxchg.h | 64 ++++++++++++++++++++++++++++++---
>  2 files changed, 67 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h
> index f4045dd53e17..fb63335ffa33 100644
> --- a/arch/csky/include/asm/barrier.h
> +++ b/arch/csky/include/asm/barrier.h
> @@ -37,17 +37,21 @@
>   * bar.brar
>   * bar.bwaw
>   */
> +#define ACQUIRE_FENCE		".long 0x8427c000\n"
> +#define RELEASE_FENCE		".long 0x842ec000\n"
> +#define FULL_FENCE		".long 0x842fc000\n"
> +
>  #define __bar_brw()	asm volatile (".long 0x842cc000\n":::"memory")
>  #define __bar_br()	asm volatile (".long 0x8424c000\n":::"memory")
>  #define __bar_bw()	asm volatile (".long 0x8428c000\n":::"memory")
>  #define __bar_arw()	asm volatile (".long 0x8423c000\n":::"memory")
>  #define __bar_ar()	asm volatile (".long 0x8421c000\n":::"memory")
>  #define __bar_aw()	asm volatile (".long 0x8422c000\n":::"memory")
> -#define __bar_brwarw()	asm volatile (".long 0x842fc000\n":::"memory")
> -#define __bar_brarw()	asm volatile (".long 0x8427c000\n":::"memory")
> +#define __bar_brwarw()	asm volatile (FULL_FENCE:::"memory")
> +#define __bar_brarw()	asm volatile (ACQUIRE_FENCE:::"memory")
>  #define __bar_bwarw()	asm volatile (".long 0x842bc000\n":::"memory")
>  #define __bar_brwar()	asm volatile (".long 0x842dc000\n":::"memory")
> -#define __bar_brwaw()	asm volatile (".long 0x842ec000\n":::"memory")
> +#define __bar_brwaw()	asm volatile (RELEASE_FENCE:::"memory")
>  #define __bar_brar()	asm volatile (".long 0x8425c000\n":::"memory")
>  #define __bar_brar()	asm volatile (".long 0x8425c000\n":::"memory")
>  #define __bar_bwaw()	asm volatile (".long 0x842ac000\n":::"memory")
> @@ -56,7 +60,6 @@
>  #define __smp_rmb()	__bar_brar()
>  #define __smp_wmb()	__bar_bwaw()
>  
> -#define ACQUIRE_FENCE		".long 0x8427c000\n"
>  #define __smp_acquire_fence()	__bar_brarw()
>  #define __smp_release_fence()	__bar_brwaw()
>  
> diff --git a/arch/csky/include/asm/cmpxchg.h b/arch/csky/include/asm/cmpxchg.h
> index d1bef11f8dc9..06c550448bf1 100644
> --- a/arch/csky/include/asm/cmpxchg.h
> +++ b/arch/csky/include/asm/cmpxchg.h
> @@ -64,15 +64,71 @@ extern void __bad_xchg(void);
>  #define arch_cmpxchg_relaxed(ptr, o, n) \
>  	(__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
>  
> -#define arch_cmpxchg(ptr, o, n) 				\
> +#define __cmpxchg_acquire(ptr, old, new, size)			\
>  ({								\
> +	__typeof__(ptr) __ptr = (ptr);				\
> +	__typeof__(new) __new = (new);				\
> +	__typeof__(new) __tmp;					\
> +	__typeof__(old) __old = (old);				\
> +	__typeof__(*(ptr)) __ret;				\
> +	switch (size) {						\
> +	case 4:							\
> +		asm volatile (					\
> +		"1:	ldex.w		%0, (%3) \n"		\
> +		"	cmpne		%0, %4   \n"		\
> +		"	bt		2f       \n"		\
> +		"	mov		%1, %2   \n"		\
> +		"	stex.w		%1, (%3) \n"		\
> +		"	bez		%1, 1b   \n"		\
> +		ACQUIRE_FENCE					\
> +		"2:				 \n"		\
> +			: "=&r" (__ret), "=&r" (__tmp)		\
> +			: "r" (__new), "r"(__ptr), "r"(__old)	\
> +			:);					\
> +		break;						\
> +	default:						\
> +		__bad_xchg();					\
> +	}							\
> +	__ret;							\
> +})
> +
> +#define arch_cmpxchg_acquire(ptr, o, n) \
> +	(__cmpxchg_acquire((ptr), (o), (n), sizeof(*(ptr))))
> +
> +#define __cmpxchg(ptr, old, new, size)				\
> +({								\
> +	__typeof__(ptr) __ptr = (ptr);				\
> +	__typeof__(new) __new = (new);				\
> +	__typeof__(new) __tmp;					\
> +	__typeof__(old) __old = (old);				\
>  	__typeof__(*(ptr)) __ret;				\
> -	__smp_release_fence();					\
> -	__ret = arch_cmpxchg_relaxed(ptr, o, n);		\
> -	__smp_acquire_fence();					\
> +	switch (size) {						\
> +	case 4:							\
> +		asm volatile (					\
> +		"1:	ldex.w		%0, (%3) \n"		\
> +		"	cmpne		%0, %4   \n"		\
> +		"	bt		2f       \n"		\
> +		"	mov		%1, %2   \n"		\
> +		RELEASE_FENCE					\

FWIW, you probably need to make sure that a barrier instruction inside
an lr/sc loop is a good thing. IIUC, the execution time of a barrier
instruction is determined by the status of the store buffers and
invalidate queues (and probably other things), so it may increase the
execution time of the lr/sc loop and make it less likely to succeed.
But this really depends on how the arch executes these instructions.

Regards,
Boqun

> +		"	stex.w		%1, (%3) \n"		\
> +		"	bez		%1, 1b   \n"		\
> +		FULL_FENCE					\
> +		"2:				 \n"		\
> +			: "=&r" (__ret), "=&r" (__tmp)		\
> +			: "r" (__new), "r"(__ptr), "r"(__old)	\
> +			:);					\
> +		break;						\
> +	default:						\
> +		__bad_xchg();					\
> +	}							\
>  	__ret;							\
>  })
>  
> +#define arch_cmpxchg(ptr, o, n) \
> +	(__cmpxchg((ptr), (o), (n), sizeof(*(ptr))))
> +
> +#define arch_cmpxchg_local(ptr, o, n)				\
> +	(__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
>  #else
>  #include <asm-generic/cmpxchg.h>
>  #endif
> -- 
> 2.25.1
> 


* Re: [PATCH V3 1/3] csky: cmpxchg: Optimize with acquire & release
  2022-04-22  3:20   ` Boqun Feng
@ 2022-04-22  3:45     ` Guo Ren
  0 siblings, 0 replies; 6+ messages in thread
From: Guo Ren @ 2022-04-22  3:45 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Arnd Bergmann, Mark Rutland, Peter Zijlstra, Will Deacon,
	linux-arch, Linux Kernel Mailing List, linux-csky, Guo Ren

On Fri, Apr 22, 2022 at 11:20 AM Boqun Feng <boqun.feng@gmail.com> wrote:
>
> On Sun, Apr 17, 2022 at 04:32:02PM +0800, guoren@kernel.org wrote:
> > From: Guo Ren <guoren@linux.alibaba.com>
> >
> > Optimize arch_xchg|cmpxchg|cmpxchg_local with ASM acquire|release
> > instructions instead of the previous C-based implementation.
> >
> > Important reference commit, pointed out by Mark Rutland:
> > 8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
> > full barrier semantics")
> >
> > Link: https://lore.kernel.org/linux-riscv/CAJF2gTSAxpAi=LbAdu7jntZRUa=-dJwL0VfmDfBV5MHB=rcZ-w@mail.gmail.com/T/#m27a0f1342995deae49ce1d0e1f2683f8a181d6c3
> > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > Signed-off-by: Guo Ren <guoren@kernel.org>
> > Cc: Mark Rutland <mark.rutland@arm.com>
> > ---
> >  arch/csky/include/asm/barrier.h | 11 +++---
> >  arch/csky/include/asm/cmpxchg.h | 64 ++++++++++++++++++++++++++++++---
> >  2 files changed, 67 insertions(+), 8 deletions(-)
> >
> > diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h
> > index f4045dd53e17..fb63335ffa33 100644
> > --- a/arch/csky/include/asm/barrier.h
> > +++ b/arch/csky/include/asm/barrier.h
> > @@ -37,17 +37,21 @@
> >   * bar.brar
> >   * bar.bwaw
> >   */
> > +#define ACQUIRE_FENCE                ".long 0x8427c000\n"
> > +#define RELEASE_FENCE                ".long 0x842ec000\n"
> > +#define FULL_FENCE           ".long 0x842fc000\n"
> > +
> >  #define __bar_brw()  asm volatile (".long 0x842cc000\n":::"memory")
> >  #define __bar_br()   asm volatile (".long 0x8424c000\n":::"memory")
> >  #define __bar_bw()   asm volatile (".long 0x8428c000\n":::"memory")
> >  #define __bar_arw()  asm volatile (".long 0x8423c000\n":::"memory")
> >  #define __bar_ar()   asm volatile (".long 0x8421c000\n":::"memory")
> >  #define __bar_aw()   asm volatile (".long 0x8422c000\n":::"memory")
> > -#define __bar_brwarw()       asm volatile (".long 0x842fc000\n":::"memory")
> > -#define __bar_brarw()        asm volatile (".long 0x8427c000\n":::"memory")
> > +#define __bar_brwarw()       asm volatile (FULL_FENCE:::"memory")
> > +#define __bar_brarw()        asm volatile (ACQUIRE_FENCE:::"memory")
> >  #define __bar_bwarw()        asm volatile (".long 0x842bc000\n":::"memory")
> >  #define __bar_brwar()        asm volatile (".long 0x842dc000\n":::"memory")
> > -#define __bar_brwaw()        asm volatile (".long 0x842ec000\n":::"memory")
> > +#define __bar_brwaw()        asm volatile (RELEASE_FENCE:::"memory")
> >  #define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory")
> >  #define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory")
> >  #define __bar_bwaw() asm volatile (".long 0x842ac000\n":::"memory")
> > @@ -56,7 +60,6 @@
> >  #define __smp_rmb()  __bar_brar()
> >  #define __smp_wmb()  __bar_bwaw()
> >
> > -#define ACQUIRE_FENCE                ".long 0x8427c000\n"
> >  #define __smp_acquire_fence()        __bar_brarw()
> >  #define __smp_release_fence()        __bar_brwaw()
> >
> > diff --git a/arch/csky/include/asm/cmpxchg.h b/arch/csky/include/asm/cmpxchg.h
> > index d1bef11f8dc9..06c550448bf1 100644
> > --- a/arch/csky/include/asm/cmpxchg.h
> > +++ b/arch/csky/include/asm/cmpxchg.h
> > @@ -64,15 +64,71 @@ extern void __bad_xchg(void);
> >  #define arch_cmpxchg_relaxed(ptr, o, n) \
> >       (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
> >
> > -#define arch_cmpxchg(ptr, o, n)                              \
> > +#define __cmpxchg_acquire(ptr, old, new, size)                       \
> >  ({                                                           \
> > +     __typeof__(ptr) __ptr = (ptr);                          \
> > +     __typeof__(new) __new = (new);                          \
> > +     __typeof__(new) __tmp;                                  \
> > +     __typeof__(old) __old = (old);                          \
> > +     __typeof__(*(ptr)) __ret;                               \
> > +     switch (size) {                                         \
> > +     case 4:                                                 \
> > +             asm volatile (                                  \
> > +             "1:     ldex.w          %0, (%3) \n"            \
> > +             "       cmpne           %0, %4   \n"            \
> > +             "       bt              2f       \n"            \
> > +             "       mov             %1, %2   \n"            \
> > +             "       stex.w          %1, (%3) \n"            \
> > +             "       bez             %1, 1b   \n"            \
> > +             ACQUIRE_FENCE                                   \
> > +             "2:                              \n"            \
> > +                     : "=&r" (__ret), "=&r" (__tmp)          \
> > +                     : "r" (__new), "r"(__ptr), "r"(__old)   \
> > +                     :);                                     \
> > +             break;                                          \
> > +     default:                                                \
> > +             __bad_xchg();                                   \
> > +     }                                                       \
> > +     __ret;                                                  \
> > +})
> > +
> > +#define arch_cmpxchg_acquire(ptr, o, n) \
> > +     (__cmpxchg_acquire((ptr), (o), (n), sizeof(*(ptr))))
> > +
> > +#define __cmpxchg(ptr, old, new, size)                               \
> > +({                                                           \
> > +     __typeof__(ptr) __ptr = (ptr);                          \
> > +     __typeof__(new) __new = (new);                          \
> > +     __typeof__(new) __tmp;                                  \
> > +     __typeof__(old) __old = (old);                          \
> >       __typeof__(*(ptr)) __ret;                               \
> > -     __smp_release_fence();                                  \
> > -     __ret = arch_cmpxchg_relaxed(ptr, o, n);                \
> > -     __smp_acquire_fence();                                  \
> > +     switch (size) {                                         \
> > +     case 4:                                                 \
> > +             asm volatile (                                  \
> > +             "1:     ldex.w          %0, (%3) \n"            \
> > +             "       cmpne           %0, %4   \n"            \
> > +             "       bt              2f       \n"            \
> > +             "       mov             %1, %2   \n"            \
> > +             RELEASE_FENCE                                   \
>
> FWIW, you probably need to make sure that a barrier instruction inside
> an lr/sc loop is a good thing. IIUC, the execution time of a barrier
> instruction is determined by the status of the store buffers and
> invalidate queues (and probably other things), so it may increase the
> execution time of the lr/sc loop and make it less likely to succeed.
> But this really depends on how the arch executes these instructions.
Yes, you are right. A FENCE would add overhead inside the lr/sc loop,
and that would make it harder to succeed.

I will fix it up and include your comment in the next version of the
patchset.
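
One possible shape for that fix (a hypothetical sketch, not the final
patch): hoist RELEASE_FENCE above the loop, so everything before the
atomic is still ordered before the store-exclusive, but no fence runs
between ldex.w and stex.w on retries:

	asm volatile (
	RELEASE_FENCE				/* once, before the loop */
	"1:	ldex.w		%0, (%3) \n"
	"	cmpne		%0, %4   \n"
	"	bt		2f       \n"
	"	mov		%1, %2   \n"
	"	stex.w		%1, (%3) \n"
	"	bez		%1, 1b   \n"
	FULL_FENCE				/* full barrier on success */
	"2:				 \n"
		: "=&r" (__ret), "=&r" (__tmp)
		: "r" (__new), "r"(__ptr), "r"(__old)
		:);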

>
> Regards,
> Boqun
>
> > +             "       stex.w          %1, (%3) \n"            \
> > +             "       bez             %1, 1b   \n"            \
> > +             FULL_FENCE                                      \
> > +             "2:                              \n"            \
> > +                     : "=&r" (__ret), "=&r" (__tmp)          \
> > +                     : "r" (__new), "r"(__ptr), "r"(__old)   \
> > +                     :);                                     \
> > +             break;                                          \
> > +     default:                                                \
> > +             __bad_xchg();                                   \
> > +     }                                                       \
> >       __ret;                                                  \
> >  })
> >
> > +#define arch_cmpxchg(ptr, o, n) \
> > +     (__cmpxchg((ptr), (o), (n), sizeof(*(ptr))))
> > +
> > +#define arch_cmpxchg_local(ptr, o, n)                                \
> > +     (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
> >  #else
> >  #include <asm-generic/cmpxchg.h>
> >  #endif
> > --
> > 2.25.1
> >



--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/

