* [PATCH V3 0/3] csky: Optimize with acquire & release for atomic & cmpxchg
@ 2022-04-17  8:32 guoren
  2022-04-17  8:32 ` [PATCH V3 1/3] csky: cmpxchg: Optimize with acquire & release guoren
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: guoren @ 2022-04-17  8:32 UTC (permalink / raw)
  To: guoren, arnd, mark.rutland, boqun.feng, peterz, will
  Cc: linux-arch, linux-kernel, linux-csky, Guo Ren

From: Guo Ren <guoren@linux.alibaba.com>

Optimize arch_xchg|cmpxchg|cmpxchg_local with ASM acquire|release
instructions instead of the previous C-based implementation.

The generic atomic.h uses cmpxchg to implement the atomic operations,
which results in a dual loop and weakens the forward-progress
guarantee. This patch implements csky custom atomic operations with
ldex/stex instructions for the best performance.
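
As an illustration of the dual loop, the cmpxchg-based fallback pattern
looks roughly like this (a sketch, not the exact asm-generic code):

  static inline int fallback_atomic_fetch_add(int i, atomic_t *v)
  {
  	int c = v->counter;
  	int old;

  	/* Outer C loop: retry until no other CPU raced with our update. */
  	while ((old = arch_cmpxchg(&v->counter, c, c + i)) != c)
  		c = old;

  	/* arch_cmpxchg() itself is an inner ldex/stex retry loop on csky. */
  	return old;
  }

A native ldex/stex implementation needs only the single inner loop.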

Important reference commit:
8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
full barrier semantics")

Changes in V3:
 - Add arch_atomic_(fetch_add_unless, inc_unless_negative,
   dec_unless_positive, dec_if_positive)

Changes in V2:
 - Fix up the use of acquire + release for barrier semantics, as
   pointed out by Mark Rutland.

Guo Ren (3):
  csky: cmpxchg: Optimize with acquire & release
  csky: atomic: Add custom atomic.h implementation
  csky: atomic: Add conditional atomic operations' optimization

 arch/csky/include/asm/atomic.h  | 249 ++++++++++++++++++++++++++++++++
 arch/csky/include/asm/barrier.h |  11 +-
 arch/csky/include/asm/cmpxchg.h |  64 +++++++-
 3 files changed, 316 insertions(+), 8 deletions(-)
 create mode 100644 arch/csky/include/asm/atomic.h

-- 
2.25.1



* [PATCH V3 1/3] csky: cmpxchg: Optimize with acquire & release
  2022-04-17  8:32 [PATCH V3 0/3] csky: Optimize with acquire & release for atomic & cmpxchg guoren
@ 2022-04-17  8:32 ` guoren
  2022-04-22  3:20   ` Boqun Feng
  2022-04-17  8:32 ` [PATCH V3 2/3] csky: atomic: Add custom atomic.h implementation guoren
  2022-04-17  8:32 ` [PATCH V3 3/3] csky: atomic: Add conditional atomic operations' optimization guoren
  2 siblings, 1 reply; 6+ messages in thread
From: guoren @ 2022-04-17  8:32 UTC (permalink / raw)
  To: guoren, arnd, mark.rutland, boqun.feng, peterz, will
  Cc: linux-arch, linux-kernel, linux-csky, Guo Ren

From: Guo Ren <guoren@linux.alibaba.com>

Optimize arch_xchg|cmpxchg|cmpxchg_local with ASM acquire|release
instructions instead of the previous C-based implementation.
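
For reference, the previous C-based arch_cmpxchg() was built from the
relaxed primitive plus separate fences (this is the definition being
replaced in the diff below):

  #define arch_cmpxchg(ptr, o, n)					\
  ({								\
  	__typeof__(*(ptr)) __ret;				\
  	__smp_release_fence();					\
  	__ret = arch_cmpxchg_relaxed(ptr, o, n);		\
  	__smp_acquire_fence();					\
  	__ret;							\
  })

This patch folds RELEASE_FENCE/FULL_FENCE directly into the ldex/stex
sequence instead.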

Important reference commit by Mark Rutland:
8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
full barrier semantics")

Link: https://lore.kernel.org/linux-riscv/CAJF2gTSAxpAi=LbAdu7jntZRUa=-dJwL0VfmDfBV5MHB=rcZ-w@mail.gmail.com/T/#m27a0f1342995deae49ce1d0e1f2683f8a181d6c3
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
---
 arch/csky/include/asm/barrier.h | 11 +++---
 arch/csky/include/asm/cmpxchg.h | 64 ++++++++++++++++++++++++++++++---
 2 files changed, 67 insertions(+), 8 deletions(-)

diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h
index f4045dd53e17..fb63335ffa33 100644
--- a/arch/csky/include/asm/barrier.h
+++ b/arch/csky/include/asm/barrier.h
@@ -37,17 +37,21 @@
  * bar.brar
  * bar.bwaw
  */
+#define ACQUIRE_FENCE		".long 0x8427c000\n"
+#define RELEASE_FENCE		".long 0x842ec000\n"
+#define FULL_FENCE		".long 0x842fc000\n"
+
 #define __bar_brw()	asm volatile (".long 0x842cc000\n":::"memory")
 #define __bar_br()	asm volatile (".long 0x8424c000\n":::"memory")
 #define __bar_bw()	asm volatile (".long 0x8428c000\n":::"memory")
 #define __bar_arw()	asm volatile (".long 0x8423c000\n":::"memory")
 #define __bar_ar()	asm volatile (".long 0x8421c000\n":::"memory")
 #define __bar_aw()	asm volatile (".long 0x8422c000\n":::"memory")
-#define __bar_brwarw()	asm volatile (".long 0x842fc000\n":::"memory")
-#define __bar_brarw()	asm volatile (".long 0x8427c000\n":::"memory")
+#define __bar_brwarw()	asm volatile (FULL_FENCE:::"memory")
+#define __bar_brarw()	asm volatile (ACQUIRE_FENCE:::"memory")
 #define __bar_bwarw()	asm volatile (".long 0x842bc000\n":::"memory")
 #define __bar_brwar()	asm volatile (".long 0x842dc000\n":::"memory")
-#define __bar_brwaw()	asm volatile (".long 0x842ec000\n":::"memory")
+#define __bar_brwaw()	asm volatile (RELEASE_FENCE:::"memory")
 #define __bar_brar()	asm volatile (".long 0x8425c000\n":::"memory")
 #define __bar_brar()	asm volatile (".long 0x8425c000\n":::"memory")
 #define __bar_bwaw()	asm volatile (".long 0x842ac000\n":::"memory")
@@ -56,7 +60,6 @@
 #define __smp_rmb()	__bar_brar()
 #define __smp_wmb()	__bar_bwaw()
 
-#define ACQUIRE_FENCE		".long 0x8427c000\n"
 #define __smp_acquire_fence()	__bar_brarw()
 #define __smp_release_fence()	__bar_brwaw()
 
diff --git a/arch/csky/include/asm/cmpxchg.h b/arch/csky/include/asm/cmpxchg.h
index d1bef11f8dc9..06c550448bf1 100644
--- a/arch/csky/include/asm/cmpxchg.h
+++ b/arch/csky/include/asm/cmpxchg.h
@@ -64,15 +64,71 @@ extern void __bad_xchg(void);
 #define arch_cmpxchg_relaxed(ptr, o, n) \
 	(__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
 
-#define arch_cmpxchg(ptr, o, n) 				\
+#define __cmpxchg_acquire(ptr, old, new, size)			\
 ({								\
+	__typeof__(ptr) __ptr = (ptr);				\
+	__typeof__(new) __new = (new);				\
+	__typeof__(new) __tmp;					\
+	__typeof__(old) __old = (old);				\
+	__typeof__(*(ptr)) __ret;				\
+	switch (size) {						\
+	case 4:							\
+		asm volatile (					\
+		"1:	ldex.w		%0, (%3) \n"		\
+		"	cmpne		%0, %4   \n"		\
+		"	bt		2f       \n"		\
+		"	mov		%1, %2   \n"		\
+		"	stex.w		%1, (%3) \n"		\
+		"	bez		%1, 1b   \n"		\
+		ACQUIRE_FENCE					\
+		"2:				 \n"		\
+			: "=&r" (__ret), "=&r" (__tmp)		\
+			: "r" (__new), "r"(__ptr), "r"(__old)	\
+			:);					\
+		break;						\
+	default:						\
+		__bad_xchg();					\
+	}							\
+	__ret;							\
+})
+
+#define arch_cmpxchg_acquire(ptr, o, n) \
+	(__cmpxchg_acquire((ptr), (o), (n), sizeof(*(ptr))))
+
+#define __cmpxchg(ptr, old, new, size)				\
+({								\
+	__typeof__(ptr) __ptr = (ptr);				\
+	__typeof__(new) __new = (new);				\
+	__typeof__(new) __tmp;					\
+	__typeof__(old) __old = (old);				\
 	__typeof__(*(ptr)) __ret;				\
-	__smp_release_fence();					\
-	__ret = arch_cmpxchg_relaxed(ptr, o, n);		\
-	__smp_acquire_fence();					\
+	switch (size) {						\
+	case 4:							\
+		asm volatile (					\
+		"1:	ldex.w		%0, (%3) \n"		\
+		"	cmpne		%0, %4   \n"		\
+		"	bt		2f       \n"		\
+		"	mov		%1, %2   \n"		\
+		RELEASE_FENCE					\
+		"	stex.w		%1, (%3) \n"		\
+		"	bez		%1, 1b   \n"		\
+		FULL_FENCE					\
+		"2:				 \n"		\
+			: "=&r" (__ret), "=&r" (__tmp)		\
+			: "r" (__new), "r"(__ptr), "r"(__old)	\
+			:);					\
+		break;						\
+	default:						\
+		__bad_xchg();					\
+	}							\
 	__ret;							\
 })
 
+#define arch_cmpxchg(ptr, o, n) \
+	(__cmpxchg((ptr), (o), (n), sizeof(*(ptr))))
+
+#define arch_cmpxchg_local(ptr, o, n)				\
+	(__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
 #else
 #include <asm-generic/cmpxchg.h>
 #endif
-- 
2.25.1



* [PATCH V3 2/3] csky: atomic: Add custom atomic.h implementation
  2022-04-17  8:32 [PATCH V3 0/3] csky: Optimize with acquire & release for atomic & cmpxchg guoren
  2022-04-17  8:32 ` [PATCH V3 1/3] csky: cmpxchg: Optimize with acquire & release guoren
@ 2022-04-17  8:32 ` guoren
  2022-04-17  8:32 ` [PATCH V3 3/3] csky: atomic: Add conditional atomic operations' optimization guoren
  2 siblings, 0 replies; 6+ messages in thread
From: guoren @ 2022-04-17  8:32 UTC (permalink / raw)
  To: guoren, arnd, mark.rutland, boqun.feng, peterz, will
  Cc: linux-arch, linux-kernel, linux-csky, Guo Ren

From: Guo Ren <guoren@linux.alibaba.com>

The generic atomic.h uses cmpxchg to implement the atomic operations,
which results in a dual loop and weakens the forward-progress
guarantee. This patch implements csky custom atomic operations with
ldex/stex instructions for the best performance.
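
For example, ATOMIC_OP(add) below expands to roughly the following
(comments added), a single ldex/stex retry loop with no outer C loop:

  static __always_inline void arch_atomic_add(int i, atomic_t *v)
  {
  	unsigned long tmp;

  	__asm__ __volatile__ (
  	"1:	ldex.w		%0, (%2)	\n"	/* load-exclusive v->counter */
  	"	add		%0, %1		\n"	/* tmp += i */
  	"	stex.w		%0, (%2)	\n"	/* store-exclusive; %0 = success flag */
  	"	bez		%0, 1b		\n"	/* retry if the store failed */
  	: "=&r" (tmp)
  	: "r" (i), "r" (&v->counter)
  	: "memory");
  }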

Important reference commit by Mark Rutland:
8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
full barrier semantics")

Link: https://lore.kernel.org/linux-riscv/CAJF2gTSAxpAi=LbAdu7jntZRUa=-dJwL0VfmDfBV5MHB=rcZ-w@mail.gmail.com/T/#m27a0f1342995deae49ce1d0e1f2683f8a181d6c3
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
---
 arch/csky/include/asm/atomic.h | 154 +++++++++++++++++++++++++++++++++
 1 file changed, 154 insertions(+)
 create mode 100644 arch/csky/include/asm/atomic.h

diff --git a/arch/csky/include/asm/atomic.h b/arch/csky/include/asm/atomic.h
new file mode 100644
index 000000000000..5ecc657a2a66
--- /dev/null
+++ b/arch/csky/include/asm/atomic.h
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_CSKY_ATOMIC_H
+#define __ASM_CSKY_ATOMIC_H
+
+#ifdef CONFIG_SMP
+#include <asm-generic/atomic64.h>
+
+#include <asm/cmpxchg.h>
+#include <asm/barrier.h>
+
+#define __atomic_acquire_fence()	__smp_acquire_fence()
+
+#define __atomic_release_fence()	__smp_release_fence()
+
+static __always_inline int arch_atomic_read(const atomic_t *v)
+{
+	return READ_ONCE(v->counter);
+}
+static __always_inline void arch_atomic_set(atomic_t *v, int i)
+{
+	WRITE_ONCE(v->counter, i);
+}
+
+#define ATOMIC_OP(op)							\
+static __always_inline							\
+void arch_atomic_##op(int i, atomic_t *v)				\
+{									\
+	unsigned long tmp;						\
+	__asm__ __volatile__ (						\
+	"1:	ldex.w		%0, (%2)	\n"			\
+	"	" #op "		%0, %1		\n"			\
+	"	stex.w		%0, (%2)	\n"			\
+	"	bez		%0, 1b		\n"			\
+	: "=&r" (tmp)							\
+	: "r" (i), "r" (&v->counter)					\
+	: "memory");							\
+}
+
+ATOMIC_OP(add)
+ATOMIC_OP(sub)
+ATOMIC_OP(and)
+ATOMIC_OP( or)
+ATOMIC_OP(xor)
+
+#undef ATOMIC_OP
+
+#define ATOMIC_FETCH_OP(op)						\
+static __always_inline							\
+int arch_atomic_fetch_##op##_relaxed(int i, atomic_t *v)		\
+{									\
+	register int ret, tmp;						\
+	__asm__ __volatile__ (						\
+	"1:	ldex.w		%0, (%3) \n"				\
+	"	mov		%1, %0   \n"				\
+	"	" #op "		%0, %2   \n"				\
+	"	stex.w		%0, (%3) \n"				\
+	"	bez		%0, 1b   \n"				\
+		: "=&r" (tmp), "=&r" (ret)				\
+		: "r" (i), "r"(&v->counter) 				\
+		: "memory");						\
+	return ret;							\
+}
+
+#define ATOMIC_OP_RETURN(op, c_op)					\
+static __always_inline							\
+int arch_atomic_##op##_return_relaxed(int i, atomic_t *v)		\
+{									\
+	return arch_atomic_fetch_##op##_relaxed(i, v) c_op i;		\
+}									\
+static __always_inline							\
+int arch_atomic_##op##_return(int i, atomic_t *v)			\
+{									\
+	return arch_atomic_fetch_##op(i, v) c_op i;			\
+}
+
+#define ATOMIC_OPS(op, c_op)						\
+	ATOMIC_FETCH_OP(op)						\
+	ATOMIC_OP_RETURN(op, c_op)
+
+ATOMIC_OPS(add, +)
+ATOMIC_OPS(sub, -)
+
+#define arch_atomic_fetch_add_relaxed	arch_atomic_fetch_add_relaxed
+#define arch_atomic_fetch_sub_relaxed	arch_atomic_fetch_sub_relaxed
+#define arch_atomic_fetch_add		arch_atomic_fetch_add
+#define arch_atomic_fetch_sub		arch_atomic_fetch_sub
+
+#define arch_atomic_add_return_relaxed	arch_atomic_add_return_relaxed
+#define arch_atomic_sub_return_relaxed	arch_atomic_sub_return_relaxed
+#define arch_atomic_add_return		arch_atomic_add_return
+#define arch_atomic_sub_return		arch_atomic_sub_return
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+
+#define ATOMIC_OPS(op)							\
+	ATOMIC_FETCH_OP(op)
+
+ATOMIC_OPS(and)
+ATOMIC_OPS( or)
+ATOMIC_OPS(xor)
+
+#define arch_atomic_fetch_and_relaxed	arch_atomic_fetch_and_relaxed
+#define arch_atomic_fetch_or_relaxed	arch_atomic_fetch_or_relaxed
+#define arch_atomic_fetch_xor_relaxed	arch_atomic_fetch_xor_relaxed
+#define arch_atomic_fetch_and		arch_atomic_fetch_and
+#define arch_atomic_fetch_or		arch_atomic_fetch_or
+#define arch_atomic_fetch_xor		arch_atomic_fetch_xor
+
+#undef ATOMIC_OPS
+
+#undef ATOMIC_FETCH_OP
+
+#define ATOMIC_OP()							\
+static __always_inline							\
+int arch_atomic_xchg_relaxed(atomic_t *v, int n)			\
+{									\
+	return __xchg_relaxed(n, &(v->counter), 4);			\
+}									\
+static __always_inline							\
+int arch_atomic_xchg(atomic_t *v, int n)				\
+{									\
+	return __xchg(n, &(v->counter), 4);				\
+}									\
+static __always_inline							\
+int arch_atomic_cmpxchg_relaxed(atomic_t *v, int o, int n)		\
+{									\
+	return __cmpxchg_relaxed(&(v->counter), o, n, 4);		\
+}									\
+static __always_inline							\
+int arch_atomic_cmpxchg(atomic_t *v, int o, int n)			\
+{									\
+	return __cmpxchg(&(v->counter), o, n, 4);			\
+}
+
+#define ATOMIC_OPS()							\
+	ATOMIC_OP()
+
+ATOMIC_OPS()
+
+#define arch_atomic_xchg_relaxed	arch_atomic_xchg_relaxed
+#define arch_atomic_xchg		arch_atomic_xchg
+#define arch_atomic_cmpxchg_relaxed	arch_atomic_cmpxchg_relaxed
+#define arch_atomic_cmpxchg		arch_atomic_cmpxchg
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP
+
+#else
+#include <asm-generic/atomic.h>
+#endif
+
+#endif /* __ASM_CSKY_ATOMIC_H */
-- 
2.25.1



* [PATCH V3 3/3] csky: atomic: Add conditional atomic operations' optimization
  2022-04-17  8:32 [PATCH V3 0/3] csky: Optimize with acquire & release for atomic & cmpxchg guoren
  2022-04-17  8:32 ` [PATCH V3 1/3] csky: cmpxchg: Optimize with acquire & release guoren
  2022-04-17  8:32 ` [PATCH V3 2/3] csky: atomic: Add custom atomic.h implementation guoren
@ 2022-04-17  8:32 ` guoren
  2 siblings, 0 replies; 6+ messages in thread
From: guoren @ 2022-04-17  8:32 UTC (permalink / raw)
  To: guoren, arnd, mark.rutland, boqun.feng, peterz, will
  Cc: linux-arch, linux-kernel, linux-csky, Guo Ren

From: Guo Ren <guoren@linux.alibaba.com>

Add conditional atomic operations' optimization (see the usage sketch
after this list):
 - arch_atomic_fetch_add_unless
 - arch_atomic_inc_unless_negative
 - arch_atomic_dec_unless_positive
 - arch_atomic_dec_if_positive
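
A usage sketch for the first of these (illustrative only: get_ref() is
a made-up helper, and callers would normally go through the
atomic_add_unless()/atomic_inc_not_zero() wrappers rather than the
arch_ entry point):

  /* Take a reference only if the count is not already zero. */
  static bool get_ref(atomic_t *refcnt)
  {
  	/* Adds 1 unless the old value is 0; returns the old value. */
  	return arch_atomic_fetch_add_unless(refcnt, 1, 0) != 0;
  }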

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@kernel.org>
---
 arch/csky/include/asm/atomic.h | 95 ++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)

diff --git a/arch/csky/include/asm/atomic.h b/arch/csky/include/asm/atomic.h
index 5ecc657a2a66..3f2917b748c3 100644
--- a/arch/csky/include/asm/atomic.h
+++ b/arch/csky/include/asm/atomic.h
@@ -112,6 +112,101 @@ ATOMIC_OPS(xor)
 
 #undef ATOMIC_FETCH_OP
 
+static __always_inline int
+arch_atomic_fetch_add_unless(atomic_t *v, int a, int u)
+{
+	int prev, tmp;
+
+	__asm__ __volatile__ (
+		"1:	ldex.w		%0, (%3)	\n"
+		"	cmpne		%0, %4		\n"
+		"	bf		2f		\n"
+		"	mov		%1, %0		\n"
+		"	add		%1, %2		\n"
+		RELEASE_FENCE
+		"	stex.w		%1, (%3)	\n"
+		"	bez		%1, 1b		\n"
+		FULL_FENCE
+		"2:\n"
+		: "=&r" (prev), "=&r" (tmp)
+		: "r" (a), "r" (&v->counter), "r" (u)
+		: "memory");
+
+	return prev;
+}
+#define arch_atomic_fetch_add_unless arch_atomic_fetch_add_unless
+
+static __always_inline bool
+arch_atomic_inc_unless_negative(atomic_t *v)
+{
+	int rc, tmp;
+
+	__asm__ __volatile__ (
+		"1:	ldex.w		%0, (%2)	\n"
+		"	movi		%1, 0		\n"
+		"	blz		%0, 2f		\n"
+		"	movi		%1, 1		\n"
+		"	addi		%0, 1		\n"
+		RELEASE_FENCE
+		"	stex.w		%0, (%2)	\n"
+		"	bez		%0, 1b		\n"
+		FULL_FENCE
+		"2:\n"
+		: "=&r" (tmp), "=&r" (rc)
+		: "r" (&v->counter)
+		: "memory");
+
+	return rc ? true : false;
+
+}
+#define arch_atomic_inc_unless_negative arch_atomic_inc_unless_negative
+
+static __always_inline bool
+arch_atomic_dec_unless_positive(atomic_t *v)
+{
+	int rc, tmp;
+
+	__asm__ __volatile__ (
+		"1:	ldex.w		%0, (%2)	\n"
+		"	movi		%1, 0		\n"
+		"	bhz		%0, 2f		\n"
+		"	movi		%1, 1		\n"
+		"	subi		%0, 1		\n"
+		RELEASE_FENCE
+		"	stex.w		%0, (%2)	\n"
+		"	bez		%0, 1b		\n"
+		FULL_FENCE
+		"2:\n"
+		: "=&r" (tmp), "=&r" (rc)
+		: "r" (&v->counter)
+		: "memory");
+
+	return rc ? true : false;
+}
+#define arch_atomic_dec_unless_positive arch_atomic_dec_unless_positive
+
+static __always_inline int
+arch_atomic_dec_if_positive(atomic_t *v)
+{
+	int dec, tmp;
+
+	__asm__ __volatile__ (
+		"1:	ldex.w		%0, (%2)	\n"
+		"	subi		%1, %0, 1	\n"
+		"	blz		%1, 2f		\n"
+		RELEASE_FENCE
+		"	stex.w		%1, (%2)	\n"
+		"	bez		%1, 1b		\n"
+		FULL_FENCE
+		"2:\n"
+		: "=&r" (dec), "=&r" (tmp)
+		: "r" (&v->counter)
+		: "memory");
+
+	return dec - 1;
+}
+#define arch_atomic_dec_if_positive arch_atomic_dec_if_positive
+
 #define ATOMIC_OP()							\
 static __always_inline							\
 int arch_atomic_xchg_relaxed(atomic_t *v, int n)			\
-- 
2.25.1



* Re: [PATCH V3 1/3] csky: cmpxchg: Optimize with acquire & release
  2022-04-17  8:32 ` [PATCH V3 1/3] csky: cmpxchg: Optimize with acquire & release guoren
@ 2022-04-22  3:20   ` Boqun Feng
  2022-04-22  3:45     ` Guo Ren
  0 siblings, 1 reply; 6+ messages in thread
From: Boqun Feng @ 2022-04-22  3:20 UTC (permalink / raw)
  To: guoren
  Cc: arnd, mark.rutland, peterz, will, linux-arch, linux-kernel,
	linux-csky, Guo Ren


On Sun, Apr 17, 2022 at 04:32:02PM +0800, guoren@kernel.org wrote:
> From: Guo Ren <guoren@linux.alibaba.com>
> 
> Optimize arch_xchg|cmpxchg|cmpxchg_local with ASM acquire|release
> instructions instead of previous C based.
> 
> Important reference comment by Rutland:
> 8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
> full barrier semantics")
> 
> Link: https://lore.kernel.org/linux-riscv/CAJF2gTSAxpAi=LbAdu7jntZRUa=-dJwL0VfmDfBV5MHB=rcZ-w@mail.gmail.com/T/#m27a0f1342995deae49ce1d0e1f2683f8a181d6c3
> Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> Signed-off-by: Guo Ren <guoren@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> ---
>  arch/csky/include/asm/barrier.h | 11 +++---
>  arch/csky/include/asm/cmpxchg.h | 64 ++++++++++++++++++++++++++++++---
>  2 files changed, 67 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h
> index f4045dd53e17..fb63335ffa33 100644
> --- a/arch/csky/include/asm/barrier.h
> +++ b/arch/csky/include/asm/barrier.h
> @@ -37,17 +37,21 @@
>   * bar.brar
>   * bar.bwaw
>   */
> +#define ACQUIRE_FENCE		".long 0x8427c000\n"
> +#define RELEASE_FENCE		".long 0x842ec000\n"
> +#define FULL_FENCE		".long 0x842fc000\n"
> +
>  #define __bar_brw()	asm volatile (".long 0x842cc000\n":::"memory")
>  #define __bar_br()	asm volatile (".long 0x8424c000\n":::"memory")
>  #define __bar_bw()	asm volatile (".long 0x8428c000\n":::"memory")
>  #define __bar_arw()	asm volatile (".long 0x8423c000\n":::"memory")
>  #define __bar_ar()	asm volatile (".long 0x8421c000\n":::"memory")
>  #define __bar_aw()	asm volatile (".long 0x8422c000\n":::"memory")
> -#define __bar_brwarw()	asm volatile (".long 0x842fc000\n":::"memory")
> -#define __bar_brarw()	asm volatile (".long 0x8427c000\n":::"memory")
> +#define __bar_brwarw()	asm volatile (FULL_FENCE:::"memory")
> +#define __bar_brarw()	asm volatile (ACQUIRE_FENCE:::"memory")
>  #define __bar_bwarw()	asm volatile (".long 0x842bc000\n":::"memory")
>  #define __bar_brwar()	asm volatile (".long 0x842dc000\n":::"memory")
> -#define __bar_brwaw()	asm volatile (".long 0x842ec000\n":::"memory")
> +#define __bar_brwaw()	asm volatile (RELEASE_FENCE:::"memory")
>  #define __bar_brar()	asm volatile (".long 0x8425c000\n":::"memory")
>  #define __bar_brar()	asm volatile (".long 0x8425c000\n":::"memory")
>  #define __bar_bwaw()	asm volatile (".long 0x842ac000\n":::"memory")
> @@ -56,7 +60,6 @@
>  #define __smp_rmb()	__bar_brar()
>  #define __smp_wmb()	__bar_bwaw()
>  
> -#define ACQUIRE_FENCE		".long 0x8427c000\n"
>  #define __smp_acquire_fence()	__bar_brarw()
>  #define __smp_release_fence()	__bar_brwaw()
>  
> diff --git a/arch/csky/include/asm/cmpxchg.h b/arch/csky/include/asm/cmpxchg.h
> index d1bef11f8dc9..06c550448bf1 100644
> --- a/arch/csky/include/asm/cmpxchg.h
> +++ b/arch/csky/include/asm/cmpxchg.h
> @@ -64,15 +64,71 @@ extern void __bad_xchg(void);
>  #define arch_cmpxchg_relaxed(ptr, o, n) \
>  	(__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
>  
> -#define arch_cmpxchg(ptr, o, n) 				\
> +#define __cmpxchg_acquire(ptr, old, new, size)			\
>  ({								\
> +	__typeof__(ptr) __ptr = (ptr);				\
> +	__typeof__(new) __new = (new);				\
> +	__typeof__(new) __tmp;					\
> +	__typeof__(old) __old = (old);				\
> +	__typeof__(*(ptr)) __ret;				\
> +	switch (size) {						\
> +	case 4:							\
> +		asm volatile (					\
> +		"1:	ldex.w		%0, (%3) \n"		\
> +		"	cmpne		%0, %4   \n"		\
> +		"	bt		2f       \n"		\
> +		"	mov		%1, %2   \n"		\
> +		"	stex.w		%1, (%3) \n"		\
> +		"	bez		%1, 1b   \n"		\
> +		ACQUIRE_FENCE					\
> +		"2:				 \n"		\
> +			: "=&r" (__ret), "=&r" (__tmp)		\
> +			: "r" (__new), "r"(__ptr), "r"(__old)	\
> +			:);					\
> +		break;						\
> +	default:						\
> +		__bad_xchg();					\
> +	}							\
> +	__ret;							\
> +})
> +
> +#define arch_cmpxchg_acquire(ptr, o, n) \
> +	(__cmpxchg_acquire((ptr), (o), (n), sizeof(*(ptr))))
> +
> +#define __cmpxchg(ptr, old, new, size)				\
> +({								\
> +	__typeof__(ptr) __ptr = (ptr);				\
> +	__typeof__(new) __new = (new);				\
> +	__typeof__(new) __tmp;					\
> +	__typeof__(old) __old = (old);				\
>  	__typeof__(*(ptr)) __ret;				\
> -	__smp_release_fence();					\
> -	__ret = arch_cmpxchg_relaxed(ptr, o, n);		\
> -	__smp_acquire_fence();					\
> +	switch (size) {						\
> +	case 4:							\
> +		asm volatile (					\
> +		"1:	ldex.w		%0, (%3) \n"		\
> +		"	cmpne		%0, %4   \n"		\
> +		"	bt		2f       \n"		\
> +		"	mov		%1, %2   \n"		\
> +		RELEASE_FENCE					\

FWIW, you probably need to make sure that a barrier instruction inside
an lr/sc loop is a good thing. IIUC, the execution time of a barrier
instruction is determined by the status of store buffers and invalidate
queues (and probably other stuff), so it may increase the execution
time of the lr/sc loop and make it less likely to succeed. But this really
depends on how the arch executes these instructions.

Regards,
Boqun

> +		"	stex.w		%1, (%3) \n"		\
> +		"	bez		%1, 1b   \n"		\
> +		FULL_FENCE					\
> +		"2:				 \n"		\
> +			: "=&r" (__ret), "=&r" (__tmp)		\
> +			: "r" (__new), "r"(__ptr), "r"(__old)	\
> +			:);					\
> +		break;						\
> +	default:						\
> +		__bad_xchg();					\
> +	}							\
>  	__ret;							\
>  })
>  
> +#define arch_cmpxchg(ptr, o, n) \
> +	(__cmpxchg((ptr), (o), (n), sizeof(*(ptr))))
> +
> +#define arch_cmpxchg_local(ptr, o, n)				\
> +	(__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
>  #else
>  #include <asm-generic/cmpxchg.h>
>  #endif
> -- 
> 2.25.1
> 



* Re: [PATCH V3 1/3] csky: cmpxchg: Optimize with acquire & release
  2022-04-22  3:20   ` Boqun Feng
@ 2022-04-22  3:45     ` Guo Ren
  0 siblings, 0 replies; 6+ messages in thread
From: Guo Ren @ 2022-04-22  3:45 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Arnd Bergmann, Mark Rutland, Peter Zijlstra, Will Deacon,
	linux-arch, Linux Kernel Mailing List, linux-csky, Guo Ren

On Fri, Apr 22, 2022 at 11:20 AM Boqun Feng <boqun.feng@gmail.com> wrote:
>
> On Sun, Apr 17, 2022 at 04:32:02PM +0800, guoren@kernel.org wrote:
> > From: Guo Ren <guoren@linux.alibaba.com>
> >
> > Optimize arch_xchg|cmpxchg|cmpxchg_local with ASM acquire|release
> > instructions instead of previous C based.
> >
> > Important reference comment by Rutland:
> > 8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
> > full barrier semantics")
> >
> > Link: https://lore.kernel.org/linux-riscv/CAJF2gTSAxpAi=LbAdu7jntZRUa=-dJwL0VfmDfBV5MHB=rcZ-w@mail.gmail.com/T/#m27a0f1342995deae49ce1d0e1f2683f8a181d6c3
> > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > Signed-off-by: Guo Ren <guoren@kernel.org>
> > Cc: Mark Rutland <mark.rutland@arm.com>
> > ---
> >  arch/csky/include/asm/barrier.h | 11 +++---
> >  arch/csky/include/asm/cmpxchg.h | 64 ++++++++++++++++++++++++++++++---
> >  2 files changed, 67 insertions(+), 8 deletions(-)
> >
> > diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h
> > index f4045dd53e17..fb63335ffa33 100644
> > --- a/arch/csky/include/asm/barrier.h
> > +++ b/arch/csky/include/asm/barrier.h
> > @@ -37,17 +37,21 @@
> >   * bar.brar
> >   * bar.bwaw
> >   */
> > +#define ACQUIRE_FENCE                ".long 0x8427c000\n"
> > +#define RELEASE_FENCE                ".long 0x842ec000\n"
> > +#define FULL_FENCE           ".long 0x842fc000\n"
> > +
> >  #define __bar_brw()  asm volatile (".long 0x842cc000\n":::"memory")
> >  #define __bar_br()   asm volatile (".long 0x8424c000\n":::"memory")
> >  #define __bar_bw()   asm volatile (".long 0x8428c000\n":::"memory")
> >  #define __bar_arw()  asm volatile (".long 0x8423c000\n":::"memory")
> >  #define __bar_ar()   asm volatile (".long 0x8421c000\n":::"memory")
> >  #define __bar_aw()   asm volatile (".long 0x8422c000\n":::"memory")
> > -#define __bar_brwarw()       asm volatile (".long 0x842fc000\n":::"memory")
> > -#define __bar_brarw()        asm volatile (".long 0x8427c000\n":::"memory")
> > +#define __bar_brwarw()       asm volatile (FULL_FENCE:::"memory")
> > +#define __bar_brarw()        asm volatile (ACQUIRE_FENCE:::"memory")
> >  #define __bar_bwarw()        asm volatile (".long 0x842bc000\n":::"memory")
> >  #define __bar_brwar()        asm volatile (".long 0x842dc000\n":::"memory")
> > -#define __bar_brwaw()        asm volatile (".long 0x842ec000\n":::"memory")
> > +#define __bar_brwaw()        asm volatile (RELEASE_FENCE:::"memory")
> >  #define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory")
> >  #define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory")
> >  #define __bar_bwaw() asm volatile (".long 0x842ac000\n":::"memory")
> > @@ -56,7 +60,6 @@
> >  #define __smp_rmb()  __bar_brar()
> >  #define __smp_wmb()  __bar_bwaw()
> >
> > -#define ACQUIRE_FENCE                ".long 0x8427c000\n"
> >  #define __smp_acquire_fence()        __bar_brarw()
> >  #define __smp_release_fence()        __bar_brwaw()
> >
> > diff --git a/arch/csky/include/asm/cmpxchg.h b/arch/csky/include/asm/cmpxchg.h
> > index d1bef11f8dc9..06c550448bf1 100644
> > --- a/arch/csky/include/asm/cmpxchg.h
> > +++ b/arch/csky/include/asm/cmpxchg.h
> > @@ -64,15 +64,71 @@ extern void __bad_xchg(void);
> >  #define arch_cmpxchg_relaxed(ptr, o, n) \
> >       (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
> >
> > -#define arch_cmpxchg(ptr, o, n)                              \
> > +#define __cmpxchg_acquire(ptr, old, new, size)                       \
> >  ({                                                           \
> > +     __typeof__(ptr) __ptr = (ptr);                          \
> > +     __typeof__(new) __new = (new);                          \
> > +     __typeof__(new) __tmp;                                  \
> > +     __typeof__(old) __old = (old);                          \
> > +     __typeof__(*(ptr)) __ret;                               \
> > +     switch (size) {                                         \
> > +     case 4:                                                 \
> > +             asm volatile (                                  \
> > +             "1:     ldex.w          %0, (%3) \n"            \
> > +             "       cmpne           %0, %4   \n"            \
> > +             "       bt              2f       \n"            \
> > +             "       mov             %1, %2   \n"            \
> > +             "       stex.w          %1, (%3) \n"            \
> > +             "       bez             %1, 1b   \n"            \
> > +             ACQUIRE_FENCE                                   \
> > +             "2:                              \n"            \
> > +                     : "=&r" (__ret), "=&r" (__tmp)          \
> > +                     : "r" (__new), "r"(__ptr), "r"(__old)   \
> > +                     :);                                     \
> > +             break;                                          \
> > +     default:                                                \
> > +             __bad_xchg();                                   \
> > +     }                                                       \
> > +     __ret;                                                  \
> > +})
> > +
> > +#define arch_cmpxchg_acquire(ptr, o, n) \
> > +     (__cmpxchg_acquire((ptr), (o), (n), sizeof(*(ptr))))
> > +
> > +#define __cmpxchg(ptr, old, new, size)                               \
> > +({                                                           \
> > +     __typeof__(ptr) __ptr = (ptr);                          \
> > +     __typeof__(new) __new = (new);                          \
> > +     __typeof__(new) __tmp;                                  \
> > +     __typeof__(old) __old = (old);                          \
> >       __typeof__(*(ptr)) __ret;                               \
> > -     __smp_release_fence();                                  \
> > -     __ret = arch_cmpxchg_relaxed(ptr, o, n);                \
> > -     __smp_acquire_fence();                                  \
> > +     switch (size) {                                         \
> > +     case 4:                                                 \
> > +             asm volatile (                                  \
> > +             "1:     ldex.w          %0, (%3) \n"            \
> > +             "       cmpne           %0, %4   \n"            \
> > +             "       bt              2f       \n"            \
> > +             "       mov             %1, %2   \n"            \
> > +             RELEASE_FENCE                                   \
>
> FWIW, you probably need to make sure that a barrier instruction inside
> an lr/sc loop is a good thing. IIUC, the execution time of a barrier
> instruction is determined by the status of store buffers and invalidate
> queues (and probably other stuffs), so it may increase the execution
> time of the lr/sc loop, and make it unlikely to succeed. But this really
> depends on how the arch executes these instructions.
Yes, you are right. The FENCE would add overhead to the lr/sc loop and
make it harder to succeed.

I will fix it up and include your comment in the next version of the
patchset.

>
> Regards,
> Boqun
>
> > +             "       stex.w          %1, (%3) \n"            \
> > +             "       bez             %1, 1b   \n"            \
> > +             FULL_FENCE                                      \
> > +             "2:                              \n"            \
> > +                     : "=&r" (__ret), "=&r" (__tmp)          \
> > +                     : "r" (__new), "r"(__ptr), "r"(__old)   \
> > +                     :);                                     \
> > +             break;                                          \
> > +     default:                                                \
> > +             __bad_xchg();                                   \
> > +     }                                                       \
> >       __ret;                                                  \
> >  })
> >
> > +#define arch_cmpxchg(ptr, o, n) \
> > +     (__cmpxchg((ptr), (o), (n), sizeof(*(ptr))))
> > +
> > +#define arch_cmpxchg_local(ptr, o, n)                                \
> > +     (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
> >  #else
> >  #include <asm-generic/cmpxchg.h>
> >  #endif
> > --
> > 2.25.1
> >



--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/


