* [PATCH V2 0/2] csky: Optimize with acquire & release for atomic & cmpxchg
@ 2022-04-11 14:51 guoren
2022-04-11 14:51 ` [PATCH V2 1/2] csky: cmpxchg: Optimize with acquire & release guoren
2022-04-11 14:51 ` [PATCH V2 2/2] csky: atomic: Add custom atomic.h implementation guoren
0 siblings, 2 replies; 4+ messages in thread
From: guoren @ 2022-04-11 14:51 UTC (permalink / raw)
To: guoren, arnd, mark.rutland; +Cc: linux-arch, linux-kernel, linux-csky, Guo Ren
From: Guo Ren <guoren@linux.alibaba.com>
Optimize arch_xchg|cmpxchg|cmpxchg_local with ASM acquire|release
fences instead of the previous C-based implementation.
The generic atomic.h uses cmpxchg to implement the atomic operations,
which causes a double loop (the retry loop around cmpxchg plus the
ll/sc loop inside cmpxchg itself) and weakens the forward-progress
guarantee. This patch implements csky custom atomic operations with
ldex/stex instructions for the best performance.
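For reference, a minimal C sketch of the shape of the cmpxchg-based
fallback (an illustration of the double-loop problem, not the exact
asm-generic code; fallback_fetch_add is a made-up name):

/*
 * Fallback shape: an outer retry loop around cmpxchg.  On csky the
 * cmpxchg is itself an ldex/stex loop, so a failed exclusive store
 * restarts two nested loops and weakens the forward-progress guarantee.
 */
static inline int fallback_fetch_add(atomic_t *v, int i)
{
	int old;

	do {
		old = arch_atomic_read(v);
	} while (arch_cmpxchg_relaxed(&v->counter, old, old + i) != old);

	return old;
}

The ldex/stex implementations in this series collapse this into a single
exclusive load/store loop.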
Important reference commit, pointed out by Mark Rutland:
8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
full barrier semantics")
Changes in V2:
- Fix up the use of acquire + release for full barrier semantics,
  as pointed out by Mark Rutland.
Guo Ren (2):
csky: cmpxchg: Optimize with acquire & release
csky: atomic: Add custom atomic.h implementation
arch/csky/include/asm/atomic.h | 130 ++++++++++++++++++++++++++++++++
arch/csky/include/asm/barrier.h | 8 +-
arch/csky/include/asm/cmpxchg.h | 61 +++++++++++++--
3 files changed, 190 insertions(+), 9 deletions(-)
create mode 100644 arch/csky/include/asm/atomic.h
--
2.25.1
* [PATCH V2 1/2] csky: cmpxchg: Optimize with acquire & release
2022-04-11 14:51 [PATCH V2 0/2] csky: Optimize with acquire & release for atomic & cmpxchg guoren
@ 2022-04-11 14:51 ` guoren
2022-04-11 14:51 ` [PATCH V2 2/2] csky: atomic: Add custom atomic.h implementation guoren
1 sibling, 0 replies; 4+ messages in thread
From: guoren @ 2022-04-11 14:51 UTC (permalink / raw)
To: guoren, arnd, mark.rutland; +Cc: linux-arch, linux-kernel, linux-csky, Guo Ren
From: Guo Ren <guoren@linux.alibaba.com>
Optimize arch_xchg|cmpxchg|cmpxchg_local with ASM acquire|release
fences instead of the previous C-based implementation.
Important reference commit, pointed out by Mark Rutland:
8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
full barrier semantics")
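As a reminder of why the unqualified arch_xchg()/arch_cmpxchg() need
full barrier semantics (a hedged illustration, not part of this patch;
producer/consumer are made-up helpers):

static int flag;
static int data;

void producer(void)
{
	WRITE_ONCE(data, 1);     /* must be visible before the swap */
	arch_xchg(&flag, 1);     /* fully ordered on both sides */
}

int consumer(void)
{
	if (arch_xchg(&flag, 0))         /* fully ordered again */
		return READ_ONCE(data);  /* must observe data == 1 */
	return 0;
}

Neither an acquire fence on the exclusive load nor a release fence
before the exclusive store is enough on its own, which is why the code
below keeps RELEASE_FENCE inside the ll/sc loop and adds a full
__smp_mb() after it.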
Link: https://lore.kernel.org/linux-riscv/CAJF2gTSAxpAi=LbAdu7jntZRUa=-dJwL0VfmDfBV5MHB=rcZ-w@mail.gmail.com/T/#m27a0f1342995deae49ce1d0e1f2683f8a181d6c3
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
---
Changes in V2:
- Fix up the use of acquire + release for full barrier semantics,
  as pointed out by Mark Rutland.
---
arch/csky/include/asm/barrier.h | 8 +++--
arch/csky/include/asm/cmpxchg.h | 61 +++++++++++++++++++++++++++++----
2 files changed, 60 insertions(+), 9 deletions(-)
diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h
index f4045dd53e17..a075f17d02dd 100644
--- a/arch/csky/include/asm/barrier.h
+++ b/arch/csky/include/asm/barrier.h
@@ -37,6 +37,9 @@
* bar.brar
* bar.bwaw
*/
+#define ACQUIRE_FENCE ".long 0x8427c000\n"
+#define RELEASE_FENCE ".long 0x842ec000\n"
+
#define __bar_brw() asm volatile (".long 0x842cc000\n":::"memory")
#define __bar_br() asm volatile (".long 0x8424c000\n":::"memory")
#define __bar_bw() asm volatile (".long 0x8428c000\n":::"memory")
@@ -44,10 +47,10 @@
#define __bar_ar() asm volatile (".long 0x8421c000\n":::"memory")
#define __bar_aw() asm volatile (".long 0x8422c000\n":::"memory")
#define __bar_brwarw() asm volatile (".long 0x842fc000\n":::"memory")
-#define __bar_brarw() asm volatile (".long 0x8427c000\n":::"memory")
+#define __bar_brarw() asm volatile (ACQUIRE_FENCE:::"memory")
#define __bar_bwarw() asm volatile (".long 0x842bc000\n":::"memory")
#define __bar_brwar() asm volatile (".long 0x842dc000\n":::"memory")
-#define __bar_brwaw() asm volatile (".long 0x842ec000\n":::"memory")
+#define __bar_brwaw() asm volatile (RELEASE_FENCE:::"memory")
#define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory")
#define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory")
#define __bar_bwaw() asm volatile (".long 0x842ac000\n":::"memory")
@@ -56,7 +59,6 @@
#define __smp_rmb() __bar_brar()
#define __smp_wmb() __bar_bwaw()
-#define ACQUIRE_FENCE ".long 0x8427c000\n"
#define __smp_acquire_fence() __bar_brarw()
#define __smp_release_fence() __bar_brwaw()
diff --git a/arch/csky/include/asm/cmpxchg.h b/arch/csky/include/asm/cmpxchg.h
index d1bef11f8dc9..1a6f2f445c12 100644
--- a/arch/csky/include/asm/cmpxchg.h
+++ b/arch/csky/include/asm/cmpxchg.h
@@ -30,10 +30,36 @@ extern void __bad_xchg(void);
} \
__ret; \
})
-
#define arch_xchg_relaxed(ptr, x) \
(__xchg_relaxed((x), (ptr), sizeof(*(ptr))))
+#define __xchg(new, ptr, size) \
+({ \
+ __typeof__(ptr) __ptr = (ptr); \
+ __typeof__(new) __new = (new); \
+ __typeof__(*(ptr)) __ret; \
+ unsigned long tmp; \
+ switch (size) { \
+ case 4: \
+ asm volatile ( \
+ "1: ldex.w %0, (%3) \n" \
+ " mov %1, %2 \n" \
+ RELEASE_FENCE \
+ " stex.w %1, (%3) \n" \
+ " bez %1, 1b \n" \
+ : "=&r" (__ret), "=&r" (tmp) \
+ : "r" (__new), "r"(__ptr) \
+ :); \
+ __smp_mb(); \
+ break; \
+ default: \
+ __bad_xchg(); \
+ } \
+ __ret; \
+})
+#define arch_xchg(ptr, x) \
+ (__xchg((x), (ptr), sizeof(*(ptr))))
+
#define __cmpxchg_relaxed(ptr, old, new, size) \
({ \
__typeof__(ptr) __ptr = (ptr); \
@@ -60,19 +86,42 @@ extern void __bad_xchg(void);
} \
__ret; \
})
-
#define arch_cmpxchg_relaxed(ptr, o, n) \
(__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
-#define arch_cmpxchg(ptr, o, n) \
+#define __cmpxchg(ptr, old, new, size) \
({ \
+ __typeof__(ptr) __ptr = (ptr); \
+ __typeof__(new) __new = (new); \
+ __typeof__(new) __tmp; \
+ __typeof__(old) __old = (old); \
__typeof__(*(ptr)) __ret; \
- __smp_release_fence(); \
- __ret = arch_cmpxchg_relaxed(ptr, o, n); \
- __smp_acquire_fence(); \
+ switch (size) { \
+ case 4: \
+ asm volatile ( \
+ "1: ldex.w %0, (%3) \n" \
+ " cmpne %0, %4 \n" \
+ " bt 2f \n" \
+ " mov %1, %2 \n" \
+ RELEASE_FENCE \
+ " stex.w %1, (%3) \n" \
+ " bez %1, 1b \n" \
+ "2: \n" \
+ : "=&r" (__ret), "=&r" (__tmp) \
+ : "r" (__new), "r"(__ptr), "r"(__old) \
+ :); \
+ __smp_mb(); \
+ break; \
+ default: \
+ __bad_xchg(); \
+ } \
__ret; \
})
+#define arch_cmpxchg(ptr, o, n) \
+ (__cmpxchg((ptr), (o), (n), sizeof(*(ptr))))
+#define arch_cmpxchg_local(ptr, o, n) \
+ (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
#else
#include <asm-generic/cmpxchg.h>
#endif
--
2.25.1
* [PATCH V2 2/2] csky: atomic: Add custom atomic.h implementation
2022-04-11 14:51 [PATCH V2 0/2] csky: Optimize with acquire & release for atomic & cmpxchg guoren
2022-04-11 14:51 ` [PATCH V2 1/2] csky: cmpxchg: Optimize with acquire & release guoren
@ 2022-04-11 14:51 ` guoren
2022-04-12 5:21 ` Guo Ren
1 sibling, 1 reply; 4+ messages in thread
From: guoren @ 2022-04-11 14:51 UTC (permalink / raw)
To: guoren, arnd, mark.rutland; +Cc: linux-arch, linux-kernel, linux-csky, Guo Ren
From: Guo Ren <guoren@linux.alibaba.com>
The generic atomic.h uses cmpxchg to implement the atomic operations,
which causes a double loop (the retry loop around cmpxchg plus the
ll/sc loop inside cmpxchg itself) and weakens the forward-progress
guarantee. This patch implements csky custom atomic operations with
ldex/stex instructions for the best performance.
Important reference commit, pointed out by Mark Rutland:
8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
full barrier semantics")
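For context, a rough sketch of how the generic fallback layer
(include/linux/atomic/) is expected to build the ordered variants from
the _relaxed ones via the two fence hooks defined in this header (an
illustration, not code from this patch):

/* Shape of the acquire/release fallbacks (illustration only). */
static __always_inline int arch_atomic_fetch_add_acquire(int i, atomic_t *v)
{
	int ret = arch_atomic_fetch_add_relaxed(i, v);

	__atomic_acquire_fence();       /* __smp_acquire_fence(): bar.brarw */
	return ret;
}

static __always_inline int arch_atomic_fetch_add_release(int i, atomic_t *v)
{
	__atomic_release_fence();       /* __smp_release_fence(): bar.brwaw */
	return arch_atomic_fetch_add_relaxed(i, v);
}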
Link: https://lore.kernel.org/linux-riscv/CAJF2gTSAxpAi=LbAdu7jntZRUa=-dJwL0VfmDfBV5MHB=rcZ-w@mail.gmail.com/T/#m27a0f1342995deae49ce1d0e1f2683f8a181d6c3
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
---
Changes in V2:
- Fix up the use of acquire + release for full barrier semantics,
  as pointed out by Mark Rutland.
---
arch/csky/include/asm/atomic.h | 130 +++++++++++++++++++++++++++++++++
1 file changed, 130 insertions(+)
create mode 100644 arch/csky/include/asm/atomic.h
diff --git a/arch/csky/include/asm/atomic.h b/arch/csky/include/asm/atomic.h
new file mode 100644
index 000000000000..2e1a22f55ea1
--- /dev/null
+++ b/arch/csky/include/asm/atomic.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_CSKY_ATOMIC_H
+#define __ASM_CSKY_ATOMIC_H
+
+#ifdef CONFIG_SMP
+# include <asm-generic/atomic64.h>
+
+#include <asm/cmpxchg.h>
+#include <asm/barrier.h>
+
+#define __atomic_acquire_fence() __smp_acquire_fence()
+
+#define __atomic_release_fence() __smp_release_fence()
+
+static __always_inline int arch_atomic_read(const atomic_t *v)
+{
+ return READ_ONCE(v->counter);
+}
+static __always_inline void arch_atomic_set(atomic_t *v, int i)
+{
+ WRITE_ONCE(v->counter, i);
+}
+
+#define ATOMIC_OP(op, asm_op, I) \
+static __always_inline \
+void arch_atomic_##op(int i, atomic_t *v) \
+{ \
+ unsigned long tmp; \
+ __asm__ __volatile__ ( \
+ "1: ldex.w %0, (%2) \n" \
+ " " #op " %0, %1 \n" \
+ " stex.w %0, (%2) \n" \
+ " bez %0, 1b \n" \
+ : "=&r" (tmp) \
+ : "r" (I), "r" (&v->counter) \
+ : "memory"); \
+}
+
+ATOMIC_OP(add, add, i)
+ATOMIC_OP(sub, add, -i)
+ATOMIC_OP(and, and, i)
+ATOMIC_OP( or, or, i)
+ATOMIC_OP(xor, xor, i)
+
+#undef ATOMIC_OP
+
+#define ATOMIC_FETCH_OP(op, asm_op, I) \
+static __always_inline \
+int arch_atomic_fetch_##op##_relaxed(int i, atomic_t *v) \
+{ \
+ register int ret, tmp; \
+ __asm__ __volatile__ ( \
+ "1: ldex.w %0, (%3) \n" \
+ " mov %1, %0 \n" \
+ " " #op " %0, %2 \n" \
+ " stex.w %0, (%3) \n" \
+ " bez %0, 1b \n" \
+ : "=&r" (tmp), "=&r" (ret) \
+ : "r" (I), "r"(&v->counter) \
+ : "memory"); \
+ return ret; \
+}
+
+#define ATOMIC_OP_RETURN(op, asm_op, c_op, I) \
+static __always_inline \
+int arch_atomic_##op##_return_relaxed(int i, atomic_t *v) \
+{ \
+ return arch_atomic_fetch_##op##_relaxed(i, v) c_op I; \
+}
+
+#define ATOMIC_OPS(op, asm_op, c_op, I) \
+ ATOMIC_FETCH_OP( op, asm_op, I) \
+ ATOMIC_OP_RETURN(op, asm_op, c_op, I)
+
+ATOMIC_OPS(add, add, +, i)
+ATOMIC_OPS(sub, add, +, -i)
+
+#define arch_atomic_fetch_add_relaxed arch_atomic_fetch_add_relaxed
+#define arch_atomic_fetch_sub_relaxed arch_atomic_fetch_sub_relaxed
+
+#define arch_atomic_add_return_relaxed arch_atomic_add_return_relaxed
+#define arch_atomic_sub_return_relaxed arch_atomic_sub_return_relaxed
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+
+#define ATOMIC_OPS(op, asm_op, I) \
+ ATOMIC_FETCH_OP(op, asm_op, I)
+
+ATOMIC_OPS(and, and, i)
+ATOMIC_OPS( or, or, i)
+ATOMIC_OPS(xor, xor, i)
+
+#define arch_atomic_fetch_and_relaxed arch_atomic_fetch_and_relaxed
+#define arch_atomic_fetch_or_relaxed arch_atomic_fetch_or_relaxed
+#define arch_atomic_fetch_xor_relaxed arch_atomic_fetch_xor_relaxed
+
+#undef ATOMIC_OPS
+
+#undef ATOMIC_FETCH_OP
+
+#define ATOMIC_OP() \
+static __always_inline \
+int arch_atomic_xchg_relaxed(atomic_t *v, int n) \
+{ \
+ return __xchg_relaxed(n, &(v->counter), 4); \
+} \
+static __always_inline \
+int arch_atomic_cmpxchg_relaxed(atomic_t *v, int o, int n) \
+{ \
+ return __cmpxchg_relaxed(&(v->counter), o, n, 4); \
+}
+
+#define ATOMIC_OPS() \
+ ATOMIC_OP()
+
+ATOMIC_OPS()
+
+#define arch_atomic_xchg_relaxed arch_atomic_xchg_relaxed
+#define arch_atomic_cmpxchg_relaxed arch_atomic_cmpxchg_relaxed
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP
+
+#else
+# include <asm-generic/atomic.h>
+#endif
+
+#endif /* __ASM_CSKY_ATOMIC_H */
--
2.25.1
* Re: [PATCH V2 2/2] csky: atomic: Add custom atomic.h implementation
2022-04-11 14:51 ` [PATCH V2 2/2] csky: atomic: Add custom atomic.h implementation guoren
@ 2022-04-12 5:21 ` Guo Ren
0 siblings, 0 replies; 4+ messages in thread
From: Guo Ren @ 2022-04-12 5:21 UTC (permalink / raw)
To: Guo Ren, Arnd Bergmann, Mark Rutland
Cc: linux-arch, Linux Kernel Mailing List, linux-csky, Guo Ren
On Mon, Apr 11, 2022 at 10:52 PM <guoren@kernel.org> wrote:
>
> From: Guo Ren <guoren@linux.alibaba.com>
>
> The generic atomic.h uses cmpxchg to implement the atomic operations,
> which causes a double loop (the retry loop around cmpxchg plus the
> ll/sc loop inside cmpxchg itself) and weakens the forward-progress
> guarantee. This patch implements csky custom atomic operations with
> ldex/stex instructions for the best performance.
>
> Important reference commit, pointed out by Mark Rutland:
> 8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
> full barrier semantics")
>
> Link: https://lore.kernel.org/linux-riscv/CAJF2gTSAxpAi=LbAdu7jntZRUa=-dJwL0VfmDfBV5MHB=rcZ-w@mail.gmail.com/T/#m27a0f1342995deae49ce1d0e1f2683f8a181d6c3
> Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> Signed-off-by: Guo Ren <guoren@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> ---
> Changes in V2:
> - Fix up the use of acquire + release for full barrier semantics,
>   as pointed out by Mark Rutland.
> ---
> arch/csky/include/asm/atomic.h | 130 +++++++++++++++++++++++++++++++++
> 1 file changed, 130 insertions(+)
> create mode 100644 arch/csky/include/asm/atomic.h
>
> diff --git a/arch/csky/include/asm/atomic.h b/arch/csky/include/asm/atomic.h
> new file mode 100644
> index 000000000000..2e1a22f55ea1
> --- /dev/null
> +++ b/arch/csky/include/asm/atomic.h
> @@ -0,0 +1,130 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef __ASM_CSKY_ATOMIC_H
> +#define __ASM_CSKY_ATOMIC_H
> +
> +#ifdef CONFIG_SMP
> +# include <asm-generic/atomic64.h>
> +
> +#include <asm/cmpxchg.h>
> +#include <asm/barrier.h>
> +
> +#define __atomic_acquire_fence() __smp_acquire_fence()
> +
> +#define __atomic_release_fence() __smp_release_fence()
> +
> +static __always_inline int arch_atomic_read(const atomic_t *v)
> +{
> + return READ_ONCE(v->counter);
> +}
> +static __always_inline void arch_atomic_set(atomic_t *v, int i)
> +{
> + WRITE_ONCE(v->counter, i);
> +}
> +
> +#define ATOMIC_OP(op, asm_op, I) \
> +static __always_inline \
> +void arch_atomic_##op(int i, atomic_t *v) \
> +{ \
> + unsigned long tmp; \
> + __asm__ __volatile__ ( \
> + "1: ldex.w %0, (%2) \n" \
> + " " #op " %0, %1 \n" \
> + " stex.w %0, (%2) \n" \
> + " bez %0, 1b \n" \
> + : "=&r" (tmp) \
> + : "r" (I), "r" (&v->counter) \
> + : "memory"); \
> +}
> +
> +ATOMIC_OP(add, add, i)
> +ATOMIC_OP(sub, add, -i)
> +ATOMIC_OP(and, and, i)
> +ATOMIC_OP( or, or, i)
> +ATOMIC_OP(xor, xor, i)
Sorry, it should be fixed up as below, passing i straight into the asm
template so the separate asm_op/I parameters (and the -i trick for sub)
are no longer needed:
#define ATOMIC_OP(op) \
static __always_inline \
void arch_atomic_##op(int i, atomic_t *v) \
{ \
unsigned long tmp; \
__asm__ __volatile__ ( \
"1: ldex.w %0, (%2) \n" \
" " #op " %0, %1 \n" \
" stex.w %0, (%2) \n" \
" bez %0, 1b \n" \
: "=&r" (tmp) \
: "r" (i), "r" (&v->counter) \
: "memory"); \
}
ATOMIC_OP(add)
ATOMIC_OP(sub)
ATOMIC_OP(and)
ATOMIC_OP( or)
ATOMIC_OP(xor)
> +
> +#undef ATOMIC_OP
> +
> +#define ATOMIC_FETCH_OP(op, asm_op, I) \
> +static __always_inline \
> +int arch_atomic_fetch_##op##_relaxed(int i, atomic_t *v) \
> +{ \
> + register int ret, tmp; \
> + __asm__ __volatile__ ( \
> + "1: ldex.w %0, (%3) \n" \
> + " mov %1, %0 \n" \
> + " " #op " %0, %2 \n" \
> + " stex.w %0, (%3) \n" \
> + " bez %0, 1b \n" \
> + : "=&r" (tmp), "=&r" (ret) \
> + : "r" (I), "r"(&v->counter) \
> + : "memory"); \
> + return ret; \
> +}
> +
> +#define ATOMIC_OP_RETURN(op, asm_op, c_op, I) \
> +static __always_inline \
> +int arch_atomic_##op##_return_relaxed(int i, atomic_t *v) \
> +{ \
> + return arch_atomic_fetch_##op##_relaxed(i, v) c_op I; \
> +}
> +
> +#define ATOMIC_OPS(op, asm_op, c_op, I) \
> + ATOMIC_FETCH_OP( op, asm_op, I) \
> + ATOMIC_OP_RETURN(op, asm_op, c_op, I)
> +
> +ATOMIC_OPS(add, add, +, i)
> +ATOMIC_OPS(sub, add, +, -i)
> +
> +#define arch_atomic_fetch_add_relaxed arch_atomic_fetch_add_relaxed
> +#define arch_atomic_fetch_sub_relaxed arch_atomic_fetch_sub_relaxed
> +
> +#define arch_atomic_add_return_relaxed arch_atomic_add_return_relaxed
> +#define arch_atomic_sub_return_relaxed arch_atomic_sub_return_relaxed
> +
> +#undef ATOMIC_OPS
> +#undef ATOMIC_OP_RETURN
> +
> +#define ATOMIC_OPS(op, asm_op, I) \
> + ATOMIC_FETCH_OP(op, asm_op, I)
> +
> +ATOMIC_OPS(and, and, i)
> +ATOMIC_OPS( or, or, i)
> +ATOMIC_OPS(xor, xor, i)
> +
> +#define arch_atomic_fetch_and_relaxed arch_atomic_fetch_and_relaxed
> +#define arch_atomic_fetch_or_relaxed arch_atomic_fetch_or_relaxed
> +#define arch_atomic_fetch_xor_relaxed arch_atomic_fetch_xor_relaxed
> +
> +#undef ATOMIC_OPS
> +
> +#undef ATOMIC_FETCH_OP
> +
> +#define ATOMIC_OP() \
> +static __always_inline \
> +int arch_atomic_xchg_relaxed(atomic_t *v, int n) \
> +{ \
> + return __xchg_relaxed(n, &(v->counter), 4); \
> +} \
> +static __always_inline \
> +int arch_atomic_cmpxchg_relaxed(atomic_t *v, int o, int n) \
> +{ \
> + return __cmpxchg_relaxed(&(v->counter), o, n, 4); \
> +}
> +
> +#define ATOMIC_OPS() \
> + ATOMIC_OP()
> +
> +ATOMIC_OPS()
> +
> +#define arch_atomic_xchg_relaxed arch_atomic_xchg_relaxed
> +#define arch_atomic_cmpxchg_relaxed arch_atomic_cmpxchg_relaxed
> +
> +#undef ATOMIC_OPS
> +#undef ATOMIC_OP
> +
> +#else
> +# include <asm-generic/atomic.h>
> +#endif
> +
> +#endif /* __ASM_CSKY_ATOMIC_H */
> --
> 2.25.1
>
--
Best Regards
Guo Ren
ML: https://lore.kernel.org/linux-csky/