* [PATCH v2 1/5] csky: Remove custom asm/atomic.h implementation
@ 2020-12-20 15:39 guoren
  2020-12-20 15:39 ` [PATCH v2 2/5] csky: Fixup barrier design guoren
                   ` (4 more replies)
  0 siblings, 5 replies; 10+ messages in thread
From: guoren @ 2020-12-20 15:39 UTC (permalink / raw)
  To: guoren, arnd
  Cc: linux-kernel, linux-csky, linux-arch, Guo Ren, Peter Zijlstra,
	Arnd Bergmann, Paul E . McKenney

From: Guo Ren <guoren@linux.alibaba.com>

Use the generic atomic implementation based on cmpxchg(), so the
custom csky asm/atomic.h can be removed.
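
For reference, the asm-generic fallback builds each operation as a
cmpxchg() retry loop; a simplified sketch of the shape (paraphrased
from include/asm-generic/atomic.h, not the exact kernel code):

static inline void atomic_add(int i, atomic_t *v)
{
	int c, old;

	/* Re-read and retry until cmpxchg() observes an unchanged
	 * counter value. */
	c = v->counter;
	while ((old = cmpxchg(&v->counter, c, c + i)) != c)
		c = old;
}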

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnd Bergmann <arnd@kernel.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
---
 arch/csky/include/asm/atomic.h | 212 -----------------------------------------
 1 file changed, 212 deletions(-)
 delete mode 100644 arch/csky/include/asm/atomic.h

diff --git a/arch/csky/include/asm/atomic.h b/arch/csky/include/asm/atomic.h
deleted file mode 100644
index e369d73..00000000
--- a/arch/csky/include/asm/atomic.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef __ASM_CSKY_ATOMIC_H
-#define __ASM_CSKY_ATOMIC_H
-
-#include <linux/version.h>
-#include <asm/cmpxchg.h>
-#include <asm/barrier.h>
-
-#ifdef CONFIG_CPU_HAS_LDSTEX
-
-#define __atomic_add_unless __atomic_add_unless
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
-{
-	unsigned long tmp, ret;
-
-	smp_mb();
-
-	asm volatile (
-	"1:	ldex.w		%0, (%3) \n"
-	"	mov		%1, %0   \n"
-	"	cmpne		%0, %4   \n"
-	"	bf		2f	 \n"
-	"	add		%0, %2   \n"
-	"	stex.w		%0, (%3) \n"
-	"	bez		%0, 1b   \n"
-	"2:				 \n"
-		: "=&r" (tmp), "=&r" (ret)
-		: "r" (a), "r"(&v->counter), "r"(u)
-		: "memory");
-
-	if (ret != u)
-		smp_mb();
-
-	return ret;
-}
-
-#define ATOMIC_OP(op, c_op)						\
-static inline void atomic_##op(int i, atomic_t *v)			\
-{									\
-	unsigned long tmp;						\
-									\
-	asm volatile (							\
-	"1:	ldex.w		%0, (%2) \n"				\
-	"	" #op "		%0, %1   \n"				\
-	"	stex.w		%0, (%2) \n"				\
-	"	bez		%0, 1b   \n"				\
-		: "=&r" (tmp)						\
-		: "r" (i), "r"(&v->counter)				\
-		: "memory");						\
-}
-
-#define ATOMIC_OP_RETURN(op, c_op)					\
-static inline int atomic_##op##_return(int i, atomic_t *v)		\
-{									\
-	unsigned long tmp, ret;						\
-									\
-	smp_mb();							\
-	asm volatile (							\
-	"1:	ldex.w		%0, (%3) \n"				\
-	"	" #op "		%0, %2   \n"				\
-	"	mov		%1, %0   \n"				\
-	"	stex.w		%0, (%3) \n"				\
-	"	bez		%0, 1b   \n"				\
-		: "=&r" (tmp), "=&r" (ret)				\
-		: "r" (i), "r"(&v->counter)				\
-		: "memory");						\
-	smp_mb();							\
-									\
-	return ret;							\
-}
-
-#define ATOMIC_FETCH_OP(op, c_op)					\
-static inline int atomic_fetch_##op(int i, atomic_t *v)			\
-{									\
-	unsigned long tmp, ret;						\
-									\
-	smp_mb();							\
-	asm volatile (							\
-	"1:	ldex.w		%0, (%3) \n"				\
-	"	mov		%1, %0   \n"				\
-	"	" #op "		%0, %2   \n"				\
-	"	stex.w		%0, (%3) \n"				\
-	"	bez		%0, 1b   \n"				\
-		: "=&r" (tmp), "=&r" (ret)				\
-		: "r" (i), "r"(&v->counter)				\
-		: "memory");						\
-	smp_mb();							\
-									\
-	return ret;							\
-}
-
-#else /* CONFIG_CPU_HAS_LDSTEX */
-
-#include <linux/irqflags.h>
-
-#define __atomic_add_unless __atomic_add_unless
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
-{
-	unsigned long tmp, ret, flags;
-
-	raw_local_irq_save(flags);
-
-	asm volatile (
-	"	ldw		%0, (%3) \n"
-	"	mov		%1, %0   \n"
-	"	cmpne		%0, %4   \n"
-	"	bf		2f	 \n"
-	"	add		%0, %2   \n"
-	"	stw		%0, (%3) \n"
-	"2:				 \n"
-		: "=&r" (tmp), "=&r" (ret)
-		: "r" (a), "r"(&v->counter), "r"(u)
-		: "memory");
-
-	raw_local_irq_restore(flags);
-
-	return ret;
-}
-
-#define ATOMIC_OP(op, c_op)						\
-static inline void atomic_##op(int i, atomic_t *v)			\
-{									\
-	unsigned long tmp, flags;					\
-									\
-	raw_local_irq_save(flags);					\
-									\
-	asm volatile (							\
-	"	ldw		%0, (%2) \n"				\
-	"	" #op "		%0, %1   \n"				\
-	"	stw		%0, (%2) \n"				\
-		: "=&r" (tmp)						\
-		: "r" (i), "r"(&v->counter)				\
-		: "memory");						\
-									\
-	raw_local_irq_restore(flags);					\
-}
-
-#define ATOMIC_OP_RETURN(op, c_op)					\
-static inline int atomic_##op##_return(int i, atomic_t *v)		\
-{									\
-	unsigned long tmp, ret, flags;					\
-									\
-	raw_local_irq_save(flags);					\
-									\
-	asm volatile (							\
-	"	ldw		%0, (%3) \n"				\
-	"	" #op "		%0, %2   \n"				\
-	"	stw		%0, (%3) \n"				\
-	"	mov		%1, %0   \n"				\
-		: "=&r" (tmp), "=&r" (ret)				\
-		: "r" (i), "r"(&v->counter)				\
-		: "memory");						\
-									\
-	raw_local_irq_restore(flags);					\
-									\
-	return ret;							\
-}
-
-#define ATOMIC_FETCH_OP(op, c_op)					\
-static inline int atomic_fetch_##op(int i, atomic_t *v)			\
-{									\
-	unsigned long tmp, ret, flags;					\
-									\
-	raw_local_irq_save(flags);					\
-									\
-	asm volatile (							\
-	"	ldw		%0, (%3) \n"				\
-	"	mov		%1, %0   \n"				\
-	"	" #op "		%0, %2   \n"				\
-	"	stw		%0, (%3) \n"				\
-		: "=&r" (tmp), "=&r" (ret)				\
-		: "r" (i), "r"(&v->counter)				\
-		: "memory");						\
-									\
-	raw_local_irq_restore(flags);					\
-									\
-	return ret;							\
-}
-
-#endif /* CONFIG_CPU_HAS_LDSTEX */
-
-#define atomic_add_return atomic_add_return
-ATOMIC_OP_RETURN(add, +)
-#define atomic_sub_return atomic_sub_return
-ATOMIC_OP_RETURN(sub, -)
-
-#define atomic_fetch_add atomic_fetch_add
-ATOMIC_FETCH_OP(add, +)
-#define atomic_fetch_sub atomic_fetch_sub
-ATOMIC_FETCH_OP(sub, -)
-#define atomic_fetch_and atomic_fetch_and
-ATOMIC_FETCH_OP(and, &)
-#define atomic_fetch_or atomic_fetch_or
-ATOMIC_FETCH_OP(or, |)
-#define atomic_fetch_xor atomic_fetch_xor
-ATOMIC_FETCH_OP(xor, ^)
-
-#define atomic_and atomic_and
-ATOMIC_OP(and, &)
-#define atomic_or atomic_or
-ATOMIC_OP(or, |)
-#define atomic_xor atomic_xor
-ATOMIC_OP(xor, ^)
-
-#undef ATOMIC_FETCH_OP
-#undef ATOMIC_OP_RETURN
-#undef ATOMIC_OP
-
-#include <asm-generic/atomic.h>
-
-#endif /* __ASM_CSKY_ATOMIC_H */
-- 
2.7.4



* [PATCH v2 2/5] csky: Fixup barrier design
  2020-12-20 15:39 [PATCH v2 1/5] csky: Remove custom asm/atomic.h implementation guoren
@ 2020-12-20 15:39 ` guoren
  2020-12-20 15:39 ` [PATCH v2 3/5] csky: Fixup futex SMP implementation guoren
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 10+ messages in thread
From: guoren @ 2020-12-20 15:39 UTC (permalink / raw)
  To: guoren, arnd; +Cc: linux-kernel, linux-csky, linux-arch, Guo Ren

From: Guo Ren <guoren@linux.alibaba.com>

Remove the shareable bit from the ordering barriers; keeping ordering
within the current hart is enough for SMP. Use three consecutive
sync.is instructions as the PTW barrier to prevent speculative PTW on
the 860 microarchitecture.
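
For readers checking the magic .long values in the diff below: every
opcode is a common base with the (bw, br, aw, ar) flag nibble at
bit 16 of the stored word (relative to the bit diagram in the comment,
the two instruction halfwords appear swapped in the .long literals).
A host-side sanity check, as an editor's illustration inferred from
the values in this patch, not part of it:

#include <stdio.h>

#define BAR_AR	(1u << 0)	/* after read   */
#define BAR_AW	(1u << 1)	/* after write  */
#define BAR_BR	(1u << 2)	/* before read  */
#define BAR_BW	(1u << 3)	/* before write */

/* Derive a bar.* opcode word from its flag bits. */
static unsigned int bar_opcode(unsigned int flags)
{
	return 0x8420c000u | (flags << 16);
}

int main(void)
{
	/* Matches the table below: 0x842cc000 and 0x842fc000. */
	printf("bar.brw    = 0x%08x\n", bar_opcode(BAR_BW | BAR_BR));
	printf("bar.brwarw = 0x%08x\n",
	       bar_opcode(BAR_BW | BAR_BR | BAR_AW | BAR_AR));
	return 0;
}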

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/include/asm/barrier.h | 82 ++++++++++++++++++++++++++++++-----------
 1 file changed, 60 insertions(+), 22 deletions(-)

diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h
index a430e7f..117e622 100644
--- a/arch/csky/include/asm/barrier.h
+++ b/arch/csky/include/asm/barrier.h
@@ -8,6 +8,61 @@
 
 #define nop()	asm volatile ("nop\n":::"memory")
 
+#ifdef CONFIG_SMP
+
+/*
+ * bar.brwarws: ordering barrier for all load/store instructions
+ *              before/after
+ *
+ * |31|30 26|25 21|20 16|15  10|9   5|4           0|
+ *  1  10000 00000 00000 100001	00001 0 bw br aw ar
+ *
+ * b: before
+ * a: after
+ * r: read
+ * w: write
+ *
+ * Here are all combinations:
+ *
+ * bar.brw
+ * bar.br
+ * bar.bw
+ * bar.arw
+ * bar.ar
+ * bar.aw
+ * bar.brwarw
+ * bar.brarw
+ * bar.bwarw
+ * bar.brwar
+ * bar.brwaw
+ * bar.brar
+ * bar.bwaw
+ */
+#define __bar_brw()	asm volatile (".long 0x842cc000\n":::"memory")
+#define __bar_br()	asm volatile (".long 0x8424c000\n":::"memory")
+#define __bar_bw()	asm volatile (".long 0x8428c000\n":::"memory")
+#define __bar_arw()	asm volatile (".long 0x8423c000\n":::"memory")
+#define __bar_ar()	asm volatile (".long 0x8421c000\n":::"memory")
+#define __bar_aw()	asm volatile (".long 0x8422c000\n":::"memory")
+#define __bar_brwarw()	asm volatile (".long 0x842fc000\n":::"memory")
+#define __bar_brarw()	asm volatile (".long 0x8427c000\n":::"memory")
+#define __bar_bwarw()	asm volatile (".long 0x842bc000\n":::"memory")
+#define __bar_brwar()	asm volatile (".long 0x842dc000\n":::"memory")
+#define __bar_brwaw()	asm volatile (".long 0x842ec000\n":::"memory")
+#define __bar_brar()	asm volatile (".long 0x8425c000\n":::"memory")
+#define __bar_bwaw()	asm volatile (".long 0x842ac000\n":::"memory")
+
+#define __smp_mb()	__bar_brwarw()
+#define __smp_rmb()	__bar_brar()
+#define __smp_wmb()	__bar_bwaw()
+
+#define ACQUIRE_FENCE		".long 0x8427c000\n"
+#define __smp_acquire_fence()	__bar_brarw()
+#define __smp_release_fence()	__bar_brwaw()
+
+#endif /* CONFIG_SMP */
+
 /*
  * sync:        completion barrier, all sync.xx instructions
 *              guarantee the last response received by bus transaction
@@ -15,31 +70,14 @@
  * sync.s:      inherit from sync, but also shareable to other cores
  * sync.i:      inherit from sync, but also flush cpu pipeline
  * sync.is:     the same with sync.i + sync.s
- *
- * bar.brwarw:  ordering barrier for all load/store instructions before it
- * bar.brwarws: ordering barrier for all load/store instructions before it
- *						and shareable to other cores
- * bar.brar:    ordering barrier for all load       instructions before it
- * bar.brars:   ordering barrier for all load       instructions before it
- *						and shareable to other cores
- * bar.bwaw:    ordering barrier for all store      instructions before it
- * bar.bwaws:   ordering barrier for all store      instructions before it
- *						and shareable to other cores
  */
+#define mb()		asm volatile ("sync\n":::"memory")
 
 #ifdef CONFIG_CPU_HAS_CACHEV2
-#define mb()		asm volatile ("sync.s\n":::"memory")
-
-#ifdef CONFIG_SMP
-#define __smp_mb()	asm volatile ("bar.brwarws\n":::"memory")
-#define __smp_rmb()	asm volatile ("bar.brars\n":::"memory")
-#define __smp_wmb()	asm volatile ("bar.bwaws\n":::"memory")
-#endif /* CONFIG_SMP */
-
-#define sync_is()	asm volatile ("sync.is\n":::"memory")
-
-#else /* !CONFIG_CPU_HAS_CACHEV2 */
-#define mb()		asm volatile ("sync\n":::"memory")
+/*
+ * Using three sync.is to prevent speculative PTW
+ */
+#define sync_is()	asm volatile ("sync.is\nsync.is\nsync.is\n":::"memory")
 #endif
 
 #include <asm-generic/barrier.h>
-- 
2.7.4



* [PATCH v2 3/5] csky: Fixup futex SMP implementation
  2020-12-20 15:39 [PATCH v2 1/5] csky: Remove custom asm/atomic.h implementation guoren
  2020-12-20 15:39 ` [PATCH v2 2/5] csky: Fixup barrier design guoren
@ 2020-12-20 15:39 ` guoren
  2020-12-20 15:39 ` [PATCH v2 4/5] csky: Fixup asm/cmpxchg.h with correct ordering barrier guoren
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 10+ messages in thread
From: guoren @ 2020-12-20 15:39 UTC (permalink / raw)
  To: guoren, arnd
  Cc: linux-kernel, linux-csky, linux-arch, Guo Ren, Arnd Bergmann,
	Peter Zijlstra, Paul E . McKenney

From: Guo Ren <guoren@linux.alibaba.com>

Arnd said:
I would guess that for csky, this is a mistake, as the architecture
is fairly new and should be able to implement it.

Guo replied:
The c610, c807, and c810 don't support SMP, so futex_cmpxchg_enabled = 1
with asm-generic's implementation.
For c860, there is no HAVE_FUTEX_CMPXCHG and no cmpxchg_inatomic/inuser
implementation, so futex_cmpxchg_enabled = 0.

Thanks for pointing it out; we'll implement cmpxchg_inatomic/inuser for
C860 and keep using asm-generic for non-SMP CPUs.
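
For context: without HAVE_FUTEX_CMPXCHG, the futex core probes for a
working cmpxchg at boot by faulting on a NULL user address, roughly
this shape (paraphrased from kernel/futex.c of that era):

static void __init futex_detect_cmpxchg(void)
{
	u32 curval;

	/*
	 * A cmpxchg on a NULL user pointer must fault and return
	 * -EFAULT; -ENOSYS instead means the arch has no
	 * implementation, and futex_cmpxchg_enabled stays 0.
	 */
	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
		futex_cmpxchg_enabled = 1;
}

Selecting HAVE_FUTEX_CMPXCHG, as this patch does, skips that probe and
enables futex cmpxchg unconditionally.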

LTP test:
futex_wait01    1  TPASS  :  futex_wait(): errno=ETIMEDOUT(110): Connection timed out
futex_wait01    2  TPASS  :  futex_wait(): errno=EAGAIN/EWOULDBLOCK(11): Resource temporarily unavailable
futex_wait01    3  TPASS  :  futex_wait(): errno=ETIMEDOUT(110): Connection timed out
futex_wait01    4  TPASS  :  futex_wait(): errno=EAGAIN/EWOULDBLOCK(11): Resource temporarily unavailable
futex_wait02    1  TPASS  :  futex_wait() woken up
futex_wait03    1  TPASS  :  futex_wait() woken up
futex_wait04    1  TPASS  :  futex_wait() returned -1: errno=EAGAIN/EWOULDBLOCK(11): Resource temporarily unavailable

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Arnd Bergmann <arnd@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/lkml/CAK8P3a3+WaQNyJ6Za2qfu6=0mBgU1hApnRXrdp1b1=P7wwyRUg@mail.gmail.com/
---
 arch/csky/Kconfig             |   1 +
 arch/csky/include/asm/futex.h | 127 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 128 insertions(+)
 create mode 100644 arch/csky/include/asm/futex.h

diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 50bb8b4..e254dc2 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -49,6 +49,7 @@ config CSKY
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_ERROR_INJECTION
+	select HAVE_FUTEX_CMPXCHG if FUTEX && SMP
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
diff --git a/arch/csky/include/asm/futex.h b/arch/csky/include/asm/futex.h
new file mode 100644
index 00000000..dbe2f99
--- /dev/null
+++ b/arch/csky/include/asm/futex.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_CSKY_FUTEX_H
+#define __ASM_CSKY_FUTEX_H
+
+#ifndef CONFIG_SMP
+#include <asm-generic/futex.h>
+#else
+#include <linux/atomic.h>
+#include <linux/futex.h>
+#include <linux/uaccess.h>
+#include <linux/errno.h>
+
+#define __futex_atomic_op(insn, ret, oldval, uaddr, oparg)		\
+{									\
+	u32 tmp;							\
+									\
+	__atomic_pre_full_fence();					\
+									\
+	__asm__ __volatile__ (						\
+	"1:	ldex.w	%[ov], %[u]			\n"		\
+	"	"insn"					\n"		\
+	"2:	stex.w	%[t], %[u]			\n"		\
+	"	bez	%[t], 1b			\n"		\
+	"4:						\n"		\
+	"	.section .fixup,\"ax\"			\n"		\
+	"	.balign 4				\n"		\
+	"5:	mov %[r], %[e]				\n"		\
+	"	jmpi 4b					\n"		\
+	"	.previous				\n"		\
+	"	.section __ex_table,\"a\"		\n"		\
+	"	.balign 4				\n"		\
+	"	.long	1b, 5b				\n"		\
+	"	.long	2b, 5b				\n"		\
+	"	.previous				\n"		\
+	: [r] "+r" (ret), [ov] "=&r" (oldval),				\
+	  [u] "+m" (*uaddr), [t] "=&r" (tmp)				\
+	: [op] "Jr" (oparg), [e] "jr" (-EFAULT)				\
+	: "memory");							\
+									\
+	__atomic_post_full_fence();					\
+}
+
+static inline int
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
+{
+	int oldval = 0, ret = 0;
+
+	if (!access_ok(uaddr, sizeof(u32)))
+		return -EFAULT;
+
+	switch (op) {
+	case FUTEX_OP_SET:
+		__futex_atomic_op("mov %[t], %[ov]",
+				  ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_ADD:
+		__futex_atomic_op("add %[t], %[ov], %[op]",
+				  ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_OR:
+		__futex_atomic_op("or %[t], %[ov], %[op]",
+				  ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_ANDN:
+		__futex_atomic_op("and %[t], %[ov], %[op]",
+				  ret, oldval, uaddr, ~oparg);
+		break;
+	case FUTEX_OP_XOR:
+		__futex_atomic_op("xor %[t], %[ov], %[op]",
+				  ret, oldval, uaddr, oparg);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	if (!ret)
+		*oval = oldval;
+
+	return ret;
+}
+
+
+static inline int
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+			      u32 oldval, u32 newval)
+{
+	int ret = 0;
+	u32 val, tmp;
+
+	if (!access_ok(uaddr, sizeof(u32)))
+		return -EFAULT;
+
+	__atomic_pre_full_fence();
+
+	__asm__ __volatile__ (
+	"1:	ldex.w	%[v], %[u]			\n"
+	"	cmpne	%[v], %[ov]			\n"
+	"	bt	4f				\n"
+	"	mov	%[t], %[nv]			\n"
+	"2:	stex.w	%[t], %[u]			\n"
+	"	bez	%[t], 1b			\n"
+	"4:						\n"
+	"	.section .fixup,\"ax\"			\n"
+	"	.balign 4				\n"
+	"5:	mov %[r], %[e]				\n"
+	"	jmpi 4b					\n"
+	"	.previous				\n"
+	"	.section __ex_table,\"a\"		\n"
+	"	.balign 4				\n"
+	"	.long	1b, 5b				\n"
+	"	.long	2b, 5b				\n"
+	"	.previous				\n"
+	: [r] "+r" (ret), [v] "=&r" (val), [u] "+m" (*uaddr),
+	  [t] "=&r" (tmp)
+	: [ov] "Jr" (oldval), [nv] "Jr" (newval), [e] "Jr" (-EFAULT)
+	: "memory");
+
+	__atomic_post_full_fence();
+
+	*uval = val;
+	return ret;
+}
+
+#endif /* CONFIG_SMP */
+#endif /* __ASM_CSKY_FUTEX_H */
-- 
2.7.4



* [PATCH v2 4/5] csky: Fixup asm/cmpxchg.h with correct ordering barrier
  2020-12-20 15:39 [PATCH v2 1/5] csky: Remove custom asm/atomic.h implementation guoren
  2020-12-20 15:39 ` [PATCH v2 2/5] csky: Fixup barrier design guoren
  2020-12-20 15:39 ` [PATCH v2 3/5] csky: Fixup futex SMP implementation guoren
@ 2020-12-20 15:39 ` guoren
  2021-01-07 12:40   ` Peter Zijlstra
  2020-12-20 15:39 ` [PATCH v2 5/5] csky: Cleanup asm/spinlock.h guoren
  2021-01-07 11:19 ` [PATCH v2 1/5] csky: Remove custom asm/atomic.h implementation Peter Zijlstra
  4 siblings, 1 reply; 10+ messages in thread
From: guoren @ 2020-12-20 15:39 UTC (permalink / raw)
  To: guoren, arnd
  Cc: linux-kernel, linux-csky, linux-arch, Guo Ren, Peter Zijlstra,
	Paul E . McKenney

From: Guo Ren <guoren@linux.alibaba.com>

Optimize the performance of cmpxchg() by using finer-grained
acquire/release ordering barriers instead of full smp_mb() pairs.
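
The shape introduced below matches the generic fully-ordered fallback,
which brackets a relaxed primitive with release and acquire fences; a
simplified sketch (compare __atomic_op_fence() in the kernel's atomic
headers of that era):

#define __atomic_op_fence(op, args...)				\
({								\
	typeof(op##_relaxed(args)) __ret;			\
	__atomic_pre_full_fence();				\
	__ret = op##_relaxed(args);				\
	__atomic_post_full_fence();				\
	__ret;							\
})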

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Paul E. McKenney <paulmck@kernel.org>
---
 arch/csky/include/asm/cmpxchg.h | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/arch/csky/include/asm/cmpxchg.h b/arch/csky/include/asm/cmpxchg.h
index 8922453..dabc8e4 100644
--- a/arch/csky/include/asm/cmpxchg.h
+++ b/arch/csky/include/asm/cmpxchg.h
@@ -3,12 +3,12 @@
 #ifndef __ASM_CSKY_CMPXCHG_H
 #define __ASM_CSKY_CMPXCHG_H
 
-#ifdef CONFIG_CPU_HAS_LDSTEX
+#ifdef CONFIG_SMP
 #include <asm/barrier.h>
 
 extern void __bad_xchg(void);
 
-#define __xchg(new, ptr, size)					\
+#define __xchg_relaxed(new, ptr, size)				\
 ({								\
 	__typeof__(ptr) __ptr = (ptr);				\
 	__typeof__(new) __new = (new);				\
@@ -16,7 +16,6 @@ extern void __bad_xchg(void);
 	unsigned long tmp;					\
 	switch (size) {						\
 	case 4:							\
-		smp_mb();					\
 		asm volatile (					\
 		"1:	ldex.w		%0, (%3) \n"		\
 		"	mov		%1, %2   \n"		\
@@ -25,7 +24,6 @@ extern void __bad_xchg(void);
 			: "=&r" (__ret), "=&r" (tmp)		\
 			: "r" (__new), "r"(__ptr)		\
 			:);					\
-		smp_mb();					\
 		break;						\
 	default:						\
 		__bad_xchg();					\
@@ -33,9 +31,10 @@ extern void __bad_xchg(void);
 	__ret;							\
 })
 
-#define xchg(ptr, x)	(__xchg((x), (ptr), sizeof(*(ptr))))
+#define xchg_relaxed(ptr, x) \
+		(__xchg_relaxed((x), (ptr), sizeof(*(ptr))))
 
-#define __cmpxchg(ptr, old, new, size)				\
+#define __cmpxchg_relaxed(ptr, old, new, size)			\
 ({								\
 	__typeof__(ptr) __ptr = (ptr);				\
 	__typeof__(new) __new = (new);				\
@@ -44,7 +43,6 @@ extern void __bad_xchg(void);
 	__typeof__(*(ptr)) __ret;				\
 	switch (size) {						\
 	case 4:							\
-		smp_mb();					\
 		asm volatile (					\
 		"1:	ldex.w		%0, (%3) \n"		\
 		"	cmpne		%0, %4   \n"		\
@@ -56,7 +54,6 @@ extern void __bad_xchg(void);
 			: "=&r" (__ret), "=&r" (__tmp)		\
 			: "r" (__new), "r"(__ptr), "r"(__old)	\
 			:);					\
-		smp_mb();					\
 		break;						\
 	default:						\
 		__bad_xchg();					\
@@ -64,8 +61,18 @@ extern void __bad_xchg(void);
 	__ret;							\
 })
 
-#define cmpxchg(ptr, o, n) \
-	(__cmpxchg((ptr), (o), (n), sizeof(*(ptr))))
+#define cmpxchg_relaxed(ptr, o, n) \
+	(__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
+
+#define cmpxchg(ptr, o, n) 					\
+({								\
+	__typeof__(*(ptr)) __ret;				\
+	__smp_release_fence();					\
+	__ret = cmpxchg_relaxed(ptr, o, n);			\
+	__smp_acquire_fence();					\
+	__ret;							\
+})
+
 #else
 #include <asm-generic/cmpxchg.h>
 #endif
-- 
2.7.4



* [PATCH v2 5/5] csky: Cleanup asm/spinlock.h
  2020-12-20 15:39 [PATCH v2 1/5] csky: Remove custom asm/atomic.h implementation guoren
                   ` (2 preceding siblings ...)
  2020-12-20 15:39 ` [PATCH v2 4/5] csky: Fixup asm/cmpxchg.h with correct ordering barrier guoren
@ 2020-12-20 15:39 ` guoren
  2021-01-07 12:45   ` Peter Zijlstra
  2021-01-07 11:19 ` [PATCH v2 1/5] csky: Remove custom asm/atomic.h implementation Peter Zijlstra
  4 siblings, 1 reply; 10+ messages in thread
From: guoren @ 2020-12-20 15:39 UTC (permalink / raw)
  To: guoren, arnd
  Cc: linux-kernel, linux-csky, linux-arch, Guo Ren, Peter Zijlstra

From: Guo Ren <guoren@linux.alibaba.com>

There are two implementations of spinlock in arch/csky:
 - the simple test-and-set one (NR_CPUS = 1,2)
 - the ticket one (NR_CPUS = 3,4)
Remove the simple one.

There is already an smp_mb() in the spinlock implementation, so remove
the definition of smp_mb__after_spinlock().

Link: https://lore.kernel.org/linux-csky/20200807081253.GD2674@hirez.programming.kicks-ass.net/#t
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnd Bergmann <arnd@arndb.de>
---
 arch/csky/Kconfig                      |   2 +-
 arch/csky/include/asm/spinlock.h       | 167 ---------------------------------
 arch/csky/include/asm/spinlock_types.h |  10 --
 3 files changed, 1 insertion(+), 178 deletions(-)

diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index e254dc2..5ebb05a 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -7,7 +7,7 @@ config CSKY
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_USE_BUILTIN_BSWAP
-	select ARCH_USE_QUEUED_RWLOCKS if NR_CPUS>2
+	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_WANT_FRAME_POINTERS if !CPU_CK610
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	select COMMON_CLK
diff --git a/arch/csky/include/asm/spinlock.h b/arch/csky/include/asm/spinlock.h
index 7cf3f2b..69f5aa2 100644
--- a/arch/csky/include/asm/spinlock.h
+++ b/arch/csky/include/asm/spinlock.h
@@ -6,8 +6,6 @@
 #include <linux/spinlock_types.h>
 #include <asm/barrier.h>
 
-#ifdef CONFIG_QUEUED_RWLOCKS
-
 /*
  * Ticket-based spin-locking.
  */
@@ -88,169 +86,4 @@ static inline int arch_spin_is_contended(arch_spinlock_t *lock)
 
 #include <asm/qrwlock.h>
 
-/* See include/linux/spinlock.h */
-#define smp_mb__after_spinlock()	smp_mb()
-
-#else /* CONFIG_QUEUED_RWLOCKS */
-
-/*
- * Test-and-set spin-locking.
- */
-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-	u32 *p = &lock->lock;
-	u32 tmp;
-
-	asm volatile (
-		"1:	ldex.w		%0, (%1) \n"
-		"	bnez		%0, 1b   \n"
-		"	movi		%0, 1    \n"
-		"	stex.w		%0, (%1) \n"
-		"	bez		%0, 1b   \n"
-		: "=&r" (tmp)
-		: "r"(p)
-		: "cc");
-	smp_mb();
-}
-
-static inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
-	smp_mb();
-	WRITE_ONCE(lock->lock, 0);
-}
-
-static inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
-	u32 *p = &lock->lock;
-	u32 tmp;
-
-	asm volatile (
-		"1:	ldex.w		%0, (%1) \n"
-		"	bnez		%0, 2f   \n"
-		"	movi		%0, 1    \n"
-		"	stex.w		%0, (%1) \n"
-		"	bez		%0, 1b   \n"
-		"	movi		%0, 0    \n"
-		"2:				 \n"
-		: "=&r" (tmp)
-		: "r"(p)
-		: "cc");
-
-	if (!tmp)
-		smp_mb();
-
-	return !tmp;
-}
-
-#define arch_spin_is_locked(x)	(READ_ONCE((x)->lock) != 0)
-
-/*
- * read lock/unlock/trylock
- */
-static inline void arch_read_lock(arch_rwlock_t *lock)
-{
-	u32 *p = &lock->lock;
-	u32 tmp;
-
-	asm volatile (
-		"1:	ldex.w		%0, (%1) \n"
-		"	blz		%0, 1b   \n"
-		"	addi		%0, 1    \n"
-		"	stex.w		%0, (%1) \n"
-		"	bez		%0, 1b   \n"
-		: "=&r" (tmp)
-		: "r"(p)
-		: "cc");
-	smp_mb();
-}
-
-static inline void arch_read_unlock(arch_rwlock_t *lock)
-{
-	u32 *p = &lock->lock;
-	u32 tmp;
-
-	smp_mb();
-	asm volatile (
-		"1:	ldex.w		%0, (%1) \n"
-		"	subi		%0, 1    \n"
-		"	stex.w		%0, (%1) \n"
-		"	bez		%0, 1b   \n"
-		: "=&r" (tmp)
-		: "r"(p)
-		: "cc");
-}
-
-static inline int arch_read_trylock(arch_rwlock_t *lock)
-{
-	u32 *p = &lock->lock;
-	u32 tmp;
-
-	asm volatile (
-		"1:	ldex.w		%0, (%1) \n"
-		"	blz		%0, 2f   \n"
-		"	addi		%0, 1    \n"
-		"	stex.w		%0, (%1) \n"
-		"	bez		%0, 1b   \n"
-		"	movi		%0, 0    \n"
-		"2:				 \n"
-		: "=&r" (tmp)
-		: "r"(p)
-		: "cc");
-
-	if (!tmp)
-		smp_mb();
-
-	return !tmp;
-}
-
-/*
- * write lock/unlock/trylock
- */
-static inline void arch_write_lock(arch_rwlock_t *lock)
-{
-	u32 *p = &lock->lock;
-	u32 tmp;
-
-	asm volatile (
-		"1:	ldex.w		%0, (%1) \n"
-		"	bnez		%0, 1b   \n"
-		"	subi		%0, 1    \n"
-		"	stex.w		%0, (%1) \n"
-		"	bez		%0, 1b   \n"
-		: "=&r" (tmp)
-		: "r"(p)
-		: "cc");
-	smp_mb();
-}
-
-static inline void arch_write_unlock(arch_rwlock_t *lock)
-{
-	smp_mb();
-	WRITE_ONCE(lock->lock, 0);
-}
-
-static inline int arch_write_trylock(arch_rwlock_t *lock)
-{
-	u32 *p = &lock->lock;
-	u32 tmp;
-
-	asm volatile (
-		"1:	ldex.w		%0, (%1) \n"
-		"	bnez		%0, 2f   \n"
-		"	subi		%0, 1    \n"
-		"	stex.w		%0, (%1) \n"
-		"	bez		%0, 1b   \n"
-		"	movi		%0, 0    \n"
-		"2:				 \n"
-		: "=&r" (tmp)
-		: "r"(p)
-		: "cc");
-
-	if (!tmp)
-		smp_mb();
-
-	return !tmp;
-}
-
-#endif /* CONFIG_QUEUED_RWLOCKS */
 #endif /* __ASM_CSKY_SPINLOCK_H */
diff --git a/arch/csky/include/asm/spinlock_types.h b/arch/csky/include/asm/spinlock_types.h
index 88b8243..8ff0f6f 100644
--- a/arch/csky/include/asm/spinlock_types.h
+++ b/arch/csky/include/asm/spinlock_types.h
@@ -22,16 +22,6 @@ typedef struct {
 
 #define __ARCH_SPIN_LOCK_UNLOCKED	{ { 0 } }
 
-#ifdef CONFIG_QUEUED_RWLOCKS
 #include <asm-generic/qrwlock_types.h>
 
-#else /* CONFIG_NR_CPUS > 2 */
-
-typedef struct {
-	u32 lock;
-} arch_rwlock_t;
-
-#define __ARCH_RW_LOCK_UNLOCKED		{ 0 }
-
-#endif /* CONFIG_QUEUED_RWLOCKS */
 #endif /* __ASM_CSKY_SPINLOCK_TYPES_H */
-- 
2.7.4



* Re: [PATCH v2 1/5] csky: Remove custom asm/atomic.h implementation
  2020-12-20 15:39 [PATCH v2 1/5] csky: Remove custom asm/atomic.h implementation guoren
                   ` (3 preceding siblings ...)
  2020-12-20 15:39 ` [PATCH v2 5/5] csky: Cleanup asm/spinlock.h guoren
@ 2021-01-07 11:19 ` Peter Zijlstra
  2021-01-21  7:10   ` Guo Ren
  4 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2021-01-07 11:19 UTC (permalink / raw)
  To: guoren
  Cc: arnd, linux-kernel, linux-csky, linux-arch, Guo Ren,
	Arnd Bergmann, Paul E . McKenney

On Sun, Dec 20, 2020 at 03:39:19PM +0000, guoren@kernel.org wrote:
> From: Guo Ren <guoren@linux.alibaba.com>
> 
> Use generic atomic implementation based on cmpxchg. So remove csky
> asm/atomic.h.

Clarification would be good. Typically cmpxchg() loops perform
sub-optimally on LL/SC architectures, due to the double loop
construction.
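
(Concretely: on csky the outer C loop retries cmpxchg(), and each
cmpxchg() call is itself an ldex.w/stex.w retry loop; an editor's
illustration of the nesting:)

	/* outer loop: retry on contention */
	do {
		old = READ_ONCE(v->counter);
	} while (cmpxchg(&v->counter, old, old + i) != old);
	/* inner loop: each cmpxchg() expands to its own
	 * "1: ldex.w ...; stex.w ...; bez 1b" sequence */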


* Re: [PATCH v2 4/5] csky: Fixup asm/cmpxchg.h with correct ordering barrier
  2020-12-20 15:39 ` [PATCH v2 4/5] csky: Fixup asm/cmpxchg.h with correct ordering barrier guoren
@ 2021-01-07 12:40   ` Peter Zijlstra
  2021-01-21  7:05     ` Guo Ren
  0 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2021-01-07 12:40 UTC (permalink / raw)
  To: guoren
  Cc: arnd, linux-kernel, linux-csky, linux-arch, Guo Ren, Paul E . McKenney

On Sun, Dec 20, 2020 at 03:39:22PM +0000, guoren@kernel.org wrote:


> +#define cmpxchg(ptr, o, n) 					\
> +({								\
> +	__typeof__(*(ptr)) __ret;				\
> +	__smp_release_fence();					\
> +	__ret = cmpxchg_relaxed(ptr, o, n);			\
> +	__smp_acquire_fence();					\
> +	__ret;							\
> +})

So you failed to Cc me on patch #2 that introduces these barriers. I've
dug it out, but I'm still terribly confused on all that.

On first reading the above looks wrong.

Could you also clarify the difference (if any) between your bar.brwarw
and sync instruction?

Specifically, about transitivity, or whatever we seem to be calling that
today.


* Re: [PATCH v2 5/5] csky: Cleanup asm/spinlock.h
  2020-12-20 15:39 ` [PATCH v2 5/5] csky: Cleanup asm/spinlock.h guoren
@ 2021-01-07 12:45   ` Peter Zijlstra
  0 siblings, 0 replies; 10+ messages in thread
From: Peter Zijlstra @ 2021-01-07 12:45 UTC (permalink / raw)
  To: guoren; +Cc: arnd, linux-kernel, linux-csky, linux-arch, Guo Ren, Will Deacon

On Sun, Dec 20, 2020 at 03:39:23PM +0000, guoren@kernel.org wrote:
> From: Guo Ren <guoren@linux.alibaba.com>
> 
> There are two implementation of spinlock in arch/csky:
>  - simple one (NR_CPU = 1,2)
>  - tick's one (NR_CPU = 3,4)
> Remove the simple one.
> 
> There is already smp_mb in spinlock, so remove the definition of
> smp_mb__after_spinlock.

Where? Note that with qspinlock the fast path is
atomic_try_cmpxchg_acquire(), which does not imply anything of the sort.
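
(For reference, the qspinlock fast path in question is acquire-only; a
sketch of queued_spin_lock() from include/asm-generic/qspinlock.h:)

static __always_inline void queued_spin_lock(struct qspinlock *lock)
{
	u32 val = 0;

	/* ACQUIRE ordering only, no full barrier, which is why
	 * smp_mb__after_spinlock() must supply smp_mb() itself. */
	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
		return;

	queued_spin_lock_slowpath(lock, val);
}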


* Re: [PATCH v2 4/5] csky: Fixup asm/cmpxchg.h with correct ordering barrier
  2021-01-07 12:40   ` Peter Zijlstra
@ 2021-01-21  7:05     ` Guo Ren
  0 siblings, 0 replies; 10+ messages in thread
From: Guo Ren @ 2021-01-21  7:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Arnd Bergmann, Linux Kernel Mailing List, linux-csky, linux-arch,
	Guo Ren, Paul E . McKenney

Hi Peter,

On Thu, Jan 7, 2021 at 8:41 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Sun, Dec 20, 2020 at 03:39:22PM +0000, guoren@kernel.org wrote:
>
>
> > +#define cmpxchg(ptr, o, n)                                   \
> > +({                                                           \
> > +     __typeof__(*(ptr)) __ret;                               \
> > +     __smp_release_fence();                                  \
> > +     __ret = cmpxchg_relaxed(ptr, o, n);                     \
> > +     __smp_acquire_fence();                                  \
> > +     __ret;                                                  \
> > +})
>
> So you failed to Cc me on patch #2 that introduces these barriers. I've
> dug it out, but I'm still terribly confused on all that.
>
> On first reading the above looks wrong.
>
> Could you also clarify the difference (if any) between your bar.brwarw
> and sync instruction?
>
> Specifically, about transitiviry, or whatever we seem to be calling that
> today.

bar.brwarw is just like the riscv fence rw,rw.
bar means barrier.
brw means reads and writes before the instruction must happen before it.
arw means reads and writes after the instruction must happen after it.
So it could also be bar.brarw / bar.arw / bar.brw / bar.braw.

sync means the CPU waits until all instructions in the pipeline have
completed before issuing the next one.
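
(Editor's gloss, assuming the naming above maps one-to-one onto RISC-V
fence notation; patch 2's fence choices then read as:)

#define __smp_acquire_fence()	__bar_brarw()	/* fence r,rw  */
#define __smp_release_fence()	__bar_brwaw()	/* fence rw,w  */
#define __smp_mb()		__bar_brwarw()	/* fence rw,rw */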

-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/


* Re: [PATCH v2 1/5] csky: Remove custom asm/atomic.h implementation
  2021-01-07 11:19 ` [PATCH v2 1/5] csky: Remove custom asm/atomic.h implementation Peter Zijlstra
@ 2021-01-21  7:10   ` Guo Ren
  0 siblings, 0 replies; 10+ messages in thread
From: Guo Ren @ 2021-01-21  7:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Arnd Bergmann, Linux Kernel Mailing List, linux-csky, linux-arch,
	Guo Ren, Arnd Bergmann, Paul E . McKenney

Hi Peter,

On Thu, Jan 7, 2021 at 7:19 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Sun, Dec 20, 2020 at 03:39:19PM +0000, guoren@kernel.org wrote:
> > From: Guo Ren <guoren@linux.alibaba.com>
> >
> > Use generic atomic implementation based on cmpxchg. So remove csky
> > asm/atomic.h.
>
> Clarification would be good. Typically cmpxchg() loops perform
> sub-optimal on LL/SC architectures, due to the double loop construction.

Yes, you are right. But I still want to use the common cmpxchg-based
implementation instead of my own. Maybe in the future we'll optimize
it back.

-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/
