loongarch.lists.linux.dev archive mirror
* [PATCH] LoongArch: Add qspinlock support
@ 2022-06-17 14:57 Huacai Chen
  2022-06-17 16:10 ` Arnd Bergmann
  2022-06-17 16:35 ` Guo Ren
  0 siblings, 2 replies; 18+ messages in thread
From: Huacai Chen @ 2022-06-17 14:57 UTC (permalink / raw)
  To: Arnd Bergmann, Huacai Chen
  Cc: loongarch, linux-arch, Xuefeng Li, Guo Ren, Xuerui Wang,
	Jiaxun Yang, Peter Zijlstra, Will Deacon, Ingo Molnar,
	Huacai Chen

On NUMA systems, the performance of qspinlock is better than that of the
generic spinlock. Below are the UnixBench test results on an 8-node
(4 cores per node, 32 cores in total) machine.

A. With generic spinlock:

System Benchmarks Index Values               BASELINE       RESULT    INDEX
Dhrystone 2 using register variables         116700.0  449574022.5  38523.9
Double-Precision Whetstone                       55.0      85190.4  15489.2
Execl Throughput                                 43.0      14696.2   3417.7
File Copy 1024 bufsize 2000 maxblocks          3960.0     143157.8    361.5
File Copy 256 bufsize 500 maxblocks            1655.0      37631.8    227.4
File Copy 4096 bufsize 8000 maxblocks          5800.0     444814.2    766.9
Pipe Throughput                               12440.0    5047490.7   4057.5
Pipe-based Context Switching                   4000.0    2021545.7   5053.9
Process Creation                                126.0      23829.8   1891.3
Shell Scripts (1 concurrent)                     42.4      33756.7   7961.5
Shell Scripts (8 concurrent)                      6.0       4062.9   6771.5
System Call Overhead                          15000.0    2479748.6   1653.2
                                                                   ========
System Benchmarks Index Score                                        2955.6

B. With qspinlock:

System Benchmarks Index Values               BASELINE       RESULT    INDEX
Dhrystone 2 using register variables         116700.0  449467876.9  38514.8
Double-Precision Whetstone                       55.0      85174.6  15486.3
Execl Throughput                                 43.0      14769.1   3434.7
File Copy 1024 bufsize 2000 maxblocks          3960.0     146150.5    369.1
File Copy 256 bufsize 500 maxblocks            1655.0      37496.8    226.6
File Copy 4096 bufsize 8000 maxblocks          5800.0     447527.0    771.6
Pipe Throughput                               12440.0    5175989.2   4160.8
Pipe-based Context Switching                   4000.0    2207747.8   5519.4
Process Creation                                126.0      25125.5   1994.1
Shell Scripts (1 concurrent)                     42.4      33461.2   7891.8
Shell Scripts (8 concurrent)                      6.0       4024.7   6707.8
System Call Overhead                          15000.0    2917278.6   1944.9
                                                                   ========
System Benchmarks Index Score                                        3040.1

Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/Kconfig                      |   1 +
 arch/loongarch/include/asm/Kbuild           |   5 +-
 arch/loongarch/include/asm/cmpxchg.h        |  14 +++
 arch/loongarch/include/asm/percpu.h         |   8 ++
 arch/loongarch/include/asm/spinlock.h       |  12 +++
 arch/loongarch/include/asm/spinlock_types.h |  11 ++
 arch/loongarch/kernel/Makefile              |   2 +-
 arch/loongarch/kernel/cmpxchg.c             | 105 ++++++++++++++++++++
 8 files changed, 154 insertions(+), 4 deletions(-)
 create mode 100644 arch/loongarch/include/asm/spinlock.h
 create mode 100644 arch/loongarch/include/asm/spinlock_types.h
 create mode 100644 arch/loongarch/kernel/cmpxchg.c

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 1920d52653b4..1ec220df751d 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -46,6 +46,7 @@ config LOONGARCH
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select ARCH_USE_QUEUED_RWLOCKS
+	select ARCH_USE_QUEUED_SPINLOCKS
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	select ARCH_WANTS_NO_INSTR
 	select BUILDTIME_TABLE_SORT
diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild
index 83bc0681e72b..a0eed6076c79 100644
--- a/arch/loongarch/include/asm/Kbuild
+++ b/arch/loongarch/include/asm/Kbuild
@@ -1,12 +1,11 @@
 # SPDX-License-Identifier: GPL-2.0
 generic-y += dma-contiguous.h
 generic-y += export.h
+generic-y += mcs_spinlock.h
 generic-y += parport.h
 generic-y += early_ioremap.h
 generic-y += qrwlock.h
-generic-y += qrwlock_types.h
-generic-y += spinlock.h
-generic-y += spinlock_types.h
+generic-y += qspinlock.h
 generic-y += rwsem.h
 generic-y += segment.h
 generic-y += user.h
diff --git a/arch/loongarch/include/asm/cmpxchg.h b/arch/loongarch/include/asm/cmpxchg.h
index 75b3a4478652..afcd05be010e 100644
--- a/arch/loongarch/include/asm/cmpxchg.h
+++ b/arch/loongarch/include/asm/cmpxchg.h
@@ -21,10 +21,17 @@
 		__ret;				\
 })
 
+extern unsigned long __xchg_small(volatile void *ptr, unsigned long x,
+				  unsigned int size);
+
 static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
 				   int size)
 {
 	switch (size) {
+	case 1:
+	case 2:
+		return __xchg_small(ptr, x, size);
+
 	case 4:
 		return __xchg_asm("amswap_db.w", (volatile u32 *)ptr, (u32)x);
 
@@ -67,10 +74,17 @@ static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
 	__ret;								\
 })
 
+extern unsigned long __cmpxchg_small(volatile void *ptr, unsigned long old,
+				     unsigned long new, unsigned int size);
+
 static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
 				      unsigned long new, unsigned int size)
 {
 	switch (size) {
+	case 1:
+	case 2:
+		return __cmpxchg_small(ptr, old, new, size);
+
 	case 4:
 		return __cmpxchg_asm("ll.w", "sc.w", (volatile u32 *)ptr,
 				     (u32)old, new);
diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h
index e6569f18c6dd..0bd6b0110198 100644
--- a/arch/loongarch/include/asm/percpu.h
+++ b/arch/loongarch/include/asm/percpu.h
@@ -123,6 +123,10 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
 						int size)
 {
 	switch (size) {
+	case 1:
+	case 2:
+		return __xchg_small((volatile void *)ptr, val, size);
+
 	case 4:
 		return __xchg_asm("amswap.w", (volatile u32 *)ptr, (u32)val);
 
@@ -204,9 +208,13 @@ do {									\
 #define this_cpu_write_4(pcp, val) _percpu_write(pcp, val)
 #define this_cpu_write_8(pcp, val) _percpu_write(pcp, val)
 
+#define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val)
+#define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val)
 #define this_cpu_xchg_4(pcp, val) _percpu_xchg(pcp, val)
 #define this_cpu_xchg_8(pcp, val) _percpu_xchg(pcp, val)
 
+#define this_cpu_cmpxchg_1(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
+#define this_cpu_cmpxchg_2(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
 #define this_cpu_cmpxchg_4(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
 #define this_cpu_cmpxchg_8(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
 
diff --git a/arch/loongarch/include/asm/spinlock.h b/arch/loongarch/include/asm/spinlock.h
new file mode 100644
index 000000000000..7cb3476999be
--- /dev/null
+++ b/arch/loongarch/include/asm/spinlock.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
+ */
+#ifndef _ASM_SPINLOCK_H
+#define _ASM_SPINLOCK_H
+
+#include <asm/processor.h>
+#include <asm/qspinlock.h>
+#include <asm/qrwlock.h>
+
+#endif /* _ASM_SPINLOCK_H */
diff --git a/arch/loongarch/include/asm/spinlock_types.h b/arch/loongarch/include/asm/spinlock_types.h
new file mode 100644
index 000000000000..7458d036c161
--- /dev/null
+++ b/arch/loongarch/include/asm/spinlock_types.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
+ */
+#ifndef _ASM_SPINLOCK_TYPES_H
+#define _ASM_SPINLOCK_TYPES_H
+
+#include <asm-generic/qspinlock_types.h>
+#include <asm-generic/qrwlock_types.h>
+
+#endif
diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile
index 940de9173542..07930921f7b5 100644
--- a/arch/loongarch/kernel/Makefile
+++ b/arch/loongarch/kernel/Makefile
@@ -5,7 +5,7 @@
 
 extra-y		:= head.o vmlinux.lds
 
-obj-y		+= cpu-probe.o cacheinfo.o env.o setup.o entry.o genex.o \
+obj-y		+= cpu-probe.o cacheinfo.o cmpxchg.o env.o setup.o entry.o genex.o \
 		   traps.o irq.o idle.o process.o dma.o mem.o io.o reset.o switch.o \
 		   elf.o syscall.o signal.o time.o topology.o inst.o ptrace.o vdso.o
 
diff --git a/arch/loongarch/kernel/cmpxchg.c b/arch/loongarch/kernel/cmpxchg.c
new file mode 100644
index 000000000000..4c83471c4e47
--- /dev/null
+++ b/arch/loongarch/kernel/cmpxchg.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author: Huacai Chen <chenhuacai@loongson.cn>
+ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
+ *
+ * Derived from MIPS:
+ * Copyright (C) 2017 Imagination Technologies
+ * Author: Paul Burton <paul.burton@mips.com>
+ */
+
+#include <linux/bug.h>
+#include <asm/barrier.h>
+#include <asm/cmpxchg.h>
+
+unsigned long __xchg_small(volatile void *ptr, unsigned long val, unsigned int size)
+{
+	u32 old32, mask, temp;
+	volatile u32 *ptr32;
+	unsigned int shift;
+
+	/* Check that ptr is naturally aligned */
+	WARN_ON((unsigned long)ptr & (size - 1));
+
+	/* Mask value to the correct size. */
+	mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
+	val &= mask;
+
+	/*
+	 * Calculate a shift & mask that correspond to the value we wish to
+	 * exchange within the naturally aligned 4 byte integer that includes
+	 * it.
+	 */
+	shift = (unsigned long)ptr & 0x3;
+	shift *= BITS_PER_BYTE;
+	mask <<= shift;
+
+	/*
+	 * Calculate a pointer to the naturally aligned 4 byte integer that
+	 * includes our byte of interest, and load its value.
+	 */
+	ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
+
+	asm volatile (
+	"1:	ll.w		%0, %3		\n"
+	"	andn		%1, %0, %4	\n"
+	"	or		%1, %1, %5	\n"
+	"	sc.w		%1, %2		\n"
+	"	beqz		%1, 1b		\n"
+	: "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
+	: GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (val << shift)
+	: "memory");
+
+	return (old32 & mask) >> shift;
+}
+
+unsigned long __cmpxchg_small(volatile void *ptr, unsigned long old,
+			      unsigned long new, unsigned int size)
+{
+	u32 old32, mask, temp;
+	volatile u32 *ptr32;
+	unsigned int shift;
+
+	/* Check that ptr is naturally aligned */
+	WARN_ON((unsigned long)ptr & (size - 1));
+
+	/* Mask inputs to the correct size. */
+	mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
+	old &= mask;
+	new &= mask;
+
+	/*
+	 * Calculate a shift & mask that correspond to the value we wish to
+	 * compare & exchange within the naturally aligned 4 byte integer
+	 * that includes it.
+	 */
+	shift = (unsigned long)ptr & 0x3;
+	shift *= BITS_PER_BYTE;
+	old <<= shift;
+	new <<= shift;
+	mask <<= shift;
+
+	/*
+	 * Calculate a pointer to the naturally aligned 4 byte integer that
+	 * includes our byte of interest, and load its value.
+	 */
+	ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
+
+	asm volatile (
+	"1:	ll.w		%0, %3		\n"
+	"	and		%1, %0, %4	\n"
+	"	bne		%1, %5, 2f	\n"
+	"	andn		%1, %0, %4	\n"
+	"	or		%1, %1, %6	\n"
+	"	sc.w		%1, %2		\n"
+	"	beqz		%1, 1b		\n"
+	"	b		3f		\n"
+	"2:					\n"
+	__WEAK_LLSC_MB
+	"3:					\n"
+	: "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
+	: GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (old), "Jr" (new)
+	: "memory");
+
+	return (old32 & mask) >> shift;
+}
-- 
2.27.0



* Re: [PATCH] LoongArch: Add qspinlock support
  2022-06-17 14:57 [PATCH] LoongArch: Add qspinlock support Huacai Chen
@ 2022-06-17 16:10 ` Arnd Bergmann
  2022-06-17 17:45   ` Guo Ren
  2022-06-19 15:23   ` Guo Ren
  2022-06-17 16:35 ` Guo Ren
  1 sibling, 2 replies; 18+ messages in thread
From: Arnd Bergmann @ 2022-06-17 16:10 UTC (permalink / raw)
  To: Huacai Chen
  Cc: Arnd Bergmann, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Guo Ren, Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar

On Fri, Jun 17, 2022 at 4:57 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
>
> On NUMA system, the performance of qspinlock is better than generic
> spinlock. Below is the UnixBench test results on a 8 nodes (4 cores
> per node, 32 cores in total) machine.
>

The performance increase is nice, but this is only half the story we need here.

I think the more important bit is how you can guarantee that the xchg16()
implementation is correct and always allows forward progress.

>@@ -123,6 +123,10 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
>                                                int size)
> {
>        switch (size) {
>+       case 1:
>+       case 2:
>+               return __xchg_small((volatile void *)ptr, val, size);
>+

Do you actually need the size 1 as well?

Generally speaking, I would like to rework the xchg()/cmpxchg() logic
to only cover the 32-bit and word-sized (possibly 64-bit) case, while
having separate optional 8-bit and 16-bit functions. I had a patch for
this in the past and can try to dig that out; this may be the time to
finally do that.

I see that the qspinlock() code actually calls a 'relaxed' version of xchg16(),
but you only implement the one with the full barrier. Is it possible to
directly provide a relaxed version that has something less than the
__WEAK_LLSC_MB?
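
For reference, the 16-bit exchange I mean is qspinlock's xchg_tail(); with
_Q_PENDING_BITS == 8 it is roughly the following (paraphrased from
kernel/locking/qspinlock.c):

static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
	/*
	 * Relaxed ordering is sufficient here because the caller makes
	 * sure the MCS node is initialized before publishing the new tail.
	 */
	return (u32)xchg_relaxed(&lock->tail,
				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}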

       Arnd


* Re: [PATCH] LoongArch: Add qspinlock support
  2022-06-17 14:57 [PATCH] LoongArch: Add qspinlock support Huacai Chen
  2022-06-17 16:10 ` Arnd Bergmann
@ 2022-06-17 16:35 ` Guo Ren
  1 sibling, 0 replies; 18+ messages in thread
From: Guo Ren @ 2022-06-17 16:35 UTC (permalink / raw)
  To: Huacai Chen, Peter Zijlstra
  Cc: Arnd Bergmann, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Will Deacon, Ingo Molnar

On Fri, Jun 17, 2022 at 10:55 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
>
> On NUMA system, the performance of qspinlock is better than generic
> spinlock. Below is the UnixBench test results on a 8 nodes (4 cores
> per node, 32 cores in total) machine.
>
> [...]
> diff --git a/arch/loongarch/kernel/cmpxchg.c b/arch/loongarch/kernel/cmpxchg.c
> new file mode 100644
> index 000000000000..4c83471c4e47
> --- /dev/null
> +++ b/arch/loongarch/kernel/cmpxchg.c
> @@ -0,0 +1,105 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Author: Huacai Chen <chenhuacai@loongson.cn>
> + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
> + *
> + * Derived from MIPS:
> + * Copyright (C) 2017 Imagination Technologies
> + * Author: Paul Burton <paul.burton@mips.com>
> + */
> +
> +#include <linux/bug.h>
> +#include <asm/barrier.h>
> +#include <asm/cmpxchg.h>
> +
> +unsigned long __xchg_small(volatile void *ptr, unsigned long val, unsigned int size)
> +{
> +       u32 old32, mask, temp;
> +       volatile u32 *ptr32;
> +       unsigned int shift;
> +
> +       /* Check that ptr is naturally aligned */
> +       WARN_ON((unsigned long)ptr & (size - 1));
> +
> +       /* Mask value to the correct size. */
> +       mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
> +       val &= mask;
> +
> +       /*
> +        * Calculate a shift & mask that correspond to the value we wish to
> +        * exchange within the naturally aligned 4 byte integer that includes
> +        * it.
> +        */
> +       shift = (unsigned long)ptr & 0x3;
> +       shift *= BITS_PER_BYTE;
> +       mask <<= shift;
> +
> +       /*
> +        * Calculate a pointer to the naturally aligned 4 byte integer that
> +        * includes our byte of interest, and load its value.
> +        */
> +       ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
> +
> +       asm volatile (
> +       "1:     ll.w            %0, %3          \n"
> +       "       andn            %1, %0, %4      \n"
> +       "       or              %1, %1, %5      \n"
> +       "       sc.w            %1, %2          \n"
> +       "       beqz            %1, 1b          \n"
The above depends on how the micro-architecture implements ll/sc with a
strong forward-progress guarantee, e.g.:
A. Just watch for remote writes on the snoop channel; that is a
monitor-style approach to cache coherency, and I think it gives only a
weak forward-progress guarantee, not a good ll/sc implementation.
B. Lock the snoop channel and block other remote write requests until an
sc/branch/interrupt/normal load/store happens. That is strong enough for
qspinlock, and only an interrupt could break the ll/sc pair. (The ISA
should write down such limitations in the spec, just like RISC-V does.)
C. Fuse ll + alu + sc into one atomic bus transaction; see "Atomic
transactions" in AMBA CHI on the Arm Developer site.

We are also preparing a similar patch for RISC-V, but I think your spec
should spell out some details of the ll/sc atomic forward-progress
guarantee.

For the code implementation only, I give Reviewed-by: Guo Ren
<guoren@kernel.org>

> +       : "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
> +       : GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (val << shift)
> +       : "memory");
> +
> +       return (old32 & mask) >> shift;
> +}
> [...]


-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/


* Re: [PATCH] LoongArch: Add qspinlock support
  2022-06-17 16:10 ` Arnd Bergmann
@ 2022-06-17 17:45   ` Guo Ren
  2022-06-17 18:59     ` Arnd Bergmann
  2022-06-18 12:50     ` WANG Xuerui
  2022-06-19 15:23   ` Guo Ren
  1 sibling, 2 replies; 18+ messages in thread
From: Guo Ren @ 2022-06-17 17:45 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Huacai Chen, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar

On Sat, Jun 18, 2022 at 12:11 AM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Fri, Jun 17, 2022 at 4:57 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> >
> > On NUMA system, the performance of qspinlock is better than generic
> > spinlock. Below is the UnixBench test results on a 8 nodes (4 cores
> > per node, 32 cores in total) machine.
> >
>
> The performance increase is nice, but this is only half the story we need here.
>
> I think the more important bit is how you can guarantee that the xchg16()
> implementation is correct and always allows forward progress.
>
> >@@ -123,6 +123,10 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
> >                                                int size)
> > {
> >        switch (size) {
> >+       case 1:
> >+       case 2:
> >+               return __xchg_small((volatile void *)ptr, val, size);
> >+
>
> Do you actually need the size 1 as well?
>
> Generally speaking, I would like to rework the xchg()/cmpxchg() logic
> to only cover the 32-bit and word-sized (possibly 64-bit) case, while
> having separate optional 8-bit and 16-bit functions. I had a patch for
Why not forbid 8-bit and 16-bit xchg()/cmpxchg() outright? E.g. move
qspinlock's xchg_tail() to a per-arch arch_xchg_tail().
That way Linux would not need to provide a mixed-size atomic operation
primitive.
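
Something like this, purely a hypothetical sketch of what I mean (the
arch_xchg_tail() hook is an invented name, not an existing kernel
interface):

#ifdef arch_xchg_tail
#define xchg_tail(lock, tail)	arch_xchg_tail(lock, tail)
#else
/* generic fallback: keep today's implementation on top of a 16-bit xchg */
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
	return (u32)xchg_relaxed(&lock->tail,
				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}
#endif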

What do you mean by "separate optional 8-bit and 16-bit functions" here?

> this in the past, and can try to dig that out, this may be the time to
> finally do that.
>
> I see that the qspinlock() code actually calls a 'relaxed' version of xchg16(),
> but you only implement the one with the full barrier. Is it possible to
> directly provide a relaxed version that has something less than the
> __WEAK_LLSC_MB?
I am also curious about __WEAK_LLSC_MB; it seems like magic. How does it
prevent preceding accesses from being reordered after the sc in a strong
cmpxchg?

#define __cmpxchg_asm(ld, st, m, old, new)                              \
({                                                                      \
        __typeof(old) __ret;                                            \
                                                                        \
        __asm__ __volatile__(                                           \
        "1:     " ld "  %0, %2          # __cmpxchg_asm \n"             \
        "       bne     %0, %z3, 2f                     \n"             \
        "       or      $t0, %z4, $zero                 \n"             \
        "       " st "  $t0, %1                         \n"             \
        "       beq     $zero, $t0, 1b                  \n"             \
        "2:                                             \n"             \
        __WEAK_LLSC_MB                                                  \

And its __smp_mb__xxx are just defined as a compiler barrier()?
#define __smp_mb__before_atomic()       barrier()
#define __smp_mb__after_atomic()        barrier()

>
>        Arnd



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/


* Re: [PATCH] LoongArch: Add qspinlock support
  2022-06-17 17:45   ` Guo Ren
@ 2022-06-17 18:59     ` Arnd Bergmann
  2022-06-17 23:19       ` Guo Ren
  2022-06-18 12:50     ` WANG Xuerui
  1 sibling, 1 reply; 18+ messages in thread
From: Arnd Bergmann @ 2022-06-17 18:59 UTC (permalink / raw)
  To: Guo Ren
  Cc: Arnd Bergmann, Huacai Chen, Huacai Chen, loongarch, linux-arch,
	Xuefeng Li, Xuerui Wang, Jiaxun Yang, Peter Zijlstra,
	Will Deacon, Ingo Molnar

On Fri, Jun 17, 2022 at 7:45 PM Guo Ren <guoren@kernel.org> wrote:
> On Sat, Jun 18, 2022 at 12:11 AM Arnd Bergmann <arnd@arndb.de> wrote:
> > >+
> >
> > Do you actually need the size 1 as well?
> >
> > Generally speaking, I would like to rework the xchg()/cmpxchg() logic
> > to only cover the 32-bit and word-sized (possibly 64-bit) case, while
> > having separate optional 8-bit and 16-bit functions. I had a patch for
> Why not prevent 8-bit and 16-bit xchg()/cmpxchg() directly? eg: move
> qspinlock xchg_tail to per arch_xchg_tail.
> That means Linux doesn't provide a mixed-size atomic operation primitive.
>
> What does your "separate optional 8-bit and 16-bit functions" mean here?

What I have in mind is something like

static inline  u8 arch_xchg8(u8 *ptr, u8 x) {...}
static inline u16 arch_xchg16(u16 *ptr, u16 x) {...}
static inline u32 arch_xchg32(u32 *ptr, u32 x) {...}
static inline u64 arch_xchg64(u64 *ptr, u64 x) {...}

#ifdef CONFIG_64BIT
#define xchg(ptr, x) (sizeof(*ptr) == 8) ? \
            arch_xchg64((u64*)ptr, (uintptr_t)x) : \
            arch_xchg32((u32*)ptr, x)
#else
#define xchg(ptr, x) arch_xchg32((u32*)ptr, (uintptr_t)x)
#endif

This means most of the helpers can actually be normal
inline functions, and only 64-bit architectures need the special
case of dealing with non-u32-sized pointers and 'long' values.
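
To make that concrete, here is a minimal compilable sketch of the dispatch,
with plain GCC __atomic builtins standing in for the per-architecture
helpers (the real ones would be the arch's inline-asm implementations):

#include <stdint.h>

static inline uint32_t arch_xchg32(uint32_t *ptr, uint32_t x)
{
	return __atomic_exchange_n(ptr, x, __ATOMIC_SEQ_CST);
}

static inline uint64_t arch_xchg64(uint64_t *ptr, uint64_t x)
{
	return __atomic_exchange_n(ptr, x, __ATOMIC_SEQ_CST);
}

extern unsigned long __xchg_bad_size(void);	/* never defined: link error on other sizes */

#define xchg(ptr, x)							       \
	(sizeof(*(ptr)) == 8 ? arch_xchg64((uint64_t *)(ptr), (uintptr_t)(x)) : \
	 sizeof(*(ptr)) == 4 ? arch_xchg32((uint32_t *)(ptr), (uintptr_t)(x)) : \
	 __xchg_bad_size())

Anything that is not 4 or 8 bytes then fails at link time, which is what
forces the 8-bit and 16-bit users to call the separate helpers explicitly.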

         Arnd


* Re: [PATCH] LoongArch: Add qspinlock support
  2022-06-17 18:59     ` Arnd Bergmann
@ 2022-06-17 23:19       ` Guo Ren
  2022-06-18  5:40         ` Arnd Bergmann
  0 siblings, 1 reply; 18+ messages in thread
From: Guo Ren @ 2022-06-17 23:19 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Huacai Chen, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar

On Sat, Jun 18, 2022 at 2:59 AM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Fri, Jun 17, 2022 at 7:45 PM Guo Ren <guoren@kernel.org> wrote:
> > On Sat, Jun 18, 2022 at 12:11 AM Arnd Bergmann <arnd@arndb.de> wrote:
> > > >+
> > >
> > > Do you actually need the size 1 as well?
> > >
> > > Generally speaking, I would like to rework the xchg()/cmpxchg() logic
> > > to only cover the 32-bit and word-sized (possibly 64-bit) case, while
> > > having separate optional 8-bit and 16-bit functions. I had a patch for
> > Why not prevent 8-bit and 16-bit xchg()/cmpxchg() directly? eg: move
> > qspinlock xchg_tail to per arch_xchg_tail.
> > That means Linux doesn't provide a mixed-size atomic operation primitive.
> >
> > What does your "separate optional 8-bit and 16-bit functions" mean here?
>
> What I have in mind is something like
>
> static inline  u8 arch_xchg8(u8 *ptr, u8 x) {...}
> static inline u16 arch_xchg16(u16 *ptr, u16 x) {...}
Yes, inline is very important. We should avoid an out-of-line procedure
call like the one in this patch. The qspinlock patch I am preparing for
riscv only deals with xchg16, and it is inline.


> static inline u32 arch_xchg32(u32 *ptr, u32 x) {...}
> static inline u64 arch_xchg64(u64 *ptr, u64 x) {...}
>
> #ifdef CONFIG_64BIT
> #define xchg(ptr, x) (sizeof(*ptr) == 8) ? \
>             arch_xchg64((u64*)ptr, (uintptr_t)x)  \
>             arch_xchg32((u32*)ptr, x)
> #else
> #define xchg(ptr, x) arch_xchg32((u32*)ptr, (uintptr_t)x)
> #endif
The above primitive implies only long & int type args are permitted, right?

>
> This means most of the helpers can actually be normal
> inline functions, and only 64-bit architectures need the special
> case of dealing with non-u32-sized pointers and 'long' values.
>
>          Arnd



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/


* Re: [PATCH] LoongArch: Add qspinlock support
  2022-06-17 23:19       ` Guo Ren
@ 2022-06-18  5:40         ` Arnd Bergmann
  2022-06-19 15:48           ` Guo Ren
  0 siblings, 1 reply; 18+ messages in thread
From: Arnd Bergmann @ 2022-06-18  5:40 UTC (permalink / raw)
  To: Guo Ren
  Cc: Arnd Bergmann, Huacai Chen, Huacai Chen, loongarch, linux-arch,
	Xuefeng Li, Xuerui Wang, Jiaxun Yang, Peter Zijlstra,
	Will Deacon, Ingo Molnar

On Sat, Jun 18, 2022 at 1:19 AM Guo Ren <guoren@kernel.org> wrote:
>
> > static inline u32 arch_xchg32(u32 *ptr, u32 x) {...}
> > static inline u64 arch_xchg64(u64 *ptr, u64 x) {...}
> >
> > #ifdef CONFIG_64BIT
> > #define xchg(ptr, x) (sizeof(*ptr) == 8) ? \
> >             arch_xchg64((u64*)ptr, (uintptr_t)x)  \
> >             arch_xchg32((u32*)ptr, x)
> > #else
> > #define xchg(ptr, x) arch_xchg32((u32*)ptr, (uintptr_t)x)
> > #endif
>
> The above primitive implies only long & int type args are permitted, right?

The idea is to allow any scalar or pointer type, but not structures or
unions. If we need to deal with those as well, the macro could be extended
accordingly, but I would prefer to limit it as much as possible.

There is already cmpxchg64(), which is used for types that are fixed to
64 bit integers even on 32-bit architectures, but it is rarely used except
to implement the atomic64_t helpers.

80% of the uses of cmpxchg() and xchg() deal with word-sized
quantities like 'unsigned long', or 'void *', but the others are almost
all fixed 32-bit quantities. We could change those to use cmpxchg32()
directly and simplify the cmpxchg() function further to only deal
with word-sized arguments, but I would not do that in the first step.
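
For illustration (made-up call sites, not specific kernel code), the common
case is exchanging a pointer- or long-sized value, while the rest are
almost all explicitly 32-bit:

	old_owner = xchg(&lock->owner, NULL);			/* word-sized: pointer */
	old_state = cmpxchg(&req->state, REQ_IDLE, REQ_BUSY);	/* fixed u32 */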

        Arnd


* Re: [PATCH] LoongArch: Add qspinlock support
  2022-06-17 17:45   ` Guo Ren
  2022-06-17 18:59     ` Arnd Bergmann
@ 2022-06-18 12:50     ` WANG Xuerui
  2022-06-19  4:28       ` hev
  1 sibling, 1 reply; 18+ messages in thread
From: WANG Xuerui @ 2022-06-18 12:50 UTC (permalink / raw)
  To: Guo Ren, Arnd Bergmann
  Cc: Huacai Chen, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar

On 6/18/22 01:45, Guo Ren wrote:
>
>> I see that the qspinlock() code actually calls a 'relaxed' version of xchg16(),
>> but you only implement the one with the full barrier. Is it possible to
>> directly provide a relaxed version that has something less than the
>> __WEAK_LLSC_MB?
> I am also curious that __WEAK_LLSC_MB is very magic. How does it
> prevent preceded accesses from happening after sc for a strong
> cmpxchg?
>
> #define __cmpxchg_asm(ld, st, m, old, new)                              \
> ({                                                                      \
>          __typeof(old) __ret;                                            \
>                                                                          \
>          __asm__ __volatile__(                                           \
>          "1:     " ld "  %0, %2          # __cmpxchg_asm \n"             \
>          "       bne     %0, %z3, 2f                     \n"             \
>          "       or      $t0, %z4, $zero                 \n"             \
>          "       " st "  $t0, %1                         \n"             \
>          "       beq     $zero, $t0, 1b                  \n"             \
>          "2:                                             \n"             \
>          __WEAK_LLSC_MB                                                  \
>
> And its __smp_mb__xxx are just defined as a compiler barrier()?
> #define __smp_mb__before_atomic()       barrier()
> #define __smp_mb__after_atomic()        barrier()
I know this one. There is only one type of barrier defined in v1.00
of LoongArch, namely the full barrier, but this is going to change.
Huacai hinted in the bringup patchset that the 3A6000 and later models
would have finer-grained barriers. So these could indeed be relaxed in
the future; Huacai just has to wait for the embargo to expire.
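
For the record, that single barrier is "dbar 0", so on current cores a full
memory barrier is roughly just the following (the macro name here is
illustrative, not necessarily the exact in-tree spelling):

#define mb()	__asm__ __volatile__("dbar 0" : : : "memory")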


* Re: [PATCH] LoongArch: Add qspinlock support
  2022-06-18 12:50     ` WANG Xuerui
@ 2022-06-19  4:28       ` hev
  2022-06-19 15:06         ` Guo Ren
  0 siblings, 1 reply; 18+ messages in thread
From: hev @ 2022-06-19  4:28 UTC (permalink / raw)
  To: Guo Ren, Arnd Bergmann, WANG Xuerui
  Cc: Huacai Chen, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Jiaxun Yang, Peter Zijlstra, Will Deacon, Ingo Molnar

Hello,

On Sat, Jun 18, 2022 at 8:59 PM WANG Xuerui <kernel@xen0n.name> wrote:
>
> On 6/18/22 01:45, Guo Ren wrote:
> >
> >> I see that the qspinlock() code actually calls a 'relaxed' version of xchg16(),
> >> but you only implement the one with the full barrier. Is it possible to
> >> directly provide a relaxed version that has something less than the
> >> __WEAK_LLSC_MB?
> > I am also curious that __WEAK_LLSC_MB is very magic. How does it
> > prevent preceded accesses from happening after sc for a strong
> > cmpxchg?
> >
> > #define __cmpxchg_asm(ld, st, m, old, new)                              \
> > ({                                                                      \
> >          __typeof(old) __ret;                                            \
> >                                                                          \
> >          __asm__ __volatile__(                                           \
> >          "1:     " ld "  %0, %2          # __cmpxchg_asm \n"             \
> >          "       bne     %0, %z3, 2f                     \n"             \
> >          "       or      $t0, %z4, $zero                 \n"             \
> >          "       " st "  $t0, %1                         \n"             \
> >          "       beq     $zero, $t0, 1b                  \n"             \
> >          "2:                                             \n"             \
> >          __WEAK_LLSC_MB                                                  \
> >
> > And its __smp_mb__xxx are just defined as a compiler barrier()?
> > #define __smp_mb__before_atomic()       barrier()
> > #define __smp_mb__after_atomic()        barrier()
> I know this one. There is only one type of barrier defined in the v1.00
> of LoongArch, that is the full barrier, but this is going to change.
> Huacai hinted in the bringup patchset that 3A6000 and later models would
> have finer-grained barriers. So these indeed could be relaxed in the
> future, just that Huacai has to wait for their embargo to expire.
>

IIRC, the Loongson LL/SC behaves differently from others:

Loongson:
LL: Full barrier + Load exclusive
SC: Store conditional + Full barrier

Others:
LL: Load exclusive + Acquire barrier
SC: Release barrier + Store conditional

So we only need to prevent compiler reordering before/after the atomic.
And this is why we need __WEAK_LLSC_MB: to prevent runtime reordering of
loads after the LL.
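
So, as I understand the generic fallback machinery (a sketch only, not the
literal in-tree macros), LoongArch can simply define arch_xchg() alone and
let include/linux/atomic/atomic-arch-fallback.h map the _relaxed/_acquire/
_release variants onto it; since ll.w/sc.w already imply full barriers,
there is nothing cheaper to provide anyway:

#define arch_xchg(ptr, x)						\
	((__typeof__(*(ptr)))__xchg((ptr), (unsigned long)(x), sizeof(*(ptr))))
/* no arch_xchg_relaxed() on purpose: the fallback aliases it to arch_xchg() */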

hev


* Re: [PATCH] LoongArch: Add qspinlock support
  2022-06-19  4:28       ` hev
@ 2022-06-19 15:06         ` Guo Ren
  2022-06-19 15:38           ` hev
  0 siblings, 1 reply; 18+ messages in thread
From: Guo Ren @ 2022-06-19 15:06 UTC (permalink / raw)
  To: hev
  Cc: Arnd Bergmann, WANG Xuerui, Huacai Chen, Huacai Chen, loongarch,
	linux-arch, Xuefeng Li, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar

On Sun, Jun 19, 2022 at 12:28 PM hev <r@hev.cc> wrote:
>
> Hello,
>
> On Sat, Jun 18, 2022 at 8:59 PM WANG Xuerui <kernel@xen0n.name> wrote:
> >
> > On 6/18/22 01:45, Guo Ren wrote:
> > >
> > >> I see that the qspinlock() code actually calls a 'relaxed' version of xchg16(),
> > >> but you only implement the one with the full barrier. Is it possible to
> > >> directly provide a relaxed version that has something less than the
> > >> __WEAK_LLSC_MB?
> > > I am also curious that __WEAK_LLSC_MB is very magic. How does it
> > > prevent preceded accesses from happening after sc for a strong
> > > cmpxchg?
> > >
> > > #define __cmpxchg_asm(ld, st, m, old, new)                              \
> > > ({                                                                      \
> > >          __typeof(old) __ret;                                            \
> > >                                                                          \
> > >          __asm__ __volatile__(                                           \
> > >          "1:     " ld "  %0, %2          # __cmpxchg_asm \n"             \
> > >          "       bne     %0, %z3, 2f                     \n"             \
> > >          "       or      $t0, %z4, $zero                 \n"             \
> > >          "       " st "  $t0, %1                         \n"             \
> > >          "       beq     $zero, $t0, 1b                  \n"             \
> > >          "2:                                             \n"             \
> > >          __WEAK_LLSC_MB                                                  \
> > >
> > > And its __smp_mb__xxx are just defined as a compiler barrier()?
> > > #define __smp_mb__before_atomic()       barrier()
> > > #define __smp_mb__after_atomic()        barrier()
> > I know this one. There is only one type of barrier defined in the v1.00
> > of LoongArch, that is the full barrier, but this is going to change.
> > Huacai hinted in the bringup patchset that 3A6000 and later models would
> > have finer-grained barriers. So these indeed could be relaxed in the
> > future, just that Huacai has to wait for their embargo to expire.
> >
>
> IIRC, The Loongson LL/SC behaves differently than others:
>
> Loongson:
> LL: Full barrier + Load exclusive
> SC: Store conditional + Full barrier
How about your "am"#asm_op"_db."?

Full barrier + AMO + Full barrier ?

>
> Others:
> LL: Load exclusive + Acquire barrier
> SC: Release barrier + Store conditional
>
> So we just need to prevent compiler reorder before/after atomic.
> And this is why we need __WEAK_LLSC_MB to prevent runtime reorder for
> loads after LL.
>
> hev



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/


* Re: [PATCH] LoongArch: Add qspinlock support
  2022-06-17 16:10 ` Arnd Bergmann
  2022-06-17 17:45   ` Guo Ren
@ 2022-06-19 15:23   ` Guo Ren
  1 sibling, 0 replies; 18+ messages in thread
From: Guo Ren @ 2022-06-19 15:23 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Huacai Chen, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar

On Sat, Jun 18, 2022 at 12:11 AM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Fri, Jun 17, 2022 at 4:57 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> >
> > On NUMA system, the performance of qspinlock is better than generic
> > spinlock. Below is the UnixBench test results on a 8 nodes (4 cores
> > per node, 32 cores in total) machine.
> >
>
> The performance increase is nice, but this is only half the story we need here.
>
> I think the more important bit is how you can guarantee that the xchg16()
> implementation is correct and always allows forward progress.
>
> >@@ -123,6 +123,10 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
> >                                                int size)
> > {
> >        switch (size) {
> >+       case 1:
> >+       case 2:
> >+               return __xchg_small((volatile void *)ptr, val, size);
> >+
>
> Do you actually need the size 1 as well?
>
> Generally speaking, I would like to rework the xchg()/cmpxchg() logic
> to only cover the 32-bit and word-sized (possibly 64-bit) case, while
> having separate optional 8-bit and 16-bit functions. I had a patch for
> this in the past, and can try to dig that out, this may be the time to
> finally do that.
>
> I see that the qspinlock() code actually calls a 'relaxed' version of xchg16(),
> but you only implement the one with the full barrier. Is it possible to
> directly provide a relaxed version that has something less than the
> __WEAK_LLSC_MB?
There is no __WEAK_LLSC_MB in __xchg_small, and it's full fence +
xchg16_relaxed() + full fence (as hev explained).

The __cmpxchg_small isn't related to qspinlock; we could drop it from
this patch.

>
>        Arnd



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/


* Re: [PATCH] LoongArch: Add qspinlock support
  2022-06-19 15:06         ` Guo Ren
@ 2022-06-19 15:38           ` hev
  0 siblings, 0 replies; 18+ messages in thread
From: hev @ 2022-06-19 15:38 UTC (permalink / raw)
  To: Guo Ren
  Cc: Arnd Bergmann, WANG Xuerui, Huacai Chen, Huacai Chen, loongarch,
	linux-arch, Xuefeng Li, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar

On Sun, Jun 19, 2022 at 11:06 PM Guo Ren <guoren@kernel.org> wrote:
>
> On Sun, Jun 19, 2022 at 12:28 PM hev <r@hev.cc> wrote:
> >
> > Hello,
> >
> > On Sat, Jun 18, 2022 at 8:59 PM WANG Xuerui <kernel@xen0n.name> wrote:
> > >
> > > On 6/18/22 01:45, Guo Ren wrote:
> > > >
> > > >> I see that the qspinlock() code actually calls a 'relaxed' version of xchg16(),
> > > >> but you only implement the one with the full barrier. Is it possible to
> > > >> directly provide a relaxed version that has something less than the
> > > >> __WEAK_LLSC_MB?
> > > > I am also curious that __WEAK_LLSC_MB is very magic. How does it
> > > > prevent preceded accesses from happening after sc for a strong
> > > > cmpxchg?
> > > >
> > > > #define __cmpxchg_asm(ld, st, m, old, new)                              \
> > > > ({                                                                      \
> > > >          __typeof(old) __ret;                                            \
> > > >                                                                          \
> > > >          __asm__ __volatile__(                                           \
> > > >          "1:     " ld "  %0, %2          # __cmpxchg_asm \n"             \
> > > >          "       bne     %0, %z3, 2f                     \n"             \
> > > >          "       or      $t0, %z4, $zero                 \n"             \
> > > >          "       " st "  $t0, %1                         \n"             \
> > > >          "       beq     $zero, $t0, 1b                  \n"             \
> > > >          "2:                                             \n"             \
> > > >          __WEAK_LLSC_MB                                                  \
> > > >
> > > > And its __smp_mb__xxx are just defined as a compiler barrier()?
> > > > #define __smp_mb__before_atomic()       barrier()
> > > > #define __smp_mb__after_atomic()        barrier()
> > > I know this one. There is only one type of barrier defined in the v1.00
> > > of LoongArch, that is the full barrier, but this is going to change.
> > > Huacai hinted in the bringup patchset that 3A6000 and later models would
> > > have finer-grained barriers. So these indeed could be relaxed in the
> > > future, just that Huacai has to wait for their embargo to expire.
> > >
> >
> > IIRC, The Loongson LL/SC behaves differently than others:
> >
> > Loongson:
> > LL: Full barrier + Load exclusive
> > SC: Store conditional + Full barrier
> How about your "am"#asm_op"_db."?
>
> Full barrier + AMO + Full barrier ?

Yes. AMO without '_db' is relaxed.
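
That matches the patch at the top of the thread, where the ordinary
__xchg() uses the fully ordered AMO and the percpu variant uses the relaxed
one (lines taken from the hunks quoted earlier):

	return __xchg_asm("amswap_db.w", (volatile u32 *)ptr, (u32)x);	/* __xchg(): fully ordered */
	return __xchg_asm("amswap.w", (volatile u32 *)ptr, (u32)val);	/* __percpu_xchg(): relaxed */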

hev


* Re: [PATCH] LoongArch: Add qspinlock support
  2022-06-18  5:40         ` Arnd Bergmann
@ 2022-06-19 15:48           ` Guo Ren
  2022-06-19 16:10             ` Arnd Bergmann
  0 siblings, 1 reply; 18+ messages in thread
From: Guo Ren @ 2022-06-19 15:48 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Huacai Chen, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar

On Sat, Jun 18, 2022 at 1:40 PM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Sat, Jun 18, 2022 at 1:19 AM Guo Ren <guoren@kernel.org> wrote:
> >
> > > static inline u32 arch_xchg32(u32 *ptr, u32 x) {...}
> > > static inline u64 arch_xchg64(u64 *ptr, u64 x) {...}
> > >
> > > #ifdef CONFIG_64BIT
> > > #define xchg(ptr, x) (sizeof(*ptr) == 8) ? \
> > >             arch_xchg64((u64*)ptr, (uintptr_t)x)  \
> > >             arch_xchg32((u32*)ptr, x)
> > > #else
> > > #define xchg(ptr, x) arch_xchg32((u32*)ptr, (uintptr_t)x)
> > > #endif
> >
> > The above primitive implies only long & int type args are permitted, right?
>
> The idea is to allow any scalar or pointer type, but not structures or
> unions. If we need to deal with those as well, the macro could be extended
> accordingly, but I would prefer to limit it as much as possible.
>
> There is already cmpxchg64(), which is used for types that are fixed to
> 64 bit integers even on 32-bit architectures, but it is rarely used except
> to implement the atomic64_t helpers.
A lot of 32-bit arches can't provide cmpxchg64 (they have nothing like arm's ldrexd/strexd).

Another question: Do you know why arm32 didn't implement
HAVE_CMPXCHG_DOUBLE with ldrexd/strexd?

>
> 80% of the uses of cmpxchg() and xchg() deal with word-sized
> quantities like 'unsigned long', or 'void *', but the others are almost
> all fixed 32-bit quantities. We could change those to use cmpxchg32()
> directly and simplify the cmpxchg() function further to only deal
> with word-sized arguments, but I would not do that in the first step.
Don't forget cmpxchg_double for this cleanup. When do you want to
restart the work?

>
>         Arnd



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/


* Re: [PATCH] LoongArch: Add qspinlock support
  2022-06-19 15:48           ` Guo Ren
@ 2022-06-19 16:10             ` Arnd Bergmann
  2022-06-20  9:49               ` Huacai Chen
  0 siblings, 1 reply; 18+ messages in thread
From: Arnd Bergmann @ 2022-06-19 16:10 UTC (permalink / raw)
  To: Guo Ren
  Cc: Arnd Bergmann, Huacai Chen, Huacai Chen, loongarch, linux-arch,
	Xuefeng Li, Xuerui Wang, Jiaxun Yang, Peter Zijlstra,
	Will Deacon, Ingo Molnar

On Sun, Jun 19, 2022 at 5:48 PM Guo Ren <guoren@kernel.org> wrote:
>
> On Sat, Jun 18, 2022 at 1:40 PM Arnd Bergmann <arnd@arndb.de> wrote:
> >
> > On Sat, Jun 18, 2022 at 1:19 AM Guo Ren <guoren@kernel.org> wrote:
> > >
> > > > static inline u32 arch_xchg32(u32 *ptr, u32 x) {...}
> > > > static inline u64 arch_xchg64(u64 *ptr, u64 x) {...}
> > > >
> > > > #ifdef CONFIG_64BIT
> > > > #define xchg(ptr, x) (sizeof(*ptr) == 8) ? \
> > > >             arch_xchg64((u64*)ptr, (uintptr_t)x)  \
> > > >             arch_xchg32((u32*)ptr, x)
> > > > #else
> > > > #define xchg(ptr, x) arch_xchg32((u32*)ptr, (uintptr_t)x)
> > > > #endif
> > >
> > > The above primitive implies only long & int type args are permitted, right?
> >
> > The idea is to allow any scalar or pointer type, but not structures or
> > unions. If we need to deal with those as well, the macro could be extended
> > accordingly, but I would prefer to limit it as much as possible.
> >
> > There is already cmpxchg64(), which is used for types that are fixed to
> > 64 bit integers even on 32-bit architectures, but it is rarely used except
> > to implement the atomic64_t helpers.
> A lot of 32bit arches couldn't provide cmpxchg64 (like arm's ldrexd/strexd).

Most 32-bit architectures also lack SMP support, so they can fall back to
the generic version from include/asm-generic/cmpxchg-local.h

> Another question: Do you know why arm32 didn't implement
> HAVE_CMPXCHG_DOUBLE with ldrexd/strexd?

I think it's just fairly obscure; the slub code appears to be the only
code that would use it.

> >
> > 80% of the uses of cmpxchg() and xchg() deal with word-sized
> > quantities like 'unsigned long', or 'void *', but the others are almost
> > all fixed 32-bit quantities. We could change those to use cmpxchg32()
> > directly and simplify the cmpxchg() function further to only deal
> > with word-sized arguments, but I would not do that in the first step.
> Don't forget cmpxchg_double for this cleanup, when do you want to
> restart the work?

I have no specific plans at the moment. If you or someone else likes
to look into it, I can dig out my old patch though.

The cmpxchg_double() call seems to already fit in, since it is an
inline function and does not expect arbitrary argument types.

       Arnd


* Re: [PATCH] LoongArch: Add qspinlock support
  2022-06-19 16:10             ` Arnd Bergmann
@ 2022-06-20  9:49               ` Huacai Chen
  2022-06-20 16:00                 ` Guo Ren
  0 siblings, 1 reply; 18+ messages in thread
From: Huacai Chen @ 2022-06-20  9:49 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Guo Ren, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar

Hi,

On Mon, Jun 20, 2022 at 12:11 AM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Sun, Jun 19, 2022 at 5:48 PM Guo Ren <guoren@kernel.org> wrote:
> >
> > On Sat, Jun 18, 2022 at 1:40 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > >
> > > On Sat, Jun 18, 2022 at 1:19 AM Guo Ren <guoren@kernel.org> wrote:
> > > >
> > > > > static inline u32 arch_xchg32(u32 *ptr, u32 x) {...}
> > > > > static inline u64 arch_xchg64(u64 *ptr, u64 x) {...}
> > > > >
> > > > > #ifdef CONFIG_64BIT
> > > > > #define xchg(ptr, x) (sizeof(*ptr) == 8) ? \
> > > > >             arch_xchg64((u64*)ptr, (uintptr_t)x)  \
> > > > >             arch_xchg32((u32*)ptr, x)
> > > > > #else
> > > > > #define xchg(ptr, x) arch_xchg32((u32*)ptr, (uintptr_t)x)
> > > > > #endif
> > > >
> > > > The above primitive implies only long & int type args are permitted, right?
> > >
> > > The idea is to allow any scalar or pointer type, but not structures or
> > > unions. If we need to deal with those as well, the macro could be extended
> > > accordingly, but I would prefer to limit it as much as possible.
> > >
> > > There is already cmpxchg64(), which is used for types that are fixed to
> > > 64 bit integers even on 32-bit architectures, but it is rarely used except
> > > to implement the atomic64_t helpers.
> > A lot of 32bit arches couldn't provide cmpxchg64 (like arm's ldrexd/strexd).
>
> Most 32-bit architectures also lack SMP support, so they can fall back to
> the generic version from include/asm-generic/cmpxchg-local.h
>
> > Another question: Do you know why arm32 didn't implement
> > HAVE_CMPXCHG_DOUBLE with ldrexd/strexd?
>
> I think it's just fairly obscure, the slub code appears to be the only
> code that would use it.
>
> > >
> > > 80% of the uses of cmpxchg() and xchg() deal with word-sized
> > > quantities like 'unsigned long', or 'void *', but the others are almost
> > > all fixed 32-bit quantities. We could change those to use cmpxchg32()
> > > directly and simplify the cmpxchg() function further to only deal
> > > with word-sized arguments, but I would not do that in the first step.
> > Don't forget cmpxchg_double for this cleanup, when do you want to
> > restart the work?
>
> I have no specific plans at the moment. If you or someone else likes
> to look into it, I can dig out my old patch though.
>
> The cmpxchg_double() call seems to already fit in, since it is an
> inline function and does not expect arbitrary argument types.
Thanks to all of you. :)

As Rui and Xuerui said, ll and sc in LoongArch both have implicit full
barriers, so there is no "relaxed" version.

The __WEAK_LLSC_MB in __cmpxchg_small() has nothing to do with ll and
sc themselves; we need a barrier at the branch target just because
Loongson-3A5000 has a hardware flaw (which will be fixed in
Loongson-3A6000).

qspinlock just needs xchg_small(), but cmpxchg_small() is also useful
for percpu operations. So I plan to split this patch into two: the first
adds xchg_small() and cmpxchg_small(), and the second enables qspinlock.
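
For illustration, a sub-word exchange can be emulated on top of the
32-bit primitive by rewriting only the target halfword of its containing
aligned word. A rough C sketch (hypothetical name, little-endian assumed;
the real xchg_small() will be an ll.w/sc.w loop in assembly):

/*
 * Rough sketch: emulate a 16-bit xchg with the existing 32-bit cmpxchg
 * by rewriting only the target halfword of the aligned word that
 * contains it.  Assumes little-endian byte order.
 */
static inline u16 xchg16_sketch(volatile u16 *ptr, u16 val)
{
	u32 *wp = (u32 *)((unsigned long)ptr & ~0x3UL);	/* containing word */
	int shift = ((unsigned long)ptr & 0x2) * BITS_PER_BYTE; /* 0 or 16 */
	u32 mask = 0xffffU << shift;
	u32 old, new;

	do {
		old = READ_ONCE(*wp);
		new = (old & ~mask) | ((u32)val << shift);
	} while (cmpxchg(wp, old, new) != old);

	return (old & mask) >> shift;
}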

Huacai

>
>        Arnd

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] LoongArch: Add qspinlock support
  2022-06-20  9:49               ` Huacai Chen
@ 2022-06-20 16:00                 ` Guo Ren
  2022-06-21  0:59                   ` Huacai Chen
  0 siblings, 1 reply; 18+ messages in thread
From: Guo Ren @ 2022-06-20 16:00 UTC (permalink / raw)
  To: Huacai Chen
  Cc: Arnd Bergmann, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar

Hi Huacai & Arnd,

Please have a look at riscv qspinlock V5:
https://lore.kernel.org/linux-riscv/a498a2ff-2503-b25c-53c9-55f5f2480bf6@microchip.com/T/#t



On Mon, Jun 20, 2022 at 5:50 PM Huacai Chen <chenhuacai@kernel.org> wrote:
>
> Hi,
>
> On Mon, Jun 20, 2022 at 12:11 AM Arnd Bergmann <arnd@arndb.de> wrote:
> >
> > On Sun, Jun 19, 2022 at 5:48 PM Guo Ren <guoren@kernel.org> wrote:
> > >
> > > On Sat, Jun 18, 2022 at 1:40 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > > >
> > > > On Sat, Jun 18, 2022 at 1:19 AM Guo Ren <guoren@kernel.org> wrote:
> > > > >
> > > > > > static inline u32 arch_xchg32(u32 *ptr, u32 x) {...}
> > > > > > static inline u64 arch_xchg64(u64 *ptr, u64 x) {...}
> > > > > >
> > > > > > #ifdef CONFIG_64BIT
> > > > > > #define xchg(ptr, x) (sizeof(*ptr) == 8) ? \
> > > > > >             arch_xchg64((u64*)ptr, (uintptr_t)x)  \
> > > > > >             arch_xchg32((u32*)ptr, x)
> > > > > > #else
> > > > > > #define xchg(ptr, x) arch_xchg32((u32*)ptr, (uintptr_t)x)
> > > > > > #endif
> > > > >
> > > > > The above primitive implies only long & int type args are permitted, right?
> > > >
> > > > The idea is to allow any scalar or pointer type, but not structures or
> > > > unions. If we need to deal with those as well, the macro could be extended
> > > > accordingly, but I would prefer to limit it as much as possible.
> > > >
> > > > There is already cmpxchg64(), which is used for types that are fixed to
> > > > 64 bit integers even on 32-bit architectures, but it is rarely used except
> > > > to implement the atomic64_t helpers.
> > > A lot of 32bit arches couldn't provide cmpxchg64 (like arm's ldrexd/strexd).
> >
> > Most 32-bit architectures also lack SMP support, so they can fall back to
> > the generic version from include/asm-generic/cmpxchg-local.h
> >
> > > Another question: Do you know why arm32 didn't implement
> > > HAVE_CMPXCHG_DOUBLE with ldrexd/strexd?
> >
> > I think it's just fairly obscure, the slub code appears to be the only
> > code that would use it.
> >
> > > >
> > > > 80% of the uses of cmpxchg() and xchg() deal with word-sized
> > > > quantities like 'unsigned long', or 'void *', but the others are almost
> > > > all fixed 32-bit quantities. We could change those to use cmpxchg32()
> > > > directly and simplify the cmpxchg() function further to only deal
> > > > with word-sized arguments, but I would not do that in the first step.
> > > Don't forget cmpxchg_double for this cleanup, when do you want to
> > > restart the work?
> >
> > I have no specific plans at the moment. If you or someone else likes
> > to look into it, I can dig out my old patch though.
> >
> > The cmpxchg_double() call seems to already fit in, since it is an
> > inline function and does not expect arbitrary argument types.
> Thank all of you. :)
>
> As Rui and Xuerui said, ll and sc in LoongArch both have implicit full
> barriers, so there is no "relaxed" version.
>
> The __WEAK_LLSC_MB in __cmpxchg_small() have nothing to do with ll and
>  sc themselves, we need a barrier at the branch target just because
> Loongson-3A5000 has a hardware flaw (and will be fixed in
> Loongson-3A6000).
>
> qspinlock just needs xchg_small(), but cmpxchg_small() is also useful
> for percpu operations. So I plan to split this patch to two: the first
> add xchg_small() and cmpxchg_small(), the second enable qspinlock.
>
> Huacai
>
> >
> >        Arnd



--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] LoongArch: Add qspinlock support
  2022-06-20 16:00                 ` Guo Ren
@ 2022-06-21  0:59                   ` Huacai Chen
  2022-06-21  2:11                     ` Guo Ren
  0 siblings, 1 reply; 18+ messages in thread
From: Huacai Chen @ 2022-06-21  0:59 UTC (permalink / raw)
  To: Guo Ren
  Cc: Arnd Bergmann, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar

Hi,

On Tue, Jun 21, 2022 at 12:01 AM Guo Ren <guoren@kernel.org> wrote:
>
> Hi Huacai & Arnd,
>
> Please have a look at riscv qspinlock V5:
> https://lore.kernel.org/linux-riscv/a498a2ff-2503-b25c-53c9-55f5f2480bf6@microchip.com/T/#t
From my point of view, we can simply drop RISCV_USE_QUEUED_SPINLOCKS,
unless the ticket spinlock is better than qspinlock in the !NUMA case.

Huacai

>
>
>
> On Mon, Jun 20, 2022 at 5:50 PM Huacai Chen <chenhuacai@kernel.org> wrote:
> >
> > Hi,
> >
> > On Mon, Jun 20, 2022 at 12:11 AM Arnd Bergmann <arnd@arndb.de> wrote:
> > >
> > > On Sun, Jun 19, 2022 at 5:48 PM Guo Ren <guoren@kernel.org> wrote:
> > > >
> > > > On Sat, Jun 18, 2022 at 1:40 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > > > >
> > > > > On Sat, Jun 18, 2022 at 1:19 AM Guo Ren <guoren@kernel.org> wrote:
> > > > > >
> > > > > > > static inline u32 arch_xchg32(u32 *ptr, u32 x) {...}
> > > > > > > static inline u64 arch_xchg64(u64 *ptr, u64 x) {...}
> > > > > > >
> > > > > > > #ifdef CONFIG_64BIT
> > > > > > > #define xchg(ptr, x) (sizeof(*ptr) == 8) ? \
> > > > > > >             arch_xchg64((u64*)ptr, (uintptr_t)x)  \
> > > > > > >             arch_xchg32((u32*)ptr, x)
> > > > > > > #else
> > > > > > > #define xchg(ptr, x) arch_xchg32((u32*)ptr, (uintptr_t)x)
> > > > > > > #endif
> > > > > >
> > > > > > The above primitive implies only long & int type args are permitted, right?
> > > > >
> > > > > The idea is to allow any scalar or pointer type, but not structures or
> > > > > unions. If we need to deal with those as well, the macro could be extended
> > > > > accordingly, but I would prefer to limit it as much as possible.
> > > > >
> > > > > There is already cmpxchg64(), which is used for types that are fixed to
> > > > > 64 bit integers even on 32-bit architectures, but it is rarely used except
> > > > > to implement the atomic64_t helpers.
> > > > A lot of 32bit arches couldn't provide cmpxchg64 (like arm's ldrexd/strexd).
> > >
> > > Most 32-bit architectures also lack SMP support, so they can fall back to
> > > the generic version from include/asm-generic/cmpxchg-local.h
> > >
> > > > Another question: Do you know why arm32 didn't implement
> > > > HAVE_CMPXCHG_DOUBLE with ldrexd/strexd?
> > >
> > > I think it's just fairly obscure, the slub code appears to be the only
> > > code that would use it.
> > >
> > > > >
> > > > > 80% of the uses of cmpxchg() and xchg() deal with word-sized
> > > > > quantities like 'unsigned long', or 'void *', but the others are almost
> > > > > all fixed 32-bit quantities. We could change those to use cmpxchg32()
> > > > > directly and simplify the cmpxchg() function further to only deal
> > > > > with word-sized arguments, but I would not do that in the first step.
> > > > Don't forget cmpxchg_double for this cleanup, when do you want to
> > > > restart the work?
> > >
> > > I have no specific plans at the moment. If you or someone else likes
> > > to look into it, I can dig out my old patch though.
> > >
> > > The cmpxchg_double() call seems to already fit in, since it is an
> > > inline function and does not expect arbitrary argument types.
> > Thank all of you. :)
> >
> > As Rui and Xuerui said, ll and sc in LoongArch both have implicit full
> > barriers, so there is no "relaxed" version.
> >
> > The __WEAK_LLSC_MB in __cmpxchg_small() have nothing to do with ll and
> >  sc themselves, we need a barrier at the branch target just because
> > Loongson-3A5000 has a hardware flaw (and will be fixed in
> > Loongson-3A6000).
> >
> > qspinlock just needs xchg_small(), but cmpxchg_small() is also useful
> > for percpu operations. So I plan to split this patch to two: the first
> > add xchg_small() and cmpxchg_small(), the second enable qspinlock.
> >
> > Huacai
> >
> > >
> > >        Arnd
>
>
>
> --
> Best Regards
>  Guo Ren
>
> ML: https://lore.kernel.org/linux-csky/
>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] LoongArch: Add qspinlock support
  2022-06-21  0:59                   ` Huacai Chen
@ 2022-06-21  2:11                     ` Guo Ren
  0 siblings, 0 replies; 18+ messages in thread
From: Guo Ren @ 2022-06-21  2:11 UTC (permalink / raw)
  To: Huacai Chen
  Cc: Arnd Bergmann, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar

On Tue, Jun 21, 2022 at 8:59 AM Huacai Chen <chenhuacai@kernel.org> wrote:
>
> Hi,
>
> On Tue, Jun 21, 2022 at 12:01 AM Guo Ren <guoren@kernel.org> wrote:
> >
> > Hi Huacai & Arnd,
> >
> > Please have a look at riscv qspinlock V5:
> > https://lore.kernel.org/linux-riscv/a498a2ff-2503-b25c-53c9-55f5f2480bf6@microchip.com/T/#t
> From my point of view, we can simply drop RISCV_USE_QUEUED_SPINLOCKS,
> unless ticket spinlock is better than qspinlock in the !NUMA case.
The RISC-V ISA has a flexible LR/SC forward-progress guarantee
definition, which means some processors must still stay with the ticket
lock.

LoongArch should spell out how strong its LR/SC forward-progress
guarantee is. If some versions of the processor give a strong guarantee
but others don't, maybe you should consider the RISC-V style.
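
To make the concern concrete, the retry pattern behind an LL/SC-based
exchange looks roughly like this; lr_w()/sc_w() are hypothetical
stand-ins for the lr.w/sc.w instructions, not real kernel helpers:

/* Hypothetical stand-ins for the lr.w/sc.w instructions, for illustration
 * only; sc_w() returns nonzero when the reservation has been lost. */
u32 lr_w(u32 *ptr);
int sc_w(u32 *ptr, u32 val);

/*
 * Without a hardware forward-progress guarantee, another core can break
 * the reservation on every iteration, so this loop may starve under heavy
 * contention.  A ticket lock needs only one bounded atomic add per
 * acquisition, which is why such cores are safer with ticket spinlocks.
 */
static inline u32 xchg32_llsc_sketch(u32 *ptr, u32 newval)
{
	u32 old;

	do {
		old = lr_w(ptr);		/* load-reserved */
	} while (sc_w(ptr, newval));		/* store-conditional may keep failing */

	return old;
}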

>
> Huacai
>
> >
> >
> >
> > On Mon, Jun 20, 2022 at 5:50 PM Huacai Chen <chenhuacai@kernel.org> wrote:
> > >
> > > Hi,
> > >
> > > On Mon, Jun 20, 2022 at 12:11 AM Arnd Bergmann <arnd@arndb.de> wrote:
> > > >
> > > > On Sun, Jun 19, 2022 at 5:48 PM Guo Ren <guoren@kernel.org> wrote:
> > > > >
> > > > > On Sat, Jun 18, 2022 at 1:40 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > > > > >
> > > > > > On Sat, Jun 18, 2022 at 1:19 AM Guo Ren <guoren@kernel.org> wrote:
> > > > > > >
> > > > > > > > static inline u32 arch_xchg32(u32 *ptr, u32 x) {...}
> > > > > > > > static inline u64 arch_xchg64(u64 *ptr, u64 x) {...}
> > > > > > > >
> > > > > > > > #ifdef CONFIG_64BIT
> > > > > > > > #define xchg(ptr, x) (sizeof(*ptr) == 8) ? \
> > > > > > > >             arch_xchg64((u64*)ptr, (uintptr_t)x)  \
> > > > > > > >             arch_xchg32((u32*)ptr, x)
> > > > > > > > #else
> > > > > > > > #define xchg(ptr, x) arch_xchg32((u32*)ptr, (uintptr_t)x)
> > > > > > > > #endif
> > > > > > >
> > > > > > > The above primitive implies only long & int type args are permitted, right?
> > > > > >
> > > > > > The idea is to allow any scalar or pointer type, but not structures or
> > > > > > unions. If we need to deal with those as well, the macro could be extended
> > > > > > accordingly, but I would prefer to limit it as much as possible.
> > > > > >
> > > > > > There is already cmpxchg64(), which is used for types that are fixed to
> > > > > > 64 bit integers even on 32-bit architectures, but it is rarely used except
> > > > > > to implement the atomic64_t helpers.
> > > > > A lot of 32bit arches couldn't provide cmpxchg64 (like arm's ldrexd/strexd).
> > > >
> > > > Most 32-bit architectures also lack SMP support, so they can fall back to
> > > > the generic version from include/asm-generic/cmpxchg-local.h
> > > >
> > > > > Another question: Do you know why arm32 didn't implement
> > > > > HAVE_CMPXCHG_DOUBLE with ldrexd/strexd?
> > > >
> > > > I think it's just fairly obscure, the slub code appears to be the only
> > > > code that would use it.
> > > >
> > > > > >
> > > > > > 80% of the uses of cmpxchg() and xchg() deal with word-sized
> > > > > > quantities like 'unsigned long', or 'void *', but the others are almost
> > > > > > all fixed 32-bit quantities. We could change those to use cmpxchg32()
> > > > > > directly and simplify the cmpxchg() function further to only deal
> > > > > > with word-sized arguments, but I would not do that in the first step.
> > > > > Don't forget cmpxchg_double for this cleanup, when do you want to
> > > > > restart the work?
> > > >
> > > > I have no specific plans at the moment. If you or someone else likes
> > > > to look into it, I can dig out my old patch though.
> > > >
> > > > The cmpxchg_double() call seems to already fit in, since it is an
> > > > inline function and does not expect arbitrary argument types.
> > > Thank all of you. :)
> > >
> > > As Rui and Xuerui said, ll and sc in LoongArch both have implicit full
> > > barriers, so there is no "relaxed" version.
> > >
> > > The __WEAK_LLSC_MB in __cmpxchg_small() have nothing to do with ll and
> > >  sc themselves, we need a barrier at the branch target just because
> > > Loongson-3A5000 has a hardware flaw (and will be fixed in
> > > Loongson-3A6000).
> > >
> > > qspinlock just needs xchg_small(), but cmpxchg_small() is also useful
> > > for percpu operations. So I plan to split this patch to two: the first
> > > add xchg_small() and cmpxchg_small(), the second enable qspinlock.
> > >
> > > Huacai
> > >
> > > >
> > > >        Arnd
> >
> >
> >
> > --
> > Best Regards
> >  Guo Ren
> >
> > ML: https://lore.kernel.org/linux-csky/
> >



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2022-06-21  2:11 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-17 14:57 [PATCH] LoongArch: Add qspinlock support Huacai Chen
2022-06-17 16:10 ` Arnd Bergmann
2022-06-17 17:45   ` Guo Ren
2022-06-17 18:59     ` Arnd Bergmann
2022-06-17 23:19       ` Guo Ren
2022-06-18  5:40         ` Arnd Bergmann
2022-06-19 15:48           ` Guo Ren
2022-06-19 16:10             ` Arnd Bergmann
2022-06-20  9:49               ` Huacai Chen
2022-06-20 16:00                 ` Guo Ren
2022-06-21  0:59                   ` Huacai Chen
2022-06-21  2:11                     ` Guo Ren
2022-06-18 12:50     ` WANG Xuerui
2022-06-19  4:28       ` hev
2022-06-19 15:06         ` Guo Ren
2022-06-19 15:38           ` hev
2022-06-19 15:23   ` Guo Ren
2022-06-17 16:35 ` Guo Ren
