* [PATCH V2 1/2] LoongArch: Add subword xchg/cmpxchg emulation
@ 2022-06-23  4:47 Huacai Chen
  2022-06-23  4:47 ` [PATCH V2 2/2] LoongArch: Add qspinlock support Huacai Chen
  2022-06-23  6:49 ` [PATCH V2 1/2] LoongArch: Add subword xchg/cmpxchg emulation Guo Ren
  0 siblings, 2 replies; 16+ messages in thread
From: Huacai Chen @ 2022-06-23  4:47 UTC (permalink / raw)
  To: Arnd Bergmann, Huacai Chen
  Cc: loongarch, linux-arch, Xuefeng Li, Guo Ren, Xuerui Wang,
	Jiaxun Yang, Peter Zijlstra, Will Deacon, Ingo Molnar,
	Huacai Chen, Rui Wang

LoongArch natively supports only 32-bit/64-bit xchg/cmpxchg. However, percpu
operations and qspinlock need 8-bit/16-bit xchg/cmpxchg. On NUMA systems,
the performance of subword xchg/cmpxchg emulation is better than that of the
generic implementation (especially for qspinlock; data will be shown in the
next patch). This patch (of 2) adds subword xchg/cmpxchg emulation with
ll/sc and enables its use for percpu operations.
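
As a rough illustration (not part of the patch), the same idea can be written
in portable C: a 1-/2-byte access is widened to a compare-and-swap on the
naturally aligned 32-bit word that contains it, assuming the little-endian
layout used by LoongArch. The function name and the GCC __atomic builtin
below are only for illustration:

#include <stdint.h>

/*
 * Sketch only: emulate a subword cmpxchg with a CAS on the containing
 * naturally aligned 32-bit word (little-endian layout assumed).
 */
static unsigned int cmpxchg_small_sketch(volatile void *ptr, unsigned int old,
					 unsigned int new, unsigned int size)
{
	unsigned int shift = ((uintptr_t)ptr & 0x3) * 8;
	uint32_t mask = ((size == 1) ? 0xffU : 0xffffU) << shift;
	volatile uint32_t *ptr32 = (volatile uint32_t *)((uintptr_t)ptr & ~(uintptr_t)0x3);
	uint32_t old32, new32;

	do {
		old32 = *ptr32;
		/* Subword value does not match 'old': fail like cmpxchg does. */
		if ((old32 & mask) != ((old << shift) & mask))
			break;
		new32 = (old32 & ~mask) | ((new << shift) & mask);
	} while (!__atomic_compare_exchange_n(ptr32, &old32, new32, 0,
					      __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));

	return (old32 & mask) >> shift;
}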

Signed-off-by: Rui Wang <wangrui@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/include/asm/cmpxchg.h | 98 +++++++++++++++++++++++++++-
 arch/loongarch/include/asm/percpu.h  |  8 +++
 2 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/arch/loongarch/include/asm/cmpxchg.h b/arch/loongarch/include/asm/cmpxchg.h
index 75b3a4478652..967e64bf2c37 100644
--- a/arch/loongarch/include/asm/cmpxchg.h
+++ b/arch/loongarch/include/asm/cmpxchg.h
@@ -5,8 +5,9 @@
 #ifndef __ASM_CMPXCHG_H
 #define __ASM_CMPXCHG_H
 
-#include <asm/barrier.h>
+#include <linux/bits.h>
 #include <linux/build_bug.h>
+#include <asm/barrier.h>
 
 #define __xchg_asm(amswap_db, m, val)		\
 ({						\
@@ -21,10 +22,53 @@
 		__ret;				\
 })
 
+static inline unsigned int __xchg_small(volatile void *ptr, unsigned int val,
+					unsigned int size)
+{
+	unsigned int shift;
+	u32 old32, mask, temp;
+	volatile u32 *ptr32;
+
+	/* Mask value to the correct size. */
+	mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
+	val &= mask;
+
+	/*
+	 * Calculate a shift & mask that correspond to the value we wish to
+	 * exchange within the naturally aligned 4 byte integer that includes
+	 * it.
+	 */
+	shift = (unsigned long)ptr & 0x3;
+	shift *= BITS_PER_BYTE;
+	mask <<= shift;
+
+	/*
+	 * Calculate a pointer to the naturally aligned 4 byte integer that
+	 * includes our byte of interest, and load its value.
+	 */
+	ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
+
+	asm volatile (
+	"1:	ll.w		%0, %3		\n"
+	"	andn		%1, %0, %z4	\n"
+	"	or		%1, %1, %z5	\n"
+	"	sc.w		%1, %2		\n"
+	"	beqz		%1, 1b		\n"
+	: "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
+	: GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (val << shift)
+	: "memory");
+
+	return (old32 & mask) >> shift;
+}
+
 static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
 				   int size)
 {
 	switch (size) {
+	case 1:
+	case 2:
+		return __xchg_small(ptr, x, size);
+
 	case 4:
 		return __xchg_asm("amswap_db.w", (volatile u32 *)ptr, (u32)x);
 
@@ -67,10 +111,62 @@ static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
 	__ret;								\
 })
 
+static inline unsigned int __cmpxchg_small(volatile void *ptr, unsigned int old,
+					   unsigned int new, unsigned int size)
+{
+	unsigned int shift;
+	u32 old32, mask, temp;
+	volatile u32 *ptr32;
+
+	/* Mask inputs to the correct size. */
+	mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
+	old &= mask;
+	new &= mask;
+
+	/*
+	 * Calculate a shift & mask that correspond to the value we wish to
+	 * compare & exchange within the naturally aligned 4 byte integer
+	 * that includes it.
+	 */
+	shift = (unsigned long)ptr & 0x3;
+	shift *= BITS_PER_BYTE;
+	old <<= shift;
+	new <<= shift;
+	mask <<= shift;
+
+	/*
+	 * Calculate a pointer to the naturally aligned 4 byte integer that
+	 * includes our byte of interest, and load its value.
+	 */
+	ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
+
+	asm volatile (
+	"1:	ll.w		%0, %3		\n"
+	"	and		%1, %0, %z4	\n"
+	"	bne		%1, %z5, 2f	\n"
+	"	andn		%1, %0, %z4	\n"
+	"	or		%1, %1, %z6	\n"
+	"	sc.w		%1, %2		\n"
+	"	beqz		%1, 1b		\n"
+	"	b		3f		\n"
+	"2:					\n"
+	__WEAK_LLSC_MB
+	"3:					\n"
+	: "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
+	: GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (old), "Jr" (new)
+	: "memory");
+
+	return (old32 & mask) >> shift;
+}
+
 static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
 				      unsigned long new, unsigned int size)
 {
 	switch (size) {
+	case 1:
+	case 2:
+		return __cmpxchg_small(ptr, old, new, size);
+
 	case 4:
 		return __cmpxchg_asm("ll.w", "sc.w", (volatile u32 *)ptr,
 				     (u32)old, new);
diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h
index e6569f18c6dd..0bd6b0110198 100644
--- a/arch/loongarch/include/asm/percpu.h
+++ b/arch/loongarch/include/asm/percpu.h
@@ -123,6 +123,10 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
 						int size)
 {
 	switch (size) {
+	case 1:
+	case 2:
+		return __xchg_small((volatile void *)ptr, val, size);
+
 	case 4:
 		return __xchg_asm("amswap.w", (volatile u32 *)ptr, (u32)val);
 
@@ -204,9 +208,13 @@ do {									\
 #define this_cpu_write_4(pcp, val) _percpu_write(pcp, val)
 #define this_cpu_write_8(pcp, val) _percpu_write(pcp, val)
 
+#define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val)
+#define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val)
 #define this_cpu_xchg_4(pcp, val) _percpu_xchg(pcp, val)
 #define this_cpu_xchg_8(pcp, val) _percpu_xchg(pcp, val)
 
+#define this_cpu_cmpxchg_1(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
+#define this_cpu_cmpxchg_2(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
 #define this_cpu_cmpxchg_4(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
 #define this_cpu_cmpxchg_8(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
 
-- 
2.27.0



* [PATCH V2 2/2] LoongArch: Add qspinlock support
  2022-06-23  4:47 [PATCH V2 1/2] LoongArch: Add subword xchg/cmpxchg emulation Huacai Chen
@ 2022-06-23  4:47 ` Huacai Chen
  2022-06-23  5:44   ` Guo Ren
  2022-06-23  6:49 ` [PATCH V2 1/2] LoongArch: Add subword xchg/cmpxchg emulation Guo Ren
  1 sibling, 1 reply; 16+ messages in thread
From: Huacai Chen @ 2022-06-23  4:47 UTC (permalink / raw)
  To: Arnd Bergmann, Huacai Chen
  Cc: loongarch, linux-arch, Xuefeng Li, Guo Ren, Xuerui Wang,
	Jiaxun Yang, Peter Zijlstra, Will Deacon, Ingo Molnar,
	Huacai Chen, Rui Wang

On NUMA systems, the performance of qspinlock is better than that of the
generic spinlock. Below are the UnixBench test results on an 8-node
(4 cores per node, 32 cores in total) machine.

A. With generic spinlock:

System Benchmarks Index Values               BASELINE       RESULT    INDEX
Dhrystone 2 using register variables         116700.0  449574022.5  38523.9
Double-Precision Whetstone                       55.0      85190.4  15489.2
Execl Throughput                                 43.0      14696.2   3417.7
File Copy 1024 bufsize 2000 maxblocks          3960.0     143157.8    361.5
File Copy 256 bufsize 500 maxblocks            1655.0      37631.8    227.4
File Copy 4096 bufsize 8000 maxblocks          5800.0     444814.2    766.9
Pipe Throughput                               12440.0    5047490.7   4057.5
Pipe-based Context Switching                   4000.0    2021545.7   5053.9
Process Creation                                126.0      23829.8   1891.3
Shell Scripts (1 concurrent)                     42.4      33756.7   7961.5
Shell Scripts (8 concurrent)                      6.0       4062.9   6771.5
System Call Overhead                          15000.0    2479748.6   1653.2
                                                                   ========
System Benchmarks Index Score                                        2955.6

B. With qspinlock:

System Benchmarks Index Values               BASELINE       RESULT    INDEX
Dhrystone 2 using register variables         116700.0  449467876.9  38514.8
Double-Precision Whetstone                       55.0      85174.6  15486.3
Execl Throughput                                 43.0      14769.1   3434.7
File Copy 1024 bufsize 2000 maxblocks          3960.0     146150.5    369.1
File Copy 256 bufsize 500 maxblocks            1655.0      37496.8    226.6
File Copy 4096 bufsize 8000 maxblocks          5800.0     447527.0    771.6
Pipe Throughput                               12440.0    5175989.2   4160.8
Pipe-based Context Switching                   4000.0    2207747.8   5519.4
Process Creation                                126.0      25125.5   1994.1
Shell Scripts (1 concurrent)                     42.4      33461.2   7891.8
Shell Scripts (8 concurrent)                      6.0       4024.7   6707.8
System Call Overhead                          15000.0    2917278.6   1944.9
                                                                   ========
System Benchmarks Index Score                                        3040.1

Signed-off-by: Rui Wang <wangrui@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/Kconfig                      |  1 +
 arch/loongarch/include/asm/Kbuild           |  5 ++---
 arch/loongarch/include/asm/spinlock.h       | 12 ++++++++++++
 arch/loongarch/include/asm/spinlock_types.h | 11 +++++++++++
 4 files changed, 26 insertions(+), 3 deletions(-)
 create mode 100644 arch/loongarch/include/asm/spinlock.h
 create mode 100644 arch/loongarch/include/asm/spinlock_types.h

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 1920d52653b4..1ec220df751d 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -46,6 +46,7 @@ config LOONGARCH
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select ARCH_USE_QUEUED_RWLOCKS
+	select ARCH_USE_QUEUED_SPINLOCKS
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	select ARCH_WANTS_NO_INSTR
 	select BUILDTIME_TABLE_SORT
diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild
index 83bc0681e72b..a0eed6076c79 100644
--- a/arch/loongarch/include/asm/Kbuild
+++ b/arch/loongarch/include/asm/Kbuild
@@ -1,12 +1,11 @@
 # SPDX-License-Identifier: GPL-2.0
 generic-y += dma-contiguous.h
 generic-y += export.h
+generic-y += mcs_spinlock.h
 generic-y += parport.h
 generic-y += early_ioremap.h
 generic-y += qrwlock.h
-generic-y += qrwlock_types.h
-generic-y += spinlock.h
-generic-y += spinlock_types.h
+generic-y += qspinlock.h
 generic-y += rwsem.h
 generic-y += segment.h
 generic-y += user.h
diff --git a/arch/loongarch/include/asm/spinlock.h b/arch/loongarch/include/asm/spinlock.h
new file mode 100644
index 000000000000..7cb3476999be
--- /dev/null
+++ b/arch/loongarch/include/asm/spinlock.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
+ */
+#ifndef _ASM_SPINLOCK_H
+#define _ASM_SPINLOCK_H
+
+#include <asm/processor.h>
+#include <asm/qspinlock.h>
+#include <asm/qrwlock.h>
+
+#endif /* _ASM_SPINLOCK_H */
diff --git a/arch/loongarch/include/asm/spinlock_types.h b/arch/loongarch/include/asm/spinlock_types.h
new file mode 100644
index 000000000000..7458d036c161
--- /dev/null
+++ b/arch/loongarch/include/asm/spinlock_types.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
+ */
+#ifndef _ASM_SPINLOCK_TYPES_H
+#define _ASM_SPINLOCK_TYPES_H
+
+#include <asm-generic/qspinlock_types.h>
+#include <asm-generic/qrwlock_types.h>
+
+#endif
-- 
2.27.0



* Re: [PATCH V2 2/2] LoongArch: Add qspinlock support
  2022-06-23  4:47 ` [PATCH V2 2/2] LoongArch: Add qspinlock support Huacai Chen
@ 2022-06-23  5:44   ` Guo Ren
  2022-06-23  7:56     ` Huacai Chen
  0 siblings, 1 reply; 16+ messages in thread
From: Guo Ren @ 2022-06-23  5:44 UTC (permalink / raw)
  To: Huacai Chen
  Cc: Arnd Bergmann, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar, Rui Wang

On Thu, Jun 23, 2022 at 12:46 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
>
> On NUMA system, the performance of qspinlock is better than generic
> spinlock. Below is the UnixBench test results on a 8 nodes (4 cores
> per node, 32 cores in total) machine.
>
> A. With generic spinlock:
>
> System Benchmarks Index Values               BASELINE       RESULT    INDEX
> Dhrystone 2 using register variables         116700.0  449574022.5  38523.9
> Double-Precision Whetstone                       55.0      85190.4  15489.2
> Execl Throughput                                 43.0      14696.2   3417.7
> File Copy 1024 bufsize 2000 maxblocks          3960.0     143157.8    361.5
> File Copy 256 bufsize 500 maxblocks            1655.0      37631.8    227.4
> File Copy 4096 bufsize 8000 maxblocks          5800.0     444814.2    766.9
> Pipe Throughput                               12440.0    5047490.7   4057.5
> Pipe-based Context Switching                   4000.0    2021545.7   5053.9
> Process Creation                                126.0      23829.8   1891.3
> Shell Scripts (1 concurrent)                     42.4      33756.7   7961.5
> Shell Scripts (8 concurrent)                      6.0       4062.9   6771.5
> System Call Overhead                          15000.0    2479748.6   1653.2
>                                                                    ========
> System Benchmarks Index Score                                        2955.6
>
> B. With qspinlock:
>
> System Benchmarks Index Values               BASELINE       RESULT    INDEX
> Dhrystone 2 using register variables         116700.0  449467876.9  38514.8
> Double-Precision Whetstone                       55.0      85174.6  15486.3
> Execl Throughput                                 43.0      14769.1   3434.7
> File Copy 1024 bufsize 2000 maxblocks          3960.0     146150.5    369.1
> File Copy 256 bufsize 500 maxblocks            1655.0      37496.8    226.6
> File Copy 4096 bufsize 8000 maxblocks          5800.0     447527.0    771.6
> Pipe Throughput                               12440.0    5175989.2   4160.8
> Pipe-based Context Switching                   4000.0    2207747.8   5519.4
> Process Creation                                126.0      25125.5   1994.1
> Shell Scripts (1 concurrent)                     42.4      33461.2   7891.8
> Shell Scripts (8 concurrent)                      6.0       4024.7   6707.8
> System Call Overhead                          15000.0    2917278.6   1944.9
>                                                                    ========
> System Benchmarks Index Score                                        3040.1
>
> Signed-off-by: Rui Wang <wangrui@loongson.cn>
> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
> ---
>  arch/loongarch/Kconfig                      |  1 +
>  arch/loongarch/include/asm/Kbuild           |  5 ++---
>  arch/loongarch/include/asm/spinlock.h       | 12 ++++++++++++
>  arch/loongarch/include/asm/spinlock_types.h | 11 +++++++++++
>  4 files changed, 26 insertions(+), 3 deletions(-)
>  create mode 100644 arch/loongarch/include/asm/spinlock.h
>  create mode 100644 arch/loongarch/include/asm/spinlock_types.h
>
> diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
> index 1920d52653b4..1ec220df751d 100644
> --- a/arch/loongarch/Kconfig
> +++ b/arch/loongarch/Kconfig
> @@ -46,6 +46,7 @@ config LOONGARCH
>         select ARCH_USE_BUILTIN_BSWAP
>         select ARCH_USE_CMPXCHG_LOCKREF
>         select ARCH_USE_QUEUED_RWLOCKS
> +       select ARCH_USE_QUEUED_SPINLOCKS
>         select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
>         select ARCH_WANTS_NO_INSTR
>         select BUILDTIME_TABLE_SORT
> diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild
> index 83bc0681e72b..a0eed6076c79 100644
> --- a/arch/loongarch/include/asm/Kbuild
> +++ b/arch/loongarch/include/asm/Kbuild
> @@ -1,12 +1,11 @@
>  # SPDX-License-Identifier: GPL-2.0
>  generic-y += dma-contiguous.h
>  generic-y += export.h
> +generic-y += mcs_spinlock.h
>  generic-y += parport.h
>  generic-y += early_ioremap.h
>  generic-y += qrwlock.h
> -generic-y += qrwlock_types.h
> -generic-y += spinlock.h
> -generic-y += spinlock_types.h
Could you base the patch on [1]?

[1] https://lore.kernel.org/linux-riscv/20220621144920.2945595-2-guoren@kernel.org/raw

And keep the spinlock.h & spinlock_types.h in your Kconfig.

> +generic-y += qspinlock.h
>  generic-y += rwsem.h
>  generic-y += segment.h
>  generic-y += user.h


> diff --git a/arch/loongarch/include/asm/spinlock.h b/arch/loongarch/include/asm/spinlock.h
> new file mode 100644
> index 000000000000..7cb3476999be
> --- /dev/null
> +++ b/arch/loongarch/include/asm/spinlock.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
> + */
> +#ifndef _ASM_SPINLOCK_H
> +#define _ASM_SPINLOCK_H
> +
> +#include <asm/processor.h>
> +#include <asm/qspinlock.h>
> +#include <asm/qrwlock.h>
> +
> +#endif /* _ASM_SPINLOCK_H */
> diff --git a/arch/loongarch/include/asm/spinlock_types.h b/arch/loongarch/include/asm/spinlock_types.h
> new file mode 100644
> index 000000000000..7458d036c161
> --- /dev/null
> +++ b/arch/loongarch/include/asm/spinlock_types.h
> @@ -0,0 +1,11 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
> + */
> +#ifndef _ASM_SPINLOCK_TYPES_H
> +#define _ASM_SPINLOCK_TYPES_H
> +
> +#include <asm-generic/qspinlock_types.h>
> +#include <asm-generic/qrwlock_types.h>
> +
> +#endif
> --
> 2.27.0
>


-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/


* Re: [PATCH V2 1/2] LoongArch: Add subword xchg/cmpxchg emulation
  2022-06-23  4:47 [PATCH V2 1/2] LoongArch: Add subword xchg/cmpxchg emulation Huacai Chen
  2022-06-23  4:47 ` [PATCH V2 2/2] LoongArch: Add qspinlock support Huacai Chen
@ 2022-06-23  6:49 ` Guo Ren
  2022-06-23  8:04   ` Huacai Chen
  1 sibling, 1 reply; 16+ messages in thread
From: Guo Ren @ 2022-06-23  6:49 UTC (permalink / raw)
  To: Huacai Chen
  Cc: Arnd Bergmann, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar, Rui Wang

On Thu, Jun 23, 2022 at 12:46 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
>
> LoongArch only support 32-bit/64-bit xchg/cmpxchg in native. But percpu
> operation and qspinlock need 8-bit/16-bit xchg/cmpxchg. On NUMA system,
> the performance of subword xchg/cmpxchg emulation is better than the
> generic implementation (especially for qspinlock, data will be shown in
> the next patch). This patch (of 2) adds subword xchg/cmpxchg emulation
> with ll/sc and enable its usage for percpu operations.
The xchg/cmpxchg operations are designed for multi-processor data sharing.
Percpu data is not shared with other harts, so disabling irq/preemption is
enough. Do you have the same issue as f97fc810798c
("arm64: percpu: Implement this_cpu operations")?


>
> Signed-off-by: Rui Wang <wangrui@loongson.cn>
> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
> ---
>  arch/loongarch/include/asm/cmpxchg.h | 98 +++++++++++++++++++++++++++-
>  arch/loongarch/include/asm/percpu.h  |  8 +++
>  2 files changed, 105 insertions(+), 1 deletion(-)
>
> diff --git a/arch/loongarch/include/asm/cmpxchg.h b/arch/loongarch/include/asm/cmpxchg.h
> index 75b3a4478652..967e64bf2c37 100644
> --- a/arch/loongarch/include/asm/cmpxchg.h
> +++ b/arch/loongarch/include/asm/cmpxchg.h
> @@ -5,8 +5,9 @@
>  #ifndef __ASM_CMPXCHG_H
>  #define __ASM_CMPXCHG_H
>
> -#include <asm/barrier.h>
> +#include <linux/bits.h>
>  #include <linux/build_bug.h>
> +#include <asm/barrier.h>
>
>  #define __xchg_asm(amswap_db, m, val)          \
>  ({                                             \
> @@ -21,10 +22,53 @@
>                 __ret;                          \
>  })
>
> +static inline unsigned int __xchg_small(volatile void *ptr, unsigned int val,
> +                                       unsigned int size)
> +{
> +       unsigned int shift;
> +       u32 old32, mask, temp;
> +       volatile u32 *ptr32;
> +
> +       /* Mask value to the correct size. */
> +       mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
> +       val &= mask;
> +
> +       /*
> +        * Calculate a shift & mask that correspond to the value we wish to
> +        * exchange within the naturally aligned 4 byte integer that includes
> +        * it.
> +        */
> +       shift = (unsigned long)ptr & 0x3;
> +       shift *= BITS_PER_BYTE;
> +       mask <<= shift;
> +
> +       /*
> +        * Calculate a pointer to the naturally aligned 4 byte integer that
> +        * includes our byte of interest, and load its value.
> +        */
> +       ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
> +
> +       asm volatile (
> +       "1:     ll.w            %0, %3          \n"
> +       "       andn            %1, %0, %z4     \n"
> +       "       or              %1, %1, %z5     \n"
> +       "       sc.w            %1, %2          \n"
> +       "       beqz            %1, 1b          \n"
> +       : "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
> +       : GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (val << shift)
> +       : "memory");
> +
> +       return (old32 & mask) >> shift;
> +}
> +
>  static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
>                                    int size)
>  {
>         switch (size) {
> +       case 1:
> +       case 2:
> +               return __xchg_small(ptr, x, size);
> +
>         case 4:
>                 return __xchg_asm("amswap_db.w", (volatile u32 *)ptr, (u32)x);
>
> @@ -67,10 +111,62 @@ static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
>         __ret;                                                          \
>  })
>
> +static inline unsigned int __cmpxchg_small(volatile void *ptr, unsigned int old,
> +                                          unsigned int new, unsigned int size)
> +{
> +       unsigned int shift;
> +       u32 old32, mask, temp;
> +       volatile u32 *ptr32;
> +
> +       /* Mask inputs to the correct size. */
> +       mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
> +       old &= mask;
> +       new &= mask;
> +
> +       /*
> +        * Calculate a shift & mask that correspond to the value we wish to
> +        * compare & exchange within the naturally aligned 4 byte integer
> +        * that includes it.
> +        */
> +       shift = (unsigned long)ptr & 0x3;
> +       shift *= BITS_PER_BYTE;
> +       old <<= shift;
> +       new <<= shift;
> +       mask <<= shift;
> +
> +       /*
> +        * Calculate a pointer to the naturally aligned 4 byte integer that
> +        * includes our byte of interest, and load its value.
> +        */
> +       ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
> +
> +       asm volatile (
> +       "1:     ll.w            %0, %3          \n"
> +       "       and             %1, %0, %z4     \n"
> +       "       bne             %1, %z5, 2f     \n"
> +       "       andn            %1, %0, %z4     \n"
> +       "       or              %1, %1, %z6     \n"
> +       "       sc.w            %1, %2          \n"
> +       "       beqz            %1, 1b          \n"
> +       "       b               3f              \n"
> +       "2:                                     \n"
> +       __WEAK_LLSC_MB
> +       "3:                                     \n"
> +       : "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
> +       : GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (old), "Jr" (new)
> +       : "memory");
> +
> +       return (old32 & mask) >> shift;
> +}
> +
>  static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
>                                       unsigned long new, unsigned int size)
>  {
>         switch (size) {
> +       case 1:
> +       case 2:
> +               return __cmpxchg_small(ptr, old, new, size);
> +
>         case 4:
>                 return __cmpxchg_asm("ll.w", "sc.w", (volatile u32 *)ptr,
>                                      (u32)old, new);
> diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h
> index e6569f18c6dd..0bd6b0110198 100644
> --- a/arch/loongarch/include/asm/percpu.h
> +++ b/arch/loongarch/include/asm/percpu.h
> @@ -123,6 +123,10 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
>                                                 int size)
>  {
>         switch (size) {
> +       case 1:
> +       case 2:
> +               return __xchg_small((volatile void *)ptr, val, size);
> +
>         case 4:
>                 return __xchg_asm("amswap.w", (volatile u32 *)ptr, (u32)val);
The percpu operations are local and we shouldn't combine them with the
normal xchg/cmpxchg (they have different semantics: one is local, one is
shared). Please implement your own percpu ops here to fix the irq
disable/enable performance issue.

>
> @@ -204,9 +208,13 @@ do {                                                                       \
>  #define this_cpu_write_4(pcp, val) _percpu_write(pcp, val)
>  #define this_cpu_write_8(pcp, val) _percpu_write(pcp, val)
>
> +#define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val)
> +#define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val)
>  #define this_cpu_xchg_4(pcp, val) _percpu_xchg(pcp, val)
>  #define this_cpu_xchg_8(pcp, val) _percpu_xchg(pcp, val)
>
> +#define this_cpu_cmpxchg_1(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
> +#define this_cpu_cmpxchg_2(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
>  #define this_cpu_cmpxchg_4(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
>  #define this_cpu_cmpxchg_8(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
>
> --
> 2.27.0
>


-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/


* Re: [PATCH V2 2/2] LoongArch: Add qspinlock support
  2022-06-23  5:44   ` Guo Ren
@ 2022-06-23  7:56     ` Huacai Chen
  2022-06-23  8:26       ` Arnd Bergmann
  0 siblings, 1 reply; 16+ messages in thread
From: Huacai Chen @ 2022-06-23  7:56 UTC (permalink / raw)
  To: Guo Ren
  Cc: Huacai Chen, Arnd Bergmann, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar, Rui Wang

Hi, Ren,

On Thu, Jun 23, 2022 at 1:45 PM Guo Ren <guoren@kernel.org> wrote:
>
> On Thu, Jun 23, 2022 at 12:46 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> >
> > On NUMA system, the performance of qspinlock is better than generic
> > spinlock. Below is the UnixBench test results on a 8 nodes (4 cores
> > per node, 32 cores in total) machine.
> >
> > A. With generic spinlock:
> >
> > System Benchmarks Index Values               BASELINE       RESULT    INDEX
> > Dhrystone 2 using register variables         116700.0  449574022.5  38523.9
> > Double-Precision Whetstone                       55.0      85190.4  15489.2
> > Execl Throughput                                 43.0      14696.2   3417.7
> > File Copy 1024 bufsize 2000 maxblocks          3960.0     143157.8    361.5
> > File Copy 256 bufsize 500 maxblocks            1655.0      37631.8    227.4
> > File Copy 4096 bufsize 8000 maxblocks          5800.0     444814.2    766.9
> > Pipe Throughput                               12440.0    5047490.7   4057.5
> > Pipe-based Context Switching                   4000.0    2021545.7   5053.9
> > Process Creation                                126.0      23829.8   1891.3
> > Shell Scripts (1 concurrent)                     42.4      33756.7   7961.5
> > Shell Scripts (8 concurrent)                      6.0       4062.9   6771.5
> > System Call Overhead                          15000.0    2479748.6   1653.2
> >                                                                    ========
> > System Benchmarks Index Score                                        2955.6
> >
> > B. With qspinlock:
> >
> > System Benchmarks Index Values               BASELINE       RESULT    INDEX
> > Dhrystone 2 using register variables         116700.0  449467876.9  38514.8
> > Double-Precision Whetstone                       55.0      85174.6  15486.3
> > Execl Throughput                                 43.0      14769.1   3434.7
> > File Copy 1024 bufsize 2000 maxblocks          3960.0     146150.5    369.1
> > File Copy 256 bufsize 500 maxblocks            1655.0      37496.8    226.6
> > File Copy 4096 bufsize 8000 maxblocks          5800.0     447527.0    771.6
> > Pipe Throughput                               12440.0    5175989.2   4160.8
> > Pipe-based Context Switching                   4000.0    2207747.8   5519.4
> > Process Creation                                126.0      25125.5   1994.1
> > Shell Scripts (1 concurrent)                     42.4      33461.2   7891.8
> > Shell Scripts (8 concurrent)                      6.0       4024.7   6707.8
> > System Call Overhead                          15000.0    2917278.6   1944.9
> >                                                                    ========
> > System Benchmarks Index Score                                        3040.1
> >
> > Signed-off-by: Rui Wang <wangrui@loongson.cn>
> > Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
> > ---
> >  arch/loongarch/Kconfig                      |  1 +
> >  arch/loongarch/include/asm/Kbuild           |  5 ++---
> >  arch/loongarch/include/asm/spinlock.h       | 12 ++++++++++++
> >  arch/loongarch/include/asm/spinlock_types.h | 11 +++++++++++
> >  4 files changed, 26 insertions(+), 3 deletions(-)
> >  create mode 100644 arch/loongarch/include/asm/spinlock.h
> >  create mode 100644 arch/loongarch/include/asm/spinlock_types.h
> >
> > diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
> > index 1920d52653b4..1ec220df751d 100644
> > --- a/arch/loongarch/Kconfig
> > +++ b/arch/loongarch/Kconfig
> > @@ -46,6 +46,7 @@ config LOONGARCH
> >         select ARCH_USE_BUILTIN_BSWAP
> >         select ARCH_USE_CMPXCHG_LOCKREF
> >         select ARCH_USE_QUEUED_RWLOCKS
> > +       select ARCH_USE_QUEUED_SPINLOCKS
> >         select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
> >         select ARCH_WANTS_NO_INSTR
> >         select BUILDTIME_TABLE_SORT
> > diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild
> > index 83bc0681e72b..a0eed6076c79 100644
> > --- a/arch/loongarch/include/asm/Kbuild
> > +++ b/arch/loongarch/include/asm/Kbuild
> > @@ -1,12 +1,11 @@
> >  # SPDX-License-Identifier: GPL-2.0
> >  generic-y += dma-contiguous.h
> >  generic-y += export.h
> > +generic-y += mcs_spinlock.h
> >  generic-y += parport.h
> >  generic-y += early_ioremap.h
> >  generic-y += qrwlock.h
> > -generic-y += qrwlock_types.h
> > -generic-y += spinlock.h
> > -generic-y += spinlock_types.h
> Could you base the patch on [1]?
>
> [1] https://lore.kernel.org/linux-riscv/20220621144920.2945595-2-guoren@kernel.org/raw
I found that whether we use qspinlock or tspinlock, we always use
qrwlock, so maybe it is better like this?

#ifdef CONFIG_ARCH_USE_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
#else
#include <asm-generic/tspinlock.h>
#endif

#include <asm/qrwlock.h>

Huacai

>
> And keep the spinlock.h & spinlock_types.h in your Kconfig.
>
> > +generic-y += qspinlock.h
> >  generic-y += rwsem.h
> >  generic-y += segment.h
> >  generic-y += user.h
>
>
> > diff --git a/arch/loongarch/include/asm/spinlock.h b/arch/loongarch/include/asm/spinlock.h
> > new file mode 100644
> > index 000000000000..7cb3476999be
> > --- /dev/null
> > +++ b/arch/loongarch/include/asm/spinlock.h
> > @@ -0,0 +1,12 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
> > + */
> > +#ifndef _ASM_SPINLOCK_H
> > +#define _ASM_SPINLOCK_H
> > +
> > +#include <asm/processor.h>
> > +#include <asm/qspinlock.h>
> > +#include <asm/qrwlock.h>
> > +
> > +#endif /* _ASM_SPINLOCK_H */
> > diff --git a/arch/loongarch/include/asm/spinlock_types.h b/arch/loongarch/include/asm/spinlock_types.h
> > new file mode 100644
> > index 000000000000..7458d036c161
> > --- /dev/null
> > +++ b/arch/loongarch/include/asm/spinlock_types.h
> > @@ -0,0 +1,11 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
> > + */
> > +#ifndef _ASM_SPINLOCK_TYPES_H
> > +#define _ASM_SPINLOCK_TYPES_H
> > +
> > +#include <asm-generic/qspinlock_types.h>
> > +#include <asm-generic/qrwlock_types.h>
> > +
> > +#endif
> > --
> > 2.27.0
> >
>
>
> --
> Best Regards
>  Guo Ren
>
> ML: https://lore.kernel.org/linux-csky/


* Re: [PATCH V2 1/2] LoongArch: Add subword xchg/cmpxchg emulation
  2022-06-23  6:49 ` [PATCH V2 1/2] LoongArch: Add subword xchg/cmpxchg emulation Guo Ren
@ 2022-06-23  8:04   ` Huacai Chen
  2022-06-23  8:43     ` Guo Ren
  0 siblings, 1 reply; 16+ messages in thread
From: Huacai Chen @ 2022-06-23  8:04 UTC (permalink / raw)
  To: Guo Ren
  Cc: Huacai Chen, Arnd Bergmann, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar, Rui Wang

Hi, Ren,

On Thu, Jun 23, 2022 at 2:49 PM Guo Ren <guoren@kernel.org> wrote:
>
> On Thu, Jun 23, 2022 at 12:46 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> >
> > LoongArch only support 32-bit/64-bit xchg/cmpxchg in native. But percpu
> > operation and qspinlock need 8-bit/16-bit xchg/cmpxchg. On NUMA system,
> > the performance of subword xchg/cmpxchg emulation is better than the
> > generic implementation (especially for qspinlock, data will be shown in
> > the next patch). This patch (of 2) adds subword xchg/cmpxchg emulation
> > with ll/sc and enable its usage for percpu operations.
> The xchg/cmpxchg are designed for multi-processor data-sharing issues.
> The percpu data won't share with other harts and disabling
> irq/preemption is enough. Do you have the same issue with f97fc810798c
> ("arm64: percpu: Implement this_cpu operations")?
Yes, very similar, our csr operations are even slower than atomic operations. :(

>
>
> >
> > Signed-off-by: Rui Wang <wangrui@loongson.cn>
> > Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
> > ---
> >  arch/loongarch/include/asm/cmpxchg.h | 98 +++++++++++++++++++++++++++-
> >  arch/loongarch/include/asm/percpu.h  |  8 +++
> >  2 files changed, 105 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/loongarch/include/asm/cmpxchg.h b/arch/loongarch/include/asm/cmpxchg.h
> > index 75b3a4478652..967e64bf2c37 100644
> > --- a/arch/loongarch/include/asm/cmpxchg.h
> > +++ b/arch/loongarch/include/asm/cmpxchg.h
> > @@ -5,8 +5,9 @@
> >  #ifndef __ASM_CMPXCHG_H
> >  #define __ASM_CMPXCHG_H
> >
> > -#include <asm/barrier.h>
> > +#include <linux/bits.h>
> >  #include <linux/build_bug.h>
> > +#include <asm/barrier.h>
> >
> >  #define __xchg_asm(amswap_db, m, val)          \
> >  ({                                             \
> > @@ -21,10 +22,53 @@
> >                 __ret;                          \
> >  })
> >
> > +static inline unsigned int __xchg_small(volatile void *ptr, unsigned int val,
> > +                                       unsigned int size)
> > +{
> > +       unsigned int shift;
> > +       u32 old32, mask, temp;
> > +       volatile u32 *ptr32;
> > +
> > +       /* Mask value to the correct size. */
> > +       mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
> > +       val &= mask;
> > +
> > +       /*
> > +        * Calculate a shift & mask that correspond to the value we wish to
> > +        * exchange within the naturally aligned 4 byte integer that includes
> > +        * it.
> > +        */
> > +       shift = (unsigned long)ptr & 0x3;
> > +       shift *= BITS_PER_BYTE;
> > +       mask <<= shift;
> > +
> > +       /*
> > +        * Calculate a pointer to the naturally aligned 4 byte integer that
> > +        * includes our byte of interest, and load its value.
> > +        */
> > +       ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
> > +
> > +       asm volatile (
> > +       "1:     ll.w            %0, %3          \n"
> > +       "       andn            %1, %0, %z4     \n"
> > +       "       or              %1, %1, %z5     \n"
> > +       "       sc.w            %1, %2          \n"
> > +       "       beqz            %1, 1b          \n"
> > +       : "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
> > +       : GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (val << shift)
> > +       : "memory");
> > +
> > +       return (old32 & mask) >> shift;
> > +}
> > +
> >  static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
> >                                    int size)
> >  {
> >         switch (size) {
> > +       case 1:
> > +       case 2:
> > +               return __xchg_small(ptr, x, size);
> > +
> >         case 4:
> >                 return __xchg_asm("amswap_db.w", (volatile u32 *)ptr, (u32)x);
> >
> > @@ -67,10 +111,62 @@ static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
> >         __ret;                                                          \
> >  })
> >
> > +static inline unsigned int __cmpxchg_small(volatile void *ptr, unsigned int old,
> > +                                          unsigned int new, unsigned int size)
> > +{
> > +       unsigned int shift;
> > +       u32 old32, mask, temp;
> > +       volatile u32 *ptr32;
> > +
> > +       /* Mask inputs to the correct size. */
> > +       mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
> > +       old &= mask;
> > +       new &= mask;
> > +
> > +       /*
> > +        * Calculate a shift & mask that correspond to the value we wish to
> > +        * compare & exchange within the naturally aligned 4 byte integer
> > +        * that includes it.
> > +        */
> > +       shift = (unsigned long)ptr & 0x3;
> > +       shift *= BITS_PER_BYTE;
> > +       old <<= shift;
> > +       new <<= shift;
> > +       mask <<= shift;
> > +
> > +       /*
> > +        * Calculate a pointer to the naturally aligned 4 byte integer that
> > +        * includes our byte of interest, and load its value.
> > +        */
> > +       ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
> > +
> > +       asm volatile (
> > +       "1:     ll.w            %0, %3          \n"
> > +       "       and             %1, %0, %z4     \n"
> > +       "       bne             %1, %z5, 2f     \n"
> > +       "       andn            %1, %0, %z4     \n"
> > +       "       or              %1, %1, %z6     \n"
> > +       "       sc.w            %1, %2          \n"
> > +       "       beqz            %1, 1b          \n"
> > +       "       b               3f              \n"
> > +       "2:                                     \n"
> > +       __WEAK_LLSC_MB
> > +       "3:                                     \n"
> > +       : "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
> > +       : GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (old), "Jr" (new)
> > +       : "memory");
> > +
> > +       return (old32 & mask) >> shift;
> > +}
> > +
> >  static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
> >                                       unsigned long new, unsigned int size)
> >  {
> >         switch (size) {
> > +       case 1:
> > +       case 2:
> > +               return __cmpxchg_small(ptr, old, new, size);
> > +
> >         case 4:
> >                 return __cmpxchg_asm("ll.w", "sc.w", (volatile u32 *)ptr,
> >                                      (u32)old, new);
> > diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h
> > index e6569f18c6dd..0bd6b0110198 100644
> > --- a/arch/loongarch/include/asm/percpu.h
> > +++ b/arch/loongarch/include/asm/percpu.h
> > @@ -123,6 +123,10 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
> >                                                 int size)
> >  {
> >         switch (size) {
> > +       case 1:
> > +       case 2:
> > +               return __xchg_small((volatile void *)ptr, val, size);
> > +
> >         case 4:
> >                 return __xchg_asm("amswap.w", (volatile u32 *)ptr, (u32)val);
> The percpu operations are local and we shouldn't combine them with the
> normal xchg/cmpxchg (They have different semantics one for local, one
> for share.), please implement your own percpu ops here to fix the irq
> disable/enable performance issue.
Yes, percpu operations are local and atomic operations are for sharing.
But we are using atomic ops to implement percpu ops (for performance), so
is it OK to just implement common emulated operations and use them for
both percpu ops and qspinlock?

Huacai
>
> >
> > @@ -204,9 +208,13 @@ do {                                                                       \
> >  #define this_cpu_write_4(pcp, val) _percpu_write(pcp, val)
> >  #define this_cpu_write_8(pcp, val) _percpu_write(pcp, val)
> >
> > +#define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val)
> > +#define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val)
> >  #define this_cpu_xchg_4(pcp, val) _percpu_xchg(pcp, val)
> >  #define this_cpu_xchg_8(pcp, val) _percpu_xchg(pcp, val)
> >
> > +#define this_cpu_cmpxchg_1(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
> > +#define this_cpu_cmpxchg_2(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
> >  #define this_cpu_cmpxchg_4(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
> >  #define this_cpu_cmpxchg_8(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
> >
> > --
> > 2.27.0
> >
>
>
> --
> Best Regards
>  Guo Ren
>
> ML: https://lore.kernel.org/linux-csky/


* Re: [PATCH V2 2/2] LoongArch: Add qspinlock support
  2022-06-23  7:56     ` Huacai Chen
@ 2022-06-23  8:26       ` Arnd Bergmann
  2022-06-23  8:32         ` Guo Ren
  2022-06-23 13:05         ` Huacai Chen
  0 siblings, 2 replies; 16+ messages in thread
From: Arnd Bergmann @ 2022-06-23  8:26 UTC (permalink / raw)
  To: Huacai Chen
  Cc: Guo Ren, Huacai Chen, Arnd Bergmann, loongarch, linux-arch,
	Xuefeng Li, Xuerui Wang, Jiaxun Yang, Peter Zijlstra,
	Will Deacon, Ingo Molnar, Rui Wang

On Thu, Jun 23, 2022 at 9:56 AM Huacai Chen <chenhuacai@kernel.org> wrote:
> On Thu, Jun 23, 2022 at 1:45 PM Guo Ren <guoren@kernel.org> wrote:
> >
> > On Thu, Jun 23, 2022 at 12:46 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> > >
> > > On NUMA system, the performance of qspinlock is better than generic
> > > spinlock. Below is the UnixBench test results on a 8 nodes (4 cores
> > > per node, 32 cores in total) machine.

You are still missing an explanation here about why this is safe to do.
Is there an architectural guarantee for forward progress, or do you rely
on specific microarchitectural behavior?

> > Could you base the patch on [1]?
> >
> > [1] https://lore.kernel.org/linux-riscv/20220621144920.2945595-2-guoren@kernel.org/raw
> I found that whether we use qspinlock or tspinlock, we always use
> qrwlock, so maybe it is better like this?
>
> #ifdef CONFIG_ARCH_USE_QUEUED_SPINLOCKS
> #include <asm/qspinlock.h>
> #else
> #include <asm-generic/tspinlock.h>
> #endif
>
> #include <asm/qrwlock.h>

Yes, that seems better, but I would go one step further and include
asm-generic/qspinlock.h in place of asm/qspinlock.h here: the two
architectures that have a custom asm/qspinlock.h also have a custom
asm/spinlock.h, so they have no need to include asm-generic/spinlock.h
either.
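
Put together, one possible reading of this suggestion for
arch/loongarch/include/asm/spinlock.h is the sketch below (the generic
ticket-lock header name follows Guo Ren's series and is an assumption):

/* Sketch only */
#ifndef _ASM_SPINLOCK_H
#define _ASM_SPINLOCK_H

#ifdef CONFIG_ARCH_USE_QUEUED_SPINLOCKS
#include <asm-generic/qspinlock.h>
#else
#include <asm-generic/spinlock.h>	/* generic ticket lock */
#endif

#include <asm/qrwlock.h>

#endif /* _ASM_SPINLOCK_H */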

        Arnd


* Re: [PATCH V2 2/2] LoongArch: Add qspinlock support
  2022-06-23  8:26       ` Arnd Bergmann
@ 2022-06-23  8:32         ` Guo Ren
  2022-06-23 13:05         ` Huacai Chen
  1 sibling, 0 replies; 16+ messages in thread
From: Guo Ren @ 2022-06-23  8:32 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Huacai Chen, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar, Rui Wang

On Thu, Jun 23, 2022 at 4:26 PM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Thu, Jun 23, 2022 at 9:56 AM Huacai Chen <chenhuacai@kernel.org> wrote:
> > On Thu, Jun 23, 2022 at 1:45 PM Guo Ren <guoren@kernel.org> wrote:
> > >
> > > On Thu, Jun 23, 2022 at 12:46 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> > > >
> > > > On NUMA system, the performance of qspinlock is better than generic
> > > > spinlock. Below is the UnixBench test results on a 8 nodes (4 cores
> > > > per node, 32 cores in total) machine.
>
> You are still missing an explanation here about why this is safe to
> do. Is there an
> architectural guarantee for forward progress, or do you rely on
> specific microarchitectural
> behavior?
>
> > > Could you base the patch on [1]?
> > >
> > > [1] https://lore.kernel.org/linux-riscv/20220621144920.2945595-2-guoren@kernel.org/raw
> > I found that whether we use qspinlock or tspinlock, we always use
> > qrwlock, so maybe it is better like this?
> >
> > #ifdef CONFIG_ARCH_USE_QUEUED_SPINLOCKS
> > #include <asm/qspinlock.h>
> > #else
> > #include <asm-generic/tspinlock.h>
> > #endif
> >
> > #include <asm/qrwlock.h>
>
> Yes, that seems better, but I would go one step further and include
> asm-generic/qspinlock.h
> in place of asm/qspinlock.h here: The two architectures that have a
> custom asm/qspinlock.h
> also have a custom asm/spinlock.h, so they have no need to include
> asm-generic/spinlock.h
> either.
Okay, thx Huacai & Arnd

>
>         Arnd



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/


* Re: [PATCH V2 1/2] LoongArch: Add subword xchg/cmpxchg emulation
  2022-06-23  8:04   ` Huacai Chen
@ 2022-06-23  8:43     ` Guo Ren
  0 siblings, 0 replies; 16+ messages in thread
From: Guo Ren @ 2022-06-23  8:43 UTC (permalink / raw)
  To: Huacai Chen
  Cc: Huacai Chen, Arnd Bergmann, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar, Rui Wang

On Thu, Jun 23, 2022 at 4:04 PM Huacai Chen <chenhuacai@kernel.org> wrote:
>
> Hi, Ren,
>
> On Thu, Jun 23, 2022 at 2:49 PM Guo Ren <guoren@kernel.org> wrote:
> >
> > On Thu, Jun 23, 2022 at 12:46 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> > >
> > > LoongArch only support 32-bit/64-bit xchg/cmpxchg in native. But percpu
> > > operation and qspinlock need 8-bit/16-bit xchg/cmpxchg. On NUMA system,
> > > the performance of subword xchg/cmpxchg emulation is better than the
> > > generic implementation (especially for qspinlock, data will be shown in
> > > the next patch). This patch (of 2) adds subword xchg/cmpxchg emulation
> > > with ll/sc and enable its usage for percpu operations.
> > The xchg/cmpxchg are designed for multi-processor data-sharing issues.
> > The percpu data won't share with other harts and disabling
> > irq/preemption is enough. Do you have the same issue with f97fc810798c
> > ("arm64: percpu: Implement this_cpu operations")?
> Yes, very similar, our csr operations are even slower than atomic operations. :(
>
> >
> >
> > >
> > > Signed-off-by: Rui Wang <wangrui@loongson.cn>
> > > Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
> > > ---
> > >  arch/loongarch/include/asm/cmpxchg.h | 98 +++++++++++++++++++++++++++-
> > >  arch/loongarch/include/asm/percpu.h  |  8 +++
> > >  2 files changed, 105 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/arch/loongarch/include/asm/cmpxchg.h b/arch/loongarch/include/asm/cmpxchg.h
> > > index 75b3a4478652..967e64bf2c37 100644
> > > --- a/arch/loongarch/include/asm/cmpxchg.h
> > > +++ b/arch/loongarch/include/asm/cmpxchg.h
> > > @@ -5,8 +5,9 @@
> > >  #ifndef __ASM_CMPXCHG_H
> > >  #define __ASM_CMPXCHG_H
> > >
> > > -#include <asm/barrier.h>
> > > +#include <linux/bits.h>
> > >  #include <linux/build_bug.h>
> > > +#include <asm/barrier.h>
> > >
> > >  #define __xchg_asm(amswap_db, m, val)          \
> > >  ({                                             \
> > > @@ -21,10 +22,53 @@
> > >                 __ret;                          \
> > >  })
> > >
> > > +static inline unsigned int __xchg_small(volatile void *ptr, unsigned int val,
> > > +                                       unsigned int size)
> > > +{
> > > +       unsigned int shift;
> > > +       u32 old32, mask, temp;
> > > +       volatile u32 *ptr32;
> > > +
> > > +       /* Mask value to the correct size. */
> > > +       mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
> > > +       val &= mask;
> > > +
> > > +       /*
> > > +        * Calculate a shift & mask that correspond to the value we wish to
> > > +        * exchange within the naturally aligned 4 byte integer that includes
> > > +        * it.
> > > +        */
> > > +       shift = (unsigned long)ptr & 0x3;
> > > +       shift *= BITS_PER_BYTE;
> > > +       mask <<= shift;
> > > +
> > > +       /*
> > > +        * Calculate a pointer to the naturally aligned 4 byte integer that
> > > +        * includes our byte of interest, and load its value.
> > > +        */
> > > +       ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
> > > +
> > > +       asm volatile (
> > > +       "1:     ll.w            %0, %3          \n"
> > > +       "       andn            %1, %0, %z4     \n"
> > > +       "       or              %1, %1, %z5     \n"
> > > +       "       sc.w            %1, %2          \n"
> > > +       "       beqz            %1, 1b          \n"
> > > +       : "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
> > > +       : GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (val << shift)
> > > +       : "memory");
> > > +
> > > +       return (old32 & mask) >> shift;
> > > +}
> > > +
> > >  static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
> > >                                    int size)
> > >  {
> > >         switch (size) {
> > > +       case 1:
> > > +       case 2:
> > > +               return __xchg_small(ptr, x, size);
> > > +
> > >         case 4:
> > >                 return __xchg_asm("amswap_db.w", (volatile u32 *)ptr, (u32)x);
> > >
> > > @@ -67,10 +111,62 @@ static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
> > >         __ret;                                                          \
> > >  })
> > >
> > > +static inline unsigned int __cmpxchg_small(volatile void *ptr, unsigned int old,
> > > +                                          unsigned int new, unsigned int size)
> > > +{
> > > +       unsigned int shift;
> > > +       u32 old32, mask, temp;
> > > +       volatile u32 *ptr32;
> > > +
> > > +       /* Mask inputs to the correct size. */
> > > +       mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
> > > +       old &= mask;
> > > +       new &= mask;
> > > +
> > > +       /*
> > > +        * Calculate a shift & mask that correspond to the value we wish to
> > > +        * compare & exchange within the naturally aligned 4 byte integer
> > > +        * that includes it.
> > > +        */
> > > +       shift = (unsigned long)ptr & 0x3;
> > > +       shift *= BITS_PER_BYTE;
> > > +       old <<= shift;
> > > +       new <<= shift;
> > > +       mask <<= shift;
> > > +
> > > +       /*
> > > +        * Calculate a pointer to the naturally aligned 4 byte integer that
> > > +        * includes our byte of interest, and load its value.
> > > +        */
> > > +       ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
> > > +
> > > +       asm volatile (
> > > +       "1:     ll.w            %0, %3          \n"
> > > +       "       and             %1, %0, %z4     \n"
> > > +       "       bne             %1, %z5, 2f     \n"
> > > +       "       andn            %1, %0, %z4     \n"
> > > +       "       or              %1, %1, %z6     \n"
> > > +       "       sc.w            %1, %2          \n"
> > > +       "       beqz            %1, 1b          \n"
> > > +       "       b               3f              \n"
> > > +       "2:                                     \n"
> > > +       __WEAK_LLSC_MB
> > > +       "3:                                     \n"
> > > +       : "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
> > > +       : GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (old), "Jr" (new)
> > > +       : "memory");
> > > +
> > > +       return (old32 & mask) >> shift;
> > > +}
> > > +
> > >  static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
> > >                                       unsigned long new, unsigned int size)
> > >  {
> > >         switch (size) {
> > > +       case 1:
> > > +       case 2:
> > > +               return __cmpxchg_small(ptr, old, new, size);
> > > +
> > >         case 4:
> > >                 return __cmpxchg_asm("ll.w", "sc.w", (volatile u32 *)ptr,
> > >                                      (u32)old, new);
> > > diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h
> > > index e6569f18c6dd..0bd6b0110198 100644
> > > --- a/arch/loongarch/include/asm/percpu.h
> > > +++ b/arch/loongarch/include/asm/percpu.h
> > > @@ -123,6 +123,10 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
> > >                                                 int size)
> > >  {
> > >         switch (size) {
> > > +       case 1:
> > > +       case 2:
> > > +               return __xchg_small((volatile void *)ptr, val, size);
> > > +
> > >         case 4:
> > >                 return __xchg_asm("amswap.w", (volatile u32 *)ptr, (u32)val);
> > The percpu operations are local and we shouldn't combine them with the
> > normal xchg/cmpxchg (They have different semantics one for local, one
> > for share.), please implement your own percpu ops here to fix the irq
> > disable/enable performance issue.
> Yes, percpu operations are local and atomic operations are for
> sharing. But we are using atomic ops to implement percpu ops (for
> performance), so just implementing common emulated operations and
> using it for both percpu ops and qspinlock is just OK?
No, separating them would be fine. The qspinlock only needs xchg16_relaxed,
so just leave that in loongarch's cmpxchg.h. That would make it easier for
Arnd to clean up xchg/cmpxchg.
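
For reference, the 16-bit exchange is what the generic qspinlock's
xchg_tail() in kernel/locking/qspinlock.c relies on when
_Q_PENDING_BITS == 8; roughly:

static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
	/*
	 * Relaxed ordering is enough here: the caller guarantees the MCS
	 * node is initialized before the new tail is published.
	 */
	return (u32)xchg_relaxed(&lock->tail,
				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}

Since lock->tail is a u16, this is the only subword atomic the qspinlock
core itself needs.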

>
> Huacai
> >
> > >
> > > @@ -204,9 +208,13 @@ do {                                                                       \
> > >  #define this_cpu_write_4(pcp, val) _percpu_write(pcp, val)
> > >  #define this_cpu_write_8(pcp, val) _percpu_write(pcp, val)
> > >
> > > +#define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val)
> > > +#define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val)
> > >  #define this_cpu_xchg_4(pcp, val) _percpu_xchg(pcp, val)
> > >  #define this_cpu_xchg_8(pcp, val) _percpu_xchg(pcp, val)
> > >
> > > +#define this_cpu_cmpxchg_1(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
> > > +#define this_cpu_cmpxchg_2(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
> > >  #define this_cpu_cmpxchg_4(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
> > >  #define this_cpu_cmpxchg_8(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
> > >
> > > --
> > > 2.27.0
> > >
> >
> >
> > --
> > Best Regards
> >  Guo Ren
> >
> > ML: https://lore.kernel.org/linux-csky/



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/


* Re: [PATCH V2 2/2] LoongArch: Add qspinlock support
  2022-06-23  8:26       ` Arnd Bergmann
  2022-06-23  8:32         ` Guo Ren
@ 2022-06-23 13:05         ` Huacai Chen
  2022-06-23 14:04           ` Arnd Bergmann
  1 sibling, 1 reply; 16+ messages in thread
From: Huacai Chen @ 2022-06-23 13:05 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Guo Ren, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar, Rui Wang

Hi, Arnd,

On Thu, Jun 23, 2022 at 4:26 PM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Thu, Jun 23, 2022 at 9:56 AM Huacai Chen <chenhuacai@kernel.org> wrote:
> > On Thu, Jun 23, 2022 at 1:45 PM Guo Ren <guoren@kernel.org> wrote:
> > >
> > > On Thu, Jun 23, 2022 at 12:46 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> > > >
> > > > On NUMA system, the performance of qspinlock is better than generic
> > > > spinlock. Below is the UnixBench test results on a 8 nodes (4 cores
> > > > per node, 32 cores in total) machine.
>
> You are still missing an explanation here about why this is safe to
> > do. Is there an
> architectural guarantee for forward progress, or do you rely on
> specific microarchitectural
> behavior?
In my understanding, "guarantee of forward progress" means avoiding the
situation where many ll/sc sequences run at the same time and none of
them succeeds. LoongArch uses "exclusive access (with timeout) of ll" to
avoid simultaneous ll (it also blocks other memory loads/stores to the
same address), and uses "random delay of sc" to avoid simultaneous sc
(introduced in CPUCFG3, bit 3 and bit 4 [1]). This mechanism can
guarantee forward progress in practice.

[1] https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#_cpucfg
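
For reference, a detection helper for these two capability bits could
look like the sketch below. The bit positions follow the manual cited
above; the macro and function names here are placeholders rather than
the actual kernel API:

#define CPUCFG3			0x3
#define CPUCFG3_LL_EXCLUSIVE	BIT(3)	/* exclusive access (with timeout) of ll */
#define CPUCFG3_SC_RAND_DELAY	BIT(4)	/* random delay of sc */

static inline u32 my_read_cpucfg(u32 reg)
{
	u32 val;

	/* cpucfg rd, rj: read configuration word 'reg' into 'val' */
	asm volatile("cpucfg %0, %1" : "=r" (val) : "r" (reg));
	return val;
}

static inline bool cpu_has_llsc_backoff(void)
{
	u32 cfg3 = my_read_cpucfg(CPUCFG3);

	return (cfg3 & CPUCFG3_LL_EXCLUSIVE) && (cfg3 & CPUCFG3_SC_RAND_DELAY);
}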



Huacai

>
> > > Could you base the patch on [1]?
> > >
> > > [1] https://lore.kernel.org/linux-riscv/20220621144920.2945595-2-guoren@kernel.org/raw
> > I found that whether we use qspinlock or tspinlock, we always use
> > qrwlock, so maybe it is better like this?
> >
> > #ifdef CONFIG_ARCH_USE_QUEUED_SPINLOCKS
> > #include <asm/qspinlock.h>
> > #else
> > #include <asm-generic/tspinlock.h>
> > #endif
> >
> > #include <asm/qrwlock.h>
>
> Yes, that seems better, but I would go one step further and include
> asm-generic/qspinlock.h
> in place of asm/qspinlock.h here: The two architectures that have a
> custom asm/qspinlock.h
> also have a custom asm/spinlock.h, so they have no need to include
> asm-generic/spinlock.h
> either.
>
>         Arnd

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH V2 2/2] LoongArch: Add qspinlock support
  2022-06-23 13:05         ` Huacai Chen
@ 2022-06-23 14:04           ` Arnd Bergmann
  2022-06-25  2:42             ` Guo Ren
  2022-06-25  6:54             ` Huacai Chen
  0 siblings, 2 replies; 16+ messages in thread
From: Arnd Bergmann @ 2022-06-23 14:04 UTC (permalink / raw)
  To: Huacai Chen
  Cc: Arnd Bergmann, Guo Ren, Huacai Chen, loongarch, linux-arch,
	Xuefeng Li, Xuerui Wang, Jiaxun Yang, Peter Zijlstra,
	Will Deacon, Ingo Molnar, Rui Wang

On Thu, Jun 23, 2022 at 3:05 PM Huacai Chen <chenhuacai@kernel.org> wrote:
> On Thu, Jun 23, 2022 at 4:26 PM Arnd Bergmann <arnd@arndb.de> wrote:
> >
> > On Thu, Jun 23, 2022 at 9:56 AM Huacai Chen <chenhuacai@kernel.org> wrote:
> > > On Thu, Jun 23, 2022 at 1:45 PM Guo Ren <guoren@kernel.org> wrote:
> > > >
> > > > On Thu, Jun 23, 2022 at 12:46 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> > > > >
> > > > > On NUMA systems, the performance of qspinlock is better than the
> > > > > generic spinlock. Below are the UnixBench test results on an 8-node
> > > > > (4 cores per node, 32 cores in total) machine.
> >
> > You are still missing an explanation here about why this is safe to
> > do. Is there an
> > architectural guarantee for forward progress, or do you rely on
> > specific microarchitectural
> > behavior?
> In my understanding, "guarantee for forward progress" means avoiding the
> situation where many ll/sc sequences run at the same time and none of
> them succeeds.
> LoongArch uses "exclusive access (with timeout) of ll" to avoid
> simultaneous ll (it also blocks other memory load/store on the same
> address), and uses "random delay of sc" to avoid simultaneous sc
> (introduced in CPUCFG3, bit 3 and bit 4 [1]). This mechanism can
> guarantee forward progress in practice.
>
> [1] https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#_cpucfg

If there is an architected feature bit for the delay, does that mean that there
is a chance of CPUs getting released that set this to zero?

In that case, you probably need a boot-time check for this feature bit
to refuse booting a kernel with qspinlock enabled when it has more than
one active CPU but does not support the random backoff, and you need
to make the choice user-visible, so users are able to configure their
kernels using the ticket spinlock. The ticket lock may also be the best
choice for smaller configurations such as a single-socket 3A5000 with
four cores and no NUMA.
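
As a rough illustration of such a boot-time check (reusing the
hypothetical cpu_has_llsc_backoff() helper sketched earlier in the
thread; the function name and placement are assumptions, not the
actual patch):

static int __init check_qspinlock_requirements(void)
{
	if (!IS_ENABLED(CONFIG_QUEUED_SPINLOCKS))
		return 0;

	/* Refuse to run qspinlock on SMP hardware without sc backoff. */
	if (num_possible_cpus() > 1 && !cpu_has_llsc_backoff())
		panic("qspinlock needs ll/sc forward-progress support (CPUCFG3 bits 3-4)");

	return 0;
}
early_initcall(check_qspinlock_requirements);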

       Arnd

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH V2 2/2] LoongArch: Add qspinlock support
  2022-06-23 14:04           ` Arnd Bergmann
@ 2022-06-25  2:42             ` Guo Ren
  2022-06-25  6:22               ` Arnd Bergmann
  2022-06-25  6:54             ` Huacai Chen
  1 sibling, 1 reply; 16+ messages in thread
From: Guo Ren @ 2022-06-25  2:42 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Huacai Chen, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar, Rui Wang

On Thu, Jun 23, 2022 at 10:08 PM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Thu, Jun 23, 2022 at 3:05 PM Huacai Chen <chenhuacai@kernel.org> wrote:
> > On Thu, Jun 23, 2022 at 4:26 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > >
> > > On Thu, Jun 23, 2022 at 9:56 AM Huacai Chen <chenhuacai@kernel.org> wrote:
> > > > On Thu, Jun 23, 2022 at 1:45 PM Guo Ren <guoren@kernel.org> wrote:
> > > > >
> > > > > On Thu, Jun 23, 2022 at 12:46 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> > > > > >
> > > > > > On NUMA systems, the performance of qspinlock is better than the
> > > > > > generic spinlock. Below are the UnixBench test results on an 8-node
> > > > > > (4 cores per node, 32 cores in total) machine.
> > >
> > > You are still missing an explanation here about why this is safe to
> > > do. Is there an
> > > architectural guarantee for forward progress, or do you rely on
> > > specific microarchitectural
> > > behavior?
> > In my understanding, "guarantee for forward progress" means avoiding the
> > situation where many ll/sc sequences run at the same time and none of
> > them succeeds.
> > LoongArch uses "exclusive access (with timeout) of ll" to avoid
> > simultaneous ll (it also blocks other memory load/store on the same
> > address), and uses "random delay of sc" to avoid simultaneous sc
> > (introduced in CPUCFG3, bit 3 and bit 4 [1]). This mechanism can
> > guarantee forward progress in practice.
> >
> > [1] https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#_cpucfg
>
> If there is an architected feature bit for the delay, does that mean that there
> is a chance of CPUs getting released that set this to zero?
>
> In that case, you probably need a boot-time check for this feature bit
> to refuse booting a kernel with qspinlock enabled when it has more than
> one active CPU but does not support the random backoff,
Do you mean we should combine the ticket lock into qspinlock, and let
the machine choose at boot time?

From Peter's comment, it seems the arch is broken without a strong
forward-progress guarantee:
https://lore.kernel.org/linux-riscv/YGwKpmPkn5xIxIyx@hirez.programming.kicks-ass.net/

I'm not sure the qspinlock people would welcome the ticket lock getting
into the qspinlock data structure (although the ticket lock was also
written by Peter).

> and you need
> to make the choice user-visible, so users are able to configure their
> kernels using the ticket spinlock. The ticket lock may also be the best
> choice for smaller configurations such as a single-socket 3A5000 with
> four cores and no NUMA.
>
>        Arnd



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH V2 2/2] LoongArch: Add qspinlock support
  2022-06-25  2:42             ` Guo Ren
@ 2022-06-25  6:22               ` Arnd Bergmann
  0 siblings, 0 replies; 16+ messages in thread
From: Arnd Bergmann @ 2022-06-25  6:22 UTC (permalink / raw)
  To: Guo Ren
  Cc: Arnd Bergmann, Huacai Chen, Huacai Chen, loongarch, linux-arch,
	Xuefeng Li, Xuerui Wang, Jiaxun Yang, Peter Zijlstra,
	Will Deacon, Ingo Molnar, Rui Wang

On Sat, Jun 25, 2022 at 4:42 AM Guo Ren <guoren@kernel.org> wrote:
> > In that case, you probably need a boot-time check for this feature bit
> > to refuse booting a kernel with qspinlock enabled when it has more than
> > one active CPU but does not support the random backoff,
> Do you mean we should combine the ticket lock into qspinlock, and let
> the machine choose at boot time?

That is not what I meant here. It would be great if it could be done, but
I doubt this is possible without adding excessive complexity.

> From Peter's comment, it seems the arch is broken without a strong
> forward-progress guarantee:
> https://lore.kernel.org/linux-riscv/YGwKpmPkn5xIxIyx@hirez.programming.kicks-ass.net/
>
> I'm not sure the qspinlock people would welcome the ticket lock getting
> into the qspinlock data structure (although the ticket lock was also
> written by Peter).

What I meant is that a kernel that is not safe to run on hardware
without the necessary guarantees should just refuse to boot, the
same way that we tend to warn when the CPU instruction set
version is too old for what the kernel was built against.


         Arnd

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH V2 2/2] LoongArch: Add qspinlock support
  2022-06-23 14:04           ` Arnd Bergmann
  2022-06-25  2:42             ` Guo Ren
@ 2022-06-25  6:54             ` Huacai Chen
  2022-06-25 11:48               ` Arnd Bergmann
  1 sibling, 1 reply; 16+ messages in thread
From: Huacai Chen @ 2022-06-25  6:54 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Guo Ren, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar, Rui Wang

Hi, Arnd,

On Thu, Jun 23, 2022 at 10:04 PM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Thu, Jun 23, 2022 at 3:05 PM Huacai Chen <chenhuacai@kernel.org> wrote:
> > On Thu, Jun 23, 2022 at 4:26 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > >
> > > On Thu, Jun 23, 2022 at 9:56 AM Huacai Chen <chenhuacai@kernel.org> wrote:
> > > > On Thu, Jun 23, 2022 at 1:45 PM Guo Ren <guoren@kernel.org> wrote:
> > > > >
> > > > > On Thu, Jun 23, 2022 at 12:46 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> > > > > >
> > > > > > On NUMA systems, the performance of qspinlock is better than the
> > > > > > generic spinlock. Below are the UnixBench test results on an 8-node
> > > > > > (4 cores per node, 32 cores in total) machine.
> > >
> > > You are still missing an explanation here about why this is safe to
> > > do. Is there an
> > > architectural guarantee for forward progress, or do you rely on
> > > specific microarchitectural
> > > behavior?
> > In my understanding, "guarantee for forward progress" means avoiding the
> > situation where many ll/sc sequences run at the same time and none of
> > them succeeds.
> > LoongArch uses "exclusive access (with timeout) of ll" to avoid
> > simultaneous ll (it also blocks other memory load/store on the same
> > address), and uses "random delay of sc" to avoid simultaneous sc
> > (introduced in CPUCFG3, bit 3 and bit 4 [1]). This mechanism can
> > guarantee forward progress in practice.
> >
> > [1] https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#_cpucfg
>
> If there is an architected feature bit for the delay, does that mean that there
> is a chance of CPUs getting released that set this to zero?
I had an offline discussion with our hardware engineers. They told me
that implementing "exclusive access of ll" and "random delay of sc" is
a mandatory requirement for multi-core LoongArch chips. Only
single-core and dual-core processors (which do not support multi-chip
interconnection) are allowed to omit these features.

Huacai
>
> In that case, you probably need a boot-time check for this feature bit
> to refuse booting a kernel with qspinlock enabled when it has more than
> one active CPU but does not support the random backoff, and you need
> to make the choice user-visible, so users are able to configure their
> kernels using the ticket spinlock. The ticket lock may also be the best
> choice for smaller configurations such as a single-socket 3A5000 with
> four cores and no NUMA.
>
>        Arnd

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH V2 2/2] LoongArch: Add qspinlock support
  2022-06-25  6:54             ` Huacai Chen
@ 2022-06-25 11:48               ` Arnd Bergmann
  2022-06-25 14:25                 ` Huacai Chen
  0 siblings, 1 reply; 16+ messages in thread
From: Arnd Bergmann @ 2022-06-25 11:48 UTC (permalink / raw)
  To: Huacai Chen
  Cc: Arnd Bergmann, Guo Ren, Huacai Chen, loongarch, linux-arch,
	Xuefeng Li, Xuerui Wang, Jiaxun Yang, Peter Zijlstra,
	Will Deacon, Ingo Molnar, Rui Wang

On Sat, Jun 25, 2022 at 8:54 AM Huacai Chen <chenhuacai@kernel.org> wrote:
> On Thu, Jun 23, 2022 at 10:04 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > On Thu, Jun 23, 2022 at 3:05 PM Huacai Chen <chenhuacai@kernel.org> wrote:
> >
> > If there is an architected feature bit for the delay, does that mean that there
> > is a chance of CPUs getting released that set this to zero?
> I had an offline discussion with our hardware engineers. They told me
> that implementing "exclusive access of ll" and "random delay of sc" is
> a mandatory requirement for multi-core LoongArch chips. Only
> single-core and dual-core processors (which do not support multi-chip
> interconnection) are allowed to omit these features.

Ok, I see. I suppose the reason is that the dual-core version is safe
without the random backoff because all use cases for qspinlock then
involve at most one CPU waiting for a lock, right?

Please put the explanation into the changelog text for the next version. It
might be helpful to also document this in the source code itself, maybe
with a boot-time assertion that checks that this guarantee actually holds,
and an explanation that this is required for using qspinlock.

Regardless of this, I think it still makes sense to use the same compile-time
logic that Guo Ren suggested for the risc-v version, offering a choice between
ticket spinlock and qspinlock when both make sense, possibly depending
on CONFIG_NR_CPUS and CONFIG_NUMA.
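
Concretely, the header side of that choice could end up looking like
the sketch below, where CONFIG_LOONGARCH_QSPINLOCK stands in for
whatever user-visible Kconfig option the next version adds (the symbol
name is an assumption here; the asm-generic/tspinlock.h name follows
the snippet quoted earlier in the thread):

/* arch/loongarch/include/asm/spinlock.h, sketch only */
#ifndef _ASM_SPINLOCK_H
#define _ASM_SPINLOCK_H

#ifdef CONFIG_LOONGARCH_QSPINLOCK
#include <asm-generic/qspinlock.h>
#else
#include <asm-generic/tspinlock.h>
#endif

#include <asm/qrwlock.h>

#endif /* _ASM_SPINLOCK_H */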

           Arnd

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH V2 2/2] LoongArch: Add qspinlock support
  2022-06-25 11:48               ` Arnd Bergmann
@ 2022-06-25 14:25                 ` Huacai Chen
  0 siblings, 0 replies; 16+ messages in thread
From: Huacai Chen @ 2022-06-25 14:25 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Guo Ren, Huacai Chen, loongarch, linux-arch, Xuefeng Li,
	Xuerui Wang, Jiaxun Yang, Peter Zijlstra, Will Deacon,
	Ingo Molnar, Rui Wang

Hi, Arnd,

On Sat, Jun 25, 2022 at 7:49 PM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Sat, Jun 25, 2022 at 8:54 AM Huacai Chen <chenhuacai@kernel.org> wrote:
> > On Thu, Jun 23, 2022 at 10:04 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > > On Thu, Jun 23, 2022 at 3:05 PM Huacai Chen <chenhuacai@kernel.org> wrote:
> > >
> > > If there is an architected feature bit for the delay, does that mean that there
> > > is a chance of CPUs getting released that set this to zero?
> > I had an offline discussion with our hardware engineers. They told me
> > that implementing "exclusive access of ll" and "random delay of sc" is
> > a mandatory requirement for multi-core LoongArch chips. Only
> > single-core and dual-core processors (which do not support multi-chip
> > interconnection) are allowed to omit these features.
>
> Ok, I see. I suppose the reason is that the dual-core version is safe
> without the random backoff because all use cases for qspinlock then
> involve at most one CPU waiting for a lock, right?
Right.

>
> Please put the explanation into the changelog text for the next version. It
> might be helpful to also document this in the source code itself, maybe
> with a boot-time assertion that checks that this guarantee actually holds,
> and an explanation that this is required for using qspinlock.
OK, this will be added to the commit message.

>
> Regardless of this, I think it still makes sense to use the same compile-time
> logic that Guo Ren suggested for the risc-v version, offering a choice between
> ticket spinlock and qspinlock when both make sense, possibly depending
> on CONFIG_NR_CPUS and CONFIG_NUMA.
OK, the dependency seems to make sense.

Huacai
>
>            Arnd
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2022-06-25 14:25 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-23  4:47 [PATCH V2 1/2] LoongArch: Add subword xchg/cmpxchg emulation Huacai Chen
2022-06-23  4:47 ` [PATCH V2 2/2] LoongArch: Add qspinlock support Huacai Chen
2022-06-23  5:44   ` Guo Ren
2022-06-23  7:56     ` Huacai Chen
2022-06-23  8:26       ` Arnd Bergmann
2022-06-23  8:32         ` Guo Ren
2022-06-23 13:05         ` Huacai Chen
2022-06-23 14:04           ` Arnd Bergmann
2022-06-25  2:42             ` Guo Ren
2022-06-25  6:22               ` Arnd Bergmann
2022-06-25  6:54             ` Huacai Chen
2022-06-25 11:48               ` Arnd Bergmann
2022-06-25 14:25                 ` Huacai Chen
2022-06-23  6:49 ` [PATCH V2 1/2] LoongArch: Add subword xchg/cmpxchg emulation Guo Ren
2022-06-23  8:04   ` Huacai Chen
2022-06-23  8:43     ` Guo Ren
