All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
@ 2021-07-24 12:36 Huacai Chen
  2021-07-24 12:36 ` [PATCH RFC 2/2] qspinlock: Use ARCH_HAS_HW_XCHG_SMALL to select _Q_PENDING_BITS definition Huacai Chen
                   ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Huacai Chen @ 2021-07-24 12:36 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Will Deacon, Arnd Bergmann
  Cc: Waiman Long, Boqun Feng, Guo Ren, linux-arch, Rui Wang,
	Xuefeng Li, Huacai Chen, Jiaxun Yang, Huacai Chen

Introduce a new Kconfig option ARCH_HAS_HW_XCHG_SMALL, which means arch
has hardware sub-word xchg/cmpxchg support. This option will be used as
an indicator to select the bit-field definition in the qspinlock data
structure.

Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/Kconfig       | 4 ++++
 arch/arm/Kconfig   | 1 +
 arch/arm64/Kconfig | 1 +
 arch/ia64/Kconfig  | 1 +
 arch/m68k/Kconfig  | 1 +
 arch/x86/Kconfig   | 1 +
 6 files changed, 9 insertions(+)

diff --git a/arch/Kconfig b/arch/Kconfig
index 129df498a8e1..ba5ed867b813 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -228,6 +228,10 @@ config ARCH_HAS_FORTIFY_SOURCE
 	  An architecture should select this when it can successfully
 	  build and run with CONFIG_FORTIFY_SOURCE.
 
+# Select if arch has hardware sub-word xchg/cmpxchg support
+config ARCH_HAS_HW_XCHG_SMALL
+	bool
+
 #
 # Select if the arch provides a historic keepinit alias for the retain_initrd
 # command line option
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 82f908fa5676..fc374ab3b5c7 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -8,6 +8,7 @@ config ARM
 	select ARCH_HAS_DMA_WRITE_COMBINE if !ARM_DMA_MEM_BUFFERABLE
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FORTIFY_SOURCE
+	select ARCH_HAS_HW_XCHG_SMALL
 	select ARCH_HAS_KEEPINITRD
 	select ARCH_HAS_KCOV
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b5b13a932561..393cbe1a6d85 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -25,6 +25,7 @@ config ARM64
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_GIGANTIC_PAGE
+	select ARCH_HAS_HW_XCHG_SMALL
 	select ARCH_HAS_KCOV
 	select ARCH_HAS_KEEPINITRD
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index cf425c2c63af..a0a31fd8d9be 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -9,6 +9,7 @@ menu "Processor type and features"
 config IA64
 	bool
 	select ARCH_HAS_DMA_MARK_CLEAN
+	select ARCH_HAS_HW_XCHG_SMALL
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
 	select ACPI
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 96989ad46f66..324c2a42ec00 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -5,6 +5,7 @@ config M68K
 	select ARCH_32BIT_OFF_T
 	select ARCH_HAS_BINFMT_FLAT
 	select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE
+	select ARCH_HAS_HW_XCHG_SMALL
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE if HAS_DMA
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS
 	select ARCH_MIGHT_HAVE_PC_PARPORT if ISA
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 49270655e827..0f3f24502c70 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -77,6 +77,7 @@ config X86
 	select ARCH_HAS_FILTER_PGPROT
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
+	select ARCH_HAS_HW_XCHG_SMALL
 	select ARCH_HAS_KCOV			if X86_64 && STACK_VALIDATION
 	select ARCH_HAS_MEM_ENCRYPT
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH RFC 2/2] qspinlock: Use ARCH_HAS_HW_XCHG_SMALL to select _Q_PENDING_BITS definition
  2021-07-24 12:36 [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL Huacai Chen
@ 2021-07-24 12:36 ` Huacai Chen
  2021-07-24 19:24 ` [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL Arnd Bergmann
  2021-07-26  8:36 ` Geert Uytterhoeven
  2 siblings, 0 replies; 26+ messages in thread
From: Huacai Chen @ 2021-07-24 12:36 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Will Deacon, Arnd Bergmann
  Cc: Waiman Long, Boqun Feng, Guo Ren, linux-arch, Rui Wang,
	Xuefeng Li, Huacai Chen, Jiaxun Yang, Huacai Chen

There are two types of bitfield definition in qspinlock data structues:

     * When NR_CPUS < 16K
     *  0- 7: locked byte
     *     8: pending
     *  9-15: not used
     * 16-17: tail index
     * 18-31: tail cpu (+1)
     *
     * When NR_CPUS >= 16K
     *  0- 7: locked byte
     *     8: pending
     *  9-10: tail index
     * 11-31: tail cpu (+1)

_Q_PENDING_BITS is 8 or 1 for the two types respectively. The second
type is a universal definition while the first type is an optimization
for NR_CPUS < 16K, but it relies on hardware 16bit xchg/cmpxchg support.

Unfortunately, some architectures don't have hardware sub-word xchg/
cmpxchg support. Though these archs can use software emulation (e.g.,
MIPS), but the cost is too expensive, and they have no benefits from
_Q_PENDING_BITS=8. So we only allow archs with ARCH_HAS_HW_XCHG_SMALL
to select _Q_PENDING_BITS=8.

This patch can let CSKY, RISC-V and other similar archs use qspinlock to
replace existing ticket spinlock.

Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 include/asm-generic/qspinlock_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h
index 2fd1fb89ec36..458e5d941c92 100644
--- a/include/asm-generic/qspinlock_types.h
+++ b/include/asm-generic/qspinlock_types.h
@@ -71,7 +71,7 @@ typedef struct qspinlock {
 #define _Q_LOCKED_MASK		_Q_SET_MASK(LOCKED)
 
 #define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
-#if CONFIG_NR_CPUS < (1U << 14)
+#if (CONFIG_NR_CPUS < (1U << 14)) && defined(CONFIG_ARCH_HAS_HW_XCHG_SMALL)
 #define _Q_PENDING_BITS		8
 #else
 #define _Q_PENDING_BITS		1
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-24 12:36 [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL Huacai Chen
  2021-07-24 12:36 ` [PATCH RFC 2/2] qspinlock: Use ARCH_HAS_HW_XCHG_SMALL to select _Q_PENDING_BITS definition Huacai Chen
@ 2021-07-24 19:24 ` Arnd Bergmann
  2021-07-25  3:06   ` Jiaxun Yang
  2021-07-26  8:36 ` Geert Uytterhoeven
  2 siblings, 1 reply; 26+ messages in thread
From: Arnd Bergmann @ 2021-07-24 19:24 UTC (permalink / raw)
  To: Huacai Chen
  Cc: Peter Zijlstra, Ingo Molnar, Will Deacon, Arnd Bergmann,
	Waiman Long, Boqun Feng, Guo Ren, linux-arch, Rui Wang,
	Xuefeng Li, Huacai Chen, Jiaxun Yang

On Sat, Jul 24, 2021 at 2:36 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
>
> Introduce a new Kconfig option ARCH_HAS_HW_XCHG_SMALL, which means arch
> has hardware sub-word xchg/cmpxchg support. This option will be used as
> an indicator to select the bit-field definition in the qspinlock data
> structure.
>
> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>

Adding a Kconfig conditional sounds like a good idea, but I have two
concerns about the specific implementation:

- I think we should have separate symbols for 8-bit, 16-bit and 64-bit
  cmpxchg(). I think every architecture needs to support at least 32-bit
  cmpxchg() and 64-bit architectures also need to support cmpxchg64().
  I actually have a prototype patch that introduces cmpxchg8() and
  cmpxchg16() helpers with the purpose of no longer supporting these
  width in the normal cmpxchg(), but that is mostly independent of
  whether we want a conditional or not.

- If I remember correctly, there were some concerns about whether using
  this information for picking the qspinlock implementation is a good idea.

         Arnd

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-24 19:24 ` [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL Arnd Bergmann
@ 2021-07-25  3:06   ` Jiaxun Yang
  2021-07-25 10:08     ` Arnd Bergmann
  0 siblings, 1 reply; 26+ messages in thread
From: Jiaxun Yang @ 2021-07-25  3:06 UTC (permalink / raw)
  To: Arnd Bergmann, Huacai Chen
  Cc: Peter Zijlstra, Ingo Molnar, Will Deacon, Waiman Long,
	Boqun Feng, Guo Ren, linux-arch, Rui Wang, Xuefeng Li,
	Huacai Chen



在 2021/7/25 上午3:24, Arnd Bergmann 写道:
> On Sat, Jul 24, 2021 at 2:36 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
>> Introduce a new Kconfig option ARCH_HAS_HW_XCHG_SMALL, which means arch
>> has hardware sub-word xchg/cmpxchg support. This option will be used as
>> an indicator to select the bit-field definition in the qspinlock data
>> structure.
>>
>> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
> Adding a Kconfig conditional sounds like a good idea, but I have two
> concerns about the specific implementation:
>
> - I think we should have separate symbols for 8-bit, 16-bit and 64-bit
>    cmpxchg(). I think every architecture needs to support at least 32-bit
>    cmpxchg() and 64-bit architectures also need to support cmpxchg64().
>    I actually have a prototype patch that introduces cmpxchg8() and
>    cmpxchg16() helpers with the purpose of no longer supporting these
>    width in the normal cmpxchg(), but that is mostly independent of
>    whether we want a conditional or not.
Yeah that seems a better solution to make supporting status clear,

>
> - If I remember correctly, there were some concerns about whether using
>    this information for picking the qspinlock implementation is a good idea.
We've checked previous attempt made by Guo Ren about
ARCH_USE_QUEUED_SPINLOCKS_XCHG32[1], the concerns of potential livelock 
do exist.

So in this patch Huacai took another, dropping the whole standalone 
tailing logic to remove
the usage of sub-word xchg. It could be understood as partial revert of  
69f9cae9 ("
locking/qspinlock: Optimize for smaller NR_CPUS") [2] on these 
architectures.

>
>           Arnd
[1]: 
https://lore.kernel.org/linux-csky/1617201040-83905-2-git-send-email-guoren@kernel.org/
[2]: 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=69f9cae90907e09af95fb991ed384670cef8dd32

Thanks.

- Jiaxun

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-25  3:06   ` Jiaxun Yang
@ 2021-07-25 10:08     ` Arnd Bergmann
  0 siblings, 0 replies; 26+ messages in thread
From: Arnd Bergmann @ 2021-07-25 10:08 UTC (permalink / raw)
  To: Jiaxun Yang
  Cc: Arnd Bergmann, Huacai Chen, Peter Zijlstra, Ingo Molnar,
	Will Deacon, Waiman Long, Boqun Feng, Guo Ren, linux-arch,
	Rui Wang, Xuefeng Li, Huacai Chen

On Sun, Jul 25, 2021 at 5:06 AM Jiaxun Yang <jiaxun.yang@flygoat.com> wrote:
> 在 2021/7/25 上午3:24, Arnd Bergmann 写道:
> > - If I remember correctly, there were some concerns about whether using
> >    this information for picking the qspinlock implementation is a good idea.
> We've checked previous attempt made by Guo Ren about
> ARCH_USE_QUEUED_SPINLOCKS_XCHG32[1], the concerns of potential livelock
> do exist.
>
> So in this patch Huacai took another, dropping the whole standalone
> tailing logic to remove the usage of sub-word xchg. It could be understood as
> partial revert of 69f9cae9 ("locking/qspinlock: Optimize for smaller NR_CPUS")
> [2] on these architectures.

Ok, I see. Let's see what Peter thinks about it.

        Arnd

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-24 12:36 [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL Huacai Chen
  2021-07-24 12:36 ` [PATCH RFC 2/2] qspinlock: Use ARCH_HAS_HW_XCHG_SMALL to select _Q_PENDING_BITS definition Huacai Chen
  2021-07-24 19:24 ` [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL Arnd Bergmann
@ 2021-07-26  8:36 ` Geert Uytterhoeven
  2021-07-26  8:56   ` Huacai Chen
  2021-07-26  9:00   ` Arnd Bergmann
  2 siblings, 2 replies; 26+ messages in thread
From: Geert Uytterhoeven @ 2021-07-26  8:36 UTC (permalink / raw)
  To: Huacai Chen
  Cc: Peter Zijlstra, Ingo Molnar, Will Deacon, Arnd Bergmann,
	Waiman Long, Boqun Feng, Guo Ren, Linux-Arch, Rui Wang,
	Xuefeng Li, Huacai Chen, Jiaxun Yang

Hi Huacai,

On Sat, Jul 24, 2021 at 2:36 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> Introduce a new Kconfig option ARCH_HAS_HW_XCHG_SMALL, which means arch
> has hardware sub-word xchg/cmpxchg support. This option will be used as
> an indicator to select the bit-field definition in the qspinlock data
> structure.
>
> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>

Thanks for your patch!

> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -228,6 +228,10 @@ config ARCH_HAS_FORTIFY_SOURCE
>           An architecture should select this when it can successfully
>           build and run with CONFIG_FORTIFY_SOURCE.
>
> +# Select if arch has hardware sub-word xchg/cmpxchg support
> +config ARCH_HAS_HW_XCHG_SMALL

What do you mean by "hardware"?
Does a software fallback count?

> --- a/arch/m68k/Kconfig
> +++ b/arch/m68k/Kconfig
> @@ -5,6 +5,7 @@ config M68K
>         select ARCH_32BIT_OFF_T
>         select ARCH_HAS_BINFMT_FLAT
>         select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE
> +       select ARCH_HAS_HW_XCHG_SMALL

M68k CPUs which support the CAS (Compare And Set) instruction do
support this on 8-bit, 16-bit, and 32-bit quantities.
M68k CPUs which lack CAS use a software implementation, which
supports the same quantities.

As CAS is used only if CONFIG_RMW_INSNS=y, perhaps this needs
a dependency?

   select ARCH_HAS_HW_XCHG_SMALL if RMW_INSNS

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-26  8:36 ` Geert Uytterhoeven
@ 2021-07-26  8:56   ` Huacai Chen
  2021-07-26 10:39     ` Boqun Feng
  2021-07-26  9:00   ` Arnd Bergmann
  1 sibling, 1 reply; 26+ messages in thread
From: Huacai Chen @ 2021-07-26  8:56 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Huacai Chen, Peter Zijlstra, Ingo Molnar, Will Deacon,
	Arnd Bergmann, Waiman Long, Boqun Feng, Guo Ren, Linux-Arch,
	Rui Wang, Xuefeng Li, Jiaxun Yang

Hi, Geert,

On Mon, Jul 26, 2021 at 4:36 PM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
>
> Hi Huacai,
>
> On Sat, Jul 24, 2021 at 2:36 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> > Introduce a new Kconfig option ARCH_HAS_HW_XCHG_SMALL, which means arch
> > has hardware sub-word xchg/cmpxchg support. This option will be used as
> > an indicator to select the bit-field definition in the qspinlock data
> > structure.
> >
> > Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
>
> Thanks for your patch!
>
> > --- a/arch/Kconfig
> > +++ b/arch/Kconfig
> > @@ -228,6 +228,10 @@ config ARCH_HAS_FORTIFY_SOURCE
> >           An architecture should select this when it can successfully
> >           build and run with CONFIG_FORTIFY_SOURCE.
> >
> > +# Select if arch has hardware sub-word xchg/cmpxchg support
> > +config ARCH_HAS_HW_XCHG_SMALL
>
> What do you mean by "hardware"?
> Does a software fallback count?
This new option is supposed as an indicator to select bit-field
definition of qspinlock, software fallback is not helpful in this
case.

>
> > --- a/arch/m68k/Kconfig
> > +++ b/arch/m68k/Kconfig
> > @@ -5,6 +5,7 @@ config M68K
> >         select ARCH_32BIT_OFF_T
> >         select ARCH_HAS_BINFMT_FLAT
> >         select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE
> > +       select ARCH_HAS_HW_XCHG_SMALL
>
> M68k CPUs which support the CAS (Compare And Set) instruction do
> support this on 8-bit, 16-bit, and 32-bit quantities.
> M68k CPUs which lack CAS use a software implementation, which
> supports the same quantities.
>
> As CAS is used only if CONFIG_RMW_INSNS=y, perhaps this needs
> a dependency?
OK, I think this dependency is needed.

Huacai

>
>    select ARCH_HAS_HW_XCHG_SMALL if RMW_INSNS
>
> Gr{oetje,eeting}s,
>
>                         Geert
>
> --
> Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
>
> In personal conversations with technical people, I call myself a hacker. But
> when I'm talking to journalists I just say "programmer" or something like that.
>                                 -- Linus Torvalds

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-26  8:36 ` Geert Uytterhoeven
  2021-07-26  8:56   ` Huacai Chen
@ 2021-07-26  9:00   ` Arnd Bergmann
  1 sibling, 0 replies; 26+ messages in thread
From: Arnd Bergmann @ 2021-07-26  9:00 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Huacai Chen, Peter Zijlstra, Ingo Molnar, Will Deacon,
	Arnd Bergmann, Waiman Long, Boqun Feng, Guo Ren, Linux-Arch,
	Rui Wang, Xuefeng Li, Huacai Chen, Jiaxun Yang

On Mon, Jul 26, 2021 at 10:36 AM Geert Uytterhoeven
<geert@linux-m68k.org> wrote:
> On Sat, Jul 24, 2021 at 2:36 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> > @@ -228,6 +228,10 @@ config ARCH_HAS_FORTIFY_SOURCE
> >           An architecture should select this when it can successfully
> >           build and run with CONFIG_FORTIFY_SOURCE.
> >
> > +# Select if arch has hardware sub-word xchg/cmpxchg support
> > +config ARCH_HAS_HW_XCHG_SMALL
>
> What do you mean by "hardware"?
> Does a software fallback count?

Indeed, this needs to be clarified

> > --- a/arch/m68k/Kconfig
> > +++ b/arch/m68k/Kconfig
> > @@ -5,6 +5,7 @@ config M68K
> >         select ARCH_32BIT_OFF_T
> >         select ARCH_HAS_BINFMT_FLAT
> >         select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE
> > +       select ARCH_HAS_HW_XCHG_SMALL
>
> M68k CPUs which support the CAS (Compare And Set) instruction do
> support this on 8-bit, 16-bit, and 32-bit quantities.
> M68k CPUs which lack CAS use a software implementation, which
> supports the same quantities.
>
> As CAS is used only if CONFIG_RMW_INSNS=y, perhaps this needs
> a dependency?

I would probably define that case as 'yes' then, defined as 'the 8-bit
and 16-bit cmpxchg/xchg helpers are no less atomic than the 32-bit
version'.

This would be in contrast to the MIPS versions that have native
32-bit and 64-bit cmpxchg() but only emulated 8-bit and 16-bit
versions.

Not sure about the parisc case, which only emulates the cmpxchg()
and xchg() based on spinlocks for any of the sizes, so they are not
atomic in regard to a concurrent READ_ONCE()/WRITE_ONCE().

       Arnd

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-26  8:56   ` Huacai Chen
@ 2021-07-26 10:39     ` Boqun Feng
  2021-07-26 16:41       ` Guo Ren
  0 siblings, 1 reply; 26+ messages in thread
From: Boqun Feng @ 2021-07-26 10:39 UTC (permalink / raw)
  To: Huacai Chen
  Cc: Geert Uytterhoeven, Huacai Chen, Peter Zijlstra, Ingo Molnar,
	Will Deacon, Arnd Bergmann, Waiman Long, Guo Ren, Linux-Arch,
	Rui Wang, Xuefeng Li, Jiaxun Yang

On Mon, Jul 26, 2021 at 04:56:49PM +0800, Huacai Chen wrote:
> Hi, Geert,
> 
> On Mon, Jul 26, 2021 at 4:36 PM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
> >
> > Hi Huacai,
> >
> > On Sat, Jul 24, 2021 at 2:36 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> > > Introduce a new Kconfig option ARCH_HAS_HW_XCHG_SMALL, which means arch
> > > has hardware sub-word xchg/cmpxchg support. This option will be used as
> > > an indicator to select the bit-field definition in the qspinlock data
> > > structure.
> > >
> > > Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
> >
> > Thanks for your patch!
> >
> > > --- a/arch/Kconfig
> > > +++ b/arch/Kconfig
> > > @@ -228,6 +228,10 @@ config ARCH_HAS_FORTIFY_SOURCE
> > >           An architecture should select this when it can successfully
> > >           build and run with CONFIG_FORTIFY_SOURCE.
> > >
> > > +# Select if arch has hardware sub-word xchg/cmpxchg support
> > > +config ARCH_HAS_HW_XCHG_SMALL
> >
> > What do you mean by "hardware"?
> > Does a software fallback count?
> This new option is supposed as an indicator to select bit-field
> definition of qspinlock, software fallback is not helpful in this
> case.
> 

I don't think this is true. IIUC, the rationale of the config is that
for some architectures, since the architectural cmpxchg doesn't provide
forward-progress guarantee then using cmpxchg of machine-word to
implement xchg{8,16}() may cause livelock, therefore these architectures
don't want to provide xchg{8,16}(), as a result they cannot work with
qspinlock when _Q_PENDING_BITS is 8.

So as long as an architecture can provide and has already provided an
implementation of xchg{8,16}() which guarantee forward-progress (even
though the implementation is using a machine-word size cmpxchg), the
architecture doesn't need to select ARCH_HAS_HW_XCHG_SMALL.

Regards,
Boqun

> >
> > > --- a/arch/m68k/Kconfig
> > > +++ b/arch/m68k/Kconfig
> > > @@ -5,6 +5,7 @@ config M68K
> > >         select ARCH_32BIT_OFF_T
> > >         select ARCH_HAS_BINFMT_FLAT
> > >         select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE
> > > +       select ARCH_HAS_HW_XCHG_SMALL
> >
> > M68k CPUs which support the CAS (Compare And Set) instruction do
> > support this on 8-bit, 16-bit, and 32-bit quantities.
> > M68k CPUs which lack CAS use a software implementation, which
> > supports the same quantities.
> >
> > As CAS is used only if CONFIG_RMW_INSNS=y, perhaps this needs
> > a dependency?
> OK, I think this dependency is needed.
> 
> Huacai
> 
> >
> >    select ARCH_HAS_HW_XCHG_SMALL if RMW_INSNS
> >
> > Gr{oetje,eeting}s,
> >
> >                         Geert
> >
> > --
> > Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
> >
> > In personal conversations with technical people, I call myself a hacker. But
> > when I'm talking to journalists I just say "programmer" or something like that.
> >                                 -- Linus Torvalds

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-26 10:39     ` Boqun Feng
@ 2021-07-26 16:41       ` Guo Ren
  2021-07-26 17:03         ` Boqun Feng
  0 siblings, 1 reply; 26+ messages in thread
From: Guo Ren @ 2021-07-26 16:41 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Huacai Chen, Geert Uytterhoeven, Huacai Chen, Peter Zijlstra,
	Ingo Molnar, Will Deacon, Arnd Bergmann, Waiman Long, Linux-Arch,
	Rui Wang, Xuefeng Li, Jiaxun Yang

On Mon, Jul 26, 2021 at 6:39 PM Boqun Feng <boqun.feng@gmail.com> wrote:
>
> On Mon, Jul 26, 2021 at 04:56:49PM +0800, Huacai Chen wrote:
> > Hi, Geert,
> >
> > On Mon, Jul 26, 2021 at 4:36 PM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
> > >
> > > Hi Huacai,
> > >
> > > On Sat, Jul 24, 2021 at 2:36 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> > > > Introduce a new Kconfig option ARCH_HAS_HW_XCHG_SMALL, which means arch
> > > > has hardware sub-word xchg/cmpxchg support. This option will be used as
> > > > an indicator to select the bit-field definition in the qspinlock data
> > > > structure.
> > > >
> > > > Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
> > >
> > > Thanks for your patch!
> > >
> > > > --- a/arch/Kconfig
> > > > +++ b/arch/Kconfig
> > > > @@ -228,6 +228,10 @@ config ARCH_HAS_FORTIFY_SOURCE
> > > >           An architecture should select this when it can successfully
> > > >           build and run with CONFIG_FORTIFY_SOURCE.
> > > >
> > > > +# Select if arch has hardware sub-word xchg/cmpxchg support
> > > > +config ARCH_HAS_HW_XCHG_SMALL
> > >
> > > What do you mean by "hardware"?
> > > Does a software fallback count?
> > This new option is supposed as an indicator to select bit-field
> > definition of qspinlock, software fallback is not helpful in this
> > case.
> >
>
> I don't think this is true. IIUC, the rationale of the config is that
> for some architectures, since the architectural cmpxchg doesn't provide
> forward-progress guarantee then using cmpxchg of machine-word to
> implement xchg{8,16}() may cause livelock, therefore these architectures
> don't want to provide xchg{8,16}(), as a result they cannot work with
> qspinlock when _Q_PENDING_BITS is 8.
>
> So as long as an architecture can provide and has already provided an
> implementation of xchg{8,16}() which guarantee forward-progress (even
> though the implementation is using a machine-word size cmpxchg), the
> architecture doesn't need to select ARCH_HAS_HW_XCHG_SMALL.
Seems only atomic could provide forward progress, isn't it? And lr/sc
couldn't implement xchg/cmpxchg primitive properly.

How to make CPU guarantee  "load + cmpxchg" forward-progress? Fusion
these instructions and lock the snoop channel?
Maybe hardware guys would think that it's easier to implement cas +
dcas + amo(short & byte).

>
> Regards,
> Boqun
>
> > >
> > > > --- a/arch/m68k/Kconfig
> > > > +++ b/arch/m68k/Kconfig
> > > > @@ -5,6 +5,7 @@ config M68K
> > > >         select ARCH_32BIT_OFF_T
> > > >         select ARCH_HAS_BINFMT_FLAT
> > > >         select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE
> > > > +       select ARCH_HAS_HW_XCHG_SMALL
> > >
> > > M68k CPUs which support the CAS (Compare And Set) instruction do
> > > support this on 8-bit, 16-bit, and 32-bit quantities.
> > > M68k CPUs which lack CAS use a software implementation, which
> > > supports the same quantities.
> > >
> > > As CAS is used only if CONFIG_RMW_INSNS=y, perhaps this needs
> > > a dependency?
> > OK, I think this dependency is needed.
> >
> > Huacai
> >
> > >
> > >    select ARCH_HAS_HW_XCHG_SMALL if RMW_INSNS
> > >
> > > Gr{oetje,eeting}s,
> > >
> > >                         Geert
> > >
> > > --
> > > Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
> > >
> > > In personal conversations with technical people, I call myself a hacker. But
> > > when I'm talking to journalists I just say "programmer" or something like that.
> > >                                 -- Linus Torvalds



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-26 16:41       ` Guo Ren
@ 2021-07-26 17:03         ` Boqun Feng
  2021-07-26 21:20           ` Waiman Long
                             ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Boqun Feng @ 2021-07-26 17:03 UTC (permalink / raw)
  To: Guo Ren
  Cc: Huacai Chen, Geert Uytterhoeven, Huacai Chen, Peter Zijlstra,
	Ingo Molnar, Will Deacon, Arnd Bergmann, Waiman Long, Linux-Arch,
	Rui Wang, Xuefeng Li, Jiaxun Yang

On Tue, Jul 27, 2021 at 12:41:34AM +0800, Guo Ren wrote:
> On Mon, Jul 26, 2021 at 6:39 PM Boqun Feng <boqun.feng@gmail.com> wrote:
> >
> > On Mon, Jul 26, 2021 at 04:56:49PM +0800, Huacai Chen wrote:
> > > Hi, Geert,
> > >
> > > On Mon, Jul 26, 2021 at 4:36 PM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
> > > >
> > > > Hi Huacai,
> > > >
> > > > On Sat, Jul 24, 2021 at 2:36 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> > > > > Introduce a new Kconfig option ARCH_HAS_HW_XCHG_SMALL, which means arch
> > > > > has hardware sub-word xchg/cmpxchg support. This option will be used as
> > > > > an indicator to select the bit-field definition in the qspinlock data
> > > > > structure.
> > > > >
> > > > > Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
> > > >
> > > > Thanks for your patch!
> > > >
> > > > > --- a/arch/Kconfig
> > > > > +++ b/arch/Kconfig
> > > > > @@ -228,6 +228,10 @@ config ARCH_HAS_FORTIFY_SOURCE
> > > > >           An architecture should select this when it can successfully
> > > > >           build and run with CONFIG_FORTIFY_SOURCE.
> > > > >
> > > > > +# Select if arch has hardware sub-word xchg/cmpxchg support
> > > > > +config ARCH_HAS_HW_XCHG_SMALL
> > > >
> > > > What do you mean by "hardware"?
> > > > Does a software fallback count?
> > > This new option is supposed as an indicator to select bit-field
> > > definition of qspinlock, software fallback is not helpful in this
> > > case.
> > >
> >
> > I don't think this is true. IIUC, the rationale of the config is that
> > for some architectures, since the architectural cmpxchg doesn't provide
> > forward-progress guarantee then using cmpxchg of machine-word to
> > implement xchg{8,16}() may cause livelock, therefore these architectures
> > don't want to provide xchg{8,16}(), as a result they cannot work with
> > qspinlock when _Q_PENDING_BITS is 8.
> >
> > So as long as an architecture can provide and has already provided an
> > implementation of xchg{8,16}() which guarantee forward-progress (even
> > though the implementation is using a machine-word size cmpxchg), the
> > architecture doesn't need to select ARCH_HAS_HW_XCHG_SMALL.
> Seems only atomic could provide forward progress, isn't it? And lr/sc
> couldn't implement xchg/cmpxchg primitive properly.
> 

I'm missing you point here, a) ll/sc can provide forward progress and b)
ll/sc instructions are used to implement xchg/cmpxchg (see ARM64 and
PPC).

> How to make CPU guarantee  "load + cmpxchg" forward-progress? Fusion
> these instructions and lock the snoop channel?
> Maybe hardware guys would think that it's easier to implement cas +
> dcas + amo(short & byte).
> 

Please note that if _Q_PENDING_BITS == 1, then the xchg_tail() is
implemented as a "load + cmpxchg", so if "load + cmpxchg" implementation
of xchg16() doesn't provide forward-progress in an architecture, neither
does xchg_tail().

Regards,
Boqun

> >
> > Regards,
> > Boqun
> >
> > > >
> > > > > --- a/arch/m68k/Kconfig
> > > > > +++ b/arch/m68k/Kconfig
> > > > > @@ -5,6 +5,7 @@ config M68K
> > > > >         select ARCH_32BIT_OFF_T
> > > > >         select ARCH_HAS_BINFMT_FLAT
> > > > >         select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE
> > > > > +       select ARCH_HAS_HW_XCHG_SMALL
> > > >
> > > > M68k CPUs which support the CAS (Compare And Set) instruction do
> > > > support this on 8-bit, 16-bit, and 32-bit quantities.
> > > > M68k CPUs which lack CAS use a software implementation, which
> > > > supports the same quantities.
> > > >
> > > > As CAS is used only if CONFIG_RMW_INSNS=y, perhaps this needs
> > > > a dependency?
> > > OK, I think this dependency is needed.
> > >
> > > Huacai
> > >
> > > >
> > > >    select ARCH_HAS_HW_XCHG_SMALL if RMW_INSNS
> > > >
> > > > Gr{oetje,eeting}s,
> > > >
> > > >                         Geert
> > > >
> > > > --
> > > > Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
> > > >
> > > > In personal conversations with technical people, I call myself a hacker. But
> > > > when I'm talking to journalists I just say "programmer" or something like that.
> > > >                                 -- Linus Torvalds
> 
> 
> 
> -- 
> Best Regards
>  Guo Ren
> 
> ML: https://lore.kernel.org/linux-csky/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-26 17:03         ` Boqun Feng
@ 2021-07-26 21:20           ` Waiman Long
  2021-07-27  1:27             ` Guo Ren
  2021-07-27 10:17             ` Peter Zijlstra
  2021-07-27  1:07           ` Guo Ren
  2021-07-27 10:12           ` Peter Zijlstra
  2 siblings, 2 replies; 26+ messages in thread
From: Waiman Long @ 2021-07-26 21:20 UTC (permalink / raw)
  To: Boqun Feng, Guo Ren
  Cc: Huacai Chen, Geert Uytterhoeven, Huacai Chen, Peter Zijlstra,
	Ingo Molnar, Will Deacon, Arnd Bergmann, Linux-Arch, Rui Wang,
	Xuefeng Li, Jiaxun Yang

On 7/26/21 1:03 PM, Boqun Feng wrote:
> On Tue, Jul 27, 2021 at 12:41:34AM +0800, Guo Ren wrote:
>> On Mon, Jul 26, 2021 at 6:39 PM Boqun Feng <boqun.feng@gmail.com> wrote:
>>> On Mon, Jul 26, 2021 at 04:56:49PM +0800, Huacai Chen wrote:
>>>> Hi, Geert,
>>>>
>>>> On Mon, Jul 26, 2021 at 4:36 PM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
>>>>> Hi Huacai,
>>>>>
>>>>> On Sat, Jul 24, 2021 at 2:36 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
>>>>>> Introduce a new Kconfig option ARCH_HAS_HW_XCHG_SMALL, which means arch
>>>>>> has hardware sub-word xchg/cmpxchg support. This option will be used as
>>>>>> an indicator to select the bit-field definition in the qspinlock data
>>>>>> structure.
>>>>>>
>>>>>> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
>>>>> Thanks for your patch!
>>>>>
>>>>>> --- a/arch/Kconfig
>>>>>> +++ b/arch/Kconfig
>>>>>> @@ -228,6 +228,10 @@ config ARCH_HAS_FORTIFY_SOURCE
>>>>>>            An architecture should select this when it can successfully
>>>>>>            build and run with CONFIG_FORTIFY_SOURCE.
>>>>>>
>>>>>> +# Select if arch has hardware sub-word xchg/cmpxchg support
>>>>>> +config ARCH_HAS_HW_XCHG_SMALL
>>>>> What do you mean by "hardware"?
>>>>> Does a software fallback count?
>>>> This new option is supposed as an indicator to select bit-field
>>>> definition of qspinlock, software fallback is not helpful in this
>>>> case.
>>>>
>>> I don't think this is true. IIUC, the rationale of the config is that
>>> for some architectures, since the architectural cmpxchg doesn't provide
>>> forward-progress guarantee then using cmpxchg of machine-word to
>>> implement xchg{8,16}() may cause livelock, therefore these architectures
>>> don't want to provide xchg{8,16}(), as a result they cannot work with
>>> qspinlock when _Q_PENDING_BITS is 8.
>>>
>>> So as long as an architecture can provide and has already provided an
>>> implementation of xchg{8,16}() which guarantee forward-progress (even
>>> though the implementation is using a machine-word size cmpxchg), the
>>> architecture doesn't need to select ARCH_HAS_HW_XCHG_SMALL.
>> Seems only atomic could provide forward progress, isn't it? And lr/sc
>> couldn't implement xchg/cmpxchg primitive properly.
>>
> I'm missing you point here, a) ll/sc can provide forward progress and b)
> ll/sc instructions are used to implement xchg/cmpxchg (see ARM64 and
> PPC).
>
>> How to make CPU guarantee  "load + cmpxchg" forward-progress? Fusion
>> these instructions and lock the snoop channel?
>> Maybe hardware guys would think that it's easier to implement cas +
>> dcas + amo(short & byte).
>>
> Please note that if _Q_PENDING_BITS == 1, then the xchg_tail() is
> implemented as a "load + cmpxchg", so if "load + cmpxchg" implementation
> of xchg16() doesn't provide forward-progress in an architecture, neither
> does xchg_tail().

Agreed. The xchg_tail() for the "_Q_PENDING_BITS == 1" case is a 
software emulation of xchg16(). Pure software emulation like that does 
not provide forward progress guarantee. This is usually not a big 
problem for non-RT kernel for which occasional long latency is 
acceptable, but it is not good for RT kernel.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-26 17:03         ` Boqun Feng
  2021-07-26 21:20           ` Waiman Long
@ 2021-07-27  1:07           ` Guo Ren
  2021-07-27  1:52             ` Wang Rui
                               ` (2 more replies)
  2021-07-27 10:12           ` Peter Zijlstra
  2 siblings, 3 replies; 26+ messages in thread
From: Guo Ren @ 2021-07-27  1:07 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Huacai Chen, Geert Uytterhoeven, Huacai Chen, Peter Zijlstra,
	Ingo Molnar, Will Deacon, Arnd Bergmann, Waiman Long, Linux-Arch,
	Rui Wang, Xuefeng Li, Jiaxun Yang

On Tue, Jul 27, 2021 at 1:03 AM Boqun Feng <boqun.feng@gmail.com> wrote:
>
> On Tue, Jul 27, 2021 at 12:41:34AM +0800, Guo Ren wrote:
> > On Mon, Jul 26, 2021 at 6:39 PM Boqun Feng <boqun.feng@gmail.com> wrote:
> > >
> > > On Mon, Jul 26, 2021 at 04:56:49PM +0800, Huacai Chen wrote:
> > > > Hi, Geert,
> > > >
> > > > On Mon, Jul 26, 2021 at 4:36 PM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
> > > > >
> > > > > Hi Huacai,
> > > > >
> > > > > On Sat, Jul 24, 2021 at 2:36 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> > > > > > Introduce a new Kconfig option ARCH_HAS_HW_XCHG_SMALL, which means arch
> > > > > > has hardware sub-word xchg/cmpxchg support. This option will be used as
> > > > > > an indicator to select the bit-field definition in the qspinlock data
> > > > > > structure.
> > > > > >
> > > > > > Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
> > > > >
> > > > > Thanks for your patch!
> > > > >
> > > > > > --- a/arch/Kconfig
> > > > > > +++ b/arch/Kconfig
> > > > > > @@ -228,6 +228,10 @@ config ARCH_HAS_FORTIFY_SOURCE
> > > > > >           An architecture should select this when it can successfully
> > > > > >           build and run with CONFIG_FORTIFY_SOURCE.
> > > > > >
> > > > > > +# Select if arch has hardware sub-word xchg/cmpxchg support
> > > > > > +config ARCH_HAS_HW_XCHG_SMALL
> > > > >
> > > > > What do you mean by "hardware"?
> > > > > Does a software fallback count?
> > > > This new option is supposed as an indicator to select bit-field
> > > > definition of qspinlock, software fallback is not helpful in this
> > > > case.
> > > >
> > >
> > > I don't think this is true. IIUC, the rationale of the config is that
> > > for some architectures, since the architectural cmpxchg doesn't provide
> > > forward-progress guarantee then using cmpxchg of machine-word to
> > > implement xchg{8,16}() may cause livelock, therefore these architectures
> > > don't want to provide xchg{8,16}(), as a result they cannot work with
> > > qspinlock when _Q_PENDING_BITS is 8.
> > >
> > > So as long as an architecture can provide and has already provided an
> > > implementation of xchg{8,16}() which guarantee forward-progress (even
> > > though the implementation is using a machine-word size cmpxchg), the
> > > architecture doesn't need to select ARCH_HAS_HW_XCHG_SMALL.
> > Seems only atomic could provide forward progress, isn't it? And lr/sc
> > couldn't implement xchg/cmpxchg primitive properly.
> >
>
> I'm missing you point here, a) ll/sc can provide forward progress and b)
> ll/sc instructions are used to implement xchg/cmpxchg (see ARM64 and
> PPC).
I don't think arm64 could provide fwd guarantee with ll/sc, otherwise,
they wouldn't add ARM64_HAS_LSE_ATOMICS for large systems.

>
> > How to make CPU ç  "load + cmpxchg" forward-progress? Fusion
> > these instructions and lock the snoop channel?
> > Maybe hardware guys would think that it's easier to implement cas +
> > dcas + amo(short & byte).
> >
>
> Please note that if _Q_PENDING_BITS == 1, then the xchg_tail() is
> implemented as a "load + cmpxchg", so if "load + cmpxchg" implementation
> of xchg16() doesn't provide forward-progress in an architecture, neither
> does xchg_tail().
That's the problem of "_Q_PENDING_BITS == 1", no hardware could
provide "load + ALU + cas" fwd guarantee!

A simple example, atomic a++:
c = READ_ONCE(g_value);
new = c + 1;
while ((old = cmpxchg(&g_value, c, new)) != c) {
    c = old;
    new = c + 1;
}

Q: When it runs on CPU0(500Mhz) & CPU1(2Ghz) in one SMP, how do we
prevent CPU1 from starving CPU0?
A: I think the answer is using AMO-add instruction:
atomic_add(1, &g_value);
(If the arch hasn't atomic instructions and using cmpxchg or lr/sc
implement atomic, it's the same problem.)

>
> Regards,
> Boqun
>
> > >
> > > Regards,
> > > Boqun
> > >
> > > > >
> > > > > > --- a/arch/m68k/Kconfig
> > > > > > +++ b/arch/m68k/Kconfig
> > > > > > @@ -5,6 +5,7 @@ config M68K
> > > > > >         select ARCH_32BIT_OFF_T
> > > > > >         select ARCH_HAS_BINFMT_FLAT
> > > > > >         select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE
> > > > > > +       select ARCH_HAS_HW_XCHG_SMALL
> > > > >
> > > > > M68k CPUs which support the CAS (Compare And Set) instruction do
> > > > > support this on 8-bit, 16-bit, and 32-bit quantities.
> > > > > M68k CPUs which lack CAS use a software implementation, which
> > > > > supports the same quantities.
> > > > >
> > > > > As CAS is used only if CONFIG_RMW_INSNS=y, perhaps this needs
> > > > > a dependency?
> > > > OK, I think this dependency is needed.
> > > >
> > > > Huacai
> > > >
> > > > >
> > > > >    select ARCH_HAS_HW_XCHG_SMALL if RMW_INSNS
> > > > >
> > > > > Gr{oetje,eeting}s,
> > > > >
> > > > >                         Geert
> > > > >
> > > > > --
> > > > > Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
> > > > >
> > > > > In personal conversations with technical people, I call myself a hacker. But
> > > > > when I'm talking to journalists I just say "programmer" or something like that.
> > > > >                                 -- Linus Torvalds
> >
> >
> >
> > --
> > Best Regards
> >  Guo Ren
> >
> > ML: https://lore.kernel.org/linux-csky/



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-26 21:20           ` Waiman Long
@ 2021-07-27  1:27             ` Guo Ren
  2021-07-27  2:29               ` Boqun Feng
  2021-07-27 10:17             ` Peter Zijlstra
  1 sibling, 1 reply; 26+ messages in thread
From: Guo Ren @ 2021-07-27  1:27 UTC (permalink / raw)
  To: Waiman Long
  Cc: Boqun Feng, Huacai Chen, Geert Uytterhoeven, Huacai Chen,
	Peter Zijlstra, Ingo Molnar, Will Deacon, Arnd Bergmann,
	Linux-Arch, Rui Wang, Xuefeng Li, Jiaxun Yang

On Tue, Jul 27, 2021 at 5:21 AM Waiman Long <llong@redhat.com> wrote:
>
> On 7/26/21 1:03 PM, Boqun Feng wrote:
> > On Tue, Jul 27, 2021 at 12:41:34AM +0800, Guo Ren wrote:
> >> On Mon, Jul 26, 2021 at 6:39 PM Boqun Feng <boqun.feng@gmail.com> wrote:
> >>> On Mon, Jul 26, 2021 at 04:56:49PM +0800, Huacai Chen wrote:
> >>>> Hi, Geert,
> >>>>
> >>>> On Mon, Jul 26, 2021 at 4:36 PM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
> >>>>> Hi Huacai,
> >>>>>
> >>>>> On Sat, Jul 24, 2021 at 2:36 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> >>>>>> Introduce a new Kconfig option ARCH_HAS_HW_XCHG_SMALL, which means arch
> >>>>>> has hardware sub-word xchg/cmpxchg support. This option will be used as
> >>>>>> an indicator to select the bit-field definition in the qspinlock data
> >>>>>> structure.
> >>>>>>
> >>>>>> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
> >>>>> Thanks for your patch!
> >>>>>
> >>>>>> --- a/arch/Kconfig
> >>>>>> +++ b/arch/Kconfig
> >>>>>> @@ -228,6 +228,10 @@ config ARCH_HAS_FORTIFY_SOURCE
> >>>>>>            An architecture should select this when it can successfully
> >>>>>>            build and run with CONFIG_FORTIFY_SOURCE.
> >>>>>>
> >>>>>> +# Select if arch has hardware sub-word xchg/cmpxchg support
> >>>>>> +config ARCH_HAS_HW_XCHG_SMALL
> >>>>> What do you mean by "hardware"?
> >>>>> Does a software fallback count?
> >>>> This new option is supposed as an indicator to select bit-field
> >>>> definition of qspinlock, software fallback is not helpful in this
> >>>> case.
> >>>>
> >>> I don't think this is true. IIUC, the rationale of the config is that
> >>> for some architectures, since the architectural cmpxchg doesn't provide
> >>> forward-progress guarantee then using cmpxchg of machine-word to
> >>> implement xchg{8,16}() may cause livelock, therefore these architectures
> >>> don't want to provide xchg{8,16}(), as a result they cannot work with
> >>> qspinlock when _Q_PENDING_BITS is 8.
> >>>
> >>> So as long as an architecture can provide and has already provided an
> >>> implementation of xchg{8,16}() which guarantee forward-progress (even
> >>> though the implementation is using a machine-word size cmpxchg), the
> >>> architecture doesn't need to select ARCH_HAS_HW_XCHG_SMALL.
> >> Seems only atomic could provide forward progress, isn't it? And lr/sc
> >> couldn't implement xchg/cmpxchg primitive properly.
> >>
> > I'm missing you point here, a) ll/sc can provide forward progress and b)
> > ll/sc instructions are used to implement xchg/cmpxchg (see ARM64 and
> > PPC).
> >
> >> How to make CPU guarantee  "load + cmpxchg" forward-progress? Fusion
> >> these instructions and lock the snoop channel?
> >> Maybe hardware guys would think that it's easier to implement cas +
> >> dcas + amo(short & byte).
> >>
> > Please note that if _Q_PENDING_BITS == 1, then the xchg_tail() is
> > implemented as a "load + cmpxchg", so if "load + cmpxchg" implementation
> > of xchg16() doesn't provide forward-progress in an architecture, neither
> > does xchg_tail().
>
> Agreed. The xchg_tail() for the "_Q_PENDING_BITS == 1" case is a
> software emulation of xchg16(). Pure software emulation like that does
> not provide forward progress guarantee. This is usually not a big
> problem for non-RT kernel for which occasional long latency is
> acceptable, but it is not good for RT kernel.
"How to implement xchg_tail" shouldn't force with _Q_PENDING_BITS, but
the arch could choose.
But this will raise another topic, is qspinlock suitable for these
arches? Maybe you've answered here: "for the non-RT kernel, Yes".

I remember you are the person who doesn't against the patch I've sent [1]:
[1] https://lore.kernel.org/linux-riscv/b6466a43-6fb3-dc47-e0ef-d493e0930ab2@redhat.com/

>
> Cheers,
> Longman
>

-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-27  1:07           ` Guo Ren
@ 2021-07-27  1:52             ` Wang Rui
  2021-07-27 11:03               ` Peter Zijlstra
  2021-07-27  2:00             ` Boqun Feng
  2021-07-27 10:50             ` Peter Zijlstra
  2 siblings, 1 reply; 26+ messages in thread
From: Wang Rui @ 2021-07-27  1:52 UTC (permalink / raw)
  To: Guo Ren
  Cc: Boqun Feng, Huacai Chen, Geert Uytterhoeven, Huacai Chen,
	Peter Zijlstra, Ingo Molnar, Will Deacon, Arnd Bergmann,
	Waiman Long, Linux-Arch, Xuefeng Li, Jiaxun Yang

Hi, Ren,


&gt; -----Original Messages-----
&gt; From: "Guo Ren" <guoren@kernel.org>
&gt; Sent Time: 2021-07-27 09:07:44 (Tuesday)
&gt; To: "Boqun Feng" <boqun.feng@gmail.com>
&gt; Cc: "Huacai Chen" <chenhuacai@gmail.com>, "Geert Uytterhoeven" <geert@linux-m68k.org>, "Huacai Chen" <chenhuacai@loongson.cn>, "Peter Zijlstra" <peterz@infradead.org>, "Ingo Molnar" <mingo@redhat.com>, "Will Deacon" <will@kernel.org>, "Arnd Bergmann" <arnd@arndb.de>, "Waiman Long" <longman@redhat.com>, Linux-Arch <linux-arch@vger.kernel.org>, "Rui Wang" <wangrui@loongson.cn>, "Xuefeng Li" <lixuefeng@loongson.cn>, "Jiaxun Yang" <jiaxun.yang@flygoat.com>
&gt; Subject: Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
&gt; 
&gt; On Tue, Jul 27, 2021 at 1:03 AM Boqun Feng <boqun.feng@gmail.com> wrote:
&gt; &gt;
&gt; &gt; On Tue, Jul 27, 2021 at 12:41:34AM +0800, Guo Ren wrote:
&gt; &gt; &gt; On Mon, Jul 26, 2021 at 6:39 PM Boqun Feng <boqun.feng@gmail.com> wrote:
&gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; On Mon, Jul 26, 2021 at 04:56:49PM +0800, Huacai Chen wrote:
&gt; &gt; &gt; &gt; &gt; Hi, Geert,
&gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; On Mon, Jul 26, 2021 at 4:36 PM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
&gt; &gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; &gt; Hi Huacai,
&gt; &gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; &gt; On Sat, Jul 24, 2021 at 2:36 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
&gt; &gt; &gt; &gt; &gt; &gt; &gt; Introduce a new Kconfig option ARCH_HAS_HW_XCHG_SMALL, which means arch
&gt; &gt; &gt; &gt; &gt; &gt; &gt; has hardware sub-word xchg/cmpxchg support. This option will be used as
&gt; &gt; &gt; &gt; &gt; &gt; &gt; an indicator to select the bit-field definition in the qspinlock data
&gt; &gt; &gt; &gt; &gt; &gt; &gt; structure.
&gt; &gt; &gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; &gt; &gt; Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
&gt; &gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; &gt; Thanks for your patch!
&gt; &gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; &gt; &gt; --- a/arch/Kconfig
&gt; &gt; &gt; &gt; &gt; &gt; &gt; +++ b/arch/Kconfig
&gt; &gt; &gt; &gt; &gt; &gt; &gt; @@ -228,6 +228,10 @@ config ARCH_HAS_FORTIFY_SOURCE
&gt; &gt; &gt; &gt; &gt; &gt; &gt;           An architecture should select this when it can successfully
&gt; &gt; &gt; &gt; &gt; &gt; &gt;           build and run with CONFIG_FORTIFY_SOURCE.
&gt; &gt; &gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; &gt; &gt; +# Select if arch has hardware sub-word xchg/cmpxchg support
&gt; &gt; &gt; &gt; &gt; &gt; &gt; +config ARCH_HAS_HW_XCHG_SMALL
&gt; &gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; &gt; What do you mean by "hardware"?
&gt; &gt; &gt; &gt; &gt; &gt; Does a software fallback count?
&gt; &gt; &gt; &gt; &gt; This new option is supposed as an indicator to select bit-field
&gt; &gt; &gt; &gt; &gt; definition of qspinlock, software fallback is not helpful in this
&gt; &gt; &gt; &gt; &gt; case.
&gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; I don't think this is true. IIUC, the rationale of the config is that
&gt; &gt; &gt; &gt; for some architectures, since the architectural cmpxchg doesn't provide
&gt; &gt; &gt; &gt; forward-progress guarantee then using cmpxchg of machine-word to
&gt; &gt; &gt; &gt; implement xchg{8,16}() may cause livelock, therefore these architectures
&gt; &gt; &gt; &gt; don't want to provide xchg{8,16}(), as a result they cannot work with
&gt; &gt; &gt; &gt; qspinlock when _Q_PENDING_BITS is 8.
&gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; So as long as an architecture can provide and has already provided an
&gt; &gt; &gt; &gt; implementation of xchg{8,16}() which guarantee forward-progress (even
&gt; &gt; &gt; &gt; though the implementation is using a machine-word size cmpxchg), the
&gt; &gt; &gt; &gt; architecture doesn't need to select ARCH_HAS_HW_XCHG_SMALL.
&gt; &gt; &gt; Seems only atomic could provide forward progress, isn't it? And lr/sc
&gt; &gt; &gt; couldn't implement xchg/cmpxchg primitive properly.
&gt; &gt; &gt;
&gt; &gt;
&gt; &gt; I'm missing you point here, a) ll/sc can provide forward progress and b)
&gt; &gt; ll/sc instructions are used to implement xchg/cmpxchg (see ARM64 and
&gt; &gt; PPC).
&gt; I don't think arm64 could provide fwd guarantee with ll/sc, otherwise,
&gt; they wouldn't add ARM64_HAS_LSE_ATOMICS for large systems.
&gt; 
&gt; &gt;
&gt; &gt; &gt; How to make CPU ç  "load + cmpxchg" forward-progress? Fusion
&gt; &gt; &gt; these instructions and lock the snoop channel?
&gt; &gt; &gt; Maybe hardware guys would think that it's easier to implement cas +
&gt; &gt; &gt; dcas + amo(short &amp; byte).
&gt; &gt; &gt;
&gt; &gt;
&gt; &gt; Please note that if _Q_PENDING_BITS == 1, then the xchg_tail() is
&gt; &gt; implemented as a "load + cmpxchg", so if "load + cmpxchg" implementation
&gt; &gt; of xchg16() doesn't provide forward-progress in an architecture, neither
&gt; &gt; does xchg_tail().
&gt; That's the problem of "_Q_PENDING_BITS == 1", no hardware could
&gt; provide "load + ALU + cas" fwd guarantee!
&gt; 
&gt; A simple example, atomic a++:
&gt; c = READ_ONCE(g_value);
&gt; new = c + 1;
&gt; while ((old = cmpxchg(&amp;g_value, c, new)) != c) {
&gt;     c = old;
&gt;     new = c + 1;
&gt; }
&gt; 
&gt; Q: When it runs on CPU0(500Mhz) &amp; CPU1(2Ghz) in one SMP, how do we
&gt; prevent CPU1 from starving CPU0?
&gt; A: I think the answer is using AMO-add instruction:
&gt; atomic_add(1, &amp;g_value);
&gt; (If the arch hasn't atomic instructions and using cmpxchg or lr/sc
&gt; implement atomic, it's the same problem.)
&gt; 

I think the forward progress are guaranteed while all operations are atomic(ll/sc or amo). If ll/sc runs on a fast cpu, there will be random delays, is that okay? Else, for such hardware, we can't even implement generic spinlock with ll/sc.

And I also think that the hardware supports normal store for unlocking. (e.g. arch_spin_unlock)

In qspinlock, when _Q_PENDING_BITS == 1, it's available for all hardware, because the clear_pending/clear_pending_set_locked are all atomic operations. Isn't it?

Q: Why live lock happens while _Q_PENDING_BITS == 8?
A: I found a case is:

* CPU A updates sub-word of qpsinlock at high frequency with normal store.
* CPU B do xchg_tail with load + cmpxchg, and the value of load is always not equal to the value of ll(cmpxchg).

qspinlock:
  0: locked
  1: pending
  2: tail

CPU A                    CPU B
1:                       1: &lt;--------------------+
  sh $newval, &amp;locked      lw  $v1, &amp;qspinlock   |
  add $newval, 1           and $t1, $v1, ~mask   |
  b 1b                     or  $t1, $t1, newval  | (live lock path)
                           ll  $v2, &amp;qspinlock   |
                           bne $v1, $v2, 1b -----+
                           sc  $t1, &amp;qspinlock
                           beq $t1, 0, 1b

If xchg_tail like this, at least there is no live lock on Loongson

xchg_tail:

1:
  ll  $v1, &amp;qspinlock
  and $t1, $v1, ~mask
  or  $t1, $t1, newval
  sc  $t1, &amp;qspinlock
  beq $t1, 0, 1b

For hardware that ll/sc is based on cache coherency, I think sc is easy to succeed. The ll makes cache-line is exclusive by CPU B, and the store of CPU A needs to acquire exclusive again, the sc may be completed before this.

&gt; &gt;
&gt; &gt; Regards,
&gt; &gt; Boqun
&gt; &gt;
&gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; Regards,
&gt; &gt; &gt; &gt; Boqun
&gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; &gt; &gt; --- a/arch/m68k/Kconfig
&gt; &gt; &gt; &gt; &gt; &gt; &gt; +++ b/arch/m68k/Kconfig
&gt; &gt; &gt; &gt; &gt; &gt; &gt; @@ -5,6 +5,7 @@ config M68K
&gt; &gt; &gt; &gt; &gt; &gt; &gt;         select ARCH_32BIT_OFF_T
&gt; &gt; &gt; &gt; &gt; &gt; &gt;         select ARCH_HAS_BINFMT_FLAT
&gt; &gt; &gt; &gt; &gt; &gt; &gt;         select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA &amp;&amp; MMU &amp;&amp; !COLDFIRE
&gt; &gt; &gt; &gt; &gt; &gt; &gt; +       select ARCH_HAS_HW_XCHG_SMALL
&gt; &gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; &gt; M68k CPUs which support the CAS (Compare And Set) instruction do
&gt; &gt; &gt; &gt; &gt; &gt; support this on 8-bit, 16-bit, and 32-bit quantities.
&gt; &gt; &gt; &gt; &gt; &gt; M68k CPUs which lack CAS use a software implementation, which
&gt; &gt; &gt; &gt; &gt; &gt; supports the same quantities.
&gt; &gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; &gt; As CAS is used only if CONFIG_RMW_INSNS=y, perhaps this needs
&gt; &gt; &gt; &gt; &gt; &gt; a dependency?
&gt; &gt; &gt; &gt; &gt; OK, I think this dependency is needed.
&gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; Huacai
&gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; &gt;    select ARCH_HAS_HW_XCHG_SMALL if RMW_INSNS
&gt; &gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; &gt; Gr{oetje,eeting}s,
&gt; &gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; &gt;                         Geert
&gt; &gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; &gt; --
&gt; &gt; &gt; &gt; &gt; &gt; Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
&gt; &gt; &gt; &gt; &gt; &gt;
&gt; &gt; &gt; &gt; &gt; &gt; In personal conversations with technical people, I call myself a hacker. But
&gt; &gt; &gt; &gt; &gt; &gt; when I'm talking to journalists I just say "programmer" or something like that.
&gt; &gt; &gt; &gt; &gt; &gt;                                 -- Linus Torvalds
&gt; &gt; &gt;
&gt; &gt; &gt;
&gt; &gt; &gt;
&gt; &gt; &gt; --
&gt; &gt; &gt; Best Regards
&gt; &gt; &gt;  Guo Ren
&gt; &gt; &gt;
&gt; &gt; &gt; ML: https://lore.kernel.org/linux-csky/
&gt; 
&gt; 
&gt; 
&gt; -- 
&gt; Best Regards
&gt;  Guo Ren
&gt; 
&gt; ML: https://lore.kernel.org/linux-csky/
</chenhuacai@loongson.cn></chenhuacai@loongson.cn></geert@linux-m68k.org></boqun.feng@gmail.com></boqun.feng@gmail.com></jiaxun.yang@flygoat.com></lixuefeng@loongson.cn></wangrui@loongson.cn></linux-arch@vger.kernel.org></longman@redhat.com></arnd@arndb.de></will@kernel.org></mingo@redhat.com></peterz@infradead.org></chenhuacai@loongson.cn></geert@linux-m68k.org></chenhuacai@gmail.com></boqun.feng@gmail.com></guoren@kernel.org>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-27  1:07           ` Guo Ren
  2021-07-27  1:52             ` Wang Rui
@ 2021-07-27  2:00             ` Boqun Feng
  2021-07-27 10:50             ` Peter Zijlstra
  2 siblings, 0 replies; 26+ messages in thread
From: Boqun Feng @ 2021-07-27  2:00 UTC (permalink / raw)
  To: Guo Ren
  Cc: Huacai Chen, Geert Uytterhoeven, Huacai Chen, Peter Zijlstra,
	Ingo Molnar, Will Deacon, Arnd Bergmann, Waiman Long, Linux-Arch,
	Rui Wang, Xuefeng Li, Jiaxun Yang

On Tue, Jul 27, 2021 at 09:07:44AM +0800, Guo Ren wrote:
> On Tue, Jul 27, 2021 at 1:03 AM Boqun Feng <boqun.feng@gmail.com> wrote:
> >
> > On Tue, Jul 27, 2021 at 12:41:34AM +0800, Guo Ren wrote:
> > > On Mon, Jul 26, 2021 at 6:39 PM Boqun Feng <boqun.feng@gmail.com> wrote:
> > > >
> > > > On Mon, Jul 26, 2021 at 04:56:49PM +0800, Huacai Chen wrote:
> > > > > Hi, Geert,
> > > > >
> > > > > On Mon, Jul 26, 2021 at 4:36 PM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
> > > > > >
> > > > > > Hi Huacai,
> > > > > >
> > > > > > On Sat, Jul 24, 2021 at 2:36 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> > > > > > > Introduce a new Kconfig option ARCH_HAS_HW_XCHG_SMALL, which means arch
> > > > > > > has hardware sub-word xchg/cmpxchg support. This option will be used as
> > > > > > > an indicator to select the bit-field definition in the qspinlock data
> > > > > > > structure.
> > > > > > >
> > > > > > > Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
> > > > > >
> > > > > > Thanks for your patch!
> > > > > >
> > > > > > > --- a/arch/Kconfig
> > > > > > > +++ b/arch/Kconfig
> > > > > > > @@ -228,6 +228,10 @@ config ARCH_HAS_FORTIFY_SOURCE
> > > > > > >           An architecture should select this when it can successfully
> > > > > > >           build and run with CONFIG_FORTIFY_SOURCE.
> > > > > > >
> > > > > > > +# Select if arch has hardware sub-word xchg/cmpxchg support
> > > > > > > +config ARCH_HAS_HW_XCHG_SMALL
> > > > > >
> > > > > > What do you mean by "hardware"?
> > > > > > Does a software fallback count?
> > > > > This new option is supposed as an indicator to select bit-field
> > > > > definition of qspinlock, software fallback is not helpful in this
> > > > > case.
> > > > >
> > > >
> > > > I don't think this is true. IIUC, the rationale of the config is that
> > > > for some architectures, since the architectural cmpxchg doesn't provide
> > > > forward-progress guarantee then using cmpxchg of machine-word to
> > > > implement xchg{8,16}() may cause livelock, therefore these architectures
> > > > don't want to provide xchg{8,16}(), as a result they cannot work with
> > > > qspinlock when _Q_PENDING_BITS is 8.
> > > >
> > > > So as long as an architecture can provide and has already provided an
> > > > implementation of xchg{8,16}() which guarantee forward-progress (even
> > > > though the implementation is using a machine-word size cmpxchg), the
> > > > architecture doesn't need to select ARCH_HAS_HW_XCHG_SMALL.
> > > Seems only atomic could provide forward progress, isn't it? And lr/sc
> > > couldn't implement xchg/cmpxchg primitive properly.
> > >
> >
> > I'm missing you point here, a) ll/sc can provide forward progress and b)
> > ll/sc instructions are used to implement xchg/cmpxchg (see ARM64 and
> > PPC).
> I don't think arm64 could provide fwd guarantee with ll/sc, otherwise,
> they wouldn't add ARM64_HAS_LSE_ATOMICS for large systems.
> 
> >
> > > How to make CPU ç  "load + cmpxchg" forward-progress? Fusion
> > > these instructions and lock the snoop channel?
> > > Maybe hardware guys would think that it's easier to implement cas +
> > > dcas + amo(short & byte).
> > >
> >
> > Please note that if _Q_PENDING_BITS == 1, then the xchg_tail() is
> > implemented as a "load + cmpxchg", so if "load + cmpxchg" implementation
> > of xchg16() doesn't provide forward-progress in an architecture, neither
> > does xchg_tail().
> That's the problem of "_Q_PENDING_BITS == 1", no hardware could
> provide "load + ALU + cas" fwd guarantee!
> 

You lost me again... in this patchset, archs that don't have hardware
xchg16 will use _Q_PENDING_BITS == 1, and now you tell me that
"_Q_PENDING_BITS == 1" has a problem, so what's the point of this
patchset? Does the problem of "_Q_PENDING_BITS == 1" need fixes? If so,
when do you plan to send the fix?

Regards,
Boqun

> A simple example, atomic a++:
> c = READ_ONCE(g_value);
> new = c + 1;
> while ((old = cmpxchg(&g_value, c, new)) != c) {
>     c = old;
>     new = c + 1;
> }
> 
> Q: When it runs on CPU0(500Mhz) & CPU1(2Ghz) in one SMP, how do we
> prevent CPU1 from starving CPU0?
> A: I think the answer is using AMO-add instruction:
> atomic_add(1, &g_value);
> (If the arch hasn't atomic instructions and using cmpxchg or lr/sc
> implement atomic, it's the same problem.)
> 
> >
> > Regards,
> > Boqun
> >
> > > >
> > > > Regards,
> > > > Boqun
> > > >
> > > > > >
> > > > > > > --- a/arch/m68k/Kconfig
> > > > > > > +++ b/arch/m68k/Kconfig
> > > > > > > @@ -5,6 +5,7 @@ config M68K
> > > > > > >         select ARCH_32BIT_OFF_T
> > > > > > >         select ARCH_HAS_BINFMT_FLAT
> > > > > > >         select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE
> > > > > > > +       select ARCH_HAS_HW_XCHG_SMALL
> > > > > >
> > > > > > M68k CPUs which support the CAS (Compare And Set) instruction do
> > > > > > support this on 8-bit, 16-bit, and 32-bit quantities.
> > > > > > M68k CPUs which lack CAS use a software implementation, which
> > > > > > supports the same quantities.
> > > > > >
> > > > > > As CAS is used only if CONFIG_RMW_INSNS=y, perhaps this needs
> > > > > > a dependency?
> > > > > OK, I think this dependency is needed.
> > > > >
> > > > > Huacai
> > > > >
> > > > > >
> > > > > >    select ARCH_HAS_HW_XCHG_SMALL if RMW_INSNS
> > > > > >
> > > > > > Gr{oetje,eeting}s,
> > > > > >
> > > > > >                         Geert
> > > > > >
> > > > > > --
> > > > > > Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
> > > > > >
> > > > > > In personal conversations with technical people, I call myself a hacker. But
> > > > > > when I'm talking to journalists I just say "programmer" or something like that.
> > > > > >                                 -- Linus Torvalds
> > >
> > >
> > >
> > > --
> > > Best Regards
> > >  Guo Ren
> > >
> > > ML: https://lore.kernel.org/linux-csky/
> 
> 
> 
> -- 
> Best Regards
>  Guo Ren
> 
> ML: https://lore.kernel.org/linux-csky/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-27  1:27             ` Guo Ren
@ 2021-07-27  2:29               ` Boqun Feng
  2021-07-27  2:46                 ` Waiman Long
  2021-07-27 11:05                 ` Peter Zijlstra
  0 siblings, 2 replies; 26+ messages in thread
From: Boqun Feng @ 2021-07-27  2:29 UTC (permalink / raw)
  To: Guo Ren
  Cc: Waiman Long, Huacai Chen, Geert Uytterhoeven, Huacai Chen,
	Peter Zijlstra, Ingo Molnar, Will Deacon, Arnd Bergmann,
	Linux-Arch, Rui Wang, Xuefeng Li, Jiaxun Yang

On Tue, Jul 27, 2021 at 09:27:51AM +0800, Guo Ren wrote:
> On Tue, Jul 27, 2021 at 5:21 AM Waiman Long <llong@redhat.com> wrote:
> >
> > On 7/26/21 1:03 PM, Boqun Feng wrote:
> > > On Tue, Jul 27, 2021 at 12:41:34AM +0800, Guo Ren wrote:
> > >> On Mon, Jul 26, 2021 at 6:39 PM Boqun Feng <boqun.feng@gmail.com> wrote:
> > >>> On Mon, Jul 26, 2021 at 04:56:49PM +0800, Huacai Chen wrote:
> > >>>> Hi, Geert,
> > >>>>
> > >>>> On Mon, Jul 26, 2021 at 4:36 PM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
> > >>>>> Hi Huacai,
> > >>>>>
> > >>>>> On Sat, Jul 24, 2021 at 2:36 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
> > >>>>>> Introduce a new Kconfig option ARCH_HAS_HW_XCHG_SMALL, which means arch
> > >>>>>> has hardware sub-word xchg/cmpxchg support. This option will be used as
> > >>>>>> an indicator to select the bit-field definition in the qspinlock data
> > >>>>>> structure.
> > >>>>>>
> > >>>>>> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
> > >>>>> Thanks for your patch!
> > >>>>>
> > >>>>>> --- a/arch/Kconfig
> > >>>>>> +++ b/arch/Kconfig
> > >>>>>> @@ -228,6 +228,10 @@ config ARCH_HAS_FORTIFY_SOURCE
> > >>>>>>            An architecture should select this when it can successfully
> > >>>>>>            build and run with CONFIG_FORTIFY_SOURCE.
> > >>>>>>
> > >>>>>> +# Select if arch has hardware sub-word xchg/cmpxchg support
> > >>>>>> +config ARCH_HAS_HW_XCHG_SMALL
> > >>>>> What do you mean by "hardware"?
> > >>>>> Does a software fallback count?
> > >>>> This new option is supposed as an indicator to select bit-field
> > >>>> definition of qspinlock, software fallback is not helpful in this
> > >>>> case.
> > >>>>
> > >>> I don't think this is true. IIUC, the rationale of the config is that
> > >>> for some architectures, since the architectural cmpxchg doesn't provide
> > >>> forward-progress guarantee then using cmpxchg of machine-word to
> > >>> implement xchg{8,16}() may cause livelock, therefore these architectures
> > >>> don't want to provide xchg{8,16}(), as a result they cannot work with
> > >>> qspinlock when _Q_PENDING_BITS is 8.
> > >>>
> > >>> So as long as an architecture can provide and has already provided an
> > >>> implementation of xchg{8,16}() which guarantee forward-progress (even
> > >>> though the implementation is using a machine-word size cmpxchg), the
> > >>> architecture doesn't need to select ARCH_HAS_HW_XCHG_SMALL.
> > >> Seems only atomic could provide forward progress, isn't it? And lr/sc
> > >> couldn't implement xchg/cmpxchg primitive properly.
> > >>
> > > I'm missing you point here, a) ll/sc can provide forward progress and b)
> > > ll/sc instructions are used to implement xchg/cmpxchg (see ARM64 and
> > > PPC).
> > >
> > >> How to make CPU guarantee  "load + cmpxchg" forward-progress? Fusion
> > >> these instructions and lock the snoop channel?
> > >> Maybe hardware guys would think that it's easier to implement cas +
> > >> dcas + amo(short & byte).
> > >>
> > > Please note that if _Q_PENDING_BITS == 1, then the xchg_tail() is
> > > implemented as a "load + cmpxchg", so if "load + cmpxchg" implementation
> > > of xchg16() doesn't provide forward-progress in an architecture, neither
> > > does xchg_tail().
> >
> > Agreed. The xchg_tail() for the "_Q_PENDING_BITS == 1" case is a
> > software emulation of xchg16(). Pure software emulation like that does
> > not provide forward progress guarantee. This is usually not a big
> > problem for non-RT kernel for which occasional long latency is
> > acceptable, but it is not good for RT kernel.
> "How to implement xchg_tail" shouldn't force with _Q_PENDING_BITS, but
> the arch could choose.

I actually agree with this part, but this patchset failed to provide
enough evidences on why we should choose xchg_tail() implementation
based on whether hardware has xchg16, more precisely, for an archtecture
which doesn't have a hardware xchg16, why cmpxchg emulated xchg16() is
worse than a "load+cmpxchg) implemeneted xchg_tail()? If it's a
performance reason, please show some numbers.

In fact, why don't you introduce a ARCH_QSPINLOCK_USE_GENERIC_XCHG_TAIL,
and only select it for csky and risc-v, and let other archs choose to
select or it themselves? FWIW, qspinlock code looks like something
below with this config:

 #if (CONFIG_NR_CPUS < (1U << 14)) && !defined(CONFIG_ARCH_QSPINLOCK_USE_GENERIC_XCHG_TAIL)
 #define _Q_PENDING_BITS                8
 #else
 #define _Q_PENDING_BITS                1

Just my two cents.

Regards,
Boqun

> But this will raise another topic, is qspinlock suitable for these
> arches? Maybe you've answered here: "for the non-RT kernel, Yes".
> 
> I remember you are the person who doesn't against the patch I've sent [1]:
> [1] https://lore.kernel.org/linux-riscv/b6466a43-6fb3-dc47-e0ef-d493e0930ab2@redhat.com/
> 
> >
> > Cheers,
> > Longman
> >
> 
> -- 
> Best Regards
>  Guo Ren
> 
> ML: https://lore.kernel.org/linux-csky/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-27  2:29               ` Boqun Feng
@ 2021-07-27  2:46                 ` Waiman Long
  2021-07-27 11:05                 ` Peter Zijlstra
  1 sibling, 0 replies; 26+ messages in thread
From: Waiman Long @ 2021-07-27  2:46 UTC (permalink / raw)
  To: Boqun Feng, Guo Ren
  Cc: Waiman Long, Huacai Chen, Geert Uytterhoeven, Huacai Chen,
	Peter Zijlstra, Ingo Molnar, Will Deacon, Arnd Bergmann,
	Linux-Arch, Rui Wang, Xuefeng Li, Jiaxun Yang

On 7/26/21 10:29 PM, Boqun Feng wrote:
> On Tue, Jul 27, 2021 at 09:27:51AM +0800, Guo Ren wrote:
>> On Tue, Jul 27, 2021 at 5:21 AM Waiman Long <llong@redhat.com> wrote:
>>> On 7/26/21 1:03 PM, Boqun Feng wrote:
>>>> On Tue, Jul 27, 2021 at 12:41:34AM +0800, Guo Ren wrote:
>>>>> On Mon, Jul 26, 2021 at 6:39 PM Boqun Feng <boqun.feng@gmail.com> wrote:
>>>>>> On Mon, Jul 26, 2021 at 04:56:49PM +0800, Huacai Chen wrote:
>>>>>>> Hi, Geert,
>>>>>>>
>>>>>>> On Mon, Jul 26, 2021 at 4:36 PM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
>>>>>>>> Hi Huacai,
>>>>>>>>
>>>>>>>> On Sat, Jul 24, 2021 at 2:36 PM Huacai Chen <chenhuacai@loongson.cn> wrote:
>>>>>>>>> Introduce a new Kconfig option ARCH_HAS_HW_XCHG_SMALL, which means arch
>>>>>>>>> has hardware sub-word xchg/cmpxchg support. This option will be used as
>>>>>>>>> an indicator to select the bit-field definition in the qspinlock data
>>>>>>>>> structure.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
>>>>>>>> Thanks for your patch!
>>>>>>>>
>>>>>>>>> --- a/arch/Kconfig
>>>>>>>>> +++ b/arch/Kconfig
>>>>>>>>> @@ -228,6 +228,10 @@ config ARCH_HAS_FORTIFY_SOURCE
>>>>>>>>>             An architecture should select this when it can successfully
>>>>>>>>>             build and run with CONFIG_FORTIFY_SOURCE.
>>>>>>>>>
>>>>>>>>> +# Select if arch has hardware sub-word xchg/cmpxchg support
>>>>>>>>> +config ARCH_HAS_HW_XCHG_SMALL
>>>>>>>> What do you mean by "hardware"?
>>>>>>>> Does a software fallback count?
>>>>>>> This new option is supposed as an indicator to select bit-field
>>>>>>> definition of qspinlock, software fallback is not helpful in this
>>>>>>> case.
>>>>>>>
>>>>>> I don't think this is true. IIUC, the rationale of the config is that
>>>>>> for some architectures, since the architectural cmpxchg doesn't provide
>>>>>> forward-progress guarantee then using cmpxchg of machine-word to
>>>>>> implement xchg{8,16}() may cause livelock, therefore these architectures
>>>>>> don't want to provide xchg{8,16}(), as a result they cannot work with
>>>>>> qspinlock when _Q_PENDING_BITS is 8.
>>>>>>
>>>>>> So as long as an architecture can provide and has already provided an
>>>>>> implementation of xchg{8,16}() which guarantee forward-progress (even
>>>>>> though the implementation is using a machine-word size cmpxchg), the
>>>>>> architecture doesn't need to select ARCH_HAS_HW_XCHG_SMALL.
>>>>> Seems only atomic could provide forward progress, isn't it? And lr/sc
>>>>> couldn't implement xchg/cmpxchg primitive properly.
>>>>>
>>>> I'm missing you point here, a) ll/sc can provide forward progress and b)
>>>> ll/sc instructions are used to implement xchg/cmpxchg (see ARM64 and
>>>> PPC).
>>>>
>>>>> How to make CPU guarantee  "load + cmpxchg" forward-progress? Fusion
>>>>> these instructions and lock the snoop channel?
>>>>> Maybe hardware guys would think that it's easier to implement cas +
>>>>> dcas + amo(short & byte).
>>>>>
>>>> Please note that if _Q_PENDING_BITS == 1, then the xchg_tail() is
>>>> implemented as a "load + cmpxchg", so if "load + cmpxchg" implementation
>>>> of xchg16() doesn't provide forward-progress in an architecture, neither
>>>> does xchg_tail().
>>> Agreed. The xchg_tail() for the "_Q_PENDING_BITS == 1" case is a
>>> software emulation of xchg16(). Pure software emulation like that does
>>> not provide forward progress guarantee. This is usually not a big
>>> problem for non-RT kernel for which occasional long latency is
>>> acceptable, but it is not good for RT kernel.
>> "How to implement xchg_tail" shouldn't force with _Q_PENDING_BITS, but
>> the arch could choose.
> I actually agree with this part, but this patchset failed to provide
> enough evidences on why we should choose xchg_tail() implementation
> based on whether hardware has xchg16, more precisely, for an archtecture
> which doesn't have a hardware xchg16, why cmpxchg emulated xchg16() is
> worse than a "load+cmpxchg) implemeneted xchg_tail()? If it's a
> performance reason, please show some numbers.
>
> In fact, why don't you introduce a ARCH_QSPINLOCK_USE_GENERIC_XCHG_TAIL,
> and only select it for csky and risc-v, and let other archs choose to
> select or it themselves? FWIW, qspinlock code looks like something
> below with this config:
>
>   #if (CONFIG_NR_CPUS < (1U << 14)) && !defined(CONFIG_ARCH_QSPINLOCK_USE_GENERIC_XCHG_TAIL)
>   #define _Q_PENDING_BITS                8
>   #else
>   #define _Q_PENDING_BITS                1
>
> Just my two cents.
>
The patch sent out by Guo Ren was actually similar to this approach [1]. 
I actually like it better than this patch because the pending bit 
handling is more optimized with "_Q_PENDING_BITS == 8". So we shouldn't 
force _Q_PENDING_BITS to 1 if we just want to avoid using xchg16().

[1] 
https://lore.kernel.org/lkml/1616868399-82848-4-git-send-email-guoren@kernel.org/

Cheers,
Longman


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-26 17:03         ` Boqun Feng
  2021-07-26 21:20           ` Waiman Long
  2021-07-27  1:07           ` Guo Ren
@ 2021-07-27 10:12           ` Peter Zijlstra
  2 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2021-07-27 10:12 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Guo Ren, Huacai Chen, Geert Uytterhoeven, Huacai Chen,
	Ingo Molnar, Will Deacon, Arnd Bergmann, Waiman Long, Linux-Arch,
	Rui Wang, Xuefeng Li, Jiaxun Yang

On Tue, Jul 27, 2021 at 01:03:32AM +0800, Boqun Feng wrote:

> I'm missing you point here, a) ll/sc can provide forward progress and b)
> ll/sc instructions are used to implement xchg/cmpxchg (see ARM64 and
> PPC).

Correct on both counts, but b) is tricky, even if a), then it doesn't
hold that the primitive resulting from b) also provides fwd progress.

I feel this point is often overlooked. I should go add something to
atomic_t.txt about that I suppose.

> > How to make CPU guarantee  "load + cmpxchg" forward-progress? Fusion
> > these instructions and lock the snoop channel?
> > Maybe hardware guys would think that it's easier to implement cas +
> > dcas + amo(short & byte).
> > 
> 
> Please note that if _Q_PENDING_BITS == 1, then the xchg_tail() is
> implemented as a "load + cmpxchg", so if "load + cmpxchg" implementation
> of xchg16() doesn't provide forward-progress in an architecture, neither
> does xchg_tail().

Right, so generally we rely on cmpxchg() to provide fairness. Some
architectures (notably Sparc64) go to great lengths to ensure this.

I have memories of adding backoff to an LL/SC based arch at some point,
but I cannot find it in a hurry, so it could be one of the since deleted
archs.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-26 21:20           ` Waiman Long
  2021-07-27  1:27             ` Guo Ren
@ 2021-07-27 10:17             ` Peter Zijlstra
  1 sibling, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2021-07-27 10:17 UTC (permalink / raw)
  To: Waiman Long
  Cc: Boqun Feng, Guo Ren, Huacai Chen, Geert Uytterhoeven,
	Huacai Chen, Ingo Molnar, Will Deacon, Arnd Bergmann, Linux-Arch,
	Rui Wang, Xuefeng Li, Jiaxun Yang

On Mon, Jul 26, 2021 at 05:20:55PM -0400, Waiman Long wrote:

> Agreed. The xchg_tail() for the "_Q_PENDING_BITS == 1" case is a software
> emulation of xchg16(). Pure software emulation like that does not provide
> forward progress guarantee. This is usually not a big problem for non-RT
> kernel for which occasional long latency is acceptable, but it is not good
> for RT kernel.

Even !RT doesn't like lock starvation. We've had quite a number of truly
terrible performance problems due to lock starvation over the years.

Please don't categorize this as an RT issue. It is true that RT
absolutely must not have unfair locks, but you can't turn that around
and say that only RT requires fair locks.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-27  1:07           ` Guo Ren
  2021-07-27  1:52             ` Wang Rui
  2021-07-27  2:00             ` Boqun Feng
@ 2021-07-27 10:50             ` Peter Zijlstra
  2 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2021-07-27 10:50 UTC (permalink / raw)
  To: Guo Ren
  Cc: Boqun Feng, Huacai Chen, Geert Uytterhoeven, Huacai Chen,
	Ingo Molnar, Will Deacon, Arnd Bergmann, Waiman Long, Linux-Arch,
	Rui Wang, Xuefeng Li, Jiaxun Yang

On Tue, Jul 27, 2021 at 09:07:44AM +0800, Guo Ren wrote:
> On Tue, Jul 27, 2021 at 1:03 AM Boqun Feng <boqun.feng@gmail.com> wrote:

> > I'm missing you point here, a) ll/sc can provide forward progress and b)
> > ll/sc instructions are used to implement xchg/cmpxchg (see ARM64 and
> > PPC).
> I don't think arm64 could provide fwd guarantee with ll/sc, otherwise,
> they wouldn't add ARM64_HAS_LSE_ATOMICS for large systems.

You can do LL/SC with fwd progress, it's just that AMOs can be done
faster.

> That's the problem of "_Q_PENDING_BITS == 1", no hardware could
> provide "load + ALU + cas" fwd guarantee!
> 
> A simple example, atomic a++:
> c = READ_ONCE(g_value);
> new = c + 1;
> while ((old = cmpxchg(&g_value, c, new)) != c) {
>     c = old;
>     new = c + 1;
> }
> 
> Q: When it runs on CPU0(500Mhz) & CPU1(2Ghz) in one SMP, how do we
> prevent CPU1 from starving CPU0?

By not handing the cacheline to CPU1 for a while, similar to LL/SC.

The traditional way of making this work is for LL to hold onto the
exclusive state for a while and the same for a failed CAS. Simply refuse
to yield the line for a while.

OoO CPUs can get all fancy and detect the loop, but simply holding onto
the line for some N instructions mostly works.

The obvious problem is that the LL/SC fwd progress doesn't extend to
cmpxchg() implemented using LL/SC. Typically the body of the cmpxchg()
loop does things that break the exclusive hold.

For things like lock implementations, the best way is to make sure the
primitives are in native form, in this case using xchg16 implemented
with LL/SC (note that implementing xchg16() using cmpxchg() in terms of
LL/SC is terrible and throws everything out the window again).

And if the native form doesn't provide fwd progress, your best option is
to switch to a better architecture :-)


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-27  1:52             ` Wang Rui
@ 2021-07-27 11:03               ` Peter Zijlstra
  0 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2021-07-27 11:03 UTC (permalink / raw)
  To: Wang Rui
  Cc: Guo Ren, Boqun Feng, Huacai Chen, Geert Uytterhoeven,
	Huacai Chen, Ingo Molnar, Will Deacon, Arnd Bergmann,
	Waiman Long, Linux-Arch, Xuefeng Li, Jiaxun Yang

On Tue, Jul 27, 2021 at 09:52:26AM +0800, Wang Rui wrote:
> I think the forward progress are guaranteed while all operations are
> atomic(ll/sc or amo). If ll/sc runs on a fast cpu, there will be
> random delays, is that okay? Else, for such hardware, we can't even
> implement generic spinlock with ll/sc.
> 
> And I also think that the hardware supports normal store for
> unlocking. (e.g. arch_spin_unlock)
> 
> In qspinlock, when _Q_PENDING_BITS == 1, it's available for all
> hardware, because the clear_pending/clear_pending_set_locked are all
> atomic operations. Isn't it?
> 
> Q: Why live lock happens while _Q_PENDING_BITS == 8?
> A: I found a case is:
> 
> * CPU A updates sub-word of qpsinlock at high frequency with normal store.
> * CPU B do xchg_tail with load + cmpxchg, and the value of load is always not equal to the value of ll(cmpxchg).
> 
> qspinlock:
>   0: locked
>   1: pending
>   2: tail
> 
> CPU A                    CPU B
> 1:                       1: &lt;--------------------+
>   sh $newval, &amp;locked      lw  $v1, &amp;qspinlock   |
>   add $newval, 1           and $t1, $v1, ~mask   |
>   b 1b                     or  $t1, $t1, newval  | (live lock path)
>                            ll  $v2, &amp;qspinlock   |
>                            bne $v1, $v2, 1b -----+
>                            sc  $t1, &amp;qspinlock
>                            beq $t1, 0, 1b
> 
> If xchg_tail like this, at least there is no live lock on Loongson
> 
> xchg_tail:
> 
> 1:
>   ll  $v1, &amp;qspinlock
>   and $t1, $v1, ~mask
>   or  $t1, $t1, newval
>   sc  $t1, &amp;qspinlock
>   beq $t1, 0, 1b
> 
> For hardware that ll/sc is based on cache coherency, I think sc is
> easy to succeed. The ll makes cache-line is exclusive by CPU B, and
> the store of CPU A needs to acquire exclusive again, the sc may be
> completed before this.

This! I've been saying this for ages. All those xchg16() implementations
are broken for using cmpxchg() on LL/SC. Not because xchg16() is
fundamentally flawed.

Perhaps we should introduce:

	atomic_nand_or() and atomic_fetch_nand_or()

and implement short xchg() using those, then we can have the whole masks
setup shared. It just means you get to implement those primitives for
*all* archs :-)

Also, the _Q_PENDING_BITS==1 case can use that primitive.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-27  2:29               ` Boqun Feng
  2021-07-27  2:46                 ` Waiman Long
@ 2021-07-27 11:05                 ` Peter Zijlstra
  2021-07-28 10:40                   ` Huacai Chen
  1 sibling, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2021-07-27 11:05 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Guo Ren, Waiman Long, Huacai Chen, Geert Uytterhoeven,
	Huacai Chen, Ingo Molnar, Will Deacon, Arnd Bergmann, Linux-Arch,
	Rui Wang, Xuefeng Li, Jiaxun Yang

On Tue, Jul 27, 2021 at 10:29:59AM +0800, Boqun Feng wrote:

> > "How to implement xchg_tail" shouldn't force with _Q_PENDING_BITS, but
> > the arch could choose.
> 
> I actually agree with this part, but this patchset failed to provide
> enough evidences on why we should choose xchg_tail() implementation
> based on whether hardware has xchg16, more precisely, for an archtecture
> which doesn't have a hardware xchg16, why cmpxchg emulated xchg16() is
> worse than a "load+cmpxchg) implemeneted xchg_tail()? If it's a
> performance reason, please show some numbers.

Right. Their problem is their broken xchg16() implementation.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-27 11:05                 ` Peter Zijlstra
@ 2021-07-28 10:40                   ` Huacai Chen
  2021-07-28 11:01                     ` Peter Zijlstra
  0 siblings, 1 reply; 26+ messages in thread
From: Huacai Chen @ 2021-07-28 10:40 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Boqun Feng, Guo Ren, Waiman Long, Geert Uytterhoeven,
	Huacai Chen, Ingo Molnar, Will Deacon, Arnd Bergmann, Linux-Arch,
	Rui Wang, Xuefeng Li, Jiaxun Yang

Hi, Peter,

On Tue, Jul 27, 2021 at 7:06 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Jul 27, 2021 at 10:29:59AM +0800, Boqun Feng wrote:
>
> > > "How to implement xchg_tail" shouldn't force with _Q_PENDING_BITS, but
> > > the arch could choose.
> >
> > I actually agree with this part, but this patchset failed to provide
> > enough evidences on why we should choose xchg_tail() implementation
> > based on whether hardware has xchg16, more precisely, for an archtecture
> > which doesn't have a hardware xchg16, why cmpxchg emulated xchg16() is
> > worse than a "load+cmpxchg) implemeneted xchg_tail()? If it's a
> > performance reason, please show some numbers.
>
> Right. Their problem is their broken xchg16() implementation.

Please correct me if I'm wrong. Now my understanding is like this:
1, _Q_PENDING_BITS=1 qspinlock can be used by all archs, though it may
be not optimized.
2, _Q_PENDING_BITS=8 qspinlock can be used if hardware supports
sub-word xchg/cmpxchg, or the software emulation is correctly
implemented. But the current MIPS emulation is not correct.

If so, I want to rename ARCH_HAS_HW_XCHG_SMALL to
ARCH_HAS_FAST_XCHG_SMALL, and let these archs select it:
1, X86, ARM, ARM64, IA64, M68K, because they have hardware support.
2, Other archs who select qspinlock currently (including MIPS),
because their current behavior is use _Q_PENDING_BITS=8 qspinlock and
we don't want to change anything in this patch. If their emulation is
broken or not as "fast" as expected, we can make new patches to
unselect the ARCH_HAS_FAST_XCHG_SMALL option.

Huacai

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-28 10:40                   ` Huacai Chen
@ 2021-07-28 11:01                     ` Peter Zijlstra
  2021-07-29 16:38                       ` Huacai Chen
  0 siblings, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2021-07-28 11:01 UTC (permalink / raw)
  To: Huacai Chen
  Cc: Boqun Feng, Guo Ren, Waiman Long, Geert Uytterhoeven,
	Huacai Chen, Ingo Molnar, Will Deacon, Arnd Bergmann, Linux-Arch,
	Rui Wang, Xuefeng Li, Jiaxun Yang

On Wed, Jul 28, 2021 at 06:40:54PM +0800, Huacai Chen wrote:
> Hi, Peter,
> 
> On Tue, Jul 27, 2021 at 7:06 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Tue, Jul 27, 2021 at 10:29:59AM +0800, Boqun Feng wrote:
> >
> > > > "How to implement xchg_tail" shouldn't force with _Q_PENDING_BITS, but
> > > > the arch could choose.
> > >
> > > I actually agree with this part, but this patchset failed to provide
> > > enough evidences on why we should choose xchg_tail() implementation
> > > based on whether hardware has xchg16, more precisely, for an archtecture
> > > which doesn't have a hardware xchg16, why cmpxchg emulated xchg16() is
> > > worse than a "load+cmpxchg) implemeneted xchg_tail()? If it's a
> > > performance reason, please show some numbers.
> >
> > Right. Their problem is their broken xchg16() implementation.
> 
> Please correct me if I'm wrong. Now my understanding is like this:
> 1, _Q_PENDING_BITS=1 qspinlock can be used by all archs, though it may
> be not optimized.

Only if your arch has fwd progress guarantees for cmpxchg(). LL/SC based
cmpxchg() is tricky, and typically needs software based backoff on
failure.

The qspinlock code is written in generic code, but it very much relies
on an architecture to audit and vet the resulting code is sane for them.
Clearly MIPS didn't do a good job of that.

> 2, _Q_PENDING_BITS=8 qspinlock can be used if hardware supports
> sub-word xchg/cmpxchg, or the software emulation is correctly
> implemented. But the current MIPS emulation is not correct.

Everything always relies on things being correctly implemented. 1)
relies on cmpxchg() being correctly implemented. 2) relies on xchg16()
being correctly implemented.

Of these 2 is actually easier to implement correctly on LL/SC.

> If so, I want to rename ARCH_HAS_HW_XCHG_SMALL to
> ARCH_HAS_FAST_XCHG_SMALL, and let these archs select it:
> 1, X86, ARM, ARM64, IA64, M68K, because they have hardware support.
> 2, Other archs who select qspinlock currently (including MIPS),
> because their current behavior is use _Q_PENDING_BITS=8 qspinlock and
> we don't want to change anything in this patch. If their emulation is
> broken or not as "fast" as expected, we can make new patches to
> unselect the ARCH_HAS_FAST_XCHG_SMALL option.

I utterly fail to see the point of any of this. If you use qspinlock,
you had better have audited the whole thing for your architecture. And
FAST_XCHG_SMALL is completely irrelevant for anything here.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL
  2021-07-28 11:01                     ` Peter Zijlstra
@ 2021-07-29 16:38                       ` Huacai Chen
  0 siblings, 0 replies; 26+ messages in thread
From: Huacai Chen @ 2021-07-29 16:38 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Boqun Feng, Guo Ren, Waiman Long, Geert Uytterhoeven,
	Huacai Chen, Ingo Molnar, Will Deacon, Arnd Bergmann, Linux-Arch,
	Rui Wang, Xuefeng Li, Jiaxun Yang

Hi Peter,

On Wed, Jul 28, 2021 at 7:02 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Jul 28, 2021 at 06:40:54PM +0800, Huacai Chen wrote:
> > Hi, Peter,
> >
> > On Tue, Jul 27, 2021 at 7:06 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Tue, Jul 27, 2021 at 10:29:59AM +0800, Boqun Feng wrote:
> > >
> > > > > "How to implement xchg_tail" shouldn't force with _Q_PENDING_BITS, but
> > > > > the arch could choose.
> > > >
> > > > I actually agree with this part, but this patchset failed to provide
> > > > enough evidences on why we should choose xchg_tail() implementation
> > > > based on whether hardware has xchg16, more precisely, for an archtecture
> > > > which doesn't have a hardware xchg16, why cmpxchg emulated xchg16() is
> > > > worse than a "load+cmpxchg) implemeneted xchg_tail()? If it's a
> > > > performance reason, please show some numbers.
> > >
> > > Right. Their problem is their broken xchg16() implementation.
> >
> > Please correct me if I'm wrong. Now my understanding is like this:
> > 1, _Q_PENDING_BITS=1 qspinlock can be used by all archs, though it may
> > be not optimized.
>
> Only if your arch has fwd progress guarantees for cmpxchg(). LL/SC based
> cmpxchg() is tricky, and typically needs software based backoff on
> failure.
>
> The qspinlock code is written in generic code, but it very much relies
> on an architecture to audit and vet the resulting code is sane for them.
> Clearly MIPS didn't do a good job of that.
>
> > 2, _Q_PENDING_BITS=8 qspinlock can be used if hardware supports
> > sub-word xchg/cmpxchg, or the software emulation is correctly
> > implemented. But the current MIPS emulation is not correct.
>
> Everything always relies on things being correctly implemented. 1)
> relies on cmpxchg() being correctly implemented. 2) relies on xchg16()
> being correctly implemented.
>
> Of these 2 is actually easier to implement correctly on LL/SC.
>
> > If so, I want to rename ARCH_HAS_HW_XCHG_SMALL to
> > ARCH_HAS_FAST_XCHG_SMALL, and let these archs select it:
> > 1, X86, ARM, ARM64, IA64, M68K, because they have hardware support.
> > 2, Other archs who select qspinlock currently (including MIPS),
> > because their current behavior is use _Q_PENDING_BITS=8 qspinlock and
> > we don't want to change anything in this patch. If their emulation is
> > broken or not as "fast" as expected, we can make new patches to
> > unselect the ARCH_HAS_FAST_XCHG_SMALL option.
>
> I utterly fail to see the point of any of this. If you use qspinlock,
> you had better have audited the whole thing for your architecture. And
> FAST_XCHG_SMALL is completely irrelevant for anything here.
>
Our original goal of this patch is let LoongArch use qspinlock, but
Arnd let us remove xchg_small, so we want to let LoongArch use
_Q_PENDING_BITS=1 lock . :)

Now Rui Wang has a better solution, and xchg_small will be
reimplemented for LoongArch, so I think this patch is no longer
needed.

Huacai

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2021-07-29 16:38 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-24 12:36 [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL Huacai Chen
2021-07-24 12:36 ` [PATCH RFC 2/2] qspinlock: Use ARCH_HAS_HW_XCHG_SMALL to select _Q_PENDING_BITS definition Huacai Chen
2021-07-24 19:24 ` [PATCH RFC 1/2] arch: Introduce ARCH_HAS_HW_XCHG_SMALL Arnd Bergmann
2021-07-25  3:06   ` Jiaxun Yang
2021-07-25 10:08     ` Arnd Bergmann
2021-07-26  8:36 ` Geert Uytterhoeven
2021-07-26  8:56   ` Huacai Chen
2021-07-26 10:39     ` Boqun Feng
2021-07-26 16:41       ` Guo Ren
2021-07-26 17:03         ` Boqun Feng
2021-07-26 21:20           ` Waiman Long
2021-07-27  1:27             ` Guo Ren
2021-07-27  2:29               ` Boqun Feng
2021-07-27  2:46                 ` Waiman Long
2021-07-27 11:05                 ` Peter Zijlstra
2021-07-28 10:40                   ` Huacai Chen
2021-07-28 11:01                     ` Peter Zijlstra
2021-07-29 16:38                       ` Huacai Chen
2021-07-27 10:17             ` Peter Zijlstra
2021-07-27  1:07           ` Guo Ren
2021-07-27  1:52             ` Wang Rui
2021-07-27 11:03               ` Peter Zijlstra
2021-07-27  2:00             ` Boqun Feng
2021-07-27 10:50             ` Peter Zijlstra
2021-07-27 10:12           ` Peter Zijlstra
2021-07-26  9:00   ` Arnd Bergmann

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.