* [PATCH 1/2] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg
@ 2023-08-30 15:13 Uros Bizjak
  2023-08-30 15:13 ` [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set Uros Bizjak
  2023-09-15 11:25 ` [tip: x86/asm] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg() tip-bot2 for Uros Bizjak
  0 siblings, 2 replies; 8+ messages in thread
From: Uros Bizjak @ 2023-08-30 15:13 UTC (permalink / raw)
  To: x86, linux-kernel
  Cc: Uros Bizjak, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin

Define target-specific raw_cpu_try_cmpxchg_N and
this_cpu_try_cmpxchg_N macros. These definitions override
the generic fallback definitions and enable target-specific
optimized implementations.
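
For illustration only (this sketch is not part of the patch): the semantics the
new try_cmpxchg macros implement, written as plain C with the per-cpu
addressing, atomicity and inline asm ignored. On failure, the observed value is
handed back to the caller through the "old" pointer.

	/* Illustrative sketch of try_cmpxchg semantics on a plain int. */
	static inline bool try_cmpxchg_sketch(int *var, int *old, int new)
	{
		int cur = *var;		/* CMPXCHG compares *var with *old ...    */

		if (cur != *old) {	/* ... and reports the result in ZF       */
			*old = cur;	/* failure: publish the observed value    */
			return false;
		}
		*var = new;		/* success: install the new value         */
		return true;
	}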

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
---
 arch/x86/include/asm/percpu.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 34734d730463..c8309f260d98 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -210,6 +210,25 @@ do {									\
 	(typeof(_var))(unsigned long) pco_old__;			\
 })
 
+#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval)		\
+({									\
+	bool success;							\
+	__pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \
+	__pcpu_type_##size pco_old__ = *pco_oval__;			\
+	__pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval);	\
+	asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]",		\
+				    __percpu_arg([var]))		\
+		  CC_SET(z)						\
+		  : CC_OUT(z) (success),				\
+		    [oval] "+a" (pco_old__),				\
+		    [var] "+m" (_var)					\
+		  : [nval] __pcpu_reg_##size(, pco_new__)		\
+		  : "memory");						\
+	if (unlikely(!success))						\
+		*pco_oval__ = pco_old__;				\
+	likely(success);						\
+})
+
 #if defined(CONFIG_X86_32) && !defined(CONFIG_UML)
 #define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval)		\
 ({									\
@@ -343,6 +362,9 @@ do {									\
 #define raw_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(1, , pcp, oval, nval)
 #define raw_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(2, , pcp, oval, nval)
 #define raw_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(4, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval)	percpu_try_cmpxchg_op(1, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval)	percpu_try_cmpxchg_op(2, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval)	percpu_try_cmpxchg_op(4, , pcp, ovalp, nval)
 
 #define this_cpu_add_return_1(pcp, val)		percpu_add_return_op(1, volatile, pcp, val)
 #define this_cpu_add_return_2(pcp, val)		percpu_add_return_op(2, volatile, pcp, val)
@@ -350,6 +372,9 @@ do {									\
 #define this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(1, volatile, pcp, oval, nval)
 #define this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(2, volatile, pcp, oval, nval)
 #define this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(4, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval)	percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval)	percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval)	percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval)
 
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -364,6 +389,7 @@ do {									\
 #define raw_cpu_add_return_8(pcp, val)		percpu_add_return_op(8, , pcp, val)
 #define raw_cpu_xchg_8(pcp, nval)		raw_percpu_xchg_op(pcp, nval)
 #define raw_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)
 
 #define this_cpu_read_8(pcp)			percpu_from_op(8, volatile, "mov", pcp)
 #define this_cpu_write_8(pcp, val)		percpu_to_op(8, volatile, "mov", (pcp), val)
@@ -373,6 +399,7 @@ do {									\
 #define this_cpu_add_return_8(pcp, val)		percpu_add_return_op(8, volatile, pcp, val)
 #define this_cpu_xchg_8(pcp, nval)		percpu_xchg_op(8, volatile, pcp, nval)
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
 #endif
 
 static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
-- 
2.41.0



* [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
  2023-08-30 15:13 [PATCH 1/2] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg Uros Bizjak
@ 2023-08-30 15:13 ` Uros Bizjak
  2023-09-15  9:47   ` Ingo Molnar
  2023-09-15 11:25   ` [tip: x86/asm] x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set() tip-bot2 for Uros Bizjak
  2023-09-15 11:25 ` [tip: x86/asm] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg() tip-bot2 for Uros Bizjak
  1 sibling, 2 replies; 8+ messages in thread
From: Uros Bizjak @ 2023-08-30 15:13 UTC (permalink / raw)
  To: x86, linux-kernel
  Cc: Uros Bizjak, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin

Use raw_cpu_try_cmpxchg() instead of raw_cpu_cmpxchg(*ptr, old, new) == old.
The x86 CMPXCHG instruction returns success in the ZF flag, so this change
saves a compare after CMPXCHG (and the related MOV instruction in front of
CMPXCHG).

Also, raw_cpu_try_cmpxchg() implicitly assigns the old *ptr value to "old"
when cmpxchg fails, so there is no need to re-read the value in the loop.

No functional change intended.
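
For illustration, the general shape of this conversion, using a hypothetical
per-cpu variable "var" and a hypothetical update helper compute(); the
concrete instance is in the patch below:

	int old, new;

	/* before: extra compare against "old", plus a re-read on every retry */
	do {
		old = raw_cpu_read_4(var);
		new = compute(old);
	} while (raw_cpu_cmpxchg_4(var, old, new) != old);

	/* after: CMPXCHG's ZF result drives the loop; "old" is refreshed on failure */
	old = raw_cpu_read_4(var);
	do {
		new = compute(old);
	} while (!raw_cpu_try_cmpxchg_4(var, &old, new));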

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
---
 arch/x86/include/asm/preempt.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 2d13f25b1bd8..4527e1430c6d 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
 {
 	int old, new;
 
+	old = raw_cpu_read_4(pcpu_hot.preempt_count);
 	do {
-		old = raw_cpu_read_4(pcpu_hot.preempt_count);
 		new = (old & PREEMPT_NEED_RESCHED) |
 			(pc & ~PREEMPT_NEED_RESCHED);
-	} while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
+	} while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
 }
 
 /*
-- 
2.41.0



* Re: [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
  2023-08-30 15:13 ` [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set Uros Bizjak
@ 2023-09-15  9:47   ` Ingo Molnar
  2023-09-15 11:15     ` Ingo Molnar
  2023-09-15 12:01     ` Uros Bizjak
  2023-09-15 11:25   ` [tip: x86/asm] x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set() tip-bot2 for Uros Bizjak
  1 sibling, 2 replies; 8+ messages in thread
From: Ingo Molnar @ 2023-09-15  9:47 UTC (permalink / raw)
  To: Uros Bizjak
  Cc: x86, linux-kernel, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin


* Uros Bizjak <ubizjak@gmail.com> wrote:

> Use raw_cpu_try_cmpxchg instead of raw_cpu_cmpxchg (*ptr, old, new) == old.
> x86 CMPXCHG instruction returns success in ZF flag, so this change saves a
> compare after cmpxchg (and related move instruction in front of cmpxchg).
> 
> Also, raw_cpu_try_cmpxchg implicitly assigns old *ptr value to "old" when
> cmpxchg fails. There is no need to re-read the value in the loop.
> 
> No functional change intended.
> 
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> ---
>  arch/x86/include/asm/preempt.h | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
> index 2d13f25b1bd8..4527e1430c6d 100644
> --- a/arch/x86/include/asm/preempt.h
> +++ b/arch/x86/include/asm/preempt.h
> @@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
>  {
>  	int old, new;
>  
> +	old = raw_cpu_read_4(pcpu_hot.preempt_count);
>  	do {
> -		old = raw_cpu_read_4(pcpu_hot.preempt_count);
>  		new = (old & PREEMPT_NEED_RESCHED) |
>  			(pc & ~PREEMPT_NEED_RESCHED);
> -	} while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
> +	} while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));

It would be really nice to have a before/after comparison of generated 
assembly code in the changelog, to demonstrate the effectiveness of this 
optimization.

Thanks,

	Ingo


* Re: [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
  2023-09-15  9:47   ` Ingo Molnar
@ 2023-09-15 11:15     ` Ingo Molnar
  2023-09-15 11:22       ` Ingo Molnar
  2023-09-15 12:01     ` Uros Bizjak
  1 sibling, 1 reply; 8+ messages in thread
From: Ingo Molnar @ 2023-09-15 11:15 UTC (permalink / raw)
  To: Uros Bizjak
  Cc: x86, linux-kernel, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin


* Ingo Molnar <mingo@kernel.org> wrote:

> 
> * Uros Bizjak <ubizjak@gmail.com> wrote:
> 
> > Use raw_cpu_try_cmpxchg instead of raw_cpu_cmpxchg (*ptr, old, new) == old.
> > x86 CMPXCHG instruction returns success in ZF flag, so this change saves a
> > compare after cmpxchg (and related move instruction in front of cmpxchg).
> > 
> > Also, raw_cpu_try_cmpxchg implicitly assigns old *ptr value to "old" when
> > cmpxchg fails. There is no need to re-read the value in the loop.
> > 
> > No functional change intended.
> > 
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Ingo Molnar <mingo@redhat.com>
> > Cc: Borislav Petkov <bp@alien8.de>
> > Cc: Dave Hansen <dave.hansen@linux.intel.com>
> > Cc: "H. Peter Anvin" <hpa@zytor.com>
> > Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> > ---
> >  arch/x86/include/asm/preempt.h | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
> > index 2d13f25b1bd8..4527e1430c6d 100644
> > --- a/arch/x86/include/asm/preempt.h
> > +++ b/arch/x86/include/asm/preempt.h
> > @@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
> >  {
> >  	int old, new;
> >  
> > +	old = raw_cpu_read_4(pcpu_hot.preempt_count);
> >  	do {
> > -		old = raw_cpu_read_4(pcpu_hot.preempt_count);
> >  		new = (old & PREEMPT_NEED_RESCHED) |
> >  			(pc & ~PREEMPT_NEED_RESCHED);
> > -	} while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
> > +	} while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
> 
> It would be really nice to have a before/after comparison of generated 
> assembly code in the changelog, to demonstrate the effectiveness of this 
> optimization.

Never mind, you did exactly that in the September 6 variation of these 
changes. I'll apply those.

Thanks,

	Ingo


* Re: [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
  2023-09-15 11:15     ` Ingo Molnar
@ 2023-09-15 11:22       ` Ingo Molnar
  0 siblings, 0 replies; 8+ messages in thread
From: Ingo Molnar @ 2023-09-15 11:22 UTC (permalink / raw)
  To: Uros Bizjak
  Cc: x86, linux-kernel, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin


* Ingo Molnar <mingo@kernel.org> wrote:

> 
> * Ingo Molnar <mingo@kernel.org> wrote:
> 
> > 
> > * Uros Bizjak <ubizjak@gmail.com> wrote:
> > 
> > > Use raw_cpu_try_cmpxchg instead of raw_cpu_cmpxchg (*ptr, old, new) == old.
> > > x86 CMPXCHG instruction returns success in ZF flag, so this change saves a
> > > compare after cmpxchg (and related move instruction in front of cmpxchg).
> > > 
> > > Also, raw_cpu_try_cmpxchg implicitly assigns old *ptr value to "old" when
> > > cmpxchg fails. There is no need to re-read the value in the loop.
> > > 
> > > No functional change intended.
> > > 
> > > Cc: Peter Zijlstra <peterz@infradead.org>
> > > Cc: Thomas Gleixner <tglx@linutronix.de>
> > > Cc: Ingo Molnar <mingo@redhat.com>
> > > Cc: Borislav Petkov <bp@alien8.de>
> > > Cc: Dave Hansen <dave.hansen@linux.intel.com>
> > > Cc: "H. Peter Anvin" <hpa@zytor.com>
> > > Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> > > ---
> > >  arch/x86/include/asm/preempt.h | 4 ++--
> > >  1 file changed, 2 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
> > > index 2d13f25b1bd8..4527e1430c6d 100644
> > > --- a/arch/x86/include/asm/preempt.h
> > > +++ b/arch/x86/include/asm/preempt.h
> > > @@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
> > >  {
> > >  	int old, new;
> > >  
> > > +	old = raw_cpu_read_4(pcpu_hot.preempt_count);
> > >  	do {
> > > -		old = raw_cpu_read_4(pcpu_hot.preempt_count);
> > >  		new = (old & PREEMPT_NEED_RESCHED) |
> > >  			(pc & ~PREEMPT_NEED_RESCHED);
> > > -	} while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
> > > +	} while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
> > 
> > It would be really nice to have a before/after comparison of generated 
> > assembly code in the changelog, to demonstrate the effectiveness of this 
> > optimization.
> 
> Never mind, you did exactly that in the September 6 variation of these 
> changes. I'll apply those.

I mean, this third patch of yours:

   [PATCH] x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}

had a proper disassembly comparison, so I've applied all 3 optimization 
patches to tip:x86/asm as:

  b8e3dfa16ec5 ("x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set()")
  5f863897d964 ("x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg()")
  54cd971c6f44 ("x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}")

Thanks,

	Ingo


* [tip: x86/asm] x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set()
  2023-08-30 15:13 ` [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set Uros Bizjak
  2023-09-15  9:47   ` Ingo Molnar
@ 2023-09-15 11:25   ` tip-bot2 for Uros Bizjak
  1 sibling, 0 replies; 8+ messages in thread
From: tip-bot2 for Uros Bizjak @ 2023-09-15 11:25 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Uros Bizjak, Ingo Molnar, Peter Zijlstra, x86, linux-kernel

The following commit has been merged into the x86/asm branch of tip:

Commit-ID:     b8e3dfa16ec55f310dd95831614af3d24abf5ed5
Gitweb:        https://git.kernel.org/tip/b8e3dfa16ec55f310dd95831614af3d24abf5ed5
Author:        Uros Bizjak <ubizjak@gmail.com>
AuthorDate:    Wed, 30 Aug 2023 17:13:57 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 15 Sep 2023 13:19:22 +02:00

x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set()

Use raw_cpu_try_cmpxchg() instead of raw_cpu_cmpxchg(*ptr, old, new) == old.
x86 CMPXCHG instruction returns success in ZF flag, so this change saves a
compare after CMPXCHG (and related MOV instruction in front of CMPXCHG).

Also, raw_cpu_try_cmpxchg() implicitly assigns old *ptr value to "old" when
cmpxchg fails. There is no need to re-read the value in the loop.

No functional change intended.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230830151623.3900-2-ubizjak@gmail.com
---
 arch/x86/include/asm/preempt.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 2d13f25..4527e14 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
 {
 	int old, new;
 
+	old = raw_cpu_read_4(pcpu_hot.preempt_count);
 	do {
-		old = raw_cpu_read_4(pcpu_hot.preempt_count);
 		new = (old & PREEMPT_NEED_RESCHED) |
 			(pc & ~PREEMPT_NEED_RESCHED);
-	} while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
+	} while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
 }
 
 /*


* [tip: x86/asm] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg()
  2023-08-30 15:13 [PATCH 1/2] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg Uros Bizjak
  2023-08-30 15:13 ` [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set Uros Bizjak
@ 2023-09-15 11:25 ` tip-bot2 for Uros Bizjak
  1 sibling, 0 replies; 8+ messages in thread
From: tip-bot2 for Uros Bizjak @ 2023-09-15 11:25 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Uros Bizjak, Ingo Molnar, x86, linux-kernel

The following commit has been merged into the x86/asm branch of tip:

Commit-ID:     5f863897d964e834a0da35b1e483b5bb8faca522
Gitweb:        https://git.kernel.org/tip/5f863897d964e834a0da35b1e483b5bb8faca522
Author:        Uros Bizjak <ubizjak@gmail.com>
AuthorDate:    Wed, 30 Aug 2023 17:13:56 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 15 Sep 2023 13:18:23 +02:00

x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg()

Define target-specific raw_cpu_try_cmpxchg_N() and
this_cpu_try_cmpxchg_N() macros. These definitions override
the generic fallback definitions and enable target-specific
optimized implementations.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230830151623.3900-1-ubizjak@gmail.com
---
 arch/x86/include/asm/percpu.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 4c36419..a87db61 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -210,6 +210,25 @@ do {									\
 	(typeof(_var))(unsigned long) pco_old__;			\
 })
 
+#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval)		\
+({									\
+	bool success;							\
+	__pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \
+	__pcpu_type_##size pco_old__ = *pco_oval__;			\
+	__pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval);	\
+	asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]",		\
+				    __percpu_arg([var]))		\
+		  CC_SET(z)						\
+		  : CC_OUT(z) (success),				\
+		    [oval] "+a" (pco_old__),				\
+		    [var] "+m" (_var)					\
+		  : [nval] __pcpu_reg_##size(, pco_new__)		\
+		  : "memory");						\
+	if (unlikely(!success))						\
+		*pco_oval__ = pco_old__;				\
+	likely(success);						\
+})
+
 #if defined(CONFIG_X86_32) && !defined(CONFIG_UML)
 #define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval)		\
 ({									\
@@ -410,6 +429,9 @@ do {									\
 #define raw_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(1, , pcp, oval, nval)
 #define raw_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(2, , pcp, oval, nval)
 #define raw_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(4, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval)	percpu_try_cmpxchg_op(1, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval)	percpu_try_cmpxchg_op(2, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval)	percpu_try_cmpxchg_op(4, , pcp, ovalp, nval)
 
 #define this_cpu_add_return_1(pcp, val)		percpu_add_return_op(1, volatile, pcp, val)
 #define this_cpu_add_return_2(pcp, val)		percpu_add_return_op(2, volatile, pcp, val)
@@ -417,6 +439,9 @@ do {									\
 #define this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(1, volatile, pcp, oval, nval)
 #define this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(2, volatile, pcp, oval, nval)
 #define this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(4, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval)	percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval)	percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval)	percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval)
 
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -431,6 +456,7 @@ do {									\
 #define raw_cpu_add_return_8(pcp, val)		percpu_add_return_op(8, , pcp, val)
 #define raw_cpu_xchg_8(pcp, nval)		raw_percpu_xchg_op(pcp, nval)
 #define raw_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)
 
 #define this_cpu_read_8(pcp)			percpu_from_op(8, volatile, "mov", pcp)
 #define this_cpu_write_8(pcp, val)		percpu_to_op(8, volatile, "mov", (pcp), val)
@@ -440,6 +466,7 @@ do {									\
 #define this_cpu_add_return_8(pcp, val)		percpu_add_return_op(8, volatile, pcp, val)
 #define this_cpu_xchg_8(pcp, nval)		percpu_xchg_op(8, volatile, pcp, nval)
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
 #endif
 
 static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,


* Re: [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
  2023-09-15  9:47   ` Ingo Molnar
  2023-09-15 11:15     ` Ingo Molnar
@ 2023-09-15 12:01     ` Uros Bizjak
  1 sibling, 0 replies; 8+ messages in thread
From: Uros Bizjak @ 2023-09-15 12:01 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: x86, linux-kernel, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin

On Fri, Sep 15, 2023 at 11:47 AM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > Use raw_cpu_try_cmpxchg instead of raw_cpu_cmpxchg (*ptr, old, new) == old.
> > x86 CMPXCHG instruction returns success in ZF flag, so this change saves a
> > compare after cmpxchg (and related move instruction in front of cmpxchg).
> >
> > Also, raw_cpu_try_cmpxchg implicitly assigns old *ptr value to "old" when
> > cmpxchg fails. There is no need to re-read the value in the loop.
> >
> > No functional change intended.
> >
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Ingo Molnar <mingo@redhat.com>
> > Cc: Borislav Petkov <bp@alien8.de>
> > Cc: Dave Hansen <dave.hansen@linux.intel.com>
> > Cc: "H. Peter Anvin" <hpa@zytor.com>
> > Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> > ---
> >  arch/x86/include/asm/preempt.h | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
> > index 2d13f25b1bd8..4527e1430c6d 100644
> > --- a/arch/x86/include/asm/preempt.h
> > +++ b/arch/x86/include/asm/preempt.h
> > @@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
> >  {
> >       int old, new;
> >
> > +     old = raw_cpu_read_4(pcpu_hot.preempt_count);
> >       do {
> > -             old = raw_cpu_read_4(pcpu_hot.preempt_count);
> >               new = (old & PREEMPT_NEED_RESCHED) |
> >                       (pc & ~PREEMPT_NEED_RESCHED);
> > -     } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
> > +     } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
>
> It would be really nice to have a before/after comparison of generated
> assembly code in the changelog, to demonstrate the effectiveness of this
> optimization.

The assembly code improvements are in line with other try_cmpxchg
conversions, but for reference, finish_task_switch() from
kernel/sched/core.c, which inlines preempt_count_set(), improves from:

    5bad:    65 8b 0d 00 00 00 00     mov    %gs:0x0(%rip),%ecx
    5bb4:    89 ca                    mov    %ecx,%edx
    5bb6:    89 c8                    mov    %ecx,%eax
    5bb8:    81 e2 00 00 00 80        and    $0x80000000,%edx
    5bbe:    83 ca 02                 or     $0x2,%edx
    5bc1:    65 0f b1 15 00 00 00     cmpxchg %edx,%gs:0x0(%rip)
    5bc8:    00
    5bc9:    39 c1                    cmp    %eax,%ecx
    5bcb:    75 e0                    jne    5bad <...>
    5bcd:    e9 5a fe ff ff           jmpq   5a2c <...>
    5bd2:

to:

    5bad:    65 8b 05 00 00 00 00     mov    %gs:0x0(%rip),%eax
    5bb4:    89 c2                    mov    %eax,%edx
    5bb6:    81 e2 00 00 00 80        and    $0x80000000,%edx
    5bbc:    83 ca 02                 or     $0x2,%edx
    5bbf:    65 0f b1 15 00 00 00     cmpxchg %edx,%gs:0x0(%rip)
    5bc6:    00
    5bc7:    0f 84 5f fe ff ff        je     5a2c <...>
    5bcd:    eb e5                    jmp    5bb4 <...>
    5bcf:

Please note the missing CMP (and MOV), the loop without the extra memory load
from %gs:0x0(%rip), and the better-predicted jump in the latter case. The
improvements with {raw,this}_cpu_try_cmpxchg_128 in the third patch are even
more noticeable, because an __int128 value lives in a register pair, so the
comparison needs three separate machine instructions, in addition to a move
of the register pair.

Thanks,
Uros.


