* [PATCH 1/2] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg
@ 2023-08-30 15:13 Uros Bizjak
2023-08-30 15:13 ` [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set Uros Bizjak
2023-09-15 11:25 ` [tip: x86/asm] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg() tip-bot2 for Uros Bizjak
0 siblings, 2 replies; 8+ messages in thread
From: Uros Bizjak @ 2023-08-30 15:13 UTC (permalink / raw)
To: x86, linux-kernel
Cc: Uros Bizjak, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, H. Peter Anvin
Define target-specific raw_cpu_try_cmpxchg_N and
this_cpu_try_cmpxchg_N macros. These definitions override
the generic fallback definitions and enable target-specific
optimized implementations.
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
---
arch/x86/include/asm/percpu.h | 27 +++++++++++++++++++++++++++
1 file changed, 27 insertions(+)
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 34734d730463..c8309f260d98 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -210,6 +210,25 @@ do { \
(typeof(_var))(unsigned long) pco_old__; \
})
+#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval) \
+({ \
+ bool success; \
+ __pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \
+ __pcpu_type_##size pco_old__ = *pco_oval__; \
+ __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \
+ asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \
+ __percpu_arg([var])) \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [oval] "+a" (pco_old__), \
+ [var] "+m" (_var) \
+ : [nval] __pcpu_reg_##size(, pco_new__) \
+ : "memory"); \
+ if (unlikely(!success)) \
+ *pco_oval__ = pco_old__; \
+ likely(success); \
+})
+
#if defined(CONFIG_X86_32) && !defined(CONFIG_UML)
#define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval) \
({ \
@@ -343,6 +362,9 @@ do { \
#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, , pcp, oval, nval)
#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, , pcp, oval, nval)
#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, , pcp, ovalp, nval)
#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(1, volatile, pcp, val)
#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(2, volatile, pcp, val)
@@ -350,6 +372,9 @@ do { \
#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, volatile, pcp, oval, nval)
#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, volatile, pcp, oval, nval)
#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval)
/*
* Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -364,6 +389,7 @@ do { \
#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(8, , pcp, val)
#define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval)
#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)
#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp)
#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val)
@@ -373,6 +399,7 @@ do { \
#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val)
#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval)
#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
#endif
static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
--
2.41.0
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
2023-08-30 15:13 [PATCH 1/2] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg Uros Bizjak
@ 2023-08-30 15:13 ` Uros Bizjak
2023-09-15 9:47 ` Ingo Molnar
2023-09-15 11:25 ` [tip: x86/asm] x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set() tip-bot2 for Uros Bizjak
2023-09-15 11:25 ` [tip: x86/asm] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg() tip-bot2 for Uros Bizjak
1 sibling, 2 replies; 8+ messages in thread
From: Uros Bizjak @ 2023-08-30 15:13 UTC (permalink / raw)
To: x86, linux-kernel
Cc: Uros Bizjak, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, H. Peter Anvin
Use raw_cpu_try_cmpxchg() instead of raw_cpu_cmpxchg(*ptr, old, new) == old.
x86 CMPXCHG instruction returns success in ZF flag, so this change saves a
compare after cmpxchg (and related move instruction in front of cmpxchg).
Also, raw_cpu_try_cmpxchg implicitly assigns old *ptr value to "old" when
cmpxchg fails. There is no need to re-read the value in the loop.
No functional change intended.
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
---
arch/x86/include/asm/preempt.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 2d13f25b1bd8..4527e1430c6d 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
{
int old, new;
+ old = raw_cpu_read_4(pcpu_hot.preempt_count);
do {
- old = raw_cpu_read_4(pcpu_hot.preempt_count);
new = (old & PREEMPT_NEED_RESCHED) |
(pc & ~PREEMPT_NEED_RESCHED);
- } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
+ } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
}
/*
--
2.41.0
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
2023-08-30 15:13 ` [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set Uros Bizjak
@ 2023-09-15 9:47 ` Ingo Molnar
2023-09-15 11:15 ` Ingo Molnar
2023-09-15 12:01 ` Uros Bizjak
2023-09-15 11:25 ` [tip: x86/asm] x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set() tip-bot2 for Uros Bizjak
1 sibling, 2 replies; 8+ messages in thread
From: Ingo Molnar @ 2023-09-15 9:47 UTC (permalink / raw)
To: Uros Bizjak
Cc: x86, linux-kernel, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, H. Peter Anvin
* Uros Bizjak <ubizjak@gmail.com> wrote:
> Use raw_cpu_try_cmpxchg instead of raw_cpu_cmpxchg (*ptr, old, new) == old.
> x86 CMPXCHG instruction returns success in ZF flag, so this change saves a
> compare after cmpxchg (and related move instruction in front of cmpxchg).
>
> Also, raw_cpu_try_cmpxchg implicitly assigns old *ptr value to "old" when
> cmpxchg fails. There is no need to re-read the value in the loop.
>
> No functional change intended.
>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> ---
> arch/x86/include/asm/preempt.h | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
> index 2d13f25b1bd8..4527e1430c6d 100644
> --- a/arch/x86/include/asm/preempt.h
> +++ b/arch/x86/include/asm/preempt.h
> @@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
> {
> int old, new;
>
> + old = raw_cpu_read_4(pcpu_hot.preempt_count);
> do {
> - old = raw_cpu_read_4(pcpu_hot.preempt_count);
> new = (old & PREEMPT_NEED_RESCHED) |
> (pc & ~PREEMPT_NEED_RESCHED);
> - } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
> + } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
It would be really nice to have a before/after comparison of generated
assembly code in the changelog, to demonstrate the effectiveness of this
optimization.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
2023-09-15 9:47 ` Ingo Molnar
@ 2023-09-15 11:15 ` Ingo Molnar
2023-09-15 11:22 ` Ingo Molnar
2023-09-15 12:01 ` Uros Bizjak
1 sibling, 1 reply; 8+ messages in thread
From: Ingo Molnar @ 2023-09-15 11:15 UTC (permalink / raw)
To: Uros Bizjak
Cc: x86, linux-kernel, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, H. Peter Anvin
* Ingo Molnar <mingo@kernel.org> wrote:
>
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > Use raw_cpu_try_cmpxchg instead of raw_cpu_cmpxchg (*ptr, old, new) == old.
> > x86 CMPXCHG instruction returns success in ZF flag, so this change saves a
> > compare after cmpxchg (and related move instruction in front of cmpxchg).
> >
> > Also, raw_cpu_try_cmpxchg implicitly assigns old *ptr value to "old" when
> > cmpxchg fails. There is no need to re-read the value in the loop.
> >
> > No functional change intended.
> >
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Ingo Molnar <mingo@redhat.com>
> > Cc: Borislav Petkov <bp@alien8.de>
> > Cc: Dave Hansen <dave.hansen@linux.intel.com>
> > Cc: "H. Peter Anvin" <hpa@zytor.com>
> > Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> > ---
> > arch/x86/include/asm/preempt.h | 4 ++--
> > 1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
> > index 2d13f25b1bd8..4527e1430c6d 100644
> > --- a/arch/x86/include/asm/preempt.h
> > +++ b/arch/x86/include/asm/preempt.h
> > @@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
> > {
> > int old, new;
> >
> > + old = raw_cpu_read_4(pcpu_hot.preempt_count);
> > do {
> > - old = raw_cpu_read_4(pcpu_hot.preempt_count);
> > new = (old & PREEMPT_NEED_RESCHED) |
> > (pc & ~PREEMPT_NEED_RESCHED);
> > - } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
> > + } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
>
> It would be really nice to have a before/after comparison of generated
> assembly code in the changelog, to demonstrate the effectiveness of this
> optimization.
Never mind, you did exactly that in the September 6 variation of these
changes. I'll apply those.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
2023-09-15 11:15 ` Ingo Molnar
@ 2023-09-15 11:22 ` Ingo Molnar
0 siblings, 0 replies; 8+ messages in thread
From: Ingo Molnar @ 2023-09-15 11:22 UTC (permalink / raw)
To: Uros Bizjak
Cc: x86, linux-kernel, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, H. Peter Anvin
* Ingo Molnar <mingo@kernel.org> wrote:
>
> * Ingo Molnar <mingo@kernel.org> wrote:
>
> >
> > * Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > > Use raw_cpu_try_cmpxchg instead of raw_cpu_cmpxchg (*ptr, old, new) == old.
> > > x86 CMPXCHG instruction returns success in ZF flag, so this change saves a
> > > compare after cmpxchg (and related move instruction in front of cmpxchg).
> > >
> > > Also, raw_cpu_try_cmpxchg implicitly assigns old *ptr value to "old" when
> > > cmpxchg fails. There is no need to re-read the value in the loop.
> > >
> > > No functional change intended.
> > >
> > > Cc: Peter Zijlstra <peterz@infradead.org>
> > > Cc: Thomas Gleixner <tglx@linutronix.de>
> > > Cc: Ingo Molnar <mingo@redhat.com>
> > > Cc: Borislav Petkov <bp@alien8.de>
> > > Cc: Dave Hansen <dave.hansen@linux.intel.com>
> > > Cc: "H. Peter Anvin" <hpa@zytor.com>
> > > Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> > > ---
> > > arch/x86/include/asm/preempt.h | 4 ++--
> > > 1 file changed, 2 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
> > > index 2d13f25b1bd8..4527e1430c6d 100644
> > > --- a/arch/x86/include/asm/preempt.h
> > > +++ b/arch/x86/include/asm/preempt.h
> > > @@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
> > > {
> > > int old, new;
> > >
> > > + old = raw_cpu_read_4(pcpu_hot.preempt_count);
> > > do {
> > > - old = raw_cpu_read_4(pcpu_hot.preempt_count);
> > > new = (old & PREEMPT_NEED_RESCHED) |
> > > (pc & ~PREEMPT_NEED_RESCHED);
> > > - } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
> > > + } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
> >
> > It would be really nice to have a before/after comparison of generated
> > assembly code in the changelog, to demonstrate the effectiveness of this
> > optimization.
>
> Never mind, you did exactly that in the September 6 variation of these
> changes. I'll apply those.
I mean, this third patch of yours:
[PATCH] x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}
Had a proper disassembly comparison - so I've applied all 3 optimization
patches to tip:x86/asm as:
b8e3dfa16ec5 ("x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set()")
5f863897d964 ("x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg()")
54cd971c6f44 ("x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}")
Thanks,
Ingo
^ permalink raw reply [flat|nested] 8+ messages in thread
* [tip: x86/asm] x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set()
2023-08-30 15:13 ` [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set Uros Bizjak
2023-09-15 9:47 ` Ingo Molnar
@ 2023-09-15 11:25 ` tip-bot2 for Uros Bizjak
1 sibling, 0 replies; 8+ messages in thread
From: tip-bot2 for Uros Bizjak @ 2023-09-15 11:25 UTC (permalink / raw)
To: linux-tip-commits
Cc: Uros Bizjak, Ingo Molnar, Peter Zijlstra, x86, linux-kernel
The following commit has been merged into the x86/asm branch of tip:
Commit-ID: b8e3dfa16ec55f310dd95831614af3d24abf5ed5
Gitweb: https://git.kernel.org/tip/b8e3dfa16ec55f310dd95831614af3d24abf5ed5
Author: Uros Bizjak <ubizjak@gmail.com>
AuthorDate: Wed, 30 Aug 2023 17:13:57 +02:00
Committer: Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 15 Sep 2023 13:19:22 +02:00
x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set()
Use raw_cpu_try_cmpxchg() instead of raw_cpu_cmpxchg(*ptr, old, new) == old.
x86 CMPXCHG instruction returns success in ZF flag, so this change saves a
compare after CMPXCHG (and related MOV instruction in front of CMPXCHG).
Also, raw_cpu_try_cmpxchg() implicitly assigns old *ptr value to "old" when
cmpxchg fails. There is no need to re-read the value in the loop.
No functional change intended.
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230830151623.3900-2-ubizjak@gmail.com
---
arch/x86/include/asm/preempt.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 2d13f25..4527e14 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
{
int old, new;
+ old = raw_cpu_read_4(pcpu_hot.preempt_count);
do {
- old = raw_cpu_read_4(pcpu_hot.preempt_count);
new = (old & PREEMPT_NEED_RESCHED) |
(pc & ~PREEMPT_NEED_RESCHED);
- } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
+ } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
}
/*
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [tip: x86/asm] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg()
2023-08-30 15:13 [PATCH 1/2] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg Uros Bizjak
2023-08-30 15:13 ` [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set Uros Bizjak
@ 2023-09-15 11:25 ` tip-bot2 for Uros Bizjak
1 sibling, 0 replies; 8+ messages in thread
From: tip-bot2 for Uros Bizjak @ 2023-09-15 11:25 UTC (permalink / raw)
To: linux-tip-commits; +Cc: Uros Bizjak, Ingo Molnar, x86, linux-kernel
The following commit has been merged into the x86/asm branch of tip:
Commit-ID: 5f863897d964e834a0da35b1e483b5bb8faca522
Gitweb: https://git.kernel.org/tip/5f863897d964e834a0da35b1e483b5bb8faca522
Author: Uros Bizjak <ubizjak@gmail.com>
AuthorDate: Wed, 30 Aug 2023 17:13:56 +02:00
Committer: Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 15 Sep 2023 13:18:23 +02:00
x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg()
Define target-specific raw_cpu_try_cmpxchg_N() and
this_cpu_try_cmpxchg_N() macros. These definitions override
the generic fallback definitions and enable target-specific
optimized implementations.
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230830151623.3900-1-ubizjak@gmail.com
---
arch/x86/include/asm/percpu.h | 27 +++++++++++++++++++++++++++
1 file changed, 27 insertions(+)
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 4c36419..a87db61 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -210,6 +210,25 @@ do { \
(typeof(_var))(unsigned long) pco_old__; \
})
+#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval) \
+({ \
+ bool success; \
+ __pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \
+ __pcpu_type_##size pco_old__ = *pco_oval__; \
+ __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \
+ asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \
+ __percpu_arg([var])) \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [oval] "+a" (pco_old__), \
+ [var] "+m" (_var) \
+ : [nval] __pcpu_reg_##size(, pco_new__) \
+ : "memory"); \
+ if (unlikely(!success)) \
+ *pco_oval__ = pco_old__; \
+ likely(success); \
+})
+
#if defined(CONFIG_X86_32) && !defined(CONFIG_UML)
#define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval) \
({ \
@@ -410,6 +429,9 @@ do { \
#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, , pcp, oval, nval)
#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, , pcp, oval, nval)
#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, , pcp, ovalp, nval)
#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(1, volatile, pcp, val)
#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(2, volatile, pcp, val)
@@ -417,6 +439,9 @@ do { \
#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, volatile, pcp, oval, nval)
#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, volatile, pcp, oval, nval)
#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval)
/*
* Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -431,6 +456,7 @@ do { \
#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(8, , pcp, val)
#define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval)
#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)
#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp)
#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val)
@@ -440,6 +466,7 @@ do { \
#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val)
#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval)
#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
#endif
static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
2023-09-15 9:47 ` Ingo Molnar
2023-09-15 11:15 ` Ingo Molnar
@ 2023-09-15 12:01 ` Uros Bizjak
1 sibling, 0 replies; 8+ messages in thread
From: Uros Bizjak @ 2023-09-15 12:01 UTC (permalink / raw)
To: Ingo Molnar
Cc: x86, linux-kernel, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, H. Peter Anvin
On Fri, Sep 15, 2023 at 11:47 AM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > Use raw_cpu_try_cmpxchg instead of raw_cpu_cmpxchg (*ptr, old, new) == old.
> > x86 CMPXCHG instruction returns success in ZF flag, so this change saves a
> > compare after cmpxchg (and related move instruction in front of cmpxchg).
> >
> > Also, raw_cpu_try_cmpxchg implicitly assigns old *ptr value to "old" when
> > cmpxchg fails. There is no need to re-read the value in the loop.
> >
> > No functional change intended.
> >
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Ingo Molnar <mingo@redhat.com>
> > Cc: Borislav Petkov <bp@alien8.de>
> > Cc: Dave Hansen <dave.hansen@linux.intel.com>
> > Cc: "H. Peter Anvin" <hpa@zytor.com>
> > Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> > ---
> > arch/x86/include/asm/preempt.h | 4 ++--
> > 1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
> > index 2d13f25b1bd8..4527e1430c6d 100644
> > --- a/arch/x86/include/asm/preempt.h
> > +++ b/arch/x86/include/asm/preempt.h
> > @@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
> > {
> > int old, new;
> >
> > + old = raw_cpu_read_4(pcpu_hot.preempt_count);
> > do {
> > - old = raw_cpu_read_4(pcpu_hot.preempt_count);
> > new = (old & PREEMPT_NEED_RESCHED) |
> > (pc & ~PREEMPT_NEED_RESCHED);
> > - } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
> > + } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
>
> It would be really nice to have a before/after comparison of generated
> assembly code in the changelog, to demonstrate the effectiveness of this
> optimization.
The assembly code improvements are in line with other try_cmpxchg
conversions, but for reference, finish_task_switch() from
kernel/sched/core.c that inlines preempt_count_set() improves from:
5bad: 65 8b 0d 00 00 00 00 mov %gs:0x0(%rip),%ecx
5bb4: 89 ca mov %ecx,%edx
5bb6: 89 c8 mov %ecx,%eax
5bb8: 81 e2 00 00 00 80 and $0x80000000,%edx
5bbe: 83 ca 02 or $0x2,%edx
5bc1: 65 0f b1 15 00 00 00 cmpxchg %edx,%gs:0x0(%rip)
5bc8: 00
5bc9: 39 c1 cmp %eax,%ecx
5bcb: 75 e0 jne 5bad <...>
5bcd: e9 5a fe ff ff jmpq 5a2c <...>
5bd2:
to:
5bad: 65 8b 05 00 00 00 00 mov %gs:0x0(%rip),%eax
5bb4: 89 c2 mov %eax,%edx
5bb6: 81 e2 00 00 00 80 and $0x80000000,%edx
5bbc: 83 ca 02 or $0x2,%edx
5bbf: 65 0f b1 15 00 00 00 cmpxchg %edx,%gs:0x0(%rip)
5bc6: 00
5bc7: 0f 84 5f fe ff ff je 5a2c <...>
5bcd: eb e5 jmp 5bb4 <...>
5bcf:
Please note missing cmp (and mov), loop without extra memory load from
%gs:0x0(%rip) and better predicted jump in the later case. The
improvements with {raw,this}_cpu_try_cmpxchg_128 in the third patch
are even more noticeable, because __int128 value lives in a register
pair, so the comparison needs three separate machine instructions, in
addition to a move of the register pair.
Thanks,
Uros.
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2023-09-15 12:02 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-08-30 15:13 [PATCH 1/2] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg Uros Bizjak
2023-08-30 15:13 ` [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set Uros Bizjak
2023-09-15 9:47 ` Ingo Molnar
2023-09-15 11:15 ` Ingo Molnar
2023-09-15 11:22 ` Ingo Molnar
2023-09-15 12:01 ` Uros Bizjak
2023-09-15 11:25 ` [tip: x86/asm] x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set() tip-bot2 for Uros Bizjak
2023-09-15 11:25 ` [tip: x86/asm] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg() tip-bot2 for Uros Bizjak
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.