* [PATCH] x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}
@ 2023-09-06 18:58 Uros Bizjak
  2023-09-15 11:25 ` [tip: x86/asm] " tip-bot2 for Uros Bizjak
  0 siblings, 1 reply; 6+ messages in thread
From: Uros Bizjak @ 2023-09-06 18:58 UTC (permalink / raw)
  To: x86, linux-kernel
  Cc: Uros Bizjak, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin

Define target-specific {raw,this}_cpu_try_cmpxchg64 and
{raw,this}_cpu_try_cmpxchg128 macros. These definitions override
the generic fallback definitions and enable target-specific
optimized implementations.
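
For illustration only (not part of this patch), a caller would typically
use the new macros in an update loop like the sketch below, where pcp_var
and compute_new() are made-up names:

    u128 old = 0, new;	/* seed with any value; a failed attempt refreshes 'old' */

    do {
            new = compute_new(old);
    } while (!this_cpu_try_cmpxchg128(pcp_var, &old, new));

Without these definitions, the generic fallback performs the cmpxchg and
then compares the returned value with the expected one (the xor/or/jne
sequence in the "before" code below); the target-specific macros consume
the ZF set by cmpxchg{8,16}b (or by the emulation call) directly.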

Several places in mm/slub.o improve from e.g.:

    53bc:	48 8d 4f 40          	lea    0x40(%rdi),%rcx
    53c0:	48 89 fa             	mov    %rdi,%rdx
    53c3:	49 8b 5c 05 00       	mov    0x0(%r13,%rax,1),%rbx
    53c8:	4c 89 e8             	mov    %r13,%rax
    53cb:	49 8d 30             	lea    (%r8),%rsi
    53ce:	e8 00 00 00 00       	call   53d3 <...>
			53cf: R_X86_64_PLT32	this_cpu_cmpxchg16b_emu-0x4
    53d3:	48 31 d7             	xor    %rdx,%rdi
    53d6:	4c 31 e8             	xor    %r13,%rax
    53d9:	48 09 c7             	or     %rax,%rdi
    53dc:	75 ae                	jne    538c <...>

to:

    53bc:	48 8d 4a 40          	lea    0x40(%rdx),%rcx
    53c0:	49 8b 1c 07          	mov    (%r15,%rax,1),%rbx
    53c4:	4c 89 f8             	mov    %r15,%rax
    53c7:	48 8d 37             	lea    (%rdi),%rsi
    53ca:	e8 00 00 00 00       	call   53cf <...>
			53cb: R_X86_64_PLT32	this_cpu_cmpxchg16b_emu-0x4
    53cf:	75 bb                	jne    538c <...>

reducing the size of mm/slub.o by 80 bytes:

   text    data     bss     dec     hex filename
  39758    5337    4208   49303    c097 slub-new.o
  39838    5337    4208   49383    c0e7 slub-old.o

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
---
 arch/x86/include/asm/percpu.h | 67 +++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 34734d730463..4c3641927f39 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -237,12 +237,47 @@ do {									\
 
 #define raw_cpu_cmpxchg64(pcp, oval, nval)	percpu_cmpxchg64_op(8,         , pcp, oval, nval)
 #define this_cpu_cmpxchg64(pcp, oval, nval)	percpu_cmpxchg64_op(8, volatile, pcp, oval, nval)
+
+#define percpu_try_cmpxchg64_op(size, qual, _var, _ovalp, _nval)	\
+({									\
+	bool success;							\
+	u64 *_oval = (u64 *)(_ovalp);					\
+	union {								\
+		u64 var;						\
+		struct {						\
+			u32 low, high;					\
+		};							\
+	} old__, new__;							\
+									\
+	old__.var = *_oval;						\
+	new__.var = _nval;						\
+									\
+	asm qual (ALTERNATIVE("leal %P[var], %%esi; call this_cpu_cmpxchg8b_emu", \
+			      "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
+		  CC_SET(z)						\
+		  : CC_OUT(z) (success),				\
+		    [var] "+m" (_var),					\
+		    "+a" (old__.low),					\
+		    "+d" (old__.high)					\
+		  : "b" (new__.low),					\
+		    "c" (new__.high)					\
+		  : "memory", "esi");					\
+	if (unlikely(!success))						\
+		*_oval = old__.var;					\
+	likely(success);						\
+})
+
+#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval)		percpu_try_cmpxchg64_op(8,         , pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg64(pcp, ovalp, nval)	percpu_try_cmpxchg64_op(8, volatile, pcp, ovalp, nval)
 #endif
 
 #ifdef CONFIG_X86_64
 #define raw_cpu_cmpxchg64(pcp, oval, nval)	percpu_cmpxchg_op(8,         , pcp, oval, nval);
 #define this_cpu_cmpxchg64(pcp, oval, nval)	percpu_cmpxchg_op(8, volatile, pcp, oval, nval);
 
+#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval)		percpu_try_cmpxchg_op(8,         , pcp, ovalp, nval);
+#define this_cpu_try_cmpxchg64(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval);
+
 #define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval)		\
 ({									\
 	union {								\
@@ -269,6 +304,38 @@ do {									\
 
 #define raw_cpu_cmpxchg128(pcp, oval, nval)	percpu_cmpxchg128_op(16,         , pcp, oval, nval)
 #define this_cpu_cmpxchg128(pcp, oval, nval)	percpu_cmpxchg128_op(16, volatile, pcp, oval, nval)
+
+#define percpu_try_cmpxchg128_op(size, qual, _var, _ovalp, _nval)	\
+({									\
+	bool success;							\
+	u128 *_oval = (u128 *)(_ovalp);					\
+	union {								\
+		u128 var;						\
+		struct {						\
+			u64 low, high;					\
+		};							\
+	} old__, new__;							\
+									\
+	old__.var = *_oval;						\
+	new__.var = _nval;						\
+									\
+	asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call this_cpu_cmpxchg16b_emu", \
+			      "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
+		  CC_SET(z)						\
+		  : CC_OUT(z) (success),				\
+		    [var] "+m" (_var),					\
+		    "+a" (old__.low),					\
+		    "+d" (old__.high)					\
+		  : "b" (new__.low),					\
+		    "c" (new__.high)					\
+		  : "memory", "rsi");					\
+	if (unlikely(!success))						\
+		*_oval = old__.var;					\
+	likely(success);						\
+})
+
+#define raw_cpu_try_cmpxchg128(pcp, ovalp, nval)	percpu_try_cmpxchg128_op(16,         , pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg128(pcp, ovalp, nval)	percpu_try_cmpxchg128_op(16, volatile, pcp, ovalp, nval)
 #endif
 
 /*
-- 
2.41.0



* [tip: x86/asm] x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}
  2023-09-06 18:58 [PATCH] x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128} Uros Bizjak
@ 2023-09-15 11:25 ` tip-bot2 for Uros Bizjak
  2023-09-15 16:45   ` Linus Torvalds
  0 siblings, 1 reply; 6+ messages in thread
From: tip-bot2 for Uros Bizjak @ 2023-09-15 11:25 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Uros Bizjak, Ingo Molnar, Linus Torvalds, Peter Zijlstra, x86,
	linux-kernel

The following commit has been merged into the x86/asm branch of tip:

Commit-ID:     54cd971c6f4461fb6b178579751788bf4f64dfca
Gitweb:        https://git.kernel.org/tip/54cd971c6f4461fb6b178579751788bf4f64dfca
Author:        Uros Bizjak <ubizjak@gmail.com>
AuthorDate:    Wed, 06 Sep 2023 20:58:44 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 15 Sep 2023 13:16:35 +02:00

x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}

Define target-specific {raw,this}_cpu_try_cmpxchg64() and
{raw,this}_cpu_try_cmpxchg128() macros. These definitions override
the generic fallback definitions and enable target-specific
optimized implementations.

Several places in mm/slub.o improve from e.g.:

    53bc:	48 8d 4f 40          	lea    0x40(%rdi),%rcx
    53c0:	48 89 fa             	mov    %rdi,%rdx
    53c3:	49 8b 5c 05 00       	mov    0x0(%r13,%rax,1),%rbx
    53c8:	4c 89 e8             	mov    %r13,%rax
    53cb:	49 8d 30             	lea    (%r8),%rsi
    53ce:	e8 00 00 00 00       	call   53d3 <...>
			53cf: R_X86_64_PLT32	this_cpu_cmpxchg16b_emu-0x4
    53d3:	48 31 d7             	xor    %rdx,%rdi
    53d6:	4c 31 e8             	xor    %r13,%rax
    53d9:	48 09 c7             	or     %rax,%rdi
    53dc:	75 ae                	jne    538c <...>

to:

    53bc:	48 8d 4a 40          	lea    0x40(%rdx),%rcx
    53c0:	49 8b 1c 07          	mov    (%r15,%rax,1),%rbx
    53c4:	4c 89 f8             	mov    %r15,%rax
    53c7:	48 8d 37             	lea    (%rdi),%rsi
    53ca:	e8 00 00 00 00       	call   53cf <...>
			53cb: R_X86_64_PLT32	this_cpu_cmpxchg16b_emu-0x4
    53cf:	75 bb                	jne    538c <...>

reducing the size of mm/slub.o by 80 bytes:

   text    data     bss     dec     hex filename
  39758    5337    4208   49303    c097 slub-new.o
  39838    5337    4208   49383    c0e7 slub-old.o

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230906185941.53527-1-ubizjak@gmail.com
---
 arch/x86/include/asm/percpu.h | 67 ++++++++++++++++++++++++++++++++++-
 1 file changed, 67 insertions(+)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 34734d7..4c36419 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -237,12 +237,47 @@ do {									\
 
 #define raw_cpu_cmpxchg64(pcp, oval, nval)	percpu_cmpxchg64_op(8,         , pcp, oval, nval)
 #define this_cpu_cmpxchg64(pcp, oval, nval)	percpu_cmpxchg64_op(8, volatile, pcp, oval, nval)
+
+#define percpu_try_cmpxchg64_op(size, qual, _var, _ovalp, _nval)	\
+({									\
+	bool success;							\
+	u64 *_oval = (u64 *)(_ovalp);					\
+	union {								\
+		u64 var;						\
+		struct {						\
+			u32 low, high;					\
+		};							\
+	} old__, new__;							\
+									\
+	old__.var = *_oval;						\
+	new__.var = _nval;						\
+									\
+	asm qual (ALTERNATIVE("leal %P[var], %%esi; call this_cpu_cmpxchg8b_emu", \
+			      "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
+		  CC_SET(z)						\
+		  : CC_OUT(z) (success),				\
+		    [var] "+m" (_var),					\
+		    "+a" (old__.low),					\
+		    "+d" (old__.high)					\
+		  : "b" (new__.low),					\
+		    "c" (new__.high)					\
+		  : "memory", "esi");					\
+	if (unlikely(!success))						\
+		*_oval = old__.var;					\
+	likely(success);						\
+})
+
+#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval)		percpu_try_cmpxchg64_op(8,         , pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg64(pcp, ovalp, nval)	percpu_try_cmpxchg64_op(8, volatile, pcp, ovalp, nval)
 #endif
 
 #ifdef CONFIG_X86_64
 #define raw_cpu_cmpxchg64(pcp, oval, nval)	percpu_cmpxchg_op(8,         , pcp, oval, nval);
 #define this_cpu_cmpxchg64(pcp, oval, nval)	percpu_cmpxchg_op(8, volatile, pcp, oval, nval);
 
+#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval)		percpu_try_cmpxchg_op(8,         , pcp, ovalp, nval);
+#define this_cpu_try_cmpxchg64(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval);
+
 #define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval)		\
 ({									\
 	union {								\
@@ -269,6 +304,38 @@ do {									\
 
 #define raw_cpu_cmpxchg128(pcp, oval, nval)	percpu_cmpxchg128_op(16,         , pcp, oval, nval)
 #define this_cpu_cmpxchg128(pcp, oval, nval)	percpu_cmpxchg128_op(16, volatile, pcp, oval, nval)
+
+#define percpu_try_cmpxchg128_op(size, qual, _var, _ovalp, _nval)	\
+({									\
+	bool success;							\
+	u128 *_oval = (u128 *)(_ovalp);					\
+	union {								\
+		u128 var;						\
+		struct {						\
+			u64 low, high;					\
+		};							\
+	} old__, new__;							\
+									\
+	old__.var = *_oval;						\
+	new__.var = _nval;						\
+									\
+	asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call this_cpu_cmpxchg16b_emu", \
+			      "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
+		  CC_SET(z)						\
+		  : CC_OUT(z) (success),				\
+		    [var] "+m" (_var),					\
+		    "+a" (old__.low),					\
+		    "+d" (old__.high)					\
+		  : "b" (new__.low),					\
+		    "c" (new__.high)					\
+		  : "memory", "rsi");					\
+	if (unlikely(!success))						\
+		*_oval = old__.var;					\
+	likely(success);						\
+})
+
+#define raw_cpu_try_cmpxchg128(pcp, ovalp, nval)	percpu_try_cmpxchg128_op(16,         , pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg128(pcp, ovalp, nval)	percpu_try_cmpxchg128_op(16, volatile, pcp, ovalp, nval)
 #endif
 
 /*


* Re: [tip: x86/asm] x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}
  2023-09-15 11:25 ` [tip: x86/asm] " tip-bot2 for Uros Bizjak
@ 2023-09-15 16:45   ` Linus Torvalds
  2023-09-17 18:31     ` Uros Bizjak
  0 siblings, 1 reply; 6+ messages in thread
From: Linus Torvalds @ 2023-09-15 16:45 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-tip-commits, Uros Bizjak, Ingo Molnar, Peter Zijlstra, x86

On Fri, 15 Sept 2023 at 04:25, tip-bot2 for Uros Bizjak
<tip-bot2@linutronix.de> wrote:
>
> Several places in mm/slub.o improve from e.g.:
>
[...]
>
> to:
>
>     53bc:       48 8d 4a 40             lea    0x40(%rdx),%rcx
>     53c0:       49 8b 1c 07             mov    (%r15,%rax,1),%rbx
>     53c4:       4c 89 f8                mov    %r15,%rax
>     53c7:       48 8d 37                lea    (%rdi),%rsi
>     53ca:       e8 00 00 00 00          call   53cf <...>
>                         53cb: R_X86_64_PLT32     this_cpu_cmpxchg16b_emu-0x4
>     53cf:       75 bb                   jne    538c <...>

Honestly, if you care deeply about this code sequence, I think you
should also move the "lea" out of the inline asm.

Both

    call this_cpu_cmpxchg16b_emu

and

    cmpxchg16b %gs:(%rsi)

are 5 bytes, and I suspect it's easiest to just always put the address
in %rsi - whether you call the function or not.

It doesn't really make the code generation for the non-call sequence
worse, and it gives the compiler more information (ie instead of
clobbering %rsi, the compiler knows what %rsi contains).

IOW, something like this:

-       asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call this_cpu_cmpxchg16b_emu", \
+       asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu",           \
...
-                   "c" (new__.high)                                    \
-                 : "memory", "rsi");                                   \
+                   "c" (new__.high),                                   \
+                   "S" (&_var)                                   \
+                 : "memory");                                          \

should do it.

Note that I think this is particularly true of the slub code, because
afaik, the slub code will *only* use the slow call-out.

Why? Because if the CPU actually supports the cmpxchg16b instruction,
then the slub code won't even take this path at all - it will do the
__CMPXCHG_DOUBLE path, which does an unconditional locked cmpxchg16b.

Maybe I'm misreading it. And no, none of this matters. But since I saw
the patch fly by, and slub.o mentioned, I thought I'd point out how
silly this all is. It's optimizing a code-path that is basically never
taken, and when it *is* taken, it can be improved further, I think.

                   Linus


* Re: [tip: x86/asm] x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}
  2023-09-15 16:45   ` Linus Torvalds
@ 2023-09-17 18:31     ` Uros Bizjak
  2023-09-17 18:33       ` Uros Bizjak
  0 siblings, 1 reply; 6+ messages in thread
From: Uros Bizjak @ 2023-09-17 18:31 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: linux-kernel, linux-tip-commits, Ingo Molnar, Peter Zijlstra, x86

On Fri, Sep 15, 2023 at 6:45 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Fri, 15 Sept 2023 at 04:25, tip-bot2 for Uros Bizjak
> <tip-bot2@linutronix.de> wrote:
> >
> > Several places in mm/slub.o improve from e.g.:
> >
> [...]
> >
> > to:
> >
> >     53bc:       48 8d 4a 40             lea    0x40(%rdx),%rcx
> >     53c0:       49 8b 1c 07             mov    (%r15,%rax,1),%rbx
> >     53c4:       4c 89 f8                mov    %r15,%rax
> >     53c7:       48 8d 37                lea    (%rdi),%rsi
> >     53ca:       e8 00 00 00 00          call   53cf <...>
> >                         53cb: R_X86_64_PLT32     this_cpu_cmpxchg16b_emu-0x4
> >     53cf:       75 bb                   jne    538c <...>
>
> Honestly, if you care deeply about this code sequence, I think you
> should also move the "lea" out of the inline asm.

I have to say that the above asm code was shown mostly as an example
of the improvement, to illustrate how the compare sequence at the end
of the cmpxchg loop gets eliminated. Being a fairly mechanical change,
I didn't put much thought into the surrounding code.

> Both
>
>     call this_cpu_cmpxchg16b_emu
>
> and
>
>     cmpxchg16b %gs:(%rsi)
>
> are 5 bytes, and I suspect it's easiest to just always put the address
> in %rsi - whether you call the function or not.
>
> It doesn't really make the code generation for the non-call sequence
> worse, and it gives the compiler more information (ie instead of
> clobbering %rsi, the compiler knows what %rsi contains).
>
> IOW, something like this:
>
> -       asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call
> this_cpu_cmpxchg16b_emu", \
> +       asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu",           \
> ...
> -                   "c" (new__.high)                                    \
> -                 : "memory", "rsi");                                   \
> +                   "c" (new__.high),                                   \
> +                   "S" (&_var)                                   \
> +                 : "memory");                                          \
>
> should do it.

Yes, and the above change improves slub.o assembly from (current tip
tree with try_cmpxchg patch applied):

    53b3:    41 8b 44 24 28           mov    0x28(%r12),%eax
    53b8:    49 8b 3c 24              mov    (%r12),%rdi
    53bc:    48 8d 4a 40              lea    0x40(%rdx),%rcx
    53c0:    49 8b 1c 07              mov    (%r15,%rax,1),%rbx
    53c4:    4c 89 f8                 mov    %r15,%rax
    53c7:    48 8d 37                 lea    (%rdi),%rsi
    53ca:    e8 00 00 00 00           call   53cf <kmem_cache_alloc+0x9f>
            53cb: R_X86_64_PLT32    this_cpu_cmpxchg16b_emu-0x4
    53cf:    75 bb                    jne    538c <kmem_cache_alloc+0x5c>

to:

    53b3:    41 8b 44 24 28           mov    0x28(%r12),%eax
    53b8:    49 8b 34 24              mov    (%r12),%rsi
    53bc:    48 8d 4a 40              lea    0x40(%rdx),%rcx
    53c0:    49 8b 1c 07              mov    (%r15,%rax,1),%rbx
    53c4:    4c 89 f8                 mov    %r15,%rax
    53c7:    e8 00 00 00 00           call   53cc <kmem_cache_alloc+0x9c>
            53c8: R_X86_64_PLT32    this_cpu_cmpxchg16b_emu-0x4
    53cc:    75 be                    jne    538c <kmem_cache_alloc+0x5c>

where an effective reg-reg move "lea (%rdi), %rsi" at 53c7 gets
removed. And indeed, GCC figures out that %rsi holds the address of
the variable and emits:

   5:    65 48 0f c7 0e           cmpxchg16b %gs:(%rsi)

alternative replacement.

Now, here comes the best part: We can get rid of the %P modifier. With
named address spaces (__seg_gs), older GCCs had some problems with %P
and emitted "%gs:foo" instead of foo, resulting in "Warning: segment
override on `lea' is ineffectual" assembly warning. With the proposed
change, we use:

--cut here--
int __seg_gs g;

void foo (void)
{
  asm ("%0 %1" :: "m"(g), "S"(&g));
}
--cut here--

and get the desired assembly:

       movl    $g, %esi
       %gs:g(%rip) %rsi

The above is also in line with [1], where it is said that
"[__seg_gs/__seg_fs] address spaces are not considered to be subspaces
of the generic (flat) address space." So, cmpxchg16b_emu.S must use
%gs to apply segment base offset, which it does.

> Note that I think this is particularly true of the slub code, because
> afaik, the slub code will *only* use the slow call-out.
>
> Why? Because if the CPU actually supports the cmpxchg16b instruction,
> then the slub code won't even take this path at all - it will do the
> __CMPXCHG_DOUBLE path, which does an unconditional locked cmpxchg16b.
>
> Maybe I'm misreading it. And no, none of this matters. But since I saw
> the patch fly by, and slub.o mentioned, I thought I'd point out how
> silly this all is. It's optimizing a code-path that is basically never
> taken, and when it *is* taken, it can be improved further, I think.

True, but as mentioned above, the slub.o code was used to illustrate
the effect of the patch. The new locking primitive should be usable in
a general way and could also be used in other places.
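
To give a purely illustrative (made-up) example of such a use, a per-CPU
64-bit statistics counter could be updated with the new primitive roughly
like this (names are hypothetical; a 64-bit kernel is assumed for the
raw_cpu_read() of a u64):

--cut here--
static DEFINE_PER_CPU(u64, my_stat);

static void my_stat_add(u64 delta)
{
	u64 old = raw_cpu_read(my_stat), new;

	do {
		new = old + delta;
	} while (!this_cpu_try_cmpxchg64(my_stat, &old, new));
}
--cut here--

On failure the macro refreshes 'old' with the current value, so the loop
needs no separate re-read or compare.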

[1] https://gcc.gnu.org/onlinedocs/gcc/Named-Address-Spaces.html#x86-Named-Address-Spaces

Uros.


* Re: [tip: x86/asm] x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}
  2023-09-17 18:31     ` Uros Bizjak
@ 2023-09-17 18:33       ` Uros Bizjak
  2023-09-18  7:31         ` Ingo Molnar
  0 siblings, 1 reply; 6+ messages in thread
From: Uros Bizjak @ 2023-09-17 18:33 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: linux-kernel, linux-tip-commits, Ingo Molnar, Peter Zijlstra, x86

[-- Attachment #1: Type: text/plain, Size: 5707 bytes --]

Now also with the patch attached.

Uros.

On Sun, Sep 17, 2023 at 8:31 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Fri, Sep 15, 2023 at 6:45 PM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > On Fri, 15 Sept 2023 at 04:25, tip-bot2 for Uros Bizjak
> > <tip-bot2@linutronix.de> wrote:
> > >
> > > Several places in mm/slub.o improve from e.g.:
> > >
> > [...]
> > >
> > > to:
> > >
> > >     53bc:       48 8d 4a 40             lea    0x40(%rdx),%rcx
> > >     53c0:       49 8b 1c 07             mov    (%r15,%rax,1),%rbx
> > >     53c4:       4c 89 f8                mov    %r15,%rax
> > >     53c7:       48 8d 37                lea    (%rdi),%rsi
> > >     53ca:       e8 00 00 00 00          call   53cf <...>
> > >                         53cb: R_X86_64_PLT32     this_cpu_cmpxchg16b_emu-0x4
> > >     53cf:       75 bb                   jne    538c <...>
> >
> > Honestly, if you care deeply about this code sequence, I think you
> > should also move the "lea" out of the inline asm.
>
> I have to say that the above asm code was shown mostly as an example
> of the improvement, to illustrate how the compare sequence at the end
> of the cmpxchg loop gets eliminated. Being a fairly mechanical change,
> I didn't put much thought in the surrounding code.
>
> > Both
> >
> >     call this_cpu_cmpxchg16b_emu
> >
> > and
> >
> >     cmpxchg16b %gs:(%rsi)
> >
> > are 5 bytes, and I suspect it's easiest to just always put the address
> > in %rsi - whether you call the function or not.
> >
> > It doesn't really make the code generation for the non-call sequence
> > worse, and it gives the compiler more information (ie instead of
> > clobbering %rsi, the compiler knows what %rsi contains).
> >
> > IOW, something like this:
> >
> > -       asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call
> > this_cpu_cmpxchg16b_emu", \
> > +       asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu",           \
> > ...
> > -                   "c" (new__.high)                                    \
> > -                 : "memory", "rsi");                                   \
> > +                   "c" (new__.high),                                   \
> > +                   "S" (&_var)                                   \
> > +                 : "memory");                                          \
> >
> > should do it.
>
> Yes, and the above change improves slub.o assembly from (current tip
> tree with try_cmpxchg patch applied):
>
>     53b3:    41 8b 44 24 28           mov    0x28(%r12),%eax
>     53b8:    49 8b 3c 24              mov    (%r12),%rdi
>     53bc:    48 8d 4a 40              lea    0x40(%rdx),%rcx
>     53c0:    49 8b 1c 07              mov    (%r15,%rax,1),%rbx
>     53c4:    4c 89 f8                 mov    %r15,%rax
>     53c7:    48 8d 37                 lea    (%rdi),%rsi
>     53ca:    e8 00 00 00 00           call   53cf <kmem_cache_alloc+0x9f>
>             53cb: R_X86_64_PLT32    this_cpu_cmpxchg16b_emu-0x4
>     53cf:    75 bb                    jne    538c <kmem_cache_alloc+0x5c>
>
> to:
>
>     53b3:    41 8b 44 24 28           mov    0x28(%r12),%eax
>     53b8:    49 8b 34 24              mov    (%r12),%rsi
>     53bc:    48 8d 4a 40              lea    0x40(%rdx),%rcx
>     53c0:    49 8b 1c 07              mov    (%r15,%rax,1),%rbx
>     53c4:    4c 89 f8                 mov    %r15,%rax
>     53c7:    e8 00 00 00 00           call   53cc <kmem_cache_alloc+0x9c>
>             53c8: R_X86_64_PLT32    this_cpu_cmpxchg16b_emu-0x4
>     53cc:    75 be                    jne    538c <kmem_cache_alloc+0x5c>
>
> where an effective reg-reg move "lea (%rdi), %rsi" at 53c7 gets
> removed. And indeed, GCC figures out that %rsi holds the address of
> the variable and emits:
>
>    5:    65 48 0f c7 0e           cmpxchg16b %gs:(%rsi)
>
> alternative replacement.
>
> Now, here comes the best part: We can get rid of the %P modifier. With
> named address spaces (__seg_gs), older GCCs had some problems with %P
> and emitted "%gs:foo" instead of foo, resulting in "Warning: segment
> override on `lea' is ineffectual" assembly warning. With the proposed
> change, we use:
>
> --cut here--
> int __seg_gs g;
>
> void foo (void)
> {
>   asm ("%0 %1" :: "m"(g), "S"(&g));
> }
> --cut here--
>
> and get the desired assembly:
>
>        movl    $g, %esi
>        %gs:g(%rip) %rsi
>
> The above is also in line with [1], where it is said that
> "[__seg_gs/__seg_fs] address spaces are not considered to be subspaces
> of the generic (flat) address space." So, cmpxchg16b_emu.S must use
> %gs to apply segment base offset, which it does.
>
> > Note that I think this is particularly true of the slub code, because
> > afaik, the slub code will *only* use the slow call-out.
> >
> > Why? Because if the CPU actually supports the cmpxchg16b instruction,
> > then the slub code won't even take this path at all - it will do the
> > __CMPXCHG_DOUBLE path, which does an unconditional locked cmpxchg16b.
> >
> > Maybe I'm misreading it. And no, none of this matters. But since I saw
> > the patch fly by, and slub.o mentioned, I thought I'd point out how
> > silly this all is. It's optimizing a code-path that is basically never
> > taken, and when it *is* taken, it can be improved further, I think.
>
> True, but as mentioned above, the slub.o code was used to illustrate
> the effect of the patch. The new locking primitive should be usable in
> a general way and could be also used in other places.
>
> [1] https://gcc.gnu.org/onlinedocs/gcc/Named-Address-Spaces.html#x86-Named-Address-Spaces
>
> Uros.

[-- Attachment #2: p.diff.txt --]
[-- Type: text/plain, Size: 2843 bytes --]

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index a87db6140fe2..331a9d4dce82 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -242,14 +242,15 @@ do {									\
 	old__.var = _oval;						\
 	new__.var = _nval;						\
 									\
-	asm qual (ALTERNATIVE("leal %P[var], %%esi; call this_cpu_cmpxchg8b_emu", \
+	asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu",		\
 			      "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
 		  : [var] "+m" (_var),					\
 		    "+a" (old__.low),					\
 		    "+d" (old__.high)					\
 		  : "b" (new__.low),					\
-		    "c" (new__.high)					\
-		  : "memory", "esi");					\
+		    "c" (new__.high),					\
+		    "S" (&_var)						\
+		  : "memory");						\
 									\
 	old__.var;							\
 })
@@ -271,7 +272,7 @@ do {									\
 	old__.var = *_oval;						\
 	new__.var = _nval;						\
 									\
-	asm qual (ALTERNATIVE("leal %P[var], %%esi; call this_cpu_cmpxchg8b_emu", \
+	asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu",		\
 			      "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
 		  CC_SET(z)						\
 		  : CC_OUT(z) (success),				\
@@ -279,8 +280,9 @@ do {									\
 		    "+a" (old__.low),					\
 		    "+d" (old__.high)					\
 		  : "b" (new__.low),					\
-		    "c" (new__.high)					\
-		  : "memory", "esi");					\
+		    "c" (new__.high),					\
+		    "S" (&_var)						\
+		  : "memory");						\
 	if (unlikely(!success))						\
 		*_oval = old__.var;					\
 	likely(success);						\
@@ -309,14 +311,15 @@ do {									\
 	old__.var = _oval;						\
 	new__.var = _nval;						\
 									\
-	asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call this_cpu_cmpxchg16b_emu", \
+	asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu",		\
 			      "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
 		  : [var] "+m" (_var),					\
 		    "+a" (old__.low),					\
 		    "+d" (old__.high)					\
 		  : "b" (new__.low),					\
-		    "c" (new__.high)					\
-		  : "memory", "rsi");					\
+		    "c" (new__.high),					\
+		    "S" (&_var)						\
+		  : "memory");						\
 									\
 	old__.var;							\
 })
@@ -338,7 +341,7 @@ do {									\
 	old__.var = *_oval;						\
 	new__.var = _nval;						\
 									\
-	asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call this_cpu_cmpxchg16b_emu", \
+	asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu",		\
 			      "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
 		  CC_SET(z)						\
 		  : CC_OUT(z) (success),				\
@@ -346,8 +349,9 @@ do {									\
 		    "+a" (old__.low),					\
 		    "+d" (old__.high)					\
 		  : "b" (new__.low),					\
-		    "c" (new__.high)					\
-		  : "memory", "rsi");					\
+		    "c" (new__.high),					\
+		    "S" (&_var)						\
+		  : "memory");						\
 	if (unlikely(!success))						\
 		*_oval = old__.var;					\
 	likely(success);						\


* Re: [tip: x86/asm] x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}
  2023-09-17 18:33       ` Uros Bizjak
@ 2023-09-18  7:31         ` Ingo Molnar
  0 siblings, 0 replies; 6+ messages in thread
From: Ingo Molnar @ 2023-09-18  7:31 UTC (permalink / raw)
  To: Uros Bizjak
  Cc: Linus Torvalds, linux-kernel, linux-tip-commits, Peter Zijlstra, x86


* Uros Bizjak <ubizjak@gmail.com> wrote:

> Now also with the patch attached.
> 
> Uros.

> diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
> index a87db6140fe2..331a9d4dce82 100644
> --- a/arch/x86/include/asm/percpu.h
> +++ b/arch/x86/include/asm/percpu.h

Assuming it boots & works, mind sending a fully changelogged patch with a 
SOB and a 'Suggested-by: Linus' tag or so? Looks like a nice v6.7 addition.

Thanks,

	Ingo

