All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] locking/mutex: Optimize __mutex_trylock_fast
@ 2018-04-05  9:37 Peter Zijlstra
  2018-04-05  9:55 ` Will Deacon
  0 siblings, 1 reply; 3+ messages in thread
From: Peter Zijlstra @ 2018-04-05  9:37 UTC (permalink / raw)
  To: Will Deacon, Ingo Molnar; +Cc: Matthew Wilcox, linux-kernel

Subject: locking/mutex: Optimize __mutex_trylock_fast()
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu Apr  5 11:05:35 CEST 2018

Use try_cmpxchg to avoid the pointless TEST instruction..
And add the (missing) atomic_long_try_cmpxchg*() wrappery.

On x86_64 this gives:

0000000000000710 <mutex_lock>:						0000000000000710 <mutex_lock>:
 710:   65 48 8b 14 25 00 00    mov    %gs:0x0,%rdx                      710:   65 48 8b 14 25 00 00    mov    %gs:0x0,%rdx
 717:   00 00                                                            717:   00 00
                        715: R_X86_64_32S       current_task                                    715: R_X86_64_32S       current_task
 719:   31 c0                   xor    %eax,%eax                         719:   31 c0                   xor    %eax,%eax
 71b:   f0 48 0f b1 17          lock cmpxchg %rdx,(%rdi)                 71b:   f0 48 0f b1 17          lock cmpxchg %rdx,(%rdi)
 720:   48 85 c0                test   %rax,%rax                         720:   75 02                   jne    724 <mutex_lock+0x14>
 723:   75 02                   jne    727 <mutex_lock+0x17>             722:   f3 c3                   repz retq
 725:   f3 c3                   repz retq                                724:   eb da                   jmp    700 <__mutex_lock_slowpath>
 727:   eb d7                   jmp    700 <__mutex_lock_slowpath>

On ARM64 this gives:

000000000000638 <mutex_lock>:						0000000000000638 <mutex_lock>:
     638:       d5384101        mrs     x1, sp_el0                           638:       d5384101        mrs     x1, sp_el0
     63c:       d2800002        mov     x2, #0x0                             63c:       d2800002        mov     x2, #0x0
     640:       f9800011        prfm    pstl1strm, [x0]                      640:       f9800011        prfm    pstl1strm, [x0]
     644:       c85ffc03        ldaxr   x3, [x0]                             644:       c85ffc03        ldaxr   x3, [x0]
     648:       ca020064        eor     x4, x3, x2                           648:       ca020064        eor     x4, x3, x2
     64c:       b5000064        cbnz    x4, 658 <mutex_lock+0x20>            64c:       b5000064        cbnz    x4, 658 <mutex_lock+0x20>
     650:       c8047c01        stxr    w4, x1, [x0]                         650:       c8047c01        stxr    w4, x1, [x0]
     654:       35ffff84        cbnz    w4, 644 <mutex_lock+0xc>             654:       35ffff84        cbnz    w4, 644 <mutex_lock+0xc>
     658:       b40000c3        cbz     x3, 670 <mutex_lock+0x38>            658:       b5000043        cbnz    x3, 660 <mutex_lock+0x28>
     65c:       a9bf7bfd        stp     x29, x30, [sp,#-16]!                 65c:       d65f03c0        ret
     660:       910003fd        mov     x29, sp                              660:       a9bf7bfd        stp     x29, x30, [sp,#-16]!
     664:       97ffffef        bl      620 <__mutex_lock_slowpath>          664:       910003fd        mov     x29, sp
     668:       a8c17bfd        ldp     x29, x30, [sp],#16                   668:       97ffffee        bl      620 <__mutex_lock_slowpath>
     66c:       d65f03c0        ret                                          66c:       a8c17bfd        ldp     x29, x30, [sp],#16
     670:       d65f03c0        ret                                          670:       d65f03c0        ret

Which to my untrained eye just looks different, not worse. Will?

Reported-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/asm-generic/atomic-long.h |   17 +++++++++++++++++
 kernel/locking/mutex.c            |    3 ++-
 2 files changed, 19 insertions(+), 1 deletion(-)

--- a/include/asm-generic/atomic-long.h
+++ b/include/asm-generic/atomic-long.h
@@ -25,6 +25,7 @@ typedef atomic64_t atomic_long_t;
 
 #define ATOMIC_LONG_INIT(i)	ATOMIC64_INIT(i)
 #define ATOMIC_LONG_PFX(x)	atomic64 ## x
+#define ATOMIC_LONG_TYPE	s64
 
 #else
 
@@ -32,6 +33,7 @@ typedef atomic_t atomic_long_t;
 
 #define ATOMIC_LONG_INIT(i)	ATOMIC_INIT(i)
 #define ATOMIC_LONG_PFX(x)	atomic ## x
+#define ATOMIC_LONG_TYPE	int
 
 #endif
 
@@ -90,6 +92,21 @@ ATOMIC_LONG_ADD_SUB_OP(sub, _release)
 #define atomic_long_cmpxchg(l, old, new) \
 	(ATOMIC_LONG_PFX(_cmpxchg)((ATOMIC_LONG_PFX(_t) *)(l), (old), (new)))
 
+
+#define atomic_long_try_cmpxchg_relaxed(l, old, new) \
+	(ATOMIC_LONG_PFX(_try_cmpxchg_relaxed)((ATOMIC_LONG_PFX(_t) *)(l), \
+					   (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+#define atomic_long_try_cmpxchg_acquire(l, old, new) \
+	(ATOMIC_LONG_PFX(_try_cmpxchg_acquire)((ATOMIC_LONG_PFX(_t) *)(l), \
+					   (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+#define atomic_long_try_cmpxchg_release(l, old, new) \
+	(ATOMIC_LONG_PFX(_try_cmpxchg_release)((ATOMIC_LONG_PFX(_t) *)(l), \
+					   (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+#define atomic_long_try_cmpxchg(l, old, new) \
+	(ATOMIC_LONG_PFX(_try_cmpxchg)((ATOMIC_LONG_PFX(_t) *)(l), \
+				       (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+
+
 #define atomic_long_xchg_relaxed(v, new) \
 	(ATOMIC_LONG_PFX(_xchg_relaxed)((ATOMIC_LONG_PFX(_t) *)(v), (new)))
 #define atomic_long_xchg_acquire(v, new) \
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -139,8 +139,9 @@ static inline bool __mutex_trylock(struc
 static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
 {
 	unsigned long curr = (unsigned long)current;
+	unsigned long zero = 0UL;
 
-	if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr))
+	if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr))
 		return true;
 
 	return false;

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [RFC] locking/mutex: Optimize __mutex_trylock_fast
  2018-04-05  9:37 [RFC] locking/mutex: Optimize __mutex_trylock_fast Peter Zijlstra
@ 2018-04-05  9:55 ` Will Deacon
  2018-04-05 11:22   ` Peter Zijlstra
  0 siblings, 1 reply; 3+ messages in thread
From: Will Deacon @ 2018-04-05  9:55 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, Matthew Wilcox, linux-kernel

Hi Peter,

On Thu, Apr 05, 2018 at 11:37:16AM +0200, Peter Zijlstra wrote:
> Subject: locking/mutex: Optimize __mutex_trylock_fast()
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Thu Apr  5 11:05:35 CEST 2018
> 
> Use try_cmpxchg to avoid the pointless TEST instruction..
> And add the (missing) atomic_long_try_cmpxchg*() wrappery.
> 
> On x86_64 this gives:
> 
> 0000000000000710 <mutex_lock>:						0000000000000710 <mutex_lock>:
>  710:   65 48 8b 14 25 00 00    mov    %gs:0x0,%rdx                      710:   65 48 8b 14 25 00 00    mov    %gs:0x0,%rdx
>  717:   00 00                                                            717:   00 00
>                         715: R_X86_64_32S       current_task                                    715: R_X86_64_32S       current_task
>  719:   31 c0                   xor    %eax,%eax                         719:   31 c0                   xor    %eax,%eax
>  71b:   f0 48 0f b1 17          lock cmpxchg %rdx,(%rdi)                 71b:   f0 48 0f b1 17          lock cmpxchg %rdx,(%rdi)
>  720:   48 85 c0                test   %rax,%rax                         720:   75 02                   jne    724 <mutex_lock+0x14>
>  723:   75 02                   jne    727 <mutex_lock+0x17>             722:   f3 c3                   repz retq
>  725:   f3 c3                   repz retq                                724:   eb da                   jmp    700 <__mutex_lock_slowpath>
>  727:   eb d7                   jmp    700 <__mutex_lock_slowpath>
> 
> On ARM64 this gives:
> 
> 000000000000638 <mutex_lock>:						0000000000000638 <mutex_lock>:
>      638:       d5384101        mrs     x1, sp_el0                           638:       d5384101        mrs     x1, sp_el0
>      63c:       d2800002        mov     x2, #0x0                             63c:       d2800002        mov     x2, #0x0
>      640:       f9800011        prfm    pstl1strm, [x0]                      640:       f9800011        prfm    pstl1strm, [x0]
>      644:       c85ffc03        ldaxr   x3, [x0]                             644:       c85ffc03        ldaxr   x3, [x0]
>      648:       ca020064        eor     x4, x3, x2                           648:       ca020064        eor     x4, x3, x2
>      64c:       b5000064        cbnz    x4, 658 <mutex_lock+0x20>            64c:       b5000064        cbnz    x4, 658 <mutex_lock+0x20>
>      650:       c8047c01        stxr    w4, x1, [x0]                         650:       c8047c01        stxr    w4, x1, [x0]
>      654:       35ffff84        cbnz    w4, 644 <mutex_lock+0xc>             654:       35ffff84        cbnz    w4, 644 <mutex_lock+0xc>
>      658:       b40000c3        cbz     x3, 670 <mutex_lock+0x38>            658:       b5000043        cbnz    x3, 660 <mutex_lock+0x28>
>      65c:       a9bf7bfd        stp     x29, x30, [sp,#-16]!                 65c:       d65f03c0        ret
>      660:       910003fd        mov     x29, sp                              660:       a9bf7bfd        stp     x29, x30, [sp,#-16]!
>      664:       97ffffef        bl      620 <__mutex_lock_slowpath>          664:       910003fd        mov     x29, sp
>      668:       a8c17bfd        ldp     x29, x30, [sp],#16                   668:       97ffffee        bl      620 <__mutex_lock_slowpath>
>      66c:       d65f03c0        ret                                          66c:       a8c17bfd        ldp     x29, x30, [sp],#16
>      670:       d65f03c0        ret                                          670:       d65f03c0        ret
> 
> Which to my untrained eye just looks different, not worse. Will?

Yeah, looks fine. The compiler has just decided to move the slowpath out of
line. One comment on the patch:

> 
> Reported-by: Matthew Wilcox <mawilcox@microsoft.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/asm-generic/atomic-long.h |   17 +++++++++++++++++
>  kernel/locking/mutex.c            |    3 ++-
>  2 files changed, 19 insertions(+), 1 deletion(-)
> 
> --- a/include/asm-generic/atomic-long.h
> +++ b/include/asm-generic/atomic-long.h
> @@ -25,6 +25,7 @@ typedef atomic64_t atomic_long_t;
>  
>  #define ATOMIC_LONG_INIT(i)	ATOMIC64_INIT(i)
>  #define ATOMIC_LONG_PFX(x)	atomic64 ## x
> +#define ATOMIC_LONG_TYPE	s64
>  
>  #else
>  
> @@ -32,6 +33,7 @@ typedef atomic_t atomic_long_t;
>  
>  #define ATOMIC_LONG_INIT(i)	ATOMIC_INIT(i)
>  #define ATOMIC_LONG_PFX(x)	atomic ## x
> +#define ATOMIC_LONG_TYPE	int
>  
>  #endif
>  
> @@ -90,6 +92,21 @@ ATOMIC_LONG_ADD_SUB_OP(sub, _release)
>  #define atomic_long_cmpxchg(l, old, new) \
>  	(ATOMIC_LONG_PFX(_cmpxchg)((ATOMIC_LONG_PFX(_t) *)(l), (old), (new)))
>  
> +
> +#define atomic_long_try_cmpxchg_relaxed(l, old, new) \
> +	(ATOMIC_LONG_PFX(_try_cmpxchg_relaxed)((ATOMIC_LONG_PFX(_t) *)(l), \
> +					   (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))

Are the casts on old and new strictly needed? We don't have them for the
non-try versions...

Will

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [RFC] locking/mutex: Optimize __mutex_trylock_fast
  2018-04-05  9:55 ` Will Deacon
@ 2018-04-05 11:22   ` Peter Zijlstra
  0 siblings, 0 replies; 3+ messages in thread
From: Peter Zijlstra @ 2018-04-05 11:22 UTC (permalink / raw)
  To: Will Deacon; +Cc: Ingo Molnar, Matthew Wilcox, linux-kernel

On Thu, Apr 05, 2018 at 10:55:45AM +0100, Will Deacon wrote:

> > +#define atomic_long_try_cmpxchg_relaxed(l, old, new) \
> > +	(ATOMIC_LONG_PFX(_try_cmpxchg_relaxed)((ATOMIC_LONG_PFX(_t) *)(l), \
> > +					   (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
> 
> Are the casts on old and new strictly needed? We don't have them for the
> non-try versions...

It was required for the pointer; it doesn't want to convert (unsigned
long *) into (s64 *) without complaining about it :/

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2018-04-05 11:22 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-04-05  9:37 [RFC] locking/mutex: Optimize __mutex_trylock_fast Peter Zijlstra
2018-04-05  9:55 ` Will Deacon
2018-04-05 11:22   ` Peter Zijlstra

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.