* [RFC][PATCH 0/3] smp_cond_load_acquire + cmpwait
@ 2016-04-04 12:22 Peter Zijlstra
  2016-04-04 12:22 ` [RFC][PATCH 1/3] locking: Replace smp_cond_acquire with smp_cond_load_acquire Peter Zijlstra
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Peter Zijlstra @ 2016-04-04 12:22 UTC (permalink / raw)
  To: linux-kernel, will.deacon
  Cc: waiman.long, mingo, paulmck, boqun.feng, torvalds, dave, peterz

As Will reminded me last week, we still had some pending changes.

A number of patches that allow arm64 to use a monitor/mwait-like construct to
avoid most spin waiting.

Compile and boot tested on x86_64, but I seem to have misplaced my arm64
compiler, so that part is still mostly a rough sketch.
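
To make the intended usage concrete, the call-site pattern the series
converges on looks like this (taken from the qspinlock hunk in patch 1;
no code beyond what the patches below already contain):

	/* before: open-coded spin + acquire */
	smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK));

	/* after: the loaded value is exposed as VAL, and an architecture
	 * (patch 3) can replace the cpu_relax() spin with a hardware
	 * assisted wait such as LDXR+WFE on arm64.
	 */
	smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));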

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC][PATCH 1/3] locking: Replace smp_cond_acquire with smp_cond_load_acquire
  2016-04-04 12:22 [RFC][PATCH 0/3] smp_cond_load_acquire + cmpwait Peter Zijlstra
@ 2016-04-04 12:22 ` Peter Zijlstra
  2016-04-04 18:20   ` Waiman Long
  2016-04-04 12:22 ` [RFC][PATCH 2/3] locking/qrwlock: Use smp_cond_load_acquire() Peter Zijlstra
  2016-04-04 12:22 ` [RFC][PATCH 3/3] locking,arm64: Introduce cmpwait() Peter Zijlstra
  2 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2016-04-04 12:22 UTC (permalink / raw)
  To: linux-kernel, will.deacon
  Cc: waiman.long, mingo, paulmck, boqun.feng, torvalds, dave, peterz

[-- Attachment #1: peterz-locking-smp_cond_load_acquire.patch --]
[-- Type: text/plain, Size: 5846 bytes --]

This new form allows using hardware assisted waiting.

Requested-by: Will Deacon <will.deacon@arm.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/compiler.h   |   25 +++++++++++++++++++------
 kernel/locking/qspinlock.c |   12 ++++++------
 kernel/sched/core.c        |    8 ++++----
 kernel/sched/sched.h       |    2 +-
 kernel/smp.c               |    2 +-
 5 files changed, 31 insertions(+), 18 deletions(-)

--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -305,21 +305,34 @@ static __always_inline void __write_once
 })
 
 /**
- * smp_cond_acquire() - Spin wait for cond with ACQUIRE ordering
+ * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering
+ * @ptr: pointer to the variable to wait on
  * @cond: boolean expression to wait for
  *
  * Equivalent to using smp_load_acquire() on the condition variable but employs
  * the control dependency of the wait to reduce the barrier on many platforms.
  *
+ * Due to C lacking lambda expressions we load the value of *ptr into a
+ * pre-named variable @VAL to be used in @cond.
+ *
  * The control dependency provides a LOAD->STORE order, the additional RMB
  * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
  * aka. ACQUIRE.
  */
-#define smp_cond_acquire(cond)	do {		\
-	while (!(cond))				\
-		cpu_relax();			\
-	smp_rmb(); /* ctrl + rmb := acquire */	\
-} while (0)
+#ifndef smp_cond_load_acquire
+#define smp_cond_load_acquire(ptr, cond_expr) ({		\
+	typeof(ptr) __PTR = (ptr);				\
+	typeof(*ptr) VAL;					\
+	for (;;) {						\
+		VAL = READ_ONCE(*__PTR);			\
+		if (cond_expr)					\
+			break;					\
+		cpu_relax();					\
+	}							\
+	smp_rmb(); /* ctrl + rmb := acquire */			\
+	VAL;							\
+})
+#endif
 
 #endif /* __KERNEL__ */
 
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -358,7 +358,7 @@ void queued_spin_lock_slowpath(struct qs
 	 * sequentiality; this is because not all clear_pending_set_locked()
 	 * implementations imply full barriers.
 	 */
-	smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK));
+	smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
 
 	/*
 	 * take ownership and clear the pending bit.
@@ -434,7 +434,7 @@ void queued_spin_lock_slowpath(struct qs
 	 *
 	 * The PV pv_wait_head_or_lock function, if active, will acquire
 	 * the lock and return a non-zero value. So we have to skip the
-	 * smp_cond_acquire() call. As the next PV queue head hasn't been
+	 * smp_cond_load_acquire() call. As the next PV queue head hasn't been
 	 * designated yet, there is no way for the locked value to become
 	 * _Q_SLOW_VAL. So both the set_locked() and the
 	 * atomic_cmpxchg_relaxed() calls will be safe.
@@ -445,7 +445,7 @@ void queued_spin_lock_slowpath(struct qs
 	if ((val = pv_wait_head_or_lock(lock, node)))
 		goto locked;
 
-	smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+	val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));
 
 locked:
 	/*
@@ -465,9 +465,9 @@ void queued_spin_lock_slowpath(struct qs
 			break;
 		}
 		/*
-		 * The smp_cond_acquire() call above has provided the necessary
-		 * acquire semantics required for locking. At most two
-		 * iterations of this loop may be ran.
+		 * The smp_cond_load_acquire() call above has provided the
+		 * necessary acquire semantics required for locking. At most
+		 * two iterations of this loop may be ran.
 		 */
 		old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
 		if (old == val)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1843,7 +1843,7 @@ static void ttwu_queue(struct task_struc
  * chain to provide order. Instead we do:
  *
  *   1) smp_store_release(X->on_cpu, 0)
- *   2) smp_cond_acquire(!X->on_cpu)
+ *   2) smp_cond_load_acquire(!X->on_cpu)
  *
  * Example:
  *
@@ -1854,7 +1854,7 @@ static void ttwu_queue(struct task_struc
  *   sched-out X
  *   smp_store_release(X->on_cpu, 0);
  *
- *                    smp_cond_acquire(!X->on_cpu);
+ *                    smp_cond_load_acquire(&X->on_cpu, !VAL);
  *                    X->state = WAKING
  *                    set_task_cpu(X,2)
  *
@@ -1880,7 +1880,7 @@ static void ttwu_queue(struct task_struc
  * This means that any means of doing remote wakeups must order the CPU doing
  * the wakeup against the CPU the task is going to end up running on. This,
  * however, is already required for the regular Program-Order guarantee above,
- * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire).
+ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire).
  *
  */
 
@@ -1953,7 +1953,7 @@ try_to_wake_up(struct task_struct *p, un
 	 * This ensures that tasks getting woken will be fully ordered against
 	 * their previous state and preserve Program Order.
 	 */
-	smp_cond_acquire(!p->on_cpu);
+	smp_cond_load_acquire(&p->on_cpu, !VAL);
 
 	p->sched_contributes_to_load = !!task_contributes_to_load(p);
 	p->state = TASK_WAKING;
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1104,7 +1104,7 @@ static inline void finish_lock_switch(st
 	 * In particular, the load of prev->state in finish_task_switch() must
 	 * happen before this.
 	 *
-	 * Pairs with the smp_cond_acquire() in try_to_wake_up().
+	 * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
 	 */
 	smp_store_release(&prev->on_cpu, 0);
 #endif
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -107,7 +107,7 @@ void __init call_function_init(void)
  */
 static __always_inline void csd_lock_wait(struct call_single_data *csd)
 {
-	smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK));
+	smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
 }
 
 static __always_inline void csd_lock(struct call_single_data *csd)

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC][PATCH 2/3] locking/qrwlock: Use smp_cond_load_acquire()
  2016-04-04 12:22 [RFC][PATCH 0/3] smp_cond_load_acquire + cmpwait Peter Zijlstra
  2016-04-04 12:22 ` [RFC][PATCH 1/3] locking: Replace smp_cond_acquire with smp_cond_load_acquire Peter Zijlstra
@ 2016-04-04 12:22 ` Peter Zijlstra
  2016-04-12  4:58   ` Davidlohr Bueso
  2016-04-04 12:22 ` [RFC][PATCH 3/3] locking,arm64: Introduce cmpwait() Peter Zijlstra
  2 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2016-04-04 12:22 UTC (permalink / raw)
  To: linux-kernel, will.deacon
  Cc: waiman.long, mingo, paulmck, boqun.feng, torvalds, dave, peterz

[-- Attachment #1: peterz-locking-qrwlock-smp_cond_load_acquire.patch --]
[-- Type: text/plain, Size: 1640 bytes --]

Use smp_cond_load_acquire() to make better use of the hardware
assisted 'spin' wait on arm64.

Arguably the second hunk is the more horrid abuse possible, but
avoids having to use cmpwait (see next patch) directly. Also, this
makes 'clever' (ab)use of the cond+rmb acquire to omit the acquire
from cmpxchg().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/locking/qrwlock.c |   18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -53,10 +53,7 @@ struct __qrwlock {
 static __always_inline void
 rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
 {
-	while ((cnts & _QW_WMASK) == _QW_LOCKED) {
-		cpu_relax_lowlatency();
-		cnts = atomic_read_acquire(&lock->cnts);
-	}
+	smp_cond_load_acquire(&lock->cnts.counter, (VAL & _QW_WMASK) != _QW_LOCKED);
 }
 
 /**
@@ -109,8 +106,6 @@ EXPORT_SYMBOL(queued_read_lock_slowpath)
  */
 void queued_write_lock_slowpath(struct qrwlock *lock)
 {
-	u32 cnts;
-
 	/* Put the writer into the wait queue */
 	arch_spin_lock(&lock->wait_lock);
 
@@ -134,15 +129,10 @@ void queued_write_lock_slowpath(struct q
 	}
 
 	/* When no more readers, set the locked flag */
-	for (;;) {
-		cnts = atomic_read(&lock->cnts);
-		if ((cnts == _QW_WAITING) &&
-		    (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
-					    _QW_LOCKED) == _QW_WAITING))
-			break;
+	smp_cond_load_acquire(&lock->cnts.counter,
+		(VAL == _QW_WAITING) &&
+		atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING, _QW_LOCKED) == _QW_WAITING);
 
-		cpu_relax_lowlatency();
-	}
 unlock:
 	arch_spin_unlock(&lock->wait_lock);
 }

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC][PATCH 3/3] locking,arm64: Introduce cmpwait()
  2016-04-04 12:22 [RFC][PATCH 0/3] smp_cond_load_acquire + cmpwait Peter Zijlstra
  2016-04-04 12:22 ` [RFC][PATCH 1/3] locking: Replace smp_cond_acquire with smp_cond_load_acquire Peter Zijlstra
  2016-04-04 12:22 ` [RFC][PATCH 2/3] locking/qrwlock: Use smp_cond_load_acquire() Peter Zijlstra
@ 2016-04-04 12:22 ` Peter Zijlstra
  2016-04-04 13:12   ` Peter Zijlstra
  2016-04-12 16:59   ` Will Deacon
  2 siblings, 2 replies; 18+ messages in thread
From: Peter Zijlstra @ 2016-04-04 12:22 UTC (permalink / raw)
  To: linux-kernel, will.deacon
  Cc: waiman.long, mingo, paulmck, boqun.feng, torvalds, dave, peterz

[-- Attachment #1: peterz-locking-cmp_and_wait.patch --]
[-- Type: text/plain, Size: 4947 bytes --]

Provide the cmpwait() primitive, which will 'spin' wait for a variable
to change. Use it to implement smp_cond_load_acquire() and provide an
ARM64 implementation.

The ARM64 implementation uses LDXR+WFE to avoid most spinning by
letting the hardware go idle while waiting for the exclusive load of
the variable to be cancelled (as anybody changing the value must).

I've misplaced my arm64 compiler, so this is not even compile tested.

Suggested-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/arm64/include/asm/cmpxchg.h |   36 +++++++++++++++++++++++++++++
 include/linux/atomic.h           |   47 +++++++++++++++++++++++++++++++++++++++
 include/linux/compiler.h         |   30 ------------------------
 3 files changed, 83 insertions(+), 30 deletions(-)

--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -224,4 +224,40 @@ __CMPXCHG_GEN(_mb)
 	__ret;								\
 })
 
+#define __CMPWAIT_GEN(w, sz, name)					\
+void __cmpwait_case_##name(volatile void *ptr, unsigned long val)	\
+{									\
+	unsigned long tmp;						\
+									\
+	asm volatile(							\
+	"	ldxr" #sz "\t%" #w "[tmp], %[v]\n"			\
+	"	eor	%" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"	\
+	"	cbnz	%" #w "[tmp], 1f\n"				\
+	"	wfe\n"							\
+	"1:"								\
+	: [tmp] "=&r" (tmp), [val] "=&r" (val),				\
+	  [v] "+Q" (*(unsigned long *)ptr));				\
+}
+
+__CMPWAIT_GEN(w, b, 1);
+__CMPWAIT_GEN(w, h, 2);
+__CMPWAIT_GEN(w,  , 4);
+__CMPWAIT_GEN( ,  , 8);
+
+static inline void __cmpwait(volatile void *ptr, unsigned long val, int size)
+{
+	switch (size) {
+	case 1: return __cmpwait_case_1(ptr, val);
+	case 2: return __cmpwait_case_2(ptr, val);
+	case 4: return __cmpwait_case_4(ptr, val);
+	case 8: return __cmpwait_case_8(ptr, val);
+	default: BUILD_BUG();
+	}
+
+	unreachable();
+}
+
+#define cmpwait(ptr, val) \
+	__cmpwait((ptr), (unsigned long)(val), sizeof(*(ptr)))
+
 #endif	/* __ASM_CMPXCHG_H */
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -30,6 +30,53 @@
 #define  atomic_set_release(v, i)	smp_store_release(&(v)->counter, (i))
 #endif
 
+/**
+ * cmpwait - compare and wait for a variable to change
+ * @ptr: pointer to the variable to wait on
+ * @val: the value it should change from
+ *
+ * A simple construct that waits for a variable to change from a known
+ * value; some architectures can do this in hardware.
+ */
+#ifndef cmpwait
+#define cmpwait(ptr, val) do {					\
+	typeof (ptr) __ptr = (ptr);				\
+	typeof (val) __val = (val);				\
+	while (READ_ONCE(*__ptr) == __val)			\
+		cpu_relax();					\
+} while (0)
+#endif
+
+/**
+ * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering
+ * @ptr: pointer to the variable to wait on
+ * @cond: boolean expression to wait for
+ *
+ * Equivalent to using smp_load_acquire() on the condition variable but employs
+ * the control dependency of the wait to reduce the barrier on many platforms.
+ *
+ * Due to C lacking lambda expressions we load the value of *ptr into a
+ * pre-named variable @VAL to be used in @cond.
+ *
+ * The control dependency provides a LOAD->STORE order, the additional RMB
+ * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
+ * aka. ACQUIRE.
+ */
+#ifndef smp_cond_load_acquire
+#define smp_cond_load_acquire(ptr, cond_expr) ({		\
+	typeof(ptr) __PTR = (ptr);				\
+	typeof(*ptr) VAL;					\
+	for (;;) {						\
+		VAL = READ_ONCE(*__PTR);			\
+		if (cond_expr)					\
+			break;					\
+		cmpwait(__PTR, VAL);				\
+	}							\
+	smp_rmb(); /* ctrl + rmb := acquire */			\
+	VAL;							\
+})
+#endif
+
 /*
  * The idea here is to build acquire/release variants by adding explicit
  * barriers on top of the relaxed variant. In the case where the relaxed
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -304,36 +304,6 @@ static __always_inline void __write_once
 	__u.__val;					\
 })
 
-/**
- * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering
- * @ptr: pointer to the variable to wait on
- * @cond: boolean expression to wait for
- *
- * Equivalent to using smp_load_acquire() on the condition variable but employs
- * the control dependency of the wait to reduce the barrier on many platforms.
- *
- * Due to C lacking lambda expressions we load the value of *ptr into a
- * pre-named variable @VAL to be used in @cond.
- *
- * The control dependency provides a LOAD->STORE order, the additional RMB
- * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
- * aka. ACQUIRE.
- */
-#ifndef smp_cond_load_acquire
-#define smp_cond_load_acquire(ptr, cond_expr) ({		\
-	typeof(ptr) __PTR = (ptr);				\
-	typeof(*ptr) VAL;					\
-	for (;;) {						\
-		VAL = READ_ONCE(*__PTR);			\
-		if (cond_expr)					\
-			break;					\
-		cpu_relax();					\
-	}							\
-	smp_rmb(); /* ctrl + rmb := acquire */			\
-	VAL;							\
-})
-#endif
-
 #endif /* __KERNEL__ */
 
 #endif /* __ASSEMBLY__ */

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 3/3] locking,arm64: Introduce cmpwait()
  2016-04-04 12:22 ` [RFC][PATCH 3/3] locking,arm64: Introduce cmpwait() Peter Zijlstra
@ 2016-04-04 13:12   ` Peter Zijlstra
  2016-04-12 16:59   ` Will Deacon
  1 sibling, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2016-04-04 13:12 UTC (permalink / raw)
  To: linux-kernel, will.deacon
  Cc: waiman.long, mingo, paulmck, boqun.feng, torvalds, dave

On Mon, Apr 04, 2016 at 02:22:53PM +0200, Peter Zijlstra wrote:
> +#define __CMPWAIT_GEN(w, sz, name)					\

 +static inline  \

> +void __cmpwait_case_##name(volatile void *ptr, unsigned long val)	\
> +{									\
> +	unsigned long tmp;						\
> +									\
> +	asm volatile(							\
> +	"	ldxr" #sz "\t%" #w "[tmp], %[v]\n"			\
> +	"	eor	%" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"	\
> +	"	cbnz	%" #w "[tmp], 1f\n"				\
> +	"	wfe\n"							\
> +	"1:"								\
> +	: [tmp] "=&r" (tmp), [val] "=&r" (val),				\
> +	  [v] "+Q" (*(unsigned long *)ptr));				\

And this probably wants a "memory" clobber to force values to be reloaded
after this returns.

> +}
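
A hypothetical sketch of the 4-byte case with such a clobber (the
function name and the use of val as a plain input operand are mine, not
from the patch):

	static inline void __cmpwait_case_4_sketch(volatile void *ptr, unsigned long val)
	{
		unsigned long tmp;

		asm volatile(
		"	ldxr	%w[tmp], %[v]\n"
		"	eor	%w[tmp], %w[tmp], %w[val]\n"
		"	cbnz	%w[tmp], 1f\n"
		"	wfe\n"
		"1:"
		: [tmp] "=&r" (tmp), [v] "+Q" (*(unsigned long *)ptr)
		: [val] "r" (val)
		: "memory");	/* force the compiler to reload *ptr afterwards */
	}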

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 1/3] locking: Replace smp_cond_acquire with smp_cond_load_acquire
  2016-04-04 12:22 ` [RFC][PATCH 1/3] locking: Replace smp_cond_acquire with smp_cond_load_acquire Peter Zijlstra
@ 2016-04-04 18:20   ` Waiman Long
  0 siblings, 0 replies; 18+ messages in thread
From: Waiman Long @ 2016-04-04 18:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, will.deacon, mingo, paulmck, boqun.feng, torvalds, dave

On 04/04/2016 08:22 AM, Peter Zijlstra wrote:
> This new form allows using hardware assisted waiting.
>
> Requested-by: Will Deacon<will.deacon@arm.com>
> Suggested-by: Linus Torvalds<torvalds@linux-foundation.org>
> Signed-off-by: Peter Zijlstra (Intel)<peterz@infradead.org>
> ---
>   include/linux/compiler.h   |   25 +++++++++++++++++++------
>   kernel/locking/qspinlock.c |   12 ++++++------
>   kernel/sched/core.c        |    8 ++++----
>   kernel/sched/sched.h       |    2 +-
>   kernel/smp.c               |    2 +-
>   5 files changed, 31 insertions(+), 18 deletions(-)
>
> --- a/include/linux/compiler.h
> +++ b/include/linux/compiler.h
> @@ -305,21 +305,34 @@ static __always_inline void __write_once
>   })
>
>   /**
> - * smp_cond_acquire() - Spin wait for cond with ACQUIRE ordering
> + * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering
> + * @ptr: pointer to the variable to wait on
>    * @cond: boolean expression to wait for
>    *
>    * Equivalent to using smp_load_acquire() on the condition variable but employs
>    * the control dependency of the wait to reduce the barrier on many platforms.
>    *
> + * Due to C lacking lambda expressions we load the value of *ptr into a
> + * pre-named variable @VAL to be used in @cond.
> + *
>    * The control dependency provides a LOAD->STORE order, the additional RMB
>    * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
>    * aka. ACQUIRE.
>    */
> -#define smp_cond_acquire(cond)	do {		\
> -	while (!(cond))				\
> -		cpu_relax();			\
> -	smp_rmb(); /* ctrl + rmb := acquire */	\
> -} while (0)
> +#ifndef smp_cond_load_acquire
> +#define smp_cond_load_acquire(ptr, cond_expr) ({		\
> +	typeof(ptr) __PTR = (ptr);				\
> +	typeof(*ptr) VAL;					\
> +	for (;;) {						\
> +		VAL = READ_ONCE(*__PTR);			\
> +		if (cond_expr)					\
> +			break;					\
> +		cpu_relax();					\
> +	}							\
> +	smp_rmb(); /* ctrl + rmb := acquire */			\
> +	VAL;							\
> +})
> +#endif

Using a predefined VAL seems a bit awkward, as a reader of the code may 
not know where VAL comes from. How about passing in a variable to hold 
the latest value of (*ptr), e.g.

#ifndef smp_cond_load_acquire
#define smp_cond_load_acquire(ptr, var, cond_expr) do {         \
         typeof(ptr) __PTR = (ptr);                              \
         for (;;) {                                              \
                 var = READ_ONCE(*__PTR);                        \
                 if (cond_expr)                                  \
                         break;                                  \
                 cpu_relax();                                    \
         }                                                       \
         smp_rmb(); /* ctrl + rmb := acquire */                  \
} while (0)
#endif
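
For comparison, the qspinlock call site from the patch would then read
something like this (hypothetical, following the proposed signature; the
do-while form no longer returns the loaded value, so the final value is
picked up through the named variable instead):

	/* current patch, pre-named VAL, returns the final value: */
	val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));

	/* proposed form, caller names the variable explicitly: */
	smp_cond_load_acquire(&lock->val.counter, val, !(val & _Q_LOCKED_PENDING_MASK));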

Cheers,
Longman

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 2/3] locking/qrwlock: Use smp_cond_load_acquire()
  2016-04-04 12:22 ` [RFC][PATCH 2/3] locking/qrwlock: Use smp_cond_load_acquire() Peter Zijlstra
@ 2016-04-12  4:58   ` Davidlohr Bueso
  2016-04-12 16:45     ` Waiman Long
  0 siblings, 1 reply; 18+ messages in thread
From: Davidlohr Bueso @ 2016-04-12  4:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, will.deacon, waiman.long, mingo, paulmck,
	boqun.feng, torvalds

On Mon, 04 Apr 2016, Peter Zijlstra wrote:

>Use smp_cond_load_acquire() to make better use of the hardware
>assisted 'spin' wait on arm64.
>
>Arguably the second hunk is the more horrid abuse possible, but
>avoids having to use cmpwait (see next patch) directly. Also, this
>makes 'clever' (ab)use of the cond+rmb acquire to omit the acquire
>from cmpxchg().
>
>Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>---
> kernel/locking/qrwlock.c |   18 ++++--------------
> 1 file changed, 4 insertions(+), 14 deletions(-)
>
>--- a/kernel/locking/qrwlock.c
>+++ b/kernel/locking/qrwlock.c
>@@ -53,10 +53,7 @@ struct __qrwlock {
> static __always_inline void
> rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
> {
>-	while ((cnts & _QW_WMASK) == _QW_LOCKED) {
>-		cpu_relax_lowlatency();
>-		cnts = atomic_read_acquire(&lock->cnts);
>-	}
>+	smp_cond_load_acquire(&lock->cnts.counter, (VAL & _QW_WMASK) != _QW_LOCKED);
> }
>
> /**
>@@ -109,8 +106,6 @@ EXPORT_SYMBOL(queued_read_lock_slowpath)
>  */
> void queued_write_lock_slowpath(struct qrwlock *lock)
> {
>-	u32 cnts;
>-
> 	/* Put the writer into the wait queue */
> 	arch_spin_lock(&lock->wait_lock);
>
>@@ -134,15 +129,10 @@ void queued_write_lock_slowpath(struct q
> 	}
>
> 	/* When no more readers, set the locked flag */
>-	for (;;) {
>-		cnts = atomic_read(&lock->cnts);
>-		if ((cnts == _QW_WAITING) &&
>-		    (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
>-					    _QW_LOCKED) == _QW_WAITING))
>-			break;
>+	smp_cond_load_acquire(&lock->cnts.counter,
>+		(VAL == _QW_WAITING) &&
>+		atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING, _QW_LOCKED) == _QW_WAITING);
>
>-		cpu_relax_lowlatency();

You would need some variant of cpu_relax_lowlatency(), otherwise you'll be hurting s390, no?
fwiw, back when I was looking at this I recall thinking about possibly introducing
smp_cond_acquire_lowlatency, but never got around to it; something like the sketch below.
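
Such a variant could just be the generic fallback with the relax
primitive swapped out, e.g. (sketch only; the name merely adapts the
suggestion above to the new load form):

	#ifndef smp_cond_load_acquire_lowlatency
	#define smp_cond_load_acquire_lowlatency(ptr, cond_expr) ({	\
		typeof(ptr) __PTR = (ptr);				\
		typeof(*ptr) VAL;					\
		for (;;) {						\
			VAL = READ_ONCE(*__PTR);			\
			if (cond_expr)					\
				break;					\
			cpu_relax_lowlatency();	/* what s390 wants */	\
		}							\
		smp_rmb(); /* ctrl + rmb := acquire */			\
		VAL;							\
	})
	#endif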

Thanks,
Davidlohr

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 2/3] locking/qrwlock: Use smp_cond_load_acquire()
  2016-04-12  4:58   ` Davidlohr Bueso
@ 2016-04-12 16:45     ` Waiman Long
  0 siblings, 0 replies; 18+ messages in thread
From: Waiman Long @ 2016-04-12 16:45 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Peter Zijlstra, linux-kernel, will.deacon, mingo, paulmck,
	boqun.feng, torvalds

On 04/12/2016 12:58 AM, Davidlohr Bueso wrote:
> On Mon, 04 Apr 2016, Peter Zijlstra wrote:
>
>> Use smp_cond_load_acquire() to make better use of the hardware
>> assisted 'spin' wait on arm64.
>>
>> Arguably the second hunk is the more horrid abuse possible, but
>> avoids having to use cmpwait (see next patch) directly. Also, this
>> makes 'clever' (ab)use of the cond+rmb acquire to omit the acquire
>> from cmpxchg().
>>
>> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> ---
>> kernel/locking/qrwlock.c |   18 ++++--------------
>> 1 file changed, 4 insertions(+), 14 deletions(-)
>>
>> --- a/kernel/locking/qrwlock.c
>> +++ b/kernel/locking/qrwlock.c
>> @@ -53,10 +53,7 @@ struct __qrwlock {
>> static __always_inline void
>> rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
>> {
>> -    while ((cnts & _QW_WMASK) == _QW_LOCKED) {
>> -        cpu_relax_lowlatency();
>> -        cnts = atomic_read_acquire(&lock->cnts);
>> -    }
>> +    smp_cond_load_acquire(&lock->cnts.counter, (VAL & _QW_WMASK) != 
>> _QW_LOCKED);
>> }
>>
>> /**
>> @@ -109,8 +106,6 @@ EXPORT_SYMBOL(queued_read_lock_slowpath)
>>  */
>> void queued_write_lock_slowpath(struct qrwlock *lock)
>> {
>> -    u32 cnts;
>> -
>>     /* Put the writer into the wait queue */
>>     arch_spin_lock(&lock->wait_lock);
>>
>> @@ -134,15 +129,10 @@ void queued_write_lock_slowpath(struct q
>>     }
>>
>>     /* When no more readers, set the locked flag */
>> -    for (;;) {
>> -        cnts = atomic_read(&lock->cnts);
>> -        if ((cnts == _QW_WAITING) &&
>> -            (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
>> -                        _QW_LOCKED) == _QW_WAITING))
>> -            break;
>> +    smp_cond_load_acquire(&lock->cnts.counter,
>> +        (VAL == _QW_WAITING) &&
>> +        atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING, _QW_LOCKED) 
>> == _QW_WAITING);
>>
>> -        cpu_relax_lowlatency();
>
> You would need some variant for cpu_relax_lowlatency otherwise you'll 
> be hurting s390, no?
> fwiw back when I was looking at this, I recall thinking about possibly 
> introducing
> smp_cond_acquire_lowlatency but never got around to it.
>
> Thanks,
> Davidlohr

The qrwlock is currently only used on the x86 architecture. We can revisit 
this issue when other architectures that need the lowlatency variants 
start using qrwlock.

Cheers,
Longman

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 3/3] locking,arm64: Introduce cmpwait()
  2016-04-04 12:22 ` [RFC][PATCH 3/3] locking,arm64: Introduce cmpwait() Peter Zijlstra
  2016-04-04 13:12   ` Peter Zijlstra
@ 2016-04-12 16:59   ` Will Deacon
  2016-04-13 12:52     ` Peter Zijlstra
  2016-04-22 16:08     ` Boqun Feng
  1 sibling, 2 replies; 18+ messages in thread
From: Will Deacon @ 2016-04-12 16:59 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, waiman.long, mingo, paulmck, boqun.feng, torvalds, dave

Hi Peter,

Thanks for looking at this!

On Mon, Apr 04, 2016 at 02:22:53PM +0200, Peter Zijlstra wrote:
> Provide the cmpwait() primitive, which will 'spin' wait for a variable
> to change. Use it to implement smp_cond_load_acquire() and provide an
> ARM64 implementation.
> 
> The ARM64 implementation uses LDXR+WFE to avoid most spinning by
> letting the hardware go idle while waiting for the exclusive load of
> the variable to be cancelled (as anybody changing the value must).
> 
> I've misplaced my arm64 compiler, so this is not even compile tested.

Guess what? ;)

init/do_mounts_initrd.o: In function `__cmpwait_case_1':
/home/will/work/aarch32/linux/linux/./arch/arm64/include/asm/cmpxchg.h:242: multiple definition of `__cmpwait_case_1'
init/do_mounts.o:/home/will/work/aarch32/linux/linux/./arch/arm64/include/asm/cmpxchg.h:242: first defined here
init/do_mounts_initrd.o: In function `__cmpwait_case_2':
/home/will/work/aarch32/linux/linux/./arch/arm64/include/asm/cmpxchg.h:243: multiple definition of `__cmpwait_case_2'
init/do_mounts.o:/home/will/work/aarch32/linux/linux/./arch/arm64/include/asm/cmpxchg.h:243: first defined here
init/do_mounts_initrd.o: In function `__cmpwait_case_4':
/home/will/work/aarch32/linux/linux/./arch/arm64/include/asm/cmpxchg.h:244: multiple definition of `__cmpwait_case_4'
init/do_mounts.o:/home/will/work/aarch32/linux/linux/./arch/arm64/include/asm/cmpxchg.h:244: first defined here
init/do_mounts_initrd.o: In function `__cmpwait_case_8':
/home/will/work/aarch32/linux/linux/./arch/arm64/include/asm/cmpxchg.h:245: multiple definition of `__cmpwait_case_8'
init/do_mounts.o:/home/will/work/aarch32/linux/linux/./arch/arm64/include/asm/cmpxchg.h:245: first defined here
make[1]: *** [init/mounts.o] Error 1
make: *** [init] Error 2
make: *** Waiting for unfinished jobs....

(and lot of similar errors).

Looks like you're just missing an #undef in cmpxchg.h.

FWIW, you can pick up arm64 toolchain binaries from:

  https://releases.linaro.org/components/toolchain/binaries/latest-5/

> Suggested-by: Will Deacon <will.deacon@arm.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/arm64/include/asm/cmpxchg.h |   36 +++++++++++++++++++++++++++++
>  include/linux/atomic.h           |   47 +++++++++++++++++++++++++++++++++++++++
>  include/linux/compiler.h         |   30 ------------------------
>  3 files changed, 83 insertions(+), 30 deletions(-)
> 
> --- a/arch/arm64/include/asm/cmpxchg.h
> +++ b/arch/arm64/include/asm/cmpxchg.h
> @@ -224,4 +224,40 @@ __CMPXCHG_GEN(_mb)
>  	__ret;								\
>  })
>  
> +#define __CMPWAIT_GEN(w, sz, name)					\
> +void __cmpwait_case_##name(volatile void *ptr, unsigned long val)	\
> +{									\
> +	unsigned long tmp;						\
> +									\
> +	asm volatile(							\
> +	"	ldxr" #sz "\t%" #w "[tmp], %[v]\n"			\
> +	"	eor	%" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"	\
> +	"	cbnz	%" #w "[tmp], 1f\n"				\

Shouldn't this be cbz? (i.e. branch over the wfe if the value is equal
to what we wanted?).

> +	"	wfe\n"							\
> +	"1:"								\
> +	: [tmp] "=&r" (tmp), [val] "=&r" (val),				\

We only read val, so it can be an input operand, no?

> +	  [v] "+Q" (*(unsigned long *)ptr));				\
> +}
> +
> +__CMPWAIT_GEN(w, b, 1);
> +__CMPWAIT_GEN(w, h, 2);
> +__CMPWAIT_GEN(w,  , 4);
> +__CMPWAIT_GEN( ,  , 8);
> +
> +static inline void __cmpwait(volatile void *ptr, unsigned long val, int size)
> +{
> +	switch (size) {
> +	case 1: return __cmpwait_case_1(ptr, val);
> +	case 2: return __cmpwait_case_2(ptr, val);
> +	case 4: return __cmpwait_case_4(ptr, val);
> +	case 8: return __cmpwait_case_8(ptr, val);
> +	default: BUILD_BUG();
> +	}
> +
> +	unreachable();
> +}
> +
> +#define cmpwait(ptr, val) \
> +	__cmpwait((ptr), (unsigned long)(val), sizeof(*(ptr)))

We might want to call this cmpwait_relaxed, in case we decide to add
fenced versions in the future. Or just make it cmpwait_acquire and
remove the smp_rmb() from smp_cond_load_acquire(). Dunno.

Will

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 3/3] locking,arm64: Introduce cmpwait()
  2016-04-12 16:59   ` Will Deacon
@ 2016-04-13 12:52     ` Peter Zijlstra
  2016-04-26 16:33       ` Will Deacon
  2016-04-22 16:08     ` Boqun Feng
  1 sibling, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2016-04-13 12:52 UTC (permalink / raw)
  To: Will Deacon
  Cc: linux-kernel, waiman.long, mingo, paulmck, boqun.feng, torvalds, dave

On Tue, Apr 12, 2016 at 05:59:41PM +0100, Will Deacon wrote:
> Thanks for looking at this!

n/p, had to see what it would look like etc. :-)

> > I've misplaced my arm64 compiler, so this is not even compile tested.
> 
> Guess what? ;)
> 

> make: *** [init] Error 2
> make: *** Waiting for unfinished jobs....
> 
> (and lot of similar errors).
> 
> Looks like you're just missing an #undef in cmpxchg.h.
> 
> FWIW, you can pick up arm64 toolchain binaries from:
> 
>   https://releases.linaro.org/components/toolchain/binaries/latest-5/

Ah, I usually build a whole set from sources; for some reason arm64
didn't build in the latest run. I'll have to kick it.

> > +#define __CMPWAIT_GEN(w, sz, name)					\
> > +void __cmpwait_case_##name(volatile void *ptr, unsigned long val)	\
> > +{									\
> > +	unsigned long tmp;						\
> > +									\
> > +	asm volatile(							\
> > +	"	ldxr" #sz "\t%" #w "[tmp], %[v]\n"			\
> > +	"	eor	%" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"	\
> > +	"	cbnz	%" #w "[tmp], 1f\n"				\
> 
> Shouldn't this be cbz? (i.e. branch over the wfe if the value is equal
> to what we wanted?).

Indeed so.

> > +	"	wfe\n"							\
> > +	"1:"								\
> > +	: [tmp] "=&r" (tmp), [val] "=&r" (val),				\
> 
> We only read val, so it can be an input operand, no?

True.. :-)

> > +#define cmpwait(ptr, val) \
> > +	__cmpwait((ptr), (unsigned long)(val), sizeof(*(ptr)))
> 
> We might want to call this cmpwait_relaxed, in case we decide to add
> fenced versions in the future. Or just make it cmpwait_acquire and
> remove the smp_rmb() from smp_cond_load_acquire(). Dunno.

This is something I'll very much leave up to you. I have no idea on the
tradeoffs involved here.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 3/3] locking,arm64: Introduce cmpwait()
  2016-04-12 16:59   ` Will Deacon
  2016-04-13 12:52     ` Peter Zijlstra
@ 2016-04-22 16:08     ` Boqun Feng
  2016-04-22 16:53       ` Will Deacon
  2016-04-23  2:37       ` Peter Zijlstra
  1 sibling, 2 replies; 18+ messages in thread
From: Boqun Feng @ 2016-04-22 16:08 UTC (permalink / raw)
  To: Will Deacon
  Cc: Peter Zijlstra, linux-kernel, waiman.long, mingo, paulmck,
	torvalds, dave

On Tue, Apr 12, 2016 at 05:59:41PM +0100, Will Deacon wrote:
[...]
> > +static inline void __cmpwait(volatile void *ptr, unsigned long val, int size)
> > +{
> > +	switch (size) {
> > +	case 1: return __cmpwait_case_1(ptr, val);
> > +	case 2: return __cmpwait_case_2(ptr, val);
> > +	case 4: return __cmpwait_case_4(ptr, val);
> > +	case 8: return __cmpwait_case_8(ptr, val);
> > +	default: BUILD_BUG();
> > +	}
> > +
> > +	unreachable();
> > +}
> > +
> > +#define cmpwait(ptr, val) \
> > +	__cmpwait((ptr), (unsigned long)(val), sizeof(*(ptr)))
> 
> We might want to call this cmpwait_relaxed, in case we decide to add
> fenced versions in the future. Or just make it cmpwait_acquire and
> remove the smp_rmb() from smp_cond_load_acquire(). Dunno.
> 

How about replacing smp_rmb() with a smp_acquire_barrier__after_cmpwait()?
This barrier would be designed to provide ACQUIRE ordering when combined
with a cmpwait().

And cmpwait() only has a minimal ordering guarantee, but if it is actually
an ACQUIRE, then the corresponding smp_acquire_barrier__after_cmpwait()
is just empty.

We might need this special barrier on ppc, because we can implement it
with "isync", given that cmpwait() has a control dependency and ctrl+isync
is ACQUIRE on ppc.
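
Concretely, the generic side of that could be as small as (sketch; the
barrier name is the one proposed above):

	/* default: keep the existing ctrl + rmb trick */
	#ifndef smp_acquire_barrier__after_cmpwait
	#define smp_acquire_barrier__after_cmpwait()	smp_rmb()
	#endif

	/*
	 * smp_cond_load_acquire() would then end with
	 * smp_acquire_barrier__after_cmpwait() instead of a bare smp_rmb(),
	 * leaving ppc free to override it as described.
	 */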

Thoughts?

Regards,
Boqun

> Will

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 3/3] locking,arm64: Introduce cmpwait()
  2016-04-22 16:08     ` Boqun Feng
@ 2016-04-22 16:53       ` Will Deacon
  2016-04-23  4:02         ` Boqun Feng
  2016-04-23  2:37       ` Peter Zijlstra
  1 sibling, 1 reply; 18+ messages in thread
From: Will Deacon @ 2016-04-22 16:53 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Peter Zijlstra, linux-kernel, waiman.long, mingo, paulmck,
	torvalds, dave

On Sat, Apr 23, 2016 at 12:08:57AM +0800, Boqun Feng wrote:
> On Tue, Apr 12, 2016 at 05:59:41PM +0100, Will Deacon wrote:
> [...]
> > > +static inline void __cmpwait(volatile void *ptr, unsigned long val, int size)
> > > +{
> > > +	switch (size) {
> > > +	case 1: return __cmpwait_case_1(ptr, val);
> > > +	case 2: return __cmpwait_case_2(ptr, val);
> > > +	case 4: return __cmpwait_case_4(ptr, val);
> > > +	case 8: return __cmpwait_case_8(ptr, val);
> > > +	default: BUILD_BUG();
> > > +	}
> > > +
> > > +	unreachable();
> > > +}
> > > +
> > > +#define cmpwait(ptr, val) \
> > > +	__cmpwait((ptr), (unsigned long)(val), sizeof(*(ptr)))
> > 
> > We might want to call this cmpwait_relaxed, in case we decide to add
> > fenced versions in the future. Or just make it cmpwait_acquire and
> > remove the smp_rmb() from smp_cond_load_acquire(). Dunno.
> > 
> 
> How about replace smp_rmb() with a smp_acquire_barrier__after_cmpwait()?
> This barrier is designed to provide an ACQUIRE ordering when combining a
> cmpwait() .
> 
> And cmpwait() only has minimal ordering guarantee, but if it is actually
> an ACQUIRE, then the corresponding smp_acquire_barrier__after_cmpwait()
> is just empty.

Maybe, but that makes it difficult for me to use a load-acquire instruction
for the ACQUIRE case.
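
(For illustration, the alternative would be to build the ACQUIRE variant
around the load itself, e.g. something like the sketch below, where
smp_load_acquire() can use a load-acquire instruction such as LDAR on
arm64; hypothetical, not part of this series.)

	#define smp_cond_load_acquire(ptr, cond_expr) ({		\
		typeof(ptr) __PTR = (ptr);				\
		typeof(*ptr) VAL;					\
		for (;;) {						\
			/* the ACQUIRE comes from the satisfying load */ \
			VAL = smp_load_acquire(__PTR);			\
			if (cond_expr)					\
				break;					\
			cmpwait(__PTR, VAL);				\
		}							\
		VAL;							\
	})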

Will

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 3/3] locking,arm64: Introduce cmpwait()
  2016-04-22 16:08     ` Boqun Feng
  2016-04-22 16:53       ` Will Deacon
@ 2016-04-23  2:37       ` Peter Zijlstra
  2016-04-23  3:40         ` Boqun Feng
  1 sibling, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2016-04-23  2:37 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Will Deacon, linux-kernel, waiman.long, mingo, paulmck, torvalds, dave

On Sat, Apr 23, 2016 at 12:08:57AM +0800, Boqun Feng wrote:
> How about replace smp_rmb() with a smp_acquire_barrier__after_cmpwait()?
> This barrier is designed to provide an ACQUIRE ordering when combining a
> cmpwait() .

That's a horrible name for a barrier :-)

> And cmpwait() only has minimal ordering guarantee, but if it is actually
> an ACQUIRE, then the corresponding smp_acquire_barrier__after_cmpwait()
> is just empty.
> 
> We might need this special barrier on ppc, because we can implement it
> with "isync" given that cmpwait() has control dependency and ctrl+isync
> is ACQUIRE on ppc.
> 
> Thoughts?

Provide a PPC-specific smp_cond_load_acquire() using ISYNC?
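
Something along these lines, perhaps (hypothetical sketch; it relies on
the control dependency of the loop plus isync forming an ACQUIRE on ppc,
as described above):

	#define smp_cond_load_acquire(ptr, cond_expr) ({		\
		typeof(ptr) __PTR = (ptr);				\
		typeof(*ptr) VAL;					\
		for (;;) {						\
			VAL = READ_ONCE(*__PTR);			\
			if (cond_expr)					\
				break;					\
			cmpwait(__PTR, VAL);				\
		}							\
		/* ctrl + isync := acquire on ppc */			\
		__asm__ __volatile__("isync" : : : "memory");		\
		VAL;							\
	})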

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 3/3] locking,arm64: Introduce cmpwait()
  2016-04-23  2:37       ` Peter Zijlstra
@ 2016-04-23  3:40         ` Boqun Feng
  0 siblings, 0 replies; 18+ messages in thread
From: Boqun Feng @ 2016-04-23  3:40 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Deacon, linux-kernel, waiman.long, mingo, paulmck, torvalds, dave

On Sat, Apr 23, 2016 at 04:37:36AM +0200, Peter Zijlstra wrote:
> On Sat, Apr 23, 2016 at 12:08:57AM +0800, Boqun Feng wrote:
> > How about replace smp_rmb() with a smp_acquire_barrier__after_cmpwait()?
> > This barrier is designed to provide an ACQUIRE ordering when combining a
> > cmpwait() .
> 
> That's a horrible name for a barrier :-)
> 

Indeed, and having another special barrier is a pain.

> > And cmpwait() only has minimal ordering guarantee, but if it is actually
> > an ACQUIRE, then the corresponding smp_acquire_barrier__after_cmpwait()
> > is just empty.
> > 
> > We might need this special barrier on ppc, because we can implement it
> > with "isync" given that cmpwait() has control dependency and ctrl+isync
> > is ACQUIRE on ppc.
> > 
> > Thoughts?
> 
> Provide a PPC specific smp_cond_load_acquire() using ISYNC ?

That works, but I should do more investigation on that. I brought this
up because ISYNC may be better than smp_rmb() in two cases, IIUC:

1.	for old systems without LWSYNC, smp_rmb() is actually SYNC,
	which is heavier than ISYNC.

2.	for new systems, isync may be a little faster than lwsync,
	according to Michael Ellerman: http://lkml.kernel.org/g/1437012028.28475.2.camel@ellerman.id.au

but I doubt these two cases will have any observable performance impact,
so the current version of smp_cond_load_acquire() in your patch is fine,
and we can switch to a specific one in the future if we want to ;-)

Regards,
Boqun

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 3/3] locking,arm64: Introduce cmpwait()
  2016-04-22 16:53       ` Will Deacon
@ 2016-04-23  4:02         ` Boqun Feng
  0 siblings, 0 replies; 18+ messages in thread
From: Boqun Feng @ 2016-04-23  4:02 UTC (permalink / raw)
  To: Will Deacon
  Cc: Peter Zijlstra, linux-kernel, waiman.long, mingo, paulmck,
	torvalds, dave

On Fri, Apr 22, 2016 at 05:53:11PM +0100, Will Deacon wrote:
> On Sat, Apr 23, 2016 at 12:08:57AM +0800, Boqun Feng wrote:
> > On Tue, Apr 12, 2016 at 05:59:41PM +0100, Will Deacon wrote:
> > [...]
> > > > +static inline void __cmpwait(volatile void *ptr, unsigned long val, int size)
> > > > +{
> > > > +	switch (size) {
> > > > +	case 1: return __cmpwait_case_1(ptr, val);
> > > > +	case 2: return __cmpwait_case_2(ptr, val);
> > > > +	case 4: return __cmpwait_case_4(ptr, val);
> > > > +	case 8: return __cmpwait_case_8(ptr, val);
> > > > +	default: BUILD_BUG();
> > > > +	}
> > > > +
> > > > +	unreachable();
> > > > +}
> > > > +
> > > > +#define cmpwait(ptr, val) \
> > > > +	__cmpwait((ptr), (unsigned long)(val), sizeof(*(ptr)))
> > > 
> > > We might want to call this cmpwait_relaxed, in case we decide to add
> > > fenced versions in the future. Or just make it cmpwait_acquire and
> > > remove the smp_rmb() from smp_cond_load_acquire(). Dunno.
> > > 
> > 
> > How about replace smp_rmb() with a smp_acquire_barrier__after_cmpwait()?
> > This barrier is designed to provide an ACQUIRE ordering when combining a
> > cmpwait() .
> > 
> > And cmpwait() only has minimal ordering guarantee, but if it is actually
> > an ACQUIRE, then the corresponding smp_acquire_barrier__after_cmpwait()
> > is just empty.
> 
> Maybe, but that makes it difficult for me to use a load-acquire instruction
> for the ACQUIRE case.
> 

You're right, I was missing that point. Please ignore this proposal.

So I think having a cmpwait_relaxed makes more sense for people who know
they can only rely on the control dependency? Or maybe we don't actually
want to encourage that kind of usage ;-)

Regards,
Boqun

> Will

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 3/3] locking,arm64: Introduce cmpwait()
  2016-04-13 12:52     ` Peter Zijlstra
@ 2016-04-26 16:33       ` Will Deacon
  2016-04-26 17:15         ` Will Deacon
  0 siblings, 1 reply; 18+ messages in thread
From: Will Deacon @ 2016-04-26 16:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, waiman.long, mingo, paulmck, boqun.feng, torvalds, dave

On Wed, Apr 13, 2016 at 02:52:43PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 12, 2016 at 05:59:41PM +0100, Will Deacon wrote:
> > Thanks for looking at this!
> 
> n/p, had to se what it would look like etc.. :-)
> 
> > > I've misplaced my arm64 compiler, so this is not even compile tested.
> > 
> > Guess what? ;)
> > 
> 
> > make: *** [init] Error 2
> > make: *** Waiting for unfinished jobs....
> > 
> > (and lot of similar errors).
> > 
> > Looks like you're just missing an #undef in cmpxchg.h.
> > 
> > FWIW, you can pick up arm64 toolchain binaries from:
> > 
> >   https://releases.linaro.org/components/toolchain/binaries/latest-5/
> 
> Ah, I usually build a whole set from sources; for some reason arm64
> didn't build in the latest run. I'll have to kick it.
> 
> > > +#define __CMPWAIT_GEN(w, sz, name)					\
> > > +void __cmpwait_case_##name(volatile void *ptr, unsigned long val)	\
> > > +{									\
> > > +	unsigned long tmp;						\
> > > +									\
> > > +	asm volatile(							\
> > > +	"	ldxr" #sz "\t%" #w "[tmp], %[v]\n"			\
> > > +	"	eor	%" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"	\
> > > +	"	cbnz	%" #w "[tmp], 1f\n"				\
> > 
> > Shouldn't this be cbz? (i.e. branch over the wfe if the value is equal
> > to what we wanted?).
> 
> Indeed so.
> 
> > > +	"	wfe\n"							\
> > > +	"1:"								\
> > > +	: [tmp] "=&r" (tmp), [val] "=&r" (val),				\
> > 
> > We only read val, so it can be an input operand, no?
> 
> True.. :-)
> 
> > > +#define cmpwait(ptr, val) \
> > > +	__cmpwait((ptr), (unsigned long)(val), sizeof(*(ptr)))
> > 
> > We might want to call this cmpwait_relaxed, in case we decide to add
> > fenced versions in the future. Or just make it cmpwait_acquire and
> > remove the smp_rmb() from smp_cond_load_acquire(). Dunno.
> 
> This is something I'll very much leave up to you. I have no idea on the
> tradeoffs involved here.

FWIW, here's a fixup patch to get this patch building and running. I
also noticed some missing casts for the subword cases.

Will

--->8

>From 5aa5750d5eeb6e3a42f5547f094dc803f89793bb Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 26 Apr 2016 17:31:53 +0100
Subject: [PATCH] fixup! locking,arm64: Introduce cmpwait()

Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/cmpxchg.h | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index cd7bff6ddedc..9b7113a3f0d7 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -225,18 +225,19 @@ __CMPXCHG_GEN(_mb)
 })
 
 #define __CMPWAIT_GEN(w, sz, name)					\
-void __cmpwait_case_##name(volatile void *ptr, unsigned long val)	\
+static inline void __cmpwait_case_##name(volatile void *ptr,		\
+					 unsigned long val)		\
 {									\
 	unsigned long tmp;						\
 									\
 	asm volatile(							\
 	"	ldxr" #sz "\t%" #w "[tmp], %[v]\n"			\
 	"	eor	%" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"	\
-	"	cbnz	%" #w "[tmp], 1f\n"				\
+	"	cbz	%" #w "[tmp], 1f\n"				\
 	"	wfe\n"							\
 	"1:"								\
-	: [tmp] "=&r" (tmp), [val] "=&r" (val),				\
-	  [v] "+Q" (*(unsigned long *)ptr));				\
+	: [tmp] "=&r" (tmp), [v] "+Q" (*(unsigned long *)ptr)		\
+	: [val] "r" (val));						\
 }
 
 __CMPWAIT_GEN(w, b, 1);
@@ -244,11 +245,13 @@ __CMPWAIT_GEN(w, h, 2);
 __CMPWAIT_GEN(w,  , 4);
 __CMPWAIT_GEN( ,  , 8);
 
+#undef __CMPWAIT_GEN
+
 static inline void __cmpwait(volatile void *ptr, unsigned long val, int size)
 {
 	switch (size) {
-	case 1: return __cmpwait_case_1(ptr, val);
-	case 2: return __cmpwait_case_2(ptr, val);
+	case 1: return __cmpwait_case_1(ptr, (u8)val);
+	case 2: return __cmpwait_case_2(ptr, (u16)val);
 	case 4: return __cmpwait_case_4(ptr, val);
 	case 8: return __cmpwait_case_8(ptr, val);
 	default: BUILD_BUG();
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 3/3] locking,arm64: Introduce cmpwait()
  2016-04-26 16:33       ` Will Deacon
@ 2016-04-26 17:15         ` Will Deacon
  2016-04-26 20:25           ` Peter Zijlstra
  0 siblings, 1 reply; 18+ messages in thread
From: Will Deacon @ 2016-04-26 17:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, waiman.long, mingo, paulmck, boqun.feng, torvalds, dave

On Tue, Apr 26, 2016 at 05:33:44PM +0100, Will Deacon wrote:
> From 5aa5750d5eeb6e3a42f5547f094dc803f89793bb Mon Sep 17 00:00:00 2001
> From: Will Deacon <will.deacon@arm.com>
> Date: Tue, 26 Apr 2016 17:31:53 +0100
> Subject: [PATCH] fixup! locking,arm64: Introduce cmpwait()
> 
> Signed-off-by: Will Deacon <will.deacon@arm.com>
> ---
>  arch/arm64/include/asm/cmpxchg.h | 15 +++++++++------
>  1 file changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
> index cd7bff6ddedc..9b7113a3f0d7 100644
> --- a/arch/arm64/include/asm/cmpxchg.h
> +++ b/arch/arm64/include/asm/cmpxchg.h
> @@ -225,18 +225,19 @@ __CMPXCHG_GEN(_mb)
>  })
>  
>  #define __CMPWAIT_GEN(w, sz, name)					\
> -void __cmpwait_case_##name(volatile void *ptr, unsigned long val)	\
> +static inline void __cmpwait_case_##name(volatile void *ptr,		\
> +					 unsigned long val)		\
>  {									\
>  	unsigned long tmp;						\
>  									\
>  	asm volatile(							\
>  	"	ldxr" #sz "\t%" #w "[tmp], %[v]\n"			\
>  	"	eor	%" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"	\
> -	"	cbnz	%" #w "[tmp], 1f\n"				\
> +	"	cbz	%" #w "[tmp], 1f\n"				\

Actually, you're right with cbnz. I only noticed when I came to implement
my own version of smp_cond_load_acquire. *sigh*

I have fixups applied locally, so maybe the best thing is for me to send
you an arm64 series on top of whatever you post next?

Will

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 3/3] locking,arm64: Introduce cmpwait()
  2016-04-26 17:15         ` Will Deacon
@ 2016-04-26 20:25           ` Peter Zijlstra
  0 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2016-04-26 20:25 UTC (permalink / raw)
  To: Will Deacon
  Cc: linux-kernel, waiman.long, mingo, paulmck, boqun.feng, torvalds, dave

On Tue, Apr 26, 2016 at 06:15:43PM +0100, Will Deacon wrote:
> Actually, you're right with cbnz. I only noticed when I came to implement
> my own version of smp_cond_load_acquire. *sigh*
> 
> I have fixups applied locally, so maybe the best thing is for me to send
> you an arm64 series on top of whatever you post next?

Sure; lemme rip out all the AARGH64 bits so you can do whatever you
want :-)

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread

Thread overview: 18+ messages
2016-04-04 12:22 [RFC][PATCH 0/3] smp_cond_load_acquire + cmpwait Peter Zijlstra
2016-04-04 12:22 ` [RFC][PATCH 1/3] locking: Replace smp_cond_acquire with smp_cond_load_acquire Peter Zijlstra
2016-04-04 18:20   ` Waiman Long
2016-04-04 12:22 ` [RFC][PATCH 2/3] locking/qrwlock: Use smp_cond_load_acquire() Peter Zijlstra
2016-04-12  4:58   ` Davidlohr Bueso
2016-04-12 16:45     ` Waiman Long
2016-04-04 12:22 ` [RFC][PATCH 3/3] locking,arm64: Introduce cmpwait() Peter Zijlstra
2016-04-04 13:12   ` Peter Zijlstra
2016-04-12 16:59   ` Will Deacon
2016-04-13 12:52     ` Peter Zijlstra
2016-04-26 16:33       ` Will Deacon
2016-04-26 17:15         ` Will Deacon
2016-04-26 20:25           ` Peter Zijlstra
2016-04-22 16:08     ` Boqun Feng
2016-04-22 16:53       ` Will Deacon
2016-04-23  4:02         ` Boqun Feng
2016-04-23  2:37       ` Peter Zijlstra
2016-04-23  3:40         ` Boqun Feng
