* [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
@ 2016-12-25 20:26 Waiman Long
  2016-12-26  5:50 ` Boqun Feng
                   ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: Waiman Long @ 2016-12-25 20:26 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar
  Cc: linux-kernel, Pan Xinhui, Boqun Feng, Waiman Long

A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
relaxed versions to improve performance on architectures that use LL/SC.

All the locking related cmpxchg's are replaced with the _acquire
variants:
 - pv_queued_spin_steal_lock()
 - trylock_clear_pending()

The cmpxchg's related to hashing are replaced by either the _release
or the _relaxed variants. See the inline comments for details.

Signed-off-by: Waiman Long <longman@redhat.com>

 v1->v2:
  - Add comments in changelog and code for the rationale of the change.

---
 kernel/locking/qspinlock_paravirt.h | 50 ++++++++++++++++++++++++-------------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e3b5520..c31d1ab 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -72,7 +72,7 @@ static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
 	struct __qspinlock *l = (void *)lock;
 
 	if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
-	    (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+	    (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
 		qstat_inc(qstat_pv_lock_stealing, true);
 		return true;
 	}
@@ -101,16 +101,16 @@ static __always_inline void clear_pending(struct qspinlock *lock)
 
 /*
  * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
- * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
- * just to be sure that it will get it.
+ * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
+ * lock to provide the proper memory barrier.
  */
 static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 {
 	struct __qspinlock *l = (void *)lock;
 
 	return !READ_ONCE(l->locked) &&
-	       (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
-			== _Q_PENDING_VAL);
+	       (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL,
+				_Q_LOCKED_VAL) == _Q_PENDING_VAL);
 }
 #else /* _Q_PENDING_BITS == 8 */
 static __always_inline void set_pending(struct qspinlock *lock)
@@ -138,7 +138,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 		 */
 		old = val;
 		new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
-		val = atomic_cmpxchg(&lock->val, old, new);
+		val = atomic_cmpxchg_acquire(&lock->val, old, new);
 
 		if (val == old)
 			return 1;
@@ -209,9 +209,15 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
 	struct pv_hash_entry *he;
 	int hopcnt = 0;
 
+	/*
+	 * Synchronizing with the node state variable will control who does
+	 * the hashing - the lock holder or lock waiter. The control
+	 * dependency will ensure that node value is written after the lock
+	 * value. So we don't need other ordering guarantee.
+	 */
 	for_each_hash_entry(he, offset, hash) {
 		hopcnt++;
-		if (!cmpxchg(&he->lock, NULL, lock)) {
+		if (!cmpxchg_relaxed(&he->lock, NULL, lock)) {
 			WRITE_ONCE(he->node, node);
 			qstat_hop(hopcnt);
 			return &he->lock;
@@ -309,7 +315,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 		 *     MB			      MB
 		 * [L] pn->locked		[RmW] pn->state = vcpu_hashed
 		 *
-		 * Matches the cmpxchg() from pv_kick_node().
+		 * Matches the cmpxchg_release() from pv_kick_node().
 		 */
 		smp_store_mb(pn->state, vcpu_halted);
 
@@ -323,8 +329,14 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 		 * If pv_kick_node() changed us to vcpu_hashed, retain that
 		 * value so that pv_wait_head_or_lock() knows to not also try
 		 * to hash this lock.
+		 *
+		 * The smp_store_mb() and control dependency above will ensure
+		 * that state change won't happen before that. Synchronizing
+		 * with pv_kick_node() wrt hashing by this waiter or by the
+		 * lock holder is done solely by the state variable. There is
+		 * no other ordering requirement.
 		 */
-		cmpxchg(&pn->state, vcpu_halted, vcpu_running);
+		cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_running);
 
 		/*
 		 * If the locked flag is still not set after wakeup, it is a
@@ -360,9 +372,12 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 	 * pv_wait_node(). If OTOH this fails, the vCPU was running and will
 	 * observe its next->locked value and advance itself.
 	 *
-	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+	 * Matches with smp_store_mb() and cmpxchg_relaxed() in pv_wait_node().
+	 * A release barrier is used here to ensure that node->locked is
+	 * always set before changing the state. See comment in pv_wait_node().
 	 */
-	if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+	if (cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed)
+			!= vcpu_halted)
 		return;
 
 	/*
@@ -461,8 +476,8 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 	}
 
 	/*
-	 * The cmpxchg() or xchg() call before coming here provides the
-	 * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
+	 * The cmpxchg_acquire() or xchg() call before coming here provides
+	 * the acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
 	 * here is to indicate to the compiler that the value will always
 	 * be nozero to enable better code optimization.
 	 */
@@ -488,11 +503,12 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 	}
 
 	/*
-	 * A failed cmpxchg doesn't provide any memory-ordering guarantees,
-	 * so we need a barrier to order the read of the node data in
-	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
+	 * A failed cmpxchg_release doesn't provide any memory-ordering
+	 * guarantees, so we need a barrier to order the read of the node
+	 * data in pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
 	 *
-	 * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
+	 * Matches the cmpxchg_acquire() in pv_wait_head_or_lock() setting
+	 * _Q_SLOW_VAL.
 	 */
 	smp_rmb();
 
-- 
1.8.3.1


* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2016-12-25 20:26 [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs Waiman Long
@ 2016-12-26  5:50 ` Boqun Feng
  2017-01-03 22:23   ` Waiman Long
  2017-01-03 16:18 ` Peter Zijlstra
       [not found] ` <CAH4ORazqsCBA4G5paHtsp8PMfM=J3P6rvyR-53-ZLjn=7U6J0g@mail.gmail.com>
  2 siblings, 1 reply; 21+ messages in thread
From: Boqun Feng @ 2016-12-26  5:50 UTC (permalink / raw)
  To: Waiman Long; +Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Pan Xinhui

Hi Waiman,

On Sun, Dec 25, 2016 at 03:26:01PM -0500, Waiman Long wrote:
> A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
> relaxed versions to improve performance on architectures that use LL/SC.
> 
> All the locking related cmpxchg's are replaced with the _acquire
> variants:
>  - pv_queued_spin_steal_lock()
>  - trylock_clear_pending()
> 
> The cmpxchg's related to hashing are replaced by either by the _release
> or the _relaxed variants. See the inline comment for details.
> 
> Signed-off-by: Waiman Long <longman@redhat.com>
> 
>  v1->v2:
>   - Add comments in changelog and code for the rationale of the change.
> 
> ---
>  kernel/locking/qspinlock_paravirt.h | 50 ++++++++++++++++++++++++-------------
>  1 file changed, 33 insertions(+), 17 deletions(-)
> 
> diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
> index e3b5520..c31d1ab 100644
> --- a/kernel/locking/qspinlock_paravirt.h
> +++ b/kernel/locking/qspinlock_paravirt.h
> @@ -72,7 +72,7 @@ static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
>  	struct __qspinlock *l = (void *)lock;
>  
>  	if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
> -	    (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
> +	    (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
>  		qstat_inc(qstat_pv_lock_stealing, true);
>  		return true;
>  	}
> @@ -101,16 +101,16 @@ static __always_inline void clear_pending(struct qspinlock *lock)
>  
>  /*
>   * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
> - * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
> - * just to be sure that it will get it.
> + * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
> + * lock to provide the proper memory barrier.
>   */
>  static __always_inline int trylock_clear_pending(struct qspinlock *lock)
>  {
>  	struct __qspinlock *l = (void *)lock;
>  
>  	return !READ_ONCE(l->locked) &&
> -	       (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
> -			== _Q_PENDING_VAL);
> +	       (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL,
> +				_Q_LOCKED_VAL) == _Q_PENDING_VAL);
>  }
>  #else /* _Q_PENDING_BITS == 8 */
>  static __always_inline void set_pending(struct qspinlock *lock)
> @@ -138,7 +138,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
>  		 */
>  		old = val;
>  		new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
> -		val = atomic_cmpxchg(&lock->val, old, new);
> +		val = atomic_cmpxchg_acquire(&lock->val, old, new);
>  
>  		if (val == old)
>  			return 1;
> @@ -209,9 +209,15 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
>  	struct pv_hash_entry *he;
>  	int hopcnt = 0;
>  
> +	/*
> +	 * Synchronizing with the node state variable will control who does
> +	 * the hashing - the lock holder or lock waiter. The control
> +	 * dependency will ensure that node value is written after the lock
> +	 * value. So we don't need other ordering guarantee.
> +	 */

By this comment, you mean that
	
	cmpxchg_relaxed(&he->lock, NULL, lock);
	  r1 = ll he->lock;
	  <compare part>
	  sc he->lock, lock // succeeded

	if (!r1)
		WRITE_ONCE(he->node, node);


the sc and the WRITE_ONCE() cannot be reordered because of the control
dependency? I don't think this is true. Yes, the sc must execute before
the WRITE_ONCE(), but the memory/cache effects may be reordered. IOW,
the following may happen:


	CPU 0			CPU 1
	===================	=======================
	{x = 0, y = 0}		if (!cmpxchg_relaxed(&y, 0, 1))
					WRITE_ONCE(x, 1);
	r1 = READ_ONCE(x);

	smp_rmb();

	r2 = READ_ONCE(y);

The following result is possible:

	y = 1 && r1 = 1 && r2 = 0
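
To make that outcome concrete, here is a minimal standalone sketch of the
same pattern, using GCC __atomic builtins in place of the kernel
primitives. It is purely illustrative and not from the kernel tree; the
outcome can only ever show up on weakly ordered hardware (e.g. PPC or
ARM), and plain pthread create/join is far too coarse to hit it reliably,
so treat it as a way to read the scenario, not as a proof (a litmus tool
would be the real check):

/* build: gcc -O2 -pthread -o ctrl-dep ctrl-dep.c */
#include <pthread.h>
#include <stdio.h>

static int x, y;
static int r1, r2;

/* CPU 1: if (!cmpxchg_relaxed(&y, 0, 1)) WRITE_ONCE(x, 1); */
static void *cpu1(void *arg)
{
	int expected = 0;

	/* relaxed CAS; success means the old value of y was 0 */
	if (__atomic_compare_exchange_n(&y, &expected, 1, 0,
					__ATOMIC_RELAXED, __ATOMIC_RELAXED))
		/*
		 * Control-dependent store: the CAS's load is ordered before
		 * this store on real hardware, but the CAS's *store* to y is
		 * not, which is exactly the gap that allows the outcome above.
		 */
		__atomic_store_n(&x, 1, __ATOMIC_RELAXED);
	return NULL;
}

/* CPU 0: r1 = READ_ONCE(x); smp_rmb(); r2 = READ_ONCE(y); */
static void *cpu0(void *arg)
{
	r1 = __atomic_load_n(&x, __ATOMIC_RELAXED);
	__atomic_thread_fence(__ATOMIC_ACQUIRE);	/* stand-in for smp_rmb() */
	r2 = __atomic_load_n(&y, __ATOMIC_RELAXED);
	return NULL;
}

int main(void)
{
	unsigned long i, hits = 0;

	for (i = 0; i < 100000; i++) {
		pthread_t t0, t1;

		x = y = r1 = r2 = 0;
		pthread_create(&t1, NULL, cpu1, NULL);
		pthread_create(&t0, NULL, cpu0, NULL);
		pthread_join(t0, NULL);
		pthread_join(t1, NULL);
		if (y == 1 && r1 == 1 && r2 == 0)
			hits++;
	}
	printf("y=1 && r1=1 && r2=0 observed %lu times\n", hits);
	return 0;
}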

Or am I missing your point here? ;-)

Regards,
Boqun

>  	for_each_hash_entry(he, offset, hash) {
>  		hopcnt++;
> -		if (!cmpxchg(&he->lock, NULL, lock)) {
> +		if (!cmpxchg_relaxed(&he->lock, NULL, lock)) {
>  			WRITE_ONCE(he->node, node);
>  			qstat_hop(hopcnt);
>  			return &he->lock;
> @@ -309,7 +315,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
>  		 *     MB			      MB
>  		 * [L] pn->locked		[RmW] pn->state = vcpu_hashed
>  		 *
> -		 * Matches the cmpxchg() from pv_kick_node().
> +		 * Matches the cmpxchg_release() from pv_kick_node().
>  		 */
>  		smp_store_mb(pn->state, vcpu_halted);
>  
> @@ -323,8 +329,14 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
>  		 * If pv_kick_node() changed us to vcpu_hashed, retain that
>  		 * value so that pv_wait_head_or_lock() knows to not also try
>  		 * to hash this lock.
> +		 *
> +		 * The smp_store_mb() and control dependency above will ensure
> +		 * that state change won't happen before that. Synchronizing
> +		 * with pv_kick_node() wrt hashing by this waiter or by the
> +		 * lock holder is done solely by the state variable. There is
> +		 * no other ordering requirement.
>  		 */
> -		cmpxchg(&pn->state, vcpu_halted, vcpu_running);
> +		cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_running);
>  
>  		/*
>  		 * If the locked flag is still not set after wakeup, it is a
> @@ -360,9 +372,12 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
>  	 * pv_wait_node(). If OTOH this fails, the vCPU was running and will
>  	 * observe its next->locked value and advance itself.
>  	 *
> -	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
> +	 * Matches with smp_store_mb() and cmpxchg_relaxed() in pv_wait_node().
> +	 * A release barrier is used here to ensure that node->locked is
> +	 * always set before changing the state. See comment in pv_wait_node().
>  	 */
> -	if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
> +	if (cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed)
> +			!= vcpu_halted)
>  		return;
>  
>  	/*
> @@ -461,8 +476,8 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
>  	}
>  
>  	/*
> -	 * The cmpxchg() or xchg() call before coming here provides the
> -	 * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
> +	 * The cmpxchg_acquire() or xchg() call before coming here provides
> +	 * the acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
>  	 * here is to indicate to the compiler that the value will always
>  	 * be nozero to enable better code optimization.
>  	 */
> @@ -488,11 +503,12 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
>  	}
>  
>  	/*
> -	 * A failed cmpxchg doesn't provide any memory-ordering guarantees,
> -	 * so we need a barrier to order the read of the node data in
> -	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
> +	 * A failed cmpxchg_release doesn't provide any memory-ordering
> +	 * guarantees, so we need a barrier to order the read of the node
> +	 * data in pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
>  	 *
> -	 * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
> +	 * Matches the cmpxchg_acquire() in pv_wait_head_or_lock() setting
> +	 * _Q_SLOW_VAL.
>  	 */
>  	smp_rmb();
>  
> -- 
> 1.8.3.1
> 


* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2016-12-25 20:26 [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs Waiman Long
  2016-12-26  5:50 ` Boqun Feng
@ 2017-01-03 16:18 ` Peter Zijlstra
  2017-01-03 22:07   ` Waiman Long
       [not found] ` <CAH4ORazqsCBA4G5paHtsp8PMfM=J3P6rvyR-53-ZLjn=7U6J0g@mail.gmail.com>
  2 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2017-01-03 16:18 UTC (permalink / raw)
  To: Waiman Long; +Cc: Ingo Molnar, linux-kernel, Pan Xinhui, Boqun Feng

On Sun, Dec 25, 2016 at 03:26:01PM -0500, Waiman Long wrote:
> A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
> relaxed versions to improve performance on architectures that use LL/SC.

Claim without numbers ;-)

> 
> All the locking related cmpxchg's are replaced with the _acquire
> variants:
>  - pv_queued_spin_steal_lock()
>  - trylock_clear_pending()

So these seem to make sense in that they're in 'fast' paths..

> The cmpxchg's related to hashing are replaced by either by the _release
> or the _relaxed variants. See the inline comment for details.


But these, not so much: we're going to put the vcpu to sleep, so why does it
make sense to 'optimize' the wait/kick stuff?


* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2017-01-03 16:18 ` Peter Zijlstra
@ 2017-01-03 22:07   ` Waiman Long
  2017-01-04  9:41     ` Peter Zijlstra
  0 siblings, 1 reply; 21+ messages in thread
From: Waiman Long @ 2017-01-03 22:07 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, linux-kernel, Pan Xinhui, Boqun Feng

On 01/03/2017 11:18 AM, Peter Zijlstra wrote:
> On Sun, Dec 25, 2016 at 03:26:01PM -0500, Waiman Long wrote:
>> A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
>> relaxed versions to improve performance on architectures that use LL/SC.
> Claim without numbers ;-)

Well it is hard to produce actual numbers here as I don't have the setup
to gather data.
 
>> All the locking related cmpxchg's are replaced with the _acquire
>> variants:
>>  - pv_queued_spin_steal_lock()
>>  - trylock_clear_pending()
> So these seem to make sense in that they're in 'fast' paths..
>
>> The cmpxchg's related to hashing are replaced by either by the _release
>> or the _relaxed variants. See the inline comment for details.
>
> But these not so much, we're going to put the vcpu to sleep, why does it
> make sense to 'optimize' the wait/kick stuff?

I haven't thought too much about fast/slow paths when I was making the
patch. You are right that we probably don't need to do that for the
slowpath cases. I can modify the patch to do just the fast path changes.

Cheers,
Longman


* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2016-12-26  5:50 ` Boqun Feng
@ 2017-01-03 22:23   ` Waiman Long
  0 siblings, 0 replies; 21+ messages in thread
From: Waiman Long @ 2017-01-03 22:23 UTC (permalink / raw)
  To: Boqun Feng; +Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Pan Xinhui

On 12/26/2016 12:50 AM, Boqun Feng wrote:
> Hi Wainman,
>
> On Sun, Dec 25, 2016 at 03:26:01PM -0500, Waiman Long wrote:
>> A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
>> relaxed versions to improve performance on architectures that use LL/SC.
>>
>> All the locking related cmpxchg's are replaced with the _acquire
>> variants:
>>  - pv_queued_spin_steal_lock()
>>  - trylock_clear_pending()
>>
>> The cmpxchg's related to hashing are replaced by either by the _release
>> or the _relaxed variants. See the inline comment for details.
>>
>> Signed-off-by: Waiman Long <longman@redhat.com>
>>
>>  v1->v2:
>>   - Add comments in changelog and code for the rationale of the change.
>>
>> ---
>>  kernel/locking/qspinlock_paravirt.h | 50 ++++++++++++++++++++++++-------------
>>  1 file changed, 33 insertions(+), 17 deletions(-)
>>
>> diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
>> index e3b5520..c31d1ab 100644
>> --- a/kernel/locking/qspinlock_paravirt.h
>> +++ b/kernel/locking/qspinlock_paravirt.h
>> @@ -72,7 +72,7 @@ static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
>>  	struct __qspinlock *l = (void *)lock;
>>  
>>  	if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
>> -	    (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
>> +	    (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
>>  		qstat_inc(qstat_pv_lock_stealing, true);
>>  		return true;
>>  	}
>> @@ -101,16 +101,16 @@ static __always_inline void clear_pending(struct qspinlock *lock)
>>  
>>  /*
>>   * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
>> - * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
>> - * just to be sure that it will get it.
>> + * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
>> + * lock to provide the proper memory barrier.
>>   */
>>  static __always_inline int trylock_clear_pending(struct qspinlock *lock)
>>  {
>>  	struct __qspinlock *l = (void *)lock;
>>  
>>  	return !READ_ONCE(l->locked) &&
>> -	       (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
>> -			== _Q_PENDING_VAL);
>> +	       (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL,
>> +				_Q_LOCKED_VAL) == _Q_PENDING_VAL);
>>  }
>>  #else /* _Q_PENDING_BITS == 8 */
>>  static __always_inline void set_pending(struct qspinlock *lock)
>> @@ -138,7 +138,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
>>  		 */
>>  		old = val;
>>  		new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
>> -		val = atomic_cmpxchg(&lock->val, old, new);
>> +		val = atomic_cmpxchg_acquire(&lock->val, old, new);
>>  
>>  		if (val == old)
>>  			return 1;
>> @@ -209,9 +209,15 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
>>  	struct pv_hash_entry *he;
>>  	int hopcnt = 0;
>>  
>> +	/*
>> +	 * Synchronizing with the node state variable will control who does
>> +	 * the hashing - the lock holder or lock waiter. The control
>> +	 * dependency will ensure that node value is written after the lock
>> +	 * value. So we don't need other ordering guarantee.
>> +	 */
> By this comment, you mean that
> 	
> 	cmpxchg_relaxed(&he->lock, NULL, lock);
> 	  r1 = ll he->lock;
> 	  <compare part>
> 	  sc he->lock, lock // successed
>
> 	if (r1)
> 		WRITE_ONCE(he->node, node);
>
>
> the sc and WRITE_ONCE() can not be reordered because of the control
> dependency? I dont think this is true. Yes the sc must execute before
> the WRITE_ONCE(), but the memory/cache effects may be reordered. IOW,
> the following may happen
>
>
> 	CPU 0			CPU 1
> 	===================	=======================
> 	{x = 0, y = 0}		if (!cmpxchg_relaxed(&y, 0, 1))
> 					WRITE_ONCE(x, 1);
> 	r1 = READ_ONCE(x);
>
> 	smp_rmb();
>
> 	r2 = READ_ONCE(y);
>
> The following result is possible:
>
> 	y = 1 && r1 = 1 && r2 = 0
>
> Or I'm missing your point here? ;-) 
>
> Regards,
> Boqun
>
You are probably right. I know the code is somewhat risky. That is why I
am waiting for experts like you to see if this is really the case. Now it
seems that it may not be the case. I will revise the patch to take that out.

Cheers,
Longman


* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2017-01-03 22:07   ` Waiman Long
@ 2017-01-04  9:41     ` Peter Zijlstra
  2017-01-05  8:16       ` Pan Xinhui
  2017-01-05 15:30       ` Waiman Long
  0 siblings, 2 replies; 21+ messages in thread
From: Peter Zijlstra @ 2017-01-04  9:41 UTC (permalink / raw)
  To: Waiman Long; +Cc: Ingo Molnar, linux-kernel, Pan Xinhui, Boqun Feng

On Tue, Jan 03, 2017 at 05:07:54PM -0500, Waiman Long wrote:
> On 01/03/2017 11:18 AM, Peter Zijlstra wrote:
> > On Sun, Dec 25, 2016 at 03:26:01PM -0500, Waiman Long wrote:
> >> A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
> >> relaxed versions to improve performance on architectures that use LL/SC.
> > Claim without numbers ;-)
> 
> Well it is hard to produce actual numbers here as I don't have the setup
> to gather data.

Surely RHT has big PPC machines around? I know that getting to them is a
wee bit of a bother, but they should be available somewhere.


* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2017-01-04  9:41     ` Peter Zijlstra
@ 2017-01-05  8:16       ` Pan Xinhui
  2017-01-05  9:48         ` Peter Zijlstra
                           ` (2 more replies)
  2017-01-05 15:30       ` Waiman Long
  1 sibling, 3 replies; 21+ messages in thread
From: Pan Xinhui @ 2017-01-05  8:16 UTC (permalink / raw)
  To: Peter Zijlstra, Waiman Long; +Cc: Ingo Molnar, linux-kernel, Boqun Feng



On 2017/1/4 17:41, Peter Zijlstra wrote:
> On Tue, Jan 03, 2017 at 05:07:54PM -0500, Waiman Long wrote:
>> On 01/03/2017 11:18 AM, Peter Zijlstra wrote:
>>> On Sun, Dec 25, 2016 at 03:26:01PM -0500, Waiman Long wrote:
>>>> A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
>>>> relaxed versions to improve performance on architectures that use LL/SC.
>>> Claim without numbers ;-)
>>
>> Well it is hard to produce actual numbers here as I don't have the setup
>> to gather data.
>
> Surely RHT has big PPC machines around? I know that getting to them is a
> wee bit of a bother, but they should be available somewhere.
>
hi,

I did some tests of cmpxchg and cmpxchg_acquire on ppc a while ago.

The number of loops completed in 15s for each cmpxchg variant is below.

cmpxchg_relaxed: 336663
cmpxchg_release: 369054
cmpxchg_acquire: 363364
cmpxchg:	 179435

so cmpxchg really is more expensive than the others.
but I also have a doubt about cmpxchg_relaxed: it should be the cheapest, but in these tests release/acquire are faster than it.
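
For reference, a measurement like this boils down to a timed loop of
roughly the following shape. This is a hypothetical sketch as a throwaway
kernel module, not the actual test code, and the absolute counts depend
heavily on the machine and on whatever surrounds the loop:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/atomic.h>
#include <linux/sched.h>

static int target;

static int __init cmpxchg_bench_init(void)
{
	unsigned long end = jiffies + 15 * HZ;
	unsigned long loops = 0;
	int v = 0;

	while (time_before(jiffies, end)) {
		/* flip between 0 and 1 so that every cmpxchg succeeds */
		cmpxchg_relaxed(&target, v, 1 - v);	/* or _acquire/_release/cmpxchg */
		v = 1 - v;
		loops++;
		if (!(loops & 0xfffff))
			cond_resched();	/* don't hog the CPU for the whole 15s */
	}
	pr_info("cmpxchg_relaxed: %lu loops in 15s\n", loops);

	return -EAGAIN;	/* fail the load on purpose; the printout is the result */
}
module_init(cmpxchg_bench_init);

MODULE_LICENSE("GPL");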

thanks
xinhui


* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2017-01-05  8:16       ` Pan Xinhui
@ 2017-01-05  9:48         ` Peter Zijlstra
  2017-01-05  9:51         ` Boqun Feng
  2017-01-05 15:17         ` Waiman Long
  2 siblings, 0 replies; 21+ messages in thread
From: Peter Zijlstra @ 2017-01-05  9:48 UTC (permalink / raw)
  To: Pan Xinhui; +Cc: Waiman Long, Ingo Molnar, linux-kernel, Boqun Feng

On Thu, Jan 05, 2017 at 04:16:38PM +0800, Pan Xinhui wrote:
> I do some tests about cmpxchg and cmpxchg_acquire before on ppc.
> 
> loops in 15s of each cmpxchg is below.
> 
> cmpxchg_relaxed: 336663
> cmpxchg_release: 369054
> cmpxchg_acquire: 363364
> cmpxchg:	 179435
> 
> so cmpxchg is really expensive than others.
> but I also have doubt about the cmpxchg_relaxed, it should be the cheapest, but from the tests, release/acquire are faster than it.

Right, curious about that relaxed one. In any case, I was more wondering
about the performance impact on the larger construct of the pvlock
itself.


* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2017-01-05  8:16       ` Pan Xinhui
  2017-01-05  9:48         ` Peter Zijlstra
@ 2017-01-05  9:51         ` Boqun Feng
  2017-01-05 15:17         ` Waiman Long
  2 siblings, 0 replies; 21+ messages in thread
From: Boqun Feng @ 2017-01-05  9:51 UTC (permalink / raw)
  To: Pan Xinhui; +Cc: Peter Zijlstra, Waiman Long, Ingo Molnar, linux-kernel

On Thu, Jan 05, 2017 at 04:16:38PM +0800, Pan Xinhui wrote:
> 
> 
> > On 2017/1/4 17:41, Peter Zijlstra wrote:
> > On Tue, Jan 03, 2017 at 05:07:54PM -0500, Waiman Long wrote:
> > > On 01/03/2017 11:18 AM, Peter Zijlstra wrote:
> > > > On Sun, Dec 25, 2016 at 03:26:01PM -0500, Waiman Long wrote:
> > > > > A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
> > > > > relaxed versions to improve performance on architectures that use LL/SC.
> > > > Claim without numbers ;-)
> > > 
> > > Well it is hard to produce actual numbers here as I don't have the setup
> > > to gather data.
> > 
> > Surely RHT has big PPC machines around? I know that getting to them is a
> > wee bit of a bother, but they should be available somewhere.
> > 
> hi,
> 
> I do some tests about cmpxchg and cmpxchg_acquire before on ppc.
> 
> loops in 15s of each cmpxchg is below.
> 
> cmpxchg_relaxed: 336663
> cmpxchg_release: 369054
> cmpxchg_acquire: 363364
> cmpxchg:	 179435
> 
> so cmpxchg is really expensive than others.
> but I also have doubt about the cmpxchg_relaxed, it should be the cheapest, but from the tests, release/acquire are faster than it.
> 

I have observed something similar before. But the performance number for
a single atomic operation itself is not that useful.

Here is my understanding (basically guessing ;-)):

If your testcase only commits those cmpxchg's in a loop, then the
overhead of the barrier in _release and _acquire is quite small and may
even help the performance because of their side effects on prefetches or
cache invalidations.

But if your testcase gets complex enough that committing barriers is not
cheap, you will probably see cmpxchg_relaxed beat the _acquire and _release
variants.
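
A rough userspace analogue of this point is sketched below: one
successful __atomic CAS per iteration plus a command-line knob for
unrelated per-iteration work. It does not use the kernel primitives and
the numbers are entirely machine-dependent, but it lets one compare the
orderings both as a bare loop and with surrounding work:

/* build: gcc -O2 -o casbench casbench.c ; run: ./casbench [extra-work] */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static int target;

/* One successful CAS per iteration, with 'extra' iterations of unrelated work. */
static double bench(int which, long iters, long extra)
{
	struct timespec t0, t1;
	volatile long sink = 0;
	int v = 0;
	long i, j;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < iters; i++) {
		int expected = v;

		switch (which) {
		case 0:	/* roughly cmpxchg_relaxed() */
			__atomic_compare_exchange_n(&target, &expected, 1 - v, 0,
						    __ATOMIC_RELAXED, __ATOMIC_RELAXED);
			break;
		case 1:	/* roughly the _acquire/_release territory */
			__atomic_compare_exchange_n(&target, &expected, 1 - v, 0,
						    __ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
			break;
		default: /* roughly a full-barrier cmpxchg() */
			__atomic_compare_exchange_n(&target, &expected, 1 - v, 0,
						    __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
			break;
		}
		v = 1 - v;

		for (j = 0; j < extra; j++)	/* unrelated work surrounding the CAS */
			sink += j;
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);
	return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
}

int main(int argc, char **argv)
{
	long iters = 10 * 1000 * 1000;
	long extra = argc > 1 ? atol(argv[1]) : 0;

	printf("relaxed: %.3fs\n", bench(0, iters, extra));
	printf("acq_rel: %.3fs\n", bench(1, iters, extra));
	printf("seq_cst: %.3fs\n", bench(2, iters, extra));
	return 0;
}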

Regards,
Boqun

> thanks
> xinhui
> 


* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2017-01-05  8:16       ` Pan Xinhui
  2017-01-05  9:48         ` Peter Zijlstra
  2017-01-05  9:51         ` Boqun Feng
@ 2017-01-05 15:17         ` Waiman Long
  2017-01-05 15:40           ` Boqun Feng
  2 siblings, 1 reply; 21+ messages in thread
From: Waiman Long @ 2017-01-05 15:17 UTC (permalink / raw)
  To: Pan Xinhui, Peter Zijlstra; +Cc: Ingo Molnar, linux-kernel, Boqun Feng

On 01/05/2017 03:16 AM, Pan Xinhui wrote:
>
>
> On 2017/1/4 17:41, Peter Zijlstra wrote:
>> On Tue, Jan 03, 2017 at 05:07:54PM -0500, Waiman Long wrote:
>>> On 01/03/2017 11:18 AM, Peter Zijlstra wrote:
>>>> On Sun, Dec 25, 2016 at 03:26:01PM -0500, Waiman Long wrote:
>>>>> A number of cmpxchg calls in qspinlock_paravirt.h were replaced by
>>>>> more
>>>>> relaxed versions to improve performance on architectures that use
>>>>> LL/SC.
>>>> Claim without numbers ;-)
>>>
>>> Well it is hard to produce actual numbers here as I don't have the
>>> setup
>>> to gather data.
>>
>> Surely RHT has big PPC machines around? I know that getting to them is a
>> wee bit of a bother, but they should be available somewhere.
>>
> hi,
>
> I do some tests about cmpxchg and cmpxchg_acquire before on ppc.
>
> loops in 15s of each cmpxchg is below.
>
> cmpxchg_relaxed: 336663
> cmpxchg_release: 369054
> cmpxchg_acquire: 363364
> cmpxchg:     179435
>
> so cmpxchg is really expensive than others.
> but I also have doubt about the cmpxchg_relaxed, it should be the
> cheapest, but from the tests, release/acquire are faster than it.
>
> thanks
> xinhui
>
Thanks for doing the test. It looks like we should just focus on using
either cmpxchg_release or cmpxchg_acquire and forget about cmpxchg_relaxed.

Cheers,
Longman


* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2017-01-04  9:41     ` Peter Zijlstra
  2017-01-05  8:16       ` Pan Xinhui
@ 2017-01-05 15:30       ` Waiman Long
  1 sibling, 0 replies; 21+ messages in thread
From: Waiman Long @ 2017-01-05 15:30 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, linux-kernel, Pan Xinhui, Boqun Feng

On 01/04/2017 04:41 AM, Peter Zijlstra wrote:
> On Tue, Jan 03, 2017 at 05:07:54PM -0500, Waiman Long wrote:
>> On 01/03/2017 11:18 AM, Peter Zijlstra wrote:
>>> On Sun, Dec 25, 2016 at 03:26:01PM -0500, Waiman Long wrote:
>>>> A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
>>>> relaxed versions to improve performance on architectures that use LL/SC.
>>> Claim without numbers ;-)
>> Well it is hard to produce actual numbers here as I don't have the setup
>> to gather data.
> Surely RHT has big PPC machines around? I know that getting to them is a
> wee bit of a bother, but they should be available somewhere.

Yes, RHT does have PPC machines around. It is just that I don't have
experience dealing with those machines yet. I will try to run some tests
and include the numbers in the next version of the patch.

Cheers,
Longman


* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2017-01-05 15:17         ` Waiman Long
@ 2017-01-05 15:40           ` Boqun Feng
  0 siblings, 0 replies; 21+ messages in thread
From: Boqun Feng @ 2017-01-05 15:40 UTC (permalink / raw)
  To: Waiman Long; +Cc: Pan Xinhui, Peter Zijlstra, Ingo Molnar, linux-kernel

On Thu, Jan 05, 2017 at 10:17:46AM -0500, Waiman Long wrote:
> On 01/05/2017 03:16 AM, Pan Xinhui wrote:
> >
> >
> > On 2017/1/4 17:41, Peter Zijlstra wrote:
> >> On Tue, Jan 03, 2017 at 05:07:54PM -0500, Waiman Long wrote:
> >>> On 01/03/2017 11:18 AM, Peter Zijlstra wrote:
> >>>> On Sun, Dec 25, 2016 at 03:26:01PM -0500, Waiman Long wrote:
> >>>>> A number of cmpxchg calls in qspinlock_paravirt.h were replaced by
> >>>>> more
> >>>>> relaxed versions to improve performance on architectures that use
> >>>>> LL/SC.
> >>>> Claim without numbers ;-)
> >>>
> >>> Well it is hard to produce actual numbers here as I don't have the
> >>> setup
> >>> to gather data.
> >>
> >> Surely RHT has big PPC machines around? I know that getting to them is a
> >> wee bit of a bother, but they should be available somewhere.
> >>
> > hi,
> >
> > I do some tests about cmpxchg and cmpxchg_acquire before on ppc.
> >
> > loops in 15s of each cmpxchg is below.
> >
> > cmpxchg_relaxed: 336663
> > cmpxchg_release: 369054
> > cmpxchg_acquire: 363364
> > cmpxchg:     179435
> >
> > so cmpxchg is really expensive than others.
> > but I also have doubt about the cmpxchg_relaxed, it should be the
> > cheapest, but from the tests, release/acquire are faster than it.
> >
> > thanks
> > xinhui
> >
> Thanks for doing the test. It looks like we should just focus on using
> either cmpxchg_release or cmpxchg_acquire and forget about cmpxchg_relaxed.
> 

_relaxed is more lightweight in two aspects:

1.	_relaxed is not a compiler barrier, so it allows compilers to do
	more optimizations than _acquire and _release.

2.	_relaxed doesn't introduce HW barriers and doesn't perform
	worse than _acquire and _release in _most_ _usual_ cases.

And on some ll/sc archs without a lightweight barrier like "lwsync" for
_acquire and _release, #2 could be more significant.

So don't hesitate to use _relaxed whenever it's possible and reasonable
;-)

Regards,
Boqun

> Cheers,
> Longman
> 


* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
       [not found] ` <CAH4ORazqsCBA4G5paHtsp8PMfM=J3P6rvyR-53-ZLjn=7U6J0g@mail.gmail.com>
@ 2017-02-08  4:05   ` Boqun Feng
  2017-02-08  6:09     ` Boqun Feng
       [not found]   ` <778926a5-cf9f-586b-6bc4-b9453d88aabb@redhat.com>
  1 sibling, 1 reply; 21+ messages in thread
From: Boqun Feng @ 2017-02-08  4:05 UTC (permalink / raw)
  To: Xinhui Pan
  Cc: Waiman Long, Peter Zijlstra, Ingo Molnar, linux-kernel, Pan Xinhui

On Wed, Feb 08, 2017 at 11:39:10AM +0800, Xinhui Pan wrote:
> 2016-12-26 4:26 GMT+08:00 Waiman Long <longman@redhat.com>:
> 
> > A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
> > relaxed versions to improve performance on architectures that use LL/SC.
> >
> > All the locking related cmpxchg's are replaced with the _acquire
> > variants:
> >  - pv_queued_spin_steal_lock()
> >  - trylock_clear_pending()
> >
> > The cmpxchg's related to hashing are replaced by either by the _release
> > or the _relaxed variants. See the inline comment for details.
> >
> > Signed-off-by: Waiman Long <longman@redhat.com>
> >
> >  v1->v2:
> >   - Add comments in changelog and code for the rationale of the change.
> >
> > ---
> >  kernel/locking/qspinlock_paravirt.h | 50 ++++++++++++++++++++++++------
> > -------
> >  1 file changed, 33 insertions(+), 17 deletions(-)
> >
> >
> > @@ -323,8 +329,14 @@ static void pv_wait_node(struct mcs_spinlock *node,
> > struct mcs_spinlock *prev)
> >                  * If pv_kick_node() changed us to vcpu_hashed, retain that
> >                  * value so that pv_wait_head_or_lock() knows to not also
> > try
> >                  * to hash this lock.
> > +                *
> > +                * The smp_store_mb() and control dependency above will
> > ensure
> > +                * that state change won't happen before that.
> > Synchronizing
> > +                * with pv_kick_node() wrt hashing by this waiter or by the
> > +                * lock holder is done solely by the state variable. There
> > is
> > +                * no other ordering requirement.
> >                  */
> > -               cmpxchg(&pn->state, vcpu_halted, vcpu_running);
> > +               cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_running);
> >
> >                 /*
> >                  * If the locked flag is still not set after wakeup, it is
> > a
> > @@ -360,9 +372,12 @@ static void pv_kick_node(struct qspinlock *lock,
> > struct mcs_spinlock *node)
> >          * pv_wait_node(). If OTOH this fails, the vCPU was running and
> > will
> >          * observe its next->locked value and advance itself.
> >          *
> > -        * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
> > +        * Matches with smp_store_mb() and cmpxchg_relaxed() in
> > pv_wait_node().
> > +        * A release barrier is used here to ensure that node->locked is
> > +        * always set before changing the state. See comment in
> > pv_wait_node().
> >          */
> > -       if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
> > +       if (cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed)
> > +                       != vcpu_halted)
> >                 return;
> >
> 
> hi, Waiman
> We can't use _release here, a full barrier is needed.
> 
> There is pv_kick_node() vs pv_wait_head_or_lock():
> 
> 	pv_kick_node():				pv_wait_head_or_lock():
> 
> 	[w] l->locked = _Q_SLOW_VAL  // reordered here
> 						if (READ_ONCE(pn->state) == vcpu_hashed) // False.
> 						  lp = (struct qspinlock **)1;
> 
> 	[STORE] pn->state = vcpu_hashed		lp = pv_hash(lock, pn);
> 	pv_hash()				if (xchg(&l->locked, _Q_SLOW_VAL) == 0) // false, not unhashed.
> 

This analysis is correct, but..

> Then the same lock has been hashed twice but only unhashed once. So eventually,
> as the hash table grows big, we hit an RCU stall.
> 
> I hit the RCU stall when I run a netperf benchmark
> 

how will a big hash table hit RCU stall? Do you have the call trace for
your RCU stall?

Regards,
Boqun

> thanks
> xinhui
> 
> 
> > --
> > 1.8.3.1
> >
> >


* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2017-02-08  4:05   ` Boqun Feng
@ 2017-02-08  6:09     ` Boqun Feng
  2017-02-08  6:47       ` Pan Xinhui
                         ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: Boqun Feng @ 2017-02-08  6:09 UTC (permalink / raw)
  To: Xinhui Pan
  Cc: Waiman Long, Peter Zijlstra, Ingo Molnar, linux-kernel, Pan Xinhui

On Wed, Feb 08, 2017 at 12:05:40PM +0800, Boqun Feng wrote:
> On Wed, Feb 08, 2017 at 11:39:10AM +0800, Xinhui Pan wrote:
> > 2016-12-26 4:26 GMT+08:00 Waiman Long <longman@redhat.com>:
> > 
> > > A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
> > > relaxed versions to improve performance on architectures that use LL/SC.
> > >
> > > All the locking related cmpxchg's are replaced with the _acquire
> > > variants:
> > >  - pv_queued_spin_steal_lock()
> > >  - trylock_clear_pending()
> > >
> > > The cmpxchg's related to hashing are replaced by either by the _release
> > > or the _relaxed variants. See the inline comment for details.
> > >
> > > Signed-off-by: Waiman Long <longman@redhat.com>
> > >
> > >  v1->v2:
> > >   - Add comments in changelog and code for the rationale of the change.
> > >
> > > ---
> > >  kernel/locking/qspinlock_paravirt.h | 50 ++++++++++++++++++++++++------
> > > -------
> > >  1 file changed, 33 insertions(+), 17 deletions(-)
> > >
> > >
> > > @@ -323,8 +329,14 @@ static void pv_wait_node(struct mcs_spinlock *node,
> > > struct mcs_spinlock *prev)
> > >                  * If pv_kick_node() changed us to vcpu_hashed, retain that
> > >                  * value so that pv_wait_head_or_lock() knows to not also
> > > try
> > >                  * to hash this lock.
> > > +                *
> > > +                * The smp_store_mb() and control dependency above will
> > > ensure
> > > +                * that state change won't happen before that.
> > > Synchronizing
> > > +                * with pv_kick_node() wrt hashing by this waiter or by the
> > > +                * lock holder is done solely by the state variable. There
> > > is
> > > +                * no other ordering requirement.
> > >                  */
> > > -               cmpxchg(&pn->state, vcpu_halted, vcpu_running);
> > > +               cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_running);
> > >
> > >                 /*
> > >                  * If the locked flag is still not set after wakeup, it is
> > > a
> > > @@ -360,9 +372,12 @@ static void pv_kick_node(struct qspinlock *lock,
> > > struct mcs_spinlock *node)
> > >          * pv_wait_node(). If OTOH this fails, the vCPU was running and
> > > will
> > >          * observe its next->locked value and advance itself.
> > >          *
> > > -        * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
> > > +        * Matches with smp_store_mb() and cmpxchg_relaxed() in
> > > pv_wait_node().
> > > +        * A release barrier is used here to ensure that node->locked is
> > > +        * always set before changing the state. See comment in
> > > pv_wait_node().
> > >          */
> > > -       if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
> > > +       if (cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed)
> > > +                       != vcpu_halted)
> > >                 return;
> > >
> > > hi, Waiman
> > We can't use _release here, a full barrier is needed.
> > 
> > There is pv_kick_node vs pv_wait_head_or_lock
> > 
> > [w] l->locked = _Q_SLOW_VAL  //reordered here
> > 
> > if (READ_ONCE(pn->state) == vcpu_hashed) //False.
> > 
> >                    lp = (struct qspinlock **)1;
> > 
> > [STORE] pn->state = vcpu_hashed                        lp = pv_hash(lock,
> > pn);
> > pv_hash()                                                                if
> > (xchg(&l->locked, _Q_SLOW_VAL) == 0) // fasle, not unhashed.
> > 
> 
> This analysis is correct, but..
> 

Hmm.. looking at this again, I don't think this analysis is meaningful;
let's say the reordering didn't happen, we still get (similar to your
case):

						if (READ_ONCE(pn->state) == vcpu_hashed) // false.
						  lp = (struct qspinlock **)1;

cmpxchg(pn->state, vcpu_halted, vcpu_hashed);
						  if(!lp) {
						    lp = pv_hash(lock, pn);
WRITE_ONCE(l->locked, _Q_SLOW_VAL);
pv_hash();
						    if (xchg(&l->locked, _Q_SLOW_VAL) == 0) // false, not unhashed.

, right?

Actually, I think this or your case could not happen because we have

	cmpxchg(pn->state, vcpu_halted, vcpu_running);

in pv_wait_node(), which makes us either observe vcpu_hashed or set
pn->state to vcpu_running before pv_kick_node() tries to do the hash.

I may be missing something subtle, but does switching back to cmpxchg()
fix the RCU stall you observed?

Regards,
Boqun

> > Then the same lock has hashed twice but only unhashed once. So at last as
> > the hash table grows big, we hit RCU stall.
> > 
> > I hit RCU stall when I run netperf benchmark
> > 
> 
> how will a big hash table hit RCU stall? Do you have the call trace for
> your RCU stall?
> 
> Regards,
> Boqun
> 
> > thanks
> > xinhui
> > 
> > 
> > > --
> > > 1.8.3.1
> > >
> > >




* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2017-02-08  6:09     ` Boqun Feng
@ 2017-02-08  6:47       ` Pan Xinhui
  2017-02-08  6:48       ` Pan Xinhui
  2017-02-08  7:09       ` Pan Xinhui
  2 siblings, 0 replies; 21+ messages in thread
From: Pan Xinhui @ 2017-02-08  6:47 UTC (permalink / raw)
  To: Boqun Feng, Xinhui Pan
  Cc: Waiman Long, Peter Zijlstra, Ingo Molnar, linux-kernel



On 2017/2/8 14:09, Boqun Feng wrote:
> On Wed, Feb 08, 2017 at 12:05:40PM +0800, Boqun Feng wrote:
>> On Wed, Feb 08, 2017 at 11:39:10AM +0800, Xinhui Pan wrote:
>>> 2016-12-26 4:26 GMT+08:00 Waiman Long <longman@redhat.com>:
>>>
>>>> A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
>>>> relaxed versions to improve performance on architectures that use LL/SC.
>>>>
>>>> All the locking related cmpxchg's are replaced with the _acquire
>>>> variants:
>>>>  - pv_queued_spin_steal_lock()
>>>>  - trylock_clear_pending()
>>>>
>>>> The cmpxchg's related to hashing are replaced by either by the _release
>>>> or the _relaxed variants. See the inline comment for details.
>>>>
>>>> Signed-off-by: Waiman Long <longman@redhat.com>
>>>>
>>>>  v1->v2:
>>>>   - Add comments in changelog and code for the rationale of the change.
>>>>
>>>> ---
>>>>  kernel/locking/qspinlock_paravirt.h | 50 ++++++++++++++++++++++++------
>>>> -------
>>>>  1 file changed, 33 insertions(+), 17 deletions(-)
>>>>
>>>>
>>>> @@ -323,8 +329,14 @@ static void pv_wait_node(struct mcs_spinlock *node,
>>>> struct mcs_spinlock *prev)
>>>>                  * If pv_kick_node() changed us to vcpu_hashed, retain that
>>>>                  * value so that pv_wait_head_or_lock() knows to not also
>>>> try
>>>>                  * to hash this lock.
>>>> +                *
>>>> +                * The smp_store_mb() and control dependency above will
>>>> ensure
>>>> +                * that state change won't happen before that.
>>>> Synchronizing
>>>> +                * with pv_kick_node() wrt hashing by this waiter or by the
>>>> +                * lock holder is done solely by the state variable. There
>>>> is
>>>> +                * no other ordering requirement.
>>>>                  */
>>>> -               cmpxchg(&pn->state, vcpu_halted, vcpu_running);
>>>> +               cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_running);
>>>>
>>>>                 /*
>>>>                  * If the locked flag is still not set after wakeup, it is
>>>> a
>>>> @@ -360,9 +372,12 @@ static void pv_kick_node(struct qspinlock *lock,
>>>> struct mcs_spinlock *node)
>>>>          * pv_wait_node(). If OTOH this fails, the vCPU was running and
>>>> will
>>>>          * observe its next->locked value and advance itself.
>>>>          *
>>>> -        * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
>>>> +        * Matches with smp_store_mb() and cmpxchg_relaxed() in
>>>> pv_wait_node().
>>>> +        * A release barrier is used here to ensure that node->locked is
>>>> +        * always set before changing the state. See comment in
>>>> pv_wait_node().
>>>>          */
>>>> -       if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
>>>> +       if (cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed)
>>>> +                       != vcpu_halted)
>>>>                 return;
>>>>
>>>> hi, Waiman
>>> We can't use _release here, a full barrier is needed.
>>>
>>> There is pv_kick_node vs pv_wait_head_or_lock
>>>
>>> [w] l->locked = _Q_SLOW_VAL  //reordered here
>>>
>>> if (READ_ONCE(pn->state) == vcpu_hashed) //False.
>>>
>>>                    lp = (struct qspinlock **)1;
>>>
>>> [STORE] pn->state = vcpu_hashed                        lp = pv_hash(lock,
>>> pn);
>>> pv_hash()                                                                if
>>> (xchg(&l->locked, _Q_SLOW_VAL) == 0) // fasle, not unhashed.
>>>
>>
>> This analysis is correct, but..
>>
>
> Hmm.. look at this again, I don't think this analysis is meaningful,
> let's say the reordering didn't happen, we still got(similar to your
> case):
>
> 						if (READ_ONCE(pn->state) == vcpu_hashed) // false.
> 						  lp = (struct qspinlock **)1;
>
> cmpxchg(pn->state, vcpu_halted, vcpu_hashed);
> 						  if(!lp) {
> 						    lp = pv_hash(lock, pn);
> WRITE_ONCE(l->locked, _Q_SLOW_VAL);
> pv_hash();
> 						    if (xchg(&l->locked, _Q_SLOW_VAL) == 0) // fasle, not unhashed.
>
> , right?
>
> Actually, I think this or your case could not happen because we have
>
> 	cmpxchg(pn->state, vcpu_halted, vcpu_running);
>
> in pv_wait_node(), which makes us either observe vcpu_hashed or set
> pn->state to vcpu_running before pv_kick_node() trying to do the hash.
>
yep, there is still a race. We have to fix it.
so I think we must check the old value:

	old = xchg(&l->locked, _Q_SLOW_VAL);
	if (old == 0)
		do something;
	else if (old == _Q_SLOW_VAL)
		do something else;

> I may miss something subtle, but does switching back to cmpxchg() could
> fix the RCU stall you observed?
>
yes, just fixing this cmpxchg and then there is no RCU stall.

> Regards,
> Boqun
>
>>> Then the same lock has hashed twice but only unhashed once. So at last as
>>> the hash table grows big, we hit RCU stall.
>>>
>>> I hit RCU stall when I run netperf benchmark
>>>
>>
>> how will a big hash table hit RCU stall? Do you have the call trace for
>> your RCU stall?
>>
>> Regards,
>> Boqun
>>
>>> thanks
>>> xinhui
>>>
>>>
>>>> --
>>>> 1.8.3.1
>>>>
>>>>
>
>


* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2017-02-08  6:09     ` Boqun Feng
  2017-02-08  6:47       ` Pan Xinhui
@ 2017-02-08  6:48       ` Pan Xinhui
  2017-02-08  7:09       ` Pan Xinhui
  2 siblings, 0 replies; 21+ messages in thread
From: Pan Xinhui @ 2017-02-08  6:48 UTC (permalink / raw)
  To: Boqun Feng, Xinhui Pan
  Cc: Waiman Long, Peter Zijlstra, Ingo Molnar, linux-kernel



On 2017/2/8 14:09, Boqun Feng wrote:
> On Wed, Feb 08, 2017 at 12:05:40PM +0800, Boqun Feng wrote:
>> On Wed, Feb 08, 2017 at 11:39:10AM +0800, Xinhui Pan wrote:
>>> 2016-12-26 4:26 GMT+08:00 Waiman Long <longman@redhat.com>:
>>>
>>>> A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
>>>> relaxed versions to improve performance on architectures that use LL/SC.
>>>>
>>>> All the locking related cmpxchg's are replaced with the _acquire
>>>> variants:
>>>>  - pv_queued_spin_steal_lock()
>>>>  - trylock_clear_pending()
>>>>
>>>> The cmpxchg's related to hashing are replaced by either by the _release
>>>> or the _relaxed variants. See the inline comment for details.
>>>>
>>>> Signed-off-by: Waiman Long <longman@redhat.com>
>>>>
>>>>  v1->v2:
>>>>   - Add comments in changelog and code for the rationale of the change.
>>>>
>>>> ---
>>>>  kernel/locking/qspinlock_paravirt.h | 50 ++++++++++++++++++++++++------
>>>> -------
>>>>  1 file changed, 33 insertions(+), 17 deletions(-)
>>>>
>>>>
>>>> @@ -323,8 +329,14 @@ static void pv_wait_node(struct mcs_spinlock *node,
>>>> struct mcs_spinlock *prev)
>>>>                  * If pv_kick_node() changed us to vcpu_hashed, retain that
>>>>                  * value so that pv_wait_head_or_lock() knows to not also
>>>> try
>>>>                  * to hash this lock.
>>>> +                *
>>>> +                * The smp_store_mb() and control dependency above will
>>>> ensure
>>>> +                * that state change won't happen before that.
>>>> Synchronizing
>>>> +                * with pv_kick_node() wrt hashing by this waiter or by the
>>>> +                * lock holder is done solely by the state variable. There
>>>> is
>>>> +                * no other ordering requirement.
>>>>                  */
>>>> -               cmpxchg(&pn->state, vcpu_halted, vcpu_running);
>>>> +               cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_running);
>>>>
>>>>                 /*
>>>>                  * If the locked flag is still not set after wakeup, it is
>>>> a
>>>> @@ -360,9 +372,12 @@ static void pv_kick_node(struct qspinlock *lock,
>>>> struct mcs_spinlock *node)
>>>>          * pv_wait_node(). If OTOH this fails, the vCPU was running and
>>>> will
>>>>          * observe its next->locked value and advance itself.
>>>>          *
>>>> -        * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
>>>> +        * Matches with smp_store_mb() and cmpxchg_relaxed() in
>>>> pv_wait_node().
>>>> +        * A release barrier is used here to ensure that node->locked is
>>>> +        * always set before changing the state. See comment in
>>>> pv_wait_node().
>>>>          */
>>>> -       if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
>>>> +       if (cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed)
>>>> +                       != vcpu_halted)
>>>>                 return;
>>>>
>>>> hi, Waiman
>>> We can't use _release here, a full barrier is needed.
>>>
>>> There is pv_kick_node vs pv_wait_head_or_lock
>>>
>>> [w] l->locked = _Q_SLOW_VAL  //reordered here
>>>
>>> if (READ_ONCE(pn->state) == vcpu_hashed) //False.
>>>
>>>                    lp = (struct qspinlock **)1;
>>>
>>> [STORE] pn->state = vcpu_hashed                        lp = pv_hash(lock,
>>> pn);
>>> pv_hash()                                                                if
>>> (xchg(&l->locked, _Q_SLOW_VAL) == 0) // fasle, not unhashed.
>>>
>>
>> This analysis is correct, but..
>>
>
> Hmm.. look at this again, I don't think this analysis is meaningful,
> let's say the reordering didn't happen, we still got(similar to your
> case):
>
> 						if (READ_ONCE(pn->state) == vcpu_hashed) // false.
> 						  lp = (struct qspinlock **)1;
>
> cmpxchg(pn->state, vcpu_halted, vcpu_hashed);
> 						  if(!lp) {
> 						    lp = pv_hash(lock, pn);
> WRITE_ONCE(l->locked, _Q_SLOW_VAL);
> pv_hash();
> 						    if (xchg(&l->locked, _Q_SLOW_VAL) == 0) // fasle, not unhashed.
>
> , right?
>
> Actually, I think this or your case could not happen because we have
>
> 	cmpxchg(pn->state, vcpu_halted, vcpu_running);
>
> in pv_wait_node(), which makes us either observe vcpu_hashed or set
> pn->state to vcpu_running before pv_kick_node() trying to do the hash.
>
yep, there is still a race. We have to fix it.
so I think we must check the old value:

	old = xchg(&l->locked, _Q_SLOW_VAL);
	if (old == 0)
		do something;
	else if (old == _Q_SLOW_VAL)
		do something else;

> I may miss something subtle, but does switching back to cmpxchg() could
> fix the RCU stall you observed?
>
yes, just fixing this cmpxchg and then there is no RCU stall.

> Regards,
> Boqun
>
>>> Then the same lock has hashed twice but only unhashed once. So at last as
>>> the hash table grows big, we hit RCU stall.
>>>
>>> I hit RCU stall when I run netperf benchmark
>>>
>>
>> how will a big hash table hit RCU stall? Do you have the call trace for
>> your RCU stall?
>>
maybe too much time spent on hashing? I am not sure.

>> Regards,
>> Boqun
>>
>>> thanks
>>> xinhui
>>>
>>>
>>>> --
>>>> 1.8.3.1
>>>>
>>>>
>
>


* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2017-02-08  6:09     ` Boqun Feng
  2017-02-08  6:47       ` Pan Xinhui
  2017-02-08  6:48       ` Pan Xinhui
@ 2017-02-08  7:09       ` Pan Xinhui
  2017-02-08  7:15         ` Boqun Feng
  2 siblings, 1 reply; 21+ messages in thread
From: Pan Xinhui @ 2017-02-08  7:09 UTC (permalink / raw)
  To: Boqun Feng, Xinhui Pan
  Cc: Waiman Long, Peter Zijlstra, Ingo Molnar, linux-kernel



On 2017/2/8 14:09, Boqun Feng wrote:
> On Wed, Feb 08, 2017 at 12:05:40PM +0800, Boqun Feng wrote:
>> On Wed, Feb 08, 2017 at 11:39:10AM +0800, Xinhui Pan wrote:
>>> 2016-12-26 4:26 GMT+08:00 Waiman Long <longman@redhat.com>:
>>>
>>>> A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
>>>> relaxed versions to improve performance on architectures that use LL/SC.
>>>>
>>>> All the locking related cmpxchg's are replaced with the _acquire
>>>> variants:
>>>>  - pv_queued_spin_steal_lock()
>>>>  - trylock_clear_pending()
>>>>
>>>> The cmpxchg's related to hashing are replaced by either by the _release
>>>> or the _relaxed variants. See the inline comment for details.
>>>>
>>>> Signed-off-by: Waiman Long <longman@redhat.com>
>>>>
>>>>  v1->v2:
>>>>   - Add comments in changelog and code for the rationale of the change.
>>>>
>>>> ---
>>>>  kernel/locking/qspinlock_paravirt.h | 50 ++++++++++++++++++++++++------
>>>> -------
>>>>  1 file changed, 33 insertions(+), 17 deletions(-)
>>>>
>>>>
>>>> @@ -323,8 +329,14 @@ static void pv_wait_node(struct mcs_spinlock *node,
>>>> struct mcs_spinlock *prev)
>>>>                  * If pv_kick_node() changed us to vcpu_hashed, retain that
>>>>                  * value so that pv_wait_head_or_lock() knows to not also
>>>> try
>>>>                  * to hash this lock.
>>>> +                *
>>>> +                * The smp_store_mb() and control dependency above will
>>>> ensure
>>>> +                * that state change won't happen before that.
>>>> Synchronizing
>>>> +                * with pv_kick_node() wrt hashing by this waiter or by the
>>>> +                * lock holder is done solely by the state variable. There
>>>> is
>>>> +                * no other ordering requirement.
>>>>                  */
>>>> -               cmpxchg(&pn->state, vcpu_halted, vcpu_running);
>>>> +               cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_running);
>>>>
>>>>                 /*
>>>>                  * If the locked flag is still not set after wakeup, it is
>>>> a
>>>> @@ -360,9 +372,12 @@ static void pv_kick_node(struct qspinlock *lock,
>>>> struct mcs_spinlock *node)
>>>>          * pv_wait_node(). If OTOH this fails, the vCPU was running and
>>>> will
>>>>          * observe its next->locked value and advance itself.
>>>>          *
>>>> -        * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
>>>> +        * Matches with smp_store_mb() and cmpxchg_relaxed() in
>>>> pv_wait_node().
>>>> +        * A release barrier is used here to ensure that node->locked is
>>>> +        * always set before changing the state. See comment in
>>>> pv_wait_node().
>>>>          */
>>>> -       if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
>>>> +       if (cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed)
>>>> +                       != vcpu_halted)
>>>>                 return;
>>>>
>>>> hi, Waiman
>>> We can't use _release here, a full barrier is needed.
>>>
>>> There is pv_kick_node vs pv_wait_head_or_lock
>>>
>>> [w] l->locked = _Q_SLOW_VAL  //reordered here
>>>
>>> if (READ_ONCE(pn->state) == vcpu_hashed) //False.
>>>
>>>                    lp = (struct qspinlock **)1;
>>>
>>> [STORE] pn->state = vcpu_hashed                        lp = pv_hash(lock,
>>> pn);
>>> pv_hash()                                                                if
>>> (xchg(&l->locked, _Q_SLOW_VAL) == 0) // false, not unhashed.
>>>
>>
>> This analysis is correct, but..
>>
>
> Hmm.. look at this again, I don't think this analysis is meaningful,
> let's say the reordering didn't happen, we still got(similar to your
> case):
>
but there is
						cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_running);

> 						if (READ_ONCE(pn->state) == vcpu_hashed) // false.
> 						  lp = (struct qspinlock **)1;
>
> cmpxchg(pn->state, vcpu_halted, vcpu_hashed);
This cmpxchg will observe the cmpxchg_relaxed() above, so it will fail because pn->state is vcpu_running by then.
No bug here.

> 						  if(!lp) {
> 						    lp = pv_hash(lock, pn);
> WRITE_ONCE(l->locked, _Q_SLOW_VAL);
> pv_hash();
> 						    if (xchg(&l->locked, _Q_SLOW_VAL) == 0) // false, not unhashed.
>
> , right?

>
> Actually, I think this or your case could not happen because we have
>
> 	cmpxchg(pn->state, vcpu_halted, vcpu_running);
>
> in pv_wait_node(), which makes us either observe vcpu_hashed or set
> pn->state to vcpu_running before pv_kick_node() trying to do the hash.
>
> I may be missing something subtle, but would switching back to cmpxchg()
> fix the RCU stall you observed?
>
> Regards,
> Boqun
>
>>> Then the same lock has hashed twice but only unhashed once. So at last as
>>> the hash table grows big, we hit RCU stall.
>>>
>>> I hit RCU stall when I run netperf benchmark
>>>
>>
>> how will a big hash table hit RCU stall? Do you have the call trace for
>> your RCU stall?
>>
>> Regards,
>> Boqun
>>
>>> thanks
>>> xinhui
>>>
>>>
>>>> --
>>>> 1.8.3.1
>>>>
>>>>
>
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2017-02-08  7:09       ` Pan Xinhui
@ 2017-02-08  7:15         ` Boqun Feng
  0 siblings, 0 replies; 21+ messages in thread
From: Boqun Feng @ 2017-02-08  7:15 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: Xinhui Pan, Waiman Long, Peter Zijlstra, Ingo Molnar, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 4694 bytes --]

On Wed, Feb 08, 2017 at 03:09:33PM +0800, Pan Xinhui wrote:
> 
> 
> On 2017/2/8 14:09, Boqun Feng wrote:
> > On Wed, Feb 08, 2017 at 12:05:40PM +0800, Boqun Feng wrote:
> > > On Wed, Feb 08, 2017 at 11:39:10AM +0800, Xinhui Pan wrote:
> > > > 2016-12-26 4:26 GMT+08:00 Waiman Long <longman@redhat.com>:
> > > > 
> > > > > A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
> > > > > relaxed versions to improve performance on architectures that use LL/SC.
> > > > > 
> > > > > All the locking related cmpxchg's are replaced with the _acquire
> > > > > variants:
> > > > >  - pv_queued_spin_steal_lock()
> > > > >  - trylock_clear_pending()
> > > > > 
> > > > > The cmpxchg's related to hashing are replaced by either by the _release
> > > > > or the _relaxed variants. See the inline comment for details.
> > > > > 
> > > > > Signed-off-by: Waiman Long <longman@redhat.com>
> > > > > 
> > > > >  v1->v2:
> > > > >   - Add comments in changelog and code for the rationale of the change.
> > > > > 
> > > > > ---
> > > > >  kernel/locking/qspinlock_paravirt.h | 50 ++++++++++++++++++++++++------
> > > > > -------
> > > > >  1 file changed, 33 insertions(+), 17 deletions(-)
> > > > > 
> > > > > 
> > > > > @@ -323,8 +329,14 @@ static void pv_wait_node(struct mcs_spinlock *node,
> > > > > struct mcs_spinlock *prev)
> > > > >                  * If pv_kick_node() changed us to vcpu_hashed, retain that
> > > > >                  * value so that pv_wait_head_or_lock() knows to not also
> > > > > try
> > > > >                  * to hash this lock.
> > > > > +                *
> > > > > +                * The smp_store_mb() and control dependency above will
> > > > > ensure
> > > > > +                * that state change won't happen before that.
> > > > > Synchronizing
> > > > > +                * with pv_kick_node() wrt hashing by this waiter or by the
> > > > > +                * lock holder is done solely by the state variable. There
> > > > > is
> > > > > +                * no other ordering requirement.
> > > > >                  */
> > > > > -               cmpxchg(&pn->state, vcpu_halted, vcpu_running);
> > > > > +               cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_running);
> > > > > 
> > > > >                 /*
> > > > >                  * If the locked flag is still not set after wakeup, it is
> > > > > a
> > > > > @@ -360,9 +372,12 @@ static void pv_kick_node(struct qspinlock *lock,
> > > > > struct mcs_spinlock *node)
> > > > >          * pv_wait_node(). If OTOH this fails, the vCPU was running and
> > > > > will
> > > > >          * observe its next->locked value and advance itself.
> > > > >          *
> > > > > -        * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
> > > > > +        * Matches with smp_store_mb() and cmpxchg_relaxed() in
> > > > > pv_wait_node().
> > > > > +        * A release barrier is used here to ensure that node->locked is
> > > > > +        * always set before changing the state. See comment in
> > > > > pv_wait_node().
> > > > >          */
> > > > > -       if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
> > > > > +       if (cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed)
> > > > > +                       != vcpu_halted)
> > > > >                 return;
> > > > > 
> > > > > hi, Waiman
> > > > We can't use _release here, a full barrier is needed.
> > > > 
> > > > There is pv_kick_node vs pv_wait_head_or_lock
> > > > 
> > > > [w] l->locked = _Q_SLOW_VAL  //reordered here
> > > > 
> > > > if (READ_ONCE(pn->state) == vcpu_hashed) //False.
> > > > 
> > > >                    lp = (struct qspinlock **)1;
> > > > 
> > > > [STORE] pn->state = vcpu_hashed                        lp = pv_hash(lock,
> > > > pn);
> > > > pv_hash()                                                                if
> > > > (xchg(&l->locked, _Q_SLOW_VAL) == 0) // false, not unhashed.
> > > > 
> > > 
> > > This analysis is correct, but..
> > > 
> > 
> > Hmm.. look at this again, I don't think this analysis is meaningful,
> > let's say the reordering didn't happen, we still got(similar to your
> > case):
> > 
> but there is
> 						cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_running);
> 
> > 						if (READ_ONCE(pn->state) == vcpu_hashed) // false.
> > 						  lp = (struct qspinlock **)1;
> > 
> > cmpxchg(pn->state, vcpu_halted, vcpu_hashed);
> This cmpxchg will observe the cmpxchg_relaxed() above, so it will fail because pn->state is vcpu_running by then.
> No bug here.
> 

And we get the same guarantee if we use cmpxchg_release(), no?

Regards,
Boqun

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
       [not found]   ` <778926a5-cf9f-586b-6bc4-b9453d88aabb@redhat.com>
@ 2017-02-13  2:24     ` panxinhui
  2017-02-13  3:19       ` Boqun Feng
  2017-02-17 19:01       ` Waiman Long
  0 siblings, 2 replies; 21+ messages in thread
From: panxinhui @ 2017-02-13  2:24 UTC (permalink / raw)
  To: Waiman Long, Xinhui Pan
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Boqun Feng



On 2017/2/10 4:53 AM, Waiman Long wrote:
> On 02/07/2017 10:39 PM, Xinhui Pan wrote:
>>
>>
>> 2016-12-26 4:26 GMT+08:00 Waiman Long <longman@redhat.com <mailto:longman@redhat.com>>:
>>
>>     A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
>>     relaxed versions to improve performance on architectures that use LL/SC.
>>
>>     All the locking related cmpxchg's are replaced with the _acquire
>>     variants:
>>      - pv_queued_spin_steal_lock()
>>      - trylock_clear_pending()
>>
>>     The cmpxchg's related to hashing are replaced by either by the _release
>>     or the _relaxed variants. See the inline comment for details.
>>
>>     Signed-off-by: Waiman Long <longman@redhat.com <mailto:longman@redhat.com>>
>>
>>      v1->v2:
>>       - Add comments in changelog and code for the rationale of the change.
>>
>>     ---
>>      kernel/locking/qspinlock_paravirt.h | 50 ++++++++++++++++++++++++-------------
>>      1 file changed, 33 insertions(+), 17 deletions(-)
>>
>>
>>     @@ -323,8 +329,14 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
>>                      * If pv_kick_node() changed us to vcpu_hashed, retain that
>>                      * value so that pv_wait_head_or_lock() knows to not also try
>>                      * to hash this lock.
>>     +                *
>>     +                * The smp_store_mb() and control dependency above will ensure
>>     +                * that state change won't happen before that. Synchronizing
>>     +                * with pv_kick_node() wrt hashing by this waiter or by the
>>     +                * lock holder is done solely by the state variable. There is
>>     +                * no other ordering requirement.
>>                      */
>>     -               cmpxchg(&pn->state, vcpu_halted, vcpu_running);
>>     +               cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_running);
>>
>>                     /*
>>                      * If the locked flag is still not set after wakeup, it is a
>>     @@ -360,9 +372,12 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
>>              * pv_wait_node(). If OTOH this fails, the vCPU was running and will
>>              * observe its next->locked value and advance itself.
>>              *
>>     -        * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
>>     +        * Matches with smp_store_mb() and cmpxchg_relaxed() in pv_wait_node().
>>     +        * A release barrier is used here to ensure that node->locked is
>>     +        * always set before changing the state. See comment in pv_wait_node().
>>              */
>>     -       if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
>>     +       if (cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed)
>>     +                       != vcpu_halted)
>>                     return;
>>
>> hi, Waiman
>> We can't use _release here, a full barrier is needed.
>>
>> There is pv_kick_node vs pv_wait_head_or_lock
>>
>> [w] l->locked = _Q_SLOW_VAL  //reordered here    
>>                                                                                 if (READ_ONCE(pn->state) == vcpu_hashed) //False.
>>                                                                                                lp = (struct qspinlock **)1;
>>
>> [STORE] pn->state = vcpu_hashed                        lp = pv_hash(lock, pn);
>> pv_hash()                                                                if (xchg(&l->locked, _Q_SLOW_VAL) == 0) // false, not unhashed.
>>
>> Then the same lock has hashed twice but only unhashed once. So at last as the hash table grows big, we hit RCU stall.
>>
>> I hit RCU stall when I run netperf benchmark
>>
>> thanks
>> xinhui
>>
>>
>>     --
>>     1.8.3.1
>>
>>
> Yes, I know I am being too aggressive in this patch. I am going to tone it down a bit. I just haven't had time to run a performance test on a PPC system to verify the gain yet. I am planning to send an updated patch soon.
> 
Hi all,

I guess I have found the scenario that causes the RCU stall.

pv_wait_node					
										[L] pn->state // this load is reordered from cmpxchg_release.
	smp_store_mb(pn->state, vcpu_halted);					
	if (!READ_ONCE(node->locked))						
										arch_mcs_spin_unlock_contended(&next->locked);
										pv_kick_node
											[-L]cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed)
											//cmpxchg_release fails, so pn->state keeps its old value.
		pv_wait(&pn->state, vcpu_halted);
		//on PPC, it will not return until pn->state != vcpu_halted.
		
And when the RCU stall hit, I fired a BUG() and entered debug mode; it seems most CPUs are in pv_wait()...
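
The failure above is the classic store-buffering shape. As a stand-alone illustration (a user-space analogy with made-up names -- state and node_locked merely stand in for pn->state and node->locked -- not kernel code, and whether the bad outcome shows up in a given run depends on the hardware and compiler), C11 permits both sides to read the other's old value when the kicker only has release ordering:

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int state;	/* stands in for pn->state: 0 running, 1 halted, 2 hashed */
	static atomic_int node_locked;	/* stands in for node->locked */

	static void *waiter(void *arg)
	{
		(void)arg;
		/* Like smp_store_mb(pn->state, vcpu_halted): store followed by a full barrier. */
		atomic_store_explicit(&state, 1, memory_order_relaxed);
		atomic_thread_fence(memory_order_seq_cst);
		if (!atomic_load_explicit(&node_locked, memory_order_relaxed))
			puts("waiter: locked not seen, would pv_wait()");
		return NULL;
	}

	static void *kicker(void *arg)
	{
		int expected = 1;

		(void)arg;
		/* Like arch_mcs_spin_unlock_contended(&next->locked). */
		atomic_store_explicit(&node_locked, 1, memory_order_relaxed);
		/*
		 * Like cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed):
		 * release gives no store->load ordering, so this read of
		 * 'state' may be satisfied before the store above is visible.
		 */
		if (!atomic_compare_exchange_strong_explicit(&state, &expected, 2,
							     memory_order_release,
							     memory_order_relaxed))
			puts("kicker: state not halted, no hash and no kick");
		return NULL;
	}

	int main(void)
	{
		pthread_t w, k;

		pthread_create(&w, NULL, waiter, NULL);
		pthread_create(&k, NULL, kicker, NULL);
		pthread_join(w, NULL);
		pthread_join(k, NULL);
		/* Both messages printing in the same run is the lost-wakeup window. */
		return 0;
	}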

So the solution to this problem is simple: keep the cmpxchg as it is in pv_kick_node(); cmpxchg on PPC provides full barriers.
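
In code that is just the pre-patch line (a minimal sketch of the revert being suggested, nothing more):

	/*
	 * Fully ordered cmpxchg: its leading full barrier (sync on PPC)
	 * orders the earlier store to next->locked before the load of
	 * pn->state, closing the lost-wakeup window shown above.
	 */
	if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
		return;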

thanks
xinhui

> Cheers,
> Longman
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2017-02-13  2:24     ` panxinhui
@ 2017-02-13  3:19       ` Boqun Feng
  2017-02-17 19:01       ` Waiman Long
  1 sibling, 0 replies; 21+ messages in thread
From: Boqun Feng @ 2017-02-13  3:19 UTC (permalink / raw)
  To: panxinhui
  Cc: Waiman Long, Xinhui Pan, Peter Zijlstra, Ingo Molnar, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 5381 bytes --]

On Mon, Feb 13, 2017 at 10:24:38AM +0800, panxinhui wrote:
> 
> 
> On 2017/2/10 4:53 AM, Waiman Long wrote:
> > On 02/07/2017 10:39 PM, Xinhui Pan wrote:
> >>
> >>
> >> 2016-12-26 4:26 GMT+08:00 Waiman Long <longman@redhat.com <mailto:longman@redhat.com>>:
> >>
> >>     A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
> >>     relaxed versions to improve performance on architectures that use LL/SC.
> >>
> >>     All the locking related cmpxchg's are replaced with the _acquire
> >>     variants:
> >>      - pv_queued_spin_steal_lock()
> >>      - trylock_clear_pending()
> >>
> >>     The cmpxchg's related to hashing are replaced by either by the _release
> >>     or the _relaxed variants. See the inline comment for details.
> >>
> >>     Signed-off-by: Waiman Long <longman@redhat.com <mailto:longman@redhat.com>>
> >>
> >>      v1->v2:
> >>       - Add comments in changelog and code for the rationale of the change.
> >>
> >>     ---
> >>      kernel/locking/qspinlock_paravirt.h | 50 ++++++++++++++++++++++++-------------
> >>      1 file changed, 33 insertions(+), 17 deletions(-)
> >>
> >>
> >>     @@ -323,8 +329,14 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
> >>                      * If pv_kick_node() changed us to vcpu_hashed, retain that
> >>                      * value so that pv_wait_head_or_lock() knows to not also try
> >>                      * to hash this lock.
> >>     +                *
> >>     +                * The smp_store_mb() and control dependency above will ensure
> >>     +                * that state change won't happen before that. Synchronizing
> >>     +                * with pv_kick_node() wrt hashing by this waiter or by the
> >>     +                * lock holder is done solely by the state variable. There is
> >>     +                * no other ordering requirement.
> >>                      */
> >>     -               cmpxchg(&pn->state, vcpu_halted, vcpu_running);
> >>     +               cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_running);
> >>
> >>                     /*
> >>                      * If the locked flag is still not set after wakeup, it is a
> >>     @@ -360,9 +372,12 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
> >>              * pv_wait_node(). If OTOH this fails, the vCPU was running and will
> >>              * observe its next->locked value and advance itself.
> >>              *
> >>     -        * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
> >>     +        * Matches with smp_store_mb() and cmpxchg_relaxed() in pv_wait_node().
> >>     +        * A release barrier is used here to ensure that node->locked is
> >>     +        * always set before changing the state. See comment in pv_wait_node().
> >>              */
> >>     -       if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
> >>     +       if (cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed)
> >>     +                       != vcpu_halted)
> >>                     return;
> >>
> >> hi, Waiman
> >> We can't use _release here, a full barrier is needed.
> >>
> >> There is pv_kick_node vs pv_wait_head_or_lock
> >>
> >> [w] l->locked = _Q_SLOW_VAL  //reordered here    
> >>                                                                                 if (READ_ONCE(pn->state) == vcpu_hashed) //False.
> >>                                                                                                lp = (struct qspinlock **)1;
> >>
> >> [STORE] pn->state = vcpu_hashed                        lp = pv_hash(lock, pn);
> >> pv_hash()                                                                if (xchg(&l->locked, _Q_SLOW_VAL) == 0) // false, not unhashed.
> >>
> >> Then the same lock has hashed twice but only unhashed once. So at last as the hash table grows big, we hit RCU stall.
> >>
> >> I hit RCU stall when I run netperf benchmark
> >>
> >> thanks
> >> xinhui
> >>
> >>
> >>     --
> >>     1.8.3.1
> >>
> >>
> > Yes, I know I am being too aggressive in this patch. I am going to tone it down a bit. I just haven't had time to run a performance test on a PPC system to verify the gain yet. I am planning to send an updated patch soon.
> > 
> hi, All
> 
> I guess I have found the scenario that causes the RCU stall.
> 

Yes, I believe that's the case, as the comment before smp_store_mb() in
pv_wait_node states. Good show!
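
For reference, the comment in question reads roughly like this (quoted from memory from the kernel/locking/qspinlock_paravirt.h of this era, so treat the exact wording as approximate); the MB in the right-hand column is exactly what cmpxchg_release() no longer provides:

	/*
	 * Order pn->state vs pn->locked thusly:
	 *
	 * [S] pn->state = vcpu_halted    [S] next->locked = 1
	 *     MB                             MB
	 * [L] pn->locked               [RmW] pn->state = vcpu_hashed
	 *
	 * Matches the cmpxchg() from pv_kick_node().
	 */
	smp_store_mb(pn->state, vcpu_halted);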

Regards,
Boqun

> pv_wait_node					
> 										[L] pn->state // this load is reordered from cmpxchg_release.
> 	smp_store_mb(pn->state, vcpu_halted);					
> 	if (!READ_ONCE(node->locked))						
> 										arch_mcs_spin_unlock_contended(&next->locked);
> 										pv_kick_node
> 											[-L]cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed)
> 											//cmpxchg_release fails, so pn->state keeps its old value.
> 		pv_wait(&pn->state, vcpu_halted);
> 		//on PPC, it will not return until pn->state != vcpu_halted.
> 		
> And when the RCU stall hit, I fired a BUG() and entered debug mode; it seems most CPUs are in pv_wait()...
> 
> So the solution to this problem is simple: keep the cmpxchg as it is in pv_kick_node(); cmpxchg on PPC provides full barriers.
> 
> thanks
> xinhui
> 
> > Cheers,
> > Longman
> > 
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2017-02-13  2:24     ` panxinhui
  2017-02-13  3:19       ` Boqun Feng
@ 2017-02-17 19:01       ` Waiman Long
  1 sibling, 0 replies; 21+ messages in thread
From: Waiman Long @ 2017-02-17 19:01 UTC (permalink / raw)
  To: panxinhui, Xinhui Pan
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Boqun Feng

On 02/12/2017 09:24 PM, panxinhui wrote:
>
> On 2017/2/10 4:53 AM, Waiman Long wrote:
>> On 02/07/2017 10:39 PM, Xinhui Pan wrote:
>>>
>>> 2016-12-26 4:26 GMT+08:00 Waiman Long <longman@redhat.com <mailto:longman@redhat.com>>:
>>>
>>>     A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
>>>     relaxed versions to improve performance on architectures that use LL/SC.
>>>
>>>     All the locking related cmpxchg's are replaced with the _acquire
>>>     variants:
>>>      - pv_queued_spin_steal_lock()
>>>      - trylock_clear_pending()
>>>
>>>     The cmpxchg's related to hashing are replaced by either by the _release
>>>     or the _relaxed variants. See the inline comment for details.
>>>
>>>     Signed-off-by: Waiman Long <longman@redhat.com <mailto:longman@redhat.com>>
>>>
>>>      v1->v2:
>>>       - Add comments in changelog and code for the rationale of the change.
>>>
>>>     ---
>>>      kernel/locking/qspinlock_paravirt.h | 50 ++++++++++++++++++++++++-------------
>>>      1 file changed, 33 insertions(+), 17 deletions(-)
>>>
>>>
>>>     @@ -323,8 +329,14 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
>>>                      * If pv_kick_node() changed us to vcpu_hashed, retain that
>>>                      * value so that pv_wait_head_or_lock() knows to not also try
>>>                      * to hash this lock.
>>>     +                *
>>>     +                * The smp_store_mb() and control dependency above will ensure
>>>     +                * that state change won't happen before that. Synchronizing
>>>     +                * with pv_kick_node() wrt hashing by this waiter or by the
>>>     +                * lock holder is done solely by the state variable. There is
>>>     +                * no other ordering requirement.
>>>                      */
>>>     -               cmpxchg(&pn->state, vcpu_halted, vcpu_running);
>>>     +               cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_running);
>>>
>>>                     /*
>>>                      * If the locked flag is still not set after wakeup, it is a
>>>     @@ -360,9 +372,12 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
>>>              * pv_wait_node(). If OTOH this fails, the vCPU was running and will
>>>              * observe its next->locked value and advance itself.
>>>              *
>>>     -        * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
>>>     +        * Matches with smp_store_mb() and cmpxchg_relaxed() in pv_wait_node().
>>>     +        * A release barrier is used here to ensure that node->locked is
>>>     +        * always set before changing the state. See comment in pv_wait_node().
>>>              */
>>>     -       if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
>>>     +       if (cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed)
>>>     +                       != vcpu_halted)
>>>                     return;
>>>
>>> hi, Waiman
>>> We can't use _release here, a full barrier is needed.
>>>
>>> There is pv_kick_node vs pv_wait_head_or_lock
>>>
>>> [w] l->locked = _Q_SLOW_VAL  //reordered here    
>>>                                                                                 if (READ_ONCE(pn->state) == vcpu_hashed) //False.
>>>                                                                                                lp = (struct qspinlock **)1;
>>>
>>> [STORE] pn->state = vcpu_hashed                        lp = pv_hash(lock, pn);
>>> pv_hash()                                                                if (xchg(&l->locked, _Q_SLOW_VAL) == 0) // false, not unhashed.
>>>
>>> Then the same lock has hashed twice but only unhashed once. So at last as the hash table grows big, we hit RCU stall.
>>>
>>> I hit RCU stall when I run netperf benchmark
>>>
>>> thanks
>>> xinhui
>>>
>>>
>>>     --
>>>     1.8.3.1
>>>
>>>
>> Yes, I know I am being too aggressive in this patch. I am going to tone it down a bit. I just haven't had time to run a performance test on a PPC system to verify the gain yet. I am planning to send an updated patch soon.
>>
> hi, All
>
> I guess I have found the scenario that causes the RCU stall.
>
> pv_wait_node					
> 										[L] pn->state // this load is reordered from cmpxchg_release.
> 	smp_store_mb(pn->state, vcpu_halted);					
> 	if (!READ_ONCE(node->locked))						
> 										arch_mcs_spin_unlock_contended(&next->locked);
> 										pv_kick_node
> 											[-L]cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed)
> 											//cmpxchg_release fails, so pn->state keeps its old value.
> 		pv_wait(&pn->state, vcpu_halted);
> 		//on PPC, it will not return until pn->state != vcpu_halted.
> 		
> And when the RCU stall hit, I fired a BUG() and entered debug mode; it seems most CPUs are in pv_wait()...
>
> So the solution to this problem is simple: keep the cmpxchg as it is in pv_kick_node(); cmpxchg on PPC provides full barriers.
>
> thanks
> xinhui
>

Sorry for the late reply, and thanks for the explanation. I was not aware
that the load in cmpxchg_release() could be moved up like that. I will
need to be more careful when using the relaxed forms of cmpxchg next time.

Cheers,
Longman

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2017-02-17 19:01 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-25 20:26 [PATCH v2] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs Waiman Long
2016-12-26  5:50 ` Boqun Feng
2017-01-03 22:23   ` Waiman Long
2017-01-03 16:18 ` Peter Zijlstra
2017-01-03 22:07   ` Waiman Long
2017-01-04  9:41     ` Peter Zijlstra
2017-01-05  8:16       ` Pan Xinhui
2017-01-05  9:48         ` Peter Zijlstra
2017-01-05  9:51         ` Boqun Feng
2017-01-05 15:17         ` Waiman Long
2017-01-05 15:40           ` Boqun Feng
2017-01-05 15:30       ` Waiman Long
     [not found] ` <CAH4ORazqsCBA4G5paHtsp8PMfM=J3P6rvyR-53-ZLjn=7U6J0g@mail.gmail.com>
2017-02-08  4:05   ` Boqun Feng
2017-02-08  6:09     ` Boqun Feng
2017-02-08  6:47       ` Pan Xinhui
2017-02-08  6:48       ` Pan Xinhui
2017-02-08  7:09       ` Pan Xinhui
2017-02-08  7:15         ` Boqun Feng
     [not found]   ` <778926a5-cf9f-586b-6bc4-b9453d88aabb@redhat.com>
2017-02-13  2:24     ` panxinhui
2017-02-13  3:19       ` Boqun Feng
2017-02-17 19:01       ` Waiman Long

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).