Date: Fri, 6 Nov 2015 15:50:05 +0100
From: Peter Zijlstra
To: Waiman Long
Cc: Ingo Molnar, Thomas Gleixner, "H. Peter Anvin", x86@kernel.org,
	linux-kernel@vger.kernel.org, Scott J Norton, Douglas Hatch,
	Davidlohr Bueso
Subject: Re: [PATCH tip/locking/core v9 5/6] locking/pvqspinlock: Allow 1 lock stealing attempt
Message-ID: <20151106145005.GU17308@twins.programming.kicks-ass.net>
References: <1446247597-61863-1-git-send-email-Waiman.Long@hpe.com>
	<1446247597-61863-6-git-send-email-Waiman.Long@hpe.com>
In-Reply-To: <1446247597-61863-6-git-send-email-Waiman.Long@hpe.com>

On Fri, Oct 30, 2015 at 07:26:36PM -0400, Waiman Long wrote:
> @@ -431,35 +432,44 @@ queue:
>  	 * sequentiality; this is because the set_locked() function below
>  	 * does not imply a full barrier.
>  	 *
> +	 * The PV pv_wait_head_lock function, if active, will acquire the lock
> +	 * and return a non-zero value. So we have to skip the
> +	 * smp_load_acquire() call. As the next PV queue head hasn't been
> +	 * designated yet, there is no way for the locked value to become
> +	 * _Q_SLOW_VAL. So both the redundant set_locked() and the
> +	 * atomic_cmpxchg_relaxed() calls will be safe. The cost of the
> +	 * redundant set_locked() call below should be negligible, too.
> +	 *
> +	 * If PV isn't active, 0 will be returned instead.
>  	 */
> -	pv_wait_head(lock, node);
> -	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
> -		cpu_relax();
> +	val = pv_wait_head_lock(lock, node);
> +	if (!val) {
> +		while ((val = smp_load_acquire(&lock->val.counter))
> +				& _Q_LOCKED_PENDING_MASK)
> +			cpu_relax();
> +		/*
> +		 * Claim the lock now:
> +		 *
> +		 * 0,0 -> 0,1
> +		 */
> +		set_locked(lock);
> +		val |= _Q_LOCKED_VAL;
> +	}
> 
>  	/*
>  	 * If the next pointer is defined, we are not tail anymore.
> -	 * In this case, claim the spinlock & release the MCS lock.
>  	 */
> -	if (next) {
> -		set_locked(lock);
> +	if (next)
>  		goto mcs_unlock;
> -	}
> 
>  	/*
> -	 * claim the lock:
> -	 *
> -	 * n,0,0 -> 0,0,1 : lock, uncontended
> -	 * *,0,0 -> *,0,1 : lock, contended
> -	 *
>  	 * If the queue head is the only one in the queue (lock value == tail),
> -	 * clear the tail code and grab the lock. Otherwise, we only need
> -	 * to grab the lock.
> +	 * we have to clear the tail code.
>  	 */
>  	for (;;) {
> -		if (val != tail) {
> -			set_locked(lock);
> +		if ((val & _Q_TAIL_MASK) != tail)
>  			break;
> -		}
> +
>  		/*
>  		 * The smp_load_acquire() call above has provided the necessary
>  		 * acquire semantics required for locking. At most two

*urgh*, last time we had:

+	if (pv_wait_head_or_steal())
+		goto stolen;

	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
		cpu_relax();

	...

+stolen:
	while (!(next = READ_ONCE(node->next)))
		cpu_relax();

	...

Now you completely overhaul the native code.. what happened?
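(Aside, for anyone reading along: the "0,0 -> 0,1" and "n,0,0 -> 0,0,1"
comments above are (queue tail, pending, locked) transitions of the lock
word. Quoting the layout from include/asm-generic/qspinlock_types.h from
memory, for the NR_CPUS < 16K configuration it is roughly:

	/*
	 * lock->val bitfields, NR_CPUS < 16K case:
	 *
	 *  0- 7: locked byte		-> _Q_LOCKED_MASK
	 *     8: pending		-> _Q_PENDING_MASK
	 *  9-15: not used
	 * 16-17: tail index
	 * 18-31: tail cpu (+1)		-> bits 16-31 together: _Q_TAIL_MASK
	 */

So the smp_load_acquire() loop spins until both the locked byte and the
pending bits are clear, and the new "(val & _Q_TAIL_MASK) != tail" test
checks whether we are still the last queued waiter before clearing the
tail code.)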
> -static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
> +static u32 pv_wait_head_lock(struct qspinlock *lock, struct mcs_spinlock *node)
>  {
>  	struct pv_node *pn = (struct pv_node *)node;
>  	struct __qspinlock *l = (void *)lock;
> @@ -276,11 +330,24 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
>  		lp = (struct qspinlock **)1;
> 
>  	for (;; waitcnt++) {
> +		/*
> +		 * Set the pending bit in the active lock spinning loop to
> +		 * disable lock stealing. However, the pending bit check in
> +		 * pv_queued_spin_trylock_unfair() and the setting/clearing
> +		 * of pending bit here aren't memory barriers. So a cmpxchg()
> +		 * is used to acquire the lock to be sure.
> +		 */
> +		set_pending(lock);

OK, so we mark ourselves 'pending' such that a new lock() will not steal
and is forced to queue behind us.

>  		for (loop = SPIN_THRESHOLD; loop; loop--) {
> -			if (!READ_ONCE(l->locked))
> -				return;
> +			if (!READ_ONCE(l->locked) &&
> +			    (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
> +				clear_pending(lock);
> +				goto gotlock;

Would not:

	cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)

make sense to avoid the clear_pending() call?

> +			}
>  			cpu_relax();
>  		}
> +		clear_pending(lock);
> +
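Something like this, perhaps -- completely untested sketch, and it assumes
the _Q_PENDING_BITS == 8 configuration where struct __qspinlock exposes the
16-bit locked_pending field covering both the locked and pending bytes:

		for (loop = SPIN_THRESHOLD; loop; loop--) {
			/*
			 * Go from (pending set, unlocked) straight to
			 * (locked, pending clear) in one atomic op, so the
			 * success path needs no separate clear_pending().
			 */
			if (!READ_ONCE(l->locked) &&
			    cmpxchg(&l->locked_pending, _Q_PENDING_VAL,
				    _Q_LOCKED_VAL) == _Q_PENDING_VAL)
				goto gotlock;
			cpu_relax();
		}
		clear_pending(lock);

The cmpxchg() can only succeed while our pending bit is the sole thing set,
which is exactly the state set_pending() put us in, and it still provides
the ordering the comment above is worried about.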