Re: [RFC][PATCH 3/3] locking/qspinlock: Optimize for x86

From: Peter Zijlstra <peterz@infradead.org>
To: Will Deacon <will.deacon@arm.com>
Cc: mingo@kernel.org, linux-kernel@vger.kernel.org,
	longman@redhat.com, andrea.parri@amarulasolutions.com,
	tglx@linutronix.de
Subject: Re: [RFC][PATCH 3/3] locking/qspinlock: Optimize for x86
Date: Tue, 2 Oct 2018 16:14:24 +0200	[thread overview]
Message-ID: <20181002141424.GG26858@hirez.programming.kicks-ass.net> (raw)
In-Reply-To: <20181002131952.GD16422@arm.com>

On Tue, Oct 02, 2018 at 02:19:53PM +0100, Will Deacon wrote:
> On Mon, Oct 01, 2018 at 10:00:28PM +0200, Peter Zijlstra wrote:

> > Let me draw a picture of that..
> > 
> > 
> >   CPU0		CPU1		CPU2		CPU3
> > 
> > 0)						lock
> > 						  trylock -> (0,0,1)
> > 1)lock
> >     trylock /* fail */
> > 
> > 2)		lock
> > 		  trylock /* fail */
> > 		  tas-pending -> (0,1,1)
> > 		  wait-locked
> > 
> > 3)				lock
> > 				  trylock /* fail */
> > 				  tas-pending /* fail */
> > 
> > 4)						unlock -> (0,1,0)
> > 		  clr_pnd_set_lck -> (0,0,1)
> > 		  unlock -> (0,0,0)
> > 
> > 5)  tas-pending -> (0,1,0)
> >     read-val -> (0,1,0)
> > 6)  clr_pnd_set_lck -> (0,0,1)
> > 7)				  xchg_tail -> (n,0,1)
> > 				  load_acquire <- (n,0,0) (from-4)
> > 8)				  cmpxchg /* fail */
> > 				  set_locked()
> > 
> > > Is there something I'm missing that means this can't happen? I suppose
> > > cacheline granularity ends up giving serialisation between (4) and (7),
> > > but I'd *much* prefer not to rely on that because it feels horribly
> > > fragile.
> > 
> > Well, on x86 atomics are fully ordered, so the xchg_tail() does in
> > fact imply smp_mb(), and that should order it sufficiently for that
> > not to happen, I think.
> 
> Hmm, does that actually help, though? I still think you're relying on the
> cache-coherence protocol to serialise the xchg() on pending before the
> xchg_tail(), which I think is fragile because they don't actually overlap.

Maybe; I suspect TSO makes it work, but see the alternative below.

---

--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -6,9 +6,29 @@
 #include <asm/cpufeature.h>
 #include <asm-generic/qspinlock_types.h>
 #include <asm/paravirt.h>
+#include <asm/rmwcc.h>
 
 #define _Q_PENDING_LOOPS	(1 << 9)
 
+static __always_inline bool __test_and_set_pending(struct qspinlock *lock)
+{	/* Locked BTS on the pending bit; returns its previous value (CF). */
+	GEN_BINARY_RMWcc(LOCK_PREFIX "btsl",
+			 lock->val.counter, "Ir", _Q_PENDING_OFFSET, "%0", c);
+}
+
+#define queued_set_pending_fetch_acquire queued_set_pending_fetch_acquire
+static inline u32 queued_set_pending_fetch_acquire(struct qspinlock *lock)
+{
+	u32 val = 0;
+	/* Pending in the result is the old bit value from the test-and-set. */
+	if (__test_and_set_pending(lock))
+		val |= _Q_PENDING_VAL;
+	/* Other bits come from a separate read, not atomic with the bit set. */
+	val |= atomic_read(&lock->val) & ~_Q_PENDING_MASK;
+
+	return val;
+}
+
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
 extern void __pv_init_lock_hash(void);
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -232,6 +232,20 @@ static __always_inline u32 xchg_tail(str
 #endif /* _Q_PENDING_BITS == 8 */
 
 /**
+ * queued_set_pending_fetch_acquire - set pending, return the old lock value
+ * @lock: Pointer to queued spinlock structure
+ * Return: The previous lock value, as fetched by the acquire-ordered OR
+ *
+ * *,*,* -> *,1,*
+ */
+#ifndef queued_set_pending_fetch_acquire	/* skipped if already defined */
+static __always_inline u32 queued_set_pending_fetch_acquire(struct qspinlock *lock)
+{
+	return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+}
+#endif /* queued_set_pending_fetch_acquire */
+
+/**
  * set_locked - Set the lock bit and own the lock
  * @lock: Pointer to queued spinlock structure
  *
@@ -328,7 +342,7 @@ void queued_spin_lock_slowpath(struct qs
 	 *
 	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
 	 */
-	val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+	val = queued_set_pending_fetch_acquire(lock);
 
 	/*
 	 * If we observe contention, there is a concurrent locker.