[PATCH 6/5] locking/qspinlock: Use acquire/release semantics

From: Davidlohr Bueso <dave@stgolabs.net>
To: Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@kernel.org>,
	Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Will Deacon <will.deacon@arm.com>,
	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
	linux-kernel@vger.kernel.org, Waiman Long <Waiman.Long@hpe.com>,
	Davidlohr Bueso <dave@stgolabs.net>
Subject: [PATCH 6/5] locking/qspinlock: Use acquire/release semantics
Date: Tue, 13 Oct 2015 10:04:23 -0700	[thread overview]
Message-ID: <20151013170423.GB3052@linux-uzut.site> (raw)
In-Reply-To: <1443643395-17016-1-git-send-email-dave@stgolabs.net>

As of 654672d4ba1 (locking/atomics: Add _{acquire|release|relaxed}()
variants of some atomic operations) and 6d79ef2d30e (locking, asm-generic:
Add _{relaxed|acquire|release}() variants for 'atomic_long_t'), weakly
ordered archs can benefit from more relaxed use of barriers when locking
and unlocking, instead of regular full barrier semantics. While currently
only arm64 supports such optimizations, updating the corresponding locking
primitives lets other archs benefit immediately as well, once the
necessary machinery is implemented, of course.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---

Hi Waiman, it seems you never sent an updated patch regarding $TOPIC[1].
So I rebased, rewrote most of the comments and eliminated the x86
changes. What do you think? I'd like to get everything updated in
kernel/locking/ by 4.4, and qspinlock is the only primitive left.

x86 compile tested only.

Thanks!

[1] https://lkml.org/lkml/2015/9/11/540

  include/asm-generic/qspinlock.h | 10 ++++------
  kernel/locking/qspinlock.c      | 25 +++++++++++++++++++++----
  2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index e2aadbc..799f960 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -62,7 +62,7 @@ static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
  static __always_inline int queued_spin_trylock(struct qspinlock *lock)
  {
	if (!atomic_read(&lock->val) &&
-	   (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) == 0))
+	   (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0))
		return 1;
	return 0;
  }
@@ -77,7 +77,7 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock)
  {
	u32 val;

-	val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL);
+	val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL);
	if (likely(val == 0))
		return;
	queued_spin_lock_slowpath(lock, val);
@@ -90,10 +90,8 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock)
   */
  static __always_inline void queued_spin_unlock(struct qspinlock *lock)
  {
-	/*
-	 * smp_mb__before_atomic() in order to guarantee release semantics
-	 */
-	smp_mb__before_atomic_dec();
+	/* at minimum, guarantee RELEASE semantics */
+	smp_mb__before_atomic();
	atomic_sub(_Q_LOCKED_VAL, &lock->val);
  }
  #endif
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 87e9ce6a..6355d8a 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -176,7 +176,13 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
  {
	struct __qspinlock *l = (void *)lock;

-	return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+	/*
+	 * Use RELEASE semantics to ensure that nothing is
+	 * re-ordered out before we are done initializing the
+	 * new mcs node. Once the new tail is set, all is fair.
+	 */
+	return (u32)xchg_release(&l->tail,
+				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
  }

  #else /* _Q_PENDING_BITS == 8 */
@@ -208,7 +214,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)

	for (;;) {
		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
-		old = atomic_cmpxchg(&lock->val, val, new);
+		/*
+		 * Use RELEASE semantics to ensure that nothing is
+		 * re-ordered out before we are done initializing the
+		 * new mcs node. Once the new tail is set, all is fair.
+		 */
+		old = atomic_cmpxchg_release(&lock->val, val, new);
		if (old == val)
			break;

@@ -319,7 +330,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
		if (val == new)
			new |= _Q_PENDING_VAL;

-		old = atomic_cmpxchg(&lock->val, val, new);
+		old = atomic_cmpxchg_acquire(&lock->val, val, new);
		if (old == val)
			break;

@@ -426,7 +437,13 @@ queue:
			set_locked(lock);
			break;
		}
-		old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+
+		/*
+		 * The above smp_load_acquire() provides us with the necessary
+		 * ACQUIRE semantics required for locking. We can, therefore,
+		 * fully relax the barriers in this case.
+		 */
+		old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
		if (old == val)
			goto release;	/* No contention */

--
2.1.4