From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1756384AbbE2PUR (ORCPT ); Fri, 29 May 2015 11:20:17 -0400 Received: from cantor2.suse.de ([195.135.220.15]:39150 "EHLO mx2.suse.de" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1756210AbbE2PUL (ORCPT ); Fri, 29 May 2015 11:20:11 -0400 Message-ID: <1432912794.21393.1.camel@stgolabs.net> Subject: Re: [PATCH -rfc 4/4] locking/rtmutex: Support spin on owner (osq) From: Davidlohr Bueso To: Thomas Gleixner Cc: Peter Zijlstra , Ingo Molnar , Steven Rostedt , Mike Galbraith , "Paul E. McKenney" , Sebastian Andrzej Siewior , linux-kernel@vger.kernel.org Date: Fri, 29 May 2015 08:19:54 -0700 In-Reply-To: <1432056298-18738-5-git-send-email-dave@stgolabs.net> References: <1432056298-18738-1-git-send-email-dave@stgolabs.net> <1432056298-18738-5-git-send-email-dave@stgolabs.net> Content-Type: text/plain; charset="UTF-8" X-Mailer: Evolution 3.12.11 Mime-Version: 1.0 Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Btw, I just realized I had sent a stale patch where the osq was not being initialized, fixed below. Thanks! From: Davidlohr Bueso Subject: [PATCH v2] locking/rtmutex: Support spin on owner (osq) Similar to what we have in other locks, particularly regular mutexes, the idea is that as long as the owner is running, there is a fair chance it'll release the lock soon, and thus a task trying to acquire the rtmutex will be better off spinning instead of blocking immediately after the fastpath. Conditions to stop spinning and enter the slowpath are simple: (1) Upon need_resched() (2) Current lock owner blocks Spinning tasks' performance is further enhanced by queuing in cancelable mcs (osq). Because rtmutexes track the lock owner atomically, we can extend the fastpath to continue polling on the lock owner via cmpxchg(lock->owner, NULL, current). 
This is a conservative approach, such that upon entering the slowpath, we force all lock spinners (including future ones) to serialize on the wait_lock as mark_rt_mutex_waiters() is called, unconditionally. CPU0 CPU1 optimistic_spin() (failed) try_to_take_rt_mutex() mark_rt_mutex_waiters() optimistic_spin() (failed cmpxchg) spin_lock(&lock->wait_lock) As such we check for RT_MUTEX_HAS_WAITERS bit0 (rt_mutex_has_waiters_fast()). This allows priority boosting to take precedence over spinning, as otherwise we could starve a higher priority queued-up task (ie: top waiter) if spinners constantly steal the lock. Another alternative would be to stop spinning when we should really wakeup a higher priority waiter, but of course we do not hold the wait_lock, so that is racy. Signed-off-by: Davidlohr Bueso --- Changes from v1: - Fix Kconfig debug - Init lock->osq in rt mutex initializer include/linux/percpu-rwsem.h | 14 +++++ include/linux/rtmutex.h | 4 ++ kernel/Kconfig.locks | 4 ++ kernel/locking/rtmutex.c | 136 ++++++++++++++++++++++++++++++++++++++++++- mm/vmscan.c | 14 ++--- 5 files changed, 164 insertions(+), 8 deletions(-) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 266110d..05b4737 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -45,6 +45,20 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem) */ } +static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem) +{ + bool ret = false; + + preempt_disable(); + if (likely(rcu_sync_is_idle(&sem->rss))) { + __this_cpu_inc(*sem->refcount); + ret = true; + rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_); + } + preempt_enable(); + return ret; +} + static inline void percpu_up_read(struct percpu_rw_semaphore *sem) { /* diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 1abba5c..b5e85ca 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -15,6 +15,7 @@ #include #include #include 
+#include extern int max_lock_depth; /* for sysctl */ @@ -31,6 +32,9 @@ struct rt_mutex { struct rb_root waiters; struct rb_node *waiters_leftmost; struct task_struct *owner; +#ifdef CONFIG_RT_MUTEX_SPIN_ON_OWNER + struct optimistic_spin_queue osq; /* Spinner MCS lock */ +#endif #ifdef CONFIG_DEBUG_RT_MUTEXES int save_state; const char *name, *file; diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index ebdb004..8602871 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -227,6 +227,10 @@ config MUTEX_SPIN_ON_OWNER def_bool y depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW +config RT_MUTEX_SPIN_ON_OWNER + def_bool y + depends on SMP && RT_MUTEXES && !DEBUG_RT_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW + config RWSEM_SPIN_ON_OWNER def_bool y depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 81aad32..c868bd4 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -63,6 +63,20 @@ static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); } +/* + * Lockless alternative to rt_mutex_has_waiters() as we do not need the + * wait_lock to check if we are in, for instance, a transitional state + * after calling mark_rt_mutex_waiters(). 
+ */ +static inline bool rt_mutex_has_waiters_fast(struct rt_mutex *lock) +{ + unsigned long val = (unsigned long)lock->owner; + + if (!val) + return false; + return val & RT_MUTEX_HAS_WAITERS; +} + static void fixup_rt_mutex_waiters(struct rt_mutex *lock) { if (!rt_mutex_has_waiters(lock)) @@ -1152,6 +1166,121 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock, } } +#ifdef CONFIG_RT_MUTEX_SPIN_ON_OWNER +static noinline +bool rt_mutex_spin_on_owner(struct rt_mutex *lock, struct task_struct *owner) +{ + bool ret = true; + + rcu_read_lock(); + while (rt_mutex_owner(lock) == owner) { + /* + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking lock->owner still matches owner. If that fails, + * owner might point to freed memory. If it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + + if (!owner->on_cpu || need_resched()) { + ret = false; + break; + } + + cpu_relax_lowlatency(); + } + rcu_read_unlock(); + + return ret; +} + +/* + * Initial check for entering the mutex spinning loop + */ +static inline bool rt_mutex_can_spin_on_owner(struct rt_mutex *lock) +{ + struct task_struct *owner; + /* default return to spin: if no owner, the lock is free */ + int ret = true; + + if (need_resched()) + return 0; + + rcu_read_lock(); + owner = rt_mutex_owner(lock); + if (owner) + ret = owner->on_cpu; + rcu_read_unlock(); + + return ret; +} + +static bool rt_mutex_optimistic_spin(struct rt_mutex *lock) +{ + bool taken = false; + + preempt_disable(); + + if (!rt_mutex_can_spin_on_owner(lock)) + goto done; + /* + * In order to avoid a stampede of mutex spinners trying to + * acquire the mutex all at once, the spinners need to take a + * MCS (queued) lock first before spinning on the owner field. 
+ */ + if (!osq_lock(&lock->osq)) + goto done; + + while (true) { + struct task_struct *owner; + + /* + * When another task is competing for the lock in the + * slowpath (either transitional or not), avoid the + * cmpxchg and immediately block. If the situation is + * later fixed by clearing the waiters bit, future + * tasks that attempt to take the rtmutex can spin. + */ + if (rt_mutex_has_waiters_fast(lock)) + goto done_unlock; + + /* + * If there's an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = rt_mutex_owner(lock); + if (owner && !rt_mutex_spin_on_owner(lock, owner)) + break; + + /* Try to acquire the lock, if it is unlocked. */ + if (rt_mutex_cmpxchg(lock, NULL, current)) { + taken = true; + goto done_unlock; + } + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax_lowlatency(); + } + +done_unlock: + osq_unlock(&lock->osq); +done: + preempt_enable(); + return taken; +} + +#else +static bool rt_mutex_optimistic_spin(struct rt_mutex *lock) +{ + return false; +} +#endif + /* * Slow path lock function: */ @@ -1163,6 +1292,9 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, struct rt_mutex_waiter waiter; int ret = 0; + if (rt_mutex_optimistic_spin(lock)) + return ret; + debug_rt_mutex_init_waiter(&waiter); RB_CLEAR_NODE(&waiter.pi_tree_entry); RB_CLEAR_NODE(&waiter.tree_entry); @@ -1500,7 +1632,9 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name) raw_spin_lock_init(&lock->wait_lock); lock->waiters = RB_ROOT; lock->waiters_leftmost = NULL; - +#ifdef CONFIG_RT_MUTEX_SPIN_ON_OWNER + osq_lock_init(&lock->osq); +#endif debug_rt_mutex_init(lock, name); } EXPORT_SYMBOL_GPL(__rt_mutex_init); diff --git a/mm/vmscan.c b/mm/vmscan.c index 5e8eadd..f2da14d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -36,7 +36,7 @@ #include #include 
#include -#include +#include #include #include #include @@ -211,9 +211,9 @@ int register_shrinker(struct shrinker *shrinker) if (!shrinker->nr_deferred) return -ENOMEM; - down_write(&shrinker_rwsem); + percpu_down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); - up_write(&shrinker_rwsem); + percpu_up_write(&shrinker_rwsem); return 0; } EXPORT_SYMBOL(register_shrinker); @@ -223,9 +223,9 @@ EXPORT_SYMBOL(register_shrinker); */ void unregister_shrinker(struct shrinker *shrinker) { - down_write(&shrinker_rwsem); + percpu_down_write(&shrinker_rwsem); list_del(&shrinker->list); - up_write(&shrinker_rwsem); + percpu_up_write(&shrinker_rwsem); kfree(shrinker->nr_deferred); } EXPORT_SYMBOL(unregister_shrinker); @@ -386,7 +386,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (nr_scanned == 0) nr_scanned = SWAP_CLUSTER_MAX; - if (!down_read_trylock(&shrinker_rwsem)) { + if (!percpu_down_read_trylock(&shrinker_rwsem)) { /* * If we would return 0, our callers would understand that we * have nothing else to shrink and give up trying. By returning @@ -413,7 +413,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); } - up_read(&shrinker_rwsem); + percpu_up_read(&shrinker_rwsem); out: cond_resched(); return freed; -- 2.1.4