From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1756384AbbE2PUR (ORCPT ); Fri, 29 May 2015 11:20:17 -0400 Received: from cantor2.suse.de ([195.135.220.15]:39150 "EHLO mx2.suse.de" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1756210AbbE2PUL (ORCPT ); Fri, 29 May 2015 11:20:11 -0400 Message-ID: <1432912794.21393.1.camel@stgolabs.net> Subject: Re: [PATCH -rfc 4/4] locking/rtmutex: Support spin on owner (osq) From: Davidlohr Bueso To: Thomas Gleixner Cc: Peter Zijlstra , Ingo Molnar , Steven Rostedt , Mike Galbraith , "Paul E. McKenney" , Sebastian Andrzej Siewior , linux-kernel@vger.kernel.org Date: Fri, 29 May 2015 08:19:54 -0700 In-Reply-To: <1432056298-18738-5-git-send-email-dave@stgolabs.net> References: <1432056298-18738-1-git-send-email-dave@stgolabs.net> <1432056298-18738-5-git-send-email-dave@stgolabs.net> Content-Type: text/plain; charset="UTF-8" X-Mailer: Evolution 3.12.11 Mime-Version: 1.0 Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Btw, I just realized I had sent a stale patch where the osq was not being initialized, fixed below. Thanks! From: Davidlohr Bueso Subject: [PATCH v2] locking/rtmutex: Support spin on owner (osq) Similar to what we have in other locks, particularly regular mutexes, the idea is that as long as the owner is running, there is a fair chance it'll release the lock soon, and thus a task trying to acquire the rtmutex will be better off spinning instead of blocking immediately after the fastpath. Conditions to stop spinning and enter the slowpath are simple: (1) Upon need_resched() (2) Current lock owner blocks Spinning tasks' performance is further enhanced by queuing in cancelable mcs (osq). Because rtmutexes track the lock owner atomically, we can extend the fastpath to continue polling on the lock owner via cmpxchg(lock->owner, NULL, current). 
This is a conservative approach, such that upon entering the slowpath, we force all lock spinners (including future ones) to serialize on the wait_lock as mark_rt_mutex_waiters() is called, unconditionally. CPU0 CPU1 optimistic_spin() (failed) try_to_take_rt_mutex() mark_rt_mutex_waiters() optimistic_spin() (failed cmpxchg) spin_lock(&lock->wait_lock) As such we check for RT_MUTEX_HAS_WAITERS bit0 (rt_mutex_has_waiters_fast()). This allows priority boosting to take precedence over spinning, as otherwise we could starve a higher priority queued-up task (ie: top waiter) if spinners constantly steal the lock. Another alternative would be to stop spinning when we should really wakeup a higher priority waiter, but of course we do not hold the wait_lock, so that is racy. Signed-off-by: Davidlohr Bueso --- Changes from v1: - Fix Kconfig debug - Init lock->osq in rt mutex initializer include/linux/percpu-rwsem.h | 14 +++++ include/linux/rtmutex.h | 4 ++ kernel/Kconfig.locks | 4 ++ kernel/locking/rtmutex.c | 136 ++++++++++++++++++++++++++++++++++++++++++- mm/vmscan.c | 14 ++--- 5 files changed, 164 insertions(+), 8 deletions(-) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 266110d..05b4737 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -45,6 +45,20 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem) */ } +static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem) +{ + bool ret = false; + + preempt_disable(); + if (likely(rcu_sync_is_idle(&sem->rss))) { + __this_cpu_inc(*sem->refcount); + ret = true; + rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_); + } + preempt_enable(); + return ret; +} + static inline void percpu_up_read(struct percpu_rw_semaphore *sem) { /* diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 1abba5c..b5e85ca 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -15,6 +15,7 @@ #include #include #include 
+#include extern int max_lock_depth; /* for sysctl */ @@ -31,6 +32,9 @@ struct rt_mutex { struct rb_root waiters; struct rb_node *waiters_leftmost; struct task_struct *owner; +#ifdef CONFIG_RT_MUTEX_SPIN_ON_OWNER + struct optimistic_spin_queue osq; /* Spinner MCS lock */ +#endif #ifdef CONFIG_DEBUG_RT_MUTEXES int save_state; const char *name, *file; diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index ebdb004..8602871 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -227,6 +227,10 @@ config MUTEX_SPIN_ON_OWNER def_bool y depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW +config RT_MUTEX_SPIN_ON_OWNER + def_bool y + depends on SMP && RT_MUTEXES && !DEBUG_RT_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW + config RWSEM_SPIN_ON_OWNER def_bool y depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 81aad32..c868bd4 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -63,6 +63,20 @@ static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); } +/* + * Lockless alternative to rt_mutex_has_waiters() as we do not need the + * wait_lock to check if we are in, for instance, a transitional state + * after calling mark_rt_mutex_waiters(). 
+ */ +static inline bool rt_mutex_has_waiters_fast(struct rt_mutex *lock) +{ + unsigned long val = (unsigned long)lock->owner; + + if (!val) + return false; + return val & RT_MUTEX_HAS_WAITERS; +} + static void fixup_rt_mutex_waiters(struct rt_mutex *lock) { if (!rt_mutex_has_waiters(lock)) @@ -1152,6 +1166,121 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock, } } +#ifdef CONFIG_RT_MUTEX_SPIN_ON_OWNER +static noinline +bool rt_mutex_spin_on_owner(struct rt_mutex *lock, struct task_struct *owner) +{ + bool ret = true; + + rcu_read_lock(); + while (rt_mutex_owner(lock) == owner) { + /* + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking lock->owner still matches owner. If that fails, + * owner might point to freed memory. If it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + + if (!owner->on_cpu || need_resched()) { + ret = false; + break; + } + + cpu_relax_lowlatency(); + } + rcu_read_unlock(); + + return ret; +} + +/* + * Initial check for entering the mutex spinning loop + */ +static inline bool rt_mutex_can_spin_on_owner(struct rt_mutex *lock) +{ + struct task_struct *owner; + /* default return to spin: if no owner, the lock is free */ + int ret = true; + + if (need_resched()) + return 0; + + rcu_read_lock(); + owner = rt_mutex_owner(lock); + if (owner) + ret = owner->on_cpu; + rcu_read_unlock(); + + return ret; +} + +static bool rt_mutex_optimistic_spin(struct rt_mutex *lock) +{ + bool taken = false; + + preempt_disable(); + + if (!rt_mutex_can_spin_on_owner(lock)) + goto done; + /* + * In order to avoid a stampede of mutex spinners trying to + * acquire the mutex all at once, the spinners need to take a + * MCS (queued) lock first before spinning on the owner field. 
+ */ + if (!osq_lock(&lock->osq)) + goto done; + + while (true) { + struct task_struct *owner; + + /* + * When another task is competing for the lock in the + * slowpath (either transitional or not), avoid the + * cmpxchg and immediately block. If the situation is + * later fixed by clearing the waiters bit, future + * tasks that attempt to take the rtmutex can spin. + */ + if (rt_mutex_has_waiters_fast(lock)) + goto done_unlock; + + /* + * If there's an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = rt_mutex_owner(lock); + if (owner && !rt_mutex_spin_on_owner(lock, owner)) + break; + + /* Try to acquire the lock, if it is unlocked. */ + if (rt_mutex_cmpxchg(lock, NULL, current)) { + taken = true; + goto done_unlock; + } + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax_lowlatency(); + } + +done_unlock: + osq_unlock(&lock->osq); +done: + preempt_enable(); + return taken; +} + +#else +static bool rt_mutex_optimistic_spin(struct rt_mutex *lock) +{ + return false; +} +#endif + /* * Slow path lock function: */ @@ -1163,6 +1292,9 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, struct rt_mutex_waiter waiter; int ret = 0; + if (rt_mutex_optimistic_spin(lock)) + return ret; + debug_rt_mutex_init_waiter(&waiter); RB_CLEAR_NODE(&waiter.pi_tree_entry); RB_CLEAR_NODE(&waiter.tree_entry); @@ -1500,7 +1632,9 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name) raw_spin_lock_init(&lock->wait_lock); lock->waiters = RB_ROOT; lock->waiters_leftmost = NULL; - +#ifdef CONFIG_RT_MUTEX_SPIN_ON_OWNER + osq_lock_init(&lock->osq); +#endif debug_rt_mutex_init(lock, name); } EXPORT_SYMBOL_GPL(__rt_mutex_init); diff --git a/mm/vmscan.c b/mm/vmscan.c index 5e8eadd..f2da14d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -36,7 +36,7 @@ #include #include 
#include -#include +#include #include #include #include @@ -211,9 +211,9 @@ int register_shrinker(struct shrinker *shrinker) if (!shrinker->nr_deferred) return -ENOMEM; - down_write(&shrinker_rwsem); + percpu_down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); - up_write(&shrinker_rwsem); + percpu_up_write(&shrinker_rwsem); return 0; } EXPORT_SYMBOL(register_shrinker); @@ -223,9 +223,9 @@ EXPORT_SYMBOL(register_shrinker); */ void unregister_shrinker(struct shrinker *shrinker) { - down_write(&shrinker_rwsem); + percpu_down_write(&shrinker_rwsem); list_del(&shrinker->list); - up_write(&shrinker_rwsem); + percpu_up_write(&shrinker_rwsem); kfree(shrinker->nr_deferred); } EXPORT_SYMBOL(unregister_shrinker); @@ -386,7 +386,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (nr_scanned == 0) nr_scanned = SWAP_CLUSTER_MAX; - if (!down_read_trylock(&shrinker_rwsem)) { + if (!percpu_down_read_trylock(&shrinker_rwsem)) { /* * If we would return 0, our callers would understand that we * have nothing else to shrink and give up trying. By returning @@ -413,7 +413,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); } - up_read(&shrinker_rwsem); + percpu_up_read(&shrinker_rwsem); out: cond_resched(); return freed; -- 2.1.4