From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1756547AbbDGPE5 (ORCPT ); Tue, 7 Apr 2015 11:04:57 -0400 Received: from www.linutronix.de ([62.245.132.108]:38677 "EHLO Galois.linutronix.de" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1755024AbbDGPEG (ORCPT ); Tue, 7 Apr 2015 11:04:06 -0400 From: Sebastian Andrzej Siewior To: linux-kernel@vger.kernel.org Cc: Peter Zijlstra , Ingo Molnar , Thomas Gleixner , Darren Hart , Steven Rostedt , fredrik.markstrom@windriver.com, Davidlohr Bueso , Manfred Spraul , Sebastian Andrzej Siewior Subject: [PATCH 1/3] futex: avoid double wake up in PI futex wait / wake on -RT Date: Tue, 7 Apr 2015 17:03:48 +0200 Message-Id: <1428419030-20030-2-git-send-email-bigeasy@linutronix.de> X-Mailer: git-send-email 2.1.4 In-Reply-To: <1428419030-20030-1-git-send-email-bigeasy@linutronix.de> References: <1428419030-20030-1-git-send-email-bigeasy@linutronix.de> X-Linutronix-Spam-Score: -1.0 X-Linutronix-Spam-Level: - X-Linutronix-Spam-Status: No , -1.0 points, 5.0 required, ALL_TRUSTED=-1,SHORTCIRCUIT=-0.0001,URIBL_BLOCKED=0.001 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org From: Thomas Gleixner The boosted priority is reverted after the unlock but before the futex_hash_bucket (hb) has been accessed. The result is that we boost the task, deboost the task, boost again for the hb lock, deboost again. 
A sched trace of this scenario looks like the following: | med_prio-93 sched_wakeup: comm=high_prio pid=92 prio=9 success=1 target_cpu=000 | med_prio-93 sched_switch: prev_comm=med_prio prev_pid=93 prev_prio=29 prev_state=R ==> next_comm=high_prio next_pid=92 next_prio=9 |high_prio-92 sched_pi_setprio: comm=low_prio pid=91 oldprio=120 newprio=9 |high_prio-92 sched_switch: prev_comm=high_prio prev_pid=92 prev_prio=9 prev_state=S ==> next_comm=low_prio next_pid=91 next_prio=9 | low_prio-91 sched_wakeup: comm=high_prio pid=92 prio=9 success=1 target_cpu=000 | low_prio-91 sched_pi_setprio: comm=low_prio pid=91 oldprio=9 newprio=120 | low_prio-91 sched_switch: prev_comm=low_prio prev_pid=91 prev_prio=120 prev_state=R+ ==> next_comm=high_prio next_pid=92 next_prio=9 |high_prio-92 sched_pi_setprio: comm=low_prio pid=91 oldprio=120 newprio=9 |high_prio-92 sched_switch: prev_comm=high_prio prev_pid=92 prev_prio=9 prev_state=D ==> next_comm=low_prio next_pid=91 next_prio=9 | low_prio-91 sched_wakeup: comm=high_prio pid=92 prio=9 success=1 target_cpu=000 | low_prio-91 sched_pi_setprio: comm=low_prio pid=91 oldprio=9 newprio=120 | low_prio-91 sched_switch: prev_comm=low_prio prev_pid=91 prev_prio=120 prev_state=R+ ==> next_comm=high_prio next_pid=92 next_prio=9 We see four sched_pi_setprio() invocations but ideally two would be enough. The patch tries to avoid the double wakeup by issuing the wake up once the hb lock is released. 
The same test case: | med_prio-21 sched_wakeup: comm=high_prio pid=20 prio=9 success=1 target_cpu=000 | med_prio-21 sched_switch: prev_comm=med_prio prev_pid=21 prev_prio=29 prev_state=R ==> next_comm=high_prio next_pid=20 next_prio=9 |high_prio-20 sched_pi_setprio: comm=low_prio pid=19 oldprio=120 newprio=9 |high_prio-20 sched_switch: prev_comm=high_prio prev_pid=20 prev_prio=9 prev_state=S ==> next_comm=low_prio next_pid=19 next_prio=9 | low_prio-19 sched_wakeup: comm=high_prio pid=20 prio=9 success=1 target_cpu=000 | low_prio-19 sched_pi_setprio: comm=low_prio pid=19 oldprio=9 newprio=120 | low_prio-19 sched_switch: prev_comm=low_prio prev_pid=19 prev_prio=120 prev_state=R+ ==> next_comm=high_prio next_pid=20 next_prio=9 only two sched_pi_setprio() invocations as one would expect and see without -RT. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior --- kernel/futex.c | 32 +++++++++++++++++++++++++++++--- kernel/locking/rtmutex.c | 39 ++++++++++++++++++++++++++++----------- kernel/locking/rtmutex_common.h | 4 ++++ 3 files changed, 61 insertions(+), 14 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 2579e407ff67..b38abe3573a8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1122,11 +1122,13 @@ static void wake_futex(struct futex_q *q) put_task_struct(p); } -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, + struct futex_hash_bucket *hb) { struct task_struct *new_owner; struct futex_pi_state *pi_state = this->pi_state; u32 uninitialized_var(curval), newval; + bool deboost; int ret = 0; if (!pi_state) @@ -1178,7 +1180,17 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) raw_spin_unlock_irq(&new_owner->pi_lock); raw_spin_unlock(&pi_state->pi_mutex.wait_lock); - rt_mutex_unlock(&pi_state->pi_mutex); + + deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex); + + /* + * We deboost after dropping 
hb->lock. That prevents a double + * wakeup on RT. + */ + spin_unlock(&hb->lock); + + if (deboost) + rt_mutex_adjust_prio(current); return 0; } @@ -2412,13 +2424,26 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) */ match = futex_top_waiter(hb, &key); if (match) { - ret = wake_futex_pi(uaddr, uval, match); + ret = wake_futex_pi(uaddr, uval, match, hb); + + /* + * In case of success wake_futex_pi dropped the hash + * bucket lock. + */ + if (!ret) + goto out_putkey; + /* * The atomic access to the futex value generated a * pagefault, so retry the user-access and the wakeup: */ if (ret == -EFAULT) goto pi_faulted; + + /* + * wake_futex_pi has detected invalid state. Tell user + * space. + */ goto out_unlock; } @@ -2439,6 +2464,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) out_unlock: spin_unlock(&hb->lock); +out_putkey: put_futex_key(&key); return ret; diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index b73279367087..9d858106ba0c 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -298,7 +298,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task) * of task. We do not use the spin_xx_mutex() variants here as we are * outside of the debug path.) */ -static void rt_mutex_adjust_prio(struct task_struct *task) +void rt_mutex_adjust_prio(struct task_struct *task) { unsigned long flags; @@ -955,8 +955,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* * Wake up the next waiter on the lock. * - * Remove the top waiter from the current tasks pi waiter list and - * wake it up. + * Remove the top waiter from the current tasks pi waiter list take a + * reference and return a pointer to it. * * Called with lock->wait_lock held. 
*/ @@ -1253,7 +1253,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) /* * Slow path to release a rt-mutex: */ -static void __sched +static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock) { raw_spin_lock(&lock->wait_lock); @@ -1296,7 +1296,7 @@ rt_mutex_slowunlock(struct rt_mutex *lock) while (!rt_mutex_has_waiters(lock)) { /* Drops lock->wait_lock ! */ if (unlock_rt_mutex_safe(lock) == true) - return; + return false; /* Relock the rtmutex and try again */ raw_spin_lock(&lock->wait_lock); } @@ -1309,8 +1309,7 @@ rt_mutex_slowunlock(struct rt_mutex *lock) raw_spin_unlock(&lock->wait_lock); - /* Undo pi boosting if necessary: */ - rt_mutex_adjust_prio(current); + return true; } /* @@ -1361,12 +1360,14 @@ rt_mutex_fasttrylock(struct rt_mutex *lock, static inline void rt_mutex_fastunlock(struct rt_mutex *lock, - void (*slowfn)(struct rt_mutex *lock)) + bool (*slowfn)(struct rt_mutex *lock)) { - if (likely(rt_mutex_cmpxchg(lock, current, NULL))) + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { rt_mutex_deadlock_account_unlock(current); - else - slowfn(lock); + } else if (slowfn(lock)) { + /* Undo pi boosting if necessary: */ + rt_mutex_adjust_prio(current); + } } /** @@ -1461,6 +1462,22 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock) EXPORT_SYMBOL_GPL(rt_mutex_unlock); /** + * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock + * @lock: the rt_mutex to be unlocked + * + * Returns: true/false indicating whether priority adjustment is + * required or not. 
+ */ +bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock) +{ + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { + rt_mutex_deadlock_account_unlock(current); + return false; + } + return rt_mutex_slowunlock(lock); +} + +/** * rt_mutex_destroy - mark a mutex unusable * @lock: the mutex to be destroyed * diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 855212501407..1bcf43099b16 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -132,6 +132,10 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter); extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); +extern bool rt_mutex_futex_unlock(struct rt_mutex *lock); + +extern void rt_mutex_adjust_prio(struct task_struct *task); + #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" #else -- 2.1.4