When PREEMPT_RT_FULL does the spinlock -> rt_mutex substitution, the PI
chain code will (falsely) report a deadlock and BUG.

The problem is that we hold hb->lock (now an rt_mutex) while doing
task_blocks_on_rt_mutex() on the futex's pi_state::rtmutex. This, when
interleaved just right with futex_unlock_pi(), leads it to believe we
have an AB-BA deadlock:

  Task1 (holds rt_mutex,          Task2 (does FUTEX_LOCK_PI)
         does FUTEX_UNLOCK_PI)

                                  lock hb->lock
                                  lock rt_mutex (as per start_proxy)
  lock hb->lock

Which is a trivial AB-BA.

It is not an actual deadlock, because we won't be holding hb->lock by
the time we actually block on the rt_mutex, but the chainwalk code
doesn't know that.

To avoid this problem, do the same thing we do in futex_unlock_pi() and
drop hb->lock after acquiring wait_lock. This still fully serializes
against futex_unlock_pi(), since adding to the wait_list does the very
same lock dance, and removing it holds both locks.
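To make the handoff concrete, here is the pattern reduced to a
standalone userspace sketch; pthread mutexes stand in for hb->lock and
pi_mutex->wait_lock, and every name in it is illustrative rather than
kernel code:

        #include <pthread.h>

        static pthread_mutex_t hb_lock   = PTHREAD_MUTEX_INITIALIZER; /* stands in for hb->lock */
        static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER; /* stands in for pi_mutex->wait_lock */

        static void lock_pi_side(void)
        {
                pthread_mutex_lock(&hb_lock);
                /* ... work that must happen under hb->lock ... */

                /*
                 * The handoff: take wait_lock while still holding hb_lock,
                 * then drop hb_lock. There is no window in which we hold
                 * neither lock, so the unlock side (which performs the same
                 * dance) remains fully serialized against us; yet hb_lock
                 * is no longer held by the time we would block, so it never
                 * appears in the PI blocking chain.
                 */
                pthread_mutex_lock(&wait_lock);
                pthread_mutex_unlock(&hb_lock);

                /* ... enqueue the waiter and block, under wait_lock only ... */
                pthread_mutex_unlock(&wait_lock);
        }

Both sides take wait_lock before dropping hb_lock, so the nesting seen
by any deadlock detector is always hb_lock -> wait_lock, never the
reverse.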
Reported-by: Sebastian Andrzej Siewior
Suggested-by: Thomas Gleixner
Tested-by: Sebastian Andrzej Siewior
Signed-off-by: Peter Zijlstra (Intel)
---
 kernel/futex.c                  | 30 +++++++++++++++++-------
 kernel/locking/rtmutex.c        | 49 ++++++++++++++++++++++------------------
 kernel/locking/rtmutex_common.h |  3 ++
 3 files changed, 52 insertions(+), 30 deletions(-)

--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2654,20 +2654,33 @@ static int futex_lock_pi(u32 __user *uad
 		goto no_block;
 	}
 
+	rt_mutex_init_waiter(&rt_waiter);
+
 	/*
-	 * We must add ourselves to the rt_mutex waitlist while holding hb->lock
-	 * such that the hb and rt_mutex wait lists match.
+	 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
+	 * hold it while doing rt_mutex_start_proxy(), because then it will
+	 * include hb->lock in the blocking chain, even though we'll not in
+	 * fact hold it while blocking. This will lead it to report -EDEADLK
+	 * and BUG when futex_unlock_pi() interleaves with this.
+	 *
+	 * Therefore acquire wait_lock while holding hb->lock, but drop the
+	 * latter before calling rt_mutex_start_proxy_lock(). This still fully
+	 * serializes against futex_unlock_pi() as that does the exact same
+	 * lock handoff sequence.
 	 */
-	rt_mutex_init_waiter(&rt_waiter);
-	ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+	spin_unlock(q.lock_ptr);
+	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+
 	if (ret) {
 		if (ret == 1)
 			ret = 0;
+		spin_lock(q.lock_ptr);
 		goto no_block;
 	}
 
-	spin_unlock(q.lock_ptr);
-
 	if (unlikely(to))
 		hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
@@ -2680,6 +2693,9 @@ static int futex_lock_pi(u32 __user *uad
 	 * first acquire the hb->lock before removing the lock from the
 	 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
 	 * wait lists consistent.
+	 *
+	 * In particular, it is important that futex_unlock_pi() can not
+	 * observe this inconsistency.
 	 */
 	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
 		ret = 0;
@@ -2791,10 +2807,6 @@ static int futex_unlock_pi(u32 __user *u
 		get_pi_state(pi_state);
 
 		/*
-		 * Since modifying the wait_list is done while holding both
-		 * hb->lock and wait_lock, holding either is sufficient to
-		 * observe it.
-		 *
 		 * By taking wait_lock while still holding hb->lock, we ensure
 		 * there is no point where we hold neither; and therefore
 		 * wake_futex_pi() must observe a state consistent with what we
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1669,31 +1669,14 @@ void rt_mutex_proxy_unlock(struct rt_mut
 	rt_mutex_set_owner(lock, NULL);
 }
 
-/**
- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Returns:
- *  0 - task blocked on lock
- *  1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for FUTEX_REQUEUE_PI support.
- */
-int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
 			      struct rt_mutex_waiter *waiter,
 			      struct task_struct *task)
 {
 	int ret;
 
-	raw_spin_lock_irq(&lock->wait_lock);
-
-	if (try_to_take_rt_mutex(lock, task, NULL)) {
-		raw_spin_unlock_irq(&lock->wait_lock);
+	if (try_to_take_rt_mutex(lock, task, NULL))
 		return 1;
-	}
 
 	/* We enforce deadlock detection for futexes */
 	ret = task_blocks_on_rt_mutex(lock, waiter, task,
@@ -1712,12 +1695,36 @@ int rt_mutex_start_proxy_lock(struct rt_
 	if (unlikely(ret))
 		remove_waiter(lock, waiter);
 
-	raw_spin_unlock_irq(&lock->wait_lock);
-
 	debug_rt_mutex_print_deadlock(waiter);
 
 	return ret;
 }
 
+/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Returns:
+ *  0 - task blocked on lock
+ *  1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+			      struct rt_mutex_waiter *waiter,
+			      struct task_struct *task)
+{
+	int ret;
+
+	raw_spin_lock_irq(&lock->wait_lock);
+	ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+	raw_spin_unlock_irq(&lock->wait_lock);
+
+	return ret;
+}
+
 /**
  * rt_mutex_next_owner - return the next owner of the lock
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -104,6 +104,9 @@ extern void rt_mutex_init_proxy_locked(s
 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
 				  struct task_struct *proxy_owner);
 extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+				       struct rt_mutex_waiter *waiter,
+				       struct task_struct *task);
 extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
 				     struct rt_mutex_waiter *waiter,
 				     struct task_struct *task);
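For readers less familiar with the PI-futex protocol that
futex_lock_pi() and futex_unlock_pi() implement, a minimal userspace
sketch follows; the pi_lock()/pi_unlock() helper names are ours, and
robust/owner-died handling plus all error checking are deliberately
elided:

        #include <linux/futex.h>
        #include <stdatomic.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        /* The futex word holds the owner's TID; 0 means unlocked. */

        static void pi_lock(atomic_uint *uaddr)
        {
                unsigned int expected = 0;
                unsigned int tid = syscall(SYS_gettid);

                /* Fast path: 0 -> TID transition, no kernel entry. */
                if (atomic_compare_exchange_strong(uaddr, &expected, tid))
                        return;

                /* Contended: the kernel queues us on pi_state->pi_mutex. */
                syscall(SYS_futex, uaddr, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
        }

        static void pi_unlock(atomic_uint *uaddr)
        {
                unsigned int tid = syscall(SYS_gettid);

                /* Fast path: TID -> 0; fails once FUTEX_WAITERS is set. */
                if (atomic_compare_exchange_strong(uaddr, &tid, 0))
                        return;

                /* Waiters are queued: futex_unlock_pi() hands the lock over. */
                syscall(SYS_futex, uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
        }

The slow paths are exactly where the hunks above apply: FUTEX_LOCK_PI
enters futex_lock_pi() and blocks on pi_state->pi_mutex, while
FUTEX_UNLOCK_PI enters futex_unlock_pi(), which observes a consistent
waiter state thanks to the wait_lock handoff.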