All of lore.kernel.org
 help / color / mirror / Atom feed
From: Steven Rostedt <rostedt@goodmis.org>
To: linux-kernel@vger.kernel.org,
	linux-rt-users <linux-rt-users@vger.kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>,
	Carsten Emde <C.Emde@osadl.org>,
	Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
	John Kacur <jkacur@redhat.com>, Julia Cartwright <julia@ni.com>,
	Daniel Wagner <wagi@monom.org>, Tom Zanussi <zanussi@kernel.org>
Subject: [PATCH RT 18/30] futex: Make the futex_hash_bucket spinlock_t again and bring back its old state
Date: Thu, 23 Jan 2020 15:39:48 -0500	[thread overview]
Message-ID: <20200123203944.869001366@goodmis.org> (raw)
In-Reply-To: 20200123203930.646725253@goodmis.org

4.19.94-rt39-rc2 stable review patch.
If anyone has any objections, please let me know.

------------------

From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

[ Upstream commit 954ad80c23edfe71f4e8ce70b961eac884320c3a ]

This is an all-in-one patch that reverts the patches:
   futex: Make the futex_hash_bucket lock raw
   futex: Delay deallocation of pi_state

and adds back the old patches we had:
   futex: workaround migrate_disable/enable in different context
   rtmutex: Handle the various new futex race conditions
   futex: Fix bug on when a requeued RT task times out
   futex: Ensure lock/unlock symetry versus pi_lock and hash bucket lock

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/futex.c                  | 230 ++++++++++++++++++--------------
 kernel/locking/rtmutex.c        |  65 ++++++++-
 kernel/locking/rtmutex_common.h |   3 +
 3 files changed, 194 insertions(+), 104 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 0b8cff8d9162..e815cf542b82 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -243,7 +243,7 @@ struct futex_q {
 	struct plist_node list;
 
 	struct task_struct *task;
-	raw_spinlock_t *lock_ptr;
+	spinlock_t *lock_ptr;
 	union futex_key key;
 	struct futex_pi_state *pi_state;
 	struct rt_mutex_waiter *rt_waiter;
@@ -264,7 +264,7 @@ static const struct futex_q futex_q_init = {
  */
 struct futex_hash_bucket {
 	atomic_t waiters;
-	raw_spinlock_t lock;
+	spinlock_t lock;
 	struct plist_head chain;
 } ____cacheline_aligned_in_smp;
 
@@ -825,13 +825,13 @@ static void get_pi_state(struct futex_pi_state *pi_state)
  * Drops a reference to the pi_state object and frees or caches it
  * when the last reference is gone.
  */
-static struct futex_pi_state *__put_pi_state(struct futex_pi_state *pi_state)
+static void put_pi_state(struct futex_pi_state *pi_state)
 {
 	if (!pi_state)
-		return NULL;
+		return;
 
 	if (!atomic_dec_and_test(&pi_state->refcount))
-		return NULL;
+		return;
 
 	/*
 	 * If pi_state->owner is NULL, the owner is most probably dying
@@ -851,7 +851,9 @@ static struct futex_pi_state *__put_pi_state(struct futex_pi_state *pi_state)
 		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 	}
 
-	if (!current->pi_state_cache) {
+	if (current->pi_state_cache) {
+		kfree(pi_state);
+	} else {
 		/*
 		 * pi_state->list is already empty.
 		 * clear pi_state->owner.
@@ -860,30 +862,6 @@ static struct futex_pi_state *__put_pi_state(struct futex_pi_state *pi_state)
 		pi_state->owner = NULL;
 		atomic_set(&pi_state->refcount, 1);
 		current->pi_state_cache = pi_state;
-		pi_state = NULL;
-	}
-	return pi_state;
-}
-
-static void put_pi_state(struct futex_pi_state *pi_state)
-{
-	kfree(__put_pi_state(pi_state));
-}
-
-static void put_pi_state_atomic(struct futex_pi_state *pi_state,
-				struct list_head *to_free)
-{
-	if (__put_pi_state(pi_state))
-		list_add(&pi_state->list, to_free);
-}
-
-static void free_pi_state_list(struct list_head *to_free)
-{
-	struct futex_pi_state *p, *next;
-
-	list_for_each_entry_safe(p, next, to_free, list) {
-		list_del(&p->list);
-		kfree(p);
 	}
 }
 
@@ -900,7 +878,6 @@ void exit_pi_state_list(struct task_struct *curr)
 	struct futex_pi_state *pi_state;
 	struct futex_hash_bucket *hb;
 	union futex_key key = FUTEX_KEY_INIT;
-	LIST_HEAD(to_free);
 
 	if (!futex_cmpxchg_enabled)
 		return;
@@ -934,7 +911,7 @@ void exit_pi_state_list(struct task_struct *curr)
 		}
 		raw_spin_unlock_irq(&curr->pi_lock);
 
-		raw_spin_lock(&hb->lock);
+		spin_lock(&hb->lock);
 		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 		raw_spin_lock(&curr->pi_lock);
 		/*
@@ -944,8 +921,10 @@ void exit_pi_state_list(struct task_struct *curr)
 		if (head->next != next) {
 			/* retain curr->pi_lock for the loop invariant */
 			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
-			raw_spin_unlock(&hb->lock);
-			put_pi_state_atomic(pi_state, &to_free);
+			raw_spin_unlock_irq(&curr->pi_lock);
+			spin_unlock(&hb->lock);
+			raw_spin_lock_irq(&curr->pi_lock);
+			put_pi_state(pi_state);
 			continue;
 		}
 
@@ -956,7 +935,7 @@ void exit_pi_state_list(struct task_struct *curr)
 
 		raw_spin_unlock(&curr->pi_lock);
 		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-		raw_spin_unlock(&hb->lock);
+		spin_unlock(&hb->lock);
 
 		rt_mutex_futex_unlock(&pi_state->pi_mutex);
 		put_pi_state(pi_state);
@@ -964,8 +943,6 @@ void exit_pi_state_list(struct task_struct *curr)
 		raw_spin_lock_irq(&curr->pi_lock);
 	}
 	raw_spin_unlock_irq(&curr->pi_lock);
-
-	free_pi_state_list(&to_free);
 }
 
 #endif
@@ -1452,7 +1429,7 @@ static void __unqueue_futex(struct futex_q *q)
 {
 	struct futex_hash_bucket *hb;
 
-	if (WARN_ON_SMP(!q->lock_ptr || !raw_spin_is_locked(q->lock_ptr))
+	if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
 	    || WARN_ON(plist_node_empty(&q->list)))
 		return;
 
@@ -1580,21 +1557,21 @@ static inline void
 double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 {
 	if (hb1 <= hb2) {
-		raw_spin_lock(&hb1->lock);
+		spin_lock(&hb1->lock);
 		if (hb1 < hb2)
-			raw_spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
+			spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
 	} else { /* hb1 > hb2 */
-		raw_spin_lock(&hb2->lock);
-		raw_spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
+		spin_lock(&hb2->lock);
+		spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
 	}
 }
 
 static inline void
 double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 {
-	raw_spin_unlock(&hb1->lock);
+	spin_unlock(&hb1->lock);
 	if (hb1 != hb2)
-		raw_spin_unlock(&hb2->lock);
+		spin_unlock(&hb2->lock);
 }
 
 /*
@@ -1622,7 +1599,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 	if (!hb_waiters_pending(hb))
 		goto out_put_key;
 
-	raw_spin_lock(&hb->lock);
+	spin_lock(&hb->lock);
 
 	plist_for_each_entry_safe(this, next, &hb->chain, list) {
 		if (match_futex (&this->key, &key)) {
@@ -1641,7 +1618,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 		}
 	}
 
-	raw_spin_unlock(&hb->lock);
+	spin_unlock(&hb->lock);
 	wake_up_q(&wake_q);
 out_put_key:
 	put_futex_key(&key);
@@ -1948,7 +1925,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	struct futex_hash_bucket *hb1, *hb2;
 	struct futex_q *this, *next;
 	DEFINE_WAKE_Q(wake_q);
-	LIST_HEAD(to_free);
 
 	if (nr_wake < 0 || nr_requeue < 0)
 		return -EINVAL;
@@ -2176,6 +2152,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 				requeue_pi_wake_futex(this, &key2, hb2);
 				drop_count++;
 				continue;
+			} else if (ret == -EAGAIN) {
+				/*
+				 * Waiter was woken by timeout or
+				 * signal and has set pi_blocked_on to
+				 * PI_WAKEUP_INPROGRESS before we
+				 * tried to enqueue it on the rtmutex.
+				 */
+				this->pi_state = NULL;
+				put_pi_state(pi_state);
+				continue;
 			} else if (ret) {
 				/*
 				 * rt_mutex_start_proxy_lock() detected a
@@ -2186,7 +2172,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 				 * object.
 				 */
 				this->pi_state = NULL;
-				put_pi_state_atomic(pi_state, &to_free);
+				put_pi_state(pi_state);
 				/*
 				 * We stop queueing more waiters and let user
 				 * space deal with the mess.
@@ -2203,7 +2189,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	 * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
 	 * need to drop it here again.
 	 */
-	put_pi_state_atomic(pi_state, &to_free);
+	put_pi_state(pi_state);
 
 out_unlock:
 	double_unlock_hb(hb1, hb2);
@@ -2224,7 +2210,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 out_put_key1:
 	put_futex_key(&key1);
 out:
-	free_pi_state_list(&to_free);
 	return ret ? ret : task_count;
 }
 
@@ -2248,7 +2233,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 
 	q->lock_ptr = &hb->lock;
 
-	raw_spin_lock(&hb->lock); /* implies smp_mb(); (A) */
+	spin_lock(&hb->lock); /* implies smp_mb(); (A) */
 	return hb;
 }
 
@@ -2256,7 +2241,7 @@ static inline void
 queue_unlock(struct futex_hash_bucket *hb)
 	__releases(&hb->lock)
 {
-	raw_spin_unlock(&hb->lock);
+	spin_unlock(&hb->lock);
 	hb_waiters_dec(hb);
 }
 
@@ -2295,7 +2280,7 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 	__releases(&hb->lock)
 {
 	__queue_me(q, hb);
-	raw_spin_unlock(&hb->lock);
+	spin_unlock(&hb->lock);
 }
 
 /**
@@ -2311,41 +2296,41 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
  */
 static int unqueue_me(struct futex_q *q)
 {
-	raw_spinlock_t *lock_ptr;
+	spinlock_t *lock_ptr;
 	int ret = 0;
 
 	/* In the common case we don't take the spinlock, which is nice. */
 retry:
 	/*
-	 * q->lock_ptr can change between this read and the following
-	 * raw_spin_lock. Use READ_ONCE to forbid the compiler from reloading
-	 * q->lock_ptr and optimizing lock_ptr out of the logic below.
+	 * q->lock_ptr can change between this read and the following spin_lock.
+	 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
+	 * optimizing lock_ptr out of the logic below.
 	 */
 	lock_ptr = READ_ONCE(q->lock_ptr);
 	if (lock_ptr != NULL) {
-		raw_spin_lock(lock_ptr);
+		spin_lock(lock_ptr);
 		/*
 		 * q->lock_ptr can change between reading it and
-		 * raw_spin_lock(), causing us to take the wrong lock.  This
+		 * spin_lock(), causing us to take the wrong lock.  This
 		 * corrects the race condition.
 		 *
 		 * Reasoning goes like this: if we have the wrong lock,
 		 * q->lock_ptr must have changed (maybe several times)
-		 * between reading it and the raw_spin_lock().  It can
-		 * change again after the raw_spin_lock() but only if it was
-		 * already changed before the raw_spin_lock().  It cannot,
+		 * between reading it and the spin_lock().  It can
+		 * change again after the spin_lock() but only if it was
+		 * already changed before the spin_lock().  It cannot,
 		 * however, change back to the original value.  Therefore
 		 * we can detect whether we acquired the correct lock.
 		 */
 		if (unlikely(lock_ptr != q->lock_ptr)) {
-			raw_spin_unlock(lock_ptr);
+			spin_unlock(lock_ptr);
 			goto retry;
 		}
 		__unqueue_futex(q);
 
 		BUG_ON(q->pi_state);
 
-		raw_spin_unlock(lock_ptr);
+		spin_unlock(lock_ptr);
 		ret = 1;
 	}
 
@@ -2361,16 +2346,13 @@ static int unqueue_me(struct futex_q *q)
 static void unqueue_me_pi(struct futex_q *q)
 	__releases(q->lock_ptr)
 {
-	struct futex_pi_state *ps;
-
 	__unqueue_futex(q);
 
 	BUG_ON(!q->pi_state);
-	ps = __put_pi_state(q->pi_state);
+	put_pi_state(q->pi_state);
 	q->pi_state = NULL;
 
-	raw_spin_unlock(q->lock_ptr);
-	kfree(ps);
+	spin_unlock(q->lock_ptr);
 }
 
 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
@@ -2503,7 +2485,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 	 */
 handle_err:
 	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-	raw_spin_unlock(q->lock_ptr);
+	spin_unlock(q->lock_ptr);
 
 	switch (err) {
 	case -EFAULT:
@@ -2521,7 +2503,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 		break;
 	}
 
-	raw_spin_lock(q->lock_ptr);
+	spin_lock(q->lock_ptr);
 	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 
 	/*
@@ -2617,7 +2599,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 	/*
 	 * The task state is guaranteed to be set before another task can
 	 * wake it. set_current_state() is implemented using smp_store_mb() and
-	 * queue_me() calls raw_spin_unlock() upon completion, both serializing
+	 * queue_me() calls spin_unlock() upon completion, both serializing
 	 * access to the hash list and forcing another memory barrier.
 	 */
 	set_current_state(TASK_INTERRUPTIBLE);
@@ -2908,7 +2890,15 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 	 * before __rt_mutex_start_proxy_lock() is done.
 	 */
 	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
-	raw_spin_unlock(q.lock_ptr);
+	/*
+	 * the migrate_disable() here disables migration in the in_atomic() fast
+	 * path which is enabled again in the following spin_unlock(). We have
+	 * one migrate_disable() pending in the slow-path which is reversed
+	 * after the raw_spin_unlock_irq() where we leave the atomic context.
+	 */
+	migrate_disable();
+
+	spin_unlock(q.lock_ptr);
 	/*
 	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
 	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
@@ -2916,6 +2906,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 	 */
 	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
 	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+	migrate_enable();
 
 	if (ret) {
 		if (ret == 1)
@@ -2929,7 +2920,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
 
 cleanup:
-	raw_spin_lock(q.lock_ptr);
+	spin_lock(q.lock_ptr);
 	/*
 	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
 	 * first acquire the hb->lock before removing the lock from the
@@ -3030,7 +3021,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 		return ret;
 
 	hb = hash_futex(&key);
-	raw_spin_lock(&hb->lock);
+	spin_lock(&hb->lock);
 
 	/*
 	 * Check waiters first. We do not trust user space values at
@@ -3064,10 +3055,19 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 		 * rt_waiter. Also see the WARN in wake_futex_pi().
 		 */
 		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
-		raw_spin_unlock(&hb->lock);
+		/*
+		 * Magic trickery for now to make the RT migrate disable
+		 * logic happy. The following spin_unlock() happens with
+		 * interrupts disabled so the internal migrate_enable()
+		 * won't undo the migrate_disable() which was issued when
+		 * locking hb->lock.
+		 */
+		migrate_disable();
+		spin_unlock(&hb->lock);
 
 		/* drops pi_state->pi_mutex.wait_lock */
 		ret = wake_futex_pi(uaddr, uval, pi_state);
+		migrate_enable();
 
 		put_pi_state(pi_state);
 
@@ -3103,7 +3103,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 	 * owner.
 	 */
 	if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
-		raw_spin_unlock(&hb->lock);
+		spin_unlock(&hb->lock);
 		switch (ret) {
 		case -EFAULT:
 			goto pi_faulted;
@@ -3123,7 +3123,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 	ret = (curval == uval) ? 0 : -EAGAIN;
 
 out_unlock:
-	raw_spin_unlock(&hb->lock);
+	spin_unlock(&hb->lock);
 out_putkey:
 	put_futex_key(&key);
 	return ret;
@@ -3239,7 +3239,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct futex_pi_state *pi_state = NULL;
 	struct rt_mutex_waiter rt_waiter;
-	struct futex_hash_bucket *hb;
+	struct futex_hash_bucket *hb, *hb2;
 	union futex_key key2 = FUTEX_KEY_INIT;
 	struct futex_q q = futex_q_init;
 	int res, ret;
@@ -3297,20 +3297,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
 	futex_wait_queue_me(hb, &q, to);
 
-	raw_spin_lock(&hb->lock);
-	ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
-	raw_spin_unlock(&hb->lock);
-	if (ret)
-		goto out_put_keys;
+	/*
+	 * On RT we must avoid races with requeue and trying to block
+	 * on two mutexes (hb->lock and uaddr2's rtmutex) by
+	 * serializing access to pi_blocked_on with pi_lock.
+	 */
+	raw_spin_lock_irq(&current->pi_lock);
+	if (current->pi_blocked_on) {
+		/*
+		 * We have been requeued or are in the process of
+		 * being requeued.
+		 */
+		raw_spin_unlock_irq(&current->pi_lock);
+	} else {
+		/*
+		 * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
+		 * prevents a concurrent requeue from moving us to the
+		 * uaddr2 rtmutex. After that we can safely acquire
+		 * (and possibly block on) hb->lock.
+		 */
+		current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
+		raw_spin_unlock_irq(&current->pi_lock);
+
+		spin_lock(&hb->lock);
+
+		/*
+		 * Clean up pi_blocked_on. We might leak it otherwise
+		 * when we succeeded with the hb->lock in the fast
+		 * path.
+		 */
+		raw_spin_lock_irq(&current->pi_lock);
+		current->pi_blocked_on = NULL;
+		raw_spin_unlock_irq(&current->pi_lock);
+
+		ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
+		spin_unlock(&hb->lock);
+		if (ret)
+			goto out_put_keys;
+	}
 
 	/*
-	 * In order for us to be here, we know our q.key == key2, and since
-	 * we took the hb->lock above, we also know that futex_requeue() has
-	 * completed and we no longer have to concern ourselves with a wakeup
-	 * race with the atomic proxy lock acquisition by the requeue code. The
-	 * futex_requeue dropped our key1 reference and incremented our key2
-	 * reference count.
+	 * In order to be here, we have either been requeued, are in
+	 * the process of being requeued, or requeue successfully
+	 * acquired uaddr2 on our behalf.  If pi_blocked_on was
+	 * non-null above, we may be racing with a requeue.  Do not
+	 * rely on q->lock_ptr to be hb2->lock until after blocking on
+	 * hb->lock or hb2->lock. The futex_requeue dropped our key1
+	 * reference and incremented our key2 reference count.
 	 */
+	hb2 = hash_futex(&key2);
 
 	/* Check if the requeue code acquired the second futex for us. */
 	if (!q.rt_waiter) {
@@ -3319,9 +3354,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		 * did a lock-steal - fix up the PI-state in that case.
 		 */
 		if (q.pi_state && (q.pi_state->owner != current)) {
-			struct futex_pi_state *ps_free;
-
-			raw_spin_lock(q.lock_ptr);
+			spin_lock(&hb2->lock);
+			BUG_ON(&hb2->lock != q.lock_ptr);
 			ret = fixup_pi_state_owner(uaddr2, &q, current);
 			if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
 				pi_state = q.pi_state;
@@ -3331,9 +3365,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 			 * Drop the reference to the pi state which
 			 * the requeue_pi() code acquired for us.
 			 */
-			ps_free = __put_pi_state(q.pi_state);
-			raw_spin_unlock(q.lock_ptr);
-			kfree(ps_free);
+			put_pi_state(q.pi_state);
+			spin_unlock(&hb2->lock);
 		}
 	} else {
 		struct rt_mutex *pi_mutex;
@@ -3347,7 +3380,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		pi_mutex = &q.pi_state->pi_mutex;
 		ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
 
-		raw_spin_lock(q.lock_ptr);
+		spin_lock(&hb2->lock);
+		BUG_ON(&hb2->lock != q.lock_ptr);
 		if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
 			ret = 0;
 
@@ -4014,7 +4048,7 @@ static int __init futex_init(void)
 	for (i = 0; i < futex_hashsize; i++) {
 		atomic_set(&futex_queues[i].waiters, 0);
 		plist_head_init(&futex_queues[i].chain);
-		raw_spin_lock_init(&futex_queues[i].lock);
+		spin_lock_init(&futex_queues[i].lock);
 	}
 
 	return 0;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 44a33057a83a..2a9bf2443acc 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -142,6 +142,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
 		WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
 }
 
+static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
+{
+	return waiter && waiter != PI_WAKEUP_INPROGRESS &&
+		waiter != PI_REQUEUE_INPROGRESS;
+}
+
 /*
  * We can speed up the acquire/release, if there's no debugging state to be
  * set up.
@@ -415,7 +421,8 @@ int max_lock_depth = 1024;
 
 static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
 {
-	return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
+	return rt_mutex_real_waiter(p->pi_blocked_on) ?
+		p->pi_blocked_on->lock : NULL;
 }
 
 /*
@@ -551,7 +558,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 	 * reached or the state of the chain has changed while we
 	 * dropped the locks.
 	 */
-	if (!waiter)
+	if (!rt_mutex_real_waiter(waiter))
 		goto out_unlock_pi;
 
 	/*
@@ -1321,6 +1328,22 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 		return -EDEADLK;
 
 	raw_spin_lock(&task->pi_lock);
+	/*
+	 * In the case of futex requeue PI, this will be a proxy
+	 * lock. The task will wake unaware that it is enqueueed on
+	 * this lock. Avoid blocking on two locks and corrupting
+	 * pi_blocked_on via the PI_WAKEUP_INPROGRESS
+	 * flag. futex_wait_requeue_pi() sets this when it wakes up
+	 * before requeue (due to a signal or timeout). Do not enqueue
+	 * the task if PI_WAKEUP_INPROGRESS is set.
+	 */
+	if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
+		raw_spin_unlock(&task->pi_lock);
+		return -EAGAIN;
+	}
+
+       BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
+
 	waiter->task = task;
 	waiter->lock = lock;
 	waiter->prio = task->prio;
@@ -1344,7 +1367,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 		rt_mutex_enqueue_pi(owner, waiter);
 
 		rt_mutex_adjust_prio(owner);
-		if (owner->pi_blocked_on)
+		if (rt_mutex_real_waiter(owner->pi_blocked_on))
 			chain_walk = 1;
 	} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
 		chain_walk = 1;
@@ -1444,7 +1467,7 @@ static void remove_waiter(struct rt_mutex *lock,
 {
 	bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
 	struct task_struct *owner = rt_mutex_owner(lock);
-	struct rt_mutex *next_lock;
+	struct rt_mutex *next_lock = NULL;
 
 	lockdep_assert_held(&lock->wait_lock);
 
@@ -1470,7 +1493,8 @@ static void remove_waiter(struct rt_mutex *lock,
 	rt_mutex_adjust_prio(owner);
 
 	/* Store the lock on which owner is blocked or NULL */
-	next_lock = task_blocked_on_lock(owner);
+	if (rt_mutex_real_waiter(owner->pi_blocked_on))
+		next_lock = task_blocked_on_lock(owner);
 
 	raw_spin_unlock(&owner->pi_lock);
 
@@ -1506,7 +1530,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
 	raw_spin_lock_irqsave(&task->pi_lock, flags);
 
 	waiter = task->pi_blocked_on;
-	if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+	if (!rt_mutex_real_waiter(waiter) ||
+	    rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
 		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
 		return;
 	}
@@ -2325,6 +2350,34 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
 	if (try_to_take_rt_mutex(lock, task, NULL))
 		return 1;
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+	/*
+	 * In PREEMPT_RT there's an added race.
+	 * If the task, that we are about to requeue, times out,
+	 * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
+	 * to skip this task. But right after the task sets
+	 * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
+	 * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
+	 * This will replace the PI_WAKEUP_INPROGRESS with the actual
+	 * lock that it blocks on. We *must not* place this task
+	 * on this proxy lock in that case.
+	 *
+	 * To prevent this race, we first take the task's pi_lock
+	 * and check if it has updated its pi_blocked_on. If it has,
+	 * we assume that it woke up and we return -EAGAIN.
+	 * Otherwise, we set the task's pi_blocked_on to
+	 * PI_REQUEUE_INPROGRESS, so that if the task is waking up
+	 * it will know that we are in the process of requeuing it.
+	 */
+	raw_spin_lock(&task->pi_lock);
+	if (task->pi_blocked_on) {
+		raw_spin_unlock(&task->pi_lock);
+		return -EAGAIN;
+	}
+	task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
+	raw_spin_unlock(&task->pi_lock);
+#endif
+
 	/* We enforce deadlock detection for futexes */
 	ret = task_blocks_on_rt_mutex(lock, waiter, task,
 				      RT_MUTEX_FULL_CHAINWALK);
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 758dc43872e5..546aaf058b9e 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -132,6 +132,9 @@ enum rtmutex_chainwalk {
 /*
  * PI-futex support (proxy locking functions, etc.):
  */
+#define PI_WAKEUP_INPROGRESS	((struct rt_mutex_waiter *) 1)
+#define PI_REQUEUE_INPROGRESS	((struct rt_mutex_waiter *) 2)
+
 extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
 extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
 				       struct task_struct *proxy_owner);
-- 
2.24.1



  parent reply	other threads:[~2020-01-23 20:40 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-01-23 20:39 [PATCH RT 00/30] Linux 4.19.94-rt39-rc2 Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 01/30] i2c: exynos5: Remove IRQF_ONESHOT Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 02/30] i2c: hix5hd2: " Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 03/30] sched/deadline: Ensure inactive_timer runs in hardirq context Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 04/30] thermal/x86_pkg_temp: make pkg_temp_lock a raw spinlock Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 05/30] dma-buf: Use seqlock_t instread disabling preemption Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 06/30] KVM: arm/arm64: Let the timer expire in hardirq context on RT Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 07/30] x86: preempt: Check preemption level before looking at lazy-preempt Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 08/30] hrtimer: Use READ_ONCE to access timer->base in hrimer_grab_expiry_lock() Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 09/30] hrtimer: Dont grab the expiry lock for non-soft hrtimer Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 10/30] hrtimer: Prevent using hrtimer_grab_expiry_lock() on migration_base Steven Rostedt
2020-04-27 13:10   ` Rasmus Villemoes
2020-04-27 19:06     ` Steven Rostedt
2020-04-27 19:26       ` Tom Zanussi
2020-04-28  6:51         ` Rasmus Villemoes
2020-04-28  7:03   ` Rasmus Villemoes
2020-04-28 12:59     ` Tom Zanussi
2020-04-28 13:07       ` Rasmus Villemoes
2020-04-28 13:43         ` Tom Zanussi
2020-01-23 20:39 ` [PATCH RT 11/30] hrtimer: Add a missing bracket and hide `migration_base on !SMP Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 12/30] posix-timers: Unlock expiry lock in the early return Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 14/30] sched: __set_cpus_allowed_ptr: Check cpus_mask, not cpus_ptr Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 15/30] sched: Remove dead __migrate_disabled() check Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 16/30] sched: migrate disable: Protect cpus_ptr with lock Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 17/30] lib/smp_processor_id: Dont use cpumask_equal() Steven Rostedt
2020-01-23 20:39 ` Steven Rostedt [this message]
2020-01-23 20:39 ` [PATCH RT 19/30] locking/rtmutex: Clean ->pi_blocked_on in the error case Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 20/30] lib/ubsan: Dont seralize UBSAN report Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 21/30] kmemleak: Change the lock of kmemleak_object to raw_spinlock_t Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 22/30] sched: migrate_enable: Use select_fallback_rq() Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 23/30] sched: Lazy migrate_disable processing Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 24/30] sched: migrate_enable: Use stop_one_cpu_nowait() Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 25/30] Revert "ARM: Initialize split page table locks for vector page" Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 26/30] locking: Make spinlock_t and rwlock_t a RCU section on RT Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 27/30] sched/core: migrate_enable() must access takedown_cpu_task on !HOTPLUG_CPU Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 28/30] lib/smp_processor_id: Adjust check_preemption_disabled() Steven Rostedt
2020-01-23 20:39 ` [PATCH RT 29/30] sched: migrate_enable: Busy loop until the migration request is completed Steven Rostedt
2020-01-23 20:40 ` [PATCH RT 30/30] Linux 4.19.94-rt39-rc2 Steven Rostedt

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200123203944.869001366@goodmis.org \
    --to=rostedt@goodmis.org \
    --cc=C.Emde@osadl.org \
    --cc=bigeasy@linutronix.de \
    --cc=jkacur@redhat.com \
    --cc=julia@ni.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rt-users@vger.kernel.org \
    --cc=tglx@linutronix.de \
    --cc=wagi@monom.org \
    --cc=zanussi@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.