linux-kernel.vger.kernel.org archive mirror
From: Peter Zijlstra <peterz@infradead.org>
To: Thomas Gleixner <tglx@linutronix.de>
Cc: LKML <linux-kernel@vger.kernel.org>,
	Stefan Liebler <stli@linux.ibm.com>,
	Heiko Carstens <heiko.carstens@de.ibm.com>,
	Darren Hart <dvhart@infradead.org>,
	Ingo Molnar <mingo@kernel.org>
Subject: Re: [patch] futex: Cure exit race
Date: Wed, 12 Dec 2018 10:04:18 +0100	[thread overview]
Message-ID: <20181212090418.GT5289@hirez.programming.kicks-ass.net> (raw)
In-Reply-To: <alpine.DEB.2.21.1812101824070.1667@nanos.tec.linutronix.de>

On Mon, Dec 10, 2018 at 06:43:51PM +0100, Thomas Gleixner wrote:
> On Mon, 10 Dec 2018, Peter Zijlstra wrote:
> > On Mon, Dec 10, 2018 at 04:23:06PM +0100, Thomas Gleixner wrote:
> > There is another caller of futex_lock_pi_atomic(),
> > futex_proxy_trylock_atomic(), which is part of futex_requeue(), and that
> > too does a retry loop on -EAGAIN.
> > 
> > And there is another caller of attach_to_pi_owner(): lookup_pi_state(),
> > and that too is in futex_requeue() and handles the retry case properly.
> > 
> > Yes, this all looks good.
> > 
> > Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> 
> Bah. The little devil in the unconscious part of my brain insisted on
> thinking further about that EAGAIN loop, even despite my attempt to page
> those futex horrors out again immediately after sending that patch.
> 
> There is another related issue which is even worse than just mildly
> confusing user space:
> 
>    task1(SCHED_OTHER)
>    sys_exit()
>      do_exit()
>       exit_mm()
>        task1->flags |= PF_EXITING;
> 
>    ---> preemption
> 
>    task2(SCHED_FIFO)
>      sys_futex(LOCK_PI)
>        ....
>        attach_to_pi_owner() {
>          ...
>          if (!(task1->flags & PF_EXITING)) {
>            attach();
>          } else {
>               if (!(task1->flags & PF_EXITPIDONE))
> 	         return -EAGAIN;
> 
> Now assume UP or both tasks pinned on the same CPU. That results in a
> livelock because task2 is going to loop forever.
> 
> No immediate idea how to cure that one w/o creating a mess.

One possible, but fairly gruesome, hack would be something like the
below.

Now, this obviously introduces a priority inversion, but that's
arguably better than a livelock. Also, I'm not sure there's really
anything 'sane' you can do when your lock holder is dying instead of
doing a proper unlock anyway.

But no, I'm not liking this much either...

diff --git a/kernel/exit.c b/kernel/exit.c
index 0e21e6d21f35..bc6a01112d9d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -806,6 +806,8 @@ void __noreturn do_exit(long code)
 		 * task into the wait for ever nirwana as well.
 		 */
 		tsk->flags |= PF_EXITPIDONE;
+		smp_mb();
+		wake_up_bit(&tsk->flags, 3 /* PF_EXITPIDONE */);
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule();
 	}
diff --git a/kernel/futex.c b/kernel/futex.c
index f423f9b6577e..a743d657e783 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1148,8 +1148,8 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
  * Lookup the task for the TID provided from user space and attach to
  * it after doing proper sanity checks.
  */
-static int attach_to_pi_owner(u32 uval, union futex_key *key,
-			      struct futex_pi_state **ps)
+static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
+			      struct futex_pi_state **ps, struct task_struct **pe)
 {
 	pid_t pid = uval & FUTEX_TID_MASK;
 	struct futex_pi_state *pi_state;
@@ -1187,10 +1236,15 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
 		 * set, we know that the task has finished the
 		 * cleanup:
 		 */
 		int ret = handle_exit_race(uaddr, uval, p);
 
 		raw_spin_unlock_irq(&p->pi_lock);
-		put_task_struct(p);
+
+		if (ret == -EAGAIN)
+			*pe = p;
+		else
+			put_task_struct(p);
+
 		return ret;
 	}
 
@@ -1244,7 +1298,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
 	 * We are the first waiter - try to look up the owner based on
 	 * @uval and attach to it.
 	 */
-	return attach_to_pi_owner(uval, key, ps);
+	return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
 }
 
 static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
@@ -1282,7 +1336,8 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
 static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 				union futex_key *key,
 				struct futex_pi_state **ps,
-				struct task_struct *task, int set_waiters)
+				struct task_struct *task, int set_waiters,
+				struct task_struct **exiting)
 {
 	u32 uval, newval, vpid = task_pid_vnr(task);
 	struct futex_q *top_waiter;
@@ -1352,7 +1407,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 	 * attach to the owner. If that fails, no harm done, we only
 	 * set the FUTEX_WAITERS bit in the user space variable.
 	 */
-	return attach_to_pi_owner(uval, key, ps);
+	return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
 }
 
 /**
@@ -2716,6 +2771,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 	struct rt_mutex_waiter rt_waiter;
 	struct futex_hash_bucket *hb;
 	struct futex_q q = futex_q_init;
+	struct task_struct *exiting;
 	int res, ret;
 
 	if (!IS_ENABLED(CONFIG_FUTEX_PI))
@@ -2733,6 +2789,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 	}
 
 retry:
+	exiting = NULL;
 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
 	if (unlikely(ret != 0))
 		goto out;
@@ -2740,7 +2797,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 retry_private:
 	hb = queue_lock(&q);
 
-	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
+	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0, &exiting);
 	if (unlikely(ret)) {
 		/*
 		 * Atomic work succeeded and we got the lock,
@@ -2762,6 +2819,12 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 			 */
 			queue_unlock(hb);
 			put_futex_key(&q.key);
+
+			if (exiting) {
+				wait_bit(&exiting->flags, 3 /* PF_EXITPIDONE */, TASK_UNINTERRUPTIBLE);
+				put_task_struct(exiting);
+			}
+
 			cond_resched();
 			goto retry;
 		default:


Thread overview: 13+ messages
2018-12-10 15:23 [patch] futex: Cure exit race Thomas Gleixner
2018-12-10 16:02 ` Peter Zijlstra
2018-12-10 17:43   ` Thomas Gleixner
2018-12-12  9:04     ` Peter Zijlstra [this message]
2018-12-18  9:31       ` Thomas Gleixner
2018-12-19 13:29         ` Thomas Gleixner
2018-12-19 19:13           ` Thomas Gleixner
     [not found] ` <20181210210920.75EBD20672@mail.kernel.org>
2018-12-10 21:16   ` Thomas Gleixner
2018-12-10 23:01     ` Sasha Levin
2018-12-11 10:29       ` Thomas Gleixner
2018-12-11  8:04 ` Stefan Liebler
2018-12-11 10:32   ` Thomas Gleixner
2018-12-18 22:18 ` [tip:locking/urgent] " tip-bot for Thomas Gleixner
