linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: mingo@redhat.com, tglx@linutronix.de, juri.lelli@redhat.com,
	vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
	rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
	bristot@redhat.com
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	linux-api@vger.kernel.org, x86@kernel.org, pjt@google.com,
	posk@google.com, avagin@google.com, jannh@google.com,
	tdelisle@uwaterloo.ca, mark.rutland@arm.com, posk@posk.io
Subject: Re: [RFC][PATCH v2 5/5] sched: User Mode Concurency Groups
Date: Mon, 24 Jan 2022 14:59:46 +0100	[thread overview]
Message-ID: <Ye6w0qw1VpfLL0Ng@hirez.programming.kicks-ass.net> (raw)
In-Reply-To: <20220121114758.GF20638@worktop.programming.kicks-ass.net>

On Fri, Jan 21, 2022 at 12:47:58PM +0100, Peter Zijlstra wrote:
> On Thu, Jan 20, 2022 at 04:55:22PM +0100, Peter Zijlstra wrote:
> 
> > +SYSCALL_DEFINE2(umcg_wait, u32, flags, u64, timo)
> > +{
> > +	struct task_struct *tsk = current;
> > +	struct umcg_task __user *self = READ_ONCE(tsk->umcg_task);
> > +	bool worker = tsk->flags & PF_UMCG_WORKER;
> > +	int ret;
> > +
> > +	if (!self || flags)
> > +		return -EINVAL;
> > +
> > +	if (worker) {
> > +		tsk->flags &= ~PF_UMCG_WORKER;
> > +		if (timo)
> > +			return -ERANGE;
> > +	}
> > +
> > +	/* see umcg_sys_{enter,exit}() syscall exceptions */
> > +	ret = umcg_pin_pages();
> > +	if (ret)
> > +		goto unblock;
> > +
> > +	/*
> > +	 * Clear UMCG_TF_COND_WAIT *and* check state == RUNNABLE.
> > +	 */
> > +	ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNABLE);
> > +	if (ret)
> > +		goto unpin;
> > +
> > +	ret = umcg_wake_next(tsk, self);
> > +	if (ret)
> > +		goto unpin;
> > +
> > +	if (worker) {
> > +		/*
> > +		 * If this fails it is possible ::next_tid is already running
> > +		 * while this task is not going to block. This violates our
> > +		 * constraints.
> > +		 *
> > +		 * That said, pretty much the only way to make this fail is by
> > +		 * force munmap()'ing things. In which case one is most welcome
> > +		 * to the pieces.
> > +		 */
> > +		ret = umcg_enqueue_and_wake(tsk);
> > +		if (ret)
> > +			goto unpin;
> > +	}
> > +
> > +	umcg_unpin_pages();
> > +
> > +	ret = umcg_wait(timo);
> > +	switch (ret) {
> > +	case 0:		/* all done */
> > +	case -EINTR:	/* umcg_notify_resume() will continue the wait */
> 
> So I was playing with the whole worker timeout thing last night and
> realized this is broken. If we get a signal while we have a timeout, the
> timeout gets lost.
> 
> I think the easiest solution is to have umcg_notify_resume() also resume
> the timeout, but the first pass of that was yuck, so I need to try
> again.
> 
> Related, by moving the whole enqueue-and-wake thing into the timeout, we
> get more 'fun' failure cases :-(

This is the best I can come up with,... but it's a hot mess :-(

Still, let me go try this.

---

--- a/include/uapi/linux/umcg.h
+++ b/include/uapi/linux/umcg.h
@@ -127,6 +127,14 @@ struct umcg_task {
 } __attribute__((packed, aligned(UMCG_TASK_ALIGN)));
 
 /**
+ * enum umcg_wait_flag - flags to pass to sys_umcg_wait
+ * @UMCG_WAIT_ENQUEUE:	Enqueue the task on runnable_workers_ptr before waiting
+ */
+enum umcg_wait_flag {
+	UMCG_WAIT_ENQUEUE	= 0x0001,
+};
+
+/**
  * enum umcg_ctl_flag - flags to pass to sys_umcg_ctl
  * @UMCG_CTL_REGISTER:   register the current task as a UMCG task
  * @UMCG_CTL_UNREGISTER: unregister the current task as a UMCG task
--- a/kernel/sched/umcg.c
+++ b/kernel/sched/umcg.c
@@ -227,7 +227,6 @@ static int umcg_update_state(struct task
 
 #define UMCG_DIE(reason)	__UMCG_DIE(,reason)
 #define UMCG_DIE_PF(reason)	__UMCG_DIE(pagefault_enable(), reason)
-#define UMCG_DIE_UNPIN(reason)	__UMCG_DIE(umcg_unpin_pages(), reason)
 
 /* Called from syscall enter path and exceptions that can schedule */
 void umcg_sys_enter(struct pt_regs *regs, long syscall)
@@ -371,15 +370,23 @@ static int umcg_enqueue_runnable(struct
 
 static int umcg_enqueue_and_wake(struct task_struct *tsk)
 {
-	int ret;
-
-	ret = umcg_enqueue_runnable(tsk);
+	int ret = umcg_enqueue_runnable(tsk);
 	if (!ret)
 		ret = umcg_wake_server(tsk);
 
 	return ret;
 }
 
+static int umcg_pin_enqueue_and_wake(struct task_struct *tsk)
+{
+	int ret = umcg_pin_pages();
+	if (!ret) {
+		ret = umcg_enqueue_and_wake(tsk);
+		umcg_unpin_pages();
+	}
+	return ret;
+}
+
 /*
  * umcg_wait: Wait for ->state to become RUNNING
  *
@@ -469,16 +476,11 @@ static void umcg_unblock_and_wait(void)
 	/* avoid recursion vs schedule() */
 	tsk->flags &= ~PF_UMCG_WORKER;
 
-	if (umcg_pin_pages())
-		UMCG_DIE("pin");
-
 	if (umcg_update_state(tsk, self, UMCG_TASK_BLOCKED, UMCG_TASK_RUNNABLE))
-		UMCG_DIE_UNPIN("state");
+		UMCG_DIE("state");
 
-	if (umcg_enqueue_and_wake(tsk))
-		UMCG_DIE_UNPIN("enqueue-wake");
-
-	umcg_unpin_pages();
+	if (umcg_pin_enqueue_and_wake(tsk))
+		UMCG_DIE("pin-enqueue-wake");
 
 	switch (umcg_wait(0)) {
 	case 0:
@@ -544,18 +546,13 @@ void umcg_notify_resume(struct pt_regs *
 		goto done;
 
 	if (state & UMCG_TF_PREEMPT) {
-		if (umcg_pin_pages())
-			UMCG_DIE("pin");
-
 		if (umcg_update_state(tsk, self,
 				      UMCG_TASK_RUNNING,
 				      UMCG_TASK_RUNNABLE))
-			UMCG_DIE_UNPIN("state");
+			UMCG_DIE("state");
 
-		if (umcg_enqueue_and_wake(tsk))
-			UMCG_DIE_UNPIN("enqueue-wake");
-
-		umcg_unpin_pages();
+		if (umcg_pin_enqueue_and_wake(tsk))
+			UMCG_DIE("pin-enqueue-wake");
 	}
 
 	if (WARN_ON_ONCE(timeout && syscall_get_nr(tsk, regs) != __NR_umcg_wait))
@@ -570,6 +567,13 @@ void umcg_notify_resume(struct pt_regs *
 
 	case -ETIMEDOUT:
 		regs_set_return_value(regs, ret);
+		if (worker) {
+			ret = umcg_pin_enqueue_and_wake(tsk);
+			if (ret) {
+				umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNING);
+				regs_set_return_value(regs, ret);
+			}
+		}
 		break;
 
 	default:
@@ -710,7 +714,6 @@ static int umcg_wake_next(struct task_st
  * Returns:
  * 0		- OK;
  * -ETIMEDOUT	- the timeout expired;
- * -ERANGE	- the timeout is out of range (worker);
  * -EAGAIN	- ::state wasn't RUNNABLE, concurrent wakeup;
  * -EFAULT	- failed accessing struct umcg_task __user of the current
  *		  task, the server or next;
@@ -725,48 +728,40 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u
 	bool worker = tsk->flags & PF_UMCG_WORKER;
 	int ret;
 
-	if (!self || flags)
+	if (!self || (flags & ~(UMCG_WAIT_ENQUEUE)))
 		return -EINVAL;
 
-	if (worker) {
-		tsk->flags &= ~PF_UMCG_WORKER;
-		if (timo)
-			return -ERANGE;
-	}
+	if ((flags & UMCG_WAIT_ENQUEUE) && (timo || !worker))
+		return -EINVAL;
 
-	/* see umcg_sys_{enter,exit}() syscall exceptions */
-	ret = umcg_pin_pages();
-	if (ret)
-		goto unblock;
+	if (worker)
+		tsk->flags &= ~PF_UMCG_WORKER;
 
 	/*
 	 * Clear UMCG_TF_COND_WAIT *and* check state == RUNNABLE.
 	 */
 	ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNABLE);
 	if (ret)
-		goto unpin;
+		goto unblock;
 
 	ret = umcg_wake_next(tsk, self);
 	if (ret)
-		goto unpin;
+		goto unblock;
 
-	if (worker) {
+	if (flags & UMCG_WAIT_ENQUEUE) {
 		/*
 		 * If this fails it is possible ::next_tid is already running
 		 * while this task is not going to block. This violates our
 		 * constraints.
 		 *
-		 * That said, pretty much the only way to make this fail is by
-		 * force munmap()'ing things. In which case one is most welcome
-		 * to the pieces.
+		 * Userspace can detect this case by looking at: ::next_tid &
+		 * TID_RUNNING.
 		 */
-		ret = umcg_enqueue_and_wake(tsk);
+		ret = umcg_pin_enqueue_and_wake(tsk);
 		if (ret)
-			goto unpin;
+			goto unblock;
 	}
 
-	umcg_unpin_pages();
-
 	ret = umcg_wait(timo);
 	switch (ret) {
 	case 0:		/* all done */
@@ -775,6 +770,26 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u
 		ret = 0;
 		break;
 
+	case -ETIMEDOUT:
+		if (worker) {
+			/*
+			 * See the UMCG_WAIT_ENQUEUE case above; except this is
+			 * even more complicated because we *did* wait and
+			 * things might have progressed lots.
+			 *
+			 * Still, abort the wait because otherwise nobody would
+			 * ever find us again. Hopefully userspace can make
+			 * sense of things.
+			 */
+			ret = umcg_pin_enqueue_and_wake(tsk);
+			if (ret)
+				goto unblock;
+
+			ret = -ETIMEDOUT;
+			break;
+		}
+		goto unblock;
+
 	default:
 		goto unblock;
 	}
@@ -783,8 +798,6 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u
 		tsk->flags |= PF_UMCG_WORKER;
 	return ret;
 
-unpin:
-	umcg_unpin_pages();
 unblock:
 	umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNING);
 	goto out;

  parent reply	other threads:[~2022-01-24 14:00 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-01-20 15:55 [RFC][PATCH v2 0/5] sched: User Managed Concurrency Groups Peter Zijlstra
2022-01-20 15:55 ` [RFC][PATCH v2 1/5] mm: Avoid unmapping pinned pages Peter Zijlstra
2022-01-20 18:03   ` Nadav Amit
2022-01-21  7:59     ` Peter Zijlstra
2022-01-20 18:25   ` David Hildenbrand
2022-01-21  7:51     ` Peter Zijlstra
2022-01-21  8:22       ` David Hildenbrand
2022-01-21  8:59       ` Peter Zijlstra
2022-01-21  9:04         ` David Hildenbrand
2022-01-21 11:40           ` Peter Zijlstra
2022-01-21 12:04             ` David Hildenbrand
2022-01-20 15:55 ` [RFC][PATCH v2 2/5] entry,x86: Create common IRQ operations for exceptions Peter Zijlstra
2022-01-21 16:34   ` Mark Rutland
2022-01-20 15:55 ` [RFC][PATCH v2 3/5] sched/umcg: add WF_CURRENT_CPU and externise ttwu Peter Zijlstra
2022-01-20 15:55 ` [RFC][PATCH v2 4/5] x86/uaccess: Implement unsafe_try_cmpxchg_user() Peter Zijlstra
2022-01-27  2:17   ` Sean Christopherson
2022-01-27  6:36     ` Sean Christopherson
2022-01-27  9:56       ` Peter Zijlstra
2022-01-27 23:33         ` Sean Christopherson
2022-01-28  0:17           ` Nick Desaulniers
2022-01-28 16:29             ` Sean Christopherson
2022-01-27  9:55     ` Peter Zijlstra
2022-01-20 15:55 ` [RFC][PATCH v2 5/5] sched: User Mode Concurency Groups Peter Zijlstra
2022-01-21 11:47   ` Peter Zijlstra
2022-01-21 15:18     ` Peter Zijlstra
2022-01-24 14:29       ` Peter Zijlstra
2022-01-24 16:44         ` Peter Zijlstra
2022-01-24 17:06           ` Peter Oskolkov
2022-01-25 14:59         ` Peter Zijlstra
2022-01-24 13:59     ` Peter Zijlstra [this message]
2022-01-21 12:26   ` Peter Zijlstra
2022-01-21 16:57   ` Mark Rutland
2022-01-24  9:48     ` Peter Zijlstra
2022-01-24 10:03     ` Peter Zijlstra
2022-01-24 10:07       ` Peter Zijlstra
2022-01-24 10:27         ` Mark Rutland
2022-01-24 14:46   ` Tao Zhou
2022-01-27 12:19     ` Peter Zijlstra
2022-01-27 18:33       ` Tao Zhou
2022-01-27 12:25     ` Peter Zijlstra
2022-01-27 18:47       ` Tao Zhou
2022-01-27 12:26     ` Peter Zijlstra
2022-01-27 18:31   ` Tao Zhou
2022-01-20 17:28 ` [RFC][PATCH v2 0/5] sched: User Managed Concurrency Groups Peter Oskolkov
2022-01-21  8:01   ` Peter Zijlstra
2022-01-21 18:01 ` Steven Rostedt
2022-01-24  8:20   ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Ye6w0qw1VpfLL0Ng@hirez.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=avagin@google.com \
    --cc=bristot@redhat.com \
    --cc=bsegall@google.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=jannh@google.com \
    --cc=juri.lelli@redhat.com \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mark.rutland@arm.com \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=pjt@google.com \
    --cc=posk@google.com \
    --cc=posk@posk.io \
    --cc=rostedt@goodmis.org \
    --cc=tdelisle@uwaterloo.ca \
    --cc=tglx@linutronix.de \
    --cc=vincent.guittot@linaro.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).