linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Vineeth Remanan Pillai <vpillai@digitalocean.com>
To: Nishanth Aravamudan <naravamudan@digitalocean.com>,
	Julien Desfossez <jdesfossez@digitalocean.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Tim Chen <tim.c.chen@linux.intel.com>,
	mingo@kernel.org, tglx@linutronix.de, pjt@google.com,
	torvalds@linux-foundation.org
Cc: linux-kernel@vger.kernel.org, subhra.mazumdar@oracle.com,
	fweisbec@gmail.com, keescook@chromium.org, kerrnel@google.com,
	Phil Auld <pauld@redhat.com>, Aaron Lu <aaron.lwe@gmail.com>,
	Aubrey Li <aubrey.intel@gmail.com>,
	Valentin Schneider <valentin.schneider@arm.com>,
	Mel Gorman <mgorman@techsingularity.net>,
	Pawan Gupta <pawan.kumar.gupta@linux.intel.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Joel Fernandes <joelaf@google.com>,
	joel@joelfernandes.org, vineethrp@gmail.com,
	Chen Yu <yu.c.chen@intel.com>,
	Christian Brauner <christian.brauner@ubuntu.com>,
	Vineeth Remanan Pillai <vpillai@digitalocean.com>,
	Aaron Lu <aaron.lu@linux.alibaba.com>
Subject: [RFC PATCH 06/16] sched: Add core wide task selection and scheduling.
Date: Tue, 30 Jun 2020 21:32:27 +0000	[thread overview]
Message-ID: <ed924e2cb450a4cce4a1b5a2c44d29e968467154.1593530334.git.vpillai@digitalocean.com> (raw)
In-Reply-To: <cover.1593530334.git.vpillai@digitalocean.com>
In-Reply-To: <cover.1593530334.git.vpillai@digitalocean.com>

From: Peter Zijlstra <peterz@infradead.org>

Instead of only selecting a local task, select a task for all SMT
siblings for every reschedule on the core (irrespective which logical
CPU does the reschedule).

There could be races in core scheduler where a CPU is trying to pick
a task for its sibling in core scheduler, when that CPU has just been
offlined.  We should not schedule any tasks on the CPU in this case.
Return an idle task in pick_next_task for this situation.

NOTE: there is still potential for siblings rivalry.
NOTE: this is far too complicated; but thus far I've failed to
      simplify it further.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Julien Desfossez <jdesfossez@digitalocean.com>
Signed-off-by: Vineeth Remanan Pillai <vpillai@digitalocean.com>
Signed-off-by: Aaron Lu <aaron.lu@linux.alibaba.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 kernel/sched/core.c  | 263 ++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |   6 +-
 2 files changed, 267 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b21bcab20da6..f51e5c4798c8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4113,7 +4113,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
 	const struct sched_class *class;
 	struct task_struct *p;
@@ -4169,6 +4169,262 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	BUG();
 }
 
+#ifdef CONFIG_SCHED_CORE
+
+static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
+{
+	return is_idle_task(a) || (a->core_cookie == cookie);
+}
+
+static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
+{
+	if (is_idle_task(a) || is_idle_task(b))
+		return true;
+
+	return a->core_cookie == b->core_cookie;
+}
+
+// XXX fairness/fwd progress conditions
+/*
+ * Returns
+ * - NULL if there is no runnable task for this class.
+ * - the highest priority task for this runqueue if it matches
+ *   rq->core->core_cookie or its priority is greater than max.
+ * - Else returns idle_task.
+ */
+static struct task_struct *
+pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
+{
+	struct task_struct *class_pick, *cookie_pick;
+	unsigned long cookie = rq->core->core_cookie;
+
+	class_pick = class->pick_task(rq);
+	if (!class_pick)
+		return NULL;
+
+	if (!cookie) {
+		/*
+		 * If class_pick is tagged, return it only if it has
+		 * higher priority than max.
+		 */
+		if (max && class_pick->core_cookie &&
+		    prio_less(class_pick, max))
+			return idle_sched_class.pick_task(rq);
+
+		return class_pick;
+	}
+
+	/*
+	 * If class_pick is idle or matches cookie, return early.
+	 */
+	if (cookie_equals(class_pick, cookie))
+		return class_pick;
+
+	cookie_pick = sched_core_find(rq, cookie);
+
+	/*
+	 * If class > max && class > cookie, it is the highest priority task on
+	 * the core (so far) and it must be selected, otherwise we must go with
+	 * the cookie pick in order to satisfy the constraint.
+	 */
+	if (prio_less(cookie_pick, class_pick) &&
+	    (!max || prio_less(max, class_pick)))
+		return class_pick;
+
+	return cookie_pick;
+}
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+	struct task_struct *next, *max = NULL;
+	const struct sched_class *class;
+	const struct cpumask *smt_mask;
+	int i, j, cpu;
+	bool need_sync;
+
+	if (!sched_core_enabled(rq))
+		return __pick_next_task(rq, prev, rf);
+
+	/*
+	 * If there were no {en,de}queues since we picked (IOW, the task
+	 * pointers are all still valid), and we haven't scheduled the last
+	 * pick yet, do so now.
+	 */
+	if (rq->core->core_pick_seq == rq->core->core_task_seq &&
+	    rq->core->core_pick_seq != rq->core_sched_seq) {
+		WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
+
+		next = rq->core_pick;
+		if (next != prev) {
+			put_prev_task(rq, prev);
+			set_next_task(rq, next);
+		}
+		return next;
+	}
+
+	prev->sched_class->put_prev_task(rq, prev);
+	if (!rq->nr_running)
+		newidle_balance(rq, rf);
+
+	cpu = cpu_of(rq);
+	smt_mask = cpu_smt_mask(cpu);
+
+	/*
+	 * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq
+	 *
+	 * @task_seq guards the task state ({en,de}queues)
+	 * @pick_seq is the @task_seq we did a selection on
+	 * @sched_seq is the @pick_seq we scheduled
+	 *
+	 * However, preemptions can cause multiple picks on the same task set.
+	 * 'Fix' this by also increasing @task_seq for every pick.
+	 */
+	rq->core->core_task_seq++;
+	need_sync = !!rq->core->core_cookie;
+
+	/* reset state */
+	rq->core->core_cookie = 0UL;
+	for_each_cpu(i, smt_mask) {
+		struct rq *rq_i = cpu_rq(i);
+
+		rq_i->core_pick = NULL;
+
+		if (rq_i->core_forceidle) {
+			need_sync = true;
+			rq_i->core_forceidle = false;
+		}
+
+		if (i != cpu)
+			update_rq_clock(rq_i);
+	}
+
+	/*
+	 * Try and select tasks for each sibling in decending sched_class
+	 * order.
+	 */
+	for_each_class(class) {
+again:
+		for_each_cpu_wrap(i, smt_mask, cpu) {
+			struct rq *rq_i = cpu_rq(i);
+			struct task_struct *p;
+
+			if (rq_i->core_pick)
+				continue;
+
+			/*
+			 * If this sibling doesn't yet have a suitable task to
+			 * run; ask for the most elegible task, given the
+			 * highest priority task already selected for this
+			 * core.
+			 */
+			p = pick_task(rq_i, class, max);
+			if (!p) {
+				/*
+				 * If there weren't no cookies; we don't need
+				 * to bother with the other siblings.
+				 */
+				if (i == cpu && !need_sync)
+					goto next_class;
+
+				continue;
+			}
+
+			/*
+			 * Optimize the 'normal' case where there aren't any
+			 * cookies and we don't need to sync up.
+			 */
+			if (i == cpu && !need_sync && !p->core_cookie) {
+				next = p;
+				goto done;
+			}
+
+			rq_i->core_pick = p;
+
+			/*
+			 * If this new candidate is of higher priority than the
+			 * previous; and they're incompatible; we need to wipe
+			 * the slate and start over. pick_task makes sure that
+			 * p's priority is more than max if it doesn't match
+			 * max's cookie.
+			 *
+			 * NOTE: this is a linear max-filter and is thus bounded
+			 * in execution time.
+			 */
+			if (!max || !cookie_match(max, p)) {
+				struct task_struct *old_max = max;
+
+				rq->core->core_cookie = p->core_cookie;
+				max = p;
+
+				if (old_max) {
+					for_each_cpu(j, smt_mask) {
+						if (j == i)
+							continue;
+
+						cpu_rq(j)->core_pick = NULL;
+					}
+					goto again;
+				} else {
+					/*
+					 * Once we select a task for a cpu, we
+					 * should not be doing an unconstrained
+					 * pick because it might starve a task
+					 * on a forced idle cpu.
+					 */
+					need_sync = true;
+				}
+
+			}
+		}
+next_class:;
+	}
+
+	rq->core->core_pick_seq = rq->core->core_task_seq;
+	next = rq->core_pick;
+	rq->core_sched_seq = rq->core->core_pick_seq;
+
+	/*
+	 * Reschedule siblings
+	 *
+	 * NOTE: L1TF -- at this point we're no longer running the old task and
+	 * sending an IPI (below) ensures the sibling will no longer be running
+	 * their task. This ensures there is no inter-sibling overlap between
+	 * non-matching user state.
+	 */
+	for_each_cpu(i, smt_mask) {
+		struct rq *rq_i = cpu_rq(i);
+
+		WARN_ON_ONCE(!rq_i->core_pick);
+
+		if (is_idle_task(rq_i->core_pick) && rq_i->nr_running)
+			rq_i->core_forceidle = true;
+
+		if (i == cpu)
+			continue;
+
+		if (rq_i->curr != rq_i->core_pick)
+			resched_curr(rq_i);
+
+		/* Did we break L1TF mitigation requirements? */
+		WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
+	}
+
+done:
+	set_next_task(rq, next);
+	return next;
+}
+
+#else /* !CONFIG_SCHED_CORE */
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+	return __pick_next_task(rq, prev, rf);
+}
+
+#endif /* CONFIG_SCHED_CORE */
+
 /*
  * __schedule() is the main scheduler function.
  *
@@ -6931,7 +7187,12 @@ void __init sched_init(void)
 
 #ifdef CONFIG_SCHED_CORE
 		rq->core = NULL;
+		rq->core_pick = NULL;
 		rq->core_enabled = 0;
+		rq->core_tree = RB_ROOT;
+		rq->core_forceidle = false;
+
+		rq->core_cookie = 0UL;
 #endif
 	}
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c85c5a4bc21f..4a738093d731 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1032,11 +1032,16 @@ struct rq {
 #ifdef CONFIG_SCHED_CORE
 	/* per rq */
 	struct rq		*core;
+	struct task_struct	*core_pick;
 	unsigned int		core_enabled;
+	unsigned int		core_sched_seq;
 	struct rb_root		core_tree;
+	unsigned char		core_forceidle;
 
 	/* shared state */
 	unsigned int		core_task_seq;
+	unsigned int		core_pick_seq;
+	unsigned long		core_cookie;
 #endif
 };
 
@@ -1905,7 +1910,6 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
 
 static inline void set_next_task(struct rq *rq, struct task_struct *next)
 {
-	WARN_ON_ONCE(rq->curr != next);
 	next->sched_class->set_next_task(rq, next, false);
 }
 
-- 
2.17.1


  parent reply	other threads:[~2020-06-30 21:33 UTC|newest]

Thread overview: 81+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-06-30 21:32 [RFC PATCH 00/16] Core scheduling v6 Vineeth Remanan Pillai
2020-06-30 21:32 ` [RFC PATCH 01/16] sched: Wrap rq::lock access Vineeth Remanan Pillai
2020-06-30 21:32 ` [RFC PATCH 02/16] sched: Introduce sched_class::pick_task() Vineeth Remanan Pillai
2020-06-30 21:32 ` [RFC PATCH 03/16] sched: Core-wide rq->lock Vineeth Remanan Pillai
2020-06-30 21:32 ` [RFC PATCH 04/16] sched/fair: Add a few assertions Vineeth Remanan Pillai
2020-06-30 21:32 ` [RFC PATCH 05/16] sched: Basic tracking of matching tasks Vineeth Remanan Pillai
2020-07-21 14:02   ` [RFC PATCH 05/16] sched: Basic tracking of matching tasks(Internet mail) benbjiang(蒋彪)
2020-06-30 21:32 ` Vineeth Remanan Pillai [this message]
2020-07-01 23:28   ` [RFC PATCH 06/16] sched: Add core wide task selection and scheduling Joel Fernandes
2020-07-02  0:54     ` Tim Chen
2020-07-02 12:57       ` Joel Fernandes
2020-07-02 13:23         ` Joel Fernandes
2020-07-05 23:44         ` Tim Chen
2020-07-03 20:21     ` Vineeth Remanan Pillai
2020-07-06 14:09       ` Joel Fernandes
2020-07-06 14:38         ` Vineeth Remanan Pillai
2020-07-06 17:37           ` Joel Fernandes
2020-06-30 21:32 ` [RFC PATCH 07/16] sched/fair: Fix forced idle sibling starvation corner case Vineeth Remanan Pillai
2020-07-21  7:35   ` [RFC PATCH 07/16] sched/fair: Fix forced idle sibling starvation corner case(Internet mail) benbjiang(蒋彪)
2020-07-22  7:20   ` benbjiang(蒋彪)
2020-06-30 21:32 ` [RFC PATCH 08/16] sched/fair: wrapper for cfs_rq->min_vruntime Vineeth Remanan Pillai
2020-06-30 21:32 ` [RFC PATCH 09/16] sched/fair: core wide cfs task priority comparison Vineeth Remanan Pillai
2020-07-22  0:23   ` [RFC PATCH 09/16] sched/fair: core wide cfs task priority comparison(Internet mail) benbjiang(蒋彪)
2020-07-24  7:14     ` Aaron Lu
2020-07-24 12:08       ` Jiang Biao
2020-06-30 21:32 ` [RFC PATCH 10/16] sched: Trivial forced-newidle balancer Vineeth Remanan Pillai
2020-07-20  4:06   ` [RFC PATCH 10/16] sched: Trivial forced-newidle balancer(Internet mail) benbjiang(蒋彪)
2020-07-20  6:06     ` Li, Aubrey
     [not found]       ` <8082F052-2F52-42D3-B396-18A35A94F26F@tencent.com>
2020-07-20  8:03         ` Li, Aubrey
2020-07-20  8:22           ` benbjiang(蒋彪)
2020-07-20 14:34   ` benbjiang(蒋彪)
2020-06-30 21:32 ` [RFC PATCH 11/16] sched: migration changes for core scheduling Vineeth Remanan Pillai
2020-07-22  8:54   ` [RFC PATCH 11/16] sched: migration changes for core scheduling(Internet mail) benbjiang(蒋彪)
2020-07-22 12:13     ` Li, Aubrey
2020-07-22 14:32       ` benbjiang(蒋彪)
2020-07-23  1:57         ` Li, Aubrey
2020-07-23  2:42           ` benbjiang(蒋彪)
2020-07-23  3:35             ` Li, Aubrey
2020-07-23  4:23               ` benbjiang(蒋彪)
2020-07-23  5:39                 ` Li, Aubrey
2020-07-23  7:47                   ` benbjiang(蒋彪)
2020-07-23  8:06                     ` Li, Aubrey
2020-07-23  8:28                       ` benbjiang(蒋彪)
2020-07-23 23:43                         ` Aubrey Li
2020-07-24  1:26                           ` benbjiang(蒋彪)
2020-07-24  2:05                             ` Li, Aubrey
2020-07-24  2:29                               ` benbjiang(蒋彪)
2020-06-30 21:32 ` [RFC PATCH 12/16] sched: cgroup tagging interface for core scheduling Vineeth Remanan Pillai
2020-06-30 21:32 ` [RFC PATCH 13/16] sched: Fix pick_next_task() race condition in " Vineeth Remanan Pillai
2020-06-30 21:32 ` [RFC PATCH 14/16] irq: Add support for core-wide protection of IRQ and softirq Vineeth Remanan Pillai
2020-07-10 12:19   ` Li, Aubrey
2020-07-10 13:21     ` Joel Fernandes
2020-07-13  2:23       ` Li, Aubrey
2020-07-13 15:58         ` Joel Fernandes
2020-07-10 13:36     ` Vineeth Remanan Pillai
2020-07-11  1:33       ` Aubrey Li
2020-07-17 23:37     ` Thomas Gleixner
2020-07-18 17:05       ` Joel Fernandes
2020-07-17 23:36   ` Thomas Gleixner
2020-07-20  3:53     ` Joel Fernandes
2020-07-20  8:20       ` Thomas Gleixner
2020-07-20 11:09       ` Vineeth Pillai
2020-06-30 21:32 ` [RFC PATCH 15/16] Documentation: Add documentation on core scheduling Vineeth Remanan Pillai
2020-06-30 21:32 ` [RFC PATCH 16/16] sched: Debug bits Vineeth Remanan Pillai
2020-07-31 16:41 ` [RFC PATCH 00/16] Core scheduling v6 Vineeth Pillai
2020-08-03  8:23 ` Li, Aubrey
2020-08-03 16:53   ` Joel Fernandes
2020-08-05  3:57     ` Li, Aubrey
2020-08-05  6:16       ` [RFC PATCH 00/16] Core scheduling v6(Internet mail) benbjiang(蒋彪)
2020-08-09 16:44       ` [RFC PATCH 00/16] Core scheduling v6 Joel Fernandes
2020-08-12  2:01         ` Li, Aubrey
2020-08-12 23:08           ` Joel Fernandes
2020-08-13  4:28             ` Li, Aubrey
2020-08-14  0:26               ` [RFC PATCH 00/16] Core scheduling v6(Internet mail) benbjiang(蒋彪)
2020-08-14  1:36                 ` Li, Aubrey
2020-08-14  4:04                   ` benbjiang(蒋彪)
2020-08-14  5:18                     ` Li, Aubrey
2020-08-14  7:54                       ` benbjiang(蒋彪)
2020-08-20 22:37               ` [RFC PATCH 00/16] Core scheduling v6 Joel Fernandes
2020-08-27  0:30 ` Alexander Graf
2020-08-27  1:20   ` Vineeth Pillai

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ed924e2cb450a4cce4a1b5a2c44d29e968467154.1593530334.git.vpillai@digitalocean.com \
    --to=vpillai@digitalocean.com \
    --cc=aaron.lu@linux.alibaba.com \
    --cc=aaron.lwe@gmail.com \
    --cc=aubrey.intel@gmail.com \
    --cc=christian.brauner@ubuntu.com \
    --cc=fweisbec@gmail.com \
    --cc=jdesfossez@digitalocean.com \
    --cc=joel@joelfernandes.org \
    --cc=joelaf@google.com \
    --cc=keescook@chromium.org \
    --cc=kerrnel@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@techsingularity.net \
    --cc=mingo@kernel.org \
    --cc=naravamudan@digitalocean.com \
    --cc=pauld@redhat.com \
    --cc=pawan.kumar.gupta@linux.intel.com \
    --cc=pbonzini@redhat.com \
    --cc=peterz@infradead.org \
    --cc=pjt@google.com \
    --cc=subhra.mazumdar@oracle.com \
    --cc=tglx@linutronix.de \
    --cc=tim.c.chen@linux.intel.com \
    --cc=torvalds@linux-foundation.org \
    --cc=valentin.schneider@arm.com \
    --cc=vineethrp@gmail.com \
    --cc=yu.c.chen@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).