LKML Archive on lore.kernel.org
 help / color / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: joel@joelfernandes.org, chris.hyser@oracle.com,
	joshdon@google.com, mingo@kernel.org, vincent.guittot@linaro.org,
	valentin.schneider@arm.com, mgorman@suse.de
Cc: linux-kernel@vger.kernel.org, peterz@infradead.org, tglx@linutronix.de
Subject: [PATCH 13/19] sched/fair: Snapshot the min_vruntime of CPUs on force idle
Date: Thu, 22 Apr 2021 14:05:12 +0200
Message-ID: <20210422123308.738542617@infradead.org> (raw)
In-Reply-To: <20210422120459.447350175@infradead.org>

From: "Joel Fernandes (Google)" <joel@joelfernandes.org>

During force-idle, we end up doing cross-cpu comparison of vruntimes
during pick_next_task. If we simply compare (vruntime-min_vruntime)
across CPUs, and if the CPUs only have 1 task each, we will always
end up comparing 0 with 0 and pick just one of the tasks all the time.
This starves the task that was not picked. To fix this, take a snapshot
of the min_vruntime when entering force idle and use it for comparison.
This min_vruntime snapshot will only be used for cross-CPU vruntime
comparison, and nothing else.

A note about the min_vruntime snapshot and force idling:

During selection:

  When we're not fi, we need to update snapshot.
  when we're fi and we were not fi, we must update snapshot.
  When we're fi and we were already fi, we must not update snapshot.

Which gives:

  fib     fi      update
  0       0       1
  0       1       1
  1       0       1
  1       1       0

Where:

  fi:  force-idled now
  fib: force-idled before

So the min_vruntime snapshot needs to be updated when: !(fib && fi).

Also, the cfs_prio_less() function needs to be aware of whether the
core is in force idle or not, since it will be use this information to
know whether to advance a cfs_rq's min_vruntime_fi in the hierarchy.
So pass this information along via pick_task() -> prio_less().

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |   59 +++++++++++++++++++++++-----------------
 kernel/sched/fair.c  |   75 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |    8 +++++
 3 files changed, 117 insertions(+), 25 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -101,7 +101,7 @@ static inline int __task_prio(struct tas
  */
 
 /* real prio, less is less */
-static inline bool prio_less(struct task_struct *a, struct task_struct *b)
+static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
 {
 
 	int pa = __task_prio(a), pb = __task_prio(b);
@@ -115,19 +115,8 @@ static inline bool prio_less(struct task
 	if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
 		return !dl_time_before(a->dl.deadline, b->dl.deadline);
 
-	if (pa == MAX_RT_PRIO + MAX_NICE)  { /* fair */
-		u64 vruntime = b->se.vruntime;
-
-		/*
-		 * Normalize the vruntime if tasks are in different cpus.
-		 */
-		if (task_cpu(a) != task_cpu(b)) {
-			vruntime -= task_cfs_rq(b)->min_vruntime;
-			vruntime += task_cfs_rq(a)->min_vruntime;
-		}
-
-		return !((s64)(a->se.vruntime - vruntime) <= 0);
-	}
+	if (pa == MAX_RT_PRIO + MAX_NICE)	/* fair */
+		return cfs_prio_less(a, b, in_fi);
 
 	return false;
 }
@@ -141,7 +130,7 @@ static inline bool __sched_core_less(str
 		return false;
 
 	/* flip prio, so high prio is leftmost */
-	if (prio_less(b, a))
+	if (prio_less(b, a, task_rq(a)->core->core_forceidle))
 		return true;
 
 	return false;
@@ -5145,7 +5134,7 @@ static inline bool cookie_match(struct t
  * - Else returns idle_task.
  */
 static struct task_struct *
-pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
+pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max, bool in_fi)
 {
 	struct task_struct *class_pick, *cookie_pick;
 	unsigned long cookie = rq->core->core_cookie;
@@ -5160,7 +5149,7 @@ pick_task(struct rq *rq, const struct sc
 		 * higher priority than max.
 		 */
 		if (max && class_pick->core_cookie &&
-		    prio_less(class_pick, max))
+		    prio_less(class_pick, max, in_fi))
 			return idle_sched_class.pick_task(rq);
 
 		return class_pick;
@@ -5179,19 +5168,22 @@ pick_task(struct rq *rq, const struct sc
 	 * the core (so far) and it must be selected, otherwise we must go with
 	 * the cookie pick in order to satisfy the constraint.
 	 */
-	if (prio_less(cookie_pick, class_pick) &&
-	    (!max || prio_less(max, class_pick)))
+	if (prio_less(cookie_pick, class_pick, in_fi) &&
+	    (!max || prio_less(max, class_pick, in_fi)))
 		return class_pick;
 
 	return cookie_pick;
 }
 
+extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+
 static struct task_struct *
 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
 	struct task_struct *next, *max = NULL;
 	const struct sched_class *class;
 	const struct cpumask *smt_mask;
+	bool fi_before = false;
 	bool need_sync;
 	int i, j, cpu;
 
@@ -5273,9 +5265,14 @@ pick_next_task(struct rq *rq, struct tas
 
 		if (!next->core_cookie) {
 			rq->core_pick = NULL;
+			/*
+			 * For robustness, update the min_vruntime_fi for
+			 * unconstrained picks as well.
+			 */
+			WARN_ON_ONCE(fi_before);
+			task_vruntime_update(rq, next, false);
 			goto done;
 		}
-		need_sync = true;
 	}
 
 	for_each_cpu(i, smt_mask) {
@@ -5306,11 +5303,16 @@ pick_next_task(struct rq *rq, struct tas
 			 * highest priority task already selected for this
 			 * core.
 			 */
-			p = pick_task(rq_i, class, max);
+			p = pick_task(rq_i, class, max, fi_before);
 			if (!p)
 				continue;
 
 			rq_i->core_pick = p;
+			if (rq_i->idle == p && rq_i->nr_running) {
+				rq->core->core_forceidle = true;
+				if (!fi_before)
+					rq->core->core_forceidle_seq++;
+			}
 
 			/*
 			 * If this new candidate is of higher priority than the
@@ -5329,6 +5331,7 @@ pick_next_task(struct rq *rq, struct tas
 				max = p;
 
 				if (old_max) {
+					rq->core->core_forceidle = false;
 					for_each_cpu(j, smt_mask) {
 						if (j == i)
 							continue;
@@ -5369,10 +5372,16 @@ pick_next_task(struct rq *rq, struct tas
 		if (!rq_i->core_pick)
 			continue;
 
-		if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running &&
-		    !rq_i->core->core_forceidle) {
-			rq_i->core->core_forceidle = true;
-		}
+		/*
+		 * Update for new !FI->FI transitions, or if continuing to be in !FI:
+		 * fi_before     fi      update?
+		 *  0            0       1
+		 *  0            1       1
+		 *  1            0       1
+		 *  1            1       0
+		 */
+		if (!(fi_before && rq->core->core_forceidle))
+			task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
 
 		if (i == cpu) {
 			rq_i->core_pick = NULL;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10727,6 +10727,81 @@ static inline void task_tick_core(struct
 	    __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
 		resched_curr(rq);
 }
+
+/*
+ * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
+ */
+static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
+{
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		if (forceidle) {
+			if (cfs_rq->forceidle_seq == fi_seq)
+				break;
+			cfs_rq->forceidle_seq = fi_seq;
+		}
+
+		cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
+	}
+}
+
+void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
+{
+	struct sched_entity *se = &p->se;
+
+	if (p->sched_class != &fair_sched_class)
+		return;
+
+	se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
+}
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
+{
+	struct rq *rq = task_rq(a);
+	struct sched_entity *sea = &a->se;
+	struct sched_entity *seb = &b->se;
+	struct cfs_rq *cfs_rqa;
+	struct cfs_rq *cfs_rqb;
+	s64 delta;
+
+	SCHED_WARN_ON(task_rq(b)->core != rq->core);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/*
+	 * Find an se in the hierarchy for tasks a and b, such that the se's
+	 * are immediate siblings.
+	 */
+	while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
+		int sea_depth = sea->depth;
+		int seb_depth = seb->depth;
+
+		if (sea_depth >= seb_depth)
+			sea = parent_entity(sea);
+		if (sea_depth <= seb_depth)
+			seb = parent_entity(seb);
+	}
+
+	se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
+	se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
+
+	cfs_rqa = sea->cfs_rq;
+	cfs_rqb = seb->cfs_rq;
+#else
+	cfs_rqa = &task_rq(a)->cfs;
+	cfs_rqb = &task_rq(b)->cfs;
+#endif
+
+	/*
+	 * Find delta after normalizing se's vruntime with its cfs_rq's
+	 * min_vruntime_fi, which would have been updated in prior calls
+	 * to se_fi_update().
+	 */
+	delta = (s64)(sea->vruntime - seb->vruntime) +
+		(s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+
+	return delta > 0;
+}
 #else
 static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
 #endif
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -517,6 +517,11 @@ struct cfs_rq {
 
 	u64			exec_clock;
 	u64			min_vruntime;
+#ifdef CONFIG_SCHED_CORE
+	unsigned int		forceidle_seq;
+	u64			min_vruntime_fi;
+#endif
+
 #ifndef CONFIG_64BIT
 	u64			min_vruntime_copy;
 #endif
@@ -1075,6 +1080,7 @@ struct rq {
 	unsigned int		core_pick_seq;
 	unsigned long		core_cookie;
 	unsigned char		core_forceidle;
+	unsigned int		core_forceidle_seq;
 #endif
 };
 
@@ -1130,6 +1136,8 @@ static inline raw_spinlock_t *rq_lockp(s
 	return &rq->__lock;
 }
 
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi);
+
 #else /* !CONFIG_SCHED_CORE */
 
 static inline bool sched_core_enabled(struct rq *rq)



  parent reply index

Thread overview: 92+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-22 12:04 [PATCH 00/19] sched: Core Scheduling Peter Zijlstra
2021-04-22 12:05 ` [PATCH 01/19] sched/fair: Add a few assertions Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-05-13  8:56     ` Ning, Hongyu
2021-04-22 12:05 ` [PATCH 02/19] sched: Provide raw_spin_rq_*lock*() helpers Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 03/19] sched: Wrap rq::lock access Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 04/19] sched: Prepare for Core-wide rq->lock Peter Zijlstra
2021-04-24  1:22   ` Josh Don
2021-04-26  8:31     ` Peter Zijlstra
2021-04-26 22:21       ` Josh Don
2021-04-27 17:10         ` Don Hiatt
2021-04-27 23:35           ` Josh Don
2021-04-28  1:03             ` Aubrey Li
2021-04-28  6:05               ` Aubrey Li
2021-04-28 10:57                 ` Aubrey Li
2021-04-28 16:41                   ` Don Hiatt
2021-04-29 20:48                     ` Josh Don
2021-04-29 21:09                       ` Don Hiatt
2021-04-29 23:22                         ` Josh Don
2021-04-30 16:18                           ` Don Hiatt
2021-04-30  8:26                         ` Aubrey Li
2021-04-28 16:04             ` Don Hiatt
2021-04-27 23:30         ` Josh Don
2021-04-28  9:13           ` Peter Zijlstra
2021-04-28 10:35             ` Aubrey Li
2021-04-28 11:03               ` Peter Zijlstra
2021-04-28 14:18                 ` Paul E. McKenney
2021-04-29 20:11             ` Josh Don
2021-05-03 19:17               ` Peter Zijlstra
2021-04-28  7:13         ` Peter Zijlstra
2021-04-28  6:02   ` Aubrey Li
2021-04-29  8:03   ` Aubrey Li
2021-04-29 20:39     ` Josh Don
2021-04-30  8:20       ` Aubrey Li
2021-04-30  8:48         ` Josh Don
2021-04-30 14:15           ` Aubrey Li
2021-05-04  7:38       ` Peter Zijlstra
2021-05-05 16:20         ` Don Hiatt
2021-05-06 10:25           ` Peter Zijlstra
2021-05-07  9:50   ` [PATCH v2 " Peter Zijlstra
2021-05-08  8:07     ` Aubrey Li
2021-05-12  9:07       ` Peter Zijlstra
2021-04-22 12:05 ` [PATCH 05/19] sched: " Peter Zijlstra
2021-05-07  9:50   ` [PATCH v2 " Peter Zijlstra
2021-05-12 10:28     ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 06/19] sched: Optimize rq_lockp() usage Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 07/19] sched: Allow sched_core_put() from atomic context Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 08/19] sched: Introduce sched_class::pick_task() Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 09/19] sched: Basic tracking of matching tasks Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 10/19] sched: Add core wide task selection and scheduling Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 11/19] sched/fair: Fix forced idle sibling starvation corner case Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Vineeth Pillai
2021-04-22 12:05 ` [PATCH 12/19] sched: Fix priority inversion of cookied task with sibling Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Joel Fernandes (Google)
2021-04-22 12:05 ` Peter Zijlstra [this message]
2021-05-12 10:28   ` [tip: sched/core] sched/fair: Snapshot the min_vruntime of CPUs on force idle tip-bot2 for Joel Fernandes (Google)
2021-04-22 12:05 ` [PATCH 14/19] sched: Trivial forced-newidle balancer Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 15/19] sched: Migration changes for core scheduling Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Aubrey Li
2021-04-22 12:05 ` [PATCH 16/19] sched: Trivial core scheduling cookie management Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 17/19] sched: Inherit task cookie on fork() Peter Zijlstra
2021-05-10 16:06   ` Joel Fernandes
2021-05-10 16:22     ` Chris Hyser
2021-05-10 20:47       ` Joel Fernandes
2021-05-10 21:38         ` Chris Hyser
2021-05-12  9:05           ` Peter Zijlstra
2021-05-12 20:20             ` Josh Don
2021-05-12 21:07               ` Don Hiatt
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 18/19] sched: prctl() core-scheduling interface Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Chris Hyser
2021-06-14 23:36   ` [PATCH 18/19] " Josh Don
2021-06-15 11:31     ` Joel Fernandes
2021-04-22 12:05 ` [PATCH 19/19] kselftest: Add test for core sched prctl interface Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Chris Hyser
2021-04-22 16:43 ` [PATCH 00/19] sched: Core Scheduling Don Hiatt
2021-04-22 17:29   ` Peter Zijlstra
2021-04-30  6:47 ` Ning, Hongyu
2021-05-06 10:29   ` Peter Zijlstra
2021-05-06 12:53     ` Ning, Hongyu
2021-05-07 18:02 ` Joel Fernandes
2021-05-10 16:16 ` Vincent Guittot
2021-05-11  7:00   ` Vincent Guittot

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210422123308.738542617@infradead.org \
    --to=peterz@infradead.org \
    --cc=chris.hyser@oracle.com \
    --cc=joel@joelfernandes.org \
    --cc=joshdon@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@kernel.org \
    --cc=tglx@linutronix.de \
    --cc=valentin.schneider@arm.com \
    --cc=vincent.guittot@linaro.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

LKML Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/lkml/0 lkml/git/0.git
	git clone --mirror https://lore.kernel.org/lkml/1 lkml/git/1.git
	git clone --mirror https://lore.kernel.org/lkml/2 lkml/git/2.git
	git clone --mirror https://lore.kernel.org/lkml/3 lkml/git/3.git
	git clone --mirror https://lore.kernel.org/lkml/4 lkml/git/4.git
	git clone --mirror https://lore.kernel.org/lkml/5 lkml/git/5.git
	git clone --mirror https://lore.kernel.org/lkml/6 lkml/git/6.git
	git clone --mirror https://lore.kernel.org/lkml/7 lkml/git/7.git
	git clone --mirror https://lore.kernel.org/lkml/8 lkml/git/8.git
	git clone --mirror https://lore.kernel.org/lkml/9 lkml/git/9.git
	git clone --mirror https://lore.kernel.org/lkml/10 lkml/git/10.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 lkml lkml/ https://lore.kernel.org/lkml \
		linux-kernel@vger.kernel.org
	public-inbox-index lkml

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-kernel


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git