From: Vineeth Remanan Pillai <vpillai@digitalocean.com>
To: Nishanth Aravamudan <naravamudan@digitalocean.com>,
	Julien Desfossez <jdesfossez@digitalocean.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Tim Chen <tim.c.chen@linux.intel.com>,
	mingo@kernel.org, tglx@linutronix.de, pjt@google.com,
	torvalds@linux-foundation.org
Cc: Aaron Lu <aaron.lwe@gmail.com>,
	linux-kernel@vger.kernel.org, subhra.mazumdar@oracle.com,
	fweisbec@gmail.com, keescook@chromium.org, kerrnel@google.com,
	Phil Auld <pauld@redhat.com>, Aubrey Li <aubrey.intel@gmail.com>,
	Valentin Schneider <valentin.schneider@arm.com>,
	Mel Gorman <mgorman@techsingularity.net>,
	Pawan Gupta <pawan.kumar.gupta@linux.intel.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Joel Fernandes <joelaf@google.com>,
	joel@joelfernandes.org, vineethrp@gmail.com,
	Chen Yu <yu.c.chen@intel.com>,
	Christian Brauner <christian.brauner@ubuntu.com>,
	Aaron Lu <ziqian.lzq@antfin.com>
Subject: [RFC PATCH 09/16] sched/fair: core wide cfs task priority comparison
Date: Tue, 30 Jun 2020 21:32:30 +0000	[thread overview]
Message-ID: <2d0e3da8c80c314048d598fb03ae3fe52d9619f3.1593530334.git.vpillai@digitalocean.com> (raw)
In-Reply-To: <cover.1593530334.git.vpillai@digitalocean.com>

From: Aaron Lu <aaron.lwe@gmail.com>

This patch provides a vruntime based way to compare the priority of two
cfs tasks, whether they are on the same cpu or on different threads of
the same core.

When the two tasks are on the same CPU, we just need to find a common
cfs_rq that both sched entities are on and compare their vruntimes there.
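
In code terms, the same cpu path of cfs_prio_less() reduces to roughly
the following; this is condensed from the fair.c hunk in this patch, not
a separate implementation:

    /* same cpu: walk both entities up until they share a cfs_rq */
    while (!is_same_group(sea, seb)) {
            int sea_depth = sea->depth;
            int seb_depth = seb->depth;

            if (sea_depth >= seb_depth)
                    sea = parent_entity(sea);
            if (sea_depth <= seb_depth)
                    seb = parent_entity(seb);
    }

    /* sea and seb now sit on the same cfs_rq, vruntime is comparable */
    return (s64)(sea->vruntime - seb->vruntime) > 0;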

When the two tasks are on different threads of the same core, each thread
will choose its next task the usual way, and then the root level sched
entities that the two tasks belong to are used to decide which task runs
next core wide.

An illustration for the cross CPU case:

   cpu0         cpu1
 /   |  \     /   |  \
se1 se2 se3  se4 se5 se6
    /  \            /   \
  se21 se22       se61  se62
  (A)                    /
                       se621
                        (B)

Assume cpu0 and cpu1 are SMT siblings, cpu0 has decided task A is to run
next and cpu1 has decided on task B. To compare the priority of task A
and task B, we compare the root level sched entities they belong to, se2
and se6: whichever has the smaller vruntime wins.
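
In code terms, the cross cpu path of cfs_prio_less() reduces to roughly
the following (again condensed from the fair.c hunk in this patch):

    /* cross cpu: compare the root level sched entities' vruntime */
    while (sea->parent)
            sea = sea->parent;
    while (seb->parent)
            seb = seb->parent;

    return (s64)(sea->vruntime - seb->vruntime) > 0;

For example (numbers made up purely for illustration): if se2's vruntime
is 100 and se6's vruntime is 120 after normalization, task A wins and
runs next core wide.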

To make this work, the vruntime of the two threads' root level sched
entities must be directly comparable. So the min_vruntime of one
sibling's root cfs_rq is chosen as the core wide min_vruntime, and the
vruntime of all root level sched entities is normalized against it.
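
When core scheduling is enabled, this normalization is done by shifting
the sibling's queued root level sched entities by the difference between
the core wide min_vruntime and the sibling's own min_vruntime; condensed
from sched_core_adjust_sibling_vruntime() in this patch:

    s64 delta = core_cfs->min_vruntime - cfs->min_vruntime;

    rbtree_postorder_for_each_entry_safe(se, next,
                    &cfs->tasks_timeline.rb_root, run_node)
            se->vruntime += delta;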

Sub cfs_rqs and their sched entities are not involved in the cross cpu
priority comparison: they only participate in the usual cpu local
scheduling decisions, so there is no need to normalize their vruntimes.
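
On the update side, update_min_vruntime() is redirected so that an update
of a root cfs_rq's min_vruntime lands in the core wide (rq->core) cfs_rq
instead; condensed from the update_min_vruntime() hunk below:

    if (sched_core_enabled(rq_of(cfs_rq)) && is_root_cfs_rq(cfs_rq))
            cfs_rq = core_cfs_rq(cfs_rq);

    /* ensure we never gain time by being placed backwards. */
    cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);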

Signed-off-by: Aaron Lu <ziqian.lzq@antfin.com>
---
 kernel/sched/core.c  |  23 +++----
 kernel/sched/fair.c  | 142 ++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |   3 +
 3 files changed, 150 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f51e5c4798c8..4d6d6a678013 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -114,19 +114,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b)
 	if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
 		return !dl_time_before(a->dl.deadline, b->dl.deadline);
 
-	if (pa == MAX_RT_PRIO + MAX_NICE)  { /* fair */
-		u64 vruntime = b->se.vruntime;
-
-		/*
-		 * Normalize the vruntime if tasks are in different cpus.
-		 */
-		if (task_cpu(a) != task_cpu(b)) {
-			vruntime -= task_cfs_rq(b)->min_vruntime;
-			vruntime += task_cfs_rq(a)->min_vruntime;
-		}
-
-		return !((s64)(a->se.vruntime - vruntime) <= 0);
-	}
+	if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
+		return cfs_prio_less(a, b);
 
 	return false;
 }
@@ -229,8 +218,12 @@ static int __sched_core_stopper(void *data)
 	bool enabled = !!(unsigned long)data;
 	int cpu;
 
-	for_each_possible_cpu(cpu)
-		cpu_rq(cpu)->core_enabled = enabled;
+	for_each_possible_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+		rq->core_enabled = enabled;
+		if (cpu_online(cpu) && rq->core != rq)
+			sched_core_adjust_sibling_vruntime(cpu, enabled);
+	}
 
 	return 0;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 61d19e573443..d16939766361 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -462,11 +462,142 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
+#ifdef CONFIG_SCHED_CORE
+static inline struct cfs_rq *root_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	return &rq_of(cfs_rq)->cfs;
+}
+
+static inline bool is_root_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq == root_cfs_rq(cfs_rq);
+}
+
+static inline struct cfs_rq *core_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	return &rq_of(cfs_rq)->core->cfs;
+}
+
+static inline u64 cfs_rq_min_vruntime(struct cfs_rq *cfs_rq)
+{
+	if (!sched_core_enabled(rq_of(cfs_rq)) || !is_root_cfs_rq(cfs_rq))
+		return cfs_rq->min_vruntime;
+
+	return core_cfs_rq(cfs_rq)->min_vruntime;
+}
+
+#ifndef CONFIG_64BIT
+static inline u64 cfs_rq_min_vruntime_copy(struct cfs_rq *cfs_rq)
+{
+	if (!sched_core_enabled(rq_of(cfs_rq)) || !is_root_cfs_rq(cfs_rq))
+		return cfs_rq->min_vruntime_copy;
+
+	return core_cfs_rq(cfs_rq)->min_vruntime_copy;
+}
+#endif /* CONFIG_64BIT */
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
+{
+	struct sched_entity *sea = &a->se;
+	struct sched_entity *seb = &b->se;
+	bool samecpu = task_cpu(a) == task_cpu(b);
+	s64 delta;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (samecpu) {
+		/* vruntime is per cfs_rq */
+		while (!is_same_group(sea, seb)) {
+			int sea_depth = sea->depth;
+			int seb_depth = seb->depth;
+
+			if (sea_depth >= seb_depth)
+				sea = parent_entity(sea);
+			if (sea_depth <= seb_depth)
+				seb = parent_entity(seb);
+		}
+
+		delta = (s64)(sea->vruntime - seb->vruntime);
+		goto out;
+	}
+
+	/* crosscpu: compare root level se's vruntime to decide priority */
+	while (sea->parent)
+		sea = sea->parent;
+	while (seb->parent)
+		seb = seb->parent;
+#else
+	/*
+	 * Use the min_vruntime of root cfs_rq if the tasks are
+	 * enqueued in different cpus.
+	 */
+	if (!samecpu) {
+		delta = (s64)(task_rq(a)->cfs.min_vruntime -
+			      task_rq(b)->cfs.min_vruntime);
+		goto out;
+	}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+	delta = (s64)(sea->vruntime - seb->vruntime);
+
+out:
+	return delta > 0;
+}
+
+/*
+ * This function takes care of adjusting the min_vruntime of siblings of
+ * a core during coresched enable/disable.
+ * This is called in stop machine context so no need to take the rq lock.
+ *
+ * Coresched enable case
+ *  Once Core scheduling is enabled, the root level sched entities
+ *  of both siblings will use cfs_rq->min_vruntime as the common cfs_rq
+ *  min_vruntime, so it's necessary to normalize vruntime of existing root
+ *  level sched entities in sibling_cfs_rq.
+ *
+ *  Update of sibling_cfs_rq's min_vruntime isn't necessary as we will be
+ *  only using cfs_rq->min_vruntime during the entire run of core scheduling.
+ *
+ * Coresched disable case
+ *  During the entire run of core scheduling, sibling_cfs_rq's min_vruntime
+ *  is left unused and could lag far behind its still queued sched entities.
+ *  Sync it to the up2date core wide one to avoid problems.
+ */
+void sched_core_adjust_sibling_vruntime(int cpu, bool coresched_enabled)
+{
+	struct cfs_rq *cfs = &cpu_rq(cpu)->cfs;
+	struct cfs_rq *core_cfs = &cpu_rq(cpu)->core->cfs;
+	if (coresched_enabled) {
+		struct sched_entity *se, *next;
+		s64 delta = core_cfs->min_vruntime - cfs->min_vruntime;
+		rbtree_postorder_for_each_entry_safe(se, next,
+				&cfs->tasks_timeline.rb_root,
+				run_node) {
+			se->vruntime += delta;
+		}
+	} else {
+		cfs->min_vruntime = core_cfs->min_vruntime;
+#ifndef CONFIG_64BIT
+		smp_wmb();
+		cfs->min_vruntime_copy = core_cfs->min_vruntime;
+#endif
+	}
+}
+
+#else
 static inline u64 cfs_rq_min_vruntime(struct cfs_rq *cfs_rq)
 {
 	return cfs_rq->min_vruntime;
 }
 
+#ifndef CONFIG_64BIT
+static inline u64 cfs_rq_min_vruntime_copy(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->min_vruntime_copy;
+}
+#endif /* CONFIG_64BIT */
+
+#endif /* CONFIG_SCHED_CORE */
+
 static __always_inline
 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
@@ -522,8 +653,13 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
 			vruntime = min_vruntime(vruntime, se->vruntime);
 	}
 
+#ifdef CONFIG_SCHED_CORE
+	if (sched_core_enabled(rq_of(cfs_rq)) && is_root_cfs_rq(cfs_rq))
+		cfs_rq = core_cfs_rq(cfs_rq);
+#endif
+
 	/* ensure we never gain time by being placed backwards. */
-	cfs_rq->min_vruntime = max_vruntime(cfs_rq_min_vruntime(cfs_rq), vruntime);
+	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
 #ifndef CONFIG_64BIT
 	smp_wmb();
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
@@ -6700,9 +6836,9 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
 		u64 min_vruntime_copy;
 
 		do {
-			min_vruntime_copy = cfs_rq->min_vruntime_copy;
+			min_vruntime_copy = cfs_rq_min_vruntime_copy(cfs_rq);
 			smp_rmb();
-			min_vruntime = cfs_rq->min_vruntime;
+			min_vruntime = cfs_rq_min_vruntime(cfs_rq);
 		} while (min_vruntime != min_vruntime_copy);
 #else
 		min_vruntime = cfs_rq_min_vruntime(cfs_rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4a738093d731..293aa1ae0308 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1086,6 +1086,9 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)
 	return &rq->__lock;
 }
 
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b);
+void sched_core_adjust_sibling_vruntime(int cpu, bool coresched_enabled);
+
 #else /* !CONFIG_SCHED_CORE */
 
 static inline bool sched_core_enabled(struct rq *rq)
-- 
2.17.1


