LKML Archive on lore.kernel.org
 help / color / Atom feed
From: Aaron Lu <aaron.lu@linux.alibaba.com>
To: Aubrey Li <aubrey.intel@gmail.com>
Cc: "Julien Desfossez" <jdesfossez@digitalocean.com>,
	"Subhra Mazumdar" <subhra.mazumdar@oracle.com>,
	"Vineeth Remanan Pillai" <vpillai@digitalocean.com>,
	"Nishanth Aravamudan" <naravamudan@digitalocean.com>,
	"Peter Zijlstra" <peterz@infradead.org>,
	"Tim Chen" <tim.c.chen@linux.intel.com>,
	"Ingo Molnar" <mingo@kernel.org>,
	"Thomas Gleixner" <tglx@linutronix.de>,
	"Paul Turner" <pjt@google.com>,
	"Linus Torvalds" <torvalds@linux-foundation.org>,
	"Linux List Kernel Mailing" <linux-kernel@vger.kernel.org>,
	"Frédéric Weisbecker" <fweisbec@gmail.com>,
	"Kees Cook" <keescook@chromium.org>,
	"Greg Kerr" <kerrnel@google.com>, "Phil Auld" <pauld@redhat.com>,
	"Valentin Schneider" <valentin.schneider@arm.com>,
	"Mel Gorman" <mgorman@techsingularity.net>,
	"Pawan Gupta" <pawan.kumar.gupta@linux.intel.com>,
	"Paolo Bonzini" <pbonzini@redhat.com>
Subject: [PATCH 2/3] core vruntime comparison
Date: Thu, 25 Jul 2019 22:32:49 +0800
Message-ID: <20190725143248.GC992@aaronlu> (raw)
In-Reply-To: <20190725143003.GA992@aaronlu>

This patch provides a vruntime-based way to compare the priority of two
cfs tasks, whether they are on the same cpu or on different threads of
the same core.

When the two tasks are on the same CPU, we just need to find a common
cfs_rq both sched_entities are on and then do the comparison.

When the two tasks are on different threads of the same core, the root
level sched_entities to which the two tasks belong will be used to do
the comparison.

An ugly illustration for the cross CPU case:

   cpu0         cpu1
 /   |  \     /   |  \
se1 se2 se3  se4 se5 se6
    /  \            /   \
  se21 se22       se61  se62

Assume CPU0 and CPU1 are smt siblings and task A's se is se21 while
task B's se is se61. To compare priority of task A and B, we compare
priority of se2 and se6. Whichever has the smaller vruntime wins.

To make this work, the root level se should have a common cfs_rq min
vruntime, which I call the core cfs_rq min vruntime.

Potential issues: when core scheduling is enabled, if there are tasks
already in some CPU's rq, then new tasks will be queued with the per-core
cfs_rq min vruntime while the old tasks are using the original root
level cfs_rq's min_vruntime. The two values can differ greatly and can
cause tasks with a large vruntime to starve. So enable core scheduling
early when the system is still kind of idle for the time being to avoid
this problem.

Signed-off-by: Aaron Lu <ziqian.lzq@antfin.com>
---
 kernel/sched/core.c  | 15 ++-------
 kernel/sched/fair.c  | 79 +++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |  2 ++
 3 files changed, 82 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 90655c9ad937..bc746ea4cc82 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -105,19 +105,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b)
 	if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
 		return !dl_time_before(a->dl.deadline, b->dl.deadline);
 
-	if (pa == MAX_RT_PRIO + MAX_NICE)  { /* fair */
-		u64 vruntime = b->se.vruntime;
-
-		/*
-		 * Normalize the vruntime if tasks are in different cpus.
-		 */
-		if (task_cpu(a) != task_cpu(b)) {
-			vruntime -= task_cfs_rq(b)->min_vruntime;
-			vruntime += task_cfs_rq(a)->min_vruntime;
-		}
-
-		return !((s64)(a->se.vruntime - vruntime) <= 0);
-	}
+	if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
+		return cfs_prio_less(a, b);
 
 	return false;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a7b26c96f46b..43babc2a12a5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -431,9 +431,85 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
+static inline struct cfs_rq *root_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	return &rq_of(cfs_rq)->cfs;
+}
+
+static inline bool is_root_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq == root_cfs_rq(cfs_rq);
+}
+
+static inline struct cfs_rq *core_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	return &rq_of(cfs_rq)->core->cfs;
+}
+
 static inline u64 cfs_rq_min_vruntime(struct cfs_rq *cfs_rq)
 {
-	return cfs_rq->min_vruntime;
+	if (!sched_core_enabled(rq_of(cfs_rq)))
+		return cfs_rq->min_vruntime;
+
+	if (is_root_cfs_rq(cfs_rq))
+		return core_cfs_rq(cfs_rq)->min_vruntime;
+	else
+		return cfs_rq->min_vruntime;
+}
+
+static void update_core_cfs_rq_min_vruntime(struct cfs_rq *cfs_rq)
+{
+	struct cfs_rq *cfs_rq_core;
+
+	if (!sched_core_enabled(rq_of(cfs_rq)))
+		return;
+
+	if (!is_root_cfs_rq(cfs_rq))
+		return;
+
+	cfs_rq_core = core_cfs_rq(cfs_rq);
+	cfs_rq_core->min_vruntime = max(cfs_rq_core->min_vruntime,
+					cfs_rq->min_vruntime);
+}
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
+{
+	struct sched_entity *sea = &a->se;
+	struct sched_entity *seb = &b->se;
+	bool samecpu = task_cpu(a) == task_cpu(b);
+	struct task_struct *p;
+	s64 delta;
+
+	if (samecpu) {
+		/* vruntime is per cfs_rq */
+		while (!is_same_group(sea, seb)) {
+			int sea_depth = sea->depth;
+			int seb_depth = seb->depth;
+
+			if (sea_depth >= seb_depth)
+				sea = parent_entity(sea);
+			if (sea_depth <= seb_depth)
+				seb = parent_entity(seb);
+		}
+
+		delta = (s64)(sea->vruntime - seb->vruntime);
+		goto out;
+	}
+
+	/* crosscpu: compare root level se's vruntime to decide priority */
+	while (sea->parent)
+		sea = sea->parent;
+	while (seb->parent)
+		seb = seb->parent;
+	delta = (s64)(sea->vruntime - seb->vruntime);
+
+out:
+	p = delta > 0 ? b : a;
+	trace_printk("picked %s/%d %s: %Ld %Ld %Ld\n", p->comm, p->pid,
+			samecpu ? "samecpu" : "crosscpu",
+			sea->vruntime, seb->vruntime, delta);
+
+	return delta > 0;
 }
 
 static __always_inline
@@ -493,6 +569,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
 
 	/* ensure we never gain time by being placed backwards. */
 	cfs_rq->min_vruntime = max_vruntime(cfs_rq_min_vruntime(cfs_rq), vruntime);
+	update_core_cfs_rq_min_vruntime(cfs_rq);
 #ifndef CONFIG_64BIT
 	smp_wmb();
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e91c188a452c..02a6d71704f0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2454,3 +2454,5 @@ static inline bool sched_energy_enabled(void)
 static inline bool sched_energy_enabled(void) { return false; }
 
 #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b);
-- 
2.19.1.3.ge56e4f7


  parent reply index

Thread overview: 161+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-05-29 20:36 [RFC PATCH v3 00/16] Core scheduling v3 Vineeth Remanan Pillai
2019-05-29 20:36 ` [RFC PATCH v3 01/16] stop_machine: Fix stop_cpus_in_progress ordering Vineeth Remanan Pillai
2019-08-08 10:54   ` [tip:sched/core] " tip-bot for Peter Zijlstra
2019-08-26 16:19   ` [RFC PATCH v3 01/16] " mark gross
2019-08-26 16:59     ` Peter Zijlstra
2019-05-29 20:36 ` [RFC PATCH v3 02/16] sched: Fix kerneldoc comment for ia64_set_curr_task Vineeth Remanan Pillai
2019-08-08 10:55   ` [tip:sched/core] " tip-bot for Peter Zijlstra
2019-08-26 16:20   ` [RFC PATCH v3 02/16] " mark gross
2019-05-29 20:36 ` [RFC PATCH v3 03/16] sched: Wrap rq::lock access Vineeth Remanan Pillai
2019-05-29 20:36 ` [RFC PATCH v3 04/16] sched/{rt,deadline}: Fix set_next_task vs pick_next_task Vineeth Remanan Pillai
2019-08-08 10:55   ` [tip:sched/core] " tip-bot for Peter Zijlstra
2019-05-29 20:36 ` [RFC PATCH v3 05/16] sched: Add task_struct pointer to sched_class::set_curr_task Vineeth Remanan Pillai
2019-08-08 10:57   ` [tip:sched/core] " tip-bot for Peter Zijlstra
2019-05-29 20:36 ` [RFC PATCH v3 06/16] sched/fair: Export newidle_balance() Vineeth Remanan Pillai
2019-08-08 10:58   ` [tip:sched/core] sched/fair: Expose newidle_balance() tip-bot for Peter Zijlstra
2019-05-29 20:36 ` [RFC PATCH v3 07/16] sched: Allow put_prev_task() to drop rq->lock Vineeth Remanan Pillai
2019-08-08 10:58   ` [tip:sched/core] " tip-bot for Peter Zijlstra
2019-08-26 16:51   ` [RFC PATCH v3 07/16] " mark gross
2019-05-29 20:36 ` [RFC PATCH v3 08/16] sched: Rework pick_next_task() slow-path Vineeth Remanan Pillai
2019-08-08 10:59   ` [tip:sched/core] " tip-bot for Peter Zijlstra
2019-08-26 17:01   ` [RFC PATCH v3 08/16] " mark gross
2019-05-29 20:36 ` [RFC PATCH v3 09/16] sched: Introduce sched_class::pick_task() Vineeth Remanan Pillai
2019-08-26 17:14   ` mark gross
2019-05-29 20:36 ` [RFC PATCH v3 10/16] sched: Core-wide rq->lock Vineeth Remanan Pillai
2019-05-31 11:08   ` Peter Zijlstra
2019-05-31 15:23     ` Vineeth Pillai
2019-05-29 20:36 ` [RFC PATCH v3 11/16] sched: Basic tracking of matching tasks Vineeth Remanan Pillai
2019-08-26 20:59   ` mark gross
2019-05-29 20:36 ` [RFC PATCH v3 12/16] sched: A quick and dirty cgroup tagging interface Vineeth Remanan Pillai
2019-05-29 20:36 ` [RFC PATCH v3 13/16] sched: Add core wide task selection and scheduling Vineeth Remanan Pillai
2019-06-07 23:36   ` Pawan Gupta
2019-05-29 20:36 ` [RFC PATCH v3 14/16] sched/fair: Add a few assertions Vineeth Remanan Pillai
2019-05-29 20:36 ` [RFC PATCH v3 15/16] sched: Trivial forced-newidle balancer Vineeth Remanan Pillai
2019-05-29 20:36 ` [RFC PATCH v3 16/16] sched: Debug bits Vineeth Remanan Pillai
2019-05-29 21:02   ` Peter Oskolkov
2019-05-30 14:04 ` [RFC PATCH v3 00/16] Core scheduling v3 Aubrey Li
2019-05-30 14:17   ` Julien Desfossez
2019-05-31  4:55     ` Aubrey Li
2019-05-31  3:01   ` Aaron Lu
2019-05-31  5:12     ` Aubrey Li
2019-05-31  6:09       ` Aaron Lu
2019-05-31  6:53         ` Aubrey Li
2019-05-31  7:44           ` Aaron Lu
2019-05-31  8:26             ` Aubrey Li
2019-05-31 21:08     ` Julien Desfossez
2019-06-06 15:26       ` Julien Desfossez
2019-06-12  1:52         ` Li, Aubrey
2019-06-12 16:06           ` Julien Desfossez
2019-06-12 16:33         ` Julien Desfossez
2019-06-13  0:03           ` Subhra Mazumdar
2019-06-13  3:22             ` Julien Desfossez
2019-06-17  2:51               ` Aubrey Li
2019-06-19 18:33                 ` Julien Desfossez
2019-07-18 10:07                   ` Aaron Lu
2019-07-18 23:27                     ` Tim Chen
2019-07-19  5:52                       ` Aaron Lu
2019-07-19 11:48                         ` Aubrey Li
2019-07-19 18:33                         ` Tim Chen
2019-07-22 10:26                     ` Aubrey Li
2019-07-22 10:43                       ` Aaron Lu
2019-07-23  2:52                         ` Aubrey Li
2019-07-25 14:30                       ` Aaron Lu
2019-07-25 14:31                         ` [RFC PATCH 1/3] wrapper for cfs_rq->min_vruntime Aaron Lu
2019-07-25 14:32                         ` Aaron Lu [this message]
2019-08-06 14:17                           ` [PATCH 2/3] core vruntime comparison Peter Zijlstra
2019-07-25 14:33                         ` [PATCH 3/3] temp hack to make tick based schedule happen Aaron Lu
2019-07-25 21:42                         ` [RFC PATCH v3 00/16] Core scheduling v3 Li, Aubrey
2019-07-26 15:21                         ` Julien Desfossez
2019-07-26 21:29                           ` Tim Chen
2019-07-31  2:42                           ` Li, Aubrey
2019-08-02 15:37                             ` Julien Desfossez
2019-08-05 15:55                               ` Tim Chen
2019-08-06  3:24                                 ` Aaron Lu
2019-08-06  6:56                                   ` Aubrey Li
2019-08-06  7:04                                     ` Aaron Lu
2019-08-06 12:24                                       ` Vineeth Remanan Pillai
2019-08-06 13:49                                         ` Aaron Lu
2019-08-06 16:14                                           ` Vineeth Remanan Pillai
2019-08-06 14:16                                         ` Peter Zijlstra
2019-08-06 15:53                                           ` Vineeth Remanan Pillai
2019-08-06 17:03                                   ` Tim Chen
2019-08-06 17:12                                     ` Peter Zijlstra
2019-08-06 21:19                                       ` Tim Chen
2019-08-08  6:47                                         ` Aaron Lu
2019-08-08 17:27                                           ` Tim Chen
2019-08-08 21:42                                             ` Tim Chen
2019-08-10 14:15                                               ` Aaron Lu
2019-08-12 15:38                                                 ` Vineeth Remanan Pillai
2019-08-13  2:24                                                   ` Aaron Lu
2019-08-08 12:55                                 ` Aaron Lu
2019-08-08 16:39                                   ` Tim Chen
2019-08-10 14:18                                     ` Aaron Lu
2019-08-05 20:09                               ` Phil Auld
2019-08-06 13:54                                 ` Aaron Lu
2019-08-06 14:17                                   ` Phil Auld
2019-08-06 14:41                                     ` Aaron Lu
2019-08-06 14:55                                       ` Phil Auld
2019-08-07  8:58                               ` Dario Faggioli
2019-08-07 17:10                                 ` Tim Chen
2019-08-15 16:09                                   ` Dario Faggioli
2019-08-16  2:33                                     ` Aaron Lu
2019-09-05  1:44                                   ` Julien Desfossez
2019-09-06 22:17                                     ` Tim Chen
2019-09-18 21:27                                     ` Tim Chen
2019-09-06 18:30                                   ` Tim Chen
2019-09-11 14:02                                     ` Aaron Lu
2019-09-11 16:19                                       ` Tim Chen
2019-09-11 16:47                                         ` Vineeth Remanan Pillai
2019-09-12 12:35                                           ` Aaron Lu
2019-09-12 17:29                                             ` Tim Chen
2019-09-13 14:15                                               ` Aaron Lu
2019-09-13 17:13                                                 ` Tim Chen
2019-09-30 11:53                                             ` Vineeth Remanan Pillai
2019-10-02 20:48                                               ` Vineeth Remanan Pillai
2019-10-10 13:54                                                 ` Aaron Lu
2019-10-10 14:29                                                   ` Vineeth Remanan Pillai
2019-10-11  7:33                                                     ` Aaron Lu
2019-10-11 11:32                                                       ` Vineeth Remanan Pillai
2019-10-11 12:01                                                         ` Aaron Lu
2019-10-11 12:10                                                           ` Vineeth Remanan Pillai
2019-10-12  3:55                                                             ` Aaron Lu
2019-10-13 12:44                                                               ` Vineeth Remanan Pillai
2019-10-14  9:57                                                                 ` Aaron Lu
2019-10-21 12:30                                                                   ` Vineeth Remanan Pillai
2019-09-12 12:04                                         ` Aaron Lu
2019-09-12 17:05                                           ` Tim Chen
2019-09-13 13:57                                             ` Aaron Lu
2019-09-12 23:12                                           ` Aubrey Li
2019-09-15 14:14                                             ` Aaron Lu
2019-09-18  1:33                                               ` Aubrey Li
2019-09-18 20:40                                                 ` Tim Chen
2019-09-18 22:16                                                   ` Aubrey Li
2019-09-30 14:36                                                     ` Vineeth Remanan Pillai
2019-10-29 20:40                                                   ` Julien Desfossez
2019-11-01 21:42                                                     ` Tim Chen
2019-10-29  9:11                                               ` Dario Faggioli
2019-10-29  9:15                                                 ` Dario Faggioli
2019-10-29  9:16                                                 ` Dario Faggioli
2019-10-29  9:17                                                 ` Dario Faggioli
2019-10-29  9:18                                                 ` Dario Faggioli
2019-10-29  9:18                                                 ` Dario Faggioli
2019-10-29  9:19                                                 ` Dario Faggioli
2019-10-29  9:20                                                 ` Dario Faggioli
2019-10-29 20:34                                                   ` Julien Desfossez
2019-11-15 16:30                                                     ` Dario Faggioli
2019-09-25  2:40                                     ` Aubrey Li
2019-09-25 17:24                                       ` Tim Chen
2019-09-25 22:07                                         ` Aubrey Li
2019-09-30 15:22                                     ` Julien Desfossez
2019-08-27 21:14 ` Matthew Garrett
2019-08-27 21:50   ` Peter Zijlstra
2019-08-28 15:30     ` Phil Auld
2019-08-28 16:01       ` Peter Zijlstra
2019-08-28 16:37         ` Tim Chen
2019-08-29 14:30         ` Phil Auld
2019-08-29 14:38           ` Peter Zijlstra
2019-09-10 14:27             ` Julien Desfossez
2019-09-18 21:12               ` Tim Chen
2019-08-28 15:59     ` Tim Chen
2019-08-28 16:16       ` Peter Zijlstra
2019-08-27 23:24   ` Aubrey Li

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190725143248.GC992@aaronlu \
    --to=aaron.lu@linux.alibaba.com \
    --cc=aubrey.intel@gmail.com \
    --cc=fweisbec@gmail.com \
    --cc=jdesfossez@digitalocean.com \
    --cc=keescook@chromium.org \
    --cc=kerrnel@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@techsingularity.net \
    --cc=mingo@kernel.org \
    --cc=naravamudan@digitalocean.com \
    --cc=pauld@redhat.com \
    --cc=pawan.kumar.gupta@linux.intel.com \
    --cc=pbonzini@redhat.com \
    --cc=peterz@infradead.org \
    --cc=pjt@google.com \
    --cc=subhra.mazumdar@oracle.com \
    --cc=tglx@linutronix.de \
    --cc=tim.c.chen@linux.intel.com \
    --cc=torvalds@linux-foundation.org \
    --cc=valentin.schneider@arm.com \
    --cc=vpillai@digitalocean.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

LKML Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/lkml/0 lkml/git/0.git
	git clone --mirror https://lore.kernel.org/lkml/1 lkml/git/1.git
	git clone --mirror https://lore.kernel.org/lkml/2 lkml/git/2.git
	git clone --mirror https://lore.kernel.org/lkml/3 lkml/git/3.git
	git clone --mirror https://lore.kernel.org/lkml/4 lkml/git/4.git
	git clone --mirror https://lore.kernel.org/lkml/5 lkml/git/5.git
	git clone --mirror https://lore.kernel.org/lkml/6 lkml/git/6.git
	git clone --mirror https://lore.kernel.org/lkml/7 lkml/git/7.git
	git clone --mirror https://lore.kernel.org/lkml/8 lkml/git/8.git
	git clone --mirror https://lore.kernel.org/lkml/9 lkml/git/9.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 lkml lkml/ https://lore.kernel.org/lkml \
		linux-kernel@vger.kernel.org
	public-inbox-index lkml

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-kernel


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git