All of lore.kernel.org
 help / color / mirror / Atom feed
From: vpillai <vpillai@digitalocean.com>
To: Nishanth Aravamudan <naravamudan@digitalocean.com>,
	Julien Desfossez <jdesfossez@digitalocean.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Tim Chen <tim.c.chen@linux.intel.com>,
	mingo@kernel.org, tglx@linutronix.de, pjt@google.com,
	torvalds@linux-foundation.org
Cc: Aaron Lu <aaron.lu@linux.alibaba.com>,
	linux-kernel@vger.kernel.org, fweisbec@gmail.com,
	keescook@chromium.org, kerrnel@google.com,
	Phil Auld <pauld@redhat.com>, Aaron Lu <aaron.lwe@gmail.com>,
	Aubrey Li <aubrey.intel@gmail.com>,
	aubrey.li@linux.intel.com,
	Valentin Schneider <valentin.schneider@arm.com>,
	Mel Gorman <mgorman@techsingularity.net>,
	Pawan Gupta <pawan.kumar.gupta@linux.intel.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Joel Fernandes <joelaf@google.com>,
	joel@joelfernandes.org, Aaron Lu <ziqian.lzq@antfin.com>
Subject: [RFC PATCH 09/13] sched/fair: core wide vruntime comparison
Date: Wed,  4 Mar 2020 16:59:59 +0000	[thread overview]
Message-ID: <2f83d888890cec14be3a7aead0859dceebb4012f.1583332765.git.vpillai@digitalocean.com> (raw)
In-Reply-To: <cover.1583332764.git.vpillai@digitalocean.com>
In-Reply-To: <cover.1583332764.git.vpillai@digitalocean.com>

From: Aaron Lu <aaron.lu@linux.alibaba.com>

This patch provides a vruntime-based way to compare the priority of two
cfs tasks, whether they are on the same CPU or on different threads of
the same core.

When the two tasks are on the same CPU, we just need to find a common
cfs_rq both sched_entities are on and then do the comparison.

When the two tasks are on different threads of the same core, the root
level sched_entities to which the two tasks belong will be used to do
the comparison.

An ugly illustration for the cross CPU case:

   cpu0         cpu1
 /   |  \     /   |  \
se1 se2 se3  se4 se5 se6
    /  \            /   \
  se21 se22       se61  se62

Assume CPU0 and CPU1 are SMT siblings and task A's se is se21 while
task B's se is se61. To compare the priority of tasks A and B, we compare
the priority of se2 and se6: whichever has the smaller vruntime wins.

To make this work, the root level sched_entities should have a common
cfs_rq min vruntime, which I call the core cfs_rq min vruntime.

When we adjust the min_vruntime of rq->core, we need to propagate
that down the tree so as to not cause starvation of existing tasks
based on previous vruntime.

Signed-off-by: Aaron Lu <ziqian.lzq@antfin.com>
---
 kernel/sched/core.c  | 15 +------
 kernel/sched/fair.c  | 99 +++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |  2 +
 3 files changed, 102 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9a1bd236044e..556bf054b896 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,19 +119,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b)
 	if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
 		return !dl_time_before(a->dl.deadline, b->dl.deadline);
 
-	if (pa == MAX_RT_PRIO + MAX_NICE)  { /* fair */
-		u64 vruntime = b->se.vruntime;
-
-		/*
-		 * Normalize the vruntime if tasks are in different cpus.
-		 */
-		if (task_cpu(a) != task_cpu(b)) {
-			vruntime -= task_cfs_rq(b)->min_vruntime;
-			vruntime += task_cfs_rq(a)->min_vruntime;
-		}
-
-		return !((s64)(a->se.vruntime - vruntime) <= 0);
-	}
+	if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
+		return cfs_prio_less(a, b);
 
 	return false;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d99ea6ee7af2..1c9a80d8dbb8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -449,9 +449,105 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
+static inline struct cfs_rq *root_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	return &rq_of(cfs_rq)->cfs;
+}
+
+static inline bool is_root_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq == root_cfs_rq(cfs_rq);
+}
+
+static inline struct cfs_rq *core_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	return &rq_of(cfs_rq)->core->cfs;
+}
+
 static inline u64 cfs_rq_min_vruntime(struct cfs_rq *cfs_rq)
 {
-	return cfs_rq->min_vruntime;
+	if (!sched_core_enabled(rq_of(cfs_rq)))
+		return cfs_rq->min_vruntime;
+
+	if (is_root_cfs_rq(cfs_rq))
+		return core_cfs_rq(cfs_rq)->min_vruntime;
+	else
+		return cfs_rq->min_vruntime;
+}
+
+static void coresched_adjust_vruntime(struct cfs_rq *cfs_rq, u64 delta)
+{
+	struct sched_entity *se, *next;
+
+	if (!cfs_rq)
+		return;
+
+	cfs_rq->min_vruntime -= delta;
+	rbtree_postorder_for_each_entry_safe(se, next,
+			&cfs_rq->tasks_timeline.rb_root, run_node) {
+		if (se->vruntime > delta)
+			se->vruntime -= delta;
+		if (se->my_q)
+			coresched_adjust_vruntime(se->my_q, delta);
+	}
+}
+
+static void update_core_cfs_rq_min_vruntime(struct cfs_rq *cfs_rq)
+{
+	struct cfs_rq *cfs_rq_core;
+
+	if (!sched_core_enabled(rq_of(cfs_rq)))
+		return;
+
+	if (!is_root_cfs_rq(cfs_rq))
+		return;
+
+	cfs_rq_core = core_cfs_rq(cfs_rq);
+	if (cfs_rq_core != cfs_rq &&
+	    cfs_rq->min_vruntime < cfs_rq_core->min_vruntime) {
+		u64 delta = cfs_rq_core->min_vruntime - cfs_rq->min_vruntime;
+		coresched_adjust_vruntime(cfs_rq_core, delta);
+	}
+}
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
+{
+	struct sched_entity *sea = &a->se;
+	struct sched_entity *seb = &b->se;
+	bool samecpu = task_cpu(a) == task_cpu(b);
+	struct task_struct *p;
+	s64 delta;
+
+	if (samecpu) {
+		/* vruntime is per cfs_rq */
+		while (!is_same_group(sea, seb)) {
+			int sea_depth = sea->depth;
+			int seb_depth = seb->depth;
+
+			if (sea_depth >= seb_depth)
+				sea = parent_entity(sea);
+			if (sea_depth <= seb_depth)
+				seb = parent_entity(seb);
+		}
+
+		delta = (s64)(sea->vruntime - seb->vruntime);
+		goto out;
+	}
+
+	/* crosscpu: compare root level se's vruntime to decide priority */
+	while (sea->parent)
+		sea = sea->parent;
+	while (seb->parent)
+		seb = seb->parent;
+	delta = (s64)(sea->vruntime - seb->vruntime);
+
+out:
+	p = delta > 0 ? b : a;
+	trace_printk("picked %s/%d %s: %Ld %Ld %Ld\n", p->comm, p->pid,
+			samecpu ? "samecpu" : "crosscpu",
+			sea->vruntime, seb->vruntime, delta);
+
+	return delta > 0;
 }
 
 static __always_inline
@@ -511,6 +607,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
 
 	/* ensure we never gain time by being placed backwards. */
 	cfs_rq->min_vruntime = max_vruntime(cfs_rq_min_vruntime(cfs_rq), vruntime);
+	update_core_cfs_rq_min_vruntime(cfs_rq);
 #ifndef CONFIG_64BIT
 	smp_wmb();
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a829e26fa43a..ef9e08e5da6a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2561,6 +2561,8 @@ static inline bool sched_energy_enabled(void) { return false; }
 
 #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
 
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b);
+
 #ifdef CONFIG_MEMBARRIER
 /*
  * The scheduler provides memory barriers required by membarrier between:
-- 
2.17.1


  parent reply	other threads:[~2020-03-04 17:01 UTC|newest]

Thread overview: 115+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-03-04 16:59 [RFC PATCH 00/13] Core scheduling v5 vpillai
2020-03-04 16:59 ` [RFC PATCH 01/13] sched: Wrap rq::lock access vpillai
2020-03-04 16:59 ` [RFC PATCH 02/13] sched: Introduce sched_class::pick_task() vpillai
2020-03-04 16:59 ` [RFC PATCH 03/13] sched: Core-wide rq->lock vpillai
2020-04-01 11:42   ` [PATCH] sched/arm64: store cpu topology before notify_cpu_starting Cheng Jian
2020-04-01 13:23     ` Valentin Schneider
2020-04-01 13:23       ` Valentin Schneider
2020-04-06  8:00       ` chengjian (D)
2020-04-06  8:00         ` chengjian (D)
2020-04-09  9:59       ` Sudeep Holla
2020-04-09  9:59         ` Sudeep Holla
2020-04-09 10:32         ` Valentin Schneider
2020-04-09 10:32           ` Valentin Schneider
2020-04-09 11:08           ` Sudeep Holla
2020-04-09 11:08             ` Sudeep Holla
2020-04-09 17:54     ` Joel Fernandes
2020-04-10 13:49       ` chengjian (D)
2020-04-14 11:36   ` [RFC PATCH 03/13] sched: Core-wide rq->lock Peter Zijlstra
2020-04-14 21:35     ` Vineeth Remanan Pillai
2020-04-15 10:55       ` Peter Zijlstra
2020-04-14 14:32   ` Peter Zijlstra
2020-03-04 16:59 ` [RFC PATCH 04/13] sched/fair: Add a few assertions vpillai
2020-03-04 16:59 ` [RFC PATCH 05/13] sched: Basic tracking of matching tasks vpillai
2020-03-04 16:59 ` [RFC PATCH 06/13] sched: Update core scheduler queue when taking cpu online/offline vpillai
2020-03-04 16:59 ` [RFC PATCH 07/13] sched: Add core wide task selection and scheduling vpillai
2020-04-14 13:35   ` Peter Zijlstra
2020-04-16 23:32     ` Tim Chen
2020-04-17 10:57       ` Peter Zijlstra
2020-04-16  3:39   ` Chen Yu
2020-04-16 19:59     ` Vineeth Remanan Pillai
2020-04-17 11:18     ` Peter Zijlstra
2020-04-19 15:31       ` Chen Yu
2020-05-21 23:14   ` Joel Fernandes
2020-05-21 23:16     ` Joel Fernandes
2020-05-22  2:35     ` Joel Fernandes
2020-05-22  3:44       ` Aaron Lu
2020-05-22 20:13         ` Joel Fernandes
2020-03-04 16:59 ` [RFC PATCH 08/13] sched/fair: wrapper for cfs_rq->min_vruntime vpillai
2020-03-04 16:59 ` vpillai [this message]
2020-04-14 13:56   ` [RFC PATCH 09/13] sched/fair: core wide vruntime comparison Peter Zijlstra
2020-04-15  3:34     ` Aaron Lu
2020-04-15  4:07       ` Aaron Lu
2020-04-15 21:24         ` Vineeth Remanan Pillai
2020-04-17  9:40           ` Aaron Lu
2020-04-20  8:07             ` [PATCH updated] sched/fair: core wide cfs task priority comparison Aaron Lu
2020-04-20 22:26               ` Vineeth Remanan Pillai
2020-04-21  2:51                 ` Aaron Lu
2020-04-24 14:24                   ` [PATCH updated v2] " Aaron Lu
2020-05-06 14:35                     ` Peter Zijlstra
2020-05-08  8:44                       ` Aaron Lu
2020-05-08  9:09                         ` Peter Zijlstra
2020-05-08 12:34                           ` Aaron Lu
2020-05-14 13:02                             ` Peter Zijlstra
2020-05-14 22:51                               ` Vineeth Remanan Pillai
2020-05-15 10:38                                 ` Peter Zijlstra
2020-05-15 10:43                                   ` Peter Zijlstra
2020-05-15 14:24                                   ` Vineeth Remanan Pillai
2020-05-16  3:42                               ` Aaron Lu
2020-05-22  9:40                                 ` Aaron Lu
2020-06-08  1:41                               ` Ning, Hongyu
2020-03-04 17:00 ` [RFC PATCH 10/13] sched: Trivial forced-newidle balancer vpillai
2020-03-04 17:00 ` [RFC PATCH 11/13] sched: migration changes for core scheduling vpillai
2020-06-12 13:21   ` Joel Fernandes
2020-06-12 21:32     ` Vineeth Remanan Pillai
2020-06-13  2:25       ` Joel Fernandes
2020-06-13 18:59         ` Vineeth Remanan Pillai
2020-06-15  2:05           ` Li, Aubrey
2020-03-04 17:00 ` [RFC PATCH 12/13] sched: cgroup tagging interface " vpillai
2020-06-26 15:06   ` Vineeth Remanan Pillai
2020-03-04 17:00 ` [RFC PATCH 13/13] sched: Debug bits vpillai
2020-03-04 17:36 ` [RFC PATCH 00/13] Core scheduling v5 Tim Chen
2020-03-04 17:42   ` Vineeth Remanan Pillai
2020-04-14 14:21 ` Peter Zijlstra
2020-04-15 16:32   ` Joel Fernandes
2020-04-17 11:12     ` Peter Zijlstra
2020-04-17 12:35       ` Alexander Graf
2020-04-17 13:08         ` Peter Zijlstra
2020-04-18  2:25       ` Joel Fernandes
2020-05-09 14:35   ` Dario Faggioli
     [not found] ` <38805656-2e2f-222a-c083-692f4b113313@linux.intel.com>
2020-05-09  3:39   ` Ning, Hongyu
2020-05-14 20:51     ` FW: " Gruza, Agata
2020-05-10 23:46 ` [PATCH RFC] Add support for core-wide protection of IRQ and softirq Joel Fernandes (Google)
2020-05-11 13:49   ` Peter Zijlstra
2020-05-11 14:54     ` Joel Fernandes
2020-05-20 22:26 ` [PATCH RFC] sched: Add a per-thread core scheduling interface Joel Fernandes (Google)
2020-05-21  4:09   ` [PATCH RFC] sched: Add a per-thread core scheduling interface(Internet mail) benbjiang(蒋彪)
2020-05-21 13:49     ` Joel Fernandes
2020-05-21  8:51   ` [PATCH RFC] sched: Add a per-thread core scheduling interface Peter Zijlstra
2020-05-21 13:47     ` Joel Fernandes
2020-05-21 20:20       ` Vineeth Remanan Pillai
2020-05-22 12:59       ` Peter Zijlstra
2020-05-22 21:35         ` Joel Fernandes
2020-05-24 14:00           ` Phil Auld
2020-05-28 14:51             ` Joel Fernandes
2020-05-28 17:01             ` Peter Zijlstra
2020-05-28 18:17               ` Phil Auld
2020-05-28 18:34                 ` Phil Auld
2020-05-28 18:23               ` Joel Fernandes
2020-05-21 18:31   ` Linus Torvalds
2020-05-21 20:40     ` Joel Fernandes
2020-05-21 21:58       ` Jesse Barnes
2020-05-22 16:33         ` Linus Torvalds
2020-05-20 22:37 ` [PATCH RFC v2] Add support for core-wide protection of IRQ and softirq Joel Fernandes (Google)
2020-05-20 22:48 ` [PATCH RFC] sched: Use sched-RCU in core-scheduling balancing logic Joel Fernandes (Google)
2020-05-21 22:52   ` Paul E. McKenney
2020-05-22  1:26     ` Joel Fernandes
2020-06-25 20:12 ` [RFC PATCH 00/13] Core scheduling v5 Vineeth Remanan Pillai
2020-06-26  1:47   ` Joel Fernandes
2020-06-26 14:36     ` Vineeth Remanan Pillai
2020-06-26 15:10       ` Joel Fernandes
2020-06-26 15:12         ` Joel Fernandes
2020-06-27 16:21         ` Joel Fernandes
2020-06-30 14:11         ` Phil Auld
2020-06-29 12:33   ` Li, Aubrey
2020-06-29 19:41     ` Vineeth Remanan Pillai

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=2f83d888890cec14be3a7aead0859dceebb4012f.1583332765.git.vpillai@digitalocean.com \
    --to=vpillai@digitalocean.com \
    --cc=aaron.lu@linux.alibaba.com \
    --cc=aaron.lwe@gmail.com \
    --cc=aubrey.intel@gmail.com \
    --cc=aubrey.li@linux.intel.com \
    --cc=fweisbec@gmail.com \
    --cc=jdesfossez@digitalocean.com \
    --cc=joel@joelfernandes.org \
    --cc=joelaf@google.com \
    --cc=keescook@chromium.org \
    --cc=kerrnel@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@techsingularity.net \
    --cc=mingo@kernel.org \
    --cc=naravamudan@digitalocean.com \
    --cc=pauld@redhat.com \
    --cc=pawan.kumar.gupta@linux.intel.com \
    --cc=pbonzini@redhat.com \
    --cc=peterz@infradead.org \
    --cc=pjt@google.com \
    --cc=tglx@linutronix.de \
    --cc=tim.c.chen@linux.intel.com \
    --cc=torvalds@linux-foundation.org \
    --cc=valentin.schneider@arm.com \
    --cc=ziqian.lzq@antfin.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.