linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Mel Gorman <mgorman@techsingularity.net>
To: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Ingo Molnar <mingo@kernel.org>,
	Peter Zijlstra <peterz@infradead.org>,
	Juri Lelli <juri.lelli@redhat.com>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Ben Segall <bsegall@google.com>,
	Valentin Schneider <valentin.schneider@arm.com>,
	Phil Auld <pauld@redhat.com>, LKML <linux-kernel@vger.kernel.org>
Subject: [PATCH 10/11] sched/numa: Use similar logic to the load balancer for moving between domains with spare capacity
Date: Wed, 12 Feb 2020 15:46:06 +0000	[thread overview]
Message-ID: <20200212154606.GO3466@techsingularity.net> (raw)
In-Reply-To: <20200212093654.4816-1-mgorman@techsingularity.net>

The standard load balancer generally allows an imbalance to exist if
a domain has spare capacity. This patch uses similar logic within NUMA
balancing when moving a task to a preferred node. This is not a perfect
comparison with the load balancer but should be a close enough match
when the destination domain has spare capacity and the imbalance is not
too large.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 kernel/sched/fair.c | 114 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 79 insertions(+), 35 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b2476ef0b056..69e41204cfae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1473,21 +1473,19 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-
-static unsigned long cpu_runnable_load(struct rq *rq)
-{
-	return cfs_rq_runnable_load_avg(&rq->cfs);
-}
-
 /* Cached statistics for all CPUs within a node */
 struct numa_stats {
-	unsigned long load;
+	unsigned long group_load;
+	unsigned long group_util;
 
 	/* Total compute capacity of CPUs on a node */
-	unsigned long compute_capacity;
+	unsigned long group_capacity;
+
+	unsigned int sum_nr_running;
 
 	/* Details on idle CPUs */
+	unsigned int group_weight;
+	int nr_idle;
 	int idle_cpu;
 };
 
@@ -1511,6 +1509,22 @@ static inline bool is_core_idle(int cpu)
 /* Forward declarations of select_idle_sibling helpers */
 static inline bool test_idle_cores(int cpu, bool def);
 
+/* Forward declarations of lb helpers */
+static unsigned long cpu_load(struct rq *rq);
+static inline unsigned long cpu_util(int cpu);
+static inline bool __lb_has_capacity(unsigned int imbalance_pct,
+	unsigned int sum_nr_running, unsigned int group_weight,
+	unsigned long group_capacity, unsigned long group_util);
+static inline long adjust_numa_imbalance(int imbalance, int src_nr_running);
+
+/* NUMA Balancing equivalents for LB helpers */
+static inline bool
+numa_has_capacity(unsigned int imbalance_pct, struct numa_stats *ns)
+{
+	return __lb_has_capacity(imbalance_pct, ns->sum_nr_running + 1,
+		ns->group_weight, ns->group_capacity, ns->group_util);
+}
+
 /*
  * Gather all necessary information to make NUMA balancing placement
  * decisions that are compatible with standard load balanced. This
@@ -1529,14 +1543,20 @@ update_numa_stats(struct numa_stats *ns, int nid,
 	ns->idle_cpu = -1;
 	for_each_cpu(cpu, cpumask_of_node(nid)) {
 		struct rq *rq = cpu_rq(cpu);
+		unsigned int nr_running = rq->nr_running;
 
-		ns->load += cpu_runnable_load(rq);
-		ns->compute_capacity += capacity_of(cpu);
+		ns->group_load += cpu_load(rq);
+		ns->group_util += cpu_util(cpu);
+		ns->group_capacity += capacity_of(cpu);
+		ns->group_weight++;
+		ns->sum_nr_running += nr_running;
 
-		if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
+		if (!nr_running && idle_cpu(cpu)) {
 			int this_llc_id;
 
-			if (READ_ONCE(rq->numa_migrate_on) ||
+			ns->nr_idle++;
+
+			if (!find_idle || READ_ONCE(rq->numa_migrate_on) ||
 			    !cpumask_test_cpu(cpu, p->cpus_ptr))
 				continue;
 
@@ -1646,13 +1666,13 @@ static bool load_too_imbalanced(long src_load, long dst_load,
 	 * ------------ vs ---------
 	 * src_capacity    dst_capacity
 	 */
-	src_capacity = env->src_stats.compute_capacity;
-	dst_capacity = env->dst_stats.compute_capacity;
+	src_capacity = env->src_stats.group_capacity;
+	dst_capacity = env->dst_stats.group_capacity;
 
 	imb = abs(dst_load * src_capacity - src_load * dst_capacity);
 
-	orig_src_load = env->src_stats.load;
-	orig_dst_load = env->dst_stats.load;
+	orig_src_load = env->src_stats.group_load;
+	orig_dst_load = env->dst_stats.group_load;
 
 	old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
 
@@ -1799,8 +1819,8 @@ static void task_numa_compare(struct task_numa_env *env,
 	if (!load)
 		goto assign;
 
-	dst_load = env->dst_stats.load + load;
-	src_load = env->src_stats.load - load;
+	dst_load = env->dst_stats.group_load + load;
+	src_load = env->src_stats.group_load - load;
 
 	if (load_too_imbalanced(src_load, dst_load, env))
 		goto unlock;
@@ -1838,23 +1858,38 @@ static void task_numa_find_cpu(struct task_numa_env *env,
 	bool maymove = false;
 	int cpu;
 
-	load = task_h_load(env->p);
-	dst_load = env->dst_stats.load + load;
-	src_load = env->src_stats.load - load;
-
 	/*
-	 * If the improvement from just moving env->p direction is better
-	 * than swapping tasks around, check if a move is possible.
+	 * If the load balancer is unlikely to interfere with the task after
+	 * a migration then use an idle CPU.
 	 */
-	maymove = !load_too_imbalanced(src_load, dst_load, env);
+	if (env->dst_stats.idle_cpu >= 0) {
+		unsigned int imbalance;
+		int src_running, dst_running;
 
-	/* Use an idle CPU if one has been found already */
-	if (maymove && env->dst_stats.idle_cpu >= 0) {
-		env->dst_cpu = env->dst_stats.idle_cpu;
-		task_numa_assign(env, NULL, 0);
-		return;
+		/* Would movement cause an imbalance? */
+		src_running = env->src_stats.sum_nr_running - 1;
+		dst_running = env->src_stats.sum_nr_running + 1;
+		imbalance = max(0, dst_running - src_running);
+		imbalance = adjust_numa_imbalance(imbalance, src_running);
+
+		/* Use idle CPU there is spare capacity and no imbalance */
+		if (numa_has_capacity(env->imbalance_pct, &env->dst_stats) &&
+		    !imbalance) {
+			env->dst_cpu = env->dst_stats.idle_cpu;
+			task_numa_assign(env, NULL, 0);
+			return;
+		}
 	}
 
+	/*
+	 * If using an idle CPU would cause an imbalance that would likely
+	 * be overridden by the load balancer, consider the load instead.
+	 */
+	load = task_h_load(env->p);
+	dst_load = env->dst_stats.group_load + load;
+	src_load = env->src_stats.group_load - load;
+	maymove = !load_too_imbalanced(src_load, dst_load, env);
+
 	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
 		/* Skip this CPU if the source task cannot migrate */
 		if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
@@ -8048,18 +8083,27 @@ static inline int sg_imbalanced(struct sched_group *group)
  * any benefit for the load balance.
  */
 static inline bool
-group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
+__lb_has_capacity(unsigned int imbalance_pct, unsigned int sum_nr_running,
+	unsigned int group_weight, unsigned long group_capacity,
+	unsigned long group_util)
 {
-	if (sgs->sum_nr_running < sgs->group_weight)
+	if (sum_nr_running < group_weight)
 		return true;
 
-	if ((sgs->group_capacity * 100) >
-			(sgs->group_util * imbalance_pct))
+	if ((group_capacity * 100) >
+			(group_util * imbalance_pct))
 		return true;
 
 	return false;
 }
 
+static inline bool
+group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
+{
+	return __lb_has_capacity(imbalance_pct, sgs->sum_nr_running,
+		sgs->group_weight, sgs->group_capacity, sgs->group_util);
+}
+
 /*
  *  group_is_overloaded returns true if the group has more tasks than it can
  *  handle.
-- 
2.16.4


  parent reply	other threads:[~2020-02-12 15:46 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-02-12  9:36 [RFC PATCH 00/11] Reconcile NUMA balancing decisions with the load balancer Mel Gorman
2020-02-12  9:36 ` [PATCH 01/11] sched/fair: Allow a small load imbalance between low utilisation SD_NUMA domains Mel Gorman
2020-02-12  9:36 ` [PATCH 02/11] sched/fair: Optimize select_idle_core() Mel Gorman
2020-02-12  9:36 ` [PATCH 03/11] sched/fair: Allow a per-CPU kthread waking a task to stack on the same CPU, to fix XFS performance regression Mel Gorman
2020-02-12  9:36 ` [PATCH 04/11] sched/numa: Trace when no candidate CPU was found on the preferred node Mel Gorman
2020-02-12  9:36 ` [PATCH 05/11] sched/numa: Distinguish between the different task_numa_migrate failure cases Mel Gorman
2020-02-12 14:43   ` Steven Rostedt
2020-02-12 15:59     ` Mel Gorman
2020-02-12  9:36 ` [PATCH 06/11] sched/numa: Prefer using an idle cpu as a migration target instead of comparing tasks Mel Gorman
2020-02-12  9:36 ` [PATCH 07/11] sched/numa: Find an alternative idle CPU if the CPU is part of an active NUMA balance Mel Gorman
2020-02-12  9:36 ` [PATCH 08/11] sched/numa: Bias swapping tasks based on their preferred node Mel Gorman
2020-02-13 10:31   ` Peter Zijlstra
2020-02-13 11:18     ` Mel Gorman
2020-02-12 13:22 ` [RFC PATCH 00/11] Reconcile NUMA balancing decisions with the load balancer Vincent Guittot
2020-02-12 14:07   ` Valentin Schneider
2020-02-12 15:48   ` Mel Gorman
2020-02-12 16:13     ` Vincent Guittot
2020-02-12 15:45 ` [PATCH 09/11] sched/fair: Split out helper to adjust imbalances between domains Mel Gorman
2020-02-12 15:46 ` Mel Gorman [this message]
2020-02-12 15:46 ` [PATCH 11/11] sched/numa: Use similar logic to the load balancer for moving between overloaded domains Mel Gorman
     [not found] ` <20200214041232.18904-1-hdanton@sina.com>
2020-02-14  7:50   ` [PATCH 08/11] sched/numa: Bias swapping tasks based on their preferred node Mel Gorman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200212154606.GO3466@techsingularity.net \
    --to=mgorman@techsingularity.net \
    --cc=bsegall@google.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=juri.lelli@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=pauld@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=valentin.schneider@arm.com \
    --cc=vincent.guittot@linaro.org \
    --subject='Re: [PATCH 10/11] sched/numa: Use similar logic to the load balancer for moving between domains with spare capacity' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).