From: Mel Gorman <mgorman@techsingularity.net>
To: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>,
Vincent Guittot <vincent.guittot@linaro.org>,
Juri Lelli <juri.lelli@redhat.com>,
Dietmar Eggemann <dietmar.eggemann@arm.com>,
Steven Rostedt <rostedt@goodmis.org>,
Ben Segall <bsegall@google.com>,
Valentin Schneider <valentin.schneider@arm.com>,
Phil Auld <pauld@redhat.com>, Hillf Danton <hdanton@sina.com>,
LKML <linux-kernel@vger.kernel.org>,
Mel Gorman <mgorman@techsingularity.net>
Subject: [PATCH 05/13] sched/numa: Replace runnable_load_avg by load_avg
Date: Wed, 19 Feb 2020 14:07:28 +0000 [thread overview]
Message-ID: <20200219140736.20499-6-mgorman@techsingularity.net> (raw)
In-Reply-To: <20200219140736.20499-1-mgorman@techsingularity.net>
From: Vincent Guittot <vincent.guittot@linaro.org>
Similarly to what has been done for the normal load balancer, we can
replace runnable_load_avg by load_avg in numa load balancing and track the
other statistics like the utilization and the number of running tasks to
get a better view of the current state of a node.
[mgorman@techsingularity.net: Remove premature definition of cpu_runnable_load]
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
kernel/sched/fair.c | 97 ++++++++++++++++++++++++++++++++++++++---------------
1 file changed, 70 insertions(+), 27 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4395951b1530..52e74b53d6e7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1473,38 +1473,35 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-
-static unsigned long cpu_runnable_load(struct rq *rq)
-{
- return cfs_rq_runnable_load_avg(&rq->cfs);
-}
+/*
+ * 'numa_type' describes the node at the moment of load balancing.
+ */
+enum numa_type {
+ /* The node has spare capacity that can be used to run more tasks. */
+ node_has_spare = 0,
+ /*
+ * The node is fully used and the tasks don't compete for more CPU
+ * cycles. Nevertheless, some tasks might wait before running.
+ */
+ node_fully_busy,
+ /*
+ * The node is overloaded and can't provide expected CPU cycles to all
+ * tasks.
+ */
+ node_overloaded
+};
/* Cached statistics for all CPUs within a node */
struct numa_stats {
unsigned long load;
-
+ unsigned long util;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
+ unsigned int nr_running;
+ unsigned int weight;
+ enum numa_type node_type;
};
-/*
- * XXX borrowed from update_sg_lb_stats
- */
-static void update_numa_stats(struct numa_stats *ns, int nid)
-{
- int cpu;
-
- memset(ns, 0, sizeof(*ns));
- for_each_cpu(cpu, cpumask_of_node(nid)) {
- struct rq *rq = cpu_rq(cpu);
-
- ns->load += cpu_runnable_load(rq);
- ns->compute_capacity += capacity_of(cpu);
- }
-
-}
-
struct task_numa_env {
struct task_struct *p;
@@ -1521,6 +1518,47 @@ struct task_numa_env {
int best_cpu;
};
+static unsigned long cpu_load(struct rq *rq);
+static unsigned long cpu_util(int cpu);
+
+static inline enum
+numa_type numa_classify(unsigned int imbalance_pct,
+ struct numa_stats *ns)
+{
+ if ((ns->nr_running > ns->weight) &&
+ ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
+ return node_overloaded;
+
+ if ((ns->nr_running < ns->weight) ||
+ ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
+ return node_has_spare;
+
+ return node_fully_busy;
+}
+
+/*
+ * XXX borrowed from update_sg_lb_stats
+ */
+static void update_numa_stats(struct task_numa_env *env,
+ struct numa_stats *ns, int nid)
+{
+ int cpu;
+
+ memset(ns, 0, sizeof(*ns));
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
+ struct rq *rq = cpu_rq(cpu);
+
+ ns->load += cpu_load(rq);
+ ns->util += cpu_util(cpu);
+ ns->nr_running += rq->cfs.h_nr_running;
+ ns->compute_capacity += capacity_of(cpu);
+ }
+
+ ns->weight = cpumask_weight(cpumask_of_node(nid));
+
+ ns->node_type = numa_classify(env->imbalance_pct, ns);
+}
+
static void task_numa_assign(struct task_numa_env *env,
struct task_struct *p, long imp)
{
@@ -1556,6 +1594,11 @@ static bool load_too_imbalanced(long src_load, long dst_load,
long orig_src_load, orig_dst_load;
long src_capacity, dst_capacity;
+
+ /* If dst node has spare capacity, there is no real load imbalance */
+ if (env->dst_stats.node_type == node_has_spare)
+ return false;
+
/*
* The load is corrected for the CPU capacity available on each node.
*
@@ -1788,10 +1831,10 @@ static int task_numa_migrate(struct task_struct *p)
dist = env.dist = node_distance(env.src_nid, env.dst_nid);
taskweight = task_weight(p, env.src_nid, dist);
groupweight = group_weight(p, env.src_nid, dist);
- update_numa_stats(&env.src_stats, env.src_nid);
+ update_numa_stats(&env, &env.src_stats, env.src_nid);
taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
- update_numa_stats(&env.dst_stats, env.dst_nid);
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid);
/* Try to find a spot on the preferred nid. */
task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1824,7 +1867,7 @@ static int task_numa_migrate(struct task_struct *p)
env.dist = dist;
env.dst_nid = nid;
- update_numa_stats(&env.dst_stats, env.dst_nid);
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid);
task_numa_find_cpu(&env, taskimp, groupimp);
}
}
--
2.16.4
next prev parent reply other threads:[~2020-02-19 14:08 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-02-19 14:07 [PATCH 00/13] Reconcile NUMA balancing decisions with the load balancer v5 Mel Gorman
2020-02-19 14:07 ` [PATCH 01/13] sched/fair: Allow a per-CPU kthread waking a task to stack on the same CPU, to fix XFS performance regression Mel Gorman
2020-02-19 14:07 ` [PATCH 02/13] sched/numa: Trace when no candidate CPU was found on the preferred node Mel Gorman
2020-02-19 14:07 ` [PATCH 03/13] sched/numa: Distinguish between the different task_numa_migrate failure cases Mel Gorman
2020-02-19 14:07 ` [PATCH 04/13] sched/fair: Reorder enqueue/dequeue_task_fair path Mel Gorman
2020-02-19 14:07 ` Mel Gorman [this message]
2020-02-19 14:07 ` [PATCH 06/13] sched/numa: Use similar logic to the load balancer for moving between domains with spare capacity Mel Gorman
2020-02-19 14:07 ` [PATCH 07/13] sched/pelt: Remove unused runnable load average Mel Gorman
2020-02-19 14:07 ` [PATCH 08/13] sched/pelt: Add a new runnable average signal Mel Gorman
2020-02-19 14:07 ` [PATCH 09/13] sched/fair: Take into account runnable_avg to classify group Mel Gorman
2020-02-19 14:07 ` [PATCH 10/13] sched/numa: Prefer using an idle cpu as a migration target instead of comparing tasks Mel Gorman
2020-02-19 14:07 ` [PATCH 11/13] sched/numa: Find an alternative idle CPU if the CPU is part of an active NUMA balance Mel Gorman
2020-02-19 14:07 ` [PATCH 12/13] sched/numa: Bias swapping tasks based on their preferred node Mel Gorman
2020-02-19 14:07 ` [PATCH 13/13] sched/numa: Stop an exhastive search if a reasonable swap candidate or idle CPU is found Mel Gorman
-- strict thread matches above, loose matches on Subject: below --
2020-02-24 9:52 [PATCH 00/13] Reconcile NUMA balancing decisions with the load balancer v6 Mel Gorman
2020-02-24 9:52 ` [PATCH 05/13] sched/numa: Replace runnable_load_avg by load_avg Mel Gorman
2020-02-19 13:54 [PATCH 00/13] Reconcile NUMA balancing decisions with the load balancer v4 Mel Gorman
2020-02-19 13:54 ` [PATCH 05/13] sched/numa: Replace runnable_load_avg by load_avg Mel Gorman
2020-02-17 10:43 [PATCH 00/13] Reconcile NUMA balancing decisions with the load balancer v3 Mel Gorman
2020-02-17 10:43 ` [PATCH 05/13] sched/numa: Replace runnable_load_avg by load_avg Mel Gorman
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200219140736.20499-6-mgorman@techsingularity.net \
--to=mgorman@techsingularity.net \
--cc=bsegall@google.com \
--cc=dietmar.eggemann@arm.com \
--cc=hdanton@sina.com \
--cc=juri.lelli@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@kernel.org \
--cc=pauld@redhat.com \
--cc=peterz@infradead.org \
--cc=rostedt@goodmis.org \
--cc=valentin.schneider@arm.com \
--cc=vincent.guittot@linaro.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).