[PATCH 2/2] sched/fair: Adjust the allowed NUMA imbalance when SD_NUMA spans multiple LLCs

From: Mel Gorman <mgorman@techsingularity.net>
To: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	Valentin Schneider <valentin.schneider@arm.com>,
	Aubrey Li <aubrey.li@linux.intel.com>,
	Barry Song <song.bao.hua@hisilicon.com>,
	Mike Galbraith <efault@gmx.de>,
	Srikar Dronamraju <srikar@linux.vnet.ibm.com>,
	Gautham Shenoy <gautham.shenoy@amd.com>,
	LKML <linux-kernel@vger.kernel.org>,
	Mel Gorman <mgorman@techsingularity.net>
Subject: [PATCH 2/2] sched/fair: Adjust the allowed NUMA imbalance when SD_NUMA spans multiple LLCs
Date: Fri, 10 Dec 2021 09:33:07 +0000	[thread overview]
Message-ID: <20211210093307.31701-3-mgorman@techsingularity.net> (raw)
In-Reply-To: <20211210093307.31701-1-mgorman@techsingularity.net>

Commit 7d2b5dd0bcc4 ("sched/numa: Allow a floating imbalance between NUMA
nodes") allowed an imbalance between NUMA nodes such that communicating
tasks would not be pulled apart by the load balancer. This works fine when
there is a 1:1 relationship between LLC and node but can be suboptimal
for multiple LLCs if independent tasks prematurely use CPUs sharing cache.

Zen* has multiple LLCs per node with local memory channels and due to
the allowed imbalance, it's far harder to tune some workloads to run
optimally than it is on hardware that has 1 LLC per node. This patch
adjusts the imbalance on multi-LLC machines to allow an imbalance up to
the point where LLCs should be balanced between nodes.

On a Zen3 machine running STREAM parallelised with OMP to have one instance
per LLC and without binding, the results are

                            5.16.0-rc1             5.16.0-rc1
                               vanilla       sched-numaimb-v4
MB/sec copy-16    166712.18 (   0.00%)   651540.22 ( 290.82%)
MB/sec scale-16   140109.66 (   0.00%)   382254.74 ( 172.83%)
MB/sec add-16     160791.18 (   0.00%)   623073.98 ( 287.51%)
MB/sec triad-16   160043.84 (   0.00%)   633964.52 ( 296.12%)

STREAM can use directives to force the spread if the OpenMP is new
enough but that doesn't help if an application uses threads and
it's not known in advance how many threads will be created.

Coremark is a CPU and cache intensive benchmark parallelised with
threads. When running with 1 thread per instance, the vanilla kernel
allows threads to contend on cache. With the patch:

                               5.16.0-rc1             5.16.0-rc1
                                  vanilla    sched-numaimb-v4r24
Min       Score-16   367816.09 (   0.00%)   384015.36 (   4.40%)
Hmean     Score-16   389627.78 (   0.00%)   431907.14 *  10.85%*
Max       Score-16   416178.96 (   0.00%)   480120.03 (  15.36%)
Stddev    Score-16    17361.82 (   0.00%)    32505.34 ( -87.22%)
CoeffVar  Score-16        4.45 (   0.00%)        7.49 ( -68.30%)

It can also make a big difference for semi-realistic workloads
like specjbb which can execute arbitrary numbers of threads without
advance knowledge of how they should be placed:

                               5.16.0-rc1             5.16.0-rc1
                                  vanilla       sched-numaimb-v4
Hmean     tput-1      73743.05 (   0.00%)    70258.27 *  -4.73%*
Hmean     tput-8     563036.51 (   0.00%)   591187.39 (   5.00%)
Hmean     tput-16   1016590.61 (   0.00%)  1032311.78 (   1.55%)
Hmean     tput-24   1418558.41 (   0.00%)  1424005.80 (   0.38%)
Hmean     tput-32   1608794.22 (   0.00%)  1907855.80 *  18.59%*
Hmean     tput-40   1761338.13 (   0.00%)  2108162.23 *  19.69%*
Hmean     tput-48   2290646.54 (   0.00%)  2214383.47 (  -3.33%)
Hmean     tput-56   2463345.12 (   0.00%)  2780216.58 *  12.86%*
Hmean     tput-64   2650213.53 (   0.00%)  2598196.66 (  -1.96%)
Hmean     tput-72   2497253.28 (   0.00%)  2998882.47 *  20.09%*
Hmean     tput-80   2820786.72 (   0.00%)  2951655.27 (   4.64%)
Hmean     tput-88   2813541.68 (   0.00%)  3045450.86 *   8.24%*
Hmean     tput-96   2604158.67 (   0.00%)  3035311.91 *  16.56%*
Hmean     tput-104  2713810.62 (   0.00%)  2984270.04 (   9.97%)
Hmean     tput-112  2558425.37 (   0.00%)  2894737.46 *  13.15%*
Hmean     tput-120  2611434.93 (   0.00%)  2781661.01 (   6.52%)
Hmean     tput-128  2706103.22 (   0.00%)  2811447.85 (   3.89%)

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 include/linux/sched/topology.h |  1 +
 kernel/sched/fair.c            | 36 +++++++++++++++++----------------
 kernel/sched/topology.c        | 37 ++++++++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index c07bfa2d80f2..54f5207154d3 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -93,6 +93,7 @@ struct sched_domain {
 	unsigned int busy_factor;	/* less balancing by factor if busy */
 	unsigned int imbalance_pct;	/* No balance until over watermark */
 	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
+	unsigned int imb_numa_nr;	/* Nr imbalanced tasks allowed between nodes */
 
 	int nohz_idle;			/* NOHZ IDLE status */
 	int flags;			/* See SD_* */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0a969affca76..972ba586b113 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1489,6 +1489,7 @@ struct task_numa_env {
 
 	int src_cpu, src_nid;
 	int dst_cpu, dst_nid;
+	int imb_numa_nr;
 
 	struct numa_stats src_stats, dst_stats;
 
@@ -1504,7 +1505,8 @@ static unsigned long cpu_load(struct rq *rq);
 static unsigned long cpu_runnable(struct rq *rq);
 static unsigned long cpu_util(int cpu);
 static inline long adjust_numa_imbalance(int imbalance,
-					int dst_running, int dst_weight);
+					int dst_running, int dst_weight,
+					int imb_numa_nr);
 
 static inline enum
 numa_type numa_classify(unsigned int imbalance_pct,
@@ -1885,7 +1887,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
 		dst_running = env->dst_stats.nr_running + 1;
 		imbalance = max(0, dst_running - src_running);
 		imbalance = adjust_numa_imbalance(imbalance, dst_running,
-							env->dst_stats.weight);
+						  env->dst_stats.weight,
+						  env->imb_numa_nr);
 
 		/* Use idle CPU if there is no imbalance */
 		if (!imbalance) {
@@ -1950,8 +1953,10 @@ static int task_numa_migrate(struct task_struct *p)
 	 */
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
-	if (sd)
+	if (sd) {
 		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+		env.imb_numa_nr = sd->imb_numa_nr;
+	}
 	rcu_read_unlock();
 
 	/*
@@ -9186,12 +9191,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 				return idlest;
 #endif
 			/*
-			 * Otherwise, keep the task on this node to stay close
-			 * its wakeup source and improve locality. If there is
-			 * a real need of migration, periodic load balance will
-			 * take care of it.
+			 * Otherwise, keep the task on this node to stay local
+			 * to its wakeup source if the number of running tasks
+			 * is below the allowed imbalance. If there is a real
+			 * need of migration, periodic load balance will take
+			 * care of it.
 			 */
-			if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
+			if (local_sgs.sum_nr_running <= sd->imb_numa_nr)
 				return NULL;
 		}
 
@@ -9280,19 +9286,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	}
 }
 
-#define NUMA_IMBALANCE_MIN 2
-
 static inline long adjust_numa_imbalance(int imbalance,
-				int dst_running, int dst_weight)
+				int dst_running, int dst_weight,
+				int imb_numa_nr)
 {
 	if (!allow_numa_imbalance(dst_running, dst_weight))
 		return imbalance;
 
-	/*
-	 * Allow a small imbalance based on a simple pair of communicating
-	 * tasks that remain local when the destination is lightly loaded.
-	 */
-	if (imbalance <= NUMA_IMBALANCE_MIN)
+	if (imbalance <= imb_numa_nr)
 		return 0;
 
 	return imbalance;
@@ -9397,7 +9398,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 		/* Consider allowing a small imbalance between NUMA groups */
 		if (env->sd->flags & SD_NUMA) {
 			env->imbalance = adjust_numa_imbalance(env->imbalance,
-				busiest->sum_nr_running, env->sd->span_weight);
+				busiest->sum_nr_running, env->sd->span_weight,
+				env->sd->imb_numa_nr);
 		}
 
 		return;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d201a7052a29..bacec575ade2 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2242,6 +2242,43 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		}
 	}
 
+	/*
+	 * Calculate an allowed NUMA imbalance such that LLCs do not get
+	 * imbalanced.
+	 */
+	for_each_cpu(i, cpu_map) {
+		unsigned int imb = 0;
+		unsigned int imb_span = 1;
+
+		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+			struct sched_domain *child = sd->child;
+
+			if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child &&
+			    (child->flags & SD_SHARE_PKG_RESOURCES)) {
+				struct sched_domain *top = sd;
+				unsigned int llc_sq;
+
+				/*
+				 * nr_llcs = (top->span_weight / llc_weight);
+				 * imb = (llc_weight / nr_llcs) >> 2
+				 *
+				 * is equivalent to
+				 *
+				 * imb = (llc_weight^2 / top->span_weight) >> 2
+				 *
+				 */
+				llc_sq = child->span_weight * child->span_weight;
+
+				imb = max(2U, ((llc_sq / top->span_weight) >> 2));
+				imb_span = sd->span_weight;
+
+				sd->imb_numa_nr = imb;
+			} else {
+				sd->imb_numa_nr = imb * (sd->span_weight / imb_span);
+			}
+		}
+	}
+
 	/* Calculate CPU capacity for physical packages and nodes */
 	for (i = nr_cpumask_bits-1; i >= 0; i--) {
 		if (!cpumask_test_cpu(i, cpu_map))
-- 
2.31.1