From: Mel Gorman <mgorman@techsingularity.net>
To: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	Valentin Schneider <valentin.schneider@arm.com>,
	Phil Auld <pauld@redhat.com>, LKML <linux-kernel@vger.kernel.org>,
	Mel Gorman <mgorman@techsingularity.net>
Subject: [PATCH 4/4] sched/fair: Track possibly overloaded domains and abort a scan if necessary
Date: Fri, 20 Mar 2020 15:12:45 +0000	[thread overview]
Message-ID: <20200320151245.21152-5-mgorman@techsingularity.net> (raw)
In-Reply-To: <20200320151245.21152-1-mgorman@techsingularity.net>

Once a domain is overloaded, it is very unlikely that a free CPU will
be found in the short term, but a search can still scan a large number
of CPUs. This patch tracks whether a domain may be overloaded due to an
excessive number of running tasks relative to available CPUs. If a
domain is flagged as overloaded, the search is aborted early.
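Stripped of the scheduler plumbing, the marking heuristic from the
select_idle_cpu() hunk below can be sketched in isolation (the function
name here is illustrative; the condition itself matches the patch):

```c
#include <stdbool.h>

/*
 * Simplified from the select_idle_cpu() changes below: after scanning
 * nr_scanned CPUs that together had nr_running tasks, the domain is
 * considered overloaded when the scanned CPUs average more than two
 * running tasks each.
 */
static inline bool scan_saw_overload(int nr_scanned, int nr_running)
{
	return nr_scanned && nr_running > (nr_scanned << 1);
}
```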

This has a variable impact on performance for hackbench, which often
overloads the test machines used. There was a mix of performance gains
and losses, but search efficiency improves substantially.

On a 2-socket Broadwell machine with 80 cores in total, tbench showed
small gains and some losses:

Hmean     1        431.51 (   0.00%)      426.53 *  -1.15%*
Hmean     2        842.69 (   0.00%)      839.00 *  -0.44%*
Hmean     4       1631.09 (   0.00%)     1634.81 *   0.23%*
Hmean     8       3001.08 (   0.00%)     3020.85 *   0.66%*
Hmean     16      5631.75 (   0.00%)     5655.04 *   0.41%*
Hmean     32      9736.22 (   0.00%)     9645.68 *  -0.93%*
Hmean     64     13978.54 (   0.00%)    15215.65 *   8.85%*
Hmean     128    20093.06 (   0.00%)    19389.45 *  -3.50%*
Hmean     256    17491.34 (   0.00%)    18616.32 *   6.43%*
Hmean     320    17423.67 (   0.00%)    17793.38 *   2.12%*

However, the "SIS Domain Search Efficiency" went from 6.03% to 19.61%,
indicating that far fewer CPUs were scanned. The impact of the patch
is more noticeable when sockets have multiple L3 caches. While true of
2nd-generation EPYC, it is particularly noticeable on 1st-generation EPYC:

Hmean     1        325.30 (   0.00%)      324.92 *  -0.12%*
Hmean     2        630.77 (   0.00%)      621.35 *  -1.49%*
Hmean     4       1211.41 (   0.00%)     1148.51 *  -5.19%*
Hmean     8       2017.29 (   0.00%)     1953.57 *  -3.16%*
Hmean     16      4068.81 (   0.00%)     3514.06 * -13.63%*
Hmean     32      5588.20 (   0.00%)     6583.58 *  17.81%*
Hmean     64      8470.14 (   0.00%)    10117.26 *  19.45%*
Hmean     128    11462.06 (   0.00%)    17207.68 *  50.13%*
Hmean     256    11433.74 (   0.00%)    13446.93 *  17.61%*
Hmean     512    12576.88 (   0.00%)    13630.08 *   8.37%*

On this machine, search efficiency goes from 21.04% to 32.66%. There
is a noticeable regression at 16 clients, when there are enough of them
for an LLC domain to spill over.

With hackbench, the overload problem is a bit more obvious. On the
2-socket Broadwell machine using processes and pipes we see:

Amean     1        0.3023 (   0.00%)      0.2893 (   4.30%)
Amean     4        0.6823 (   0.00%)      0.6930 (  -1.56%)
Amean     7        1.0293 (   0.00%)      1.0380 (  -0.84%)
Amean     12       1.6913 (   0.00%)      1.7027 (  -0.67%)
Amean     21       2.9307 (   0.00%)      2.9297 (   0.03%)
Amean     30       4.0040 (   0.00%)      4.0270 (  -0.57%)
Amean     48       6.0703 (   0.00%)      6.1067 (  -0.60%)
Amean     79       9.0630 (   0.00%)      9.1223 *  -0.65%*
Amean     110     12.1917 (   0.00%)     12.1693 (   0.18%)
Amean     141     15.7150 (   0.00%)     15.4187 (   1.89%)
Amean     172     19.5327 (   0.00%)     18.9937 (   2.76%)
Amean     203     23.3093 (   0.00%)     22.2497 *   4.55%*
Amean     234     27.8657 (   0.00%)     25.9627 *   6.83%*
Amean     265     32.9783 (   0.00%)     29.5240 *  10.47%*
Amean     296     35.6727 (   0.00%)     32.8260 *   7.98%*

More of the SIS stats are worth examining in this case:

Ops SIS Domain Search       10390526707.00  9822163508.00
Ops SIS Scanned            223173467577.00 48330226094.00
Ops SIS Domain Scanned     222820381314.00 47964114165.00
Ops SIS Failures            10183794873.00  9639912418.00
Ops SIS Recent Used Hit        22194515.00    22517194.00
Ops SIS Recent Used Miss     5733847634.00  5500415074.00
Ops SIS Recent Attempts      5756042149.00  5522932268.00
Ops SIS Search Efficiency             4.81          21.08

SIS Domain Search Efficiency goes from 4.66% to 20.48%, and SIS Domain
Scanned shows the sheer volume of searching SIS does when the prev,
target and recent CPUs are unavailable.

The throttling could be much more aggressive, also cutting off the
search for idle cores. However, making that work properly requires a
much more intrusive series that is likely to be controversial. This
seemed like a reasonable tradeoff to tackle the most obvious problem
with select_idle_cpu().
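Because the throttling is guarded by a scheduler feature flag, it can
be toggled at runtime for comparison, assuming debugfs is mounted at
its usual location:

```shell
# Disable the overload throttling added by this patch
echo NO_SIS_OVERLOAD > /sys/kernel/debug/sched_features
# Re-enable it
echo SIS_OVERLOAD > /sys/kernel/debug/sched_features
```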

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 include/linux/sched/topology.h |  1 +
 kernel/sched/fair.c            | 65 +++++++++++++++++++++++++++++++++++++++---
 kernel/sched/features.h        |  3 ++
 3 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index af9319e4cfb9..76ec7a54f57b 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -66,6 +66,7 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
+	int		is_overloaded;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 41913fac68de..31e011e627db 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5924,6 +5924,38 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 	return new_cpu;
 }
 
+static inline void
+set_sd_overloaded(struct sched_domain_shared *sds, int val)
+{
+	if (!sds)
+		return;
+
+	WRITE_ONCE(sds->is_overloaded, val);
+}
+
+static inline bool test_sd_overloaded(struct sched_domain_shared *sds)
+{
+	return READ_ONCE(sds->is_overloaded);
+}
+
+/* Returns true if a previously overloaded domain is likely still overloaded. */
+static inline bool
+abort_sd_overloaded(struct sched_domain_shared *sds, int prev, int target)
+{
+	if (!sds || !test_sd_overloaded(sds))
+		return false;
+
+	/* Does target, or a suitable prev, have no more than one running task? */
+	if (cpu_rq(target)->nr_running <= 1 ||
+	    (prev != target && cpus_share_cache(prev, target) &&
+	     cpu_rq(prev)->nr_running <= 1)) {
+		set_sd_overloaded(sds, 0);
+		return false;
+	}
+
+	return true;
+}
+
 #ifdef CONFIG_SCHED_SMT
 DEFINE_STATIC_KEY_FALSE(sched_smt_present);
 EXPORT_SYMBOL_GPL(sched_smt_present);
@@ -6060,15 +6092,18 @@ static inline int select_idle_smt(struct task_struct *p, int target)
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
  * average idle time for this rq (as found in rq->avg_idle).
  */
-static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd,
+			   int prev, int target)
 {
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
 	struct sched_domain *this_sd;
+	struct sched_domain_shared *sds;
 	u64 avg_cost, avg_idle;
 	u64 time, cost;
 	s64 delta;
 	int this = smp_processor_id();
 	int cpu, nr = INT_MAX;
+	int nr_scanned = 0, nr_running = 0;
 
 	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
 	if (!this_sd)
@@ -6092,18 +6127,40 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 			nr = 4;
 	}
 
+	sds = rcu_dereference(per_cpu(sd_llc_shared, target));
+	if (sched_feat(SIS_OVERLOAD)) {
+		if (abort_sd_overloaded(sds, prev, target))
+			return -1;
+	}
+
 	time = cpu_clock(this);
 
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 
 	for_each_cpu_wrap(cpu, cpus, target) {
 		schedstat_inc(this_rq()->sis_scanned);
-		if (!--nr)
-			return -1;
+		if (!--nr) {
+			cpu = -1;
+			break;
+		}
 		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
 			break;
+		if (sched_feat(SIS_OVERLOAD)) {
+			nr_scanned++;
+			nr_running += cpu_rq(cpu)->nr_running;
+		}
 	}
 
+	/* Mark the domain as overloaded if no idle CPU was found. */
+	if (sched_feat(SIS_OVERLOAD) && (signed)cpu >= nr_cpumask_bits &&
+	    nr_scanned && nr_running > (nr_scanned << 1)) {
+		set_sd_overloaded(sds, 1);
+	}
+
+	/* Scan cost not accounted for if scan is throttled */
+	if (!nr)
+		return -1;
+
 	time = cpu_clock(this) - time;
 	cost = this_sd->avg_scan_cost;
 	delta = (s64)(time - cost) / 8;
@@ -6236,7 +6293,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if ((unsigned)i < nr_cpumask_bits)
 		return i;
 
-	i = select_idle_cpu(p, sd, target);
+	i = select_idle_cpu(p, sd, prev, target);
 	if ((unsigned)i < nr_cpumask_bits)
 		return i;
 
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 7481cd96f391..c36ae01910e2 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -57,6 +57,9 @@ SCHED_FEAT(TTWU_QUEUE, true)
 SCHED_FEAT(SIS_AVG_CPU, false)
 SCHED_FEAT(SIS_PROP, true)
 
+/* Limit scans if the domain is likely overloaded */
+SCHED_FEAT(SIS_OVERLOAD, true)
+
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
  * in a single rq->lock section. Default disabled because the
-- 
2.16.4



Thread overview: 16+ messages
2020-03-20 15:12 [PATCH 0/4] Throttle select_idle_sibling when a target domain is overloaded Mel Gorman
2020-03-20 15:12 ` [PATCH 1/4] sched/fair: Track efficiency of select_idle_sibling Mel Gorman
2020-03-23 13:30   ` Valentin Schneider
2020-03-23 13:55     ` Mel Gorman
2020-03-20 15:12 ` [PATCH 2/4] sched/fair: Track efficiency of task recent_used_cpu Mel Gorman
2020-03-23 13:30   ` Valentin Schneider
2020-03-20 15:12 ` [PATCH 3/4] sched/fair: Clear SMT siblings after determining the core is not idle Mel Gorman
2020-03-23 13:31   ` Valentin Schneider
2020-03-20 15:12 ` Mel Gorman [this message]
2020-03-20 15:48   ` [PATCH 4/4] sched/fair: Track possibly overloaded domains and abort a scan if necessary Vincent Guittot
2020-03-20 16:44     ` Mel Gorman
2020-03-20 16:54       ` Vincent Guittot
2020-03-20 17:43         ` Mel Gorman
2020-03-24 10:35           ` Vincent Guittot
2020-03-24 11:23             ` Mel Gorman
2020-04-02  7:59   ` [sched/fair] 15e7470dfc: hackbench.throughput 11.2% improvement kernel test robot
