* [PATCH] sched: Introduce scaled capacity awareness in enqueue
@ 2017-05-22 20:48 Rohit Jain
  2017-05-30 12:28 ` Peter Zijlstra
  0 siblings, 1 reply; 10+ messages in thread
From: Rohit Jain @ 2017-05-22 20:48 UTC (permalink / raw)
  To: linux-kernel
  Cc: peterz, mingo, vincent.guittot, morten.rasmussen, dietmar.eggemann

This patch introduces capacity awareness in the scheduler (CAS), which
avoids CPUs whose capacities might be reduced (due to IRQ/RT activity)
when scheduling threads (on the push side). This awareness has been
added to the fair scheduling class.

It does so using the following algorithm (a simplified sketch follows
the list):
--------------------------------------------------------------------------
1) The scaled capacities are already calculated (as part of the rt_avg
accounting).

2) This scaled capacity is normalized and mapped into buckets.

3) Any CPU which lies below the 80th percentile in terms of percentage
capacity available is considered a low-capacity CPU.

4) During the scheduler's idle-CPU search, this information is used to
skip low-capacity CPUs if better ones are available.

5) If none of the CPUs are better in terms of idleness and capacity, then
the low-capacity CPU is considered to be the best available CPU.
---------------------------------------------------------------------------
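
For illustration, here is a simplified user-space sketch of steps 2) and
3). This is not the patch code itself: the in-kernel version further
below uses atomics, per-rq state and hooks into calc_global_load(), and
the helper names here (capload_of(), buckets[]) are made up for the
example.

#include <stdio.h>

#define CAPACITY_PRECISION      1024
#define CAPACITY_BUCKET_SZ      8
#define CAPACITY_BUCKET_SHIFT   3
#define CAPACITY_THRS_PCT       820     /* ~80%, expressed out of 1024 */
#define CAPACITY_PRCSN_SHIFT    10
#define NR_BUCKETS              (CAPACITY_PRECISION / CAPACITY_BUCKET_SZ)

static unsigned int buckets[NR_BUCKETS];

/* Step 2: normalize a CPU's scaled capacity against the max capacity. */
static unsigned int capload_of(unsigned long capacity, unsigned long max_cap)
{
        unsigned long capload = (capacity * CAPACITY_PRECISION) / max_cap;

        return capload >= CAPACITY_PRECISION ? CAPACITY_PRECISION - 1 : capload;
}

/*
 * Step 3: walk the buckets from the highest capacity down until ~80% of
 * the CPUs are covered; any capload below the returned threshold is
 * treated as "low capacity".
 */
static unsigned int capacity_threshold(unsigned int nr_cpus)
{
        unsigned int covered = 0;
        int b = NR_BUCKETS - 1;

        while (covered < ((nr_cpus * CAPACITY_THRS_PCT) >> CAPACITY_PRCSN_SHIFT) &&
               b >= 0) {
                covered += buckets[b];
                b--;
        }
        return b < 0 ? 0 : (unsigned int)(b << CAPACITY_BUCKET_SHIFT);
}

int main(void)
{
        /* Example: 4 CPUs, one of them heavily loaded by IRQ/RT activity. */
        unsigned long max_cap = 1024, caps[] = { 1024, 1000, 980, 400 };
        unsigned int i;

        for (i = 0; i < 4; i++)
                buckets[capload_of(caps[i], max_cap) >> CAPACITY_BUCKET_SHIFT]++;

        /* Prints 968 here: only the CPU with capload 400 falls below it. */
        printf("threshold capload = %u\n", capacity_threshold(4));
        return 0;
}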


The performance numbers:
---------------------------------------------------------------------------
CAS shows up to 2% improvement on x86 when running an OLTP select
workload. I also used barrier.c (OpenMP code) as a microbenchmark; it
does a number of iterations with a barrier sync at the end of each loop
(a minimal sketch follows the link below).

I was also running ping on CPU 0 as:
'ping -l 10000 -q -s 10 -f host2'

The program (barrier.c) can be found at:
http://www.spinics.net/lists/kernel/msg2506955.html
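
For reference, the structure of that microbenchmark is roughly the
following. This is only a minimal OpenMP sketch of the idea, not the
actual barrier.c from the link above; build with 'gcc -fopenmp'.

#include <omp.h>
#include <stdio.h>

int main(void)
{
        const int iters = 100000;
        double start = omp_get_wtime();

        #pragma omp parallel
        {
                volatile double x = 0.0;
                int i;

                for (i = 0; i < iters; i++) {
                        x += i * 0.5;           /* trivial per-thread work */
                        #pragma omp barrier     /* all threads sync every iteration */
                }
        }

        /* Higher is better: one thread stuck on a busy CPU drags everyone down. */
        printf("barrier iterations/sec: %.1f\n", iters / (omp_get_wtime() - start));
        return 0;
}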

Following are the results (higher is better) on a 40-CPU x86 machine:

+-------+----------------+----------------+------------------+
|Threads|CAS             |Baseline        |Baseline without  |
|       |with ping       |with ping       |ping              |
+-------+-------+--------+-------+--------+-------+----------+
|       |Mean   |Std. Dev|Mean   |Std. Dev|Mean   |Std. Dev  |
+-------+-------+--------+-------+--------+-------+----------+
|1      | 513.7 | 0.1    | 497.5 | 26.2   | 514.6 | 0.1      |
|2      | 480.2 | 24.3   | 480.9 | 28.1   | 512.2 | 1.0      |
|4      | 455.3 | 7.6    | 449.1 | 6.8    | 479.3 | 19.4     |
|8      | 423.1 | 8.4    | 416.8 | 8.5    | 448.9 | 8.4      |
|16     | 379.0 | 10.2   | 374.8 | 10.6   | 392.6 | 5.8      |
|32     | 275.4 | 3.6    | 270.0 | 7.0    | 277.0 | 0.3      |
|37     | 266.2 | 7.6    | 256.5 | 8.3    | 275.0 | 3.3      |
+-------+-------+--------+-------+--------+-------+----------+

Signed-off-by: Rohit Jain <rohit.k.jain@oracle.com>
---
 kernel/sched/core.c    |   6 ++-
 kernel/sched/fair.c    | 101 ++++++++++++++++++++++++++++++++++++++++---------
 kernel/sched/loadavg.c |  40 ++++++++++++++++++++
 kernel/sched/sched.h   |  21 ++++++++++
 4 files changed, 150 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 759f4bd..47f3166 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5717,6 +5717,8 @@ void set_rq_online(struct rq *rq)
 			if (class->rq_online)
 				class->rq_online(rq);
 		}
+		rq->capload = CAPACITY_PRECISION-1;
+		inc_capacity_buckets(rq->capload);
 	}
 }
 
@@ -5732,6 +5734,7 @@ void set_rq_offline(struct rq *rq)
 
 		cpumask_clear_cpu(rq->cpu, rq->rd->online);
 		rq->online = 0;
+		dec_capacity_buckets(rq->capload);
 	}
 }
 
@@ -6182,8 +6185,9 @@ void __init sched_init(void)
 	set_cpu_rq_start_time(smp_processor_id());
 #endif
 	init_sched_fair_class();
-
 	init_schedstats();
+	init_capacity_buckets();
+	init_capacity_threshold();
 
 	scheduler_running = 1;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d711093..3088b08 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5423,10 +5423,10 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	 * task to be woken on this_cpu.
 	 */
 	this_eff_load = 100;
-	this_eff_load *= capacity_of(prev_cpu);
+	this_eff_load *= CPU_CAPLOAD(prev_cpu);
 
 	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-	prev_eff_load *= capacity_of(this_cpu);
+	prev_eff_load *= CPU_CAPLOAD(this_cpu);
 
 	if (this_load > 0) {
 		this_eff_load *= this_load +
@@ -5595,9 +5595,11 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 {
 	unsigned long load, min_load = ULONG_MAX;
 	unsigned int min_exit_latency = UINT_MAX;
+	unsigned int backup_capload = 0;
 	u64 latest_idle_timestamp = 0;
 	int least_loaded_cpu = this_cpu;
 	int shallowest_idle_cpu = -1;
+	int shallowest_idle_cpu_backup = -1;
 	int i;
 
 	/* Check if we have any choice: */
@@ -5617,7 +5619,12 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 				 */
 				min_exit_latency = idle->exit_latency;
 				latest_idle_timestamp = rq->idle_stamp;
-				shallowest_idle_cpu = i;
+				if (!CAPACITY_LOW_RQ(rq))
+					shallowest_idle_cpu = i;
+				else if (rq->capload > backup_capload) {
+					shallowest_idle_cpu_backup = i;
+					backup_capload = rq->capload;
+				}
 			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
 				   rq->idle_stamp > latest_idle_timestamp) {
 				/*
@@ -5627,6 +5634,12 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 				 */
 				latest_idle_timestamp = rq->idle_stamp;
 				shallowest_idle_cpu = i;
+				if (!CAPACITY_LOW_RQ(rq))
+					shallowest_idle_cpu = i;
+				else if (rq->capload > backup_capload) {
+					shallowest_idle_cpu_backup = i;
+					backup_capload = rq->capload;
+				}
 			}
 		} else if (shallowest_idle_cpu == -1) {
 			load = weighted_cpuload(i);
@@ -5637,7 +5650,11 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 		}
 	}
 
-	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+	if (shallowest_idle_cpu != -1)
+		return shallowest_idle_cpu;
+	else if (shallowest_idle_cpu_backup != -1)
+		return shallowest_idle_cpu_backup;
+	return least_loaded_cpu;
 }
 
 /*
@@ -5748,15 +5765,29 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 
 	for_each_cpu_wrap(core, cpus, target, wrap) {
 		bool idle = true;
+		int rcpu = -1;
+		int rcpu_backup = -1;
+		unsigned int capload, backup_capload = 0;
 
 		for_each_cpu(cpu, cpu_smt_mask(core)) {
 			cpumask_clear_cpu(cpu, cpus);
 			if (!idle_cpu(cpu))
 				idle = false;
+
+			capload = CPU_CAPLOAD(cpu);
+			if (!CAPACITY_LOW(capload))
+				rcpu = cpu;
+			else if ((rcpu == -1) && (capload > backup_capload)) {
+				backup_capload = capload;
+				rcpu_backup = cpu;
+			}
 		}
 
-		if (idle)
-			return core;
+		if (idle) {
+			if (rcpu == -1)
+				return (rcpu_backup == -1 ? core : rcpu_backup);
+			return rcpu;
+		}
 	}
 
 	/*
@@ -5772,7 +5803,8 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
  */
 static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
 {
-	int cpu;
+	int cpu, backup_cpu = -1;
+	unsigned int backup_capload = 0;
 
 	if (!static_branch_likely(&sched_smt_present))
 		return -1;
@@ -5780,11 +5812,19 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
 	for_each_cpu(cpu, cpu_smt_mask(target)) {
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
-		if (idle_cpu(cpu))
-			return cpu;
+		if (idle_cpu(cpu)) {
+			unsigned int capload = CPU_CAPLOAD(cpu);
+
+			if (!CAPACITY_LOW(capload))
+				return cpu;
+			if (capload > backup_capload) {
+				backup_cpu = cpu;
+				backup_capload = capload;
+			}
+		}
 	}
 
-	return -1;
+	return backup_cpu;
 }
 
 #else /* CONFIG_SCHED_SMT */
@@ -5812,7 +5852,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 	u64 avg_cost, avg_idle = this_rq()->avg_idle;
 	u64 time, cost;
 	s64 delta;
-	int cpu, wrap;
+	int cpu, wrap, backup_cpu = -1;
+	unsigned int backup_capload = 0;
 
 	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
 	if (!this_sd)
@@ -5832,10 +5873,23 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 	for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
-		if (idle_cpu(cpu))
-			break;
+		if (idle_cpu(cpu)) {
+			unsigned int capload = CPU_CAPLOAD(cpu);
+
+			if (CAPACITY_LOW(capload)) {
+				if (capload > backup_capload) {
+					backup_cpu = cpu;
+					backup_capload = capload;
+				}
+			} else {
+				backup_cpu = -1;
+				break;
+			}
+		}
 	}
 
+	if (backup_cpu >= 0)
+		cpu = backup_cpu;
 	time = local_clock() - time;
 	cost = this_sd->avg_scan_cost;
 	delta = (s64)(time - cost) / 8;
@@ -5852,13 +5906,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	struct sched_domain *sd;
 	int i;
 
-	if (idle_cpu(target))
+	if (idle_cpu(target) && !CAPACITY_LOW_CPU(target))
 		return target;
 
 	/*
 	 * If the previous cpu is cache affine and idle, don't be stupid.
 	 */
-	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)
+	    && !CAPACITY_LOW_CPU(prev))
 		return prev;
 
 	sd = rcu_dereference(per_cpu(sd_llc, target));
@@ -7189,9 +7244,12 @@ static unsigned long scale_rt_capacity(int cpu)
 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 {
 	unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
+	struct rq *rq = cpu_rq(cpu);
+	long max_cap = rq->rd->max_cpu_capacity;
 	struct sched_group *sdg = sd->groups;
+	unsigned int capload;
 
-	cpu_rq(cpu)->cpu_capacity_orig = capacity;
+	rq->cpu_capacity_orig = capacity;
 
 	capacity *= scale_rt_capacity(cpu);
 	capacity >>= SCHED_CAPACITY_SHIFT;
@@ -7199,7 +7257,16 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 	if (!capacity)
 		capacity = 1;
 
-	cpu_rq(cpu)->cpu_capacity = capacity;
+	dec_capacity_buckets(rq->capload);
+	if (max_cap)
+		capload = (capacity*CAPACITY_PRECISION)/max_cap;
+
+	if (!max_cap || capload >= CAPACITY_PRECISION)
+		rq->capload = CAPACITY_PRECISION - 1;
+	else
+		rq->capload = capload;
+	inc_capacity_buckets(rq->capload);
+	rq->cpu_capacity = capacity;
 	sdg->sgc->capacity = capacity;
 	sdg->sgc->min_capacity = capacity;
 }
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index f15fb2b..273b7b4 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -58,12 +58,51 @@
  *  This covers the NO_HZ=n code, for extra head-aches, see the comment below.
  */
 
+#define	CAPACITY_BUCKET_NUM_ELEMS	(CAPACITY_PRECISION/CAPACITY_BUCKET_SZ)
 /* Variables and functions for calc_load */
 atomic_long_t calc_load_tasks;
 unsigned long calc_load_update;
 unsigned long avenrun[3];
+atomic_t capacity_buckets[CAPACITY_BUCKET_NUM_ELEMS];
+unsigned int capacity_threshold;
 EXPORT_SYMBOL(avenrun); /* should be removed */
 
+void init_capacity_buckets(void)
+{
+	int i;
+	int lastidx = CAPACITY_BUCKET_NUM_ELEMS-1;
+
+	atomic_set(capacity_buckets+lastidx, num_online_cpus());
+	for (i = 0; i < lastidx; i++)
+		atomic_set(capacity_buckets+i, 0);
+}
+
+void dec_capacity_buckets(unsigned int capload)
+{
+	atomic_dec_if_positive(capacity_buckets+
+			       (capload >> CAPACITY_BUCKET_SHIFT));
+}
+
+void inc_capacity_buckets(unsigned int capload)
+{
+	atomic_inc(capacity_buckets+(capload >> CAPACITY_BUCKET_SHIFT));
+}
+
+void update_capacity_threshold(void)
+{
+	unsigned int count_cpus = 0;
+	int bucket_count =  CAPACITY_BUCKET_NUM_ELEMS-1;
+
+	while ((count_cpus <
+		((num_online_cpus()*CAPACITY_THRS_PCT) >> CAPACITY_PRCSN_SHIFT))
+	       && (bucket_count >= 0)) {
+		count_cpus += atomic_read(capacity_buckets+bucket_count);
+		bucket_count--;
+	}
+
+	capacity_threshold = (bucket_count << CAPACITY_BUCKET_SHIFT);
+}
+
 /**
  * get_avenrun - get the load average array
  * @loads:	pointer to dest load array
@@ -381,6 +420,7 @@ void calc_global_load(unsigned long ticks)
 	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
 	 */
 	calc_global_nohz();
+	update_capacity_threshold();
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7808ab0..a5b56a7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -44,6 +44,12 @@
 #define SCHED_WARN_ON(x)	((void)(x))
 #endif
 
+#define	CAPACITY_BUCKET_SZ		8
+#define	CAPACITY_BUCKET_SHIFT		3
+#define	CAPACITY_THRS_PCT		820
+#define	CAPACITY_PRECISION		1024
+#define	CAPACITY_PRCSN_SHIFT		10
+
 struct rq;
 struct cpuidle_state;
 
@@ -59,6 +65,20 @@ extern atomic_long_t calc_load_tasks;
 extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq, long adjust);
 
+extern unsigned int capacity_threshold;
+#define	CAPACITY_LOW(_capload)	((_capload) < capacity_threshold)
+#define	CAPACITY_LOW_RQ(_rq)	(((_rq)->capload) < capacity_threshold)
+#define	CAPACITY_LOW_CPU(_cpu)	((cpu_rq(_cpu)->capload) < capacity_threshold)
+#define	CPU_CAPLOAD(_cpu)	((cpu_rq(_cpu))->capload)
+
+extern void inc_capacity_buckets(unsigned int capload);
+extern void dec_capacity_buckets(unsigned int capload);
+extern void init_capacity_buckets(void);
+extern void update_rq_capload(struct rq *rq);
+extern void update_capacity_threshold(void);
+static inline void init_capacity_threshold(void)
+	{ capacity_threshold = 0; }
+
 #ifdef CONFIG_SMP
 extern void cpu_load_update_active(struct rq *this_rq);
 #else
@@ -683,6 +703,7 @@ struct rq {
 	struct root_domain *rd;
 	struct sched_domain *sd;
 
+	unsigned int capload;
 	unsigned long cpu_capacity;
 	unsigned long cpu_capacity_orig;
 
-- 
2.7.4


* Re: [PATCH] sched: Introduce scaled capacity awareness in enqueue
  2017-05-22 20:48 [PATCH] sched: Introduce scaled capacity awareness in enqueue Rohit Jain
@ 2017-05-30 12:28 ` Peter Zijlstra
  2017-05-31 22:19   ` Rohit Jain
  0 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2017-05-30 12:28 UTC (permalink / raw)
  To: Rohit Jain
  Cc: linux-kernel, mingo, vincent.guittot, morten.rasmussen, dietmar.eggemann

On Mon, May 22, 2017 at 01:48:14PM -0700, Rohit Jain wrote:
> The patch introduces capacity awarness in scheduler (CAS) which avoids
> CPUs which might have their capacities reduced (due to IRQ/RT activity)
> when trying to schedule threads (on the push side) in the system. This
> awareness has been added into the fair scheduling class.
> 
> It does so by, using the following algorithm:
> --------------------------------------------------------------------------
> 1) As in rt_avg the scaled capacities are already calculated.
> 
> 2) This scaled capacity is normalized and mapped into buckets.

Why?

> 3) Any CPU which lies below the 80th percentile in terms of percentage
> capacity available is considered as a low capacity CPU.

Random number; can we do better? What does existing code do?


* Re: [PATCH] sched: Introduce scaled capacity awareness in enqueue
  2017-05-30 12:28 ` Peter Zijlstra
@ 2017-05-31 22:19   ` Rohit Jain
  2017-06-01 12:28     ` Peter Zijlstra
  0 siblings, 1 reply; 10+ messages in thread
From: Rohit Jain @ 2017-05-31 22:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, vincent.guittot, morten.rasmussen, dietmar.eggemann

On 05/30/2017 05:28 AM, Peter Zijlstra wrote:
> On Mon, May 22, 2017 at 01:48:14PM -0700, Rohit Jain wrote:
>> The patch introduces capacity awarness in scheduler (CAS) which avoids
>> CPUs which might have their capacities reduced (due to IRQ/RT activity)
>> when trying to schedule threads (on the push side) in the system. This
>> awareness has been added into the fair scheduling class.
>>
>> It does so by, using the following algorithm:
>> --------------------------------------------------------------------------
>> 1) As in rt_avg the scaled capacities are already calculated.
>>
>> 2) This scaled capacity is normalized and mapped into buckets.
> Why?

This is done to deterministically identify the CPUs which are low on
capacity. Mapping the capacities into buckets makes the percentile
calculation easier.

>> 3) Any CPU which lies below the 80th percentile in terms of percentage
>> capacity available is considered as a low capacity CPU.
> Random number; can we do better? What does existing code do?
>

The existing code checks whether the waker is running on a CPU with
better capacity than the 'previous' CPU. If that is the case, the waker
CPU is provided as the 'target' to select_idle_sibling(). (When I say
capacity here, I mean the original capacity, not the one scaled down by
IRQ, RT, etc.)

However, if this 'target' is not an idle CPU, the rest of
select_idle_sibling() ignores the capacities altogether.


* Re: [PATCH] sched: Introduce scaled capacity awareness in enqueue
  2017-05-31 22:19   ` Rohit Jain
@ 2017-06-01 12:28     ` Peter Zijlstra
  2017-06-01 12:37       ` Peter Zijlstra
  0 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2017-06-01 12:28 UTC (permalink / raw)
  To: Rohit Jain
  Cc: linux-kernel, mingo, vincent.guittot, morten.rasmussen, dietmar.eggemann

On Wed, May 31, 2017 at 03:19:46PM -0700, Rohit Jain wrote:

> > > 2) This scaled capacity is normalized and mapped into buckets.
> > Why?
> 
> This is done to deterministically define the CPUs which are low on
> capacities. By mapping it into buckets it becomes easier to do a
> percentile calculation.

*sigh*, you're really going to make me read and reverse engineer your
horrible code.. :/


* Re: [PATCH] sched: Introduce scaled capacity awareness in enqueue
  2017-06-01 12:28     ` Peter Zijlstra
@ 2017-06-01 12:37       ` Peter Zijlstra
  2017-06-02 18:20         ` Rohit Jain
  0 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2017-06-01 12:37 UTC (permalink / raw)
  To: Rohit Jain
  Cc: linux-kernel, mingo, vincent.guittot, morten.rasmussen, dietmar.eggemann

On Thu, Jun 01, 2017 at 02:28:27PM +0200, Peter Zijlstra wrote:
> On Wed, May 31, 2017 at 03:19:46PM -0700, Rohit Jain wrote:
> 
> > > > 2) This scaled capacity is normalized and mapped into buckets.
> > > Why?
> > 
> > This is done to deterministically define the CPUs which are low on
> > capacities. By mapping it into buckets it becomes easier to do a
> > percentile calculation.
> 
> *sigh*, you're really going to make me read and reverse engineer your
> horrible code.. :/

Blergh, I can't be arsed. It's got horrible style, it's stuffed into the
loadavg code even though it's completely unrelated to that particular
trainwreck, and it adds more global state.

Global state is bad, very bad. And it's not at all clear why you'd need
that to begin with.


* Re: [PATCH] sched: Introduce scaled capacity awareness in enqueue
  2017-06-01 12:37       ` Peter Zijlstra
@ 2017-06-02 18:20         ` Rohit Jain
  2017-06-02 18:26           ` Peter Zijlstra
  0 siblings, 1 reply; 10+ messages in thread
From: Rohit Jain @ 2017-06-02 18:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, vincent.guittot, morten.rasmussen,
	dietmar.eggemann, Dhaval Giani


On 06/01/2017 05:37 AM, Peter Zijlstra wrote:
> On Thu, Jun 01, 2017 at 02:28:27PM +0200, Peter Zijlstra wrote:
>> On Wed, May 31, 2017 at 03:19:46PM -0700, Rohit Jain wrote:
>>
>>>>> 2) This scaled capacity is normalized and mapped into buckets.
>>>> Why?
> And its not at all clear why you'd need
> that to begin with.

Here is the problem I am trying to solve:

The benchmark(s) have a high degree of variance when run multiple
times.

We believe this is because the scheduler is not aware of CPU capacities
being scaled down by IRQ/RT activity.

This patch helps in solving the above problem. Do you have any thoughts
on solving this problem in any other way?


* Re: [PATCH] sched: Introduce scaled capacity awareness in enqueue
  2017-06-02 18:20         ` Rohit Jain
@ 2017-06-02 18:26           ` Peter Zijlstra
  2017-06-05 18:08             ` Rohit Jain
  0 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2017-06-02 18:26 UTC (permalink / raw)
  To: Rohit Jain
  Cc: linux-kernel, mingo, vincent.guittot, morten.rasmussen,
	dietmar.eggemann, Dhaval Giani

On Fri, Jun 02, 2017 at 11:20:20AM -0700, Rohit Jain wrote:
> 
> On 06/01/2017 05:37 AM, Peter Zijlstra wrote:
> > On Thu, Jun 01, 2017 at 02:28:27PM +0200, Peter Zijlstra wrote:
> > > On Wed, May 31, 2017 at 03:19:46PM -0700, Rohit Jain wrote:
> > > 
> > > > > > 2) This scaled capacity is normalized and mapped into buckets.
> > > > > Why?
> > And its not at all clear why you'd need
> > that to begin with.
> 
> Here is the problem I am trying to solve:
> 
> The benchmark(s) have a high degree of variance when run multiple
> times.
> 
> We believe it is because of the scheduler not being aware of the scaled
> down capacity of the CPUs because of IRQ/RT activity.
> 
> This patch helps in solving the above problem. Do you have any thoughts
> on solving this problem in any other way?

Why does determining if a CPU's capacity is scaled down need to involve
global data? AFAICT it's a purely CPU-local affair.


* Re: [PATCH] sched: Introduce scaled capacity awareness in enqueue
  2017-06-02 18:26           ` Peter Zijlstra
@ 2017-06-05 18:08             ` Rohit Jain
  2017-06-12 15:36               ` Peter Zijlstra
  0 siblings, 1 reply; 10+ messages in thread
From: Rohit Jain @ 2017-06-05 18:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, vincent.guittot, morten.rasmussen,
	dietmar.eggemann, Dhaval Giani

On 06/02/2017 11:26 AM, Peter Zijlstra wrote:
> On Fri, Jun 02, 2017 at 11:20:20AM -0700, Rohit Jain wrote:
>> On 06/01/2017 05:37 AM, Peter Zijlstra wrote:
>>> On Thu, Jun 01, 2017 at 02:28:27PM +0200, Peter Zijlstra wrote:
>>>> On Wed, May 31, 2017 at 03:19:46PM -0700, Rohit Jain wrote:
>>>>
>>>>>>> 2) This scaled capacity is normalized and mapped into buckets.
>>>>>> Why?
>>> And its not at all clear why you'd need
>>> that to begin with.
>> Here is the problem I am trying to solve:
>>
>> The benchmark(s) have a high degree of variance when run multiple
>> times.
>>
>> We believe it is because of the scheduler not being aware of the scaled
>> down capacity of the CPUs because of IRQ/RT activity.
>>
>> This patch helps in solving the above problem. Do you have any thoughts
>> on solving this problem in any other way?
> Why does determining if a CPU's capacity is scaled down need to involve
> global data? AFAICT its a purely CPU local affair.

The global array is used to determine the threshold capacity, so that
any CPU which lies below it is considered to be 'running low' on
available capacity. This threshold could also be statically defined as a
fixed fraction, but calculating it dynamically works across all
benchmarks.

Did you mean we should use a static cutoff to decide whether a CPU
should be treated as low on capacity and skipped during the idle-CPU
search?
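
Schematically (this is not code from the patch), the difference between
the two options would be something like the following, assuming the
rq->capload / cpu_capacity / cpu_capacity_orig fields introduced above,
and a static cutoff of "less than ~80% of the original capacity
available":

/* dynamic, as in this patch: threshold derived from the global buckets */
#define CAPACITY_LOW_DYNAMIC(rq)	((rq)->capload < capacity_threshold)

/* static alternative: no global state, fixed per-CPU fraction (80%) */
#define CAPACITY_LOW_STATIC(rq) \
	((rq)->cpu_capacity * 5 < (rq)->cpu_capacity_orig * 4)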


* Re: [PATCH] sched: Introduce scaled capacity awareness in enqueue
  2017-06-05 18:08             ` Rohit Jain
@ 2017-06-12 15:36               ` Peter Zijlstra
  2017-06-13  0:31                 ` Rohit Jain
  0 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2017-06-12 15:36 UTC (permalink / raw)
  To: Rohit Jain
  Cc: linux-kernel, mingo, vincent.guittot, morten.rasmussen,
	dietmar.eggemann, Dhaval Giani

On Mon, Jun 05, 2017 at 11:08:38AM -0700, Rohit Jain wrote:

> > Why does determining if a CPU's capacity is scaled down need to involve
> > global data? AFAICT its a purely CPU local affair.
> 
> The global array is used to determine the threshold capacity, so
> that any CPU which lies below decides that a CPU is 'running low' on
> available capacity. This threshold can also be statically defined to
> be a fixed fraction, but having dynamic calculation to determine the
> threshold works for all benchmarks.

Firstly, you should have explained that. Secondly, it is still very much
an incomplete explanation. Again, why does that matter? What scenario is
important for what workload?

Thirdly, did you consider heterogeneous setups, where CPUs can have
different capacities?

And clearly you did not consider the 4K-CPU case; 4K CPUs poking at a
global data structure will completely come apart.

> Did you mean we should use a static cutoff and decide whether a CPU
> should be treated low on capacity and skip it during idle CPU search?

Yes. Why would that not suffice? You've not even begun to explain why
you need all the additional complexity.

And if the next patch doesn't explain things, I'll just ignore it
entirely.


* Re: [PATCH] sched: Introduce scaled capacity awareness in enqueue
  2017-06-12 15:36               ` Peter Zijlstra
@ 2017-06-13  0:31                 ` Rohit Jain
  0 siblings, 0 replies; 10+ messages in thread
From: Rohit Jain @ 2017-06-13  0:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, vincent.guittot, morten.rasmussen,
	dietmar.eggemann, Dhaval Giani

On 06/12/2017 08:36 AM, Peter Zijlstra wrote:

> On Mon, Jun 05, 2017 at 11:08:38AM -0700, Rohit Jain wrote:
>
>>> Why does determining if a CPU's capacity is scaled down need to involve
>>> global data? AFAICT its a purely CPU local affair.
>> The global array is used to determine the threshold capacity, so
>> that any CPU which lies below decides that a CPU is 'running low' on
>> available capacity. This threshold can also be statically defined to
>> be a fixed fraction, but having dynamic calculation to determine the
>> threshold works for all benchmarks.
>>
> What scenario is important for what workload?

The real workload which produces these conditions is OLTP. The workload
has computation and synchronization, and at the same time interrupt
activity is present as well. Different CPUs have a lot of soft IRQ
activity at different times.

I simulated this by running 'ping', which generates IRQ activity in the
system, while the system ran a barrier synchronization program. If a
thread ends up on a CPU which has a lot of soft IRQ activity, then this
delayed thread ends up delaying the other threads as well.

Since in the real workload the interrupt activity keeps changing, we
still want to avoid the CPUs with really heavy IRQ activity. In cases
where all the idle CPUs have similar scaled capacity available, we would
end up iterating over all the CPUs in the LLC domain to find a suitable
one.

> Thirdly, did you consider heterogeneous setups, where CPUs can have
> different capacities?

My assumption here is that we still want to choose the CPUs with higher
capacities. Please correct me if I am wrong here.

> And clearly you did not consider the 4K CPUs case, 4K cpus poking at a
> global data structure will completely come apart.

Since the global structure is only referred to during load balancing,
the cost could (possibly) be OK?

>> Did you mean we should use a static cutoff and decide whether a CPU
>> should be treated low on capacity and skip it during idle CPU search?
> Yes. Why would that not suffice? You've not even begun to explain why
> you need all the additional complexity.

In cases where the IRQ activity is evenly spread out across all the
CPUs, a fixed cutoff could cause us to iterate over the whole LLC
scheduling domain.

I will test the 'static cutoff' changes against real workloads and send
v2 accordingly.
