From: Mike Galbraith <umgwanakikbuti@gmail.com>
To: Yuyang Du <yuyang.du@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>,
	LKML <linux-kernel@vger.kernel.org>
Subject: [patch] sched/fair: Use instantaneous load in wakeup paths
Date: Thu, 16 Jun 2016 14:41:28 +0200
Message-ID: <1466080888.2278.30.camel@gmail.com>
In-Reply-To: <1466078681.2278.26.camel@gmail.com>

On Thu, 2016-06-16 at 14:04 +0200, Mike Galbraith wrote:
> On Thu, 2016-06-16 at 13:46 +0200, Mike Galbraith wrote:
> > On Wed, 2016-06-15 at 09:01 +0200, Mike Galbraith wrote:
> > > On Wed, 2016-06-15 at 06:42 +0800, Yuyang Du wrote:
> > > 
> > > > I am entirely for giving it a "clear unadulterated reality",
> > > > and even more for making it an option.
> > > > 
> > > > Reviewed-by: Yuyang Du <yuyang.du@intel.com>
> > > 
> > > Thanks.  I'll have a look at perhaps having wake_affine do the
> > > same, such that there is a clean separation of wake/LB paths.
> > 
> > Something like so perhaps.  I turned it on and whacked 'rfc' to
> > try to attract a robot.   Yoohoo, robo thingy...
> 
> (stealthily inserts refreshed one)

(grr.. I give up on today)

sched/fair: Use instantaneous load in wakeup paths

Using load averages is not optimal for some loads; the fuzzy view of
load can and will lead to more stacking of tasks than is good for loads
that are all about latency, as the hackbench numbers below demonstrate.
Give the user an instantaneous load switch for wakeup paths, perhaps
eliminating it in future if benchmarks don't gripe.
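
To put a number on that fuzziness, below is a toy userspace model (my
illustration, not kernel code; it only approximates PELT's geometric
series, in which the signal halves roughly every 32ms) of how far the
averaged load lags the instantaneous weight of a task that just started
running flat out:

  /* build: gcc -O2 -o pelt_lag pelt_lag.c -lm */
  #include <math.h>
  #include <stdio.h>

  int main(void)
  {
  	/* decay per ~1ms period, chosen so that y^32 = 0.5 as in PELT */
  	double y = pow(0.5, 1.0 / 32.0);
  	double weight = 1024.0;	/* instantaneous load of a nice-0 task */
  	int ms;

  	for (ms = 1; ms <= 64; ms *= 2) {
  		/* averaged load after 'ms' periods of running, starting from idle */
  		double avg = weight * (1.0 - pow(y, ms));
  		printf("after %2dms: instantaneous %4.0f, averaged %6.1f\n",
  		       ms, weight, avg);
  	}
  	return 0;
  }

A task that just forked or woke is thus nearly invisible to the averaged
view for its first few milliseconds, which is exactly when the wakeup
path is deciding where to put it.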

time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done'

real    0m55.397s
user    0m8.320s
sys     5m40.789s

echo WAKE_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features

real    0m48.049s
user    0m6.510s
sys     5m6.291s
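
Condensed (my paraphrase of the helpers the diff below adds; the cgroup
check and load_idx plumbing are elided), the switch amounts to:

  enum load_type { LOAD_INSTANTANEOUS, LOAD_WEIGHTED };

  /* rq load as seen by the wakeup paths */
  static unsigned long cpu_load(const int cpu, enum load_type type)
  {
  	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;

  	if (type == LOAD_INSTANTANEOUS)
  		return scale_load_down(cfs_rq->load.weight);
  	return cfs_rq->runnable_load_avg;
  }

  /* task load as seen by wake_affine() */
  static unsigned long task_load(struct task_struct *p, enum load_type type)
  {
  	if (type == LOAD_INSTANTANEOUS)
  		return scale_load_down(p->se.load.weight);
  	return p->se.avg.load_avg;
  }

select_task_rq_fair() picks the type once per wakeup via
wakeup_load_type(), which returns LOAD_INSTANTANEOUS when the
WAKE_INSTANTANEOUS_LOAD feature is set and the task is not in a child
cgroup, and threads it through wake_affine(), find_idlest_group() and
find_idlest_cpu(); the periodic load balance paths keep using the
averaged view.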

Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
---
 kernel/sched/fair.c     |  116 ++++++++++++++++++++++++++++++------------------
 kernel/sched/features.h |    1 +
 kernel/sched/sched.h    |    8 +++
 3 files changed, 83 insertions(+), 42 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -735,7 +735,8 @@ void post_init_entity_util_avg(struct sc
 	}
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+static inline unsigned long cfs_rq_runnable_load(struct cfs_rq *cfs_rq,
+						 enum load_type type);
 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
 #else
 void init_entity_runnable_average(struct sched_entity *se)
@@ -1226,9 +1227,9 @@ bool should_numa_migrate_memory(struct t
 	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
-static unsigned long weighted_cpuload(const int cpu);
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
+static unsigned long cpu_load(const int cpu, enum load_type type);
+static unsigned long source_load(int cpu, int load_idx, enum load_type type);
+static unsigned long target_load(int cpu, int load_idx, enum load_type type);
 static unsigned long capacity_of(int cpu);
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
 
@@ -1258,7 +1259,7 @@ static void update_numa_stats(struct num
 		struct rq *rq = cpu_rq(cpu);
 
 		ns->nr_running += rq->nr_running;
-		ns->load += weighted_cpuload(cpu);
+		ns->load += cpu_load(cpu, LOAD_WEIGHTED);
 		ns->compute_capacity += capacity_of(cpu);
 
 		cpus++;
@@ -3085,8 +3086,11 @@ void remove_entity_load_avg(struct sched
 	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
+static inline unsigned long cfs_rq_runnable_load(struct cfs_rq *cfs_rq,
+						 enum load_type type)
 {
+	if (type == LOAD_INSTANTANEOUS)
+		return scale_load_down(cfs_rq->load.weight);
 	return cfs_rq->runnable_load_avg;
 }
 
@@ -4679,9 +4683,9 @@ static void cpu_load_update(struct rq *t
 }
 
 /* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long cpu_load(const int cpu, enum load_type type)
 {
-	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+	return cfs_rq_runnable_load(&cpu_rq(cpu)->cfs, type);
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -4726,7 +4730,7 @@ static void cpu_load_update_idle(struct
 	/*
 	 * bail if there's load or we're actually up-to-date.
 	 */
-	if (weighted_cpuload(cpu_of(this_rq)))
+	if (cpu_load(cpu_of(this_rq), LOAD_WEIGHTED))
 		return;
 
 	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -4743,11 +4747,11 @@ void cpu_load_update_nohz_start(void)
 	struct rq *this_rq = this_rq();
 
 	/*
-	 * This is all lockless but should be fine. If weighted_cpuload changes
+	 * This is all lockless but should be fine. If weighted cpu load changes
 	 * concurrently we'll exit nohz. And cpu_load write can race with
 	 * cpu_load_update_idle() but both updater would be writing the same.
 	 */
-	this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+	this_rq->cpu_load[0] = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED);
 }
 
 /*
@@ -4762,7 +4766,7 @@ void cpu_load_update_nohz_stop(void)
 	if (curr_jiffies == this_rq->last_load_update_tick)
 		return;
 
-	load = weighted_cpuload(cpu_of(this_rq));
+	load = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED);
 	raw_spin_lock(&this_rq->lock);
 	update_rq_clock(this_rq);
 	cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -4788,7 +4792,7 @@ static void cpu_load_update_periodic(str
  */
 void cpu_load_update_active(struct rq *this_rq)
 {
-	unsigned long load = weighted_cpuload(cpu_of(this_rq));
+	unsigned long load = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED);
 
 	if (tick_nohz_tick_stopped())
 		cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -4803,30 +4807,30 @@ void cpu_load_update_active(struct rq *t
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static unsigned long source_load(int cpu, int type)
+static unsigned long source_load(int cpu, int load_idx, enum load_type type)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
+	unsigned long total = cpu_load(cpu, type);
 
-	if (type == 0 || !sched_feat(LB_BIAS))
+	if (load_idx == 0 || !sched_feat(LB_BIAS))
 		return total;
 
-	return min(rq->cpu_load[type-1], total);
+	return min(rq->cpu_load[load_idx-1], total);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu weighted
  * according to the scheduling class and "nice" value.
  */
-static unsigned long target_load(int cpu, int type)
+static unsigned long target_load(int cpu, int load_idx, enum load_type type)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
+	unsigned long total = cpu_load(cpu, type);
 
-	if (type == 0 || !sched_feat(LB_BIAS))
+	if (load_idx == 0 || !sched_feat(LB_BIAS))
 		return total;
 
-	return max(rq->cpu_load[type-1], total);
+	return max(rq->cpu_load[load_idx-1], total);
 }
 
 static unsigned long capacity_of(int cpu)
@@ -4843,7 +4847,7 @@ static unsigned long cpu_avg_load_per_ta
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
-	unsigned long load_avg = weighted_cpuload(cpu);
+	unsigned long load_avg = cpu_load(cpu, LOAD_WEIGHTED);
 
 	if (nr_running)
 		return load_avg / nr_running;
@@ -5013,7 +5017,15 @@ static int wake_wide(struct task_struct
 	return 1;
 }
 
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static unsigned long task_load(struct task_struct *p, enum load_type type)
+{
+	if (type == LOAD_INSTANTANEOUS)
+		return scale_load_down(p->se.load.weight);
+	return p->se.avg.load_avg;
+}
+
+static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync,
+		       enum load_type type)
 {
 	s64 this_load, load;
 	s64 this_eff_load, prev_eff_load;
@@ -5025,8 +5037,8 @@ static int wake_affine(struct sched_doma
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
 	prev_cpu  = task_cpu(p);
-	load	  = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
+	load	  = source_load(prev_cpu, idx, type);
+	this_load = target_load(this_cpu, idx, type);
 
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
@@ -5035,14 +5047,14 @@ static int wake_affine(struct sched_doma
 	 */
 	if (sync) {
 		tg = task_group(current);
-		weight = current->se.avg.load_avg;
+		weight = task_load(current, type);
 
 		this_load += effective_load(tg, this_cpu, -weight, -weight);
 		load += effective_load(tg, prev_cpu, 0, -weight);
 	}
 
 	tg = task_group(p);
-	weight = p->se.avg.load_avg;
+	weight = task_load(p, type);
 
 	/*
 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
@@ -5085,7 +5097,7 @@ static int wake_affine(struct sched_doma
  */
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-		  int this_cpu, int sd_flag)
+		  int this_cpu, int sd_flag, enum load_type type)
 {
 	struct sched_group *idlest = NULL, *group = sd->groups;
 	unsigned long min_load = ULONG_MAX, this_load = 0;
@@ -5114,9 +5126,9 @@ find_idlest_group(struct sched_domain *s
 		for_each_cpu(i, sched_group_cpus(group)) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = source_load(i, load_idx);
+				load = source_load(i, load_idx, type);
 			else
-				load = target_load(i, load_idx);
+				load = target_load(i, load_idx, type);
 
 			avg_load += load;
 		}
@@ -5141,7 +5153,8 @@ find_idlest_group(struct sched_domain *s
  * find_idlest_cpu - find the idlest cpu among the cpus in group.
  */
 static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
+		enum load_type type)
 {
 	unsigned long load, min_load = ULONG_MAX;
 	unsigned int min_exit_latency = UINT_MAX;
@@ -5175,7 +5188,7 @@ find_idlest_cpu(struct sched_group *grou
 				shallowest_idle_cpu = i;
 			}
 		} else if (shallowest_idle_cpu == -1) {
-			load = weighted_cpuload(i);
+			load = cpu_load(i, type);
 			if (load < min_load || (load == min_load && i == this_cpu)) {
 				min_load = load;
 				least_loaded_cpu = i;
@@ -5282,6 +5295,24 @@ static int cpu_util(int cpu)
 	return (util >= capacity) ? capacity : util;
 }
 
+static enum load_type wakeup_load_type(struct task_struct *p)
+{
+	if (!sched_feat(WAKE_INSTANTANEOUS_LOAD))
+		return LOAD_WEIGHTED;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/*
+	 * Group scheduling unconditionally uses average load.  Use of
+	 * instantaneous load is all about loads that live or die in
+	 * the here and now, to which cgroups are fundamentally toxic.
+	 */
+	if (task_group(p)->parent)
+		return LOAD_WEIGHTED;
+#endif
+
+	return LOAD_INSTANTANEOUS;
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -5302,6 +5333,7 @@ select_task_rq_fair(struct task_struct *
 	int new_cpu = prev_cpu;
 	int want_affine = 0;
 	int sync = wake_flags & WF_SYNC;
+	enum load_type type = wakeup_load_type(p);
 
 	if (sd_flag & SD_BALANCE_WAKE) {
 		record_wakee(p);
@@ -5331,7 +5363,7 @@ select_task_rq_fair(struct task_struct *
 
 	if (affine_sd) {
 		sd = NULL; /* Prefer wake_affine over balance flags */
-		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync, type))
 			new_cpu = cpu;
 	}
 
@@ -5348,13 +5380,13 @@ select_task_rq_fair(struct task_struct *
 			continue;
 		}
 
-		group = find_idlest_group(sd, p, cpu, sd_flag);
+		group = find_idlest_group(sd, p, cpu, sd_flag, type);
 		if (!group) {
 			sd = sd->child;
 			continue;
 		}
 
-		new_cpu = find_idlest_cpu(group, p, cpu);
+		new_cpu = find_idlest_cpu(group, p, cpu, type);
 		if (new_cpu == -1 || new_cpu == cpu) {
 			/* Now try balancing at a lower domain level of cpu */
 			sd = sd->child;
@@ -6710,9 +6742,9 @@ static inline void update_sg_lb_stats(st
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
-			load = target_load(i, load_idx);
+			load = target_load(i, load_idx, LOAD_WEIGHTED);
 		else
-			load = source_load(i, load_idx);
+			load = source_load(i, load_idx, LOAD_WEIGHTED);
 
 		sgs->group_load += load;
 		sgs->group_util += cpu_util(i);
@@ -6726,7 +6758,7 @@ static inline void update_sg_lb_stats(st
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
-		sgs->sum_weighted_load += weighted_cpuload(i);
+		sgs->sum_weighted_load += cpu_load(i, LOAD_WEIGHTED);
 		/*
 		 * No need to call idle_cpu() if nr_running is not 0
 		 */
@@ -7238,11 +7270,11 @@ static struct rq *find_busiest_queue(str
 
 		capacity = capacity_of(i);
 
-		wl = weighted_cpuload(i);
+		wl = cpu_load(i, LOAD_WEIGHTED);
 
 		/*
-		 * When comparing with imbalance, use weighted_cpuload()
-		 * which is not scaled with the cpu capacity.
+		 * When comparing with imbalance, use cpu_load() which is
+		 * not scaled with the cpu capacity.
 		 */
 
 		if (rq->nr_running == 1 && wl > env->imbalance &&
@@ -7251,7 +7283,7 @@ static struct rq *find_busiest_queue(str
 
 		/*
 		 * For the load comparisons with the other cpu's, consider
-		 * the weighted_cpuload() scaled with the cpu capacity, so
+		 * the weighted cpu load scaled with the cpu capacity, so
 		 * that the load can be moved away from the cpu that is
 		 * potentially running at a lower capacity.
 		 *
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -39,6 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
 SCHED_FEAT(HRTICK, false)
 SCHED_FEAT(DOUBLE_TICK, false)
 SCHED_FEAT(LB_BIAS, true)
+SCHED_FEAT(WAKE_INSTANTANEOUS_LOAD, true)
 
 /*
  * Decrement CPU capacity based on time not spent running tasks
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1631,6 +1631,14 @@ static inline void double_rq_unlock(stru
 		__release(rq2->lock);
 }
 
+/*
+ * Tell load balancing functions whether we want instantaneous or average load
+ */
+enum load_type {
+	LOAD_INSTANTANEOUS,
+	LOAD_WEIGHTED,
+};
+
 #else /* CONFIG_SMP */
 
 /*

Thread overview: 22+ messages
2016-06-14  7:58 [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing Mike Galbraith
2016-06-14 14:14 ` Dietmar Eggemann
2016-06-14 16:40   ` Mike Galbraith
2016-06-15 15:32     ` Dietmar Eggemann
2016-06-15 16:03       ` Mike Galbraith
2016-06-15 19:03         ` Dietmar Eggemann
2016-06-16  3:33           ` Mike Galbraith
2016-06-16  9:01             ` Dietmar Eggemann
2016-07-04 15:04       ` Matt Fleming
2016-07-04 17:43         ` Mike Galbraith
2016-07-06 11:45           ` Matt Fleming
2016-07-06 12:21             ` Mike Galbraith
2016-07-11  8:58         ` Dietmar Eggemann
2016-07-12 11:14           ` Matt Fleming
2016-06-14 22:42 ` Yuyang Du
2016-06-15  7:01   ` Mike Galbraith
2016-06-16 11:46     ` [patch] sched/fair: Use instantaneous load in wakeup paths Mike Galbraith
2016-06-16 12:04       ` Mike Galbraith
2016-06-16 12:41         ` Mike Galbraith [this message]
2016-06-17  6:21           ` Mike Galbraith
2016-06-17 10:55             ` Dietmar Eggemann
2016-06-17 13:57               ` Mike Galbraith
