linux-kernel.vger.kernel.org archive mirror
* [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing
@ 2016-06-14  7:58 Mike Galbraith
  2016-06-14 14:14 ` Dietmar Eggemann
  2016-06-14 22:42 ` Yuyang Du
  0 siblings, 2 replies; 22+ messages in thread
From: Mike Galbraith @ 2016-06-14  7:58 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Yuyang Du, LKML

SUSE's regression testing noticed that...

0905f04eb21f sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task()

...introduced a hackbench regression, and indeed it does.  I think this
regression has more to do with randomness than anything else, but in
general...

While averaging calms down load balancing, helping to keep migrations
down to a dull roar, it's not completely wonderful when it comes to
things that live in the here and now, hackbench being one such.

time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done'

real    0m55.397s
user    0m8.320s
sys     5m40.789s

echo LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features

real    0m48.049s
user    0m6.510s
sys     5m6.291s

Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
---
 kernel/sched/fair.c     |   54 ++++++++++++++++++++++++------------------------
 kernel/sched/features.h |    1 
 kernel/sched/sched.h    |    6 +++++
 3 files changed, 35 insertions(+), 26 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -738,7 +738,7 @@ void post_init_entity_util_avg(struct sc
 	}
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq, int avg);
 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
 #else
 void init_entity_runnable_average(struct sched_entity *se)
@@ -1229,9 +1229,9 @@ bool should_numa_migrate_memory(struct t
 	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
-static unsigned long weighted_cpuload(const int cpu);
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
+static unsigned long weighted_cpuload(const int cpu, int avg);
+static unsigned long source_load(int cpu, int type, int avg);
+static unsigned long target_load(int cpu, int type, int avg);
 static unsigned long capacity_of(int cpu);
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
 
@@ -1261,7 +1261,7 @@ static void update_numa_stats(struct num
 		struct rq *rq = cpu_rq(cpu);
 
 		ns->nr_running += rq->nr_running;
-		ns->load += weighted_cpuload(cpu);
+		ns->load += weighted_cpuload(cpu, LOAD_AVERAGE);
 		ns->compute_capacity += capacity_of(cpu);
 
 		cpus++;
@@ -3102,8 +3102,10 @@ void remove_entity_load_avg(struct sched
 	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq, int avg)
 {
+	if (sched_feat(LB_INSTANTANEOUS_LOAD) && avg == LOAD_INSTANT)
+		return cfs_rq->load.weight;
 	return cfs_rq->runnable_load_avg;
 }
 
@@ -4701,9 +4703,9 @@ static void cpu_load_update(struct rq *t
 }
 
 /* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long weighted_cpuload(const int cpu, int avg)
 {
-	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs, avg);
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -4748,7 +4750,7 @@ static void cpu_load_update_idle(struct
 	/*
 	 * bail if there's load or we're actually up-to-date.
 	 */
-	if (weighted_cpuload(cpu_of(this_rq)))
+	if (weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE))
 		return;
 
 	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -4769,7 +4771,7 @@ void cpu_load_update_nohz_start(void)
 	 * concurrently we'll exit nohz. And cpu_load write can race with
 	 * cpu_load_update_idle() but both updater would be writing the same.
 	 */
-	this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+	this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE);
 }
 
 /*
@@ -4784,7 +4786,7 @@ void cpu_load_update_nohz_stop(void)
 	if (curr_jiffies == this_rq->last_load_update_tick)
 		return;
 
-	load = weighted_cpuload(cpu_of(this_rq));
+	load = weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE);
 	raw_spin_lock(&this_rq->lock);
 	update_rq_clock(this_rq);
 	cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -4810,7 +4812,7 @@ static void cpu_load_update_periodic(str
  */
 void cpu_load_update_active(struct rq *this_rq)
 {
-	unsigned long load = weighted_cpuload(cpu_of(this_rq));
+	unsigned long load = weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE);
 
 	if (tick_nohz_tick_stopped())
 		cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -4825,10 +4827,10 @@ void cpu_load_update_active(struct rq *t
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static unsigned long source_load(int cpu, int type)
+static unsigned long source_load(int cpu, int type, int avg)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
+	unsigned long total = weighted_cpuload(cpu, avg);
 
 	if (type == 0 || !sched_feat(LB_BIAS))
 		return total;
@@ -4840,10 +4842,10 @@ static unsigned long source_load(int cpu
  * Return a high guess at the load of a migration-target cpu weighted
  * according to the scheduling class and "nice" value.
  */
-static unsigned long target_load(int cpu, int type)
+static unsigned long target_load(int cpu, int type, int avg)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
+	unsigned long total = weighted_cpuload(cpu, avg);
 
 	if (type == 0 || !sched_feat(LB_BIAS))
 		return total;
@@ -4865,7 +4867,7 @@ static unsigned long cpu_avg_load_per_ta
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
-	unsigned long load_avg = weighted_cpuload(cpu);
+	unsigned long load_avg = weighted_cpuload(cpu, LOAD_AVERAGE);
 
 	if (nr_running)
 		return load_avg / nr_running;
@@ -5047,8 +5049,8 @@ static int wake_affine(struct sched_doma
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
 	prev_cpu  = task_cpu(p);
-	load	  = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
+	load	  = source_load(prev_cpu, idx, LOAD_AVERAGE);
+	this_load = target_load(this_cpu, idx, LOAD_AVERAGE);
 
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
@@ -5136,9 +5138,9 @@ find_idlest_group(struct sched_domain *s
 		for_each_cpu(i, sched_group_cpus(group)) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = source_load(i, load_idx);
+				load = source_load(i, load_idx, LOAD_INSTANT);
 			else
-				load = target_load(i, load_idx);
+				load = target_load(i, load_idx, LOAD_INSTANT);
 
 			avg_load += load;
 		}
@@ -5197,7 +5199,7 @@ find_idlest_cpu(struct sched_group *grou
 				shallowest_idle_cpu = i;
 			}
 		} else if (shallowest_idle_cpu == -1) {
-			load = weighted_cpuload(i);
+			load = weighted_cpuload(i, LOAD_INSTANT);
 			if (load < min_load || (load == min_load && i == this_cpu)) {
 				min_load = load;
 				least_loaded_cpu = i;
@@ -6982,9 +6984,9 @@ static inline void update_sg_lb_stats(st
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
-			load = target_load(i, load_idx);
+			load = target_load(i, load_idx, LOAD_AVERAGE);
 		else
-			load = source_load(i, load_idx);
+			load = source_load(i, load_idx, LOAD_AVERAGE);
 
 		sgs->group_load += load;
 		sgs->group_util += cpu_util(i);
@@ -6998,7 +7000,7 @@ static inline void update_sg_lb_stats(st
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
-		sgs->sum_weighted_load += weighted_cpuload(i);
+		sgs->sum_weighted_load += weighted_cpuload(i, LOAD_AVERAGE);
 		/*
 		 * No need to call idle_cpu() if nr_running is not 0
 		 */
@@ -7510,7 +7512,7 @@ static struct rq *find_busiest_queue(str
 
 		capacity = capacity_of(i);
 
-		wl = weighted_cpuload(i);
+		wl = weighted_cpuload(i, LOAD_AVERAGE);
 
 		/*
 		 * When comparing with imbalance, use weighted_cpuload()
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -39,6 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
 SCHED_FEAT(HRTICK, false)
 SCHED_FEAT(DOUBLE_TICK, false)
 SCHED_FEAT(LB_BIAS, true)
+SCHED_FEAT(LB_INSTANTANEOUS_LOAD, false)
 
 /*
  * Decrement CPU capacity based on time not spent running tasks
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1630,6 +1630,12 @@ static inline void double_rq_unlock(stru
 		__release(rq2->lock);
 }
 
+/*
+ * Tell load balancing functions whether we want instant or average load
+ */
+#define LOAD_INSTANT	0
+#define LOAD_AVERAGE	1
+
 #else /* CONFIG_SMP */
 
 /*


* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing
  2016-06-14  7:58 [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing Mike Galbraith
@ 2016-06-14 14:14 ` Dietmar Eggemann
  2016-06-14 16:40   ` Mike Galbraith
  2016-06-14 22:42 ` Yuyang Du
  1 sibling, 1 reply; 22+ messages in thread
From: Dietmar Eggemann @ 2016-06-14 14:14 UTC (permalink / raw)
  To: Mike Galbraith, Peter Zijlstra; +Cc: Yuyang Du, LKML

On 14/06/16 08:58, Mike Galbraith wrote:
> SUSE's regression testing noticed that...
> 
> 0905f04eb21f sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task()
> 
> ...introduced a hackbench regression, and indeed it does.  I think this
> regression has more to do with randomness than anything else, but in
> general...
> 
> While averaging calms down load balancing, helping to keep migrations
> down to a dull roar, it's not completely wonderful when it comes to
> things that live in the here and now, hackbench being one such.
> 
> time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done'
> 
> real    0m55.397s
> user    0m8.320s
> sys     5m40.789s
> 
> echo LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features
> 
> real    0m48.049s
> user    0m6.510s
> sys     5m6.291s
> 
> Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>

I see similar values on ARM64 (Juno r0: 2x Cortex-A57, 4x Cortex-A53). OK,
1000 invocations of hackbench take a little bit longer, but I guess it's
the forks we're after.

- echo NO_LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features

time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done'

root@juno:~# time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done'

real	10m17.155s
user	2m56.976s
sys	38m0.324s

- echo LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features

time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done'

real	9m49.832s
user	2m42.896s
sys	34m51.452s

- But I get a similar effect in case I initialize se->avg.load_avg w/ 0:

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -680,7 +680,7 @@ void init_entity_runnable_average(struct sched_entity *se)
         * will definitely be update (after enqueue).
         */
        sa->period_contrib = 1023;
-       sa->load_avg = scale_load_down(se->load.weight);
+       sa->load_avg = scale_load_down(0);
        sa->load_sum = sa->load_avg * LOAD_AVG_MAX;

root@juno:~# time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done'

real	9m55.396s
user	2m41.192s
sys	35m6.196s


IMHO, the hackbench performance "boost" w/o 0905f04eb21f is due to the
fact that a new task gets all its load decayed (making it a small task)
in the __update_load_avg() call in remove_entity_load_avg(), because its
se->avg.last_update_time value is 0, which creates a huge time difference
compared to cfs_rq->avg.last_update_time. The patch 0905f04eb21f avoids
this, and thus the task stays big (se->avg.load_avg = 1024).
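
For illustration, a back-of-envelope model of that decay (a standalone
sketch, not the kernel's fixed-point code; assumes PELT's per-~1ms-period
decay with y^32 = 1/2 and values scaled to 1024):

#include <math.h>
#include <stdio.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* per-period decay factor */
	double load_avg = 1024.0;

	/* a huge delta (last_update_time == 0) means many periods of decay */
	for (int periods = 0; periods <= 512; periods += 64)
		printf("after %3d periods: %6.1f\n",
		       periods, load_avg * pow(y, periods));
	return 0;
}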

It can't be a difference in the value of cfs_rq->removed_load_avg
because w/o the patch 0905f04eb21f, we atomic_long_add 0 and with the
patch we bail before the atomic_long_add().

[...]


* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing
  2016-06-14 14:14 ` Dietmar Eggemann
@ 2016-06-14 16:40   ` Mike Galbraith
  2016-06-15 15:32     ` Dietmar Eggemann
  0 siblings, 1 reply; 22+ messages in thread
From: Mike Galbraith @ 2016-06-14 16:40 UTC (permalink / raw)
  To: Dietmar Eggemann, Peter Zijlstra; +Cc: Yuyang Du, LKML

On Tue, 2016-06-14 at 15:14 +0100, Dietmar Eggemann wrote:

> IMHO, the hackbench performance "boost" w/o 0905f04eb21f is due to the
> fact that a new task gets all its load decayed (making it a small task)
> in the __update_load_avg() call in remove_entity_load_avg(), because its
> se->avg.last_update_time value is 0, which creates a huge time difference
> compared to cfs_rq->avg.last_update_time. The patch 0905f04eb21f avoids
> this, and thus the task stays big (se->avg.load_avg = 1024).

I don't care much at all about the hackbench "regression" in its own
right, or what causes it; for me, the bottom line is that there are cases
we need to be able to resolve, and can't, simply because we're looking at
a fuzzy (rippling) reflection.

In general, the fuzz helps us to not be so spastic.  I'm not sure that
we really really need to care all that much, because I strongly suspect
that it's only gonna make any difference at all in corner cases, but
there are real world cases that matter.  I know for fact that schbench
(facebook) which is at least based on a real world load fails early due
to us stacking tasks due to that fuzzy view of reality.  In that case,
it's because the fuzz consists of a high amplitude aging sawtooth..
find idlest* sees a collection of pseudo-random numbers; effectively,
the fates pick idlest via lottery, and get it wrong often enough that a big
box _never_ reaches full utilization before we stack tasks, putting an
end to the latency game.  For generic loads, the smoothing works, but
for some corners, it blows chunks.  Fork/exec seemed like a spot where
you really can't go wrong by looking at clear unadulterated reality.

	-Mike


* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing
  2016-06-14  7:58 [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing Mike Galbraith
  2016-06-14 14:14 ` Dietmar Eggemann
@ 2016-06-14 22:42 ` Yuyang Du
  2016-06-15  7:01   ` Mike Galbraith
  1 sibling, 1 reply; 22+ messages in thread
From: Yuyang Du @ 2016-06-14 22:42 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Peter Zijlstra, LKML

On Tue, Jun 14, 2016 at 09:58:31AM +0200, Mike Galbraith wrote:
> SUSE's regression testing noticed that...
> 
> 0905f04eb21f sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task()
> 
> ...introduced a hackbench regression, and indeed it does.  I think this
> regression has more to do with randomness than anything else, but in
> general...
> 
> While averaging calms down load balancing, helping to keep migrations
> down to a dull roar, it's not completely wonderful when it comes to
> things that live in the here and now, hackbench being one such.
> 
> time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done'
> 
> real    0m55.397s
> user    0m8.320s
> sys     5m40.789s
> 
> echo LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features
> 
> real    0m48.049s
> user    0m6.510s
> sys     5m6.291s
> 
> Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>

I am entirely for giving it a "clear unadulterated reality", and even
more for making it an option.

Reviewed-by: Yuyang Du <yuyang.du@intel.com>


* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing
  2016-06-14 22:42 ` Yuyang Du
@ 2016-06-15  7:01   ` Mike Galbraith
  2016-06-16 11:46     ` [patch] sched/fair: Use instantaneous load in wakeup paths Mike Galbraith
  0 siblings, 1 reply; 22+ messages in thread
From: Mike Galbraith @ 2016-06-15  7:01 UTC (permalink / raw)
  To: Yuyang Du; +Cc: Peter Zijlstra, LKML

On Wed, 2016-06-15 at 06:42 +0800, Yuyang Du wrote:

> I am entirely for giving it a "clear unadulterated reality", and even
> more for making it an option.
> 
> Reviewed-by: Yuyang Du <yuyang.du@intel.com>

Thanks.  I'll have a look at perhaps having wake_affine do the same,
such that there is a clean separation of wake/LB paths.  I suppose I
should also try harder to sprinkle some 'pretty' on it; that's always
the hardest part for a master of fugly :)

	-Mike


* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing
  2016-06-14 16:40   ` Mike Galbraith
@ 2016-06-15 15:32     ` Dietmar Eggemann
  2016-06-15 16:03       ` Mike Galbraith
  2016-07-04 15:04       ` Matt Fleming
  0 siblings, 2 replies; 22+ messages in thread
From: Dietmar Eggemann @ 2016-06-15 15:32 UTC (permalink / raw)
  To: Mike Galbraith, Peter Zijlstra; +Cc: Yuyang Du, LKML

On 14/06/16 17:40, Mike Galbraith wrote:
> On Tue, 2016-06-14 at 15:14 +0100, Dietmar Eggemann wrote:
> 
>> IMHO, the hackbench performance "boost" w/o 0905f04eb21f is due to the
>> fact that a new task gets all its load decayed (making it a small task)
>> in the __update_load_avg() call in remove_entity_load_avg(), because its
>> se->avg.last_update_time value is 0, which creates a huge time difference
>> compared to cfs_rq->avg.last_update_time. The patch 0905f04eb21f avoids
>> this, and thus the task stays big (se->avg.load_avg = 1024).
> 
> I don't care much at all about the hackbench "regression" in its own
> right, or what causes it; for me, the bottom line is that there are cases
> we need to be able to resolve, and can't, simply because we're looking at
> a fuzzy (rippling) reflection.

Understood. I just thought it would be nice to know why 0905f04eb21f
makes this problem even more visible. But so far I wasn't able to figure
out why this diff in se->avg.load_avg [1024 versus 0] has this effect on
cfs_rq->runnable_load_avg making it even less suitable in find idlest*.
enqueue_entity_load_avg()'s cfs_rq->runnable_load_* += sa->load_* looks
suspicious though.
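
For reference, a toy model (not kernel code) of what those lines do to the
signal find_idlest_cpu() reads: at enqueue, the se's whole load_avg lands
in cfs_rq->runnable_load_avg, so a new task that kept its 1024 looks very
different from one that was decayed to ~0 on the way over.

#include <stdio.h>

struct cfs_rq_model { unsigned long runnable_load_avg; };

/* stand-in for: cfs_rq->runnable_load_* += sa->load_* */
static void enqueue(struct cfs_rq_model *cfs_rq, unsigned long se_load_avg)
{
	cfs_rq->runnable_load_avg += se_load_avg;
}

int main(void)
{
	struct cfs_rq_model with_fix = { 0 }, without_fix = { 0 };

	enqueue(&with_fix, 1024);	/* with 0905f04eb21f: load survives */
	enqueue(&without_fix, 0);	/* without: decayed to ~0 before the move */
	printf("runnable_load_avg seen by find_idlest_cpu(): %lu vs %lu\n",
	       with_fix.runnable_load_avg, without_fix.runnable_load_avg);
	return 0;
}
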
> 
> In general, the fuzz helps us to not be so spastic.  I'm not sure that
> we really really need to care all that much, because I strongly suspect
> that it's only gonna make any difference at all in corner cases, but
> there are real world cases that matter.  I know for fact that schbench
> (facebook) which is at least based on a real world load fails early due
> to us stacking tasks due to that fuzzy view of reality.  In that case,
> it's because the fuzz consists of a high amplitude aging sawtooth..

... only for fork/exec? Which then would be related to the initial value
of se->avg.load_avg. Otherwise we could go back to pre b92486cbf2aa
"sched: Compute runnable load avg in cpu_load and cpu_avg_load_per_task".

> find idlest* sees a collection of pseudo-random numbers; effectively,
> the fates pick idlest via lottery, and get it wrong often enough that a big
> box _never_ reaches full utilization before we stack tasks, putting an
> end to the latency game.  For generic loads, the smoothing works, but
> for some corners, it blows chunks.  Fork/exec seemed like a spot where
> you really can't go wrong by looking at clear unadulterated reality.
> 
> 	-Mike
> 


* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing
  2016-06-15 15:32     ` Dietmar Eggemann
@ 2016-06-15 16:03       ` Mike Galbraith
  2016-06-15 19:03         ` Dietmar Eggemann
  2016-07-04 15:04       ` Matt Fleming
  1 sibling, 1 reply; 22+ messages in thread
From: Mike Galbraith @ 2016-06-15 16:03 UTC (permalink / raw)
  To: Dietmar Eggemann, Peter Zijlstra; +Cc: Yuyang Du, LKML

On Wed, 2016-06-15 at 16:32 +0100, Dietmar Eggemann wrote:

> > In general, the fuzz helps us to not be so spastic.  I'm not sure that
> > we really really need to care all that much, because I strongly suspect
> > that it's only gonna make any difference at all in corner cases, but
> > there are real world cases that matter.  I know for fact that schbench
> > (facebook) which is at least based on a real world load fails early due
> > to us stacking tasks due to that fuzzy view of reality.  In that case,
> > it's because the fuzz consists of a high amplitude aging sawtooth..
> 
> ... only for fork/exec?

No.  Identical workers had longish work/sleep cycle, aging resulted in
weights that ranged from roughly 300-700(ish), depending on when you
peeked at them.
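
Purely illustrative arithmetic (not kernel code, and the cycle lengths are
made up for the example): with PELT's 32-period (~32ms) half-life, a worker
that is runnable for ~34ms then sleeps ~34ms settles into a sawtooth between
roughly 330 and 690, the sort of spread meant above.

#include <math.h>
#include <stdio.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* per-millisecond decay */
	double load = 0.0;
	int run_ms = 34, sleep_ms = 34;

	for (int cycle = 0; cycle < 50; cycle++) {
		double peak;

		for (int ms = 0; ms < run_ms; ms++)
			load = load * y + 1024.0 * (1.0 - y);	/* runnable: accrue */
		peak = load;
		for (int ms = 0; ms < sleep_ms; ms++)
			load *= y;				/* sleeping: decay */
		if (cycle == 49)
			printf("steady state: peak ~%.0f, trough ~%.0f\n",
			       peak, load);
	}
	return 0;
}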

	-Mike


* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing
  2016-06-15 16:03       ` Mike Galbraith
@ 2016-06-15 19:03         ` Dietmar Eggemann
  2016-06-16  3:33           ` Mike Galbraith
  0 siblings, 1 reply; 22+ messages in thread
From: Dietmar Eggemann @ 2016-06-15 19:03 UTC (permalink / raw)
  To: Mike Galbraith, Peter Zijlstra; +Cc: Yuyang Du, LKML

On 15/06/16 17:03, Mike Galbraith wrote:
> On Wed, 2016-06-15 at 16:32 +0100, Dietmar Eggemann wrote:
> 
>>> In general, the fuzz helps us to not be so spastic.  I'm not sure that
>>> we really really need to care all that much, because I strongly suspect
>>> that it's only gonna make any difference at all in corner cases, but
>>> there are real world cases that matter.  I know for fact that schbench
>>> (facebook) which is at least based on a real world load fails early due
>>> to us stacking tasks due to that fuzzy view of reality.  In that case,
>>> it's because the fuzz consists of a high amplitude aging sawtooth..
>>
>> ... only for fork/exec?
> 
> No.  Identical workers had longish work/sleep cycle, aging resulted in
> weights that ranged from roughly 300-700(ish), depending on when you
> peeked at them.
> 
> 	-Mike
> 

Isn't there a theoretical problem with scale_load() on CONFIG_64BIT
machines on tip/sched/core? load.weight has a higher resolution than
runnable_load_avg (and than the values in the rq->cpu_load[] array).
It's only theoretical because [forkexec|wake]_idx is 0, so
[target|source]_load() is nothing else than weighted_cpuload().
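
Concretely, a sketch of the gap being asked about (assuming the
tip/sched/core 64-bit increased-resolution setup of the time, with
SCHED_FIXEDPOINT_SHIFT = 10; macros paraphrased rather than copied from
the tree):

#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT	10
#define scale_load(w)		((unsigned long)(w) << SCHED_FIXEDPOINT_SHIFT)
#define scale_load_down(w)	((unsigned long)(w) >> SCHED_FIXEDPOINT_SHIFT)

int main(void)
{
	unsigned long weight = scale_load(1024);  /* nice-0 se->load.weight, 64-bit */
	unsigned long avg_scale = 1024;           /* runnable_load_avg / cpu_load[] scale */

	/* comparing the two without scale_load_down() is off by 2^10 */
	printf("load.weight=%lu  avg scale=%lu  scale_load_down(load.weight)=%lu\n",
	       weight, avg_scale, scale_load_down(weight));
	return 0;
}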


* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing
  2016-06-15 19:03         ` Dietmar Eggemann
@ 2016-06-16  3:33           ` Mike Galbraith
  2016-06-16  9:01             ` Dietmar Eggemann
  0 siblings, 1 reply; 22+ messages in thread
From: Mike Galbraith @ 2016-06-16  3:33 UTC (permalink / raw)
  To: Dietmar Eggemann, Peter Zijlstra; +Cc: Yuyang Du, LKML

On Wed, 2016-06-15 at 20:03 +0100, Dietmar Eggemann wrote:

> Isn't there a theoretical problem with scale_load() on CONFIG_64BIT
> machines on tip/sched/core? load.weight has a higher resolution than
> runnable_load_avg (and than the values in the rq->cpu_load[] array).
> It's only theoretical because [forkexec|wake]_idx is 0, so
> [target|source]_load() is nothing else than weighted_cpuload().

I see a not-so-theoretical problem with my rfc in that I forgot to
scale_load_down(), if that's what you mean.

(changes nothing, reality was just extra special unadulterated;)

	-Mike  


* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing
  2016-06-16  3:33           ` Mike Galbraith
@ 2016-06-16  9:01             ` Dietmar Eggemann
  0 siblings, 0 replies; 22+ messages in thread
From: Dietmar Eggemann @ 2016-06-16  9:01 UTC (permalink / raw)
  To: Mike Galbraith, Peter Zijlstra; +Cc: Yuyang Du, LKML

On 16/06/16 04:33, Mike Galbraith wrote:
> On Wed, 2016-06-15 at 20:03 +0100, Dietmar Eggemann wrote:
> 
>> Isn't there a theoretical problem with scale_load() on CONFIG_64BIT
>> machines on tip/sched/core? load.weight has a higher resolution than
>> runnable_load_avg (and than the values in the rq->cpu_load[] array).
>> It's only theoretical because [forkexec|wake]_idx is 0, so
>> [target|source]_load() is nothing else than weighted_cpuload().
> 
> I see a not-so-theoretical problem with my rfc in that I forgot to
> scale_load_down(), if that's what you mean.

Yup. Theoretical in the sense that this_load and min_load will both be
affected the same way as long as load_idx == 0.

> 
> (changes nothing, reality was just extra special unadulterated;)

Agreed.

> 
> 	-Mike  
> 


* [patch] sched/fair: Use instantaneous load in wakeup paths
  2016-06-15  7:01   ` Mike Galbraith
@ 2016-06-16 11:46     ` Mike Galbraith
  2016-06-16 12:04       ` Mike Galbraith
  0 siblings, 1 reply; 22+ messages in thread
From: Mike Galbraith @ 2016-06-16 11:46 UTC (permalink / raw)
  To: Yuyang Du; +Cc: Peter Zijlstra, LKML

On Wed, 2016-06-15 at 09:01 +0200, Mike Galbraith wrote:
> On Wed, 2016-06-15 at 06:42 +0800, Yuyang Du wrote:
> 
> > I am entirely for giving it a "clear unadulterated reality", and
> > even more for making it an option.
> > 
> > Reviewed-by: Yuyang Du <yuyang.du@intel.com>
> 
> Thanks.  I'll have a look at perhaps having wake_affine do the same,
> such that there is a clean separation of wake/LB paths.

Something like so perhaps.  I turned it on and whacked 'rfc' to try to
attract a robot.   Yoohoo, robo thingy...

sched/fair: Use instantaneous load in wakeup paths

Using load averages is not optimal for some loads; the fuzzy view of
load can/will lead to more stacking of tasks than is good for loads
that are all about latency, as the hackbench numbers below demonstrate.
Give the user an instantaneous load switch for wakeup paths, perhaps
eliminating it in future if benchmarks don't gripe.

time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done'

real    0m55.397s
user    0m8.320s
sys     5m40.789s

echo LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features

real    0m48.049s
user    0m6.510s
sys     5m6.291s

Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
---
 kernel/sched/fair.c     |  116 ++++++++++++++++++++++++++++++------------------
 kernel/sched/features.h |    1 
 kernel/sched/sched.h    |    8 +++
 3 files changed, 83 insertions(+), 42 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -735,7 +735,8 @@ void post_init_entity_util_avg(struct sc
 	}
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+static inline unsigned long cfs_rq_runnable_load(struct cfs_rq *cfs_rq,
+						 enum load_type type);
 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
 #else
 void init_entity_runnable_average(struct sched_entity *se)
@@ -1226,9 +1227,9 @@ bool should_numa_migrate_memory(struct t
 	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
-static unsigned long weighted_cpuload(const int cpu);
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
+static unsigned long cpu_load(const int cpu, enum load_type type);
+static unsigned long source_load(int cpu, int load_idx, enum load_type type);
+static unsigned long target_load(int cpu, int load_idx, enum load_type type);
 static unsigned long capacity_of(int cpu);
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
 
@@ -1258,7 +1259,7 @@ static void update_numa_stats(struct num
 		struct rq *rq = cpu_rq(cpu);
 
 		ns->nr_running += rq->nr_running;
-		ns->load += weighted_cpuload(cpu);
+		ns->load += cpu_load(cpu, LOAD_WEIGHTED);
 		ns->compute_capacity += capacity_of(cpu);
 
 		cpus++;
@@ -3085,8 +3086,11 @@ void remove_entity_load_avg(struct sched
 	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
+static inline unsigned long cfs_rq_runnable_load(struct cfs_rq *cfs_rq,
+						 enum load_type type)
 {
+	if (type == LOAD_INSTANTANEOUS)
+		return scale_load_down(cfs_rq->load.weight);
 	return cfs_rq->runnable_load_avg;
 }
 
@@ -4679,9 +4683,9 @@ static void cpu_load_update(struct rq *t
 }
 
 /* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long cpu_load(const int cpu, enum load_type type)
 {
-	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+	return cfs_rq_runnable_load(&cpu_rq(cpu)->cfs, type);
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -4726,7 +4730,7 @@ static void cpu_load_update_idle(struct
 	/*
 	 * bail if there's load or we're actually up-to-date.
 	 */
-	if (weighted_cpuload(cpu_of(this_rq)))
+	if (cpu_load(cpu_of(this_rq), LOAD_WEIGHTED))
 		return;
 
 	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -4743,11 +4747,11 @@ void cpu_load_update_nohz_start(void)
 	struct rq *this_rq = this_rq();
 
 	/*
-	 * This is all lockless but should be fine. If weighted_cpuload changes
+	 * This is all lockless but should be fine. If weighted cpu load changes
 	 * concurrently we'll exit nohz. And cpu_load write can race with
 	 * cpu_load_update_idle() but both updater would be writing the same.
 	 */
-	this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+	this_rq->cpu_load[0] = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED);
 }
 
 /*
@@ -4762,7 +4766,7 @@ void cpu_load_update_nohz_stop(void)
 	if (curr_jiffies == this_rq->last_load_update_tick)
 		return;
 
-	load = weighted_cpuload(cpu_of(this_rq));
+	load = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED);
 	raw_spin_lock(&this_rq->lock);
 	update_rq_clock(this_rq);
 	cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -4788,7 +4792,7 @@ static void cpu_load_update_periodic(str
  */
 void cpu_load_update_active(struct rq *this_rq)
 {
-	unsigned long load = weighted_cpuload(cpu_of(this_rq));
+	unsigned long load = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED);
 
 	if (tick_nohz_tick_stopped())
 		cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -4803,30 +4807,30 @@ void cpu_load_update_active(struct rq *t
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static unsigned long source_load(int cpu, int type)
+static unsigned long source_load(int cpu, int load_idx, enum load_type type)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
+	unsigned long total = cpu_load(cpu, type);
 
-	if (type == 0 || !sched_feat(LB_BIAS))
+	if (load_idx == 0 || !sched_feat(LB_BIAS))
 		return total;
 
-	return min(rq->cpu_load[type-1], total);
+	return min(rq->cpu_load[load_idx-1], total);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu weighted
  * according to the scheduling class and "nice" value.
  */
-static unsigned long target_load(int cpu, int type)
+static unsigned long target_load(int cpu, int load_idx, enum load_type type)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
+	unsigned long total = cpu_load(cpu, type);
 
-	if (type == 0 || !sched_feat(LB_BIAS))
+	if (load_idx == 0 || !sched_feat(LB_BIAS))
 		return total;
 
-	return max(rq->cpu_load[type-1], total);
+	return max(rq->cpu_load[load_idx-1], total);
 }
 
 static unsigned long capacity_of(int cpu)
@@ -4843,7 +4847,7 @@ static unsigned long cpu_avg_load_per_ta
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
-	unsigned long load_avg = weighted_cpuload(cpu);
+	unsigned long load_avg = cpu_load(cpu, LOAD_WEIGHTED);
 
 	if (nr_running)
 		return load_avg / nr_running;
@@ -5013,7 +5017,15 @@ static int wake_wide(struct task_struct
 	return 1;
 }
 
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static unsigned long task_load(struct task_struct *p, int weighted)
+{
+	if (!weighted)
+		return scale_load_down(p->se.load.weight);
+	return p->se.avg.load_avg;
+}
+
+static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync,
+		       int weighted)
 {
 	s64 this_load, load;
 	s64 this_eff_load, prev_eff_load;
@@ -5025,8 +5037,8 @@ static int wake_affine(struct sched_doma
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
 	prev_cpu  = task_cpu(p);
-	load	  = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
+	load	  = source_load(prev_cpu, idx, weighted);
+	this_load = target_load(this_cpu, idx, weighted);
 
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
@@ -5035,14 +5047,14 @@ static int wake_affine(struct sched_doma
 	 */
 	if (sync) {
 		tg = task_group(current);
-		weight = current->se.avg.load_avg;
+		weight = task_load(current, weighted);
 
 		this_load += effective_load(tg, this_cpu, -weight, -weight);
 		load += effective_load(tg, prev_cpu, 0, -weight);
 	}
 
 	tg = task_group(p);
-	weight = p->se.avg.load_avg;
+	weight = task_load(p, weighted);
 
 	/*
 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
@@ -5085,7 +5097,7 @@ static int wake_affine(struct sched_doma
  */
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-		  int this_cpu, int sd_flag)
+		  int this_cpu, int sd_flag, enum load_type type)
 {
 	struct sched_group *idlest = NULL, *group = sd->groups;
 	unsigned long min_load = ULONG_MAX, this_load = 0;
@@ -5114,9 +5126,9 @@ find_idlest_group(struct sched_domain *s
 		for_each_cpu(i, sched_group_cpus(group)) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = source_load(i, load_idx);
+				load = source_load(i, load_idx, type);
 			else
-				load = target_load(i, load_idx);
+				load = target_load(i, load_idx, type);
 
 			avg_load += load;
 		}
@@ -5141,7 +5153,8 @@ find_idlest_group(struct sched_domain *s
  * find_idlest_cpu - find the idlest cpu among the cpus in group.
  */
 static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
+		enum load_type type)
 {
 	unsigned long load, min_load = ULONG_MAX;
 	unsigned int min_exit_latency = UINT_MAX;
@@ -5175,7 +5188,7 @@ find_idlest_cpu(struct sched_group *grou
 				shallowest_idle_cpu = i;
 			}
 		} else if (shallowest_idle_cpu == -1) {
-			load = weighted_cpuload(i);
+			load = cpu_load(i, type);
 			if (load < min_load || (load == min_load && i == this_cpu)) {
 				min_load = load;
 				least_loaded_cpu = i;
@@ -5282,6 +5295,24 @@ static int cpu_util(int cpu)
 	return (util >= capacity) ? capacity : util;
 }
 
+static enum load_type wakeup_load_type(struct task_struct *p)
+{
+	if (!sched_feat(WAKE_INSTANTANEOUS_LOAD))
+		return LOAD_WEIGHTED;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/*
+	 * Group scheduling unconditionally uses average load.  Use of
+	 * instantaneous load is all about loads that live or die in
+	 * the here and now, to which cgroups are fundamentally toxic.
+	 */
+	if (task_group(p)->parent)
+		return LOAD_WEIGHTED;
+#endif
+
+	return LOAD_INSTANTANEOUS;
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -5302,6 +5333,7 @@ select_task_rq_fair(struct task_struct *
 	int new_cpu = prev_cpu;
 	int want_affine = 0;
 	int sync = wake_flags & WF_SYNC;
+	enum load_type type = wakeup_load_type(p);
 
 	if (sd_flag & SD_BALANCE_WAKE) {
 		record_wakee(p);
@@ -5331,7 +5363,7 @@ select_task_rq_fair(struct task_struct *
 
 	if (affine_sd) {
 		sd = NULL; /* Prefer wake_affine over balance flags */
-		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync, type))
 			new_cpu = cpu;
 	}
 
@@ -5348,13 +5380,13 @@ select_task_rq_fair(struct task_struct *
 			continue;
 		}
 
-		group = find_idlest_group(sd, p, cpu, sd_flag);
+		group = find_idlest_group(sd, p, cpu, sd_flag, type);
 		if (!group) {
 			sd = sd->child;
 			continue;
 		}
 
-		new_cpu = find_idlest_cpu(group, p, cpu);
+		new_cpu = find_idlest_cpu(group, p, cpu, type);
 		if (new_cpu == -1 || new_cpu == cpu) {
 			/* Now try balancing at a lower domain level of cpu */
 			sd = sd->child;
@@ -6710,9 +6742,9 @@ static inline void update_sg_lb_stats(st
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
-			load = target_load(i, load_idx);
+			load = target_load(i, load_idx, LOAD_WEIGHTED);
 		else
-			load = source_load(i, load_idx);
+			load = source_load(i, load_idx, LOAD_WEIGHTED);
 
 		sgs->group_load += load;
 		sgs->group_util += cpu_util(i);
@@ -6726,7 +6758,7 @@ static inline void update_sg_lb_stats(st
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
-		sgs->sum_weighted_load += weighted_cpuload(i);
+		sgs->sum_weighted_load += cpu_load(i, LOAD_WEIGHTED);
 		/*
 		 * No need to call idle_cpu() if nr_running is not 0
 		 */
@@ -7238,11 +7270,11 @@ static struct rq *find_busiest_queue(str
 
 		capacity = capacity_of(i);
 
-		wl = weighted_cpuload(i);
+		wl = cpu_load(i, LOAD_WEIGHTED);
 
 		/*
-		 * When comparing with imbalance, use weighted_cpuload()
-		 * which is not scaled with the cpu capacity.
+		 * When comparing with imbalance, use cpu_load() which is
+		 * not scaled with the cpu capacity.
 		 */
 
 		if (rq->nr_running == 1 && wl > env->imbalance &&
@@ -7251,7 +7283,7 @@ static struct rq *find_busiest_queue(str
 
 		/*
 		 * For the load comparisons with the other cpu's, consider
-		 * the weighted_cpuload() scaled with the cpu capacity, so
+		 * the weighted cpu load scaled with the cpu capacity, so
 		 * that the load can be moved away from the cpu that is
 		 * potentially running at a lower capacity.
 		 *
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -39,6 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
 SCHED_FEAT(HRTICK, false)
 SCHED_FEAT(DOUBLE_TICK, false)
 SCHED_FEAT(LB_BIAS, true)
+SCHED_FEAT(WAKE_INSTANTANEOUS_LOAD, true)
 
 /*
  * Decrement CPU capacity based on time not spent running tasks
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1631,6 +1631,14 @@ static inline void double_rq_unlock(stru
 		__release(rq2->lock);
 }
 
+/*
+ * Tell load balancing functions whether we want instantaneous or average load
+ */
+enum load_type {
+	LOAD_INSTANTANEOUS,
+	LOAD_WEIGHTED,
+};
+
 #else /* CONFIG_SMP */
 
 /*


* [patch] sched/fair: Use instantaneous load in wakeup paths
  2016-06-16 11:46     ` [patch] sched/fair: Use instantaneous load in wakeup paths Mike Galbraith
@ 2016-06-16 12:04       ` Mike Galbraith
  2016-06-16 12:41         ` Mike Galbraith
  0 siblings, 1 reply; 22+ messages in thread
From: Mike Galbraith @ 2016-06-16 12:04 UTC (permalink / raw)
  To: Yuyang Du; +Cc: Peter Zijlstra, LKML

On Thu, 2016-06-16 at 13:46 +0200, Mike Galbraith wrote:
> On Wed, 2016-06-15 at 09:01 +0200, Mike Galbraith wrote:
> > On Wed, 2016-06-15 at 06:42 +0800, Yuyang Du wrote:
> > 
> > > I am entirely for giving it a "clear unadulterated reality", and
> > > even more for making it an option.
> > > 
> > > Reviewed-by: Yuyang Du <yuyang.du@intel.com>
> > 
> > Thanks.  I'll have a look at perhaps having wake_affine do the
> > same, such that there is a clean separation of wake/LB paths.
> 
> Something like so perhaps.  I turned it on and whacked 'rfc' to try to
> attract a robot.   Yoohoo, robo thingy...

(stealthily inserts refreshed one)

sched/fair: Use instantaneous load in wakeup paths

Using load averages is not optimal for some loads; the fuzzy view of
load can/will lead to more stacking of tasks than is good for loads
that are all about latency, as the hackbench numbers below demonstrate.
Give the user an instantaneous load switch for wakeup paths, perhaps
eliminating it in future if benchmarks don't gripe.

time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done'

real    0m55.397s
user    0m8.320s
sys     5m40.789s

echo LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features

real    0m48.049s
user    0m6.510s
sys     5m6.291s

Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
---
 kernel/sched/fair.c     |  116 ++++++++++++++++++++++++++++++------------------
 kernel/sched/features.h |    1 
 kernel/sched/sched.h    |    8 +++
 3 files changed, 83 insertions(+), 42 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -735,7 +735,8 @@ void post_init_entity_util_avg(struct sc
 	}
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+static inline unsigned long cfs_rq_runnable_load(struct cfs_rq *cfs_rq,
+						 enum load_type type);
 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
 #else
 void init_entity_runnable_average(struct sched_entity *se)
@@ -1226,9 +1227,9 @@ bool should_numa_migrate_memory(struct t
 	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
-static unsigned long weighted_cpuload(const int cpu);
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
+static unsigned long cpu_load(const int cpu, enum load_type type);
+static unsigned long source_load(int cpu, int load_idx, enum load_type type);
+static unsigned long target_load(int cpu, int load_idx, enum load_type type);
 static unsigned long capacity_of(int cpu);
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
 
@@ -1258,7 +1259,7 @@ static void update_numa_stats(struct num
 		struct rq *rq = cpu_rq(cpu);
 
 		ns->nr_running += rq->nr_running;
-		ns->load += weighted_cpuload(cpu);
+		ns->load += cpu_load(cpu, LOAD_WEIGHTED);
 		ns->compute_capacity += capacity_of(cpu);
 
 		cpus++;
@@ -3085,8 +3086,11 @@ void remove_entity_load_avg(struct sched
 	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
+static inline unsigned long cfs_rq_runnable_load(struct cfs_rq *cfs_rq,
+						 enum load_type type)
 {
+	if (type == LOAD_INSTANTANEOUS)
+		return scale_load_down(cfs_rq->load.weight);
 	return cfs_rq->runnable_load_avg;
 }
 
@@ -4679,9 +4683,9 @@ static void cpu_load_update(struct rq *t
 }
 
 /* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long cpu_load(const int cpu, enum load_type type)
 {
-	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+	return cfs_rq_runnable_load(&cpu_rq(cpu)->cfs, type);
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -4726,7 +4730,7 @@ static void cpu_load_update_idle(struct
 	/*
 	 * bail if there's load or we're actually up-to-date.
 	 */
-	if (weighted_cpuload(cpu_of(this_rq)))
+	if (cpu_load(cpu_of(this_rq), LOAD_WEIGHTED))
 		return;
 
 	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -4743,11 +4747,11 @@ void cpu_load_update_nohz_start(void)
 	struct rq *this_rq = this_rq();
 
 	/*
-	 * This is all lockless but should be fine. If weighted_cpuload changes
+	 * This is all lockless but should be fine. If weighted cpu load changes
 	 * concurrently we'll exit nohz. And cpu_load write can race with
 	 * cpu_load_update_idle() but both updater would be writing the same.
 	 */
-	this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+	this_rq->cpu_load[0] = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED);
 }
 
 /*
@@ -4762,7 +4766,7 @@ void cpu_load_update_nohz_stop(void)
 	if (curr_jiffies == this_rq->last_load_update_tick)
 		return;
 
-	load = weighted_cpuload(cpu_of(this_rq));
+	load = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED);
 	raw_spin_lock(&this_rq->lock);
 	update_rq_clock(this_rq);
 	cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -4788,7 +4792,7 @@ static void cpu_load_update_periodic(str
  */
 void cpu_load_update_active(struct rq *this_rq)
 {
-	unsigned long load = weighted_cpuload(cpu_of(this_rq));
+	unsigned long load = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED);
 
 	if (tick_nohz_tick_stopped())
 		cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -4803,30 +4807,30 @@ void cpu_load_update_active(struct rq *t
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static unsigned long source_load(int cpu, int type)
+static unsigned long source_load(int cpu, int load_idx, enum load_type type)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
+	unsigned long total = cpu_load(cpu, type);
 
-	if (type == 0 || !sched_feat(LB_BIAS))
+	if (load_idx == 0 || !sched_feat(LB_BIAS))
 		return total;
 
-	return min(rq->cpu_load[type-1], total);
+	return min(rq->cpu_load[load_idx-1], total);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu weighted
  * according to the scheduling class and "nice" value.
  */
-static unsigned long target_load(int cpu, int type)
+static unsigned long target_load(int cpu, int load_idx, enum load_type type)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
+	unsigned long total = cpu_load(cpu, type);
 
-	if (type == 0 || !sched_feat(LB_BIAS))
+	if (load_idx == 0 || !sched_feat(LB_BIAS))
 		return total;
 
-	return max(rq->cpu_load[type-1], total);
+	return max(rq->cpu_load[load_idx-1], total);
 }
 
 static unsigned long capacity_of(int cpu)
@@ -4843,7 +4847,7 @@ static unsigned long cpu_avg_load_per_ta
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
-	unsigned long load_avg = weighted_cpuload(cpu);
+	unsigned long load_avg = cpu_load(cpu, LOAD_WEIGHTED);
 
 	if (nr_running)
 		return load_avg / nr_running;
@@ -5013,7 +5017,15 @@ static int wake_wide(struct task_struct
 	return 1;
 }
 
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static unsigned long task_load(struct task_struct *p, enum load_type type)
+{
+	if (type == LOAD_INSTANTANEOUS)
+		return scale_load_down(p->se.load.weight);
+	return p->se.avg.load_avg;
+}
+
+static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync,
+		       enum load_type type)
 {
 	s64 this_load, load;
 	s64 this_eff_load, prev_eff_load;
@@ -5025,8 +5037,8 @@ static int wake_affine(struct sched_doma
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
 	prev_cpu  = task_cpu(p);
-	load	  = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
+	load	  = source_load(prev_cpu, idx, type);
+	this_load = target_load(this_cpu, idx, type);
 
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
@@ -5035,14 +5047,14 @@ static int wake_affine(struct sched_doma
 	 */
 	if (sync) {
 		tg = task_group(current);
-		weight = current->se.avg.load_avg;
+		weight = task_load(current, type);
 
 		this_load += effective_load(tg, this_cpu, -weight, -weight);
 		load += effective_load(tg, prev_cpu, 0, -weight);
 	}
 
 	tg = task_group(p);
-	weight = p->se.avg.load_avg;
+	weight = task_load(p, type);
 
 	/*
 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
@@ -5085,7 +5097,7 @@ static int wake_affine(struct sched_doma
  */
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-		  int this_cpu, int sd_flag)
+		  int this_cpu, int sd_flag, enum load_type type)
 {
 	struct sched_group *idlest = NULL, *group = sd->groups;
 	unsigned long min_load = ULONG_MAX, this_load = 0;
@@ -5114,9 +5126,9 @@ find_idlest_group(struct sched_domain *s
 		for_each_cpu(i, sched_group_cpus(group)) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = source_load(i, load_idx);
+				load = source_load(i, load_idx, type);
 			else
-				load = target_load(i, load_idx);
+				load = target_load(i, load_idx, type);
 
 			avg_load += load;
 		}
@@ -5141,7 +5153,8 @@ find_idlest_group(struct sched_domain *s
  * find_idlest_cpu - find the idlest cpu among the cpus in group.
  */
 static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
+		enum load_type type)
 {
 	unsigned long load, min_load = ULONG_MAX;
 	unsigned int min_exit_latency = UINT_MAX;
@@ -5175,7 +5188,7 @@ find_idlest_cpu(struct sched_group *grou
 				shallowest_idle_cpu = i;
 			}
 		} else if (shallowest_idle_cpu == -1) {
-			load = weighted_cpuload(i);
+			load = cpu_load(i, type);
 			if (load < min_load || (load == min_load && i == this_cpu)) {
 				min_load = load;
 				least_loaded_cpu = i;
@@ -5282,6 +5295,24 @@ static int cpu_util(int cpu)
 	return (util >= capacity) ? capacity : util;
 }
 
+static enum load_type wakeup_load_type(struct task_struct *p)
+{
+	if (!sched_feat(WAKE_INSTANTANEOUS_LOAD))
+		return LOAD_WEIGHTED;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/*
+	 * Group scheduling unconditionally uses average load.  Use of
+	 * instantaneous load is all about loads that live or die in
+	 * the here and now, to which cgroups are fundamentally toxic.
+	 */
+	if (task_group(p)->parent)
+		return LOAD_WEIGHTED;
+#endif
+
+	return LOAD_INSTANTANEOUS;
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -5302,6 +5333,7 @@ select_task_rq_fair(struct task_struct *
 	int new_cpu = prev_cpu;
 	int want_affine = 0;
 	int sync = wake_flags & WF_SYNC;
+	enum load_type type = wakeup_load_type(p);
 
 	if (sd_flag & SD_BALANCE_WAKE) {
 		record_wakee(p);
@@ -5331,7 +5363,7 @@ select_task_rq_fair(struct task_struct *
 
 	if (affine_sd) {
 		sd = NULL; /* Prefer wake_affine over balance flags */
-		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync, type))
 			new_cpu = cpu;
 	}
 
@@ -5348,13 +5380,13 @@ select_task_rq_fair(struct task_struct *
 			continue;
 		}
 
-		group = find_idlest_group(sd, p, cpu, sd_flag);
+		group = find_idlest_group(sd, p, cpu, sd_flag, type);
 		if (!group) {
 			sd = sd->child;
 			continue;
 		}
 
-		new_cpu = find_idlest_cpu(group, p, cpu);
+		new_cpu = find_idlest_cpu(group, p, cpu, type);
 		if (new_cpu == -1 || new_cpu == cpu) {
 			/* Now try balancing at a lower domain level of cpu */
 			sd = sd->child;
@@ -6710,9 +6742,9 @@ static inline void update_sg_lb_stats(st
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
-			load = target_load(i, load_idx);
+			load = target_load(i, load_idx, LOAD_WEIGHTED);
 		else
-			load = source_load(i, load_idx);
+			load = source_load(i, load_idx, LOAD_WEIGHTED);
 
 		sgs->group_load += load;
 		sgs->group_util += cpu_util(i);
@@ -6726,7 +6758,7 @@ static inline void update_sg_lb_stats(st
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
-		sgs->sum_weighted_load += weighted_cpuload(i);
+		sgs->sum_weighted_load += cpu_load(i, LOAD_WEIGHTED);
 		/*
 		 * No need to call idle_cpu() if nr_running is not 0
 		 */
@@ -7238,11 +7270,11 @@ static struct rq *find_busiest_queue(str
 
 		capacity = capacity_of(i);
 
-		wl = weighted_cpuload(i);
+		wl = cpu_load(i, LOAD_WEIGHTED);
 
 		/*
-		 * When comparing with imbalance, use weighted_cpuload()
-		 * which is not scaled with the cpu capacity.
+		 * When comparing with imbalance, use cpu_load() which is
+		 * not scaled with the cpu capacity.
 		 */
 
 		if (rq->nr_running == 1 && wl > env->imbalance &&
@@ -7251,7 +7283,7 @@ static struct rq *find_busiest_queue(str
 
 		/*
 		 * For the load comparisons with the other cpu's, consider
-		 * the weighted_cpuload() scaled with the cpu capacity, so
+		 * the weighted cpu load scaled with the cpu capacity, so
 		 * that the load can be moved away from the cpu that is
 		 * potentially running at a lower capacity.
 		 *
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -39,6 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
 SCHED_FEAT(HRTICK, false)
 SCHED_FEAT(DOUBLE_TICK, false)
 SCHED_FEAT(LB_BIAS, true)
+SCHED_FEAT(WAKE_INSTANTANEOUS_LOAD, true)
 
 /*
  * Decrement CPU capacity based on time not spent running tasks
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1631,6 +1631,14 @@ static inline void double_rq_unlock(stru
 		__release(rq2->lock);
 }
 
+/*
+ * Tell load balancing functions whether we want instantaneous or average load
+ */
+enum load_type {
+	LOAD_INSTANTANEOUS,
+	LOAD_WEIGHTED,
+};
+
 #else /* CONFIG_SMP */
 
 /*


* [patch] sched/fair: Use instantaneous load in wakeup paths
  2016-06-16 12:04       ` Mike Galbraith
@ 2016-06-16 12:41         ` Mike Galbraith
  2016-06-17  6:21           ` Mike Galbraith
  0 siblings, 1 reply; 22+ messages in thread
From: Mike Galbraith @ 2016-06-16 12:41 UTC (permalink / raw)
  To: Yuyang Du; +Cc: Peter Zijlstra, LKML

On Thu, 2016-06-16 at 14:04 +0200, Mike Galbraith wrote:
> On Thu, 2016-06-16 at 13:46 +0200, Mike Galbraith wrote:
> > On Wed, 2016-06-15 at 09:01 +0200, Mike Galbraith wrote:
> > > On Wed, 2016-06-15 at 06:42 +0800, Yuyang Du wrote:
> > > 
> > > > I am entirely for giving it a "clear unadulterated reality",
> > > > and even more for making it an option.
> > > > 
> > > > Reviewed-by: Yuyang Du <yuyang.du@intel.com>
> > > 
> > > Thanks.  I'll have a look at perhaps having wake_affine do the
> > > same, such that there is a clean separation of wake/LB paths.
> > 
> > Something like so perhaps.  I turned it on and whacked 'rfc' to try
> > to
> > attract a robot.   Yoohoo, robo thingy...
> 
> (stealthily inserts refreshed one)

(grr.. I give up on today)

sched/fair: Use instantaneous load in wakeup paths

Using load averages is not optimal for some loads; the fuzzy view of
load can/will lead to more stacking of tasks than is good for loads
that are all about latency, as the hackbench numbers below demonstrate.
Give the user an instantaneous load switch for wakeup paths, perhaps
eliminating it in future if benchmarks don't gripe.

time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done'

real    0m55.397s
user    0m8.320s
sys     5m40.789s

echo LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features

real    0m48.049s
user    0m6.510s
sys     5m6.291s

Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
---
 kernel/sched/fair.c     |  116 ++++++++++++++++++++++++++++++------------------
 kernel/sched/features.h |    1 
 kernel/sched/sched.h    |    8 +++
 3 files changed, 83 insertions(+), 42 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -735,7 +735,8 @@ void post_init_entity_util_avg(struct sc
 	}
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+static inline unsigned long cfs_rq_runnable_load(struct cfs_rq *cfs_rq,
+						 enum load_type type);
 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
 #else
 void init_entity_runnable_average(struct sched_entity *se)
@@ -1226,9 +1227,9 @@ bool should_numa_migrate_memory(struct t
 	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
-static unsigned long weighted_cpuload(const int cpu);
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
+static unsigned long cpu_load(const int cpu, enum load_type type);
+static unsigned long source_load(int cpu, int load_idx, enum load_type type);
+static unsigned long target_load(int cpu, int load_idx, enum load_type type);
 static unsigned long capacity_of(int cpu);
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
 
@@ -1258,7 +1259,7 @@ static void update_numa_stats(struct num
 		struct rq *rq = cpu_rq(cpu);
 
 		ns->nr_running += rq->nr_running;
-		ns->load += weighted_cpuload(cpu);
+		ns->load += cpu_load(cpu, LOAD_WEIGHTED);
 		ns->compute_capacity += capacity_of(cpu);
 
 		cpus++;
@@ -3085,8 +3086,11 @@ void remove_entity_load_avg(struct sched
 	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
+static inline unsigned long cfs_rq_runnable_load(struct cfs_rq *cfs_rq,
+						 enum load_type type)
 {
+	if (type == LOAD_INSTANTANEOUS)
+		return scale_load_down(cfs_rq->load.weight);
 	return cfs_rq->runnable_load_avg;
 }
 
@@ -4679,9 +4683,9 @@ static void cpu_load_update(struct rq *t
 }
 
 /* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long cpu_load(const int cpu, enum load_type type)
 {
-	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+	return cfs_rq_runnable_load(&cpu_rq(cpu)->cfs, type);
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -4726,7 +4730,7 @@ static void cpu_load_update_idle(struct
 	/*
 	 * bail if there's load or we're actually up-to-date.
 	 */
-	if (weighted_cpuload(cpu_of(this_rq)))
+	if (cpu_load(cpu_of(this_rq), LOAD_WEIGHTED))
 		return;
 
 	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -4743,11 +4747,11 @@ void cpu_load_update_nohz_start(void)
 	struct rq *this_rq = this_rq();
 
 	/*
-	 * This is all lockless but should be fine. If weighted_cpuload changes
+	 * This is all lockless but should be fine. If weighted cpu load changes
 	 * concurrently we'll exit nohz. And cpu_load write can race with
 	 * cpu_load_update_idle() but both updater would be writing the same.
 	 */
-	this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+	this_rq->cpu_load[0] = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED);
 }
 
 /*
@@ -4762,7 +4766,7 @@ void cpu_load_update_nohz_stop(void)
 	if (curr_jiffies == this_rq->last_load_update_tick)
 		return;
 
-	load = weighted_cpuload(cpu_of(this_rq));
+	load = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED);
 	raw_spin_lock(&this_rq->lock);
 	update_rq_clock(this_rq);
 	cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -4788,7 +4792,7 @@ static void cpu_load_update_periodic(str
  */
 void cpu_load_update_active(struct rq *this_rq)
 {
-	unsigned long load = weighted_cpuload(cpu_of(this_rq));
+	unsigned long load = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED);
 
 	if (tick_nohz_tick_stopped())
 		cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -4803,30 +4807,30 @@ void cpu_load_update_active(struct rq *t
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static unsigned long source_load(int cpu, int type)
+static unsigned long source_load(int cpu, int load_idx, enum load_type type)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
+	unsigned long total = cpu_load(cpu, type);
 
-	if (type == 0 || !sched_feat(LB_BIAS))
+	if (load_idx == 0 || !sched_feat(LB_BIAS))
 		return total;
 
-	return min(rq->cpu_load[type-1], total);
+	return min(rq->cpu_load[load_idx-1], total);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu weighted
  * according to the scheduling class and "nice" value.
  */
-static unsigned long target_load(int cpu, int type)
+static unsigned long target_load(int cpu, int load_idx, enum load_type type)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
+	unsigned long total = cpu_load(cpu, type);
 
-	if (type == 0 || !sched_feat(LB_BIAS))
+	if (load_idx == 0 || !sched_feat(LB_BIAS))
 		return total;
 
-	return max(rq->cpu_load[type-1], total);
+	return max(rq->cpu_load[load_idx-1], total);
 }
 
 static unsigned long capacity_of(int cpu)
@@ -4843,7 +4847,7 @@ static unsigned long cpu_avg_load_per_ta
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
-	unsigned long load_avg = weighted_cpuload(cpu);
+	unsigned long load_avg = cpu_load(cpu, LOAD_WEIGHTED);
 
 	if (nr_running)
 		return load_avg / nr_running;
@@ -5013,7 +5017,15 @@ static int wake_wide(struct task_struct
 	return 1;
 }
 
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static unsigned long task_load(struct task_struct *p, enum load_type type)
+{
+	if (type == LOAD_INSTANTANEOUS)
+		return scale_load_down(p->se.load.weight);
+	return p->se.avg.load_avg;
+}
+
+static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync,
+		       enum load_type type)
 {
 	s64 this_load, load;
 	s64 this_eff_load, prev_eff_load;
@@ -5025,8 +5037,8 @@ static int wake_affine(struct sched_doma
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
 	prev_cpu  = task_cpu(p);
-	load	  = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
+	load	  = source_load(prev_cpu, idx, type);
+	this_load = target_load(this_cpu, idx, type);
 
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
@@ -5035,14 +5047,14 @@ static int wake_affine(struct sched_doma
 	 */
 	if (sync) {
 		tg = task_group(current);
-		weight = current->se.avg.load_avg;
+		weight = task_load(current, type);
 
 		this_load += effective_load(tg, this_cpu, -weight, -weight);
 		load += effective_load(tg, prev_cpu, 0, -weight);
 	}
 
 	tg = task_group(p);
-	weight = p->se.avg.load_avg;
+	weight = task_load(p, type);
 
 	/*
 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
@@ -5085,7 +5097,7 @@ static int wake_affine(struct sched_doma
  */
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-		  int this_cpu, int sd_flag)
+		  int this_cpu, int sd_flag, enum load_type type)
 {
 	struct sched_group *idlest = NULL, *group = sd->groups;
 	unsigned long min_load = ULONG_MAX, this_load = 0;
@@ -5114,9 +5126,9 @@ find_idlest_group(struct sched_domain *s
 		for_each_cpu(i, sched_group_cpus(group)) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = source_load(i, load_idx);
+				load = source_load(i, load_idx, type);
 			else
-				load = target_load(i, load_idx);
+				load = target_load(i, load_idx, type);
 
 			avg_load += load;
 		}
@@ -5141,7 +5153,8 @@ find_idlest_group(struct sched_domain *s
  * find_idlest_cpu - find the idlest cpu among the cpus in group.
  */
 static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
+		enum load_type type)
 {
 	unsigned long load, min_load = ULONG_MAX;
 	unsigned int min_exit_latency = UINT_MAX;
@@ -5175,7 +5188,7 @@ find_idlest_cpu(struct sched_group *grou
 				shallowest_idle_cpu = i;
 			}
 		} else if (shallowest_idle_cpu == -1) {
-			load = weighted_cpuload(i);
+			load = cpu_load(i, type);
 			if (load < min_load || (load == min_load && i == this_cpu)) {
 				min_load = load;
 				least_loaded_cpu = i;
@@ -5282,6 +5295,24 @@ static int cpu_util(int cpu)
 	return (util >= capacity) ? capacity : util;
 }
 
+enum load_type wakeup_load_type(struct task_struct *p)
+{
+	if (!sched_feat(WAKE_INSTANTANEOUS_LOAD))
+		return LOAD_WEIGHTED;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/*
+	 * Group scheduling unconditionally uses average load.  Use of
+	 * instantaneous load is all about loads that live or die in
+	 * the here and now, to which cgroups are fundamentally toxic.
+	 */
+	if (task_group(p)->parent)
+		return LOAD_WEIGHTED;
+#endif
+
+	return LOAD_INSTANTANEOUS;
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -5302,6 +5333,7 @@ select_task_rq_fair(struct task_struct *
 	int new_cpu = prev_cpu;
 	int want_affine = 0;
 	int sync = wake_flags & WF_SYNC;
+	enum load_type type = wakeup_load_type(p);
 
 	if (sd_flag & SD_BALANCE_WAKE) {
 		record_wakee(p);
@@ -5331,7 +5363,7 @@ select_task_rq_fair(struct task_struct *
 
 	if (affine_sd) {
 		sd = NULL; /* Prefer wake_affine over balance flags */
-		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync, type))
 			new_cpu = cpu;
 	}
 
@@ -5348,13 +5380,13 @@ select_task_rq_fair(struct task_struct *
 			continue;
 		}
 
-		group = find_idlest_group(sd, p, cpu, sd_flag);
+		group = find_idlest_group(sd, p, cpu, sd_flag, type);
 		if (!group) {
 			sd = sd->child;
 			continue;
 		}
 
-		new_cpu = find_idlest_cpu(group, p, cpu);
+		new_cpu = find_idlest_cpu(group, p, cpu, type);
 		if (new_cpu == -1 || new_cpu == cpu) {
 			/* Now try balancing at a lower domain level of cpu */
 			sd = sd->child;
@@ -6710,9 +6742,9 @@ static inline void update_sg_lb_stats(st
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
-			load = target_load(i, load_idx);
+			load = target_load(i, load_idx, LOAD_WEIGHTED);
 		else
-			load = source_load(i, load_idx);
+			load = source_load(i, load_idx, LOAD_WEIGHTED);
 
 		sgs->group_load += load;
 		sgs->group_util += cpu_util(i);
@@ -6726,7 +6758,7 @@ static inline void update_sg_lb_stats(st
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
-		sgs->sum_weighted_load += weighted_cpuload(i);
+		sgs->sum_weighted_load += cpu_load(i, LOAD_WEIGHTED);
 		/*
 		 * No need to call idle_cpu() if nr_running is not 0
 		 */
@@ -7238,11 +7270,11 @@ static struct rq *find_busiest_queue(str
 
 		capacity = capacity_of(i);
 
-		wl = weighted_cpuload(i);
+		wl = cpu_load(i, LOAD_WEIGHTED);
 
 		/*
-		 * When comparing with imbalance, use weighted_cpuload()
-		 * which is not scaled with the cpu capacity.
+		 * When comparing with imbalance, use cpu_load() which is
+		 * not scaled with the cpu capacity.
 		 */
 
 		if (rq->nr_running == 1 && wl > env->imbalance &&
@@ -7251,7 +7283,7 @@ static struct rq *find_busiest_queue(str
 
 		/*
 		 * For the load comparisons with the other cpu's, consider
-		 * the weighted_cpuload() scaled with the cpu capacity, so
+		 * the weighted cpu load scaled with the cpu capacity, so
 		 * that the load can be moved away from the cpu that is
 		 * potentially running at a lower capacity.
 		 *
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -39,6 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
 SCHED_FEAT(HRTICK, false)
 SCHED_FEAT(DOUBLE_TICK, false)
 SCHED_FEAT(LB_BIAS, true)
+SCHED_FEAT(WAKE_INSTANTANEOUS_LOAD, true)
 
 /*
  * Decrement CPU capacity based on time not spent running tasks
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1631,6 +1631,14 @@ static inline void double_rq_unlock(stru
 		__release(rq2->lock);
 }
 
+/*
+ * Tell load balancing functions whether we want instantaneous or average load
+ */
+enum load_type {
+	LOAD_INSTANTANEOUS,
+	LOAD_WEIGHTED,
+};
+
 #else /* CONFIG_SMP */
 
 /*

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] sched/fair: Use instantaneous load in wakeup paths
  2016-06-16 12:41         ` Mike Galbraith
@ 2016-06-17  6:21           ` Mike Galbraith
  2016-06-17 10:55             ` Dietmar Eggemann
  0 siblings, 1 reply; 22+ messages in thread
From: Mike Galbraith @ 2016-06-17  6:21 UTC (permalink / raw)
  To: Yuyang Du; +Cc: Peter Zijlstra, LKML

Here are some schbench runs on an 8x8 box to show that longish
run/sleep period corner I mentioned.

vogelweide:~/:[1]# for i in `seq 5`; do schbench -m 8 -t 1 -a -r 10 2>&1 | grep 'threads 8'; done
cputime 30000 threads 8 p99 68
cputime 30000 threads 8 p99 46
cputime 30000 threads 8 p99 46
cputime 30000 threads 8 p99 45
cputime 30000 threads 8 p99 49
vogelweide:~/:[0]# echo NO_WAKE_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features                
vogelweide:~/:[0]# for i in `seq 5`; do schbench -m 8 -t 1 -a -r 10 2>&1 | grep 'threads 8'; done
cputime 30000 threads 8 p99 9968
cputime 30000 threads 8 p99 10224
vogelweide:~/:[0]#

Using instantaneous load, we fill the box every time; without it, we stack
every time.  This was with Peter's select_idle_sibling() rewrite
applied as well, but you can see that it does matter.

That doesn't mean I think my patch should immediately fly upstream
'course, who knows, there may be a less messy way to deal with it, or,
as already stated, maybe it just doesn't matter enough to the real
world to even bother with.

	-Mike

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] sched/fair: Use instantaneous load in wakeup paths
  2016-06-17  6:21           ` Mike Galbraith
@ 2016-06-17 10:55             ` Dietmar Eggemann
  2016-06-17 13:57               ` Mike Galbraith
  0 siblings, 1 reply; 22+ messages in thread
From: Dietmar Eggemann @ 2016-06-17 10:55 UTC (permalink / raw)
  To: Mike Galbraith, Yuyang Du; +Cc: Peter Zijlstra, LKML

On 17/06/16 07:21, Mike Galbraith wrote:
> Here are some schbench runs on an 8x8 box to show that longish
> run/sleep period corner I mentioned.
> 
> vogelweide:~/:[1]# for i in `seq 5`; do schbench -m 8 -t 1 -a -r 10 2>&1 | grep 'threads 8'; done
> cputime 30000 threads 8 p99 68
> cputime 30000 threads 8 p99 46
> cputime 30000 threads 8 p99 46
> cputime 30000 threads 8 p99 45
> cputime 30000 threads 8 p99 49
> vogelweide:~/:[0]# echo NO_WAKE_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features                
> vogelweide:~/:[0]# for i in `seq 5`; do schbench -m 8 -t 1 -a -r 10 2>&1 | grep 'threads 8'; done
> cputime 30000 threads 8 p99 9968
> cputime 30000 threads 8 p99 10224
> vogelweide:~/:[0]#
>

Is this the influence of wake_affine using instantaneous load now too or
did you set SD_BALANCE_WAKE on sd's or both?

> Using instantaneous load, we fill the box every time; without it, we stack
> every time.  This was with Peter's select_idle_sibling() rewrite
> applied as well, but you can see that it does matter.
> 
> That doesn't mean I think my patch should immediately fly upstream
> 'course, who knows, there may be a less messy way to deal with it, or,
> as already stated, maybe it just doesn't matter enough to the real
> world to even bother with.

IMHO, if it were possible to get rid of sd->wake_idx and
sd->forkexec_idx, the implementation would be less messy. Is there
anyone changing these values to something other than the default 0?

> 
> 	-Mike
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch] sched/fair: Use instantaneous load in wakeup paths
  2016-06-17 10:55             ` Dietmar Eggemann
@ 2016-06-17 13:57               ` Mike Galbraith
  0 siblings, 0 replies; 22+ messages in thread
From: Mike Galbraith @ 2016-06-17 13:57 UTC (permalink / raw)
  To: Dietmar Eggemann, Yuyang Du; +Cc: Peter Zijlstra, LKML

On Fri, 2016-06-17 at 11:55 +0100, Dietmar Eggemann wrote:
> On 17/06/16 07:21, Mike Galbraith wrote:
> > Here are some schbench runs on an 8x8 box to show that longish
> > run/sleep period corner I mentioned.
> > 
> > vogelweide:~/:[1]# for i in `seq 5`; do schbench -m 8 -t 1 -a -r 10 2>&1 | grep 'threads 8'; done
> > cputime 30000 threads 8 p99 68
> > cputime 30000 threads 8 p99 46
> > cputime 30000 threads 8 p99 46
> > cputime 30000 threads 8 p99 45
> > cputime 30000 threads 8 p99 49
> > vogelweide:~/:[0]# echo NO_WAKE_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features                
> > vogelweide:~/:[0]# for i in `seq 5`; do schbench -m 8 -t 1 -a -r 10 2>&1 | grep 'threads 8'; done
> > cputime 30000 threads 8 p99 9968
> > cputime 30000 threads 8 p99 10224
> > vogelweide:~/:[0]#
> > 
> 
> Is this the influence of wake_affine using instantaneous load now too or
> did you set SD_BALANCE_WAKE on sd's or both?

It's likely just the fork bits; I didn't turn on SD_BALANCE_WAKE.
> 
> > Using instantaneous load, we fill the box every time; without it, we stack
> > every time.  This was with Peter's select_idle_sibling() rewrite
> > applied as well, but you can see that it does matter.
> > 
> > That doesn't mean I think my patch should immediately fly upstream
> > 'course, who knows, there may be a less messy way to deal with it, or,
> > as already stated, maybe it just doesn't matter enough to the real
> > world to even bother with.
> 
> IMHO, if it were possible to get rid of sd->wake_idx and
> sd->forkexec_idx, the implementation would be less messy. Is there
> anyone changing these values to something other than the default 0?

Dunno.

Doesn't matter much until we answer the question: are the numbers we're
using good enough, or are they not?  Hackbench and schbench say we can
certainly distribute load better by looking at the real deal instead of
a ball of fuzz (a scheduler dust monster;), but how long have we been
doing that, and how many real world complaints do we have?

The schbench thing is based on a real world load, but the real world
complaint isn't the fork distribution thing that schbench demonstrates;
that's a periodic load corner, not the "we're waking to busy CPUs while
there are idle CPUs available" issue that Facebook is griping about.  So we
have zero real world complaints; we have hackbench moving because the
ball of fuzz got reshaped, and we have the bumpy spot that schbench
hits with or without the bugfix that caused hackbench to twitch.

	-Mike

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing
  2016-06-15 15:32     ` Dietmar Eggemann
  2016-06-15 16:03       ` Mike Galbraith
@ 2016-07-04 15:04       ` Matt Fleming
  2016-07-04 17:43         ` Mike Galbraith
  2016-07-11  8:58         ` Dietmar Eggemann
  1 sibling, 2 replies; 22+ messages in thread
From: Matt Fleming @ 2016-07-04 15:04 UTC (permalink / raw)
  To: Dietmar Eggemann
  Cc: Mike Galbraith, Peter Zijlstra, Yuyang Du, LKML, Mel Gorman

On Wed, 15 Jun, at 04:32:58PM, Dietmar Eggemann wrote:
> On 14/06/16 17:40, Mike Galbraith wrote:
> > On Tue, 2016-06-14 at 15:14 +0100, Dietmar Eggemann wrote:
> > 
> >> IMHO, the hackbench performance "boost" w/o 0905f04eb21f is due to the
> >> fact that a new task gets all its load decayed (making it a small task)
> >> in the __update_load_avg() call in remove_entity_load_avg() because its
> >> se->avg.last_update_time value is 0 which creates a huge time difference
> >> comparing it to cfs_rq->avg.last_update_time. The patch 0905f04eb21f
> >> avoids this and thus the task stays big se->avg.load_avg = 1024.
> > 
> > I don't care much at all about the hackbench "regression" in its own
> > right, and what causes it, for me, bottom line is that there are cases
> > where we need to be able to resolve, and can't, simply because we're
> > looking at a fuzzy (rippling) reflection.
> 
> Understood. I just thought it would be nice to know why 0905f04eb21f
> makes this problem even more visible. But so far I wasn't able to figure
> out why this diff in se->avg.load_avg [1024 versus 0] has this effect on
> cfs_rq->runnable_load_avg making it even less suitable in find idlest*.
> enqueue_entity_load_avg()'s cfs_rq->runnable_load_* += sa->load_* looks
> suspicious though.

In my testing without 0905f04eb21f I saw that se->avg.load_avg
actually managed to skip being decayed at all before the task was
dequeued, which meant that cfs_rq->runnable_load_avg was more likely
to be zero after dequeue, for those workloads like hackbench that
essentially are just a fork bomb.

se->avg.load_avg evaded decay because se->avg.period_contrib was being
zero'd in __update_load_avg().

With 0905f04eb21f applied, it's less likely (though not impossible)
that ->period_contrib will be zero'd and so we usually end up with
some residual load in cfs_rq->runnable_load_avg on dequeue, and hence,

	cfs_rq->runnable_load_avg > se->avg.load_avg

even if 'se' is the only task on the runqueue.
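
To make the condition concrete, here is a minimal sketch of what that
means at dequeue time (dbg_check_residual() is a hypothetical helper
named purely for illustration, not part of any patch in this thread; it
assumes it sits in kernel/sched/fair.c where struct cfs_rq is visible):

/*
 * Hypothetical debug helper: after the last entity has been dequeued,
 * the runqueue can still advertise a non-zero runnable load until the
 * average decays.
 */
static inline void dbg_check_residual(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq->nr_running && cfs_rq->runnable_load_avg)
		trace_printk("idle cfs_rq, residual runnable_load_avg=%lu\n",
			     cfs_rq->runnable_load_avg);
}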

FYI, below is my quick and dirty hack that restored hackbench
performance for the few machines I checked. I didn't try schbench with
it.

---

From 4e9856ea3dc56e356195ca035dab7302754ce59b Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt@codeblueprint.co.uk>
Date: Thu, 9 Jun 2016 19:48:14 +0100
Subject: [PATCH] sched/fair: Reset ::runnable_load_avg when dequeueing last
 entity

The task and runqueue load averages maintained in p->se.avg.load_avg
and cfs_rq->runnable_load_avg respectively, can decay at different
wall clock rates, which means that enqueueing and then dequeueing a
task on an otherwise empty runqueue doesn't always leave
::runnable_load_avg with its initial value.

This can lead to the situation where cfs_rq->runnable_load_avg has a
non-zero value even though there are no runnable entities on the
runqueue. Assuming no entity is enqueued on this runqueue for some
time this residual load average will decay gradually as the load
averages are updated.

But we can optimise the special case of dequeueing the last entity and
reset ::runnable_load_avg early, which gives a performance improvement
to workloads that trigger the load balancer, such as fork-heavy
applications when SD_BALANCE_FORK is set, because it gives a more up
to date view of how busy the cpu is.

Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
---
 kernel/sched/fair.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c6dd8bab010c..408ee90c7ea8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3007,10 +3007,20 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static inline void
 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	unsigned long load_avg = 0;
+
 	update_load_avg(se, 1);
 
-	cfs_rq->runnable_load_avg =
-		max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
+	/*
+	 * If we're about to dequeue the last runnable entity we can
+	 * reset the runnable load average to zero instead of waiting
+	 * for it to decay naturally. This gives the load balancer a
+	 * more timely and accurate view of how busy this cpu is.
+	 */
+	if (cfs_rq->nr_running > 1)
+		load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
+
+	cfs_rq->runnable_load_avg = load_avg;
 	cfs_rq->runnable_load_sum =
 		max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
 }
-- 
2.7.3

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing
  2016-07-04 15:04       ` Matt Fleming
@ 2016-07-04 17:43         ` Mike Galbraith
  2016-07-06 11:45           ` Matt Fleming
  2016-07-11  8:58         ` Dietmar Eggemann
  1 sibling, 1 reply; 22+ messages in thread
From: Mike Galbraith @ 2016-07-04 17:43 UTC (permalink / raw)
  To: Matt Fleming, Dietmar Eggemann
  Cc: Peter Zijlstra, Yuyang Du, LKML, Mel Gorman

On Mon, 2016-07-04 at 16:04 +0100, Matt Fleming wrote:

> But we can optimise the special case of dequeueing the last entity and
> reset ::runnable_load_avg early, which gives a performance improvement
> to workloads that trigger the load balancer, such as fork-heavy
> applications when SD_BALANCE_FORK is set, because it gives a more up
> to date view of how busy the cpu is.

Begs the question: what's so special about this case vs any other
dequeue/enqueue?

I've given up on this as being a waste of time.  Either you serialize
everything box wide (not!) and can then make truly accurate evaluations
of state, or you're making an educated guess based upon what once was.

The only place I've seen where using the average consistently has
issues is with a longish period periodic load (schbench).

	-Mike

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing
  2016-07-04 17:43         ` Mike Galbraith
@ 2016-07-06 11:45           ` Matt Fleming
  2016-07-06 12:21             ` Mike Galbraith
  0 siblings, 1 reply; 22+ messages in thread
From: Matt Fleming @ 2016-07-06 11:45 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Dietmar Eggemann, Peter Zijlstra, Yuyang Du, LKML, Mel Gorman

On Mon, 04 Jul, at 07:43:14PM, Mike Galbraith wrote:
> On Mon, 2016-07-04 at 16:04 +0100, Matt Fleming wrote:
> 
> > But we can optimise the special case of dequeueing the last entity and
> > reset ::runnable_load_avg early, which gives a performance improvement
> > to workloads that trigger the load balancer, such as fork-heavy
> > applications when SD_BALANCE_FORK is set, because it gives a more up
> > to date view of how busy the cpu is.
> 
> Begs the question: what's so special about this case vs any other
> dequeue/enqueue?
 
All that makes this special is that this is the behaviour seen when
running hackbench - initial heavy forking by some master task which
eventually wakes everyone up. So you get this huge sequence of "fork,
enqueue, run, dequeue". Yes, it's a complete hack.

> I've given up on this as being a waste of time.  Either you serialize
> everything box wide (not!) and can then make truly accurate evaluations
> of state, or you're making an educated guess based upon what once was.
> 
> The only place I've seen where using the average consistently has
> issues is with a longish period periodic load (schbench).

I'm open to any suggestion that restores performance to that seen
before commit 0905f04eb21f, whether or not that involves changing how
load averages are used.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing
  2016-07-06 11:45           ` Matt Fleming
@ 2016-07-06 12:21             ` Mike Galbraith
  0 siblings, 0 replies; 22+ messages in thread
From: Mike Galbraith @ 2016-07-06 12:21 UTC (permalink / raw)
  To: Matt Fleming
  Cc: Dietmar Eggemann, Peter Zijlstra, Yuyang Du, LKML, Mel Gorman

On Wed, 2016-07-06 at 12:45 +0100, Matt Fleming wrote:
> On Mon, 04 Jul, at 07:43:14PM, Mike Galbraith wrote:
> > On Mon, 2016-07-04 at 16:04 +0100, Matt Fleming wrote:
> > 
> > > But we can optimise the special case of dequeueing the last entity and
> > > reset ::runnable_load_avg early, which gives a performance improvement
> > > to workloads that trigger the load balancer, such as fork-heavy
> > > applications when SD_BALANCE_FORK is set, because it gives a more up
> > > to date view of how busy the cpu is.
> > 
> > Begs the question: what's so special about this case vs any other
> > dequeue/enqueue?
>  
> All that makes this special is that this is the behaviour seen when
> running hackbench - initial heavy forking by some master task which
> eventually wakes everyone up. So you get this huge sequence of "fork,
> enqueue, run, dequeue". Yes, it's a complete hack.

I'm a bit concerned that poking holes in the logic to make hackbench a
bit happier will eradicate the calming effect that avg/aging business
has on load balancing, inflicting harm on real world loads.  That would
be a bad trade.

> > I've given up on this as being a waste of time.  Either you serialize
> > everything box wide (not!) and can then make truly accurate evaluations
> > of state, or you're making an educated guess based upon what once was.
> > 
> > The only place I've seen where using the average consistently has
> > issues is with a longish period periodic load (schbench).
> 
> I'm open to any suggestion that restores performance to that seen
> before commit 0905f04eb21f, whether or not that involves changing how
> load averages are used.

None here.  That hackbench was fond of that dead bug is just too bad,
as Peter seldom resurrects bugs once swatted :)  FWIW, I took a peek at
distribution on my little desktop box while fiddling, and while it was
not a pretty flat line, it wasn't a stock market crash graph either.

	-Mike

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing
  2016-07-04 15:04       ` Matt Fleming
  2016-07-04 17:43         ` Mike Galbraith
@ 2016-07-11  8:58         ` Dietmar Eggemann
  2016-07-12 11:14           ` Matt Fleming
  1 sibling, 1 reply; 22+ messages in thread
From: Dietmar Eggemann @ 2016-07-11  8:58 UTC (permalink / raw)
  To: Matt Fleming; +Cc: Mike Galbraith, Peter Zijlstra, Yuyang Du, LKML, Mel Gorman

On 04/07/16 16:04, Matt Fleming wrote:
> On Wed, 15 Jun, at 04:32:58PM, Dietmar Eggemann wrote:
>> On 14/06/16 17:40, Mike Galbraith wrote:
>>> On Tue, 2016-06-14 at 15:14 +0100, Dietmar Eggemann wrote:
>>>
>>>> IMHO, the hackbench performance "boost" w/o 0905f04eb21f is due to the
>>>> fact that a new task gets all its load decayed (making it a small task)
>>>> in the __update_load_avg() call in remove_entity_load_avg() because its
>>>> se->avg.last_update_time value is 0 which creates a huge time difference
>>>> comparing it to cfs_rq->avg.last_update_time. The patch 0905f04eb21f
>>>> avoids this and thus the task stays big se->avg.load_avg = 1024.
>>>
>>> I don't care much at all about the hackbench "regression" in its own
>>> right, and what causes it, for me, bottom line is that there are cases
>>> where we need to be able to resolve, and can't, simply because we're
>>> looking at a fuzzy (rippling) reflection.
>>
>> Understood. I just thought it would be nice to know why 0905f04eb21f
>> makes this problem even more visible. But so far I wasn't able to figure
>> out why this diff in se->avg.load_avg [1024 versus 0] has this effect on
>> cfs_rq->runnable_load_avg making it even less suitable in find idlest*.
>> enqueue_entity_load_avg()'s cfs_rq->runnable_load_* += sa->load_* looks
>> suspicious though.
> 
> In my testing without 0905f04eb21f I saw that se->avg.load_avg
> actually managed to skip being decayed at all before the task was
> dequeued, which meant that cfs_rq->runnable_load_avg was more likely
> to be zero after dequeue, for those workloads like hackbench that
> essentially are just a fork bomb.

Do you mean the first dequeue when the task is forked?

These are the pelt related functions which are called when the task is
forked:

detach_entity_load_avg
attach_entity_load_avg
remove_entity_load_avg <-- se->avg.load_avg is set to 0 w/o 0905f04eb21f
                           se->avg.load_avg stays 1024 w/ 0905f04eb21f
enqueue_entity_load_avg
attach_entity_load_avg (double attach is fixed on tip/sched/core)
dequeue_entity_load_avg

> se->avg.load_avg evaded decay because se->avg.period_contrib was being
> zero'd in __update_load_avg().

I don't see the relation to se->avg.period_contrib here. IMHO,
se->avg.period_contrib is purely there to manage the 3 different update
phases in __update_load_avg().
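
For reference, a rough sketch of that bookkeeping (illustration only;
sketch_period_split() is a made-up name, and the actual decay of
load_sum/util_sum by y^periods is elided):

/*
 * Simplified model of the role sa->period_contrib plays in
 * __update_load_avg(): split the elapsed time into the three phases.
 */
static u64 sketch_period_split(u64 delta_us, u32 *period_contrib)
{
	u64 periods;

	/* 1) fold the new time on top of the partially elapsed 1024us
	 *    period remembered from the previous update */
	delta_us += *period_contrib;

	/* 2) number of 1024us period boundaries crossed; the real code
	 *    decays the accumulated sums by y^periods here */
	periods = delta_us / 1024;

	/* 3) the leftover starts the next partial period and is stored
	 *    for the following update */
	*period_contrib = delta_us % 1024;

	return periods;
}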

This difference in the initial se->avg.load_avg value [0 or 1024] has an
influence in wake_affine() [weight = p->se.avg.load_avg;] for the wakeup
handling of the hackbench tasks in the 'send/receive data' phase.

There are a couple of patches on tip/sched/core which might change the
behaviour of this: fork path, no double attach_entity_load_avg for new
task, no remove_entity_load_avg for new task, changes in effective_load ...

> With 0905f04eb21f applied, it's less likely (though not impossible)
> that ->period_contrib will be zero'd and so we usually end up with
> some residual load in cfs_rq->runnable_load_avg on dequeue, and hence,
> 
> 	cfs_rq->runnable_load_avg > se->avg.load_avg
> 
> even if 'se' is the only task on the runqueue.
> 
> FYI, below is my quick and dirty hack that restored hackbench
> performance for the few machines I checked. I didn't try schbench with
> it.
> 
> ---
> 
> From 4e9856ea3dc56e356195ca035dab7302754ce59b Mon Sep 17 00:00:00 2001
> From: Matt Fleming <matt@codeblueprint.co.uk>
> Date: Thu, 9 Jun 2016 19:48:14 +0100
> Subject: [PATCH] sched/fair: Reset ::runnable_load_avg when dequeueing last
>  entity
> 
> The task and runqueue load averages maintained in p->se.avg.load_avg
> and cfs_rq->runnable_load_avg respectively, can decay at different
> wall clock rates, which means that enqueueing and then dequeueing a
> task on an otherwise empty runqueue doesn't always leave
> ::runnable_load_avg with its initial value.
> 
> This can lead to the situation where cfs_rq->runnable_load_avg has a
> non-zero value even though there are no runnable entities on the
> runqueue. Assuming no entity is enqueued on this runqueue for some
> time this residual load average will decay gradually as the load
> averages are updated.
> 
> But we can optimise the special case of dequeueing the last entity and
> reset ::runnable_load_avg early, which gives a performance improvement
> to workloads that trigger the load balancer, such as fork-heavy
> applications when SD_BALANCE_FORK is set, because it gives a more up
> to date view of how busy the cpu is.
> 
> Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
> ---
>  kernel/sched/fair.c | 14 ++++++++++++--
>  1 file changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index c6dd8bab010c..408ee90c7ea8 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3007,10 +3007,20 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  static inline void
>  dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
> +	unsigned long load_avg = 0;
> +
>  	update_load_avg(se, 1);
>  
> -	cfs_rq->runnable_load_avg =
> -		max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
> +	/*
> +	 * If we're about to dequeue the last runnable entity we can
> +	 * reset the runnable load average to zero instead of waiting
> +	 * for it to decay naturally. This gives the load balancer a
> +	 * more timely and accurate view of how busy this cpu is.
> +	 */
> +	if (cfs_rq->nr_running > 1)
> +		load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
> +
> +	cfs_rq->runnable_load_avg = load_avg;
>  	cfs_rq->runnable_load_sum =
>  		max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
>  }
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing
  2016-07-11  8:58         ` Dietmar Eggemann
@ 2016-07-12 11:14           ` Matt Fleming
  0 siblings, 0 replies; 22+ messages in thread
From: Matt Fleming @ 2016-07-12 11:14 UTC (permalink / raw)
  To: Dietmar Eggemann
  Cc: Mike Galbraith, Peter Zijlstra, Yuyang Du, LKML, Mel Gorman

On Mon, 11 Jul, at 09:58:52AM, Dietmar Eggemann wrote:
> This difference in the initial se->avg.load_avg value [0 or 1024] has an
> influence in wake_affine() [weight = p->se.avg.load_avg;] for the wakeup
> handling of the hackbench tasks in the 'send/receive data' phase.
 
The way I was running hackbench made it very susceptible to changes in
fork behaviour, i.e. running it with a small number of loops.

> There are a couple of patches on tip/sched/core which might change the
> behaviour of this: fork path, no double attach_entity_load_avg for new
> task, no remove_entity_load_avg for new task, changes in effective_load ...
 
Indeed they do! Things are much improved when running the latest
tip/sched/core, thanks for the pointer.

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2016-07-12 11:14 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
2016-06-14  7:58 [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing Mike Galbraith
2016-06-14 14:14 ` Dietmar Eggemann
2016-06-14 16:40   ` Mike Galbraith
2016-06-15 15:32     ` Dietmar Eggemann
2016-06-15 16:03       ` Mike Galbraith
2016-06-15 19:03         ` Dietmar Eggemann
2016-06-16  3:33           ` Mike Galbraith
2016-06-16  9:01             ` Dietmar Eggemann
2016-07-04 15:04       ` Matt Fleming
2016-07-04 17:43         ` Mike Galbraith
2016-07-06 11:45           ` Matt Fleming
2016-07-06 12:21             ` Mike Galbraith
2016-07-11  8:58         ` Dietmar Eggemann
2016-07-12 11:14           ` Matt Fleming
2016-06-14 22:42 ` Yuyang Du
2016-06-15  7:01   ` Mike Galbraith
2016-06-16 11:46     ` [patch] sched/fair: Use instantaneous load in wakeup paths Mike Galbraith
2016-06-16 12:04       ` Mike Galbraith
2016-06-16 12:41         ` Mike Galbraith
2016-06-17  6:21           ` Mike Galbraith
2016-06-17 10:55             ` Dietmar Eggemann
2016-06-17 13:57               ` Mike Galbraith
