* [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing @ 2016-06-14 7:58 Mike Galbraith 2016-06-14 14:14 ` Dietmar Eggemann 2016-06-14 22:42 ` Yuyang Du 0 siblings, 2 replies; 22+ messages in thread From: Mike Galbraith @ 2016-06-14 7:58 UTC (permalink / raw) To: Peter Zijlstra; +Cc: Yuyang Du, LKML SUSE's regression testing noticed that... 0905f04eb21f sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task() ...introduced a hackbench regression, and indeed it does. I think this regression has more to do with randomness than anything else, but in general... While averaging calms down load balancing, helping to keep migrations down to a dull roar, it's not completely wonderful when it comes to things that live in the here and now, hackbench being one such. time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done' real 0m55.397s user 0m8.320s sys 5m40.789s echo LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features real 0m48.049s user 0m6.510s sys 5m6.291s Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com> --- kernel/sched/fair.c | 54 ++++++++++++++++++++++++------------------------ kernel/sched/features.h | 1 kernel/sched/sched.h | 6 +++++ 3 files changed, 35 insertions(+), 26 deletions(-) --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -738,7 +738,7 @@ void post_init_entity_util_avg(struct sc } } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq, int avg); static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) @@ -1229,9 +1229,9 @@ bool should_numa_migrate_memory(struct t group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long weighted_cpuload(const int cpu); -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); +static unsigned long 
weighted_cpuload(const int cpu, int avg); +static unsigned long source_load(int cpu, int type, int avg); +static unsigned long target_load(int cpu, int type, int avg); static unsigned long capacity_of(int cpu); static long effective_load(struct task_group *tg, int cpu, long wl, long wg); @@ -1261,7 +1261,7 @@ static void update_numa_stats(struct num struct rq *rq = cpu_rq(cpu); ns->nr_running += rq->nr_running; - ns->load += weighted_cpuload(cpu); + ns->load += weighted_cpuload(cpu, LOAD_AVERAGE); ns->compute_capacity += capacity_of(cpu); cpus++; @@ -3102,8 +3102,10 @@ void remove_entity_load_avg(struct sched atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq, int avg) { + if (sched_feat(LB_INSTANTANEOUS_LOAD) && avg == LOAD_INSTANT) + return cfs_rq->load.weight; return cfs_rq->runnable_load_avg; } @@ -4701,9 +4703,9 @@ static void cpu_load_update(struct rq *t } /* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) +static unsigned long weighted_cpuload(const int cpu, int avg) { - return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs); + return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs, avg); } #ifdef CONFIG_NO_HZ_COMMON @@ -4748,7 +4750,7 @@ static void cpu_load_update_idle(struct /* * bail if there's load or we're actually up-to-date. */ - if (weighted_cpuload(cpu_of(this_rq))) + if (weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE)) return; cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); @@ -4769,7 +4771,7 @@ void cpu_load_update_nohz_start(void) * concurrently we'll exit nohz. And cpu_load write can race with * cpu_load_update_idle() but both updater would be writing the same. 
*/ - this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq)); + this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE); } /* @@ -4784,7 +4786,7 @@ void cpu_load_update_nohz_stop(void) if (curr_jiffies == this_rq->last_load_update_tick) return; - load = weighted_cpuload(cpu_of(this_rq)); + load = weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE); raw_spin_lock(&this_rq->lock); update_rq_clock(this_rq); cpu_load_update_nohz(this_rq, curr_jiffies, load); @@ -4810,7 +4812,7 @@ static void cpu_load_update_periodic(str */ void cpu_load_update_active(struct rq *this_rq) { - unsigned long load = weighted_cpuload(cpu_of(this_rq)); + unsigned long load = weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE); if (tick_nohz_tick_stopped()) cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); @@ -4825,10 +4827,10 @@ void cpu_load_update_active(struct rq *t * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static unsigned long source_load(int cpu, int type) +static unsigned long source_load(int cpu, int type, int avg) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); + unsigned long total = weighted_cpuload(cpu, avg); if (type == 0 || !sched_feat(LB_BIAS)) return total; @@ -4840,10 +4842,10 @@ static unsigned long source_load(int cpu * Return a high guess at the load of a migration-target cpu weighted * according to the scheduling class and "nice" value. 
*/ -static unsigned long target_load(int cpu, int type) +static unsigned long target_load(int cpu, int type, int avg) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); + unsigned long total = weighted_cpuload(cpu, avg); if (type == 0 || !sched_feat(LB_BIAS)) return total; @@ -4865,7 +4867,7 @@ static unsigned long cpu_avg_load_per_ta { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = weighted_cpuload(cpu); + unsigned long load_avg = weighted_cpuload(cpu, LOAD_AVERAGE); if (nr_running) return load_avg / nr_running; @@ -5047,8 +5049,8 @@ static int wake_affine(struct sched_doma idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); + load = source_load(prev_cpu, idx, LOAD_AVERAGE); + this_load = target_load(this_cpu, idx, LOAD_AVERAGE); /* * If sync wakeup then subtract the (maximum possible) @@ -5136,9 +5138,9 @@ find_idlest_group(struct sched_domain *s for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) - load = source_load(i, load_idx); + load = source_load(i, load_idx, LOAD_INSTANT); else - load = target_load(i, load_idx); + load = target_load(i, load_idx, LOAD_INSTANT); avg_load += load; } @@ -5197,7 +5199,7 @@ find_idlest_cpu(struct sched_group *grou shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { - load = weighted_cpuload(i); + load = weighted_cpuload(i, LOAD_INSTANT); if (load < min_load || (load == min_load && i == this_cpu)) { min_load = load; least_loaded_cpu = i; @@ -6982,9 +6984,9 @@ static inline void update_sg_lb_stats(st /* Bias balancing toward cpus of our domain */ if (local_group) - load = target_load(i, load_idx); + load = target_load(i, load_idx, LOAD_AVERAGE); else - load = source_load(i, load_idx); + load = source_load(i, load_idx, LOAD_AVERAGE); sgs->group_load += load; 
sgs->group_util += cpu_util(i); @@ -6998,7 +7000,7 @@ static inline void update_sg_lb_stats(st sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; #endif - sgs->sum_weighted_load += weighted_cpuload(i); + sgs->sum_weighted_load += weighted_cpuload(i, LOAD_AVERAGE); /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -7510,7 +7512,7 @@ static struct rq *find_busiest_queue(str capacity = capacity_of(i); - wl = weighted_cpuload(i); + wl = weighted_cpuload(i, LOAD_AVERAGE); /* * When comparing with imbalance, use weighted_cpuload() --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -39,6 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) +SCHED_FEAT(LB_INSTANTANEOUS_LOAD, false) /* * Decrement CPU capacity based on time not spent running tasks --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1630,6 +1630,12 @@ static inline void double_rq_unlock(stru __release(rq2->lock); } +/* + * Tell load balancing functions whether we want instant or average load + */ +#define LOAD_INSTANT 0 +#define LOAD_AVERAGE 1 + #else /* CONFIG_SMP */ /* ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing 2016-06-14 7:58 [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing Mike Galbraith @ 2016-06-14 14:14 ` Dietmar Eggemann 2016-06-14 16:40 ` Mike Galbraith 2016-06-14 22:42 ` Yuyang Du 1 sibling, 1 reply; 22+ messages in thread From: Dietmar Eggemann @ 2016-06-14 14:14 UTC (permalink / raw) To: Mike Galbraith, Peter Zijlstra; +Cc: Yuyang Du, LKML On 14/06/16 08:58, Mike Galbraith wrote: > SUSE's regression testing noticed that... > > 0905f04eb21f sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task() > > ...introduced a hackbench regression, and indeed it does. I think this > regression has more to do with randomness than anything else, but in > general... > > While averaging calms down load balancing, helping to keep migrations > down to a dull roar, it's not completely wonderful when it comes to > things that live in the here and now, hackbench being one such. > > time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done' > > real 0m55.397s > user 0m8.320s > sys 5m40.789s > > echo LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features > > real 0m48.049s > user 0m6.510s > sys 5m6.291s > > Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com> I see similar values on ARM64 (Juno r0: 2xCortex-A57 4xCortex-A53). OK, 1000 invocations of hackbench take a little bit longer but I guess it's the fork's we're after. 
- echo NO_LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done' root@juno:~# time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done' real 10m17.155s user 2m56.976s sys 38m0.324s - echo LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done' real 9m49.832s user 2m42.896s sys 34m51.452s - But I get a similar effect in case I initialize se->avg.load_avg w/ 0: --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -680,7 +680,7 @@ void init_entity_runnable_average(struct sched_entity *se) * will definitely be update (after enqueue). */ sa->period_contrib = 1023; - sa->load_avg = scale_load_down(se->load.weight); + sa->load_avg = scale_load_down(0); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; root@juno:~# time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done' real 9m55.396s user 2m41.192s sys 35m6.196s IMHO, the hackbench performance "boost" w/o 0905f04eb21f is due to the fact that a new task gets all it's load decayed (making it a small task) in the __update_load_avg() call in remove_entity_load_avg() because its se->avg.last_update_time value is 0 which creates a huge time difference comparing it to cfs_rq->avg.last_update_time. The patch 0905f04eb21f avoids this and thus the task stays big se->avg.load_avg = 1024. It can't be a difference in the value of cfs_rq->removed_load_avg because w/o the patch 0905f04eb21f, we atomic_long_add 0 and with the patch we bail before the atomic_long_add(). [...] ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing 2016-06-14 14:14 ` Dietmar Eggemann @ 2016-06-14 16:40 ` Mike Galbraith 2016-06-15 15:32 ` Dietmar Eggemann 0 siblings, 1 reply; 22+ messages in thread From: Mike Galbraith @ 2016-06-14 16:40 UTC (permalink / raw) To: Dietmar Eggemann, Peter Zijlstra; +Cc: Yuyang Du, LKML On Tue, 2016-06-14 at 15:14 +0100, Dietmar Eggemann wrote: > IMHO, the hackbench performance "boost" w/o 0905f04eb21f is due to the > fact that a new task gets all its load decayed (making it a small task) > in the __update_load_avg() call in remove_entity_load_avg() because its > se->avg.last_update_time value is 0 which creates a huge time difference > comparing it to cfs_rq->avg.last_update_time. The patch 0905f04eb21f > avoids this and thus the task stays big se->avg.load_avg = 1024. I don't care much at all about the hackbench "regression" in its own right, and what causes it, for me, bottom line is that there are cases where we need to be able to resolve, and can't, simply because we're looking at a fuzzy (rippling) reflection. In general, the fuzz helps us to not be so spastic. I'm not sure that we really really need to care all that much, because I strongly suspect that it's only gonna make any difference at all in corner cases, but there are real world cases that matter. I know for a fact that schbench (facebook) which is at least based on a real world load fails early due to us stacking tasks due to that fuzzy view of reality. In that case, it's because the fuzz consists of a high amplitude aging sawtooth.. find idlest* sees a collection of pseudo-random numbers, effectively, the fates pick idlest via lottery, get it wrong often enough that a big box _never_ reaches full utilization before we stack tasks, putting an end to the latency game. For generic loads, the smoothing works, but for some corners, it blows chunks. 
Fork/exec seemed like a spot where you really can't go wrong by looking at clear unadulterated reality. -Mike ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing 2016-06-14 16:40 ` Mike Galbraith @ 2016-06-15 15:32 ` Dietmar Eggemann 2016-06-15 16:03 ` Mike Galbraith 2016-07-04 15:04 ` Matt Fleming 0 siblings, 2 replies; 22+ messages in thread From: Dietmar Eggemann @ 2016-06-15 15:32 UTC (permalink / raw) To: Mike Galbraith, Peter Zijlstra; +Cc: Yuyang Du, LKML On 14/06/16 17:40, Mike Galbraith wrote: > On Tue, 2016-06-14 at 15:14 +0100, Dietmar Eggemann wrote: > >> IMHO, the hackbench performance "boost" w/o 0905f04eb21f is due to the >> fact that a new task gets all it's load decayed (making it a small task) >> in the __update_load_avg() call in remove_entity_load_avg() because its >> se->avg.last_update_time value is 0 which creates a huge time difference >> comparing it to cfs_rq->avg.last_update_time. The patch 0905f04eb21f >> avoids this and thus the task stays big se->avg.load_avg = 1024. > > I don't care much at all about the hackbench "regression" in its own > right, and what causes it, for me, bottom line is that there are cases > where we need to be able to resolve, and can't, simply because we're > looking at a fuzzy (rippling) reflection. Understood. I just thought it would be nice to know why 0905f04eb21f makes this problem even more visible. But so far I wasn't able to figure out why this diff in se->avg.load_avg [1024 versus 0] has this effect on cfs_rq->runnable_load_avg making it even less suitable in find idlest*. enqueue_entity_load_avg()'s cfs_rq->runnable_load_* += sa->load_* looks suspicious though. > > In general, the fuzz helps us to not be so spastic. I'm not sure that > we really really need to care all that much, because I strongly suspect > that it's only gonna make any difference at all in corner cases, but > there are real world cases that matter. I know for fact that schbench > (facebook) which is at least based on a real world load fails early due > to us stacking tasks due to that fuzzy view of reality. 
In that case, > it's because the fuzz consists of a high amplitude aging sawtooth.. ... only for fork/exec? Which then would be related to the initial value of se->avg.load_avg. Otherwise we could go back to pre b92486cbf2aa "sched: Compute runnable load avg in cpu_load and cpu_avg_load_per_task". > find idlest* sees a collection of pseudo-random numbers, effectively, > the fates pick idlest via lottery, get it wrong often enough that a big > box _never_ reaches full utilization before we stack tasks, putting an > end to the latency game. For generic loads, the smoothing works, but > for some corners, it blows chunks. Fork/exec seemed like a spot where > you really can't go wrong by looking at clear unadulterated reality. > > -Mike > ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing 2016-06-15 15:32 ` Dietmar Eggemann @ 2016-06-15 16:03 ` Mike Galbraith 2016-06-15 19:03 ` Dietmar Eggemann 2016-07-04 15:04 ` Matt Fleming 1 sibling, 1 reply; 22+ messages in thread From: Mike Galbraith @ 2016-06-15 16:03 UTC (permalink / raw) To: Dietmar Eggemann, Peter Zijlstra; +Cc: Yuyang Du, LKML On Wed, 2016-06-15 at 16:32 +0100, Dietmar Eggemann wrote: > > In general, the fuzz helps us to not be so spastic. I'm not sure that > > we really really need to care all that much, because I strongly suspect > > that it's only gonna make any difference at all in corner cases, but > > there are real world cases that matter. I know for fact that schbench > > (facebook) which is at least based on a real world load fails early due > > to us stacking tasks due to that fuzzy view of reality. In that case, > > it's because the fuzz consists of a high amplitude aging sawtooth.. > > ... only for fork/exec? No. Identical workers had longish work/sleep cycle, aging resulted in weights that ranged from roughly 300-700(ish), depending on when you peeked at them. -Mike ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing 2016-06-15 16:03 ` Mike Galbraith @ 2016-06-15 19:03 ` Dietmar Eggemann 2016-06-16 3:33 ` Mike Galbraith 0 siblings, 1 reply; 22+ messages in thread From: Dietmar Eggemann @ 2016-06-15 19:03 UTC (permalink / raw) To: Mike Galbraith, Peter Zijlstra; +Cc: Yuyang Du, LKML On 15/06/16 17:03, Mike Galbraith wrote: > On Wed, 2016-06-15 at 16:32 +0100, Dietmar Eggemann wrote: > >>> In general, the fuzz helps us to not be so spastic. I'm not sure that >>> we really really need to care all that much, because I strongly suspect >>> that it's only gonna make any difference at all in corner cases, but >>> there are real world cases that matter. I know for fact that schbench >>> (facebook) which is at least based on a real world load fails early due >>> to us stacking tasks due to that fuzzy view of reality. In that case, >>> it's because the fuzz consists of a high amplitude aging sawtooth.. >> >> ... only for fork/exec? > > No. Identical workers had longish work/sleep cycle, aging resulted in > weights that ranged from roughly 300-700(ish), depending on when you > peeked at them. > > -Mike > Isn't there a theoretical problem with the scale_load() on CONFIG_64BIT machines on tip/sched/core? load.weight has a higher resolution than runnable_load_avg (and so the values in the rq->cpu_load[] array). Theoretically because [forkexec|wake]_idx is 0 so [target|source]_load() is nothing else than weighted_cpuload(). ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing 2016-06-15 19:03 ` Dietmar Eggemann @ 2016-06-16 3:33 ` Mike Galbraith 2016-06-16 9:01 ` Dietmar Eggemann 0 siblings, 1 reply; 22+ messages in thread From: Mike Galbraith @ 2016-06-16 3:33 UTC (permalink / raw) To: Dietmar Eggemann, Peter Zijlstra; +Cc: Yuyang Du, LKML On Wed, 2016-06-15 at 20:03 +0100, Dietmar Eggemann wrote: > Isn't there a theoretical problem with the scale_load() on CONFIG_64BIT > machines on tip/sched/core? load.weight has a higher resolution than > runnable_load_avg (and so the values in the rq->cpu_load[] array). > Theoretically because [forkexec|wake]_idx is 0 so [target|source]_load() > is nothing else than weighted_cpuload(). I see a not so theoretical problem with my rfc in that I forgot to scale_load_down() if that's what you mean. (changes nothing, reality was just extra special unadulterated;) -Mike ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing 2016-06-16 3:33 ` Mike Galbraith @ 2016-06-16 9:01 ` Dietmar Eggemann 0 siblings, 0 replies; 22+ messages in thread From: Dietmar Eggemann @ 2016-06-16 9:01 UTC (permalink / raw) To: Mike Galbraith, Peter Zijlstra; +Cc: Yuyang Du, LKML On 16/06/16 04:33, Mike Galbraith wrote: > On Wed, 2016-06-15 at 20:03 +0100, Dietmar Eggemann wrote: > >> Isn't there a theoretical problem with the scale_load() on CONFIG_64BIT >> machines on tip/sched/core? load.weight has a higher resolution than >> runnable_load_avg (and so the values in the rq->cpu_load[] array). >> Theoretically because [forkexec|wake]_idx is 0 so [target|source]_load() >> is nothing else than weighted_cpuload(). > > I see a not so theoretical problem with my rfc in that I forgot to > scale_load_down() if that's what you mean. Yup. Theoretical in the sense that this_load and min_load will be affected both the same way as long as load_idx = 0. > > (changes nothing, reality was just extra special unadulterated;) Agreed. > > -Mike > ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing 2016-06-15 15:32 ` Dietmar Eggemann 2016-06-15 16:03 ` Mike Galbraith @ 2016-07-04 15:04 ` Matt Fleming 2016-07-04 17:43 ` Mike Galbraith 2016-07-11 8:58 ` Dietmar Eggemann 1 sibling, 2 replies; 22+ messages in thread From: Matt Fleming @ 2016-07-04 15:04 UTC (permalink / raw) To: Dietmar Eggemann Cc: Mike Galbraith, Peter Zijlstra, Yuyang Du, LKML, Mel Gorman On Wed, 15 Jun, at 04:32:58PM, Dietmar Eggemann wrote: > On 14/06/16 17:40, Mike Galbraith wrote: > > On Tue, 2016-06-14 at 15:14 +0100, Dietmar Eggemann wrote: > > > >> IMHO, the hackbench performance "boost" w/o 0905f04eb21f is due to the > >> fact that a new task gets all it's load decayed (making it a small task) > >> in the __update_load_avg() call in remove_entity_load_avg() because its > >> se->avg.last_update_time value is 0 which creates a huge time difference > >> comparing it to cfs_rq->avg.last_update_time. The patch 0905f04eb21f > >> avoids this and thus the task stays big se->avg.load_avg = 1024. > > > > I don't care much at all about the hackbench "regression" in its own > > right, and what causes it, for me, bottom line is that there are cases > > where we need to be able to resolve, and can't, simply because we're > > looking at a fuzzy (rippling) reflection. > > Understood. I just thought it would be nice to know why 0905f04eb21f > makes this problem even more visible. But so far I wasn't able to figure > out why this diff in se->avg.load_avg [1024 versus 0] has this effect on > cfs_rq->runnable_load_avg making it even less suitable in find idlest*. > enqueue_entity_load_avg()'s cfs_rq->runnable_load_* += sa->load_* looks > suspicious though. 
In my testing without 0905f04eb21f I saw that se->avg.load_avg actually managed to skip being decayed at all before the task was dequeued, which meant that cfs_rq->runnable_load_avg was more likely to be zero after dequeue, for those workloads like hackbench that essentially are just a fork bomb. se->avg.load_avg evaded decay because se->avg.period_contrib was being zero'd in __update_load_avg(). With 0905f04eb21f applied, it's less likely (though not impossible) that ->period_contrib will be zero'd and so we usually end up with some residual load in cfs_rq->runnable_load_avg on dequeue, and hence, cfs_rq->runnable_load_avg > se->avg.load_avg even if 'se' is the only task on the runqueue. FYI, below is my quick and dirty hack that restored hackbench performance for the few machines I checked. I didn't try schbench with it. --- >From 4e9856ea3dc56e356195ca035dab7302754ce59b Mon Sep 17 00:00:00 2001 From: Matt Fleming <matt@codeblueprint.co.uk> Date: Thu, 9 Jun 2016 19:48:14 +0100 Subject: [PATCH] sched/fair: Reset ::runnable_load_avg when dequeueing last entity The task and runqueue load averages maintained in p->se.avg.load_avg and cfs_rq->runnable_load_avg respectively, can decay at different wall clock rates, which means that enqueueing and then dequeueing a task on an otherwise empty runqueue doesn't always leave ::runnable_load_avg with its initial value. This can lead to the situation where cfs_rq->runnable_load_avg has a non-zero value even though there are no runnable entities on the runqueue. Assuming no entity is enqueued on this runqueue for some time this residual load average will decay gradually as the load averages are updated. But we can optimise the special case of dequeueing the last entity and reset ::runnable_load_avg early, which gives a performance improvement to workloads that trigger the load balancer, such as fork-heavy applications when SD_BALANCE_FORK is set, because it gives a more up to date view of how busy the cpu is. 
Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk> --- kernel/sched/fair.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c6dd8bab010c..408ee90c7ea8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3007,10 +3007,20 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { + unsigned long load_avg = 0; + update_load_avg(se, 1); - cfs_rq->runnable_load_avg = - max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); + /* + * If we're about to dequeue the last runnable entity we can + * reset the runnable load average to zero instead of waiting + * for it to decay naturally. This gives the load balancer a + * more timely and accurate view of how busy this cpu is. + */ + if (cfs_rq->nr_running > 1) + load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); + + cfs_rq->runnable_load_avg = load_avg; cfs_rq->runnable_load_sum = max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } -- 2.7.3 ^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing 2016-07-04 15:04 ` Matt Fleming @ 2016-07-04 17:43 ` Mike Galbraith 2016-07-06 11:45 ` Matt Fleming 2016-07-11 8:58 ` Dietmar Eggemann 1 sibling, 1 reply; 22+ messages in thread From: Mike Galbraith @ 2016-07-04 17:43 UTC (permalink / raw) To: Matt Fleming, Dietmar Eggemann Cc: Peter Zijlstra, Yuyang Du, LKML, Mel Gorman On Mon, 2016-07-04 at 16:04 +0100, Matt Fleming wrote: > But we can optimise the special case of dequeueing the last entity and > reset ::runnable_load_avg early, which gives a performance improvement > to workloads that trigger the load balancer, such as fork-heavy > applications when SD_BALANCE_FORK is set, because it gives a more up > to date view of how busy the cpu is. Begs the question: what's so special about this case vs any other dequeue/enqueue? I've given up on this as being a waste of time. Either you serialize everything box wide (not!) and can then make truly accurate evaluations of state, or you're making an educated guess based upon what once was. The only place I've seen where using the average consistently has issues is with a longish period periodic load (schbench). -Mike ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing 2016-07-04 17:43 ` Mike Galbraith @ 2016-07-06 11:45 ` Matt Fleming 2016-07-06 12:21 ` Mike Galbraith 0 siblings, 1 reply; 22+ messages in thread From: Matt Fleming @ 2016-07-06 11:45 UTC (permalink / raw) To: Mike Galbraith Cc: Dietmar Eggemann, Peter Zijlstra, Yuyang Du, LKML, Mel Gorman On Mon, 04 Jul, at 07:43:14PM, Mike Galbraith wrote: > On Mon, 2016-07-04 at 16:04 +0100, Matt Fleming wrote: > > > But we can optimise the special case of dequeueing the last entity and > > reset ::runnable_load_avg early, which gives a performance improvement > > to workloads that trigger the load balancer, such as fork-heavy > > applications when SD_BALANCE_FORK is set, because it gives a more up > > to date view of how busy the cpu is. > > Begs the question: what's so special about this case vs any other > dequeue/enqueue? All that makes this special is that this is the behaviour seen when running hackbench - initial heavy forking by some master task which eventually wakes everyone up. So you get this huge sequence of "fork, enqueue, run, dequeue". Yes, it's a complete hack. > I've given up on this as being a waste of time. Either you serialize > everything box wide (not!) and can then make truly accurate evaluations > of state, or you're making an educated guess based upon what once was. > > The only place I've seen where using the average consistently has > issues is with a longish period periodic load (schbench). I'm open to any suggestion that restores performance to that seen before commit 0905f04eb21f, whether or not that involves changing how load averages are used. ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing 2016-07-06 11:45 ` Matt Fleming @ 2016-07-06 12:21 ` Mike Galbraith 0 siblings, 0 replies; 22+ messages in thread From: Mike Galbraith @ 2016-07-06 12:21 UTC (permalink / raw) To: Matt Fleming Cc: Dietmar Eggemann, Peter Zijlstra, Yuyang Du, LKML, Mel Gorman On Wed, 2016-07-06 at 12:45 +0100, Matt Fleming wrote: > On Mon, 04 Jul, at 07:43:14PM, Mike Galbraith wrote: > > On Mon, 2016-07-04 at 16:04 +0100, Matt Fleming wrote: > > > > > But we can optimise the special case of dequeueing the last entity and > > > reset ::runnable_load_avg early, which gives a performance improvement > > > to workloads that trigger the load balancer, such as fork-heavy > > > applications when SD_BALANCE_FORK is set, because it gives a more up > > > to date view of how busy the cpu is. > > > > Begs the question: what's so special about this case vs any other > > dequeue/enqueue? > > All that makes this special is that this is the behaviour seen when > running hackbench - initial heavy forking by some master task which > eventually wakes everyone up. So you get this huge sequence of "fork, > enqueue, run, dequeue". Yes, it's a complete hack. I'm a bit concerned that poking holes in the logic to make hackbench a bit happier will eradicate the calming effect that avg/aging business has on load balancing, inflicting harm on real world loads. That would be a bad trade. > > I've given up on this as being a waste of time. Either you serialize > > everything box wide (not!) and can then make truly accurate evaluations > > of state, or you're making an educated guess based upon what once was. > > > > The only place I've seen where using the average consistently has > > issues is with a longish period periodic load (schbench). > > I'm open to any suggestion that restores performance to that seen > before commit 0905f04eb21f, whether or not that involves changing how > load averages are used. None here. 
That hackbench was fond of that dead bug is just too bad, as Peter seldom resurrects bugs once swatted :) FWIW, I took a peek at distribution on my little desktop box while fiddling, and while it was not a pretty flat line, it wasn't a stock market crash graph either. -Mike ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing 2016-07-04 15:04 ` Matt Fleming 2016-07-04 17:43 ` Mike Galbraith @ 2016-07-11 8:58 ` Dietmar Eggemann 2016-07-12 11:14 ` Matt Fleming 1 sibling, 1 reply; 22+ messages in thread From: Dietmar Eggemann @ 2016-07-11 8:58 UTC (permalink / raw) To: Matt Fleming; +Cc: Mike Galbraith, Peter Zijlstra, Yuyang Du, LKML, Mel Gorman On 04/07/16 16:04, Matt Fleming wrote: > On Wed, 15 Jun, at 04:32:58PM, Dietmar Eggemann wrote: >> On 14/06/16 17:40, Mike Galbraith wrote: >>> On Tue, 2016-06-14 at 15:14 +0100, Dietmar Eggemann wrote: >>> >>>> IMHO, the hackbench performance "boost" w/o 0905f04eb21f is due to the >>>> fact that a new task gets all it's load decayed (making it a small task) >>>> in the __update_load_avg() call in remove_entity_load_avg() because its >>>> se->avg.last_update_time value is 0 which creates a huge time difference >>>> comparing it to cfs_rq->avg.last_update_time. The patch 0905f04eb21f >>>> avoids this and thus the task stays big se->avg.load_avg = 1024. >>> >>> I don't care much at all about the hackbench "regression" in its own >>> right, and what causes it, for me, bottom line is that there are cases >>> where we need to be able to resolve, and can't, simply because we're >>> looking at a fuzzy (rippling) reflection. >> >> Understood. I just thought it would be nice to know why 0905f04eb21f >> makes this problem even more visible. But so far I wasn't able to figure >> out why this diff in se->avg.load_avg [1024 versus 0] has this effect on >> cfs_rq->runnable_load_avg making it even less suitable in find idlest*. >> enqueue_entity_load_avg()'s cfs_rq->runnable_load_* += sa->load_* looks >> suspicious though. 
> > In my testing without 0905f04eb21f I saw that se->avg.load_avg > actually managed to skip being decayed at all before the task was > dequeued, which meant that cfs_rq->runnable_load_avg was more likely > to be zero after dequeue, for those workloads like hackbench that > essentially are just a fork bomb. Do you mean the first dequeue when the task is forked? These are the pelt related functions which are called when the task is forked: detach_entity_load_avg attach_entity_load_avg remove_entity_load_avg <-- se->avg.load_avg is set to 0 w/o 0905f04eb21f se->avg.load_avg stays 1024 w/ 0905f04eb21f enqueue_entity_load_avg attach_entity_load_avg (double attach is fixed on tip/sched/core) dequeue_entity_load_avg > se->avg.load_avg evaded decay because se->avg.period_contrib was being > zero'd in __update_load_avg(). I don't see the relation to se->avg.period_contrib here. IMHO, se->avg.period_contrib is purely there to manage the 3 different update phases in __update_load_avg(). This difference in the initial se->avg.load_avg value [0 or 1024] has an influence in wake_affine() [weight = p->se.avg.load_avg;] for the wakeup handling of the hackbench tasks in the 'send/receive data' phase. There are a couple of patches on tip/sched/core which might change the behaviour of this: fork path, no double attach_entity_load_avg for new task, no remove_entity_load_avg for new task, changes in effective_load ... > With 0905f04eb21f applied, it's less likely (though not impossible) > that ->period_contrib will be zero'd and so we usually end up with > some residual load in cfs_rq->runnable_load_avg on dequeue, and hence, > > cfs_rq->runnable_load_avg > se->avg.load_avg > > even if 'se' is the only task on the runqueue. > > FYI, below is my quick and dirty hack that restored hackbench > performance for the few machines I checked. I didn't try schbench with > it. 
> > --- > > From 4e9856ea3dc56e356195ca035dab7302754ce59b Mon Sep 17 00:00:00 2001 > From: Matt Fleming <matt@codeblueprint.co.uk> > Date: Thu, 9 Jun 2016 19:48:14 +0100 > Subject: [PATCH] sched/fair: Reset ::runnable_load_avg when dequeueing last > entity > > The task and runqueue load averages maintained in p->se.avg.load_avg > and cfs_rq->runnable_load_avg respectively, can decay at different > wall clock rates, which means that enqueueing and then dequeueing a > task on an otherwise empty runqueue doesn't always leave > ::runnable_load_avg with its initial value. > > This can lead to the situation where cfs_rq->runnable_load_avg has a > non-zero value even though there are no runnable entities on the > runqueue. Assuming no entity is enqueued on this runqueue for some > time this residual load average will decay gradually as the load > averages are updated. > > But we can optimise the special case of dequeueing the last entity and > reset ::runnable_load_avg early, which gives a performance improvement > to workloads that trigger the load balancer, such as fork-heavy > applications when SD_BALANCE_FORK is set, because it gives a more up > to date view of how busy the cpu is. 
> > Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk> > --- > kernel/sched/fair.c | 14 ++++++++++++-- > 1 file changed, 12 insertions(+), 2 deletions(-) > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index c6dd8bab010c..408ee90c7ea8 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -3007,10 +3007,20 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) > static inline void > dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) > { > + unsigned long load_avg = 0; > + > update_load_avg(se, 1); > > - cfs_rq->runnable_load_avg = > - max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); > + /* > + * If we're about to dequeue the last runnable entity we can > + * reset the runnable load average to zero instead of waiting > + * for it to decay naturally. This gives the load balancer a > + * more timely and accurate view of how busy this cpu is. > + */ > + if (cfs_rq->nr_running > 1) > + load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); > + > + cfs_rq->runnable_load_avg = load_avg; > cfs_rq->runnable_load_sum = > max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); > } > ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing 2016-07-11 8:58 ` Dietmar Eggemann @ 2016-07-12 11:14 ` Matt Fleming 0 siblings, 0 replies; 22+ messages in thread From: Matt Fleming @ 2016-07-12 11:14 UTC (permalink / raw) To: Dietmar Eggemann Cc: Mike Galbraith, Peter Zijlstra, Yuyang Du, LKML, Mel Gorman On Mon, 11 Jul, at 09:58:52AM, Dietmar Eggemann wrote: > This difference in the initial se->avg.load_avg value [0 or 1024] has an > influence in wake_affine() [weight = p->se.avg.load_avg;] for the wakeup > handling of the hackbench tasks in the 'send/receive data' phase. The way I was running hackbench made it very susceptible to changes in fork behaviour, i.e. running it with a small number of loops. > There are a couple of patches on tip/sched/core which might change the > behaviour of this: fork path, no double attach_entity_load_avg for new > task, no remove_entity_load_avg for new task, changes in effective_load ... Indeed they do! Things are much improved when running the latest tip/sched/core, thanks for the pointer. ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing 2016-06-14 7:58 [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing Mike Galbraith 2016-06-14 14:14 ` Dietmar Eggemann @ 2016-06-14 22:42 ` Yuyang Du 2016-06-15 7:01 ` Mike Galbraith 1 sibling, 1 reply; 22+ messages in thread From: Yuyang Du @ 2016-06-14 22:42 UTC (permalink / raw) To: Mike Galbraith; +Cc: Peter Zijlstra, LKML On Tue, Jun 14, 2016 at 09:58:31AM +0200, Mike Galbraith wrote: > SUSE's regression testing noticed that... > > 0905f04eb21f sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task() > > ...introduced a hackbench regression, and indeed it does. I think this > regression has more to do with randomness than anything else, but in > general... > > While averaging calms down load balancing, helping to keep migrations > down to a dull roar, it's not completely wonderful when it comes to > things that live in the here and now, hackbench being one such. > > time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done' > > real 0m55.397s > user 0m8.320s > sys 5m40.789s > > echo LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features > > real 0m48.049s > user 0m6.510s > sys 5m6.291s > > Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com> I am entirely for giving it a "clear unadulterated reality", and even more for it an option. Reviewed-by: Yuyang Du <yuyang.du@intel.com> ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing 2016-06-14 22:42 ` Yuyang Du @ 2016-06-15 7:01 ` Mike Galbraith 2016-06-16 11:46 ` [patch] sched/fair: Use instantaneous load in wakeup paths Mike Galbraith 0 siblings, 1 reply; 22+ messages in thread From: Mike Galbraith @ 2016-06-15 7:01 UTC (permalink / raw) To: Yuyang Du; +Cc: Peter Zijlstra, LKML On Wed, 2016-06-15 at 06:42 +0800, Yuyang Du wrote: > I am entirely for giving it a "clear unadulterated reality", and even > more for it an option. > > Reviewed-by: Yuyang Du <yuyang.du@intel.com> Thanks. I'll have a look at perhaps having wake_affine do the same, such that there is a clean separation of wake/LB paths. I suppose I should also try harder to sprinkle some 'pretty' on it, that's always the hardest part for a master of fugly :) -Mike ^ permalink raw reply [flat|nested] 22+ messages in thread
* [patch] sched/fair: Use instantaneous load in wakeup paths 2016-06-15 7:01 ` Mike Galbraith @ 2016-06-16 11:46 ` Mike Galbraith 2016-06-16 12:04 ` Mike Galbraith 0 siblings, 1 reply; 22+ messages in thread From: Mike Galbraith @ 2016-06-16 11:46 UTC (permalink / raw) To: Yuyang Du; +Cc: Peter Zijlstra, LKML On Wed, 2016-06-15 at 09:01 +0200, Mike Galbraith wrote: > On Wed, 2016-06-15 at 06:42 +0800, Yuyang Du wrote: > > > I am entirely for giving it a "clear unadulterated reality", and > > even > > more for it an option. > > > > Reviewed-by: Yuyang Du <yuyang.du@intel.com> > > Thanks. I'll have a look at perhaps having wake_affine to the same, > such that there is a clean separation of wake/LB paths. Something like so perhaps. I turned it on and whacked 'rfc' to try to attract a robot. Yoohoo, robo thingy... sched/fair: Use instantaneous load in wakeup paths Using load averages is not optimal for some loads, the fuzzy view of load can/will lead to more stacking of tasks than is good for loads that are all about latency, as the hackbench numbers below demonstrate. Give the user an instantaneous load switch for wakeup paths, perhaps eliminating it in future if benchmarks don't gripe. 
time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done' real 0m55.397s user 0m8.320s sys 5m40.789s echo LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features real 0m48.049s user 0m6.510s sys 5m6.291s Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com> --- kernel/sched/fair.c | 116 ++++++++++++++++++++++++++++++------------------ kernel/sched/features.h | 1 kernel/sched/sched.h | 8 +++ 3 files changed, 83 insertions(+), 42 deletions(-) --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -735,7 +735,8 @@ void post_init_entity_util_avg(struct sc } } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); +static inline unsigned long cfs_rq_runnable_load(struct cfs_rq *cfs_rq, + enum load_type type); static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) @@ -1226,9 +1227,9 @@ bool should_numa_migrate_memory(struct t group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long weighted_cpuload(const int cpu); -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); +static unsigned long cpu_load(const int cpu, enum load_type type); +static unsigned long source_load(int cpu, int load_idx, enum load_type type); +static unsigned long target_load(int cpu, int load_idx, enum load_type type); static unsigned long capacity_of(int cpu); static long effective_load(struct task_group *tg, int cpu, long wl, long wg); @@ -1258,7 +1259,7 @@ static void update_numa_stats(struct num struct rq *rq = cpu_rq(cpu); ns->nr_running += rq->nr_running; - ns->load += weighted_cpuload(cpu); + ns->load += cpu_load(cpu, LOAD_WEIGHTED); ns->compute_capacity += capacity_of(cpu); cpus++; @@ -3085,8 +3086,11 @@ void remove_entity_load_avg(struct sched atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) +static 
inline unsigned long cfs_rq_runnable_load(struct cfs_rq *cfs_rq, + enum load_type type) { + if (type == LOAD_INSTANTANEOUS) + return scale_load_down(cfs_rq->load.weight); return cfs_rq->runnable_load_avg; } @@ -4679,9 +4683,9 @@ static void cpu_load_update(struct rq *t } /* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) +static unsigned long cpu_load(const int cpu, enum load_type type) { - return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs); + return cfs_rq_runnable_load(&cpu_rq(cpu)->cfs, type); } #ifdef CONFIG_NO_HZ_COMMON @@ -4726,7 +4730,7 @@ static void cpu_load_update_idle(struct /* * bail if there's load or we're actually up-to-date. */ - if (weighted_cpuload(cpu_of(this_rq))) + if (cpu_load(cpu_of(this_rq), LOAD_WEIGHTED)) return; cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); @@ -4743,11 +4747,11 @@ void cpu_load_update_nohz_start(void) struct rq *this_rq = this_rq(); /* - * This is all lockless but should be fine. If weighted_cpuload changes + * This is all lockless but should be fine. If weighted cpu load changes * concurrently we'll exit nohz. And cpu_load write can race with * cpu_load_update_idle() but both updater would be writing the same. 
*/ - this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq)); + this_rq->cpu_load[0] = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED); } /* @@ -4762,7 +4766,7 @@ void cpu_load_update_nohz_stop(void) if (curr_jiffies == this_rq->last_load_update_tick) return; - load = weighted_cpuload(cpu_of(this_rq)); + load = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED); raw_spin_lock(&this_rq->lock); update_rq_clock(this_rq); cpu_load_update_nohz(this_rq, curr_jiffies, load); @@ -4788,7 +4792,7 @@ static void cpu_load_update_periodic(str */ void cpu_load_update_active(struct rq *this_rq) { - unsigned long load = weighted_cpuload(cpu_of(this_rq)); + unsigned long load = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED); if (tick_nohz_tick_stopped()) cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); @@ -4803,30 +4807,30 @@ void cpu_load_update_active(struct rq *t * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static unsigned long source_load(int cpu, int type) +static unsigned long source_load(int cpu, int load_idx, enum load_type type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); + unsigned long total = cpu_load(cpu, type); - if (type == 0 || !sched_feat(LB_BIAS)) + if (load_idx == 0 || !sched_feat(LB_BIAS)) return total; - return min(rq->cpu_load[type-1], total); + return min(rq->cpu_load[load_idx-1], total); } /* * Return a high guess at the load of a migration-target cpu weighted * according to the scheduling class and "nice" value. 
*/ -static unsigned long target_load(int cpu, int type) +static unsigned long target_load(int cpu, int load_idx, enum load_type type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); + unsigned long total = cpu_load(cpu, type); - if (type == 0 || !sched_feat(LB_BIAS)) + if (load_idx == 0 || !sched_feat(LB_BIAS)) return total; - return max(rq->cpu_load[type-1], total); + return max(rq->cpu_load[load_idx-1], total); } static unsigned long capacity_of(int cpu) @@ -4843,7 +4847,7 @@ static unsigned long cpu_avg_load_per_ta { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = weighted_cpuload(cpu); + unsigned long load_avg = cpu_load(cpu, LOAD_WEIGHTED); if (nr_running) return load_avg / nr_running; @@ -5013,7 +5017,15 @@ static int wake_wide(struct task_struct return 1; } -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +static unsigned long task_load(struct task_struct *p, int weighted) +{ + if (!weighted) + return scale_load_down(p->se.load.weight); + return p->se.avg.load_avg; +} + +static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync, + int weighted) { s64 this_load, load; s64 this_eff_load, prev_eff_load; @@ -5025,8 +5037,8 @@ static int wake_affine(struct sched_doma idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); + load = source_load(prev_cpu, idx, weighted); + this_load = target_load(this_cpu, idx, weighted); /* * If sync wakeup then subtract the (maximum possible) @@ -5035,14 +5047,14 @@ static int wake_affine(struct sched_doma */ if (sync) { tg = task_group(current); - weight = current->se.avg.load_avg; + weight = task_load(current, weighted); this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); - weight = p->se.avg.load_avg; 
+ weight = task_load(p, weighted); /* * In low-load situations, where prev_cpu is idle and this_cpu is idle @@ -5085,7 +5097,7 @@ static int wake_affine(struct sched_doma */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) + int this_cpu, int sd_flag, enum load_type type) { struct sched_group *idlest = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; @@ -5114,9 +5126,9 @@ find_idlest_group(struct sched_domain *s for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) - load = source_load(i, load_idx); + load = source_load(i, load_idx, type); else - load = target_load(i, load_idx); + load = target_load(i, load_idx, type); avg_load += load; } @@ -5141,7 +5153,8 @@ find_idlest_group(struct sched_domain *s * find_idlest_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, + enum load_type type) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -5175,7 +5188,7 @@ find_idlest_cpu(struct sched_group *grou shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { - load = weighted_cpuload(i); + load = cpu_load(i, type); if (load < min_load || (load == min_load && i == this_cpu)) { min_load = load; least_loaded_cpu = i; @@ -5282,6 +5295,24 @@ static int cpu_util(int cpu) return (util >= capacity) ? capacity : util; } +enum load_type wakeup_load_type(struct task_struct *p) +{ + if (!sched_feat(WAKE_INSTANTANEOUS_LOAD)) + return LOAD_WEIGHTED; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Group scheduling unconditionally uses average load. Use of + * instantaneous load is all about loads that live or die in + * the here and now, to which cgroups are fundamentally toxic. 
+ */ + if (task_group(p)->parent) + return LOAD_WEIGHTED; +#endif + + return LOAD_INSTANTANEOUS; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5302,6 +5333,7 @@ select_task_rq_fair(struct task_struct * int new_cpu = prev_cpu; int want_affine = 0; int sync = wake_flags & WF_SYNC; + enum load_type type = wakeup_load_type(p); if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); @@ -5331,7 +5363,7 @@ select_task_rq_fair(struct task_struct * if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + if (cpu != prev_cpu && wake_affine(affine_sd, p, sync, type)) new_cpu = cpu; } @@ -5348,13 +5380,13 @@ select_task_rq_fair(struct task_struct * continue; } - group = find_idlest_group(sd, p, cpu, sd_flag); + group = find_idlest_group(sd, p, cpu, sd_flag, type); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_cpu(group, p, cpu); + new_cpu = find_idlest_cpu(group, p, cpu, type); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@ -6710,9 +6742,9 @@ static inline void update_sg_lb_stats(st /* Bias balancing toward cpus of our domain */ if (local_group) - load = target_load(i, load_idx); + load = target_load(i, load_idx, LOAD_WEIGHTED); else - load = source_load(i, load_idx); + load = source_load(i, load_idx, LOAD_WEIGHTED); sgs->group_load += load; sgs->group_util += cpu_util(i); @@ -6726,7 +6758,7 @@ static inline void update_sg_lb_stats(st sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; #endif - sgs->sum_weighted_load += weighted_cpuload(i); + sgs->sum_weighted_load += cpu_load(i, LOAD_WEIGHTED); /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -7238,11 +7270,11 @@ static struct rq *find_busiest_queue(str capacity = capacity_of(i); - wl = 
weighted_cpuload(i); + wl = cpu_load(i, LOAD_WEIGHTED); /* - * When comparing with imbalance, use weighted_cpuload() - * which is not scaled with the cpu capacity. + * When comparing with imbalance, use cpu_load() which is + * not scaled with the cpu capacity. */ if (rq->nr_running == 1 && wl > env->imbalance && @@ -7251,7 +7283,7 @@ static struct rq *find_busiest_queue(str /* * For the load comparisons with the other cpu's, consider - * the weighted_cpuload() scaled with the cpu capacity, so + * the weighted cpu load scaled with the cpu capacity, so * that the load can be moved away from the cpu that is * potentially running at a lower capacity. * --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -39,6 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) +SCHED_FEAT(WAKE_INSTANTANEOUS_LOAD, true) /* * Decrement CPU capacity based on time not spent running tasks --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1631,6 +1631,14 @@ static inline void double_rq_unlock(stru __release(rq2->lock); } +/* + * Tell load balancing functions whether we want instantaneous or average load + */ +enum load_type { + LOAD_INSTANTANEOUS, + LOAD_WEIGHTED, +}; + #else /* CONFIG_SMP */ /* ^ permalink raw reply [flat|nested] 22+ messages in thread
* [patch] sched/fair: Use instantaneous load in wakeup paths 2016-06-16 11:46 ` [patch] sched/fair: Use instantaneous load in wakeup paths Mike Galbraith @ 2016-06-16 12:04 ` Mike Galbraith 2016-06-16 12:41 ` Mike Galbraith 0 siblings, 1 reply; 22+ messages in thread From: Mike Galbraith @ 2016-06-16 12:04 UTC (permalink / raw) To: Yuyang Du; +Cc: Peter Zijlstra, LKML On Thu, 2016-06-16 at 13:46 +0200, Mike Galbraith wrote: > On Wed, 2016-06-15 at 09:01 +0200, Mike Galbraith wrote: > > On Wed, 2016-06-15 at 06:42 +0800, Yuyang Du wrote: > > > > > I am entirely for giving it a "clear unadulterated reality", and > > > even > > > more for it an option. > > > > > > Reviewed-by: Yuyang Du <yuyang.du@intel.com> > > > > Thanks. I'll have a look at perhaps having wake_affine to the same, > > such that there is a clean separation of wake/LB paths. > > Something like so perhaps. I turned it on and whacked 'rfc' to try to > attract a robot. Yoohoo, robo thingy... (stealthily inserts refreshed one) sched/fair: Use instantaneous load in wakeup paths Using load averages is not optimal for some loads, the fuzzy view of load can/will lead to more stacking of tasks than is good for loads that are all about latency, as the hackbench numbers below demonstrate. Give the user an instantaneous load switch for wakeup paths, perhaps eliminating it in future if benchmarks don't gripe. 
time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done' real 0m55.397s user 0m8.320s sys 5m40.789s echo LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features real 0m48.049s user 0m6.510s sys 5m6.291s Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com> --- kernel/sched/fair.c | 116 ++++++++++++++++++++++++++++++------------------ kernel/sched/features.h | 1 kernel/sched/sched.h | 8 +++ 3 files changed, 83 insertions(+), 42 deletions(-) --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -735,7 +735,8 @@ void post_init_entity_util_avg(struct sc } } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); +static inline unsigned long cfs_rq_runnable_load(struct cfs_rq *cfs_rq, + enum load_type type); static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) @@ -1226,9 +1227,9 @@ bool should_numa_migrate_memory(struct t group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long weighted_cpuload(const int cpu); -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); +static unsigned long cpu_load(const int cpu, enum load_type type); +static unsigned long source_load(int cpu, int load_idx, enum load_type type); +static unsigned long target_load(int cpu, int load_idx, enum load_type type); static unsigned long capacity_of(int cpu); static long effective_load(struct task_group *tg, int cpu, long wl, long wg); @@ -1258,7 +1259,7 @@ static void update_numa_stats(struct num struct rq *rq = cpu_rq(cpu); ns->nr_running += rq->nr_running; - ns->load += weighted_cpuload(cpu); + ns->load += cpu_load(cpu, LOAD_WEIGHTED); ns->compute_capacity += capacity_of(cpu); cpus++; @@ -3085,8 +3086,11 @@ void remove_entity_load_avg(struct sched atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) +static 
inline unsigned long cfs_rq_runnable_load(struct cfs_rq *cfs_rq, + enum load_type type) { + if (type == LOAD_INSTANTANEOUS) + return scale_load_down(cfs_rq->load.weight); return cfs_rq->runnable_load_avg; } @@ -4679,9 +4683,9 @@ static void cpu_load_update(struct rq *t } /* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) +static unsigned long cpu_load(const int cpu, enum load_type type) { - return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs); + return cfs_rq_runnable_load(&cpu_rq(cpu)->cfs, type); } #ifdef CONFIG_NO_HZ_COMMON @@ -4726,7 +4730,7 @@ static void cpu_load_update_idle(struct /* * bail if there's load or we're actually up-to-date. */ - if (weighted_cpuload(cpu_of(this_rq))) + if (cpu_load(cpu_of(this_rq), LOAD_WEIGHTED)) return; cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); @@ -4743,11 +4747,11 @@ void cpu_load_update_nohz_start(void) struct rq *this_rq = this_rq(); /* - * This is all lockless but should be fine. If weighted_cpuload changes + * This is all lockless but should be fine. If weighted cpu load changes * concurrently we'll exit nohz. And cpu_load write can race with * cpu_load_update_idle() but both updater would be writing the same. 
*/ - this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq)); + this_rq->cpu_load[0] = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED); } /* @@ -4762,7 +4766,7 @@ void cpu_load_update_nohz_stop(void) if (curr_jiffies == this_rq->last_load_update_tick) return; - load = weighted_cpuload(cpu_of(this_rq)); + load = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED); raw_spin_lock(&this_rq->lock); update_rq_clock(this_rq); cpu_load_update_nohz(this_rq, curr_jiffies, load); @@ -4788,7 +4792,7 @@ static void cpu_load_update_periodic(str */ void cpu_load_update_active(struct rq *this_rq) { - unsigned long load = weighted_cpuload(cpu_of(this_rq)); + unsigned long load = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED); if (tick_nohz_tick_stopped()) cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); @@ -4803,30 +4807,30 @@ void cpu_load_update_active(struct rq *t * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static unsigned long source_load(int cpu, int type) +static unsigned long source_load(int cpu, int load_idx, enum load_type type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); + unsigned long total = cpu_load(cpu, type); - if (type == 0 || !sched_feat(LB_BIAS)) + if (load_idx == 0 || !sched_feat(LB_BIAS)) return total; - return min(rq->cpu_load[type-1], total); + return min(rq->cpu_load[load_idx-1], total); } /* * Return a high guess at the load of a migration-target cpu weighted * according to the scheduling class and "nice" value. 
*/ -static unsigned long target_load(int cpu, int type) +static unsigned long target_load(int cpu, int load_idx, enum load_type type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); + unsigned long total = cpu_load(cpu, type); - if (type == 0 || !sched_feat(LB_BIAS)) + if (load_idx == 0 || !sched_feat(LB_BIAS)) return total; - return max(rq->cpu_load[type-1], total); + return max(rq->cpu_load[load_idx-1], total); } static unsigned long capacity_of(int cpu) @@ -4843,7 +4847,7 @@ static unsigned long cpu_avg_load_per_ta { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = weighted_cpuload(cpu); + unsigned long load_avg = cpu_load(cpu, LOAD_WEIGHTED); if (nr_running) return load_avg / nr_running; @@ -5013,7 +5017,15 @@ static int wake_wide(struct task_struct return 1; } -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +static unsigned long task_load(struct task_struct *p, enum load_type type) +{ + if (type == LOAD_INSTANTANEOUS) + return scale_load_down(p->se.load.weight); + return p->se.avg.load_avg; +} + +static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync, + enum load_type type) { s64 this_load, load; s64 this_eff_load, prev_eff_load; @@ -5025,8 +5037,8 @@ static int wake_affine(struct sched_doma idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); + load = source_load(prev_cpu, idx, type); + this_load = target_load(this_cpu, idx, type); /* * If sync wakeup then subtract the (maximum possible) @@ -5035,14 +5047,14 @@ static int wake_affine(struct sched_doma */ if (sync) { tg = task_group(current); - weight = current->se.avg.load_avg; + weight = task_load(current, type); this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); - weight = 
p->se.avg.load_avg; + weight = task_load(p, type); /* * In low-load situations, where prev_cpu is idle and this_cpu is idle @@ -5085,7 +5097,7 @@ static int wake_affine(struct sched_doma */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) + int this_cpu, int sd_flag, enum load_type type) { struct sched_group *idlest = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; @@ -5114,9 +5126,9 @@ find_idlest_group(struct sched_domain *s for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) - load = source_load(i, load_idx); + load = source_load(i, load_idx, type); else - load = target_load(i, load_idx); + load = target_load(i, load_idx, type); avg_load += load; } @@ -5141,7 +5153,8 @@ find_idlest_group(struct sched_domain *s * find_idlest_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, + enum load_type type) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -5175,7 +5188,7 @@ find_idlest_cpu(struct sched_group *grou shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { - load = weighted_cpuload(i); + load = cpu_load(i, type); if (load < min_load || (load == min_load && i == this_cpu)) { min_load = load; least_loaded_cpu = i; @@ -5282,6 +5295,24 @@ static int cpu_util(int cpu) return (util >= capacity) ? capacity : util; } +enum load_type wakeup_load_type(struct task_struct *p) +{ + if (!sched_feat(WAKE_INSTANTANEOUS_LOAD)) + return LOAD_WEIGHTED; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Group scheduling unconditionally uses average load. Use of + * instantaneous load is all about loads that live or die in + * the here and now, to which cgroups are fundamentally toxic. 
+ */ + if (task_group(p)->parent) + return LOAD_WEIGHTED; +#endif + + return LOAD_INSTANTANEOUS; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5302,6 +5333,7 @@ select_task_rq_fair(struct task_struct * int new_cpu = prev_cpu; int want_affine = 0; int sync = wake_flags & WF_SYNC; + enum load_type type = wakeup_load_type(p); if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); @@ -5331,7 +5363,7 @@ select_task_rq_fair(struct task_struct * if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync, type)) new_cpu = cpu; } @@ -5348,13 +5380,13 @@ select_task_rq_fair(struct task_struct * continue; } - group = find_idlest_group(sd, p, cpu, sd_flag); + group = find_idlest_group(sd, p, cpu, sd_flag, type); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_cpu(group, p, cpu); + new_cpu = find_idlest_cpu(group, p, cpu, type); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@ -6710,9 +6742,9 @@ static inline void update_sg_lb_stats(st /* Bias balancing toward cpus of our domain */ if (local_group) - load = target_load(i, load_idx); + load = target_load(i, load_idx, LOAD_WEIGHTED); else - load = source_load(i, load_idx); + load = source_load(i, load_idx, LOAD_WEIGHTED); sgs->group_load += load; sgs->group_util += cpu_util(i); @@ -6726,7 +6758,7 @@ static inline void update_sg_lb_stats(st sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; #endif - sgs->sum_weighted_load += weighted_cpuload(i); + sgs->sum_weighted_load += cpu_load(i, LOAD_WEIGHTED); /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -7238,11 +7270,11 @@ static struct rq *find_busiest_queue(str capacity = capacity_of(i); - wl = 
weighted_cpuload(i); + wl = cpu_load(i, LOAD_WEIGHTED); /* - * When comparing with imbalance, use weighted_cpuload() - * which is not scaled with the cpu capacity. + * When comparing with imbalance, use cpu_load() which is + * not scaled with the cpu capacity. */ if (rq->nr_running == 1 && wl > env->imbalance && @@ -7251,7 +7283,7 @@ static struct rq *find_busiest_queue(str /* * For the load comparisons with the other cpu's, consider - * the weighted_cpuload() scaled with the cpu capacity, so + * the weighted cpu load scaled with the cpu capacity, so * that the load can be moved away from the cpu that is * potentially running at a lower capacity. * --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -39,6 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) +SCHED_FEAT(WAKE_INSTANTANEOUS_LOAD, true) /* * Decrement CPU capacity based on time not spent running tasks --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1631,6 +1631,14 @@ static inline void double_rq_unlock(stru __release(rq2->lock); } +/* + * Tell load balancing functions whether we want instantaneous or average load + */ +enum load_type { + LOAD_INSTANTANEOUS, + LOAD_WEIGHTED, +}; + #else /* CONFIG_SMP */ /* ^ permalink raw reply [flat|nested] 22+ messages in thread
* [patch] sched/fair: Use instantaneous load in wakeup paths 2016-06-16 12:04 ` Mike Galbraith @ 2016-06-16 12:41 ` Mike Galbraith 2016-06-17 6:21 ` Mike Galbraith 0 siblings, 1 reply; 22+ messages in thread From: Mike Galbraith @ 2016-06-16 12:41 UTC (permalink / raw) To: Yuyang Du; +Cc: Peter Zijlstra, LKML On Thu, 2016-06-16 at 14:04 +0200, Mike Galbraith wrote: > On Thu, 2016-06-16 at 13:46 +0200, Mike Galbraith wrote: > > On Wed, 2016-06-15 at 09:01 +0200, Mike Galbraith wrote: > > > On Wed, 2016-06-15 at 06:42 +0800, Yuyang Du wrote: > > > > > > > I am entirely for giving it a "clear unadulterated reality", > > > > and > > > > even > > > > more for it an option. > > > > > > > > Reviewed-by: Yuyang Du <yuyang.du@intel.com> > > > > > > Thanks. I'll have a look at perhaps having wake_affine to the > > > same, > > > such that there is a clean separation of wake/LB paths. > > > > Something like so perhaps. I turned it on and whacked 'rfc' to try > > to > > attract a robot. Yoohoo, robo thingy... > > (stealthily inserts refreshed one) (grr.. I give up on today) sched/fair: Use instantaneous load in wakeup paths Using load averages is not optimal for some loads, the fuzzy view of load can/will lead to more stacking of tasks that is good for loads that are all about latency, as the hackbench numbers below demonstrate. Give the user an instantaneous load switch for wakeup paths, perhaps eliminating it in future if benchmarks don't gripe. 
time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done' real 0m55.397s user 0m8.320s sys 5m40.789s echo LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features real 0m48.049s user 0m6.510s sys 5m6.291s Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com> --- kernel/sched/fair.c | 116 ++++++++++++++++++++++++++++++------------------ kernel/sched/features.h | 1 kernel/sched/sched.h | 8 +++ 3 files changed, 83 insertions(+), 42 deletions(-) --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -735,7 +735,8 @@ void post_init_entity_util_avg(struct sc } } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); +static inline unsigned long cfs_rq_runnable_load(struct cfs_rq *cfs_rq, + enum load_type type); static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) @@ -1226,9 +1227,9 @@ bool should_numa_migrate_memory(struct t group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long weighted_cpuload(const int cpu); -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); +static unsigned long cpu_load(const int cpu, enum load_type type); +static unsigned long source_load(int cpu, int load_idx, enum load_type type); +static unsigned long target_load(int cpu, int load_idx, enum load_type type); static unsigned long capacity_of(int cpu); static long effective_load(struct task_group *tg, int cpu, long wl, long wg); @@ -1258,7 +1259,7 @@ static void update_numa_stats(struct num struct rq *rq = cpu_rq(cpu); ns->nr_running += rq->nr_running; - ns->load += weighted_cpuload(cpu); + ns->load += cpu_load(cpu, LOAD_WEIGHTED); ns->compute_capacity += capacity_of(cpu); cpus++; @@ -3085,8 +3086,11 @@ void remove_entity_load_avg(struct sched atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) +static 
inline unsigned long cfs_rq_runnable_load(struct cfs_rq *cfs_rq, + enum load_type type) { + if (type == LOAD_INSTANTANEOUS) + return scale_load_down(cfs_rq->load.weight); return cfs_rq->runnable_load_avg; } @@ -4679,9 +4683,9 @@ static void cpu_load_update(struct rq *t } /* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) +static unsigned long cpu_load(const int cpu, enum load_type type) { - return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs); + return cfs_rq_runnable_load(&cpu_rq(cpu)->cfs, type); } #ifdef CONFIG_NO_HZ_COMMON @@ -4726,7 +4730,7 @@ static void cpu_load_update_idle(struct /* * bail if there's load or we're actually up-to-date. */ - if (weighted_cpuload(cpu_of(this_rq))) + if (cpu_load(cpu_of(this_rq), LOAD_WEIGHTED)) return; cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); @@ -4743,11 +4747,11 @@ void cpu_load_update_nohz_start(void) struct rq *this_rq = this_rq(); /* - * This is all lockless but should be fine. If weighted_cpuload changes + * This is all lockless but should be fine. If weighted cpu load changes * concurrently we'll exit nohz. And cpu_load write can race with * cpu_load_update_idle() but both updater would be writing the same. 
*/ - this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq)); + this_rq->cpu_load[0] = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED); } /* @@ -4762,7 +4766,7 @@ void cpu_load_update_nohz_stop(void) if (curr_jiffies == this_rq->last_load_update_tick) return; - load = weighted_cpuload(cpu_of(this_rq)); + load = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED); raw_spin_lock(&this_rq->lock); update_rq_clock(this_rq); cpu_load_update_nohz(this_rq, curr_jiffies, load); @@ -4788,7 +4792,7 @@ static void cpu_load_update_periodic(str */ void cpu_load_update_active(struct rq *this_rq) { - unsigned long load = weighted_cpuload(cpu_of(this_rq)); + unsigned long load = cpu_load(cpu_of(this_rq), LOAD_WEIGHTED); if (tick_nohz_tick_stopped()) cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); @@ -4803,30 +4807,30 @@ void cpu_load_update_active(struct rq *t * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static unsigned long source_load(int cpu, int type) +static unsigned long source_load(int cpu, int load_idx, enum load_type type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); + unsigned long total = cpu_load(cpu, type); - if (type == 0 || !sched_feat(LB_BIAS)) + if (load_idx == 0 || !sched_feat(LB_BIAS)) return total; - return min(rq->cpu_load[type-1], total); + return min(rq->cpu_load[load_idx-1], total); } /* * Return a high guess at the load of a migration-target cpu weighted * according to the scheduling class and "nice" value. 
*/ -static unsigned long target_load(int cpu, int type) +static unsigned long target_load(int cpu, int load_idx, enum load_type type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); + unsigned long total = cpu_load(cpu, type); - if (type == 0 || !sched_feat(LB_BIAS)) + if (load_idx == 0 || !sched_feat(LB_BIAS)) return total; - return max(rq->cpu_load[type-1], total); + return max(rq->cpu_load[load_idx-1], total); } static unsigned long capacity_of(int cpu) @@ -4843,7 +4847,7 @@ static unsigned long cpu_avg_load_per_ta { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = weighted_cpuload(cpu); + unsigned long load_avg = cpu_load(cpu, LOAD_WEIGHTED); if (nr_running) return load_avg / nr_running; @@ -5013,7 +5017,15 @@ static int wake_wide(struct task_struct return 1; } -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +static unsigned long task_load(struct task_struct *p, enum load_type type) +{ + if (type == LOAD_INSTANTANEOUS) + return scale_load_down(p->se.load.weight); + return p->se.avg.load_avg; +} + +static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync, + enum load_type type) { s64 this_load, load; s64 this_eff_load, prev_eff_load; @@ -5025,8 +5037,8 @@ static int wake_affine(struct sched_doma idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); + load = source_load(prev_cpu, idx, type); + this_load = target_load(this_cpu, idx, type); /* * If sync wakeup then subtract the (maximum possible) @@ -5035,14 +5047,14 @@ static int wake_affine(struct sched_doma */ if (sync) { tg = task_group(current); - weight = current->se.avg.load_avg; + weight = task_load(current, type); this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); - weight = 
p->se.avg.load_avg; + weight = task_load(p, type); /* * In low-load situations, where prev_cpu is idle and this_cpu is idle @@ -5085,7 +5097,7 @@ static int wake_affine(struct sched_doma */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) + int this_cpu, int sd_flag, enum load_type type) { struct sched_group *idlest = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; @@ -5114,9 +5126,9 @@ find_idlest_group(struct sched_domain *s for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) - load = source_load(i, load_idx); + load = source_load(i, load_idx, type); else - load = target_load(i, load_idx); + load = target_load(i, load_idx, type); avg_load += load; } @@ -5141,7 +5153,8 @@ find_idlest_group(struct sched_domain *s * find_idlest_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, + enum load_type type) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -5175,7 +5188,7 @@ find_idlest_cpu(struct sched_group *grou shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { - load = weighted_cpuload(i); + load = cpu_load(i, type); if (load < min_load || (load == min_load && i == this_cpu)) { min_load = load; least_loaded_cpu = i; @@ -5282,6 +5295,24 @@ static int cpu_util(int cpu) return (util >= capacity) ? capacity : util; } +enum load_type wakeup_load_type(struct task_struct *p) +{ + if (!sched_feat(WAKE_INSTANTANEOUS_LOAD)) + return LOAD_WEIGHTED; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Group scheduling unconditionally use average load. Use of + * instantaneous load is all about loads that live or die in + * the here and now, to which cgroups are fundamentally toxic. 
+ */ + if (task_group(p)->parent) + return LOAD_WEIGHTED; +#endif + + return LOAD_INSTANTANEOUS; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5302,6 +5333,7 @@ select_task_rq_fair(struct task_struct * int new_cpu = prev_cpu; int want_affine = 0; int sync = wake_flags & WF_SYNC; + enum load_type type = wakeup_load_type(p); if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); @@ -5331,7 +5363,7 @@ select_task_rq_fair(struct task_struct * if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + if (cpu != prev_cpu && wake_affine(affine_sd, p, sync, type)) new_cpu = cpu; } @@ -5348,13 +5380,13 @@ select_task_rq_fair(struct task_struct * continue; } - group = find_idlest_group(sd, p, cpu, sd_flag); + group = find_idlest_group(sd, p, cpu, sd_flag, type); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_cpu(group, p, cpu); + new_cpu = find_idlest_cpu(group, p, cpu, type); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@ -6710,9 +6742,9 @@ static inline void update_sg_lb_stats(st /* Bias balancing toward cpus of our domain */ if (local_group) - load = target_load(i, load_idx); + load = target_load(i, load_idx, LOAD_WEIGHTED); else - load = source_load(i, load_idx); + load = source_load(i, load_idx, LOAD_WEIGHTED); sgs->group_load += load; sgs->group_util += cpu_util(i); @@ -6726,7 +6758,7 @@ static inline void update_sg_lb_stats(st sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; #endif - sgs->sum_weighted_load += weighted_cpuload(i); + sgs->sum_weighted_load += cpu_load(i, LOAD_WEIGHTED); /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -7238,11 +7270,11 @@ static struct rq *find_busiest_queue(str capacity = capacity_of(i); - wl = 
weighted_cpuload(i); + wl = cpu_load(i, LOAD_WEIGHTED); /* - * When comparing with imbalance, use weighted_cpuload() - * which is not scaled with the cpu capacity. + * When comparing with imbalance, use cpu_load() which is + * not scaled with the cpu capacity. */ if (rq->nr_running == 1 && wl > env->imbalance && @@ -7251,7 +7283,7 @@ static struct rq *find_busiest_queue(str /* * For the load comparisons with the other cpu's, consider - * the weighted_cpuload() scaled with the cpu capacity, so + * the weighted cpu load scaled with the cpu capacity, so * that the load can be moved away from the cpu that is * potentially running at a lower capacity. * --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -39,6 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) +SCHED_FEAT(WAKE_INSTANTANEOUS_LOAD, true) /* * Decrement CPU capacity based on time not spent running tasks --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1631,6 +1631,14 @@ static inline void double_rq_unlock(stru __release(rq2->lock); } +/* + * Tell load balancing functions whether we want instantaneous or average load + */ +enum load_type { + LOAD_INSTANTANEOUS, + LOAD_WEIGHTED, +}; + #else /* CONFIG_SMP */ /* ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [patch] sched/fair: Use instantaneous load in wakeup paths 2016-06-16 12:41 ` Mike Galbraith @ 2016-06-17 6:21 ` Mike Galbraith 2016-06-17 10:55 ` Dietmar Eggemann 0 siblings, 1 reply; 22+ messages in thread From: Mike Galbraith @ 2016-06-17 6:21 UTC (permalink / raw) To: Yuyang Du; +Cc: Peter Zijlstra, LKML Here are some schbench runs on an 8x8 box to show that longish run/sleep period corner I mentioned. vogelweide:~/:[1]# for i in `seq 5`; do schbench -m 8 -t 1 -a -r 10 2>&1 | grep 'threads 8'; done cputime 30000 threads 8 p99 68 cputime 30000 threads 8 p99 46 cputime 30000 threads 8 p99 46 cputime 30000 threads 8 p99 45 cputime 30000 threads 8 p99 49 vogelweide:~/:[0]# echo NO_WAKE_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features vogelweide:~/:[0]# for i in `seq 5`; do schbench -m 8 -t 1 -a -r 10 2>&1 | grep 'threads 8'; done cputime 30000 threads 8 p99 9968 cputime 30000 threads 8 p99 10224 vogelweide:~/:[0]# Using instantaneous load, we fill the box every time, without, we stack every time. This was with Peter's select_idle_sibling() rewrite applied as well, but you can see that it does matter. That doesn't mean I think my patch should immediately fly upstream 'course, who knows, there may be a less messy way to deal with it, or, as already stated, maybe it just doesn't matter enough to the real world to even bother with. -Mike ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [patch] sched/fair: Use instantaneous load in wakeup paths 2016-06-17 6:21 ` Mike Galbraith @ 2016-06-17 10:55 ` Dietmar Eggemann 2016-06-17 13:57 ` Mike Galbraith 0 siblings, 1 reply; 22+ messages in thread From: Dietmar Eggemann @ 2016-06-17 10:55 UTC (permalink / raw) To: Mike Galbraith, Yuyang Du; +Cc: Peter Zijlstra, LKML On 17/06/16 07:21, Mike Galbraith wrote: > Here are some schbench runs on an 8x8 box to show that longish > run/sleep period corner I mentioned. > > vogelweide:~/:[1]# for i in `seq 5`; do schbench -m 8 -t 1 -a -r 10 2>&1 | grep 'threads 8'; done > cputime 30000 threads 8 p99 68 > cputime 30000 threads 8 p99 46 > cputime 30000 threads 8 p99 46 > cputime 30000 threads 8 p99 45 > cputime 30000 threads 8 p99 49 > vogelweide:~/:[0]# echo NO_WAKE_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features > vogelweide:~/:[0]# for i in `seq 5`; do schbench -m 8 -t 1 -a -r 10 2>&1 | grep 'threads 8'; done > cputime 30000 threads 8 p99 9968 > cputime 30000 threads 8 p99 10224 > vogelweide:~/:[0]# > Is this the influence of wake_affine using instantaneous load now too or did you set SD_BALANCE_WAKE on sd's or both? > Using instantaneous load, we fill the box every time, without, we stack > every time. This was with Peter's select_idle_sibling() rewrite > applied as well, but you can see that it does matter. > > That doesn't mean I think my patch should immediately fly upstream > 'course, who knows, there may be a less messy way to deal with it, or, > as already stated, maybe it just doesn't matter enough to the real > world to even bother with. IMHO, if it would be possible to get rid of sd->wake_idx, sd->forkexec_idx, the implementation would be less messy. Is there anyone changing these values to something other that the default 0? > > -Mike > ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [patch] sched/fair: Use instantaneous load in wakeup paths 2016-06-17 10:55 ` Dietmar Eggemann @ 2016-06-17 13:57 ` Mike Galbraith 0 siblings, 0 replies; 22+ messages in thread From: Mike Galbraith @ 2016-06-17 13:57 UTC (permalink / raw) To: Dietmar Eggemann, Yuyang Du; +Cc: Peter Zijlstra, LKML On Fri, 2016-06-17 at 11:55 +0100, Dietmar Eggemann wrote: > On 17/06/16 07:21, Mike Galbraith wrote: > > Here are some schbench runs on an 8x8 box to show that longish > > run/sleep period corner I mentioned. > > > > vogelweide:~/:[1]# for i in `seq 5`; do schbench -m 8 -t 1 -a -r 10 2>&1 | grep 'threads 8'; done > > cputime 30000 threads 8 p99 68 > > cputime 30000 threads 8 p99 46 > > cputime 30000 threads 8 p99 46 > > cputime 30000 threads 8 p99 45 > > cputime 30000 threads 8 p99 49 > > vogelweide:~/:[0]# echo NO_WAKE_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features > > vogelweide:~/:[0]# for i in `seq 5`; do schbench -m 8 -t 1 -a -r 10 2>&1 | grep 'threads 8'; done > > cputime 30000 threads 8 p99 9968 > > cputime 30000 threads 8 p99 10224 > > vogelweide:~/:[0]# > > > > Is this the influence of wake_affine using instantaneous load now too or > did you set SD_BALANCE_WAKE on sd's or both? It's likely just the fork bits, I didn't turn on SD_BALANCE_WAKE. > > > Using instantaneous load, we fill the box every time, without, we stack > > every time. This was with Peter's select_idle_sibling() rewrite > > applied as well, but you can see that it does matter. > > > > That doesn't mean I think my patch should immediately fly upstream > > 'course, who knows, there may be a less messy way to deal with it, or, > > as already stated, maybe it just doesn't matter enough to the real > > world to even bother with. > > IMHO, if it would be possible to get rid of sd->wake_idx, > sd->forkexec_idx, the implementation would be less messy. Is there > anyone changing these values to something other that the default 0? Dunno. 
Doesn't matter much until we answer the question are the numbers we're using good enough, or are they not. Hackbench and schbench say we can certainly distribute load better by looking at the real deal instead of a ball of fuzz (a scheduler dust monster;), but how long have we been doing that, and how many real world complaints do we have? The schbench thing is based on a real world load, but the real world complaint isn't the fork distribution thing that schbench demonstrates, that's a periodic load corner, not the we're waking to busy CPUs while there are idle CPUs available that Facebook is griping about. So we have zero real world complaints, we have hackbench moving because the ball of fuzz got reshaped, and we have the bumpy spot that schbench hits with or without the bugfix that caused hackbench to twitch. -Mike ^ permalink raw reply [flat|nested] 22+ messages in thread
end of thread, other threads:[~2016-07-12 11:14 UTC | newest] Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2016-06-14 7:58 [rfc patch] sched/fair: Use instantaneous load for fork/exec balancing Mike Galbraith 2016-06-14 14:14 ` Dietmar Eggemann 2016-06-14 16:40 ` Mike Galbraith 2016-06-15 15:32 ` Dietmar Eggemann 2016-06-15 16:03 ` Mike Galbraith 2016-06-15 19:03 ` Dietmar Eggemann 2016-06-16 3:33 ` Mike Galbraith 2016-06-16 9:01 ` Dietmar Eggemann 2016-07-04 15:04 ` Matt Fleming 2016-07-04 17:43 ` Mike Galbraith 2016-07-06 11:45 ` Matt Fleming 2016-07-06 12:21 ` Mike Galbraith 2016-07-11 8:58 ` Dietmar Eggemann 2016-07-12 11:14 ` Matt Fleming 2016-06-14 22:42 ` Yuyang Du 2016-06-15 7:01 ` Mike Galbraith 2016-06-16 11:46 ` [patch] sched/fair: Use instantaneous load in wakeup paths Mike Galbraith 2016-06-16 12:04 ` Mike Galbraith 2016-06-16 12:41 ` Mike Galbraith 2016-06-17 6:21 ` Mike Galbraith 2016-06-17 10:55 ` Dietmar Eggemann 2016-06-17 13:57 ` Mike Galbraith
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).