On Tue, Aug 01, 2017 at 02:19:12PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 27, 2017 at 10:55:58AM -0400, Rik van Riel wrote:
> > On Tue, 2017-06-27 at 07:39 +0200, Peter Zijlstra wrote:
> > > If LLC < NUMA or !NUMA_BALANCING we have a region that needs to do
> > > _something_.
> >
> > Agreed. I will fix this. Given that this is a bit
> > of a corner case,
>
> Ok, so it turns out all of Facebook sits inside that corner case because
> they have !NUMA_BALANCING.
>
> How does the below patch work?
>
> ---
> Subject: sched/fair: Fix wake_affine() for !NUMA_BALANCING
> From: Peter Zijlstra
> Date: Mon Jul 31 17:50:05 CEST 2017
>
> In commit:
>
>   3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
>
> Rik changed wake_affine to consider NUMA information when balancing
> between LLC domains.
>
> There are a number of problems here which this patch tries to address:
>
>  - LLC < NODE; in this case we'd use the wrong information to balance
>  - !NUMA_BALANCING: in this case, the new code doesn't do any
>    balancing at all
>  - re-computes the NUMA data for every wakeup, this can mean iterating
>    up to 64 CPUs for every wakeup.
>  - default affine wakeups inside a cache
>
> We address these by saving the load/capacity values for each
> sched_domain during regular load-balance and using these values in
> wake_affine_llc(). The obvious down-side to using cached values is
> that they can be too old and poorly reflect reality.
>
> But this way we can use LLC wide information and thus not rely on
> assuming LLC matches NODE. We also don't rely on NUMA_BALANCING nor do
> we have to aggregate two nodes (or even cache domains) worth of CPUs
> for each wakeup.
>
> Cc: Josef Bacik
> Cc: Rik van Riel
> Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
> Signed-off-by: Peter Zijlstra (Intel)
> ---
>  include/linux/sched/topology.h |   9 ++
>  kernel/sched/fair.c            | 182 +++++++++++++++++++++++++----------------
>  2 files changed, 123 insertions(+), 68 deletions(-)
>
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -71,6 +71,15 @@ struct sched_domain_shared {
>  	atomic_t	ref;
>  	atomic_t	nr_busy_cpus;
>  	int		has_idle_cores;
> +
> +	/*
> +	 * Some variables from the most recent sd_lb_stats for this domain.
> +	 *
> +	 * used by wake_affine().
> +	 */
> +	unsigned long	nr_running;
> +	unsigned long	load;
> +	unsigned long	capacity;
>  };
>
>  struct sched_domain {
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2586,59 +2586,6 @@ void task_tick_numa(struct rq *rq, struc
>  	}
>  }
>
> -/*
> - * Can a task be moved from prev_cpu to this_cpu without causing a load
> - * imbalance that would trigger the load balancer?
> - */
> -static inline bool numa_wake_affine(struct sched_domain *sd,
> -				    struct task_struct *p, int this_cpu,
> -				    int prev_cpu, int sync)
> -{
> -	struct numa_stats prev_load, this_load;
> -	s64 this_eff_load, prev_eff_load;
> -
> -	update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
> -	update_numa_stats(&this_load, cpu_to_node(this_cpu));
> -
> -	/*
> -	 * If sync wakeup then subtract the (maximum possible)
> -	 * effect of the currently running task from the load
> -	 * of the current CPU:
> -	 */
> -	if (sync) {
> -		unsigned long current_load = task_h_load(current);
> -
> -		if (this_load.load > current_load)
> -			this_load.load -= current_load;
> -		else
> -			this_load.load = 0;
> -	}
> -
> -	/*
> -	 * In low-load situations, where this_cpu's node is idle due to the
> -	 * sync cause above having dropped this_load.load to 0, move the task.
> -	 * Moving to an idle socket will not create a bad imbalance.
> -	 *
> -	 * Otherwise check if the nodes are near enough in load to allow this
> -	 * task to be woken on this_cpu's node.
> -	 */
> -	if (this_load.load > 0) {
> -		unsigned long task_load = task_h_load(p);
> -
> -		this_eff_load = 100;
> -		this_eff_load *= prev_load.compute_capacity;
> -
> -		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
> -		prev_eff_load *= this_load.compute_capacity;
> -
> -		this_eff_load *= this_load.load + task_load;
> -		prev_eff_load *= prev_load.load - task_load;
> -
> -		return this_eff_load <= prev_eff_load;
> -	}
> -
> -	return true;
> -}
>  #else
>  static void task_tick_numa(struct rq *rq, struct task_struct *curr)
>  {
> @@ -2652,14 +2599,6 @@ static inline void account_numa_dequeue(
>  {
>  }
>
> -#ifdef CONFIG_SMP
> -static inline bool numa_wake_affine(struct sched_domain *sd,
> -				    struct task_struct *p, int this_cpu,
> -				    int prev_cpu, int sync)
> -{
> -	return true;
> -}
> -#endif /* !SMP */
>  #endif /* CONFIG_NUMA_BALANCING */
>
>  static void
> @@ -5356,20 +5295,110 @@ static int wake_wide(struct task_struct
>  	return 1;
>  }
>
> +struct llc_stats {
> +	unsigned long	nr_running;
> +	unsigned long	load;
> +	unsigned long	capacity;
> +	int		has_capacity;
> +};
> +
> +static void get_llc_stats(struct llc_stats *stats, int cpu)
> +{
> +	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> +
> +	stats->nr_running	= READ_ONCE(sds->nr_running);
> +	stats->load		= READ_ONCE(sds->load);
> +	stats->capacity		= READ_ONCE(sds->capacity);
> +	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
> +}
> +
> +/*
> + * Can a task be moved from prev_cpu to this_cpu without causing a load
> + * imbalance that would trigger the load balancer?
> + *
> + * Since we're running on 'stale' values, we might in fact create an imbalance,
> + * but recomputing these values is expensive, as that'd mean iterating 2 cache
> + * domains worth of CPUs.
> + */
> +static bool
> +wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
> +		int this_cpu, int prev_cpu, int sync)
> +{
> +	struct llc_stats prev_stats, this_stats;
> +	s64 this_eff_load, prev_eff_load;
> +	unsigned long task_load;
> +
> +	get_llc_stats(&prev_stats, prev_cpu);
> +	get_llc_stats(&this_stats, this_cpu);
> +
> +	/*
> +	 * If sync wakeup then subtract the (maximum possible)
> +	 * effect of the currently running task from the load
> +	 * of the current LLC.
> +	 */
> +	if (sync) {
> +		unsigned long current_load = task_h_load(current);
> +
> +		/* in this case load hits 0 and this LLC is considered 'idle' */
> +		if (current_load > this_stats.load)
> +			return true;
> +
> +		this_stats.load -= current_load;
> +	}
> +
> +	/*
> +	 * The has_capacity stuff is not SMT aware, but by trying to balance
> +	 * the nr_running on both ends we try and fill the domain at equal
> +	 * rates, thereby first consuming cores before siblings.
> +	 */
> +
> +	/* if the old cache has capacity, stay there */
> +	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
> +		return false;
> +
> +	/* if this cache has capacity, come here */
> +	if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
> +		return true;
> +
> +	/*
> +	 * Check to see if we can move the load without causing too much
> +	 * imbalance.
> +	 */
> +	task_load = task_h_load(p);
> +
> +	this_eff_load = 100;
> +	this_eff_load *= prev_stats.capacity;
> +
> +	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
> +	prev_eff_load *= this_stats.capacity;
> +
> +	this_eff_load *= this_stats.load + task_load;
> +	prev_eff_load *= prev_stats.load - task_load;
> +
> +	return this_eff_load <= prev_eff_load;
> +}
> +
>  static int wake_affine(struct sched_domain *sd, struct task_struct *p,
>  		       int prev_cpu, int sync)
>  {
>  	int this_cpu = smp_processor_id();
> -	bool affine = false;
> +	bool affine;
>
>  	/*
> -	 * Common case: CPUs are in the same socket, and select_idle_sibling()
> -	 * will do its thing regardless of what we return:
> +	 * Default to no affine wakeups; wake_affine() should not affect a task
> +	 * placement the load-balancer feels inclined to undo. The conservative
> +	 * option is therefore to not move tasks when they wake up.
>  	 */
> -	if (cpus_share_cache(prev_cpu, this_cpu))
> -		affine = true;
> -	else
> -		affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
> +	affine = false;
> +
> +	/*
> +	 * If the wakeup is across cache domains, try to evaluate if movement
> +	 * makes sense, otherwise rely on select_idle_siblings() to do
> +	 * placement inside the cache domain.
> +	 */
> +	if (!cpus_share_cache(prev_cpu, this_cpu))
> +		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
>
>  	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
>  	if (affine) {
> @@ -7049,6 +7078,7 @@ struct sg_lb_stats {
>  struct sd_lb_stats {
>  	struct sched_group *busiest;	/* Busiest group in this sd */
>  	struct sched_group *local;	/* Local group in this sd */
> +	unsigned long total_running;
>  	unsigned long total_load;	/* Total load of all groups in sd */
>  	unsigned long total_capacity;	/* Total capacity of all groups in sd */
>  	unsigned long avg_load;	/* Average load across all groups in sd */
> @@ -7068,6 +7098,7 @@ static inline void init_sd_lb_stats(stru
>  	*sds = (struct sd_lb_stats){
>  		.busiest = NULL,
>  		.local = NULL,
> +		.total_running = 0UL,
>  		.total_load = 0UL,
>  		.total_capacity = 0UL,
>  		.busiest_stat = {
> @@ -7503,6 +7534,7 @@ static inline enum fbq_type fbq_classify
>   */
>  static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
>  {
> +	struct sched_domain_shared *shared = env->sd->shared;
>  	struct sched_domain *child = env->sd->child;
>  	struct sched_group *sg = env->sd->groups;
>  	struct sg_lb_stats *local = &sds->local_stat;
> @@ -7559,6 +7591,7 @@ static inline void update_sd_lb_stats(st
>
>  next_group:
>  	/* Now, start updating sd_lb_stats */
> +	sds->total_running += sgs->sum_nr_running;
>  	sds->total_load += sgs->group_load;
>  	sds->total_capacity += sgs->group_capacity;
>
> @@ -7574,6 +7607,18 @@ static inline void update_sd_lb_stats(st
>  			env->dst_rq->rd->overload = overload;
>  	}
>
> +	/*
> +	 * Since these are sums over groups they can contain some CPUs
> +	 * multiple times for the NUMA domains.
> +	 *
> +	 * Currently only wake_affine_llc() and find_busiest_group()
> +	 * use these numbers, and only the last is affected by this problem.
> +	 *
> +	 * XXX fix that.
> +	 */
> +	WRITE_ONCE(shared->nr_running, sds->total_running);
> +	WRITE_ONCE(shared->load, sds->total_load);
> +	WRITE_ONCE(shared->capacity, sds->total_capacity);

This panics on boot for me because shared is NULL.  The same happens in
select_task_rq_fair when it tries to do the READ_ONCE.  Here is my .config
in case it's something strange with my config.

Thanks,

Josef
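
A minimal, untested sketch of one possible way to avoid that NULL
dereference, assuming it comes from domains that carry no
sched_domain_shared structure (so env->sd->shared is NULL above the LLC
level) and from sd_llc_shared not yet being populated early during boot.
This is written only against the patch as posted, not the fix that was
eventually applied:

	/* Treat a missing shared structure as an LLC we know nothing about. */
	static void get_llc_stats(struct llc_stats *stats, int cpu)
	{
		struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));

		if (!sds) {
			/* no cached stats available; report an empty LLC */
			memset(stats, 0, sizeof(*stats));
			return;
		}

		stats->nr_running	= READ_ONCE(sds->nr_running);
		stats->load		= READ_ONCE(sds->load);
		stats->capacity		= READ_ONCE(sds->capacity);
		stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
	}

	/*
	 * And in update_sd_lb_stats(): only publish the sums when this domain
	 * actually has somewhere to publish them to.
	 */
	if (shared) {
		WRITE_ONCE(shared->nr_running, sds->total_running);
		WRITE_ONCE(shared->load, sds->total_load);
		WRITE_ONCE(shared->capacity, sds->total_capacity);
	}

With those two guards the wakeup and load-balance paths simply skip the
cached-stats machinery on domains that have no shared structure, instead of
dereferencing a NULL pointer.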
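
For reference (not part of the patch), a small stand-alone illustration of
the final comparison in wake_affine_llc(), using made-up numbers; the
imbalance_pct of 125 and the load/capacity figures below are arbitrary
example values chosen to show how the imbalance_pct term biases the decision
toward leaving the task on its previous LLC:

	#include <stdio.h>

	int main(void)
	{
		/* example values only; not taken from a real machine */
		unsigned long prev_capacity = 1024, this_capacity = 1024;
		unsigned long prev_load = 2000, this_load = 1500;
		unsigned long task_load = 300;
		int imbalance_pct = 125;

		/* same arithmetic as the tail of wake_affine_llc() above */
		long long this_eff_load = 100;
		this_eff_load *= prev_capacity;

		long long prev_eff_load = 100 + (imbalance_pct - 100) / 2;
		prev_eff_load *= this_capacity;

		this_eff_load *= this_load + task_load;
		prev_eff_load *= prev_load - task_load;

		/* 100*1024*1800 = 184320000 vs 112*1024*1700 = 194969600 */
		printf("this_eff_load=%lld prev_eff_load=%lld -> %s\n",
		       this_eff_load, prev_eff_load,
		       this_eff_load <= prev_eff_load ? "wake affine" : "stay on prev LLC");
		return 0;
	}

With these numbers the waking LLC plus the task is still below the previous
LLC's load even after the previous side gets its ~12% imbalance_pct bonus, so
the comparison comes out in favour of the affine wakeup.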