On Tue, Aug 01, 2017 at 02:19:12PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 27, 2017 at 10:55:58AM -0400, Rik van Riel wrote:
> > On Tue, 2017-06-27 at 07:39 +0200, Peter Zijlstra wrote:
> > > If LLC < NUMA or !NUMA_BALANCING we have a region that needs to do
> > > _something_.
> >
> > Agreed. I will fix this. Given that this is a bit
> > of a corner case,
>
> Ok, so it turns out all of Facebook sits inside that corner case because
> they have !NUMA_BALANCING.
>
> How does the below patch work?
>
> ---
> Subject: sched/fair: Fix wake_affine() for !NUMA_BALANCING
> From: Peter Zijlstra
> Date: Mon Jul 31 17:50:05 CEST 2017
>
> In commit:
>
>   3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
>
> Rik changed wake_affine to consider NUMA information when balancing
> between LLC domains.
>
> There are a number of problems here which this patch tries to address:
>
>  - LLC < NODE; in this case we'd use the wrong information to balance
>  - !NUMA_BALANCING: in this case, the new code doesn't do any
>    balancing at all
>  - re-computes the NUMA data for every wakeup, this can mean iterating
>    up to 64 CPUs for every wakeup.
>  - default affine wakeups inside a cache
>
> We address these by saving the load/capacity values for each
> sched_domain during regular load-balance and using these values in
> wake_affine_llc(). The obvious down-side to using cached values is
> that they can be too old and poorly reflect reality.
>
> But this way we can use LLC wide information and thus not rely on
> assuming LLC matches NODE. We also don't rely on NUMA_BALANCING nor do
> we have to aggregate two nodes (or even cache domains) worth of CPUs
> for each wakeup.
>
> Cc: Josef Bacik
> Cc: Rik van Riel
> Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
> Signed-off-by: Peter Zijlstra (Intel)
> ---
>  include/linux/sched/topology.h |   9 ++
>  kernel/sched/fair.c            | 182 +++++++++++++++++++++++++----------------
>  2 files changed, 123 insertions(+), 68 deletions(-)
>
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -71,6 +71,15 @@ struct sched_domain_shared {
>  	atomic_t	ref;
>  	atomic_t	nr_busy_cpus;
>  	int		has_idle_cores;
> +
> +	/*
> +	 * Some variables from the most recent sd_lb_stats for this domain.
> +	 *
> +	 * used by wake_affine().
> +	 */
> +	unsigned long	nr_running;
> +	unsigned long	load;
> +	unsigned long	capacity;
>  };
>
>  struct sched_domain {
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2586,59 +2586,6 @@ void task_tick_numa(struct rq *rq, struc
>  	}
>  }
>
> -/*
> - * Can a task be moved from prev_cpu to this_cpu without causing a load
> - * imbalance that would trigger the load balancer?
> - */
> -static inline bool numa_wake_affine(struct sched_domain *sd,
> -				    struct task_struct *p, int this_cpu,
> -				    int prev_cpu, int sync)
> -{
> -	struct numa_stats prev_load, this_load;
> -	s64 this_eff_load, prev_eff_load;
> -
> -	update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
> -	update_numa_stats(&this_load, cpu_to_node(this_cpu));
> -
> -	/*
> -	 * If sync wakeup then subtract the (maximum possible)
> -	 * effect of the currently running task from the load
> -	 * of the current CPU:
> -	 */
> -	if (sync) {
> -		unsigned long current_load = task_h_load(current);
> -
> -		if (this_load.load > current_load)
> -			this_load.load -= current_load;
> -		else
> -			this_load.load = 0;
> -	}
> -
> -	/*
> -	 * In low-load situations, where this_cpu's node is idle due to the
> -	 * sync cause above having dropped this_load.load to 0, move the task.
> -	 * Moving to an idle socket will not create a bad imbalance.
> -	 *
> -	 * Otherwise check if the nodes are near enough in load to allow this
> -	 * task to be woken on this_cpu's node.
> -	 */
> -	if (this_load.load > 0) {
> -		unsigned long task_load = task_h_load(p);
> -
> -		this_eff_load = 100;
> -		this_eff_load *= prev_load.compute_capacity;
> -
> -		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
> -		prev_eff_load *= this_load.compute_capacity;
> -
> -		this_eff_load *= this_load.load + task_load;
> -		prev_eff_load *= prev_load.load - task_load;
> -
> -		return this_eff_load <= prev_eff_load;
> -	}
> -
> -	return true;
> -}
>  #else
>  static void task_tick_numa(struct rq *rq, struct task_struct *curr)
>  {
> @@ -2652,14 +2599,6 @@ static inline void account_numa_dequeue(
>  {
>  }
>
> -#ifdef CONFIG_SMP
> -static inline bool numa_wake_affine(struct sched_domain *sd,
> -				    struct task_struct *p, int this_cpu,
> -				    int prev_cpu, int sync)
> -{
> -	return true;
> -}
> -#endif /* !SMP */
>  #endif /* CONFIG_NUMA_BALANCING */
>
>  static void
> @@ -5356,20 +5295,110 @@ static int wake_wide(struct task_struct
>  	return 1;
>  }
>
> +struct llc_stats {
> +	unsigned long	nr_running;
> +	unsigned long	load;
> +	unsigned long	capacity;
> +	int		has_capacity;
> +};
> +
> +static void get_llc_stats(struct llc_stats *stats, int cpu)
> +{
> +	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> +
> +	stats->nr_running	= READ_ONCE(sds->nr_running);
> +	stats->load		= READ_ONCE(sds->load);
> +	stats->capacity		= READ_ONCE(sds->capacity);
> +	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
> +}
> +
> +/*
> + * Can a task be moved from prev_cpu to this_cpu without causing a load
> + * imbalance that would trigger the load balancer?
> + *
> + * Since we're running on 'stale' values, we might in fact create an imbalance,
> + * but recomputing these values is expensive, as that'd mean iterating 2 cache
> + * domains worth of CPUs.
> + */
> +static bool
> +wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
> +		int this_cpu, int prev_cpu, int sync)
> +{
> +	struct llc_stats prev_stats, this_stats;
> +	s64 this_eff_load, prev_eff_load;
> +	unsigned long task_load;
> +
> +	get_llc_stats(&prev_stats, prev_cpu);
> +	get_llc_stats(&this_stats, this_cpu);
> +
> +	/*
> +	 * If sync wakeup then subtract the (maximum possible)
> +	 * effect of the currently running task from the load
> +	 * of the current LLC.
> +	 */
> +	if (sync) {
> +		unsigned long current_load = task_h_load(current);
> +
> +		/* in this case load hits 0 and this LLC is considered 'idle' */
> +		if (current_load > this_stats.load)
> +			return true;
> +
> +		this_stats.load -= current_load;
> +	}
> +
> +	/*
> +	 * The has_capacity stuff is not SMT aware, but by trying to balance
> +	 * the nr_running on both ends we try and fill the domain at equal
> +	 * rates, thereby first consuming cores before siblings.
> +	 */
> +
> +	/* if the old cache has capacity, stay there */
> +	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
> +		return false;
> +
> +	/* if this cache has capacity, come here */
> +	if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
> +		return true;
> +
> +	/*
> +	 * Check to see if we can move the load without causing too much
> +	 * imbalance.
> +	 */
> +	task_load = task_h_load(p);
> +
> +	this_eff_load = 100;
> +	this_eff_load *= prev_stats.capacity;
> +
> +	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
> +	prev_eff_load *= this_stats.capacity;
> +
> +	this_eff_load *= this_stats.load + task_load;
> +	prev_eff_load *= prev_stats.load - task_load;
> +
> +	return this_eff_load <= prev_eff_load;
> +}
> +
>  static int wake_affine(struct sched_domain *sd, struct task_struct *p,
>  		       int prev_cpu, int sync)
>  {
>  	int this_cpu = smp_processor_id();
> -	bool affine = false;
> +	bool affine;
>
>  	/*
> -	 * Common case: CPUs are in the same socket, and select_idle_sibling()
> -	 * will do its thing regardless of what we return:
> +	 * Default to no affine wakeups; wake_affine() should not affect a task
> +	 * placement the load-balancer feels inclined to undo. The conservative
> +	 * option is therefore to not move tasks when they wake up.
>  	 */
> -	if (cpus_share_cache(prev_cpu, this_cpu))
> -		affine = true;
> -	else
> -		affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
> +	affine = false;
> +
> +	/*
> +	 * If the wakeup is across cache domains, try to evaluate if movement
> +	 * makes sense, otherwise rely on select_idle_siblings() to do
> +	 * placement inside the cache domain.
> +	 */
> +	if (!cpus_share_cache(prev_cpu, this_cpu))
> +		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
>
>  	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
>  	if (affine) {
> @@ -7049,6 +7078,7 @@ struct sg_lb_stats {
>  struct sd_lb_stats {
>  	struct sched_group *busiest;	/* Busiest group in this sd */
>  	struct sched_group *local;	/* Local group in this sd */
> +	unsigned long total_running;
>  	unsigned long total_load;	/* Total load of all groups in sd */
>  	unsigned long total_capacity;	/* Total capacity of all groups in sd */
>  	unsigned long avg_load;	/* Average load across all groups in sd */
> @@ -7068,6 +7098,7 @@ static inline void init_sd_lb_stats(stru
>  	*sds = (struct sd_lb_stats){
>  		.busiest = NULL,
>  		.local = NULL,
> +		.total_running = 0UL,
>  		.total_load = 0UL,
>  		.total_capacity = 0UL,
>  		.busiest_stat = {
> @@ -7503,6 +7534,7 @@ static inline enum fbq_type fbq_classify
>   */
>  static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
>  {
> +	struct sched_domain_shared *shared = env->sd->shared;
>  	struct sched_domain *child = env->sd->child;
>  	struct sched_group *sg = env->sd->groups;
>  	struct sg_lb_stats *local = &sds->local_stat;
> @@ -7559,6 +7591,7 @@ static inline void update_sd_lb_stats(st
>
>  next_group:
>  	/* Now, start updating sd_lb_stats */
> +	sds->total_running += sgs->sum_nr_running;
>  	sds->total_load += sgs->group_load;
>  	sds->total_capacity += sgs->group_capacity;
>
> @@ -7574,6 +7607,18 @@ static inline void update_sd_lb_stats(st
>  			env->dst_rq->rd->overload = overload;
>  	}
>
> +	/*
> +	 * Since these are sums over groups they can contain some CPUs
> +	 * multiple times for the NUMA domains.
> +	 *
> +	 * Currently only wake_affine_llc() and find_busiest_group()
> +	 * use these numbers, and only the last is affected by this problem.
> +	 *
> +	 * XXX fix that.
> +	 */
> +	WRITE_ONCE(shared->nr_running, sds->total_running);
> +	WRITE_ONCE(shared->load, sds->total_load);
> +	WRITE_ONCE(shared->capacity, sds->total_capacity);

This panics on boot for me because shared is NULL.  The same happens in
select_task_rq_fair when it tries to do the READ_ONCE.  Here is my .config
in case it's something strange with my config.

Thanks,

Josef
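
A minimal, untested sketch of one possible way to avoid that NULL
dereference, assuming it comes from domains that carry no
sched_domain_shared structure (so env->sd->shared is NULL above the LLC
level) and from sd_llc_shared not yet being populated early during boot.
This is written only against the patch as posted, not the fix that was
eventually applied:

	/* Treat a missing shared structure as an LLC we know nothing about. */
	static void get_llc_stats(struct llc_stats *stats, int cpu)
	{
		struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));

		if (!sds) {
			/* no cached stats available; report an empty LLC */
			memset(stats, 0, sizeof(*stats));
			return;
		}

		stats->nr_running	= READ_ONCE(sds->nr_running);
		stats->load		= READ_ONCE(sds->load);
		stats->capacity		= READ_ONCE(sds->capacity);
		stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
	}

	/*
	 * And in update_sd_lb_stats(): only publish the sums when this domain
	 * actually has somewhere to publish them to.
	 */
	if (shared) {
		WRITE_ONCE(shared->nr_running, sds->total_running);
		WRITE_ONCE(shared->load, sds->total_load);
		WRITE_ONCE(shared->capacity, sds->total_capacity);
	}

With those two guards the wakeup and load-balance paths simply skip the
cached-stats machinery on domains that have no shared structure, instead of
dereferencing a NULL pointer.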
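
For reference (not part of the patch), a small stand-alone illustration of
the final comparison in wake_affine_llc(), using made-up numbers; the
imbalance_pct of 125 and the load/capacity figures below are arbitrary
example values chosen to show how the imbalance_pct term biases the decision
toward leaving the task on its previous LLC:

	#include <stdio.h>

	int main(void)
	{
		/* example values only; not taken from a real machine */
		unsigned long prev_capacity = 1024, this_capacity = 1024;
		unsigned long prev_load = 2000, this_load = 1500;
		unsigned long task_load = 300;
		int imbalance_pct = 125;

		/* same arithmetic as the tail of wake_affine_llc() above */
		long long this_eff_load = 100;
		this_eff_load *= prev_capacity;

		long long prev_eff_load = 100 + (imbalance_pct - 100) / 2;
		prev_eff_load *= this_capacity;

		this_eff_load *= this_load + task_load;
		prev_eff_load *= prev_load - task_load;

		/* 100*1024*1800 = 184320000 vs 112*1024*1700 = 194969600 */
		printf("this_eff_load=%lld prev_eff_load=%lld -> %s\n",
		       this_eff_load, prev_eff_load,
		       this_eff_load <= prev_eff_load ? "wake affine" : "stay on prev LLC");
		return 0;
	}

With these numbers the waking LLC plus the task is still below the previous
LLC's load even after the previous side gets its ~12% imbalance_pct bonus, so
the comparison comes out in favour of the affine wakeup.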