From: Mel Gorman <mgorman@techsingularity.net>
To: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	Valentin Schneider <valentin.schneider@arm.com>,
	Aubrey Li <aubrey.li@linux.intel.com>,
	LKML <linux-kernel@vger.kernel.org>
Subject: Re: [PATCH 3/4] sched/numa: Apply imbalance limitations consistently
Date: Wed, 18 May 2022 11:46:52 +0100
Message-ID: <20220518104652.GO3441@techsingularity.net>
In-Reply-To: <20220518093156.GD10117@worktop.programming.kicks-ass.net>

On Wed, May 18, 2022 at 11:31:56AM +0200, Peter Zijlstra wrote:
> On Wed, May 11, 2022 at 03:30:37PM +0100, Mel Gorman wrote:
> 
> > @@ -9108,6 +9108,24 @@ static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
> >  	return running <= imb_numa_nr;
> >  }
> >  
> > +#define NUMA_IMBALANCE_MIN 2
> > +
> > +static inline long adjust_numa_imbalance(int imbalance,
> > +				int dst_running, int imb_numa_nr)
> > +{
> > +	if (!allow_numa_imbalance(dst_running, imb_numa_nr))
> > +		return imbalance;
> > +
> > +	/*
> > +	 * Allow a small imbalance based on a simple pair of communicating
> > +	 * tasks that remain local when the destination is lightly loaded.
> > +	 */
> > +	if (imbalance <= NUMA_IMBALANCE_MIN)
> > +		return 0;
> > +
> > +	return imbalance;
> > +}
> 
> > @@ -9334,24 +9356,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> >  	}
> >  }
> >  
> > -#define NUMA_IMBALANCE_MIN 2
> > -
> > -static inline long adjust_numa_imbalance(int imbalance,
> > -				int dst_running, int imb_numa_nr)
> > -{
> > -	if (!allow_numa_imbalance(dst_running, imb_numa_nr))
> > -		return imbalance;
> > -
> > -	/*
> > -	 * Allow a small imbalance based on a simple pair of communicating
> > -	 * tasks that remain local when the destination is lightly loaded.
> > -	 */
> > -	if (imbalance <= NUMA_IMBALANCE_MIN)
> > -		return 0;
> > -
> > -	return imbalance;
> > -}
> 
> If we're going to move that one up and remove the only other caller of
> allow_numa_imbalance() we might as well move it up further still and
> fold the functions.
> 
> Hmm?
> 

Yes, that would be fine and makes sense. I remember thinking that they
should be folded and then failed to follow through.

> (Although I do wonder about that 25% figure in the comment; that doesn't
> seem to relate to any actual code anymore)
> 

You're right: by the end of the series it's completely inaccurate, and
even now it's not accurate when there are multiple LLCs per node. I
adjusted the wording to "Allow a NUMA imbalance if busy CPUs is less
than the maximum threshold. Above this threshold, individual tasks may
be contending for both memory bandwidth and any shared HT resources."

Diff between v1 and v2 is now below

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 602c05b22805..51fde61ec756 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1536,8 +1536,31 @@ struct task_numa_env {
 
 static unsigned long cpu_load(struct rq *rq);
 static unsigned long cpu_runnable(struct rq *rq);
-static inline long adjust_numa_imbalance(int imbalance,
-					int dst_running, int imb_numa_nr);
+
+#define NUMA_IMBALANCE_MIN 2
+
+static inline long
+adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
+{
+	/*
+	 * Allow a NUMA imbalance if busy CPUs is less than the maximum
+	 * threshold. Above this threshold, individual tasks may be contending
+	 * for both memory bandwidth and any shared HT resources.  This is an
+	 * approximation as the number of running tasks may not be related to
+	 * the number of busy CPUs due to sched_setaffinity.
+	 */
+	if (dst_running > imb_numa_nr)
+		return imbalance;
+
+	/*
+	 * Allow a small imbalance based on a simple pair of communicating
+	 * tasks that remain local when the destination is lightly loaded.
+	 */
+	if (imbalance <= NUMA_IMBALANCE_MIN)
+		return 0;
+
+	return imbalance;
+}
 
 static inline enum
 numa_type numa_classify(unsigned int imbalance_pct,
@@ -9098,34 +9121,6 @@ static bool update_pick_idlest(struct sched_group *idlest,
 	return true;
 }
 
-/*
- * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain.
- * This is an approximation as the number of running tasks may not be
- * related to the number of busy CPUs due to sched_setaffinity.
- */
-static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
-{
-	return running <= imb_numa_nr;
-}
-
-#define NUMA_IMBALANCE_MIN 2
-
-static inline long adjust_numa_imbalance(int imbalance,
-				int dst_running, int imb_numa_nr)
-{
-	if (!allow_numa_imbalance(dst_running, imb_numa_nr))
-		return imbalance;
-
-	/*
-	 * Allow a small imbalance based on a simple pair of communicating
-	 * tasks that remain local when the destination is lightly loaded.
-	 */
-	if (imbalance <= NUMA_IMBALANCE_MIN)
-		return 0;
-
-	return imbalance;
-}
-
 /*
  * find_idlest_group() finds and returns the least busy CPU group within the
  * domain.
@@ -9448,14 +9443,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 			 * idle cpus.
 			 */
 			env->migration_type = migrate_task;
-			env->imbalance = max_t(long, 0, (local->idle_cpus -
-						 busiest->idle_cpus));
+			env->imbalance = max_t(long, 0,
+					       (local->idle_cpus - busiest->idle_cpus));
 		}
 
 		/* Consider allowing a small imbalance between NUMA groups */
 		if (env->sd->flags & SD_NUMA) {
 			env->imbalance = adjust_numa_imbalance(env->imbalance,
-				local->sum_nr_running + 1, env->sd->imb_numa_nr);
+							       local->sum_nr_running + 1,
+							       env->sd->imb_numa_nr);
 		}
 
 		/* Number of tasks to move to restore balance */
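
For completeness, here is a standalone userspace sketch of the folded
helper (illustrative only, not kernel code; imb_numa_nr == 4 is an
arbitrary example threshold) that prints the adjusted imbalance for a
few sample inputs:

#include <stdio.h>

#define NUMA_IMBALANCE_MIN 2

/* Userspace copy of the folded kernel helper from the diff above. */
static long adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
{
	/* Destination busier than the threshold: report the imbalance as-is. */
	if (dst_running > imb_numa_nr)
		return imbalance;

	/*
	 * Lightly loaded destination: tolerate a small imbalance such as
	 * a pair of communicating tasks that should remain local.
	 */
	if (imbalance <= NUMA_IMBALANCE_MIN)
		return 0;

	return imbalance;
}

int main(void)
{
	printf("%ld\n", adjust_numa_imbalance(2, 3, 4)); /* 0: small imbalance tolerated */
	printf("%ld\n", adjust_numa_imbalance(2, 8, 4)); /* 2: destination too busy */
	printf("%ld\n", adjust_numa_imbalance(5, 3, 4)); /* 5: above NUMA_IMBALANCE_MIN */
	return 0;
}

The first case is the communicating-pair case the comment describes; the
other two fall through unchanged.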

Thread overview: 26+ messages
2022-05-11 14:30 [PATCH 0/4] Mitigate inconsistent NUMA imbalance behaviour Mel Gorman
2022-05-11 14:30 ` [PATCH 1/4] sched/numa: Initialise numa_migrate_retry Mel Gorman
2022-05-11 14:30 ` [PATCH 2/4] sched/numa: Do not swap tasks between nodes when spare capacity is available Mel Gorman
2022-05-11 14:30 ` [PATCH 3/4] sched/numa: Apply imbalance limitations consistently Mel Gorman
2022-05-18  9:24   ` [sched/numa] bb2dee337b: unixbench.score -11.2% regression kernel test robot
2022-05-18 15:22     ` Mel Gorman
2022-05-19  7:54       ` ying.huang
2022-05-20  6:44         ` [LKP] " Ying Huang
2022-05-18  9:31   ` [PATCH 3/4] sched/numa: Apply imbalance limitations consistently Peter Zijlstra
2022-05-18 10:46     ` Mel Gorman [this message]
2022-05-18 13:59       ` Peter Zijlstra
2022-05-18 15:39         ` Mel Gorman
2022-05-11 14:30 ` [PATCH 4/4] sched/numa: Adjust imb_numa_nr to a better approximation of memory channels Mel Gorman
2022-05-18  9:41   ` Peter Zijlstra
2022-05-18 11:15     ` Mel Gorman
2022-05-18 14:05       ` Peter Zijlstra
2022-05-18 17:06         ` Mel Gorman
2022-05-19  9:29           ` Mel Gorman
2022-05-20  4:58 ` [PATCH 0/4] Mitigate inconsistent NUMA imbalance behaviour K Prateek Nayak
2022-05-20 10:18   ` Mel Gorman
2022-05-20 15:17     ` K Prateek Nayak
2022-05-20 10:35 [PATCH v2 " Mel Gorman
2022-05-20 10:35 ` [PATCH 3/4] sched/numa: Apply imbalance limitations consistently Mel Gorman
