[2/3] sched/numa: Allow a floating imbalance between NUMA nodes

Message ID 20201117134222.31482-3-mgorman@techsingularity.net
State New, archived
Series: Revisit NUMA imbalance tolerance and fork balancing

Commit Message

Mel Gorman Nov. 17, 2020, 1:42 p.m. UTC
Currently, an imbalance is only allowed when a destination node
is almost completely idle. This solved one basic class of problems
and was the cautious approach.

This patch revisits the possibility that NUMA nodes can be imbalanced
until 25% of the CPUs are occupied. The reasoning behind 25% is somewhat
superficial -- it's half the cores when HT is enabled. At higher
utilisations, balancing should continue as normal and keep things even
until scheduler domains are fully busy or over-utilised.

Note that this is not expected to be a universal win. Any benchmark
that prefers spreading as wide as possible with limited communication
will favour the old behaviour as there is more memory bandwidth.
Workloads that communicate heavily in pairs such as netperf or tbench
benefit. For the tests I ran, the vast majority of workloads saw
a benefit so it seems to be a worthwhile tradeoff.
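
For illustration, a standalone userspace sketch of the arithmetic (not
kernel code; the 40-CPU node is an assumed example topology) showing how
the 25% cut-off used below maps to half the cores under SMT2:

	/*
	 * Hypothetical illustration only: relate the
	 * "dst_running < (dst_weight >> 2)" cut-off from this patch to
	 * cores when HT is enabled. The node size is an assumed example.
	 */
	#include <stdio.h>

	int main(void)
	{
		int dst_weight = 40;          /* logical CPUs: 20 cores x SMT2 */
		int cutoff = dst_weight >> 2; /* 25% of CPUs = 10 = half the cores */
		int imbalance = 2;            /* at most NUMA_IMBALANCE_MIN tasks */

		for (int dst_running = 8; dst_running <= 12; dst_running++)
			printf("dst_running=%2d -> %s\n", dst_running,
			       (dst_running < cutoff && imbalance <= 2) ?
			       "imbalance tolerated" : "balance as normal");
		return 0;
	}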

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 kernel/sched/fair.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

Comments

Peter Zijlstra Nov. 17, 2020, 2:16 p.m. UTC | #1
On Tue, Nov 17, 2020 at 01:42:21PM +0000, Mel Gorman wrote:
> This patch revisits the possibility that NUMA nodes can be imbalanced
> until 25% of the CPUs are occupied. The reasoning behind 25% is somewhat
> superficial -- it's half the cores when HT is enabled. At higher
> utilisations, balancing should continue as normal and keep things even
> until scheduler domains are fully busy or over-utilised.

Do we want to make that shift depend on the actual SMT factor?
Vincent Guittot Nov. 17, 2020, 2:24 p.m. UTC | #2
On Tue, 17 Nov 2020 at 14:42, Mel Gorman <mgorman@techsingularity.net> wrote:
>
> Currently, an imbalance is only allowed when a destination node
> is almost completely idle. This solved one basic class of problems
> and was the cautious approach.
>
> This patch revisits the possibility that NUMA nodes can be imbalanced
> until 25% of the CPUs are occupied. The reasoning behind 25% is somewhat
> superficial -- it's half the cores when HT is enabled. At higher
> utilisations, balancing should continue as normal and keep things even
> until scheduler domains are fully busy or over-utilised.

This reminds me of previous discussions on the same topic: how much
imbalance can be allowed without screwing up the memory bandwidth of
the node? I'm worried that there is no topology insight in the
decision, such as hyperthreading or the number of CPUs in the LLC.

>
> Note that this is not expected to be a universal win. Any benchmark
> that prefers spreading as wide as possible with limited communication
> will favour the old behaviour as there is more memory bandwidth.
> Workloads that communicate heavily in pairs such as netperf or tbench
> benefit. For the tests I ran, the vast majority of workloads saw
> a benefit so it seems to be a worthwhile tradeoff.
>
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> ---
>  kernel/sched/fair.c | 22 ++++++++++++----------
>  1 file changed, 12 insertions(+), 10 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5fbed29e4001..33709dfac24d 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1550,7 +1550,8 @@ struct task_numa_env {
>  static unsigned long cpu_load(struct rq *rq);
>  static unsigned long cpu_runnable(struct rq *rq);
>  static unsigned long cpu_util(int cpu);
> -static inline long adjust_numa_imbalance(int imbalance, int dst_running);
> +static inline long adjust_numa_imbalance(int imbalance,
> +                                       int dst_running, int dst_weight);
>
>  static inline enum
>  numa_type numa_classify(unsigned int imbalance_pct,
> @@ -1930,7 +1931,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
>                 src_running = env->src_stats.nr_running - 1;
>                 dst_running = env->dst_stats.nr_running + 1;
>                 imbalance = max(0, dst_running - src_running);
> -               imbalance = adjust_numa_imbalance(imbalance, dst_running);
> +               imbalance = adjust_numa_imbalance(imbalance, dst_running,
> +                                                       env->dst_stats.weight);
>
>                 /* Use idle CPU if there is no imbalance */
>                 if (!imbalance) {
> @@ -8993,16 +8995,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
>
>  #define NUMA_IMBALANCE_MIN 2
>
> -static inline long adjust_numa_imbalance(int imbalance, int dst_running)
> +static inline long adjust_numa_imbalance(int imbalance,
> +                               int dst_running, int dst_weight)
>  {
> -       unsigned int imbalance_min;
> -
>         /*
>          * Allow a small imbalance based on a simple pair of communicating
> -        * tasks that remain local when the source domain is almost idle.
> +        * tasks when the destination is lightly loaded so that the
> +        * communicating pair may remain local.
>          */
> -       imbalance_min = NUMA_IMBALANCE_MIN;
> -       if (dst_running <= imbalance_min)
> +       if (dst_running < (dst_weight >> 2) && imbalance <= NUMA_IMBALANCE_MIN)
>                 return 0;
>
>         return imbalance;
> @@ -9105,9 +9106,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
>                 }
>
>                 /* Consider allowing a small imbalance between NUMA groups */
> -               if (env->sd->flags & SD_NUMA)
> +               if (env->sd->flags & SD_NUMA) {
>                         env->imbalance = adjust_numa_imbalance(env->imbalance,
> -                                               busiest->sum_nr_running);
> +                               busiest->sum_nr_running, busiest->group_weight);
> +               }
>
>                 return;
>         }
> --
> 2.26.2
>
Mel Gorman Nov. 17, 2020, 2:43 p.m. UTC | #3
On Tue, Nov 17, 2020 at 03:16:03PM +0100, Peter Zijlstra wrote:
> On Tue, Nov 17, 2020 at 01:42:21PM +0000, Mel Gorman wrote:
> > This patch revisits the possibility that NUMA nodes can be imbalanced
> > until 25% of the CPUs are occupied. The reasoning behind 25% is somewhat
> > superficial -- it's half the cores when HT is enabled. At higher
> > utilisations, balancing should continue as normal and keep things even
> > until scheduler domains are fully busy or over-utilised.
> 
> Do we want to make that shift depend on the actual SMT factor?

I considered it but decided against it. I wanted the balance point to
be somewhere below SMT because select_idle_sibling tries to avoid SMT
sharing, so I didn't want a point where SMT sharing caused more problems
than memory locality solved. However, I worried that picking a different
imbalance point depending on SMT would be surprising to some.
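
As a purely hypothetical sketch (not something this series proposes),
an SMT-aware cut-off would look something like the below, where
smt_factor is an assumed parameter that the topology code would have
to supply:

	/*
	 * Hypothetical sketch, not part of this series: derive the cut-off
	 * from a topology-supplied SMT factor (1 without SMT, 2 with HT)
	 * instead of hard-coding the shift by 2.
	 */
	static inline int numa_imbalance_cutoff(int dst_weight, int smt_factor)
	{
		/* tolerate imbalance while under half the cores are busy */
		return (dst_weight / smt_factor) >> 1;
	}

For SMT2 this degenerates to dst_weight >> 2, i.e. the behaviour of
this patch, while a non-SMT machine would get dst_weight >> 1.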
Mel Gorman Nov. 17, 2020, 2:44 p.m. UTC | #4
On Tue, Nov 17, 2020 at 03:24:56PM +0100, Vincent Guittot wrote:
> On Tue, 17 Nov 2020 at 14:42, Mel Gorman <mgorman@techsingularity.net> wrote:
> >
> > Currently, an imbalance is only allowed when a destination node
> > is almost completely idle. This solved one basic class of problems
> > and was the cautious approach.
> >
> > This patch revisits the possibility that NUMA nodes can be imbalanced
> > until 25% of the CPUs are occupied. The reasoning behind 25% is somewhat
> > superficial -- it's half the cores when HT is enabled. At higher
> > utilisations, balancing should continue as normal and keep things even
> > until scheduler domains are fully busy or over-utilised.
> 
> This reminds me of previous discussions on the same topic: how much
> imbalance can be allowed without screwing up the memory bandwidth of
> the node? I'm worried that there is no topology insight in the
> decision, such as hyperthreading or the number of CPUs in the LLC.
> 

We still don't have a good answer for that. It could be a tunable, I
guess, but it would be horrible to tune properly.

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5fbed29e4001..33709dfac24d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1550,7 +1550,8 @@  struct task_numa_env {
 static unsigned long cpu_load(struct rq *rq);
 static unsigned long cpu_runnable(struct rq *rq);
 static unsigned long cpu_util(int cpu);
-static inline long adjust_numa_imbalance(int imbalance, int dst_running);
+static inline long adjust_numa_imbalance(int imbalance,
+					int dst_running, int dst_weight);
 
 static inline enum
 numa_type numa_classify(unsigned int imbalance_pct,
@@ -1930,7 +1931,8 @@  static void task_numa_find_cpu(struct task_numa_env *env,
 		src_running = env->src_stats.nr_running - 1;
 		dst_running = env->dst_stats.nr_running + 1;
 		imbalance = max(0, dst_running - src_running);
-		imbalance = adjust_numa_imbalance(imbalance, dst_running);
+		imbalance = adjust_numa_imbalance(imbalance, dst_running,
+							env->dst_stats.weight);
 
 		/* Use idle CPU if there is no imbalance */
 		if (!imbalance) {
@@ -8993,16 +8995,15 @@  static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 
 #define NUMA_IMBALANCE_MIN 2
 
-static inline long adjust_numa_imbalance(int imbalance, int dst_running)
+static inline long adjust_numa_imbalance(int imbalance,
+				int dst_running, int dst_weight)
 {
-	unsigned int imbalance_min;
-
 	/*
 	 * Allow a small imbalance based on a simple pair of communicating
-	 * tasks that remain local when the source domain is almost idle.
+	 * tasks when the destination is lightly loaded so that the
+	 * communicating pair may remain local.
 	 */
-	imbalance_min = NUMA_IMBALANCE_MIN;
-	if (dst_running <= imbalance_min)
+	if (dst_running < (dst_weight >> 2) && imbalance <= NUMA_IMBALANCE_MIN)
 		return 0;
 
 	return imbalance;
@@ -9105,9 +9106,10 @@  static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 		}
 
 		/* Consider allowing a small imbalance between NUMA groups */
-		if (env->sd->flags & SD_NUMA)
+		if (env->sd->flags & SD_NUMA) {
 			env->imbalance = adjust_numa_imbalance(env->imbalance,
-						busiest->sum_nr_running);
+				busiest->sum_nr_running, busiest->group_weight);
+		}
 
 		return;
 	}