linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3 0/2] sched/fair: nohz.next_balance vs newly-idle CPUs
@ 2021-08-23 11:16 Valentin Schneider
  2021-08-23 11:16 ` [PATCH v3 1/2] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates Valentin Schneider
  2021-08-23 11:17 ` [PATCH v3 2/2] sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle Valentin Schneider
  0 siblings, 2 replies; 13+ messages in thread
From: Valentin Schneider @ 2021-08-23 11:16 UTC (permalink / raw)
  To: linux-kernel
  Cc: Peter Zijlstra, Ingo Molnar, Vincent Guittot, Dietmar Eggemann

Hi folks,

This was caught by our testing on an arm64 RB5 board - that's an 8-CPU
DynamIQ SoC with 4 littles, 3 mediums and 1 big. It seems to rely more on NOHZ
balancing than our other boards being tested, which highlighted that not
including a newly-idle CPU into nohz.next_balance can cause issues (especially
when the other CPUs have had their balance_interval inflated by pinned tasks).

As suggested by Vincent, the approach here is to mimic what was done for
nohz.has_blocked, which gives us sane(ish) ordering guarantees.

Revisions
=========

v2 -> v3
++++++++

o Rebased against latest tip/sched/core: 234b8ab6476c ("sched: Introduce
  dl_task_check_affinity() to check proposed affinity")
  
o Kept NOHZ_NEXT_KICK in NOHZ_KICK_MASK, but changed nohz_balancer_kick() to
  issue kicks with NOHZ_STATS_KICK | NOHZ_BALANCE_KICK instead (Dietmar)
o Added missing NOHZ_STATS_KICK gate for nohz.next_blocked update (Vincent)

v1 -> v2
++++++++

o Ditched the extra cpumasks and went with a sibling of nohz.has_blocked
  (Vincent) 

Cheers,
Valentin

Valentin Schneider (2):
  sched/fair: Add NOHZ balancer flag for nohz.next_balance updates
  sched/fair: Trigger nohz.next_balance updates when a CPU goes
    NOHZ-idle

 kernel/sched/fair.c  | 39 +++++++++++++++++++++++++++------------
 kernel/sched/sched.h |  8 +++++++-
 2 files changed, 34 insertions(+), 13 deletions(-)

--
2.25.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v3 1/2] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates
  2021-08-23 11:16 [PATCH v3 0/2] sched/fair: nohz.next_balance vs newly-idle CPUs Valentin Schneider
@ 2021-08-23 11:16 ` Valentin Schneider
  2021-08-23 11:59   ` Peter Zijlstra
                     ` (3 more replies)
  2021-08-23 11:17 ` [PATCH v3 2/2] sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle Valentin Schneider
  1 sibling, 4 replies; 13+ messages in thread
From: Valentin Schneider @ 2021-08-23 11:16 UTC (permalink / raw)
  To: linux-kernel
  Cc: Vincent Guittot, Peter Zijlstra, Ingo Molnar, Dietmar Eggemann

A following patch will trigger NOHZ idle balances as a means to update
nohz.next_balance. Vincent noted that blocked load updates can have
non-negligible overhead, which should be avoided if the intent is to only
update nohz.next_balance.

Add a new NOHZ balance kick flag, NOHZ_NEXT_KICK. Gate NOHZ blocked load
update by the presence of NOHZ_STATS_KICK - currently all NOHZ balance
kicks will have the NOHZ_STATS_KICK flag set, so no change in behaviour is
expected.

Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
---
 kernel/sched/fair.c  | 24 ++++++++++++++----------
 kernel/sched/sched.h |  8 +++++++-
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6cd05f1d77ef..4a91f3027c92 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10342,7 +10342,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		goto out;
 
 	if (rq->nr_running >= 2) {
-		flags = NOHZ_KICK_MASK;
+		flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 		goto out;
 	}
 
@@ -10356,7 +10356,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 * on.
 		 */
 		if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
-			flags = NOHZ_KICK_MASK;
+			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 			goto unlock;
 		}
 	}
@@ -10370,7 +10370,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 */
 		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
 			if (sched_asym_prefer(i, cpu)) {
-				flags = NOHZ_KICK_MASK;
+				flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 				goto unlock;
 			}
 		}
@@ -10383,7 +10383,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 * to run the misfit task on.
 		 */
 		if (check_misfit_status(rq, sd)) {
-			flags = NOHZ_KICK_MASK;
+			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 			goto unlock;
 		}
 
@@ -10410,7 +10410,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 */
 		nr_busy = atomic_read(&sds->nr_busy_cpus);
 		if (nr_busy > 1) {
-			flags = NOHZ_KICK_MASK;
+			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 			goto unlock;
 		}
 	}
@@ -10572,7 +10572,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 	 * setting the flag, we are sure to not clear the state and not
 	 * check the load of an idle cpu.
 	 */
-	WRITE_ONCE(nohz.has_blocked, 0);
+	if (flags & NOHZ_STATS_KICK)
+		WRITE_ONCE(nohz.has_blocked, 0);
 
 	/*
 	 * Ensures that if we miss the CPU, we must see the has_blocked
@@ -10594,13 +10595,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 		 * balancing owner will pick it up.
 		 */
 		if (need_resched()) {
-			has_blocked_load = true;
+			if (flags & NOHZ_STATS_KICK)
+				has_blocked_load = true;
 			goto abort;
 		}
 
 		rq = cpu_rq(balance_cpu);
 
-		has_blocked_load |= update_nohz_stats(rq);
+		if (flags & NOHZ_STATS_KICK)
+			has_blocked_load |= update_nohz_stats(rq);
 
 		/*
 		 * If time for next balance is due,
@@ -10631,8 +10634,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 	if (likely(update_next_balance))
 		nohz.next_balance = next_balance;
 
-	WRITE_ONCE(nohz.next_blocked,
-		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+	if (flags & NOHZ_STATS_KICK)
+		WRITE_ONCE(nohz.next_blocked,
+			   now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 
 abort:
 	/* There is still blocked load, enable periodic update */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e7e2bba5b520..30b7bd2ef25d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2706,12 +2706,18 @@ extern void cfs_bandwidth_usage_dec(void);
 #define NOHZ_BALANCE_KICK_BIT	0
 #define NOHZ_STATS_KICK_BIT	1
 #define NOHZ_NEWILB_KICK_BIT	2
+#define NOHZ_NEXT_KICK_BIT	3
 
+/* Run rebalance_domains() */
 #define NOHZ_BALANCE_KICK	BIT(NOHZ_BALANCE_KICK_BIT)
+/* Update blocked load */
 #define NOHZ_STATS_KICK		BIT(NOHZ_STATS_KICK_BIT)
+/* Update blocked load when entering idle */
 #define NOHZ_NEWILB_KICK	BIT(NOHZ_NEWILB_KICK_BIT)
+/* Update nohz.next_balance */
+#define NOHZ_NEXT_KICK		BIT(NOHZ_NEXT_KICK_BIT)
 
-#define NOHZ_KICK_MASK	(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
+#define NOHZ_KICK_MASK	(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)
 
 #define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags)
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v3 2/2] sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle
  2021-08-23 11:16 [PATCH v3 0/2] sched/fair: nohz.next_balance vs newly-idle CPUs Valentin Schneider
  2021-08-23 11:16 ` [PATCH v3 1/2] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates Valentin Schneider
@ 2021-08-23 11:17 ` Valentin Schneider
  2021-08-24  9:08   ` Vincent Guittot
                     ` (2 more replies)
  1 sibling, 3 replies; 13+ messages in thread
From: Valentin Schneider @ 2021-08-23 11:17 UTC (permalink / raw)
  To: linux-kernel
  Cc: Peter Zijlstra, Ingo Molnar, Vincent Guittot, Dietmar Eggemann

Consider a system with some NOHZ-idle CPUs, such that

  nohz.idle_cpus_mask = S
  nohz.next_balance = T

When a new CPU k goes NOHZ idle (nohz_balance_enter_idle()), we end up
with:

  nohz.idle_cpus_mask = S ∪ {k}
  nohz.next_balance = T

Note that the nohz.next_balance hasn't changed - it won't be updated until
a NOHZ balance is triggered. This is problematic if the newly NOHZ idle CPU
has an earlier rq.next_balance than the other NOHZ idle CPUs, IOW if:

  cpu_rq(k).next_balance < nohz.next_balance

In such scenarios, the existing nohz.next_balance will prevent any NOHZ
balance from happening, which itself will prevent nohz.next_balance from
being updated to this new cpu_rq(k).next_balance. Unnecessary load balance
delays of over 12ms caused by this were observed on an arm64 RB5 board.

Use the new nohz.needs_update flag to mark the presence of newly-idle CPUs
that need their rq->next_balance to be collated into
nohz.next_balance. Trigger a NOHZ_NEXT_KICK when the flag is set.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
---
 kernel/sched/fair.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4a91f3027c92..081a9e54058a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5754,6 +5754,7 @@ static struct {
 	cpumask_var_t idle_cpus_mask;
 	atomic_t nr_cpus;
 	int has_blocked;		/* Idle CPUS has blocked load */
+	int needs_update;		/* Newly idle CPUs need their next_balance collated */
 	unsigned long next_balance;     /* in jiffy units */
 	unsigned long next_blocked;	/* Next update of blocked load in jiffies */
 } nohz ____cacheline_aligned;
@@ -10417,6 +10418,9 @@ static void nohz_balancer_kick(struct rq *rq)
 unlock:
 	rcu_read_unlock();
 out:
+	if (READ_ONCE(nohz.needs_update))
+		flags |= NOHZ_NEXT_KICK;
+
 	if (flags)
 		kick_ilb(flags);
 }
@@ -10513,12 +10517,13 @@ void nohz_balance_enter_idle(int cpu)
 	/*
 	 * Ensures that if nohz_idle_balance() fails to observe our
 	 * @idle_cpus_mask store, it must observe the @has_blocked
-	 * store.
+	 * and @needs_update stores.
 	 */
 	smp_mb__after_atomic();
 
 	set_cpu_sd_state_idle(cpu);
 
+	WRITE_ONCE(nohz.needs_update, 1);
 out:
 	/*
 	 * Each time a cpu enter idle, we assume that it has blocked load and
@@ -10567,13 +10572,17 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 	/*
 	 * We assume there will be no idle load after this update and clear
 	 * the has_blocked flag. If a cpu enters idle in the mean time, it will
-	 * set the has_blocked flag and trig another update of idle load.
+	 * set the has_blocked flag and trigger another update of idle load.
 	 * Because a cpu that becomes idle, is added to idle_cpus_mask before
 	 * setting the flag, we are sure to not clear the state and not
 	 * check the load of an idle cpu.
+	 *
+	 * Same applies to idle_cpus_mask vs needs_update.
 	 */
 	if (flags & NOHZ_STATS_KICK)
 		WRITE_ONCE(nohz.has_blocked, 0);
+	if (flags & NOHZ_NEXT_KICK)
+		WRITE_ONCE(nohz.needs_update, 0);
 
 	/*
 	 * Ensures that if we miss the CPU, we must see the has_blocked
@@ -10597,6 +10606,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 		if (need_resched()) {
 			if (flags & NOHZ_STATS_KICK)
 				has_blocked_load = true;
+			if (flags & NOHZ_NEXT_KICK)
+				WRITE_ONCE(nohz.needs_update, 1);
 			goto abort;
 		}
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 1/2] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates
  2021-08-23 11:16 ` [PATCH v3 1/2] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates Valentin Schneider
@ 2021-08-23 11:59   ` Peter Zijlstra
  2021-08-23 12:57     ` Valentin Schneider
  2021-08-24  9:08   ` Vincent Guittot
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2021-08-23 11:59 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: linux-kernel, Vincent Guittot, Ingo Molnar, Dietmar Eggemann

On Mon, Aug 23, 2021 at 12:16:59PM +0100, Valentin Schneider wrote:

> Gate NOHZ blocked load
> update by the presence of NOHZ_STATS_KICK - currently all NOHZ balance
> kicks will have the NOHZ_STATS_KICK flag set, so no change in behaviour is
> expected.

> @@ -10572,7 +10572,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>  	 * setting the flag, we are sure to not clear the state and not
>  	 * check the load of an idle cpu.
>  	 */
> -	WRITE_ONCE(nohz.has_blocked, 0);
> +	if (flags & NOHZ_STATS_KICK)
> +		WRITE_ONCE(nohz.has_blocked, 0);
>  
>  	/*
>  	 * Ensures that if we miss the CPU, we must see the has_blocked
> @@ -10594,13 +10595,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>  		 * balancing owner will pick it up.
>  		 */
>  		if (need_resched()) {
> -			has_blocked_load = true;
> +			if (flags & NOHZ_STATS_KICK)
> +				has_blocked_load = true;
>  			goto abort;
>  		}
>  
>  		rq = cpu_rq(balance_cpu);
>  
> -		has_blocked_load |= update_nohz_stats(rq);
> +		if (flags & NOHZ_STATS_KICK)
> +			has_blocked_load |= update_nohz_stats(rq);
>  
>  		/*
>  		 * If time for next balance is due,
> @@ -10631,8 +10634,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>  	if (likely(update_next_balance))
>  		nohz.next_balance = next_balance;
>  
> -	WRITE_ONCE(nohz.next_blocked,
> -		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> +	if (flags & NOHZ_STATS_KICK)
> +		WRITE_ONCE(nohz.next_blocked,
> +			   now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>  
>  abort:
>  	/* There is still blocked load, enable periodic update */

I'm a bit puzzled by this; that function has:

  SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);

Which:

 - isn't updated
 - implies STATS must be set when BALANCE

the latter gives rise to my confusion; why add that gate on STATS? It
just doesn't make sense to do a BALANCE and not update STATS.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 1/2] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates
  2021-08-23 11:59   ` Peter Zijlstra
@ 2021-08-23 12:57     ` Valentin Schneider
  2021-08-23 13:53       ` Dietmar Eggemann
  0 siblings, 1 reply; 13+ messages in thread
From: Valentin Schneider @ 2021-08-23 12:57 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Vincent Guittot, Ingo Molnar, Dietmar Eggemann

On 23/08/21 13:59, Peter Zijlstra wrote:
> On Mon, Aug 23, 2021 at 12:16:59PM +0100, Valentin Schneider wrote:
>
>> Gate NOHZ blocked load
>> update by the presence of NOHZ_STATS_KICK - currently all NOHZ balance
>> kicks will have the NOHZ_STATS_KICK flag set, so no change in behaviour is
>> expected.
>
>> @@ -10572,7 +10572,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>>       * setting the flag, we are sure to not clear the state and not
>>       * check the load of an idle cpu.
>>       */
>> -	WRITE_ONCE(nohz.has_blocked, 0);
>> +	if (flags & NOHZ_STATS_KICK)
>> +		WRITE_ONCE(nohz.has_blocked, 0);
>>
>>      /*
>>       * Ensures that if we miss the CPU, we must see the has_blocked
>> @@ -10594,13 +10595,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>>               * balancing owner will pick it up.
>>               */
>>              if (need_resched()) {
>> -			has_blocked_load = true;
>> +			if (flags & NOHZ_STATS_KICK)
>> +				has_blocked_load = true;
>>                      goto abort;
>>              }
>>
>>              rq = cpu_rq(balance_cpu);
>>
>> -		has_blocked_load |= update_nohz_stats(rq);
>> +		if (flags & NOHZ_STATS_KICK)
>> +			has_blocked_load |= update_nohz_stats(rq);
>>
>>              /*
>>               * If time for next balance is due,
>> @@ -10631,8 +10634,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>>      if (likely(update_next_balance))
>>              nohz.next_balance = next_balance;
>>
>> -	WRITE_ONCE(nohz.next_blocked,
>> -		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> +	if (flags & NOHZ_STATS_KICK)
>> +		WRITE_ONCE(nohz.next_blocked,
>> +			   now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>>
>>  abort:
>>      /* There is still blocked load, enable periodic update */
>
> I'm a bit puzzled by this; that function has:
>
>   SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>
> Which:
>
>  - isn't updated
>  - implies STATS must be set when BALANCE

Yup

>
> the latter gives rise to my confusion; why add that gate on STATS? It
> just doesn't make sense to do a BALANCE and not update STATS.

AFAIA that warning was only there to catch BALANCE && !STATS, so I didn't
tweak it.

Now, you could still end up with

  flags == NOHZ_NEXT_KICK

(e.g. nohz.next_balance is in the future, but a new CPU entered NOHZ-idle
and needs its own rq.next_balance collated into the nohz struct)

in which case you don't do any blocked load update, hence the
gate. In v1 I had that piggyback on NOHZ_STATS_KICK, but Vincent noted
that might not be the best given blocked load updates can be time
consuming - hence the separate flag.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 1/2] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates
  2021-08-23 12:57     ` Valentin Schneider
@ 2021-08-23 13:53       ` Dietmar Eggemann
  2021-08-24  8:11         ` Peter Zijlstra
  0 siblings, 1 reply; 13+ messages in thread
From: Dietmar Eggemann @ 2021-08-23 13:53 UTC (permalink / raw)
  To: Valentin Schneider, Peter Zijlstra
  Cc: linux-kernel, Vincent Guittot, Ingo Molnar

On 23/08/2021 14:57, Valentin Schneider wrote:
> On 23/08/21 13:59, Peter Zijlstra wrote:
>> On Mon, Aug 23, 2021 at 12:16:59PM +0100, Valentin Schneider wrote:
>>
>>> Gate NOHZ blocked load
>>> update by the presence of NOHZ_STATS_KICK - currently all NOHZ balance
>>> kicks will have the NOHZ_STATS_KICK flag set, so no change in behaviour is
>>> expected.
>>
>>> @@ -10572,7 +10572,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>>>       * setting the flag, we are sure to not clear the state and not
>>>       * check the load of an idle cpu.
>>>       */
>>> -	WRITE_ONCE(nohz.has_blocked, 0);
>>> +	if (flags & NOHZ_STATS_KICK)
>>> +		WRITE_ONCE(nohz.has_blocked, 0);
>>>
>>>      /*
>>>       * Ensures that if we miss the CPU, we must see the has_blocked
>>> @@ -10594,13 +10595,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>>>               * balancing owner will pick it up.
>>>               */
>>>              if (need_resched()) {
>>> -			has_blocked_load = true;
>>> +			if (flags & NOHZ_STATS_KICK)
>>> +				has_blocked_load = true;
>>>                      goto abort;
>>>              }
>>>
>>>              rq = cpu_rq(balance_cpu);
>>>
>>> -		has_blocked_load |= update_nohz_stats(rq);
>>> +		if (flags & NOHZ_STATS_KICK)
>>> +			has_blocked_load |= update_nohz_stats(rq);
>>>
>>>              /*
>>>               * If time for next balance is due,
>>> @@ -10631,8 +10634,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>>>      if (likely(update_next_balance))
>>>              nohz.next_balance = next_balance;
>>>
>>> -	WRITE_ONCE(nohz.next_blocked,
>>> -		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>>> +	if (flags & NOHZ_STATS_KICK)
>>> +		WRITE_ONCE(nohz.next_blocked,
>>> +			   now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>>>
>>>  abort:
>>>      /* There is still blocked load, enable periodic update */
>>
>> I'm a bit puzzled by this; that function has:
>>
>>   SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>
>> Which:
>>
>>  - isn't updated
>>  - implies STATS must be set when BALANCE
> 
> Yup
> 
>>
>> the latter gives rise to my confusion; why add that gate on STATS? It
>> just doesn't make sense to do a BALANCE and not update STATS.
> 
> AFAIA that warning was only there to catch BALANCE && !STATS, so I didn't
> tweak it.
> 
> Now, you could still end up with
> 
>   flags == NOHZ_NEXT_KICK
> 
> (e.g. nohz.next_balance is in the future, but a new CPU entered NOHZ-idle
> and needs its own rq.next_balance collated into the nohz struct)
> 
> in which case you don't do any blocked load update, hence the
> gate. In v1 I had that piggyback on NOHZ_STATS_KICK, but Vincent noted
> that might not be the best given blocked load updates can be time
> consuming - hence the separate flag.

Maybe the confusion stems from the fact that the NOHZ_NEXT_KICK-set
changes are only introduced in 2/2?

@@ -10417,6 +10418,9 @@ static void nohz_balancer_kick(struct rq *rq)
 unlock:
 	rcu_read_unlock();
 out:
+	if (READ_ONCE(nohz.needs_update))
+		flags |= NOHZ_NEXT_KICK;
+

@@ -10513,12 +10517,13 @@ void nohz_balance_enter_idle(int cpu)

...

+	WRITE_ONCE(nohz.needs_update, 1);

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 1/2] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates
  2021-08-23 13:53       ` Dietmar Eggemann
@ 2021-08-24  8:11         ` Peter Zijlstra
  0 siblings, 0 replies; 13+ messages in thread
From: Peter Zijlstra @ 2021-08-24  8:11 UTC (permalink / raw)
  To: Dietmar Eggemann
  Cc: Valentin Schneider, linux-kernel, Vincent Guittot, Ingo Molnar

On Mon, Aug 23, 2021 at 03:53:16PM +0200, Dietmar Eggemann wrote:
> >> I'm a bit puzzled by this; that function has:
> >>
> >>   SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
> >>
> >> Which:
> >>
> >>  - isn't updated
> >>  - implies STATS must be set when BALANCE
> > 
> > Yup
> > 
> >>
> >> the latter gives rise to my confusion; why add that gate on STATS? It
> >> just doesn't make sense to do a BALANCE and not update STATS.
> > 
> > AFAIA that warning was only there to catch BALANCE && !STATS, so I didn't
> > tweak it.
> > 
> > Now, you could still end up with
> > 
> >   flags == NOHZ_NEXT_KICK
> > 
> > (e.g. nohz.next_balance is in the future, but a new CPU entered NOHZ-idle
> > and needs its own rq.next_balance collated into the nohz struct)
> > 
> > in which case you don't do any blocked load update, hence the
> > gate. In v1 I had that piggyback on NOHZ_STATS_KICK, but Vincent noted
> > that might not be the best given blocked load updates can be time
> > consuming - hence the separate flag.
> 
> Maybe the confusion stems from the fact that the NOHZ_NEXT_KICK-set
> changes are only introduced in 2/2?
> 
> @@ -10417,6 +10418,9 @@ static void nohz_balancer_kick(struct rq *rq)
>  unlock:
>  	rcu_read_unlock();
>  out:
> +	if (READ_ONCE(nohz.needs_update))
> +		flags |= NOHZ_NEXT_KICK;
> +

The confusion was about how we'd ever get there and not have STATS set,
but I guess having it all nicely gated does make it saner.

Thanks!

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 1/2] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates
  2021-08-23 11:16 ` [PATCH v3 1/2] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates Valentin Schneider
  2021-08-23 11:59   ` Peter Zijlstra
@ 2021-08-24  9:08   ` Vincent Guittot
  2021-09-09 11:18   ` [tip: sched/core] " tip-bot2 for Valentin Schneider
  2021-10-05 14:12   ` tip-bot2 for Valentin Schneider
  3 siblings, 0 replies; 13+ messages in thread
From: Vincent Guittot @ 2021-08-24  9:08 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, Dietmar Eggemann

On Mon, 23 Aug 2021 at 13:17, Valentin Schneider
<valentin.schneider@arm.com> wrote:
>
> A following patch will trigger NOHZ idle balances as a means to update
> nohz.next_balance. Vincent noted that blocked load updates can have
> non-negligible overhead, which should be avoided if the intent is to only
> update nohz.next_balance.
>
> Add a new NOHZ balance kick flag, NOHZ_NEXT_KICK. Gate NOHZ blocked load
> update by the presence of NOHZ_STATS_KICK - currently all NOHZ balance
> kicks will have the NOHZ_STATS_KICK flag set, so no change in behaviour is
> expected.
>
> Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
> Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>

Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>

> ---
>  kernel/sched/fair.c  | 24 ++++++++++++++----------
>  kernel/sched/sched.h |  8 +++++++-
>  2 files changed, 21 insertions(+), 11 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6cd05f1d77ef..4a91f3027c92 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -10342,7 +10342,7 @@ static void nohz_balancer_kick(struct rq *rq)
>                 goto out;
>
>         if (rq->nr_running >= 2) {
> -               flags = NOHZ_KICK_MASK;
> +               flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
>                 goto out;
>         }
>
> @@ -10356,7 +10356,7 @@ static void nohz_balancer_kick(struct rq *rq)
>                  * on.
>                  */
>                 if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
> -                       flags = NOHZ_KICK_MASK;
> +                       flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
>                         goto unlock;
>                 }
>         }
> @@ -10370,7 +10370,7 @@ static void nohz_balancer_kick(struct rq *rq)
>                  */
>                 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
>                         if (sched_asym_prefer(i, cpu)) {
> -                               flags = NOHZ_KICK_MASK;
> +                               flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
>                                 goto unlock;
>                         }
>                 }
> @@ -10383,7 +10383,7 @@ static void nohz_balancer_kick(struct rq *rq)
>                  * to run the misfit task on.
>                  */
>                 if (check_misfit_status(rq, sd)) {
> -                       flags = NOHZ_KICK_MASK;
> +                       flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
>                         goto unlock;
>                 }
>
> @@ -10410,7 +10410,7 @@ static void nohz_balancer_kick(struct rq *rq)
>                  */
>                 nr_busy = atomic_read(&sds->nr_busy_cpus);
>                 if (nr_busy > 1) {
> -                       flags = NOHZ_KICK_MASK;
> +                       flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
>                         goto unlock;
>                 }
>         }
> @@ -10572,7 +10572,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>          * setting the flag, we are sure to not clear the state and not
>          * check the load of an idle cpu.
>          */
> -       WRITE_ONCE(nohz.has_blocked, 0);
> +       if (flags & NOHZ_STATS_KICK)
> +               WRITE_ONCE(nohz.has_blocked, 0);
>
>         /*
>          * Ensures that if we miss the CPU, we must see the has_blocked
> @@ -10594,13 +10595,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>                  * balancing owner will pick it up.
>                  */
>                 if (need_resched()) {
> -                       has_blocked_load = true;
> +                       if (flags & NOHZ_STATS_KICK)
> +                               has_blocked_load = true;
>                         goto abort;
>                 }
>
>                 rq = cpu_rq(balance_cpu);
>
> -               has_blocked_load |= update_nohz_stats(rq);
> +               if (flags & NOHZ_STATS_KICK)
> +                       has_blocked_load |= update_nohz_stats(rq);
>
>                 /*
>                  * If time for next balance is due,
> @@ -10631,8 +10634,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>         if (likely(update_next_balance))
>                 nohz.next_balance = next_balance;
>
> -       WRITE_ONCE(nohz.next_blocked,
> -               now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> +       if (flags & NOHZ_STATS_KICK)
> +               WRITE_ONCE(nohz.next_blocked,
> +                          now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>
>  abort:
>         /* There is still blocked load, enable periodic update */
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e7e2bba5b520..30b7bd2ef25d 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2706,12 +2706,18 @@ extern void cfs_bandwidth_usage_dec(void);
>  #define NOHZ_BALANCE_KICK_BIT  0
>  #define NOHZ_STATS_KICK_BIT    1
>  #define NOHZ_NEWILB_KICK_BIT   2
> +#define NOHZ_NEXT_KICK_BIT     3
>
> +/* Run rebalance_domains() */
>  #define NOHZ_BALANCE_KICK      BIT(NOHZ_BALANCE_KICK_BIT)
> +/* Update blocked load */
>  #define NOHZ_STATS_KICK                BIT(NOHZ_STATS_KICK_BIT)
> +/* Update blocked load when entering idle */
>  #define NOHZ_NEWILB_KICK       BIT(NOHZ_NEWILB_KICK_BIT)
> +/* Update nohz.next_balance */
> +#define NOHZ_NEXT_KICK         BIT(NOHZ_NEXT_KICK_BIT)
>
> -#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
> +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)
>
>  #define nohz_flags(cpu)        (&cpu_rq(cpu)->nohz_flags)
>
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 2/2] sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle
  2021-08-23 11:17 ` [PATCH v3 2/2] sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle Valentin Schneider
@ 2021-08-24  9:08   ` Vincent Guittot
  2021-09-09 11:18   ` [tip: sched/core] " tip-bot2 for Valentin Schneider
  2021-10-05 14:12   ` tip-bot2 for Valentin Schneider
  2 siblings, 0 replies; 13+ messages in thread
From: Vincent Guittot @ 2021-08-24  9:08 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, Dietmar Eggemann

On Mon, 23 Aug 2021 at 13:17, Valentin Schneider
<valentin.schneider@arm.com> wrote:
>
> Consider a system with some NOHZ-idle CPUs, such that
>
>   nohz.idle_cpus_mask = S
>   nohz.next_balance = T
>
> When a new CPU k goes NOHZ idle (nohz_balance_enter_idle()), we end up
> with:
>
>   nohz.idle_cpus_mask = S ∪ {k}
>   nohz.next_balance = T
>
> Note that the nohz.next_balance hasn't changed - it won't be updated until
> a NOHZ balance is triggered. This is problematic if the newly NOHZ idle CPU
> has an earlier rq.next_balance than the other NOHZ idle CPUs, IOW if:
>
>   cpu_rq(k).next_balance < nohz.next_balance
>
> In such scenarios, the existing nohz.next_balance will prevent any NOHZ
> balance from happening, which itself will prevent nohz.next_balance from
> being updated to this new cpu_rq(k).next_balance. Unnecessary load balance
> delays of over 12ms caused by this were observed on an arm64 RB5 board.
>
> Use the new nohz.needs_update flag to mark the presence of newly-idle CPUs
> that need their rq->next_balance to be collated into
> nohz.next_balance. Trigger a NOHZ_NEXT_KICK when the flag is set.
>
> Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>

Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>

> ---
>  kernel/sched/fair.c | 15 +++++++++++++--
>  1 file changed, 13 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 4a91f3027c92..081a9e54058a 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5754,6 +5754,7 @@ static struct {
>         cpumask_var_t idle_cpus_mask;
>         atomic_t nr_cpus;
>         int has_blocked;                /* Idle CPUS has blocked load */
> +       int needs_update;               /* Newly idle CPUs need their next_balance collated */
>         unsigned long next_balance;     /* in jiffy units */
>         unsigned long next_blocked;     /* Next update of blocked load in jiffies */
>  } nohz ____cacheline_aligned;
> @@ -10417,6 +10418,9 @@ static void nohz_balancer_kick(struct rq *rq)
>  unlock:
>         rcu_read_unlock();
>  out:
> +       if (READ_ONCE(nohz.needs_update))
> +               flags |= NOHZ_NEXT_KICK;
> +
>         if (flags)
>                 kick_ilb(flags);
>  }
> @@ -10513,12 +10517,13 @@ void nohz_balance_enter_idle(int cpu)
>         /*
>          * Ensures that if nohz_idle_balance() fails to observe our
>          * @idle_cpus_mask store, it must observe the @has_blocked
> -        * store.
> +        * and @needs_update stores.
>          */
>         smp_mb__after_atomic();
>
>         set_cpu_sd_state_idle(cpu);
>
> +       WRITE_ONCE(nohz.needs_update, 1);
>  out:
>         /*
>          * Each time a cpu enter idle, we assume that it has blocked load and
> @@ -10567,13 +10572,17 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>         /*
>          * We assume there will be no idle load after this update and clear
>          * the has_blocked flag. If a cpu enters idle in the mean time, it will
> -        * set the has_blocked flag and trig another update of idle load.
> +        * set the has_blocked flag and trigger another update of idle load.
>          * Because a cpu that becomes idle, is added to idle_cpus_mask before
>          * setting the flag, we are sure to not clear the state and not
>          * check the load of an idle cpu.
> +        *
> +        * Same applies to idle_cpus_mask vs needs_update.
>          */
>         if (flags & NOHZ_STATS_KICK)
>                 WRITE_ONCE(nohz.has_blocked, 0);
> +       if (flags & NOHZ_NEXT_KICK)
> +               WRITE_ONCE(nohz.needs_update, 0);
>
>         /*
>          * Ensures that if we miss the CPU, we must see the has_blocked
> @@ -10597,6 +10606,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>                 if (need_resched()) {
>                         if (flags & NOHZ_STATS_KICK)
>                                 has_blocked_load = true;
> +                       if (flags & NOHZ_NEXT_KICK)
> +                               WRITE_ONCE(nohz.needs_update, 1);
>                         goto abort;
>                 }
>
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [tip: sched/core] sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle
  2021-08-23 11:17 ` [PATCH v3 2/2] sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle Valentin Schneider
  2021-08-24  9:08   ` Vincent Guittot
@ 2021-09-09 11:18   ` tip-bot2 for Valentin Schneider
  2021-10-05 14:12   ` tip-bot2 for Valentin Schneider
  2 siblings, 0 replies; 13+ messages in thread
From: tip-bot2 for Valentin Schneider @ 2021-09-09 11:18 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Valentin Schneider, Peter Zijlstra (Intel),
	Vincent Guittot, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     df100a6682d3d9d4b7cbb531a3f783035732ba92
Gitweb:        https://git.kernel.org/tip/df100a6682d3d9d4b7cbb531a3f783035732ba92
Author:        Valentin Schneider <valentin.schneider@arm.com>
AuthorDate:    Mon, 23 Aug 2021 12:17:00 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 09 Sep 2021 11:27:30 +02:00

sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle

Consider a system with some NOHZ-idle CPUs, such that

  nohz.idle_cpus_mask = S
  nohz.next_balance = T

When a new CPU k goes NOHZ idle (nohz_balance_enter_idle()), we end up
with:

  nohz.idle_cpus_mask = S ∪ {k}
  nohz.next_balance = T

Note that nohz.next_balance hasn't changed - it won't be updated until
a NOHZ balance is triggered. This is problematic if the newly NOHZ-idle CPU
has an earlier rq->next_balance than the other NOHZ-idle CPUs, i.e. if:

  cpu_rq(k).next_balance < nohz.next_balance

In such scenarios, the existing nohz.next_balance will prevent any NOHZ
balance from happening, which itself will prevent nohz.next_balance from
being updated to this new cpu_rq(k).next_balance. Unnecessary load balance
delays of over 12ms caused by this were observed on an arm64 RB5 board.

Use the new nohz.needs_update flag to mark the presence of newly-idle CPUs
that need their rq->next_balance to be collated into
nohz.next_balance. Trigger a NOHZ_NEXT_KICK when the flag is set.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20210823111700.2842997-3-valentin.schneider@arm.com
---
 kernel/sched/fair.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 48ce754..2a5efde 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5754,6 +5754,7 @@ static struct {
 	cpumask_var_t idle_cpus_mask;
 	atomic_t nr_cpus;
 	int has_blocked;		/* Idle CPUS has blocked load */
+	int needs_update;		/* Newly idle CPUs need their next_balance collated */
 	unsigned long next_balance;     /* in jiffy units */
 	unsigned long next_blocked;	/* Next update of blocked load in jiffies */
 } nohz ____cacheline_aligned;
@@ -10417,6 +10418,9 @@ static void nohz_balancer_kick(struct rq *rq)
 unlock:
 	rcu_read_unlock();
 out:
+	if (READ_ONCE(nohz.needs_update))
+		flags |= NOHZ_NEXT_KICK;
+
 	if (flags)
 		kick_ilb(flags);
 }
@@ -10513,12 +10517,13 @@ void nohz_balance_enter_idle(int cpu)
 	/*
 	 * Ensures that if nohz_idle_balance() fails to observe our
 	 * @idle_cpus_mask store, it must observe the @has_blocked
-	 * store.
+	 * and @needs_update stores.
 	 */
 	smp_mb__after_atomic();
 
 	set_cpu_sd_state_idle(cpu);
 
+	WRITE_ONCE(nohz.needs_update, 1);
 out:
 	/*
 	 * Each time a cpu enter idle, we assume that it has blocked load and
@@ -10567,13 +10572,17 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 	/*
 	 * We assume there will be no idle load after this update and clear
 	 * the has_blocked flag. If a cpu enters idle in the mean time, it will
-	 * set the has_blocked flag and trig another update of idle load.
+	 * set the has_blocked flag and trigger another update of idle load.
 	 * Because a cpu that becomes idle, is added to idle_cpus_mask before
 	 * setting the flag, we are sure to not clear the state and not
 	 * check the load of an idle cpu.
+	 *
+	 * Same applies to idle_cpus_mask vs needs_update.
 	 */
 	if (flags & NOHZ_STATS_KICK)
 		WRITE_ONCE(nohz.has_blocked, 0);
+	if (flags & NOHZ_NEXT_KICK)
+		WRITE_ONCE(nohz.needs_update, 0);
 
 	/*
 	 * Ensures that if we miss the CPU, we must see the has_blocked
@@ -10597,6 +10606,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 		if (need_resched()) {
 			if (flags & NOHZ_STATS_KICK)
 				has_blocked_load = true;
+			if (flags & NOHZ_NEXT_KICK)
+				WRITE_ONCE(nohz.needs_update, 1);
 			goto abort;
 		}
 

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [tip: sched/core] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates
  2021-08-23 11:16 ` [PATCH v3 1/2] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates Valentin Schneider
  2021-08-23 11:59   ` Peter Zijlstra
  2021-08-24  9:08   ` Vincent Guittot
@ 2021-09-09 11:18   ` tip-bot2 for Valentin Schneider
  2021-10-05 14:12   ` tip-bot2 for Valentin Schneider
  3 siblings, 0 replies; 13+ messages in thread
From: tip-bot2 for Valentin Schneider @ 2021-09-09 11:18 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Vincent Guittot, Valentin Schneider, Peter Zijlstra (Intel),
	x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     013ce5ed58f799a2f035b732f904f6ebd8e8d881
Gitweb:        https://git.kernel.org/tip/013ce5ed58f799a2f035b732f904f6ebd8e8d881
Author:        Valentin Schneider <valentin.schneider@arm.com>
AuthorDate:    Mon, 23 Aug 2021 12:16:59 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 09 Sep 2021 11:27:29 +02:00

sched/fair: Add NOHZ balancer flag for nohz.next_balance updates

A following patch will trigger NOHZ idle balances as a means to update
nohz.next_balance. Vincent noted that blocked load updates can have
non-negligible overhead, which should be avoided if the intent is to only
update nohz.next_balance.

Add a new NOHZ balance kick flag, NOHZ_NEXT_KICK. Gate NOHZ blocked load
update by the presence of NOHZ_STATS_KICK - currently all NOHZ balance
kicks will have the NOHZ_STATS_KICK flag set, so no change in behaviour is
expected.

Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20210823111700.2842997-2-valentin.schneider@arm.com
---
 kernel/sched/fair.c  | 24 ++++++++++++++----------
 kernel/sched/sched.h |  8 +++++++-
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7b3e859..48ce754 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10342,7 +10342,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		goto out;
 
 	if (rq->nr_running >= 2) {
-		flags = NOHZ_KICK_MASK;
+		flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 		goto out;
 	}
 
@@ -10356,7 +10356,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 * on.
 		 */
 		if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
-			flags = NOHZ_KICK_MASK;
+			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 			goto unlock;
 		}
 	}
@@ -10370,7 +10370,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 */
 		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
 			if (sched_asym_prefer(i, cpu)) {
-				flags = NOHZ_KICK_MASK;
+				flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 				goto unlock;
 			}
 		}
@@ -10383,7 +10383,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 * to run the misfit task on.
 		 */
 		if (check_misfit_status(rq, sd)) {
-			flags = NOHZ_KICK_MASK;
+			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 			goto unlock;
 		}
 
@@ -10410,7 +10410,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 */
 		nr_busy = atomic_read(&sds->nr_busy_cpus);
 		if (nr_busy > 1) {
-			flags = NOHZ_KICK_MASK;
+			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 			goto unlock;
 		}
 	}
@@ -10572,7 +10572,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 	 * setting the flag, we are sure to not clear the state and not
 	 * check the load of an idle cpu.
 	 */
-	WRITE_ONCE(nohz.has_blocked, 0);
+	if (flags & NOHZ_STATS_KICK)
+		WRITE_ONCE(nohz.has_blocked, 0);
 
 	/*
 	 * Ensures that if we miss the CPU, we must see the has_blocked
@@ -10594,13 +10595,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 		 * balancing owner will pick it up.
 		 */
 		if (need_resched()) {
-			has_blocked_load = true;
+			if (flags & NOHZ_STATS_KICK)
+				has_blocked_load = true;
 			goto abort;
 		}
 
 		rq = cpu_rq(balance_cpu);
 
-		has_blocked_load |= update_nohz_stats(rq);
+		if (flags & NOHZ_STATS_KICK)
+			has_blocked_load |= update_nohz_stats(rq);
 
 		/*
 		 * If time for next balance is due,
@@ -10631,8 +10634,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 	if (likely(update_next_balance))
 		nohz.next_balance = next_balance;
 
-	WRITE_ONCE(nohz.next_blocked,
-		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+	if (flags & NOHZ_STATS_KICK)
+		WRITE_ONCE(nohz.next_blocked,
+			   now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 
 abort:
 	/* There is still blocked load, enable periodic update */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e7e2bba..30b7bd2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2706,12 +2706,18 @@ extern void cfs_bandwidth_usage_dec(void);
 #define NOHZ_BALANCE_KICK_BIT	0
 #define NOHZ_STATS_KICK_BIT	1
 #define NOHZ_NEWILB_KICK_BIT	2
+#define NOHZ_NEXT_KICK_BIT	3
 
+/* Run rebalance_domains() */
 #define NOHZ_BALANCE_KICK	BIT(NOHZ_BALANCE_KICK_BIT)
+/* Update blocked load */
 #define NOHZ_STATS_KICK		BIT(NOHZ_STATS_KICK_BIT)
+/* Update blocked load when entering idle */
 #define NOHZ_NEWILB_KICK	BIT(NOHZ_NEWILB_KICK_BIT)
+/* Update nohz.next_balance */
+#define NOHZ_NEXT_KICK		BIT(NOHZ_NEXT_KICK_BIT)
 
-#define NOHZ_KICK_MASK	(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
+#define NOHZ_KICK_MASK	(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)
 
 #define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags)
 

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [tip: sched/core] sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle
  2021-08-23 11:17 ` [PATCH v3 2/2] sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle Valentin Schneider
  2021-08-24  9:08   ` Vincent Guittot
  2021-09-09 11:18   ` [tip: sched/core] " tip-bot2 for Valentin Schneider
@ 2021-10-05 14:12   ` tip-bot2 for Valentin Schneider
  2 siblings, 0 replies; 13+ messages in thread
From: tip-bot2 for Valentin Schneider @ 2021-10-05 14:12 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Valentin Schneider, Peter Zijlstra (Intel),
	Vincent Guittot, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     7fd7a9e0caba10829b4f8db1aa7711b558681fd4
Gitweb:        https://git.kernel.org/tip/7fd7a9e0caba10829b4f8db1aa7711b558681fd4
Author:        Valentin Schneider <valentin.schneider@arm.com>
AuthorDate:    Mon, 23 Aug 2021 12:17:00 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 05 Oct 2021 15:51:31 +02:00

sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle

Consider a system with some NOHZ-idle CPUs, such that

  nohz.idle_cpus_mask = S
  nohz.next_balance = T

When a new CPU k goes NOHZ idle (nohz_balance_enter_idle()), we end up
with:

  nohz.idle_cpus_mask = S ∪ {k}
  nohz.next_balance = T

Note that nohz.next_balance hasn't changed - it won't be updated until
a NOHZ balance is triggered. This is problematic if the newly NOHZ-idle CPU
has an earlier rq->next_balance than the other NOHZ-idle CPUs, i.e. if:

  cpu_rq(k).next_balance < nohz.next_balance

In such scenarios, the existing nohz.next_balance will prevent any NOHZ
balance from happening, which itself will prevent nohz.next_balance from
being updated to this new cpu_rq(k).next_balance. Unnecessary load balance
delays of over 12ms caused by this were observed on an arm64 RB5 board.

Use the new nohz.needs_update flag to mark the presence of newly-idle CPUs
that need their rq->next_balance to be collated into
nohz.next_balance. Trigger a NOHZ_NEXT_KICK when the flag is set.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20210823111700.2842997-3-valentin.schneider@arm.com
---
 kernel/sched/fair.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f4de7f5..6cc958e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5787,6 +5787,7 @@ static struct {
 	cpumask_var_t idle_cpus_mask;
 	atomic_t nr_cpus;
 	int has_blocked;		/* Idle CPUS has blocked load */
+	int needs_update;		/* Newly idle CPUs need their next_balance collated */
 	unsigned long next_balance;     /* in jiffy units */
 	unsigned long next_blocked;	/* Next update of blocked load in jiffies */
 } nohz ____cacheline_aligned;
@@ -10450,6 +10451,9 @@ static void nohz_balancer_kick(struct rq *rq)
 unlock:
 	rcu_read_unlock();
 out:
+	if (READ_ONCE(nohz.needs_update))
+		flags |= NOHZ_NEXT_KICK;
+
 	if (flags)
 		kick_ilb(flags);
 }
@@ -10546,12 +10550,13 @@ void nohz_balance_enter_idle(int cpu)
 	/*
 	 * Ensures that if nohz_idle_balance() fails to observe our
 	 * @idle_cpus_mask store, it must observe the @has_blocked
-	 * store.
+	 * and @needs_update stores.
 	 */
 	smp_mb__after_atomic();
 
 	set_cpu_sd_state_idle(cpu);
 
+	WRITE_ONCE(nohz.needs_update, 1);
 out:
 	/*
 	 * Each time a cpu enter idle, we assume that it has blocked load and
@@ -10600,13 +10605,17 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 	/*
 	 * We assume there will be no idle load after this update and clear
 	 * the has_blocked flag. If a cpu enters idle in the mean time, it will
-	 * set the has_blocked flag and trig another update of idle load.
+	 * set the has_blocked flag and trigger another update of idle load.
 	 * Because a cpu that becomes idle, is added to idle_cpus_mask before
 	 * setting the flag, we are sure to not clear the state and not
 	 * check the load of an idle cpu.
+	 *
+	 * Same applies to idle_cpus_mask vs needs_update.
 	 */
 	if (flags & NOHZ_STATS_KICK)
 		WRITE_ONCE(nohz.has_blocked, 0);
+	if (flags & NOHZ_NEXT_KICK)
+		WRITE_ONCE(nohz.needs_update, 0);
 
 	/*
 	 * Ensures that if we miss the CPU, we must see the has_blocked
@@ -10630,6 +10639,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 		if (need_resched()) {
 			if (flags & NOHZ_STATS_KICK)
 				has_blocked_load = true;
+			if (flags & NOHZ_NEXT_KICK)
+				WRITE_ONCE(nohz.needs_update, 1);
 			goto abort;
 		}
 

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [tip: sched/core] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates
  2021-08-23 11:16 ` [PATCH v3 1/2] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates Valentin Schneider
                     ` (2 preceding siblings ...)
  2021-09-09 11:18   ` [tip: sched/core] " tip-bot2 for Valentin Schneider
@ 2021-10-05 14:12   ` tip-bot2 for Valentin Schneider
  3 siblings, 0 replies; 13+ messages in thread
From: tip-bot2 for Valentin Schneider @ 2021-10-05 14:12 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Vincent Guittot, Valentin Schneider, Peter Zijlstra (Intel),
	x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     efd984c481abb516fab8bafb25bf41fd9397a43c
Gitweb:        https://git.kernel.org/tip/efd984c481abb516fab8bafb25bf41fd9397a43c
Author:        Valentin Schneider <valentin.schneider@arm.com>
AuthorDate:    Mon, 23 Aug 2021 12:16:59 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 05 Oct 2021 15:51:30 +02:00

sched/fair: Add NOHZ balancer flag for nohz.next_balance updates

A following patch will trigger NOHZ idle balances as a means to update
nohz.next_balance. Vincent noted that blocked load updates can have
non-negligible overhead, which should be avoided if the intent is to only
update nohz.next_balance.

Add a new NOHZ balance kick flag, NOHZ_NEXT_KICK. Gate NOHZ blocked load
update by the presence of NOHZ_STATS_KICK - currently all NOHZ balance
kicks will have the NOHZ_STATS_KICK flag set, so no change in behaviour is
expected.

Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20210823111700.2842997-2-valentin.schneider@arm.com
---
 kernel/sched/fair.c  | 24 ++++++++++++++----------
 kernel/sched/sched.h |  8 +++++++-
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f6a05d9..f4de7f5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10375,7 +10375,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		goto out;
 
 	if (rq->nr_running >= 2) {
-		flags = NOHZ_KICK_MASK;
+		flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 		goto out;
 	}
 
@@ -10389,7 +10389,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 * on.
 		 */
 		if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
-			flags = NOHZ_KICK_MASK;
+			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 			goto unlock;
 		}
 	}
@@ -10403,7 +10403,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 */
 		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
 			if (sched_asym_prefer(i, cpu)) {
-				flags = NOHZ_KICK_MASK;
+				flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 				goto unlock;
 			}
 		}
@@ -10416,7 +10416,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 * to run the misfit task on.
 		 */
 		if (check_misfit_status(rq, sd)) {
-			flags = NOHZ_KICK_MASK;
+			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 			goto unlock;
 		}
 
@@ -10443,7 +10443,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 */
 		nr_busy = atomic_read(&sds->nr_busy_cpus);
 		if (nr_busy > 1) {
-			flags = NOHZ_KICK_MASK;
+			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 			goto unlock;
 		}
 	}
@@ -10605,7 +10605,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 	 * setting the flag, we are sure to not clear the state and not
 	 * check the load of an idle cpu.
 	 */
-	WRITE_ONCE(nohz.has_blocked, 0);
+	if (flags & NOHZ_STATS_KICK)
+		WRITE_ONCE(nohz.has_blocked, 0);
 
 	/*
 	 * Ensures that if we miss the CPU, we must see the has_blocked
@@ -10627,13 +10628,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 		 * balancing owner will pick it up.
 		 */
 		if (need_resched()) {
-			has_blocked_load = true;
+			if (flags & NOHZ_STATS_KICK)
+				has_blocked_load = true;
 			goto abort;
 		}
 
 		rq = cpu_rq(balance_cpu);
 
-		has_blocked_load |= update_nohz_stats(rq);
+		if (flags & NOHZ_STATS_KICK)
+			has_blocked_load |= update_nohz_stats(rq);
 
 		/*
 		 * If time for next balance is due,
@@ -10664,8 +10667,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 	if (likely(update_next_balance))
 		nohz.next_balance = next_balance;
 
-	WRITE_ONCE(nohz.next_blocked,
-		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+	if (flags & NOHZ_STATS_KICK)
+		WRITE_ONCE(nohz.next_blocked,
+			   now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 
 abort:
 	/* There is still blocked load, enable periodic update */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3d3e579..1fec313 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2709,12 +2709,18 @@ extern void cfs_bandwidth_usage_dec(void);
 #define NOHZ_BALANCE_KICK_BIT	0
 #define NOHZ_STATS_KICK_BIT	1
 #define NOHZ_NEWILB_KICK_BIT	2
+#define NOHZ_NEXT_KICK_BIT	3
 
+/* Run rebalance_domains() */
 #define NOHZ_BALANCE_KICK	BIT(NOHZ_BALANCE_KICK_BIT)
+/* Update blocked load */
 #define NOHZ_STATS_KICK		BIT(NOHZ_STATS_KICK_BIT)
+/* Update blocked load when entering idle */
 #define NOHZ_NEWILB_KICK	BIT(NOHZ_NEWILB_KICK_BIT)
+/* Update nohz.next_balance */
+#define NOHZ_NEXT_KICK		BIT(NOHZ_NEXT_KICK_BIT)
 
-#define NOHZ_KICK_MASK	(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
+#define NOHZ_KICK_MASK	(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)
 
 #define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags)
 

^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2021-10-05 14:13 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-23 11:16 [PATCH v3 0/2] sched/fair: nohz.next_balance vs newly-idle CPUs Valentin Schneider
2021-08-23 11:16 ` [PATCH v3 1/2] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates Valentin Schneider
2021-08-23 11:59   ` Peter Zijlstra
2021-08-23 12:57     ` Valentin Schneider
2021-08-23 13:53       ` Dietmar Eggemann
2021-08-24  8:11         ` Peter Zijlstra
2021-08-24  9:08   ` Vincent Guittot
2021-09-09 11:18   ` [tip: sched/core] " tip-bot2 for Valentin Schneider
2021-10-05 14:12   ` tip-bot2 for Valentin Schneider
2021-08-23 11:17 ` [PATCH v3 2/2] sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle Valentin Schneider
2021-08-24  9:08   ` Vincent Guittot
2021-09-09 11:18   ` [tip: sched/core] " tip-bot2 for Valentin Schneider
2021-10-05 14:12   ` tip-bot2 for Valentin Schneider

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).