linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] sched/fair: Update nohz.next_balance for newly NOHZ-idle CPUs
@ 2021-07-14 11:39 Valentin Schneider
  2021-07-15  0:01 ` kernel test robot
                   ` (4 more replies)
  0 siblings, 5 replies; 9+ messages in thread
From: Valentin Schneider @ 2021-07-14 11:39 UTC (permalink / raw)
  To: linux-kernel
  Cc: Peter Zijlstra, Ingo Molnar, Vincent Guittot, Dietmar Eggemann

Consider a system with some NOHZ-idle CPUs, such that

  nohz.idle_cpus_mask = S
  nohz.next_balance = T

When a new CPU k goes NOHZ idle (nohz_balance_enter_idle()), we end up
with:

  nohz.idle_cpus_mask = S ∪ {k}
  nohz.next_balance = T

Note that nohz.next_balance hasn't changed - it won't be updated until
a NOHZ balance is triggered. This is problematic if the newly NOHZ-idle CPU
has an earlier rq.next_balance than the other NOHZ-idle CPUs, IOW if:

  cpu_rq(k).next_balance < nohz.next_balance

In such scenarios, the existing nohz.next_balance will prevent any NOHZ
balance from happening, which itself will prevent nohz.next_balance from
being updated to this new cpu_rq(k).next_balance. Unnecessary load balance
delays of over 12ms caused by this were observed on an arm64 RB5 board.

Track which CPUs are iterated over during a NOHZ idle balance with a new
cpumask. When considering whether to kick a NOHZ idle balance, use this
cpumask to determine if any CPU has entered NOHZ idle but hasn't had its
rq.next_balance collated into nohz.next_balance yet, and kick a NOHZ_STATS
balance if it is the case.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
---
 kernel/sched/core.c |  8 ++++++++
 kernel/sched/fair.c | 19 +++++++++++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0c22cd026440..1bc4cbc1f85e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8893,6 +8893,10 @@ static struct kmem_cache *task_group_cache __read_mostly;
 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
 DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
 
+#ifdef CONFIG_NO_HZ_COMMON
+DECLARE_PER_CPU(cpumask_var_t, nohz_balance_mask);
+#endif /* CONFIG_NO_HZ_COMMON */
+
 void __init sched_init(void)
 {
 	unsigned long ptr = 0;
@@ -8942,6 +8946,10 @@ void __init sched_init(void)
 			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
 		per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
 			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
+#ifdef CONFIG_NO_HZ_COMMON
+		per_cpu(nohz_balance_mask, i) = (cpumask_var_t)kzalloc_node(
+			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
+#endif /* CONFIG_NO_HZ_COMMON */
 	}
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 11d22943753f..497208a1afb8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5694,8 +5694,11 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
 
 #ifdef CONFIG_NO_HZ_COMMON
 
+DEFINE_PER_CPU(cpumask_var_t, nohz_balance_mask);
+
 static struct {
-	cpumask_var_t idle_cpus_mask;
+	cpumask_var_t idle_cpus_mask;    /* CPUs in NOHZ idle */
+	cpumask_var_t last_balance_mask; /* CPUs covered by last NOHZ balance */
 	atomic_t nr_cpus;
 	int has_blocked;		/* Idle CPUS has blocked load */
 	unsigned long next_balance;     /* in jiffy units */
@@ -10351,6 +10354,13 @@ static void nohz_balancer_kick(struct rq *rq)
 unlock:
 	rcu_read_unlock();
 out:
+	/*
+	 * Some CPUs have recently gone into NOHZ idle; kick a balance to
+	 * collate the proper next balance interval.
+	 */
+	if (!cpumask_subset(nohz.idle_cpus_mask, nohz.last_balance_mask))
+		flags |= NOHZ_STATS_KICK;
+
 	if (flags)
 		kick_ilb(flags);
 }
@@ -10487,6 +10497,7 @@ static bool update_nohz_stats(struct rq *rq)
 static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 			       enum cpu_idle_type idle)
 {
+	struct cpumask *cpus = this_cpu_cpumask_var_ptr(nohz_balance_mask);
 	/* Earliest time when we have to do rebalance again */
 	unsigned long now = jiffies;
 	unsigned long next_balance = now + 60*HZ;
@@ -10518,7 +10529,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 	 * Start with the next CPU after this_cpu so we will end with this_cpu and let a
 	 * chance for other idle cpu to pull load.
 	 */
-	for_each_cpu_wrap(balance_cpu,  nohz.idle_cpus_mask, this_cpu+1) {
+	cpumask_copy(cpus, nohz.idle_cpus_mask);
+	for_each_cpu_wrap(balance_cpu, cpus, this_cpu+1) {
 		if (!idle_cpu(balance_cpu))
 			continue;
 
@@ -10565,6 +10577,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 	if (likely(update_next_balance))
 		nohz.next_balance = next_balance;
 
+	cpumask_copy(nohz.last_balance_mask, cpus);
+
 	WRITE_ONCE(nohz.next_blocked,
 		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 
@@ -11550,6 +11564,7 @@ __init void init_sched_fair_class(void)
 	nohz.next_balance = jiffies;
 	nohz.next_blocked = jiffies;
 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+	zalloc_cpumask_var(&nohz.last_balance_mask, GFP_NOWAIT);
 #endif
 #endif /* SMP */
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-08-08 13:14 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-14 11:39 [PATCH] sched/fair: Update nohz.next_balance for newly NOHZ-idle CPUs Valentin Schneider
2021-07-15  0:01 ` kernel test robot
2021-07-15  0:02 ` [RFC PATCH] sched/fair: __pcpu_scope_nohz_balance_mask can be static kernel test robot
2021-07-15  7:42 ` [PATCH] sched/fair: Update nohz.next_balance for newly NOHZ-idle CPUs Vincent Guittot
2021-07-15 11:56   ` Valentin Schneider
2021-07-15 13:01     ` Vincent Guittot
2021-07-15 14:51       ` Valentin Schneider
2021-07-15 12:33 ` Dietmar Eggemann
2021-08-08 13:30 ` [sched/fair] cbd87e97ca: BUG:kernel_NULL_pointer_dereference,address kernel test robot

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).