All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] sched/fair: Update nohz.next_balance for newly NOHZ-idle CPUs
@ 2021-07-14 11:39 Valentin Schneider
  2021-07-15  0:01   ` kernel test robot
                   ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Valentin Schneider @ 2021-07-14 11:39 UTC (permalink / raw)
  To: linux-kernel
  Cc: Peter Zijlstra, Ingo Molnar, Vincent Guittot, Dietmar Eggemann

Consider a system with some NOHZ-idle CPUs, such that

  nohz.idle_cpus_mask = S
  nohz.next_balance = T

When a new CPU k goes NOHZ idle (nohz_balance_enter_idle()), we end up
with:

  nohz.idle_cpus_mask = S ∪ {k}
  nohz.next_balance = T

Note that the nohz.next_balance hasn't changed - it won't be updated until
a NOHZ balance is triggered. This is problematic if the newly NOHZ idle CPU
has an earlier rq.next_balance than the other NOHZ idle CPUs, IOW if:

  cpu_rq(k).next_balance < nohz.next_balance

In such scenarios, the existing nohz.next_balance will prevent any NOHZ
balance from happening, which itself will prevent nohz.next_balance from
being updated to this new cpu_rq(k).next_balance. Unnecessary load balance
delays of over 12ms caused by this were observed on an arm64 RB5 board.

Track which CPUs are iterated over during a NOHZ idle balance with a new
cpumask. When considering whether to kick a NOHZ idle balance, use this
cpumask to determine if any CPU has entered NOHZ idle but hasn't had its
rq.next_balance collated into nohz.next_balance yet, and kick a NOHZ_STATS
balance if it is the case.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
---
 kernel/sched/core.c |  8 ++++++++
 kernel/sched/fair.c | 19 +++++++++++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0c22cd026440..1bc4cbc1f85e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8893,6 +8893,10 @@ static struct kmem_cache *task_group_cache __read_mostly;
 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
 DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
 
+#ifdef CONFIG_NO_HZ_COMMON
+DECLARE_PER_CPU(cpumask_var_t, nohz_balance_mask);
+#endif /* CONFIG_NO_HZ_COMMON */
+
 void __init sched_init(void)
 {
 	unsigned long ptr = 0;
@@ -8942,6 +8946,10 @@ void __init sched_init(void)
 			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
 		per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
 			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
+#ifdef CONFIG_NO_HZ_COMMON
+		per_cpu(nohz_balance_mask, i) = (cpumask_var_t)kzalloc_node(
+			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
+#endif /* CONFIG_NO_HZ_COMMON */
 	}
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 11d22943753f..497208a1afb8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5694,8 +5694,11 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
 
 #ifdef CONFIG_NO_HZ_COMMON
 
+DEFINE_PER_CPU(cpumask_var_t, nohz_balance_mask);
+
 static struct {
-	cpumask_var_t idle_cpus_mask;
+	cpumask_var_t idle_cpus_mask;    /* CPUs in NOHZ idle */
+	cpumask_var_t last_balance_mask; /* CPUs covered by last NOHZ balance */
 	atomic_t nr_cpus;
 	int has_blocked;		/* Idle CPUS has blocked load */
 	unsigned long next_balance;     /* in jiffy units */
@@ -10351,6 +10354,13 @@ static void nohz_balancer_kick(struct rq *rq)
 unlock:
 	rcu_read_unlock();
 out:
+	/*
+	 * Some CPUs have recently gone into NOHZ idle; kick a balance to
+	 * collate the proper next balance interval.
+	 */
+	if (!cpumask_subset(nohz.idle_cpus_mask, nohz.last_balance_mask))
+		flags |= NOHZ_STATS_KICK;
+
 	if (flags)
 		kick_ilb(flags);
 }
@@ -10487,6 +10497,7 @@ static bool update_nohz_stats(struct rq *rq)
 static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 			       enum cpu_idle_type idle)
 {
+	struct cpumask *cpus = this_cpu_cpumask_var_ptr(nohz_balance_mask);
 	/* Earliest time when we have to do rebalance again */
 	unsigned long now = jiffies;
 	unsigned long next_balance = now + 60*HZ;
@@ -10518,7 +10529,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 	 * Start with the next CPU after this_cpu so we will end with this_cpu and let a
 	 * chance for other idle cpu to pull load.
 	 */
-	for_each_cpu_wrap(balance_cpu,  nohz.idle_cpus_mask, this_cpu+1) {
+	cpumask_copy(cpus, nohz.idle_cpus_mask);
+	for_each_cpu_wrap(balance_cpu, cpus, this_cpu+1) {
 		if (!idle_cpu(balance_cpu))
 			continue;
 
@@ -10565,6 +10577,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 	if (likely(update_next_balance))
 		nohz.next_balance = next_balance;
 
+	cpumask_copy(nohz.last_balance_mask, cpus);
+
 	WRITE_ONCE(nohz.next_blocked,
 		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 
@@ -11550,6 +11564,7 @@ __init void init_sched_fair_class(void)
 	nohz.next_balance = jiffies;
 	nohz.next_blocked = jiffies;
 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+	zalloc_cpumask_var(&nohz.last_balance_mask, GFP_NOWAIT);
 #endif
 #endif /* SMP */
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2021-08-08 13:30 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-14 11:39 [PATCH] sched/fair: Update nohz.next_balance for newly NOHZ-idle CPUs Valentin Schneider
2021-07-15  0:01 ` kernel test robot
2021-07-15  0:01   ` kernel test robot
2021-07-15  0:02 ` [RFC PATCH] sched/fair: __pcpu_scope_nohz_balance_mask can be static kernel test robot
2021-07-15  0:02   ` kernel test robot
2021-07-15  7:42 ` [PATCH] sched/fair: Update nohz.next_balance for newly NOHZ-idle CPUs Vincent Guittot
2021-07-15 11:56   ` Valentin Schneider
2021-07-15 13:01     ` Vincent Guittot
2021-07-15 14:51       ` Valentin Schneider
2021-07-15 12:33 ` Dietmar Eggemann
2021-08-08 13:30 ` [sched/fair] cbd87e97ca: BUG:kernel_NULL_pointer_dereference,address kernel test robot
2021-08-08 13:30   ` [sched/fair] cbd87e97ca: BUG:kernel_NULL_pointer_dereference, address kernel test robot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.