[RFC PATCH 0/5] sched: On remote stats updates..

All of lore.kernel.org
 help / color / mirror / Atom feed

* [RFC PATCH 0/5] sched: On remote stats updates..
@ 2017-12-21 10:21 Peter Zijlstra
  2017-12-21 10:21 ` [RFC PATCH 1/5] sched: Convert nohz_flags to atomic_t Peter Zijlstra
                   ` (4 more replies)
  0 siblings, 5 replies; 56+ messages in thread
From: Peter Zijlstra @ 2017-12-21 10:21 UTC (permalink / raw)
  To: mingo, linux-kernel
  Cc: brendan.jackman, vincent.guittot, dietmar.eggemann, peterz,
	morten.rasmussen

This is what I hacked together this morning, it compiles.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* [RFC PATCH 1/5] sched: Convert nohz_flags to atomic_t
  2017-12-21 10:21 [RFC PATCH 0/5] sched: On remote stats updates Peter Zijlstra
@ 2017-12-21 10:21 ` Peter Zijlstra
  2017-12-21 10:21 ` [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK Peter Zijlstra
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 56+ messages in thread
From: Peter Zijlstra @ 2017-12-21 10:21 UTC (permalink / raw)
  To: mingo, linux-kernel
  Cc: brendan.jackman, vincent.guittot, dietmar.eggemann, peterz,
	morten.rasmussen

[-- Attachment #1: peterz-sched-nohz-atomic.patch --]
[-- Type: text/plain, Size: 4225 bytes --]



Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |    6 +++---
 kernel/sched/fair.c  |   23 +++++++++++++++--------
 kernel/sched/sched.h |   11 ++++++-----
 3 files changed, 24 insertions(+), 16 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -608,7 +608,7 @@ static inline bool got_nohz_idle_kick(vo
 {
 	int cpu = smp_processor_id();
 
-	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+	if (!(atomic_read(nohz_flags(cpu)) & NOHZ_BALANCE_KICK))
 		return false;
 
 	if (idle_cpu(cpu) && !need_resched())
@@ -618,7 +618,7 @@ static inline bool got_nohz_idle_kick(vo
 	 * We can't run Idle Load Balance on this CPU for this time so we
 	 * cancel it and clear NOHZ_BALANCE_KICK
 	 */
-	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+	atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(cpu));
 	return false;
 }
 
@@ -6002,7 +6002,7 @@ void __init sched_init(void)
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ_COMMON
 		rq->last_load_update_tick = jiffies;
-		rq->nohz_flags = 0;
+		atomic_set(&rq->nohz_flags, 0);
 #endif
 #ifdef CONFIG_NO_HZ_FULL
 		rq->last_sched_tick = 0;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8991,6 +8991,7 @@ static inline int find_new_ilb(void)
  */
 static void nohz_balancer_kick(void)
 {
+	unsigned int flags;
 	int ilb_cpu;
 
 	nohz.next_balance++;
@@ -9000,7 +9001,8 @@ static void nohz_balancer_kick(void)
 	if (ilb_cpu >= nr_cpu_ids)
 		return;
 
-	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
+	flags = atomic_fetch_or(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu));
+	if (flags & NOHZ_BALANCE_KICK)
 		return;
 	/*
 	 * Use smp_send_reschedule() instead of resched_cpu().
@@ -9014,7 +9016,9 @@ static void nohz_balancer_kick(void)
 
 void nohz_balance_exit_idle(unsigned int cpu)
 {
-	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
+	unsigned int flags = atomic_read(nohz_flags(cpu));
+
+	if (unlikely(flags & NOHZ_TICK_STOPPED)) {
 		/*
 		 * Completely isolated CPUs don't ever set, so we must test.
 		 */
@@ -9022,7 +9026,8 @@ void nohz_balance_exit_idle(unsigned int
 			cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
 			atomic_dec(&nohz.nr_cpus);
 		}
-		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+
+		atomic_andnot(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 	}
 }
 
@@ -9076,7 +9081,7 @@ void nohz_balance_enter_idle(int cpu)
 	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
 		return;
 
-	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
+	if (atomic_read(nohz_flags(cpu)) & NOHZ_TICK_STOPPED)
 		return;
 
 	/*
@@ -9087,7 +9092,7 @@ void nohz_balance_enter_idle(int cpu)
 
 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 	atomic_inc(&nohz.nr_cpus);
-	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+	atomic_or(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 }
 #endif
 
@@ -9225,8 +9230,10 @@ static void nohz_idle_balance(struct rq
 	unsigned long next_balance = jiffies + 60*HZ;
 	int update_next_balance = 0;
 
-	if (idle != CPU_IDLE ||
-	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
+	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_BALANCE_KICK))
+		return;
+
+	if (idle != CPU_IDLE)
 		goto end;
 
 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
@@ -9272,7 +9279,7 @@ static void nohz_idle_balance(struct rq
 	if (likely(update_next_balance))
 		nohz.next_balance = next_balance;
 end:
-	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
+	atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
 }
 
 /*
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -723,7 +723,7 @@ struct rq {
 #ifdef CONFIG_SMP
 	unsigned long last_load_update_tick;
 #endif /* CONFIG_SMP */
-	unsigned long nohz_flags;
+	atomic_t nohz_flags;
 #endif /* CONFIG_NO_HZ_COMMON */
 #ifdef CONFIG_NO_HZ_FULL
 	unsigned long last_sched_tick;
@@ -2003,10 +2003,11 @@ extern void cfs_bandwidth_usage_inc(void
 extern void cfs_bandwidth_usage_dec(void);
 
 #ifdef CONFIG_NO_HZ_COMMON
-enum rq_nohz_flag_bits {
-	NOHZ_TICK_STOPPED,
-	NOHZ_BALANCE_KICK,
-};
+#define NOHZ_TICK_STOPPED_BIT	0
+#define NOHZ_BALANCE_KICK_BIT	1
+
+#define NOHZ_TICK_STOPPED	BIT(NOHZ_TICK_STOPPED_BIT)
+#define NOHZ_BALANCE_KICK	BIT(NOHZ_BALANCE_KICK_BIT)
 
 #define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags)
 

^ permalink raw reply	[flat|nested] 56+ messages in thread

* [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2017-12-21 10:21 [RFC PATCH 0/5] sched: On remote stats updates Peter Zijlstra
  2017-12-21 10:21 ` [RFC PATCH 1/5] sched: Convert nohz_flags to atomic_t Peter Zijlstra
@ 2017-12-21 10:21 ` Peter Zijlstra
  2017-12-21 16:23   ` Vincent Guittot
  2017-12-21 10:21 ` [RFC PATCH 3/5] sched: Restructure nohz_balance_kick Peter Zijlstra
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 56+ messages in thread
From: Peter Zijlstra @ 2017-12-21 10:21 UTC (permalink / raw)
  To: mingo, linux-kernel
  Cc: brendan.jackman, vincent.guittot, dietmar.eggemann, peterz,
	morten.rasmussen

[-- Attachment #1: peterz-sched-nohz-stats.patch --]
[-- Type: text/plain, Size: 4866 bytes --]


Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |    4 +--
 kernel/sched/fair.c  |   52 ++++++++++++++++++++++++++++++++++-----------------
 kernel/sched/sched.h |    4 +++
 3 files changed, 41 insertions(+), 19 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -608,7 +608,7 @@ static inline bool got_nohz_idle_kick(vo
 {
 	int cpu = smp_processor_id();
 
-	if (!(atomic_read(nohz_flags(cpu)) & NOHZ_BALANCE_KICK))
+	if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
 		return false;
 
 	if (idle_cpu(cpu) && !need_resched())
@@ -618,7 +618,7 @@ static inline bool got_nohz_idle_kick(vo
 	 * We can't run Idle Load Balance on this CPU for this time so we
 	 * cancel it and clear NOHZ_BALANCE_KICK
 	 */
-	atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+	atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
 	return false;
 }
 
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9001,8 +9001,8 @@ static void nohz_balancer_kick(void)
 	if (ilb_cpu >= nr_cpu_ids)
 		return;
 
-	flags = atomic_fetch_or(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu));
-	if (flags & NOHZ_BALANCE_KICK)
+	flags = atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(ilb_cpu));
+	if (flags & NOHZ_KICK_MASK)
 		return;
 	/*
 	 * Use smp_send_reschedule() instead of resched_cpu().
@@ -9125,8 +9125,6 @@ static void rebalance_domains(struct rq
 	int need_serialize, need_decay = 0;
 	u64 max_cost = 0;
 
-	update_blocked_averages(cpu);
-
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		/*
@@ -9221,20 +9219,27 @@ static void rebalance_domains(struct rq
  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
  * rebalancing for all the cpus for whom scheduler ticks are stopped.
  */
-static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 {
-	int this_cpu = this_rq->cpu;
-	struct rq *rq;
-	int balance_cpu;
 	/* Earliest time when we have to do rebalance again */
 	unsigned long next_balance = jiffies + 60*HZ;
 	int update_next_balance = 0;
+	int this_cpu = this_rq->cpu;
+	unsigned int flags;
+	int balance_cpu;
+	struct rq *rq;
 
-	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_BALANCE_KICK))
-		return;
+	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+		return false;
 
-	if (idle != CPU_IDLE)
-		goto end;
+	if (idle != CPU_IDLE) {
+		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+		return false;
+	}
+
+	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+
+	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
 
 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
@@ -9262,7 +9267,9 @@ static void nohz_idle_balance(struct rq
 			cpu_load_update_idle(rq);
 			rq_unlock_irq(rq, &rf);
 
-			rebalance_domains(rq, CPU_IDLE);
+			update_blocked_averages(rq->cpu);
+			if (flags & NOHZ_BALANCE_KICK)
+				rebalance_domains(rq, CPU_IDLE);
 		}
 
 		if (time_after(next_balance, rq->next_balance)) {
@@ -9271,6 +9278,10 @@ static void nohz_idle_balance(struct rq
 		}
 	}
 
+	update_blocked_averages(this_cpu);
+	if (flags & NOHZ_BALANCE_KICK)
+		rebalance_domains(this_rq, CPU_IDLE);
+
 	/*
 	 * next_balance will be updated only when there is a need.
 	 * When the CPU is attached to null domain for ex, it will not be
@@ -9278,8 +9289,8 @@ static void nohz_idle_balance(struct rq
 	 */
 	if (likely(update_next_balance))
 		nohz.next_balance = next_balance;
-end:
-	atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
+
+	return true;
 }
 
 /*
@@ -9366,7 +9377,10 @@ static inline bool nohz_kick_needed(stru
 	return kick;
 }
 #else
-static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+	return false;
+}
 #endif
 
 /*
@@ -9387,7 +9401,11 @@ static __latent_entropy void run_rebalan
 	 * load balance only within the local sched_domain hierarchy
 	 * and abort nohz_idle_balance altogether if we pull some load.
 	 */
-	nohz_idle_balance(this_rq, idle);
+	if (nohz_idle_balance(this_rq, idle))
+		return;
+
+	/* normal load balance */
+	update_blocked_averages(this_rq->cpu);
 	rebalance_domains(this_rq, idle);
 }
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2005,9 +2005,13 @@ extern void cfs_bandwidth_usage_dec(void
 #ifdef CONFIG_NO_HZ_COMMON
 #define NOHZ_TICK_STOPPED_BIT	0
 #define NOHZ_BALANCE_KICK_BIT	1
+#define NOHZ_STATS_KICK_BIT	2
 
 #define NOHZ_TICK_STOPPED	BIT(NOHZ_TICK_STOPPED_BIT)
 #define NOHZ_BALANCE_KICK	BIT(NOHZ_BALANCE_KICK_BIT)
+#define NOHZ_STATS_KICK		BIT(NOHZ_STATS_KICK_BIT)
+
+#define NOHZ_KICK_MASK	(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
 
 #define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags)
 

^ permalink raw reply	[flat|nested] 56+ messages in thread

* [RFC PATCH 3/5] sched: Restructure nohz_balance_kick
  2017-12-21 10:21 [RFC PATCH 0/5] sched: On remote stats updates Peter Zijlstra
  2017-12-21 10:21 ` [RFC PATCH 1/5] sched: Convert nohz_flags to atomic_t Peter Zijlstra
  2017-12-21 10:21 ` [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK Peter Zijlstra
@ 2017-12-21 10:21 ` Peter Zijlstra
  2017-12-21 10:21 ` [RFC PATCH 4/5] sched: Add nohz stats balancing Peter Zijlstra
  2017-12-21 10:21 ` [RFC PATCH 5/5] sched: Update blocked load from NEWIDLE Peter Zijlstra
  4 siblings, 0 replies; 56+ messages in thread
From: Peter Zijlstra @ 2017-12-21 10:21 UTC (permalink / raw)
  To: mingo, linux-kernel
  Cc: brendan.jackman, vincent.guittot, dietmar.eggemann, peterz,
	morten.rasmussen

[-- Attachment #1: peterz-sched-balance_prod.patch --]
[-- Type: text/plain, Size: 6842 bytes --]



Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/fair.c |  218 ++++++++++++++++++++++++++--------------------------
 1 file changed, 111 insertions(+), 107 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8984,12 +8984,29 @@ static inline int find_new_ilb(void)
 	return nr_cpu_ids;
 }
 
+static inline void set_cpu_sd_state_busy(void)
+{
+	struct sched_domain *sd;
+	int cpu = smp_processor_id();
+
+	rcu_read_lock();
+	sd = rcu_dereference(per_cpu(sd_llc, cpu));
+
+	if (!sd || !sd->nohz_idle)
+		goto unlock;
+	sd->nohz_idle = 0;
+
+	atomic_inc(&sd->shared->nr_busy_cpus);
+unlock:
+	rcu_read_unlock();
+}
+
 /*
  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
  * CPU (if there is one).
  */
-static void nohz_balancer_kick(void)
+static void kick_ilb(void)
 {
 	unsigned int flags;
 	int ilb_cpu;
@@ -9004,6 +9021,7 @@ static void nohz_balancer_kick(void)
 	flags = atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(ilb_cpu));
 	if (flags & NOHZ_KICK_MASK)
 		return;
+
 	/*
 	 * Use smp_send_reschedule() instead of resched_cpu().
 	 * This way we generate a sched IPI on the target cpu which
@@ -9011,7 +9029,94 @@ static void nohz_balancer_kick(void)
 	 * will be run before returning from the IPI.
 	 */
 	smp_send_reschedule(ilb_cpu);
-	return;
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer in the presence
+ * of an idle cpu in the system.
+ *   - This rq has more than one task.
+ *   - This rq has at least one CFS task and the capacity of the CPU is
+ *     significantly reduced because of RT tasks or IRQs.
+ *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
+ *     multiple busy cpu.
+ *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
+ *     domain span are idle.
+ */
+static void nohz_balancer_kick(struct rq *rq)
+{
+	unsigned long now = jiffies;
+	struct sched_domain_shared *sds;
+	struct sched_domain *sd;
+	int nr_busy, i, cpu = rq->cpu;
+	bool kick = false;
+
+	if (unlikely(rq->idle_balance))
+		return;
+
+       /*
+	* We may be recently in ticked or tickless idle mode. At the first
+	* busy tick after returning from idle, we will update the busy stats.
+	*/
+	set_cpu_sd_state_busy();
+	nohz_balance_exit_idle(cpu);
+
+	/*
+	 * None are in tickless mode and hence no need for NOHZ idle load
+	 * balancing.
+	 */
+	if (likely(!atomic_read(&nohz.nr_cpus)))
+		return;
+
+	if (time_before(now, nohz.next_balance))
+		return;
+
+	if (rq->nr_running >= 2) {
+		kick = true;
+		goto out;
+	}
+
+	rcu_read_lock();
+	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+	if (sds) {
+		/*
+		 * XXX: write a coherent comment on why we do this.
+		 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
+		 */
+		nr_busy = atomic_read(&sds->nr_busy_cpus);
+		if (nr_busy > 1) {
+			kick = true;
+			goto unlock;
+		}
+
+	}
+
+	sd = rcu_dereference(rq->sd);
+	if (sd) {
+		if ((rq->cfs.h_nr_running >= 1) &&
+				check_cpu_capacity(rq, sd)) {
+			kick = true;
+			goto unlock;
+		}
+	}
+
+	sd = rcu_dereference(per_cpu(sd_asym, cpu));
+	if (sd) {
+		for_each_cpu(i, sched_domain_span(sd)) {
+			if (i == cpu ||
+			    !cpumask_test_cpu(i, nohz.idle_cpus_mask))
+				continue;
+
+			if (sched_asym_prefer(i, cpu)) {
+				kick = true;
+				goto unlock;
+			}
+		}
+	}
+unlock:
+	rcu_read_unlock();
+out:
+	if (kick)
+		kick_ilb();
 }
 
 void nohz_balance_exit_idle(unsigned int cpu)
@@ -9031,23 +9136,6 @@ void nohz_balance_exit_idle(unsigned int
 	}
 }
 
-static inline void set_cpu_sd_state_busy(void)
-{
-	struct sched_domain *sd;
-	int cpu = smp_processor_id();
-
-	rcu_read_lock();
-	sd = rcu_dereference(per_cpu(sd_llc, cpu));
-
-	if (!sd || !sd->nohz_idle)
-		goto unlock;
-	sd->nohz_idle = 0;
-
-	atomic_inc(&sd->shared->nr_busy_cpus);
-unlock:
-	rcu_read_unlock();
-}
-
 void set_cpu_sd_state_idle(void)
 {
 	struct sched_domain *sd;
@@ -9094,6 +9182,8 @@ void nohz_balance_enter_idle(int cpu)
 	atomic_inc(&nohz.nr_cpus);
 	atomic_or(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 }
+#else
+static inline void nohz_balancer_kick(struct rq *rq) { }
 #endif
 
 static DEFINE_SPINLOCK(balancing);
@@ -9291,90 +9381,6 @@ static bool nohz_idle_balance(struct rq
 
 	return true;
 }
-
-/*
- * Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu in the system.
- *   - This rq has more than one task.
- *   - This rq has at least one CFS task and the capacity of the CPU is
- *     significantly reduced because of RT tasks or IRQs.
- *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
- *     multiple busy cpu.
- *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
- *     domain span are idle.
- */
-static inline bool nohz_kick_needed(struct rq *rq)
-{
-	unsigned long now = jiffies;
-	struct sched_domain_shared *sds;
-	struct sched_domain *sd;
-	int nr_busy, i, cpu = rq->cpu;
-	bool kick = false;
-
-	if (unlikely(rq->idle_balance))
-		return false;
-
-       /*
-	* We may be recently in ticked or tickless idle mode. At the first
-	* busy tick after returning from idle, we will update the busy stats.
-	*/
-	set_cpu_sd_state_busy();
-	nohz_balance_exit_idle(cpu);
-
-	/*
-	 * None are in tickless mode and hence no need for NOHZ idle load
-	 * balancing.
-	 */
-	if (likely(!atomic_read(&nohz.nr_cpus)))
-		return false;
-
-	if (time_before(now, nohz.next_balance))
-		return false;
-
-	if (rq->nr_running >= 2)
-		return true;
-
-	rcu_read_lock();
-	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-	if (sds) {
-		/*
-		 * XXX: write a coherent comment on why we do this.
-		 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
-		 */
-		nr_busy = atomic_read(&sds->nr_busy_cpus);
-		if (nr_busy > 1) {
-			kick = true;
-			goto unlock;
-		}
-
-	}
-
-	sd = rcu_dereference(rq->sd);
-	if (sd) {
-		if ((rq->cfs.h_nr_running >= 1) &&
-				check_cpu_capacity(rq, sd)) {
-			kick = true;
-			goto unlock;
-		}
-	}
-
-	sd = rcu_dereference(per_cpu(sd_asym, cpu));
-	if (sd) {
-		for_each_cpu(i, sched_domain_span(sd)) {
-			if (i == cpu ||
-			    !cpumask_test_cpu(i, nohz.idle_cpus_mask))
-				continue;
-
-			if (sched_asym_prefer(i, cpu)) {
-				kick = true;
-				goto unlock;
-			}
-		}
-	}
-unlock:
-	rcu_read_unlock();
-	return kick;
-}
 #else
 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 {
@@ -9419,10 +9425,8 @@ void trigger_load_balance(struct rq *rq)
 
 	if (time_after_eq(jiffies, rq->next_balance))
 		raise_softirq(SCHED_SOFTIRQ);
-#ifdef CONFIG_NO_HZ_COMMON
-	if (nohz_kick_needed(rq))
-		nohz_balancer_kick();
-#endif
+
+	nohz_balancer_kick(rq);
 }
 
 static void rq_online_fair(struct rq *rq)

^ permalink raw reply	[flat|nested] 56+ messages in thread

* [RFC PATCH 4/5] sched: Add nohz stats balancing
  2017-12-21 10:21 [RFC PATCH 0/5] sched: On remote stats updates Peter Zijlstra
                   ` (2 preceding siblings ...)
  2017-12-21 10:21 ` [RFC PATCH 3/5] sched: Restructure nohz_balance_kick Peter Zijlstra
@ 2017-12-21 10:21 ` Peter Zijlstra
  2017-12-21 10:21 ` [RFC PATCH 5/5] sched: Update blocked load from NEWIDLE Peter Zijlstra
  4 siblings, 0 replies; 56+ messages in thread
From: Peter Zijlstra @ 2017-12-21 10:21 UTC (permalink / raw)
  To: mingo, linux-kernel
  Cc: brendan.jackman, vincent.guittot, dietmar.eggemann, peterz,
	morten.rasmussen

[-- Attachment #1: peterz-sched-nohz-stat-balance.patch --]
[-- Type: text/plain, Size: 3305 bytes --]


Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/fair.c |   31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8972,6 +8972,7 @@ static struct {
 	cpumask_var_t idle_cpus_mask;
 	atomic_t nr_cpus;
 	unsigned long next_balance;     /* in jiffy units */
+	unsigned long next_stats;
 } nohz ____cacheline_aligned;
 
 static inline int find_new_ilb(void)
@@ -9006,9 +9007,8 @@ static inline void set_cpu_sd_state_busy
  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
  * CPU (if there is one).
  */
-static void kick_ilb(void)
+static void kick_ilb(unsigned int flags)
 {
-	unsigned int flags;
 	int ilb_cpu;
 
 	nohz.next_balance++;
@@ -9018,7 +9018,7 @@ static void kick_ilb(void)
 	if (ilb_cpu >= nr_cpu_ids)
 		return;
 
-	flags = atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(ilb_cpu));
+	flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
 	if (flags & NOHZ_KICK_MASK)
 		return;
 
@@ -9048,7 +9048,7 @@ static void nohz_balancer_kick(struct rq
 	struct sched_domain_shared *sds;
 	struct sched_domain *sd;
 	int nr_busy, i, cpu = rq->cpu;
-	bool kick = false;
+	unsigned int flags = 0;
 
 	if (unlikely(rq->idle_balance))
 		return;
@@ -9067,11 +9067,14 @@ static void nohz_balancer_kick(struct rq
 	if (likely(!atomic_read(&nohz.nr_cpus)))
 		return;
 
+	if (time_after(now, nohz.next_stats))
+		flags = NOHZ_STATS_KICK;
+
 	if (time_before(now, nohz.next_balance))
-		return;
+		goto out;
 
 	if (rq->nr_running >= 2) {
-		kick = true;
+		flags = NOHZ_KICK_MASK;
 		goto out;
 	}
 
@@ -9084,7 +9087,7 @@ static void nohz_balancer_kick(struct rq
 		 */
 		nr_busy = atomic_read(&sds->nr_busy_cpus);
 		if (nr_busy > 1) {
-			kick = true;
+			flags = NOHZ_KICK_MASK
 			goto unlock;
 		}
 
@@ -9094,7 +9097,7 @@ static void nohz_balancer_kick(struct rq
 	if (sd) {
 		if ((rq->cfs.h_nr_running >= 1) &&
 				check_cpu_capacity(rq, sd)) {
-			kick = true;
+			flags = NOHZ_KICK_MASK;
 			goto unlock;
 		}
 	}
@@ -9107,7 +9110,7 @@ static void nohz_balancer_kick(struct rq
 				continue;
 
 			if (sched_asym_prefer(i, cpu)) {
-				kick = true;
+				flags = NOHZ_KICK_MASK;
 				goto unlock;
 			}
 		}
@@ -9115,8 +9118,8 @@ static void nohz_balancer_kick(struct rq
 unlock:
 	rcu_read_unlock();
 out:
-	if (kick)
-		kick_ilb();
+	if (flags)
+		kick_ilb(flags);
 }
 
 void nohz_balance_exit_idle(unsigned int cpu)
@@ -9312,7 +9315,9 @@ static void rebalance_domains(struct rq
 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 {
 	/* Earliest time when we have to do rebalance again */
-	unsigned long next_balance = jiffies + 60*HZ;
+	unsigned long now = jiffies;
+	unsigned long next_balance = now + 60*HZ;
+	unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
 	int update_next_balance = 0;
 	int this_cpu = this_rq->cpu;
 	unsigned int flags;
@@ -9372,6 +9377,8 @@ static bool nohz_idle_balance(struct rq
 	if (flags & NOHZ_BALANCE_KICK)
 		rebalance_domains(this_rq, CPU_IDLE);
 
+	nohz.next_stats = next_stats;
+
 	/*
 	 * next_balance will be updated only when there is a need.
 	 * When the CPU is attached to null domain for ex, it will not be

^ permalink raw reply	[flat|nested] 56+ messages in thread

* [RFC PATCH 5/5] sched: Update blocked load from NEWIDLE
  2017-12-21 10:21 [RFC PATCH 0/5] sched: On remote stats updates Peter Zijlstra
                   ` (3 preceding siblings ...)
  2017-12-21 10:21 ` [RFC PATCH 4/5] sched: Add nohz stats balancing Peter Zijlstra
@ 2017-12-21 10:21 ` Peter Zijlstra
  4 siblings, 0 replies; 56+ messages in thread
From: Peter Zijlstra @ 2017-12-21 10:21 UTC (permalink / raw)
  To: mingo, linux-kernel
  Cc: brendan.jackman, vincent.guittot, dietmar.eggemann, peterz,
	morten.rasmussen

[-- Attachment #1: peterz-sched-nohz-newidle.patch --]
[-- Type: text/plain, Size: 3751 bytes --]


Suggested-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |    1 +
 kernel/sched/fair.c  |   47 +++++++++++++++++++++++++++++++++++++++++------
 kernel/sched/sched.h |    1 +
 3 files changed, 43 insertions(+), 6 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6002,6 +6002,7 @@ void __init sched_init(void)
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ_COMMON
 		rq->last_load_update_tick = jiffies;
+		rq->last_blocked_load_update_tick = jiffies;
 		atomic_set(&rq->nohz_flags, 0);
 #endif
 #ifdef CONFIG_NO_HZ_FULL
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5379,6 +5379,14 @@ decay_load_missed(unsigned long load, un
 	}
 	return load;
 }
+
+static struct {
+	cpumask_var_t idle_cpus_mask;
+	atomic_t nr_cpus;
+	unsigned long next_balance;     /* in jiffy units */
+	unsigned long next_stats;
+} nohz ____cacheline_aligned;
+
 #endif /* CONFIG_NO_HZ_COMMON */
 
 /**
@@ -6942,6 +6950,7 @@ enum fbq_type { regular, remote, all };
 #define LBF_NEED_BREAK	0x02
 #define LBF_DST_PINNED  0x04
 #define LBF_SOME_PINNED	0x08
+#define LBF_NOHZ_STATS	0x10
 
 struct lb_env {
 	struct sched_domain	*sd;
@@ -7380,6 +7389,10 @@ static void update_blocked_averages(int
 		if (cfs_rq_is_decayed(cfs_rq))
 			list_del_leaf_cfs_rq(cfs_rq);
 	}
+
+#ifdef CONFIG_NO_HZ_COMMON
+	rq->last_blocked_load_update_tick = jiffies;
+#endif
 	rq_unlock_irqrestore(rq, &rf);
 }
 
@@ -7439,6 +7452,9 @@ static inline void update_blocked_averag
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+#ifdef CONFIG_NO_HZ_COMMON
+	rq->last_blocked_load_update_tick = jiffies;
+#endif
 	rq_unlock_irqrestore(rq, &rf);
 }
 
@@ -7773,6 +7789,19 @@ group_type group_classify(struct sched_g
 	return group_other;
 }
 
+static void update_nohz_stats(struct rq *rq)
+{
+	unsigned int cpu = rq->cpu;
+
+	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+		return;
+
+	if (!time_after(jiffies, rq->last_blocked_load_update_tick))
+		return;
+
+	update_blocked_averages(cpu);
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -7795,6 +7824,9 @@ static inline void update_sg_lb_stats(st
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
+		if (env->flags & LBF_NOHZ_STATS)
+			update_nohz_stats(rq);
+
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
 			load = target_load(i, load_idx);
@@ -7950,6 +7982,15 @@ static inline void update_sd_lb_stats(st
 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;
 
+#ifdef CONFIG_NO_HZ_COMMON
+	if (env->idle == CPU_NEWLY_IDLE) {
+		env->flags |= LBF_NOHZ_STATS;
+
+		if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
+			nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
+	}
+#endif
+
 	load_idx = get_sd_load_idx(env->sd, env->idle);
 
 	do {
@@ -8968,12 +9009,6 @@ static inline int on_null_domain(struct
  *   needed, they will kick the idle load balancer, which then does idle
  *   load balancing for all the idle CPUs.
  */
-static struct {
-	cpumask_var_t idle_cpus_mask;
-	atomic_t nr_cpus;
-	unsigned long next_balance;     /* in jiffy units */
-	unsigned long next_stats;
-} nohz ____cacheline_aligned;
 
 static inline int find_new_ilb(void)
 {
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -722,6 +722,7 @@ struct rq {
 #ifdef CONFIG_NO_HZ_COMMON
 #ifdef CONFIG_SMP
 	unsigned long last_load_update_tick;
+	unsigned long last_blocked_load_update_tick;
 #endif /* CONFIG_SMP */
 	atomic_t nohz_flags;
 #endif /* CONFIG_NO_HZ_COMMON */

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2017-12-21 10:21 ` [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK Peter Zijlstra
@ 2017-12-21 16:23   ` Vincent Guittot
  2017-12-21 16:56     ` Vincent Guittot
  2017-12-22  7:56     ` Peter Zijlstra
  0 siblings, 2 replies; 56+ messages in thread
From: Vincent Guittot @ 2017-12-21 16:23 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

Hi Peter,

I think that part of the proposal is missing.

One goal of the patchset was to kick an update of the stats of idle
cpu when a task wake up on a cpu but the statistic has not been
updated for a while.

That's why there where a call to nohz_kick_needed in the proposal to
kick ilb but only for updating blocked load and not a full idle load
balance

I can't find this call any more in your patchset

On 21 December 2017 at 11:21, Peter Zijlstra <peterz@infradead.org> wrote:
>
> Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/sched/core.c  |    4 +--
>  kernel/sched/fair.c  |   52 ++++++++++++++++++++++++++++++++++-----------------
>  kernel/sched/sched.h |    4 +++
>  3 files changed, 41 insertions(+), 19 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -608,7 +608,7 @@ static inline bool got_nohz_idle_kick(vo
>  {
>         int cpu = smp_processor_id();
>
> -       if (!(atomic_read(nohz_flags(cpu)) & NOHZ_BALANCE_KICK))
> +       if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
>                 return false;
>
>         if (idle_cpu(cpu) && !need_resched())
> @@ -618,7 +618,7 @@ static inline bool got_nohz_idle_kick(vo
>          * We can't run Idle Load Balance on this CPU for this time so we
>          * cancel it and clear NOHZ_BALANCE_KICK
>          */
> -       atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(cpu));
> +       atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
>         return false;
>  }
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9001,8 +9001,8 @@ static void nohz_balancer_kick(void)
>         if (ilb_cpu >= nr_cpu_ids)
>                 return;
>
> -       flags = atomic_fetch_or(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu));
> -       if (flags & NOHZ_BALANCE_KICK)
> +       flags = atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(ilb_cpu));
> +       if (flags & NOHZ_KICK_MASK)
>                 return;
>         /*
>          * Use smp_send_reschedule() instead of resched_cpu().
> @@ -9125,8 +9125,6 @@ static void rebalance_domains(struct rq
>         int need_serialize, need_decay = 0;
>         u64 max_cost = 0;
>
> -       update_blocked_averages(cpu);
> -
>         rcu_read_lock();
>         for_each_domain(cpu, sd) {
>                 /*
> @@ -9221,20 +9219,27 @@ static void rebalance_domains(struct rq
>   * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>   * rebalancing for all the cpus for whom scheduler ticks are stopped.
>   */
> -static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  {
> -       int this_cpu = this_rq->cpu;
> -       struct rq *rq;
> -       int balance_cpu;
>         /* Earliest time when we have to do rebalance again */
>         unsigned long next_balance = jiffies + 60*HZ;
>         int update_next_balance = 0;
> +       int this_cpu = this_rq->cpu;
> +       unsigned int flags;
> +       int balance_cpu;
> +       struct rq *rq;
>
> -       if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_BALANCE_KICK))
> -               return;
> +       if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
> +               return false;
>
> -       if (idle != CPU_IDLE)
> -               goto end;
> +       if (idle != CPU_IDLE) {
> +               atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> +               return false;
> +       }
> +
> +       flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> +
> +       SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>
>         for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>                 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
> @@ -9262,7 +9267,9 @@ static void nohz_idle_balance(struct rq
>                         cpu_load_update_idle(rq);
>                         rq_unlock_irq(rq, &rf);
>
> -                       rebalance_domains(rq, CPU_IDLE);
> +                       update_blocked_averages(rq->cpu);
> +                       if (flags & NOHZ_BALANCE_KICK)
> +                               rebalance_domains(rq, CPU_IDLE);
>                 }
>
>                 if (time_after(next_balance, rq->next_balance)) {
> @@ -9271,6 +9278,10 @@ static void nohz_idle_balance(struct rq
>                 }
>         }
>
> +       update_blocked_averages(this_cpu);
> +       if (flags & NOHZ_BALANCE_KICK)
> +               rebalance_domains(this_rq, CPU_IDLE);
> +
>         /*
>          * next_balance will be updated only when there is a need.
>          * When the CPU is attached to null domain for ex, it will not be
> @@ -9278,8 +9289,8 @@ static void nohz_idle_balance(struct rq
>          */
>         if (likely(update_next_balance))
>                 nohz.next_balance = next_balance;
> -end:
> -       atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
> +
> +       return true;
>  }
>
>  /*
> @@ -9366,7 +9377,10 @@ static inline bool nohz_kick_needed(stru
>         return kick;
>  }
>  #else
> -static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> +{
> +       return false;
> +}
>  #endif
>
>  /*
> @@ -9387,7 +9401,11 @@ static __latent_entropy void run_rebalan
>          * load balance only within the local sched_domain hierarchy
>          * and abort nohz_idle_balance altogether if we pull some load.
>          */
> -       nohz_idle_balance(this_rq, idle);
> +       if (nohz_idle_balance(this_rq, idle))
> +               return;
> +
> +       /* normal load balance */
> +       update_blocked_averages(this_rq->cpu);
>         rebalance_domains(this_rq, idle);
>  }
>
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2005,9 +2005,13 @@ extern void cfs_bandwidth_usage_dec(void
>  #ifdef CONFIG_NO_HZ_COMMON
>  #define NOHZ_TICK_STOPPED_BIT  0
>  #define NOHZ_BALANCE_KICK_BIT  1
> +#define NOHZ_STATS_KICK_BIT    2
>
>  #define NOHZ_TICK_STOPPED      BIT(NOHZ_TICK_STOPPED_BIT)
>  #define NOHZ_BALANCE_KICK      BIT(NOHZ_BALANCE_KICK_BIT)
> +#define NOHZ_STATS_KICK                BIT(NOHZ_STATS_KICK_BIT)
> +
> +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
>
>  #define nohz_flags(cpu)        (&cpu_rq(cpu)->nohz_flags)
>
>
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2017-12-21 16:23   ` Vincent Guittot
@ 2017-12-21 16:56     ` Vincent Guittot
  2017-12-22  7:59       ` Peter Zijlstra
  2017-12-22  7:56     ` Peter Zijlstra
  1 sibling, 1 reply; 56+ messages in thread
From: Vincent Guittot @ 2017-12-21 16:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

On 21 December 2017 at 17:23, Vincent Guittot
<vincent.guittot@linaro.org> wrote:
> Hi Peter,
>
> I think that part of the proposal is missing.
>
> One goal of the patchset was to kick an update of the stats of idle
> cpu when a task wake up on a cpu but the statistic has not been
> updated for a while.
>
> That's why there where a call to nohz_kick_needed in the proposal to
> kick ilb but only for updating blocked load and not a full idle load

sorry a call to nohz_kick_needed (which becomes nohz_balancer_kick in
yours patchset) in select_task_fair_rq_fair

> balance
>
> I can't find this call any more in your patchset

In fact, we can't only rely on the tick and newly_idle load balance to
ensure a period update of the blocked load because they can never
happen. So we need to find another place to kick for a periodic update
which is when a task wake up


>
> On 21 December 2017 at 11:21, Peter Zijlstra <peterz@infradead.org> wrote:
>>
>> Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
>> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> ---
>>  kernel/sched/core.c  |    4 +--
>>  kernel/sched/fair.c  |   52 ++++++++++++++++++++++++++++++++++-----------------
>>  kernel/sched/sched.h |    4 +++
>>  3 files changed, 41 insertions(+), 19 deletions(-)
>>
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -608,7 +608,7 @@ static inline bool got_nohz_idle_kick(vo
>>  {
>>         int cpu = smp_processor_id();
>>
>> -       if (!(atomic_read(nohz_flags(cpu)) & NOHZ_BALANCE_KICK))
>> +       if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
>>                 return false;
>>
>>         if (idle_cpu(cpu) && !need_resched())
>> @@ -618,7 +618,7 @@ static inline bool got_nohz_idle_kick(vo
>>          * We can't run Idle Load Balance on this CPU for this time so we
>>          * cancel it and clear NOHZ_BALANCE_KICK
>>          */
>> -       atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(cpu));
>> +       atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
>>         return false;
>>  }
>>
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -9001,8 +9001,8 @@ static void nohz_balancer_kick(void)
>>         if (ilb_cpu >= nr_cpu_ids)
>>                 return;
>>
>> -       flags = atomic_fetch_or(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu));
>> -       if (flags & NOHZ_BALANCE_KICK)
>> +       flags = atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(ilb_cpu));
>> +       if (flags & NOHZ_KICK_MASK)
>>                 return;
>>         /*
>>          * Use smp_send_reschedule() instead of resched_cpu().
>> @@ -9125,8 +9125,6 @@ static void rebalance_domains(struct rq
>>         int need_serialize, need_decay = 0;
>>         u64 max_cost = 0;
>>
>> -       update_blocked_averages(cpu);
>> -
>>         rcu_read_lock();
>>         for_each_domain(cpu, sd) {
>>                 /*
>> @@ -9221,20 +9219,27 @@ static void rebalance_domains(struct rq
>>   * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>>   * rebalancing for all the cpus for whom scheduler ticks are stopped.
>>   */
>> -static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>  {
>> -       int this_cpu = this_rq->cpu;
>> -       struct rq *rq;
>> -       int balance_cpu;
>>         /* Earliest time when we have to do rebalance again */
>>         unsigned long next_balance = jiffies + 60*HZ;
>>         int update_next_balance = 0;
>> +       int this_cpu = this_rq->cpu;
>> +       unsigned int flags;
>> +       int balance_cpu;
>> +       struct rq *rq;
>>
>> -       if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_BALANCE_KICK))
>> -               return;
>> +       if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>> +               return false;
>>
>> -       if (idle != CPU_IDLE)
>> -               goto end;
>> +       if (idle != CPU_IDLE) {
>> +               atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> +               return false;
>> +       }
>> +
>> +       flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> +
>> +       SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>
>>         for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>>                 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>> @@ -9262,7 +9267,9 @@ static void nohz_idle_balance(struct rq
>>                         cpu_load_update_idle(rq);
>>                         rq_unlock_irq(rq, &rf);
>>
>> -                       rebalance_domains(rq, CPU_IDLE);
>> +                       update_blocked_averages(rq->cpu);
>> +                       if (flags & NOHZ_BALANCE_KICK)
>> +                               rebalance_domains(rq, CPU_IDLE);
>>                 }
>>
>>                 if (time_after(next_balance, rq->next_balance)) {
>> @@ -9271,6 +9278,10 @@ static void nohz_idle_balance(struct rq
>>                 }
>>         }
>>
>> +       update_blocked_averages(this_cpu);
>> +       if (flags & NOHZ_BALANCE_KICK)
>> +               rebalance_domains(this_rq, CPU_IDLE);
>> +
>>         /*
>>          * next_balance will be updated only when there is a need.
>>          * When the CPU is attached to null domain for ex, it will not be
>> @@ -9278,8 +9289,8 @@ static void nohz_idle_balance(struct rq
>>          */
>>         if (likely(update_next_balance))
>>                 nohz.next_balance = next_balance;
>> -end:
>> -       atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
>> +
>> +       return true;
>>  }
>>
>>  /*
>> @@ -9366,7 +9377,10 @@ static inline bool nohz_kick_needed(stru
>>         return kick;
>>  }
>>  #else
>> -static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
>> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> +{
>> +       return false;
>> +}
>>  #endif
>>
>>  /*
>> @@ -9387,7 +9401,11 @@ static __latent_entropy void run_rebalan
>>          * load balance only within the local sched_domain hierarchy
>>          * and abort nohz_idle_balance altogether if we pull some load.
>>          */
>> -       nohz_idle_balance(this_rq, idle);
>> +       if (nohz_idle_balance(this_rq, idle))
>> +               return;
>> +
>> +       /* normal load balance */
>> +       update_blocked_averages(this_rq->cpu);
>>         rebalance_domains(this_rq, idle);
>>  }
>>
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -2005,9 +2005,13 @@ extern void cfs_bandwidth_usage_dec(void
>>  #ifdef CONFIG_NO_HZ_COMMON
>>  #define NOHZ_TICK_STOPPED_BIT  0
>>  #define NOHZ_BALANCE_KICK_BIT  1
>> +#define NOHZ_STATS_KICK_BIT    2
>>
>>  #define NOHZ_TICK_STOPPED      BIT(NOHZ_TICK_STOPPED_BIT)
>>  #define NOHZ_BALANCE_KICK      BIT(NOHZ_BALANCE_KICK_BIT)
>> +#define NOHZ_STATS_KICK                BIT(NOHZ_STATS_KICK_BIT)
>> +
>> +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
>>
>>  #define nohz_flags(cpu)        (&cpu_rq(cpu)->nohz_flags)
>>
>>
>>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2017-12-21 16:23   ` Vincent Guittot
  2017-12-21 16:56     ` Vincent Guittot
@ 2017-12-22  7:56     ` Peter Zijlstra
  2017-12-22  8:04       ` Vincent Guittot
  1 sibling, 1 reply; 56+ messages in thread
From: Peter Zijlstra @ 2017-12-22  7:56 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

On Thu, Dec 21, 2017 at 05:23:27PM +0100, Vincent Guittot wrote:
> Hi Peter,
> 
> I think that part of the proposal is missing.
> 
> One goal of the patchset was to kick an update of the stats of idle
> cpu when a task wake up on a cpu but the statistic has not been
> updated for a while.
> 
> That's why there where a call to nohz_kick_needed in the proposal to
> kick ilb but only for updating blocked load and not a full idle load
> balance
> 
> I can't find this call any more in your patchset

Yeah, I took it out because it didn't make sense to me. If you're waking
to a stale CPU, you're too late.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2017-12-21 16:56     ` Vincent Guittot
@ 2017-12-22  7:59       ` Peter Zijlstra
  2017-12-22  8:05         ` Vincent Guittot
  0 siblings, 1 reply; 56+ messages in thread
From: Peter Zijlstra @ 2017-12-22  7:59 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

On Thu, Dec 21, 2017 at 05:56:32PM +0100, Vincent Guittot wrote:
> In fact, we can't only rely on the tick and newly_idle load balance to
> ensure a period update of the blocked load because they can never
> happen.

I'm confused, why would the ilb not happen?

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2017-12-22  7:56     ` Peter Zijlstra
@ 2017-12-22  8:04       ` Vincent Guittot
  0 siblings, 0 replies; 56+ messages in thread
From: Vincent Guittot @ 2017-12-22  8:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

On 22 December 2017 at 08:56, Peter Zijlstra <peterz@infradead.org> wrote:
> On Thu, Dec 21, 2017 at 05:23:27PM +0100, Vincent Guittot wrote:
>> Hi Peter,
>>
>> I think that part of the proposal is missing.
>>
>> One goal of the patchset was to kick an update of the stats of idle
>> cpu when a task wake up on a cpu but the statistic has not been
>> updated for a while.
>>
>> That's why there where a call to nohz_kick_needed in the proposal to
>> kick ilb but only for updating blocked load and not a full idle load
>> balance
>>
>> I can't find this call any more in your patchset
>
> Yeah, I took it out because it didn't make sense to me. If you're waking
> to a stale CPU, you're too late.

I agree that's it's too late for the current wake up but that's the
trade off with delaying the wake up until all blocked load has been
updated.

>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2017-12-22  7:59       ` Peter Zijlstra
@ 2017-12-22  8:05         ` Vincent Guittot
  2017-12-22  8:29           ` Peter Zijlstra
  0 siblings, 1 reply; 56+ messages in thread
From: Vincent Guittot @ 2017-12-22  8:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

On 22 December 2017 at 08:59, Peter Zijlstra <peterz@infradead.org> wrote:
> On Thu, Dec 21, 2017 at 05:56:32PM +0100, Vincent Guittot wrote:
>> In fact, we can't only rely on the tick and newly_idle load balance to
>> ensure a period update of the blocked load because they can never
>> happen.
>
> I'm confused, why would the ilb not happen?

the ilb will be kick only if tick fires which might not be the case
for task that runs less than a tick

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2017-12-22  8:05         ` Vincent Guittot
@ 2017-12-22  8:29           ` Peter Zijlstra
  2017-12-22  9:12             ` Peter Zijlstra
  0 siblings, 1 reply; 56+ messages in thread
From: Peter Zijlstra @ 2017-12-22  8:29 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

On Fri, Dec 22, 2017 at 09:05:45AM +0100, Vincent Guittot wrote:
> On 22 December 2017 at 08:59, Peter Zijlstra <peterz@infradead.org> wrote:
> > On Thu, Dec 21, 2017 at 05:56:32PM +0100, Vincent Guittot wrote:
> >> In fact, we can't only rely on the tick and newly_idle load balance to
> >> ensure a period update of the blocked load because they can never
> >> happen.
> >
> > I'm confused, why would the ilb not happen?
> 
> the ilb will be kick only if tick fires which might not be the case
> for task that runs less than a tick

Oh, urgh, you're talking about when the entire system is idle. Yes
indeed.

Lemme have a think, surely we can do something saner there.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2017-12-22  8:29           ` Peter Zijlstra
@ 2017-12-22  9:12             ` Peter Zijlstra
  2017-12-22 14:31               ` Peter Zijlstra
  2017-12-22 14:32               ` Vincent Guittot
  0 siblings, 2 replies; 56+ messages in thread
From: Peter Zijlstra @ 2017-12-22  9:12 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

On Fri, Dec 22, 2017 at 09:29:15AM +0100, Peter Zijlstra wrote:
> On Fri, Dec 22, 2017 at 09:05:45AM +0100, Vincent Guittot wrote:
> > On 22 December 2017 at 08:59, Peter Zijlstra <peterz@infradead.org> wrote:
> > > On Thu, Dec 21, 2017 at 05:56:32PM +0100, Vincent Guittot wrote:
> > >> In fact, we can't only rely on the tick and newly_idle load balance to
> > >> ensure a period update of the blocked load because they can never
> > >> happen.
> > >
> > > I'm confused, why would the ilb not happen?
> > 
> > the ilb will be kick only if tick fires which might not be the case
> > for task that runs less than a tick
> 
> Oh, urgh, you're talking about when the entire system is idle. Yes
> indeed.
> 
> Lemme have a think, surely we can do something saner there.

The only thing I could come up with is running a timer for this :/ That
would keep the ILB thing running until all load is decayed (have a patch
for that somewhere).

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2017-12-22  9:12             ` Peter Zijlstra
@ 2017-12-22 14:31               ` Peter Zijlstra
  2017-12-22 14:34                 ` Vincent Guittot
  2017-12-22 14:32               ` Vincent Guittot
  1 sibling, 1 reply; 56+ messages in thread
From: Peter Zijlstra @ 2017-12-22 14:31 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

On Fri, Dec 22, 2017 at 10:12:21AM +0100, Peter Zijlstra wrote:

> The only thing I could come up with is running a timer for this :/ That
> would keep the ILB thing running until all load is decayed (have a patch
> for that somewhere).

Implemented that; pushed it out, should all be at:

  git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git sched/testing

but given how today is going, it'll eat your nan and set your cat on
fire.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2017-12-22  9:12             ` Peter Zijlstra
  2017-12-22 14:31               ` Peter Zijlstra
@ 2017-12-22 14:32               ` Vincent Guittot
  2017-12-22 18:56                 ` Peter Zijlstra
  1 sibling, 1 reply; 56+ messages in thread
From: Vincent Guittot @ 2017-12-22 14:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

On 22 December 2017 at 10:12, Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, Dec 22, 2017 at 09:29:15AM +0100, Peter Zijlstra wrote:
>> On Fri, Dec 22, 2017 at 09:05:45AM +0100, Vincent Guittot wrote:
>> > On 22 December 2017 at 08:59, Peter Zijlstra <peterz@infradead.org> wrote:
>> > > On Thu, Dec 21, 2017 at 05:56:32PM +0100, Vincent Guittot wrote:
>> > >> In fact, we can't only rely on the tick and newly_idle load balance to
>> > >> ensure a period update of the blocked load because they can never
>> > >> happen.
>> > >
>> > > I'm confused, why would the ilb not happen?
>> >
>> > the ilb will be kick only if tick fires which might not be the case
>> > for task that runs less than a tick
>>
>> Oh, urgh, you're talking about when the entire system is idle. Yes
>> indeed.
>>
>> Lemme have a think, surely we can do something saner there.
>
> The only thing I could come up with is running a timer for this :/ That
> would keep the ILB thing running until all load is decayed (have a patch
> for that somewhere).

IMHO running a timer doesn't sound really great
When we have enough activity on the system, the tick and the periodic
load balance will ensure the update of load of all cpus (including the
idle cpus) at the load balance period pace But if we don't have enough
activity to trig the periodic update through ilb or because the system
is not overloaded or even almost idle, we don't have these periodic
update anymore. The goal is to do a lazy update of the blocked load to
not hurt too much power consumption of idle CPUs. When a task wakes up
and the blocked idle load have not been updated for a while, we trig
the update of these blocked loads in parallel to the wake up so the
data will be more accurate for the next events. It's already too late
for the current wake up but that's not a big deal because the wake up
path of a light loaded system is mainly choosing between previous and
current cpu and the load_avg_contrib and the utilization will have
been updated for next events.

>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2017-12-22 14:31               ` Peter Zijlstra
@ 2017-12-22 14:34                 ` Vincent Guittot
  0 siblings, 0 replies; 56+ messages in thread
From: Vincent Guittot @ 2017-12-22 14:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

On 22 December 2017 at 15:31, Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, Dec 22, 2017 at 10:12:21AM +0100, Peter Zijlstra wrote:
>
>> The only thing I could come up with is running a timer for this :/ That
>> would keep the ILB thing running until all load is decayed (have a patch
>> for that somewhere).
>
> Implemented that; pushed it out, should all be at:
>
>   git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git sched/testing
>
> but given how today is going, it'll eat your nan and set your cat on
> fire.

Our emails crossed. i'm going to have a look at your branch

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2017-12-22 14:32               ` Vincent Guittot
@ 2017-12-22 18:56                 ` Peter Zijlstra
  2017-12-22 20:42                   ` Peter Zijlstra
  0 siblings, 1 reply; 56+ messages in thread
From: Peter Zijlstra @ 2017-12-22 18:56 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

On Fri, Dec 22, 2017 at 03:32:53PM +0100, Vincent Guittot wrote:
> > The only thing I could come up with is running a timer for this :/ That
> > would keep the ILB thing running until all load is decayed (have a patch
> > for that somewhere).
> 
> IMHO running a timer doesn't sound really great

I tend to agree..

> When we have enough activity on the system, the tick and the periodic
> load balance will ensure the update of load of all cpus (including the
> idle cpus) at the load balance period pace.

> But if we don't have enough activity to trig the periodic update
> through ilb or because the system is not overloaded or even almost
> idle, we don't have these periodic update anymore.

> The goal is to do a lazy update of the blocked load to not hurt too
> much power consumption of idle CPUs. When a task wakes up and the
> blocked idle load have not been updated for a while, we trig the
> update of these blocked loads in parallel to the wake up so the data
> will be more accurate for the next events.

> It's already too late for the current wake up but that's not a big
> deal because the wake up path of a light loaded system is mainly
> choosing between previous and current cpu and the load_avg_contrib and
> the utilization will have been updated for next events.

Right; but I figured we'd try and do it 'right' and see how horrible it
is before we try and do funny things.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2017-12-22 18:56                 ` Peter Zijlstra
@ 2017-12-22 20:42                   ` Peter Zijlstra
  2018-01-02 15:44                     ` Morten Rasmussen
  2018-01-03  9:16                     ` Vincent Guittot
  0 siblings, 2 replies; 56+ messages in thread
From: Peter Zijlstra @ 2017-12-22 20:42 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
> Right; but I figured we'd try and do it 'right' and see how horrible it
> is before we try and do funny things.

So now it should have a 32ms tick for up to .5s when the system goes
completely idle.

No idea how bad that is..

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2017-12-22 20:42                   ` Peter Zijlstra
@ 2018-01-02 15:44                     ` Morten Rasmussen
  2018-01-15  9:43                       ` Peter Zijlstra
  2018-01-03  9:16                     ` Vincent Guittot
  1 sibling, 1 reply; 56+ messages in thread
From: Morten Rasmussen @ 2018-01-02 15:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Vincent Guittot, Ingo Molnar, linux-kernel, Brendan Jackman,
	Dietmar Eggemann, Morten Rasmussen

On Fri, Dec 22, 2017 at 09:42:47PM +0100, Peter Zijlstra wrote:
> On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
> > Right; but I figured we'd try and do it 'right' and see how horrible it
> > is before we try and do funny things.
> 
> So now it should have a 32ms tick for up to .5s when the system goes
> completely idle.
> 
> No idea how bad that is..

Does it mean that the 32ms tick will keep going forever if the system
doesn't go completely idle? Some tiny background task or a slightly
bigger one with a longer period?

Do we actually care about stale values if the system is completely idle?

Instead of hacking select_task_rq_fair() to kick off a stats update as
Vincent already proposed, why can't we just modify Brendan's
CPU_NEWLY_IDLE proposal to do a stats update from idle_balance() every
32ms regardless of whether we need to load-balance?

This way we should get updates if there is anything running, we don't
touch the wake-up path, we don't cause any additional wake-ups, and we
don't need a timer. What am I missing?

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2017-12-22 20:42                   ` Peter Zijlstra
  2018-01-02 15:44                     ` Morten Rasmussen
@ 2018-01-03  9:16                     ` Vincent Guittot
  2018-01-15  8:26                       ` Vincent Guittot
  1 sibling, 1 reply; 56+ messages in thread
From: Vincent Guittot @ 2018-01-03  9:16 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

Hi Peter,

On 22 December 2017 at 21:42, Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
>> Right; but I figured we'd try and do it 'right' and see how horrible it
>> is before we try and do funny things.
>
> So now it should have a 32ms tick for up to .5s when the system goes
> completely idle.
>
> No idea how bad that is..

I have tested your branch but the timer doesn't seem to fire correctly
because i can still see blocked load in the use case i have run.
I haven't found the reason yet

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-03  9:16                     ` Vincent Guittot
@ 2018-01-15  8:26                       ` Vincent Guittot
  2018-01-18 10:38                         ` Morten Rasmussen
                                           ` (2 more replies)
  0 siblings, 3 replies; 56+ messages in thread
From: Vincent Guittot @ 2018-01-15  8:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

Le Wednesday 03 Jan 2018 à 10:16:00 (+0100), Vincent Guittot a écrit :
> Hi Peter,
> 
> On 22 December 2017 at 21:42, Peter Zijlstra <peterz@infradead.org> wrote:
> > On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
> >> Right; but I figured we'd try and do it 'right' and see how horrible it
> >> is before we try and do funny things.
> >
> > So now it should have a 32ms tick for up to .5s when the system goes
> > completely idle.
> >
> > No idea how bad that is..
> 
> I have tested your branch but the timer doesn't seem to fire correctly
> because i can still see blocked load in the use case i have run.
> I haven't found the reason yet

Hi Peter,

With the patch below on top of your branch, the blocked loads are updated and
decayed regularly. The main differences are:
- It doesn't use a timer to trig ilb but the tick and when a cpu becomes idle.
  The main drawback of this solution is that the load is blocked when the
  system is fully idle with the advantage of not waking up a fully idle
  system. We have to wait for the next tick or newly idle event for updating
  blocked load when the system leaves idle stat which can be up to a tick long.
  If this is too long, we can check for kicking ilb when task wakes up so the
  blocked load will be updated as soon as the system leaves idle state.
  The main advantage is that we don't wake up a fully idle system every 32ms to
  update blocked load that will be not used.
- I'm working on one more improvement to use nohz_idle_balance in the newly
  idle case when the system is not overloaded and 
  (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we can try to
  use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it exceed
  this_rq->avg_idle. This will remove some calls to kick_ilb and some wake up
  of an idle cpus.

---
 kernel/sched/fair.c | 72 +++++++++++++++++++++++++----------------------------
 1 file changed, 34 insertions(+), 38 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 52114c6..898785d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5386,7 +5386,6 @@ static struct {
 	atomic_t stats_state;
 	unsigned long next_balance;     /* in jiffy units */
 	unsigned long next_stats;
-	struct timer_list timer;
 } nohz ____cacheline_aligned;
 
 #endif /* CONFIG_NO_HZ_COMMON */
@@ -8004,8 +8003,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		prefer_sibling = 1;
 
 #ifdef CONFIG_NO_HZ_COMMON
-	if (env->idle == CPU_NEWLY_IDLE)
+	if (env->idle == CPU_NEWLY_IDLE && atomic_read(&nohz.stats_state)) {
 		env->flags |= LBF_NOHZ_STATS;
+	}
 #endif
 
 	load_idx = get_sd_load_idx(env->sd, env->idle);
@@ -8818,6 +8818,8 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
 		*next_balance = next;
 }
 
+static void kick_ilb(unsigned int flags);
+
 /*
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
@@ -8852,12 +8854,16 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 
 	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
 	    !this_rq->rd->overload) {
+		unsigned long next = READ_ONCE(nohz.next_stats);
 		rcu_read_lock();
 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
 		if (sd)
 			update_next_balance(sd, &next_balance);
 		rcu_read_unlock();
 
+		if (time_after(jiffies, next) && atomic_read(&nohz.stats_state))
+			kick_ilb(NOHZ_STATS_KICK);
+
 		goto out;
 	}
 
@@ -9075,18 +9081,6 @@ static void kick_ilb(unsigned int flags)
 	smp_send_reschedule(ilb_cpu);
 }
 
-void nohz_balance_timer(struct timer_list *timer)
-{
-	unsigned long next = READ_ONCE(nohz.next_stats);
-
-	if (time_before(jiffies, next)) {
-		mod_timer(timer, next);
-		return;
-	}
-
-	kick_ilb(NOHZ_STATS_KICK);
-}
-
 /*
  * Current heuristic for kicking the idle load balancer in the presence
  * of an idle cpu in the system.
@@ -9122,6 +9116,9 @@ static void nohz_balancer_kick(struct rq *rq)
 	if (likely(!atomic_read(&nohz.nr_cpus)))
 		return;
 
+	if (time_after(now, nohz.next_stats) && atomic_read(&nohz.stats_state))
+		flags = NOHZ_STATS_KICK;
+
 	if (time_before(now, nohz.next_balance))
 		goto out;
 
@@ -9227,7 +9224,6 @@ static void set_cpu_sd_state_idle(int cpu)
 void nohz_balance_enter_idle(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned int val, new;
 
 	SCHED_WARN_ON(cpu != smp_processor_id());
 
@@ -9251,6 +9247,7 @@ void nohz_balance_enter_idle(int cpu)
 		return;
 
 	rq->nohz_tick_stopped = 1;
+	rq->has_blocked_load = 1;
 
 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 	atomic_inc(&nohz.nr_cpus);
@@ -9258,21 +9255,11 @@ void nohz_balance_enter_idle(int cpu)
 	set_cpu_sd_state_idle(cpu);
 
 	/*
-	 * implies a barrier such that if the stats_state update is observed
-	 * the above updates are also visible. Pairs with stuff in
-	 * update_sd_lb_stats() and nohz_idle_balance().
+	 * Each time a cpu enter idle, we assume that it has blocked load and
+	 * enable the periodic update of the load of idle cpus
 	 */
-	val = atomic_read(&nohz.stats_state);
-	do {
-		new = val + 2;
-		new |= 1;
-	} while (!atomic_try_cmpxchg(&nohz.stats_state, &val, new));
+	atomic_set(&nohz.stats_state, 1);
 
-	/*
-	 * If the timer was stopped, restart the thing.
-	 */
-	if (!(val & 1))
-		mod_timer(&nohz.timer, jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
 }
 #else
 static inline void nohz_balancer_kick(struct rq *rq) { }
@@ -9409,7 +9396,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	bool has_blocked_load = false;
 	int update_next_balance = 0;
 	int this_cpu = this_rq->cpu;
-	unsigned int stats_seq;
 	unsigned int flags;
 	int balance_cpu;
 	struct rq *rq;
@@ -9422,7 +9408,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		return false;
 	}
 
-	stats_seq = atomic_read(&nohz.stats_state);
 	/*
 	 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
 	 */
@@ -9432,6 +9417,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 
 	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
 
+	/*
+	 * We assume there will be no idle load after this update and clear
+	 * the stats state. If a cpu enters idle in the mean time, it will
+	 * set the stats state and trig another update of idle load.
+	 * Because a cpu that becomes idle, is added to idle_cpus_mask before
+	 * setting the stats state, we are sure to not clear the state and not
+	 * check the load of an idle cpu.
+	 */
+	atomic_set(&nohz.stats_state, 0);
+
 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
 			continue;
@@ -9441,8 +9436,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		 * work being done for other cpus. Next load
 		 * balancing owner will pick it up.
 		 */
-		if (need_resched())
+		if (need_resched()) {
+			has_blocked_load = true;
 			break;
+		}
 
 		rq = cpu_rq(balance_cpu);
 
@@ -9477,12 +9474,12 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	if (flags & NOHZ_BALANCE_KICK)
 		rebalance_domains(this_rq, CPU_IDLE);
 
-	if (has_blocked_load ||
-	    !atomic_try_cmpxchg(&nohz.stats_state, &stats_seq, 0)) {
-		WRITE_ONCE(nohz.next_stats,
-				now + msecs_to_jiffies(LOAD_AVG_PERIOD));
-		mod_timer(&nohz.timer, nohz.next_stats);
-	}
+	WRITE_ONCE(nohz.next_stats,
+		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+
+	/* There is still blocked load, enable periodic update */
+	if (has_blocked_load)
+		atomic_set(&nohz.stats_state, 1);
 
 	/*
 	 * next_balance will be updated only when there is a need.
@@ -10115,7 +10112,6 @@ __init void init_sched_fair_class(void)
 	nohz.next_balance = jiffies;
 	nohz.next_stats = jiffies;
 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
-	timer_setup(&nohz.timer, nohz_balance_timer, 0);
 #endif
 #endif /* SMP */
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-02 15:44                     ` Morten Rasmussen
@ 2018-01-15  9:43                       ` Peter Zijlstra
  2018-01-18 10:32                         ` Morten Rasmussen
  0 siblings, 1 reply; 56+ messages in thread
From: Peter Zijlstra @ 2018-01-15  9:43 UTC (permalink / raw)
  To: Morten Rasmussen
  Cc: Vincent Guittot, Ingo Molnar, linux-kernel, Brendan Jackman,
	Dietmar Eggemann, Morten Rasmussen

On Tue, Jan 02, 2018 at 03:44:57PM +0000, Morten Rasmussen wrote:

> Vincent already proposed, why can't we just modify Brendan's
> CPU_NEWLY_IDLE proposal to do a stats update from idle_balance() every
> 32ms regardless of whether we need to load-balance?

I think that code is there, no?

Subject: sched: Update blocked load from NEWIDLE

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-15  9:43                       ` Peter Zijlstra
@ 2018-01-18 10:32                         ` Morten Rasmussen
  0 siblings, 0 replies; 56+ messages in thread
From: Morten Rasmussen @ 2018-01-18 10:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Vincent Guittot, Ingo Molnar, linux-kernel, Brendan Jackman,
	Dietmar Eggemann, Morten Rasmussen

On Mon, Jan 15, 2018 at 10:43:18AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 02, 2018 at 03:44:57PM +0000, Morten Rasmussen wrote:
> 
> > Vincent already proposed, why can't we just modify Brendan's
> > CPU_NEWLY_IDLE proposal to do a stats update from idle_balance() every
> > 32ms regardless of whether we need to load-balance?
> 
> I think that code is there, no?
> 
> Subject: sched: Update blocked load from NEWIDLE

The mechanics are there, but I think the problem is that the
idle_balance() bails out before we get to it in some cases. If we only
have a few small periodic tasks running rd->overload won't be set and
idle_balance() returns before doing anything.

We would need some sort of check to see if a PELT update is due and make
sure it happens, even if idle_balance() has nothing to do.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-15  8:26                       ` Vincent Guittot
@ 2018-01-18 10:38                         ` Morten Rasmussen
  2018-01-24  8:25                           ` Vincent Guittot
  2018-02-01 16:55                           ` [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK Peter Zijlstra
  2018-01-22  9:40                         ` Dietmar Eggemann
  2018-02-01 16:52                         ` Peter Zijlstra
  2 siblings, 2 replies; 56+ messages in thread
From: Morten Rasmussen @ 2018-01-18 10:38 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Brendan Jackman,
	Dietmar Eggemann, Morten Rasmussen

On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:
> Le Wednesday 03 Jan 2018 à 10:16:00 (+0100), Vincent Guittot a écrit :
> > Hi Peter,
> > 
> > On 22 December 2017 at 21:42, Peter Zijlstra <peterz@infradead.org> wrote:
> > > On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
> > >> Right; but I figured we'd try and do it 'right' and see how horrible it
> > >> is before we try and do funny things.
> > >
> > > So now it should have a 32ms tick for up to .5s when the system goes
> > > completely idle.
> > >
> > > No idea how bad that is..
> > 
> > I have tested your branch but the timer doesn't seem to fire correctly
> > because i can still see blocked load in the use case i have run.
> > I haven't found the reason yet
> 
> Hi Peter,
> 
> With the patch below on top of your branch, the blocked loads are updated and
> decayed regularly. The main differences are:
> - It doesn't use a timer to trig ilb but the tick and when a cpu becomes idle.
>   The main drawback of this solution is that the load is blocked when the
>   system is fully idle with the advantage of not waking up a fully idle
>   system. We have to wait for the next tick or newly idle event for updating
>   blocked load when the system leaves idle stat which can be up to a tick long.
>   If this is too long, we can check for kicking ilb when task wakes up so the
>   blocked load will be updated as soon as the system leaves idle state.
>   The main advantage is that we don't wake up a fully idle system every 32ms to
>   update blocked load that will be not used.
> - I'm working on one more improvement to use nohz_idle_balance in the newly
>   idle case when the system is not overloaded and 
>   (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we can try to
>   use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it exceed
>   this_rq->avg_idle. This will remove some calls to kick_ilb and some wake up
>   of an idle cpus.

This sound like what I meant in my other reply :-)

It seems pointless to have a timer to update PELT if the system is
completely idle, and when it isn't we can piggy back other events to
make the updates happen.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-15  8:26                       ` Vincent Guittot
  2018-01-18 10:38                         ` Morten Rasmussen
@ 2018-01-22  9:40                         ` Dietmar Eggemann
  2018-01-22 10:23                           ` Vincent Guittot
  2018-02-01 16:52                         ` Peter Zijlstra
  2 siblings, 1 reply; 56+ messages in thread
From: Dietmar Eggemann @ 2018-01-22  9:40 UTC (permalink / raw)
  To: Vincent Guittot, Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Morten Rasmussen

On 01/15/2018 08:26 AM, Vincent Guittot wrote:
> Le Wednesday 03 Jan 2018 à 10:16:00 (+0100), Vincent Guittot a écrit :
>> Hi Peter,
>>
>> On 22 December 2017 at 21:42, Peter Zijlstra <peterz@infradead.org> wrote:
>>> On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
>>>> Right; but I figured we'd try and do it 'right' and see how horrible it
>>>> is before we try and do funny things.
>>>
>>> So now it should have a 32ms tick for up to .5s when the system goes
>>> completely idle.
>>>
>>> No idea how bad that is..
>>
>> I have tested your branch but the timer doesn't seem to fire correctly
>> because i can still see blocked load in the use case i have run.
>> I haven't found the reason yet

Isn't the issue with the timer based implementation that 
rq->has_blocked_load is never set to 1 ?

Something you changed in your implementation by adding a 
rq->has_blocked_load = 1 into nohz_balance_enter_idle().

> 
> Hi Peter,
> 
> With the patch below on top of your branch, the blocked loads are updated and
> decayed regularly. The main differences are:
> - It doesn't use a timer to trig ilb but the tick and when a cpu becomes idle.
>    The main drawback of this solution is that the load is blocked when the
>    system is fully idle with the advantage of not waking up a fully idle
>    system. We have to wait for the next tick or newly idle event for updating
>    blocked load when the system leaves idle stat which can be up to a tick long.
>    If this is too long, we can check for kicking ilb when task wakes up so the
>    blocked load will be updated as soon as the system leaves idle state.
>    The main advantage is that we don't wake up a fully idle system every 32ms to
>    update blocked load that will be not used.
> - I'm working on one more improvement to use nohz_idle_balance in the newly
>    idle case when the system is not overloaded and
>    (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we can try to
>    use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it exceed
>    this_rq->avg_idle. This will remove some calls to kick_ilb and some wake up
>    of an idle cpus.
> 
> ---
>   kernel/sched/fair.c | 72 +++++++++++++++++++++++++----------------------------
>   1 file changed, 34 insertions(+), 38 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 52114c6..898785d 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5386,7 +5386,6 @@ static struct {
>   	atomic_t stats_state;
>   	unsigned long next_balance;     /* in jiffy units */
>   	unsigned long next_stats;
> -	struct timer_list timer;
>   } nohz ____cacheline_aligned;
>   
>   #endif /* CONFIG_NO_HZ_COMMON */
> @@ -8004,8 +8003,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>   		prefer_sibling = 1;
>   
>   #ifdef CONFIG_NO_HZ_COMMON
> -	if (env->idle == CPU_NEWLY_IDLE)
> +	if (env->idle == CPU_NEWLY_IDLE && atomic_read(&nohz.stats_state)) {
>   		env->flags |= LBF_NOHZ_STATS;
> +	}
>   #endif
>   
>   	load_idx = get_sd_load_idx(env->sd, env->idle);
> @@ -8818,6 +8818,8 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
>   		*next_balance = next;
>   }
>   
> +static void kick_ilb(unsigned int flags);
> +
>   /*
>    * idle_balance is called by schedule() if this_cpu is about to become
>    * idle. Attempts to pull tasks from other CPUs.
> @@ -8852,12 +8854,16 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>   
>   	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>   	    !this_rq->rd->overload) {
> +		unsigned long next = READ_ONCE(nohz.next_stats);
>   		rcu_read_lock();
>   		sd = rcu_dereference_check_sched_domain(this_rq->sd);
>   		if (sd)
>   			update_next_balance(sd, &next_balance);
>   		rcu_read_unlock();
>   
> +		if (time_after(jiffies, next) && atomic_read(&nohz.stats_state))
> +			kick_ilb(NOHZ_STATS_KICK);
> +
>   		goto out;
>   	}
>   
> @@ -9075,18 +9081,6 @@ static void kick_ilb(unsigned int flags)
>   	smp_send_reschedule(ilb_cpu);
>   }
>   
> -void nohz_balance_timer(struct timer_list *timer)
> -{
> -	unsigned long next = READ_ONCE(nohz.next_stats);
> -
> -	if (time_before(jiffies, next)) {
> -		mod_timer(timer, next);
> -		return;
> -	}
> -
> -	kick_ilb(NOHZ_STATS_KICK);
> -}
> -
>   /*
>    * Current heuristic for kicking the idle load balancer in the presence
>    * of an idle cpu in the system.
> @@ -9122,6 +9116,9 @@ static void nohz_balancer_kick(struct rq *rq)
>   	if (likely(!atomic_read(&nohz.nr_cpus)))
>   		return;
>   
> +	if (time_after(now, nohz.next_stats) && atomic_read(&nohz.stats_state))
> +		flags = NOHZ_STATS_KICK;
> +
>   	if (time_before(now, nohz.next_balance))
>   		goto out;
>   
> @@ -9227,7 +9224,6 @@ static void set_cpu_sd_state_idle(int cpu)
>   void nohz_balance_enter_idle(int cpu)
>   {
>   	struct rq *rq = cpu_rq(cpu);
> -	unsigned int val, new;
>   
>   	SCHED_WARN_ON(cpu != smp_processor_id());
>   
> @@ -9251,6 +9247,7 @@ void nohz_balance_enter_idle(int cpu)
>   		return;
>   
>   	rq->nohz_tick_stopped = 1;
> +	rq->has_blocked_load = 1;
>   
>   	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
>   	atomic_inc(&nohz.nr_cpus);
> @@ -9258,21 +9255,11 @@ void nohz_balance_enter_idle(int cpu)
>   	set_cpu_sd_state_idle(cpu);
>   
>   	/*
> -	 * implies a barrier such that if the stats_state update is observed
> -	 * the above updates are also visible. Pairs with stuff in
> -	 * update_sd_lb_stats() and nohz_idle_balance().
> +	 * Each time a cpu enter idle, we assume that it has blocked load and
> +	 * enable the periodic update of the load of idle cpus
>   	 */
> -	val = atomic_read(&nohz.stats_state);
> -	do {
> -		new = val + 2;
> -		new |= 1;
> -	} while (!atomic_try_cmpxchg(&nohz.stats_state, &val, new));
> +	atomic_set(&nohz.stats_state, 1);
>   
> -	/*
> -	 * If the timer was stopped, restart the thing.
> -	 */
> -	if (!(val & 1))
> -		mod_timer(&nohz.timer, jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
>   }
>   #else
>   static inline void nohz_balancer_kick(struct rq *rq) { }
> @@ -9409,7 +9396,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   	bool has_blocked_load = false;
>   	int update_next_balance = 0;
>   	int this_cpu = this_rq->cpu;
> -	unsigned int stats_seq;
>   	unsigned int flags;
>   	int balance_cpu;
>   	struct rq *rq;
> @@ -9422,7 +9408,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   		return false;
>   	}
>   
> -	stats_seq = atomic_read(&nohz.stats_state);
>   	/*
>   	 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>   	 */
> @@ -9432,6 +9417,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   
>   	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>   
> +	/*
> +	 * We assume there will be no idle load after this update and clear
> +	 * the stats state. If a cpu enters idle in the mean time, it will
> +	 * set the stats state and trig another update of idle load.
> +	 * Because a cpu that becomes idle, is added to idle_cpus_mask before
> +	 * setting the stats state, we are sure to not clear the state and not
> +	 * check the load of an idle cpu.
> +	 */
> +	atomic_set(&nohz.stats_state, 0);
> +
>   	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>   		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>   			continue;
> @@ -9441,8 +9436,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   		 * work being done for other cpus. Next load
>   		 * balancing owner will pick it up.
>   		 */
> -		if (need_resched())
> +		if (need_resched()) {
> +			has_blocked_load = true;
>   			break;
> +		}
>   
>   		rq = cpu_rq(balance_cpu);
>   
> @@ -9477,12 +9474,12 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   	if (flags & NOHZ_BALANCE_KICK)
>   		rebalance_domains(this_rq, CPU_IDLE);
>   
> -	if (has_blocked_load ||
> -	    !atomic_try_cmpxchg(&nohz.stats_state, &stats_seq, 0)) {
> -		WRITE_ONCE(nohz.next_stats,
> -				now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> -		mod_timer(&nohz.timer, nohz.next_stats);
> -	}
> +	WRITE_ONCE(nohz.next_stats,
> +		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> +
> +	/* There is still blocked load, enable periodic update */
> +	if (has_blocked_load)
> +		atomic_set(&nohz.stats_state, 1);
>   
>   	/*
>   	 * next_balance will be updated only when there is a need.
> @@ -10115,7 +10112,6 @@ __init void init_sched_fair_class(void)
>   	nohz.next_balance = jiffies;
>   	nohz.next_stats = jiffies;
>   	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
> -	timer_setup(&nohz.timer, nohz_balance_timer, 0);
>   #endif
>   #endif /* SMP */
>   
> 

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-22  9:40                         ` Dietmar Eggemann
@ 2018-01-22 10:23                           ` Vincent Guittot
  0 siblings, 0 replies; 56+ messages in thread
From: Vincent Guittot @ 2018-01-22 10:23 UTC (permalink / raw)
  To: Dietmar Eggemann
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Brendan Jackman,
	Morten Rasmussen

On 22 January 2018 at 10:40, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
> On 01/15/2018 08:26 AM, Vincent Guittot wrote:
>>
>> Le Wednesday 03 Jan 2018 à 10:16:00 (+0100), Vincent Guittot a écrit :
>>>
>>> Hi Peter,
>>>
>>> On 22 December 2017 at 21:42, Peter Zijlstra <peterz@infradead.org>
>>> wrote:
>>>>
>>>> On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
>>>>>
>>>>> Right; but I figured we'd try and do it 'right' and see how horrible it
>>>>> is before we try and do funny things.
>>>>
>>>>
>>>> So now it should have a 32ms tick for up to .5s when the system goes
>>>> completely idle.
>>>>
>>>> No idea how bad that is..
>>>
>>>
>>> I have tested your branch but the timer doesn't seem to fire correctly
>>> because i can still see blocked load in the use case i have run.
>>> I haven't found the reason yet
>
>
> Isn't the issue with the timer based implementation that
> rq->has_blocked_load is never set to 1 ?

Yes that's what I suggested to Peter on IRC

>
> Something you changed in your implementation by adding a
> rq->has_blocked_load = 1 into nohz_balance_enter_idle().
>
>
>>
>> Hi Peter,
>>
>> With the patch below on top of your branch, the blocked loads are updated
>> and
>> decayed regularly. The main differences are:
>> - It doesn't use a timer to trig ilb but the tick and when a cpu becomes
>> idle.
>>    The main drawback of this solution is that the load is blocked when the
>>    system is fully idle with the advantage of not waking up a fully idle
>>    system. We have to wait for the next tick or newly idle event for
>> updating
>>    blocked load when the system leaves idle stat which can be up to a tick
>> long.
>>    If this is too long, we can check for kicking ilb when task wakes up so
>> the
>>    blocked load will be updated as soon as the system leaves idle state.
>>    The main advantage is that we don't wake up a fully idle system every
>> 32ms to
>>    update blocked load that will be not used.
>> - I'm working on one more improvement to use nohz_idle_balance in the
>> newly
>>    idle case when the system is not overloaded and
>>    (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we can
>> try to
>>    use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it
>> exceed
>>    this_rq->avg_idle. This will remove some calls to kick_ilb and some
>> wake up
>>    of an idle cpus.
>>
>> ---
>>   kernel/sched/fair.c | 72
>> +++++++++++++++++++++++++----------------------------
>>   1 file changed, 34 insertions(+), 38 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 52114c6..898785d 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5386,7 +5386,6 @@ static struct {
>>         atomic_t stats_state;
>>         unsigned long next_balance;     /* in jiffy units */
>>         unsigned long next_stats;
>> -       struct timer_list timer;
>>   } nohz ____cacheline_aligned;
>>     #endif /* CONFIG_NO_HZ_COMMON */
>> @@ -8004,8 +8003,9 @@ static inline void update_sd_lb_stats(struct lb_env
>> *env, struct sd_lb_stats *sd
>>                 prefer_sibling = 1;
>>     #ifdef CONFIG_NO_HZ_COMMON
>> -       if (env->idle == CPU_NEWLY_IDLE)
>> +       if (env->idle == CPU_NEWLY_IDLE && atomic_read(&nohz.stats_state))
>> {
>>                 env->flags |= LBF_NOHZ_STATS;
>> +       }
>>   #endif
>>         load_idx = get_sd_load_idx(env->sd, env->idle);
>> @@ -8818,6 +8818,8 @@ update_next_balance(struct sched_domain *sd,
>> unsigned long *next_balance)
>>                 *next_balance = next;
>>   }
>>   +static void kick_ilb(unsigned int flags);
>> +
>>   /*
>>    * idle_balance is called by schedule() if this_cpu is about to become
>>    * idle. Attempts to pull tasks from other CPUs.
>> @@ -8852,12 +8854,16 @@ static int idle_balance(struct rq *this_rq, struct
>> rq_flags *rf)
>>         if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>>             !this_rq->rd->overload) {
>> +               unsigned long next = READ_ONCE(nohz.next_stats);
>>                 rcu_read_lock();
>>                 sd = rcu_dereference_check_sched_domain(this_rq->sd);
>>                 if (sd)
>>                         update_next_balance(sd, &next_balance);
>>                 rcu_read_unlock();
>>   +             if (time_after(jiffies, next) &&
>> atomic_read(&nohz.stats_state))
>> +                       kick_ilb(NOHZ_STATS_KICK);
>> +
>>                 goto out;
>>         }
>>   @@ -9075,18 +9081,6 @@ static void kick_ilb(unsigned int flags)
>>         smp_send_reschedule(ilb_cpu);
>>   }
>>   -void nohz_balance_timer(struct timer_list *timer)
>> -{
>> -       unsigned long next = READ_ONCE(nohz.next_stats);
>> -
>> -       if (time_before(jiffies, next)) {
>> -               mod_timer(timer, next);
>> -               return;
>> -       }
>> -
>> -       kick_ilb(NOHZ_STATS_KICK);
>> -}
>> -
>>   /*
>>    * Current heuristic for kicking the idle load balancer in the presence
>>    * of an idle cpu in the system.
>> @@ -9122,6 +9116,9 @@ static void nohz_balancer_kick(struct rq *rq)
>>         if (likely(!atomic_read(&nohz.nr_cpus)))
>>                 return;
>>   +     if (time_after(now, nohz.next_stats) &&
>> atomic_read(&nohz.stats_state))
>> +               flags = NOHZ_STATS_KICK;
>> +
>>         if (time_before(now, nohz.next_balance))
>>                 goto out;
>>   @@ -9227,7 +9224,6 @@ static void set_cpu_sd_state_idle(int cpu)
>>   void nohz_balance_enter_idle(int cpu)
>>   {
>>         struct rq *rq = cpu_rq(cpu);
>> -       unsigned int val, new;
>>         SCHED_WARN_ON(cpu != smp_processor_id());
>>   @@ -9251,6 +9247,7 @@ void nohz_balance_enter_idle(int cpu)
>>                 return;
>>         rq->nohz_tick_stopped = 1;
>> +       rq->has_blocked_load = 1;
>>         cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
>>         atomic_inc(&nohz.nr_cpus);
>> @@ -9258,21 +9255,11 @@ void nohz_balance_enter_idle(int cpu)
>>         set_cpu_sd_state_idle(cpu);
>>         /*
>> -        * implies a barrier such that if the stats_state update is
>> observed
>> -        * the above updates are also visible. Pairs with stuff in
>> -        * update_sd_lb_stats() and nohz_idle_balance().
>> +        * Each time a cpu enter idle, we assume that it has blocked load
>> and
>> +        * enable the periodic update of the load of idle cpus
>>          */
>> -       val = atomic_read(&nohz.stats_state);
>> -       do {
>> -               new = val + 2;
>> -               new |= 1;
>> -       } while (!atomic_try_cmpxchg(&nohz.stats_state, &val, new));
>> +       atomic_set(&nohz.stats_state, 1);
>>   -     /*
>> -        * If the timer was stopped, restart the thing.
>> -        */
>> -       if (!(val & 1))
>> -               mod_timer(&nohz.timer, jiffies +
>> msecs_to_jiffies(LOAD_AVG_PERIOD));
>>   }
>>   #else
>>   static inline void nohz_balancer_kick(struct rq *rq) { }
>> @@ -9409,7 +9396,6 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>>         bool has_blocked_load = false;
>>         int update_next_balance = 0;
>>         int this_cpu = this_rq->cpu;
>> -       unsigned int stats_seq;
>>         unsigned int flags;
>>         int balance_cpu;
>>         struct rq *rq;
>> @@ -9422,7 +9408,6 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>>                 return false;
>>         }
>>   -     stats_seq = atomic_read(&nohz.stats_state);
>>         /*
>>          * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>>          */
>> @@ -9432,6 +9417,16 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>>         SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>   +     /*
>> +        * We assume there will be no idle load after this update and
>> clear
>> +        * the stats state. If a cpu enters idle in the mean time, it will
>> +        * set the stats state and trig another update of idle load.
>> +        * Because a cpu that becomes idle, is added to idle_cpus_mask
>> before
>> +        * setting the stats state, we are sure to not clear the state and
>> not
>> +        * check the load of an idle cpu.
>> +        */
>> +       atomic_set(&nohz.stats_state, 0);
>> +
>>         for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>>                 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>>                         continue;
>> @@ -9441,8 +9436,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>>                  * work being done for other cpus. Next load
>>                  * balancing owner will pick it up.
>>                  */
>> -               if (need_resched())
>> +               if (need_resched()) {
>> +                       has_blocked_load = true;
>>                         break;
>> +               }
>>                 rq = cpu_rq(balance_cpu);
>>   @@ -9477,12 +9474,12 @@ static bool nohz_idle_balance(struct rq
>> *this_rq, enum cpu_idle_type idle)
>>         if (flags & NOHZ_BALANCE_KICK)
>>                 rebalance_domains(this_rq, CPU_IDLE);
>>   -     if (has_blocked_load ||
>> -           !atomic_try_cmpxchg(&nohz.stats_state, &stats_seq, 0)) {
>> -               WRITE_ONCE(nohz.next_stats,
>> -                               now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> -               mod_timer(&nohz.timer, nohz.next_stats);
>> -       }
>> +       WRITE_ONCE(nohz.next_stats,
>> +               now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> +
>> +       /* There is still blocked load, enable periodic update */
>> +       if (has_blocked_load)
>> +               atomic_set(&nohz.stats_state, 1);
>>         /*
>>          * next_balance will be updated only when there is a need.
>> @@ -10115,7 +10112,6 @@ __init void init_sched_fair_class(void)
>>         nohz.next_balance = jiffies;
>>         nohz.next_stats = jiffies;
>>         zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
>> -       timer_setup(&nohz.timer, nohz_balance_timer, 0);
>>   #endif
>>   #endif /* SMP */
>>
>
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-18 10:38                         ` Morten Rasmussen
@ 2018-01-24  8:25                           ` Vincent Guittot
  2018-01-29 18:43                             ` Dietmar Eggemann
                                               ` (3 more replies)
  2018-02-01 16:55                           ` [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK Peter Zijlstra
  1 sibling, 4 replies; 56+ messages in thread
From: Vincent Guittot @ 2018-01-24  8:25 UTC (permalink / raw)
  To: Peter Zijlstra, Morten Rasmussen
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

Hi,

Le Thursday 18 Jan 2018 à 10:38:07 (+0000), Morten Rasmussen a écrit :
> On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:
> > Le Wednesday 03 Jan 2018 à 10:16:00 (+0100), Vincent Guittot a écrit :
> > > Hi Peter,
> > > 
> > > On 22 December 2017 at 21:42, Peter Zijlstra <peterz@infradead.org> wrote:
> > > > On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
> > > >> Right; but I figured we'd try and do it 'right' and see how horrible it
> > > >> is before we try and do funny things.
> > > >
> > > > So now it should have a 32ms tick for up to .5s when the system goes
> > > > completely idle.
> > > >
> > > > No idea how bad that is..
> > > 
> > > I have tested your branch but the timer doesn't seem to fire correctly
> > > because i can still see blocked load in the use case i have run.
> > > I haven't found the reason yet
> > 
> > Hi Peter,
> > 
> > With the patch below on top of your branch, the blocked loads are updated and
> > decayed regularly. The main differences are:
> > - It doesn't use a timer to trig ilb but the tick and when a cpu becomes idle.
> >   The main drawback of this solution is that the load is blocked when the
> >   system is fully idle with the advantage of not waking up a fully idle
> >   system. We have to wait for the next tick or newly idle event for updating
> >   blocked load when the system leaves idle stat which can be up to a tick long.
> >   If this is too long, we can check for kicking ilb when task wakes up so the
> >   blocked load will be updated as soon as the system leaves idle state.
> >   The main advantage is that we don't wake up a fully idle system every 32ms to
> >   update blocked load that will be not used.
> > - I'm working on one more improvement to use nohz_idle_balance in the newly
> >   idle case when the system is not overloaded and 
> >   (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we can try to
> >   use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it exceed
> >   this_rq->avg_idle. This will remove some calls to kick_ilb and some wake up
> >   of an idle cpus.
> 
> This sound like what I meant in my other reply :-)
> 
> It seems pointless to have a timer to update PELT if the system is
> completely idle, and when it isn't we can piggy back other events to
> make the updates happen.

The patch below implements what has been described above. It calls part of 
nohz_idle_balance when a cpu becomes idle and kick a ilb if it takes too much
time. This removes part of ilb that are kicked on an idle cpu for updating
the blocked load but the ratio really depends on when the tick happens compared
to a cpu becoming idle and the 32ms boundary. I have an additionnal patch that
enables to update the blocked loads when a cpu becomes idle 1 period before
kicking an ilb and there is far less ilb because we give more chance to the
newly idle case (time_after is replaced by time_after_eq in idle_balance()).

The patch also uses a function cfs_rq_has_blocked, which only checks the
util/load_avg, instead of the cfs_rq_is_decayed which check *_sum too. This
reduce significantly the number of update of blocked load. the *_avg will be
fully decayed in around 300~400ms but it's far longer for the *_sum which have
a higher resolution and we can easily reach almost seconds. But only the *_avg
are used to make decision so keeping some blocked *_sum is acceptable.

---
 kernel/sched/fair.c | 121 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 92 insertions(+), 29 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 898785d..ed90303 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7356,6 +7356,17 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 	return true;
 }
 
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->avg.load_avg)
+		return true;
+
+	if (cfs_rq->avg.util_avg)
+		return true;
+
+	return false;
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
 static void update_blocked_averages(int cpu)
@@ -7393,7 +7404,9 @@ static void update_blocked_averages(int cpu)
 		 */
 		if (cfs_rq_is_decayed(cfs_rq))
 			list_del_leaf_cfs_rq(cfs_rq);
-		else
+
+		/* Don't need periodic decay once load/util_avg are null */
+		if (cfs_rq_has_blocked(cfs_rq))
 			done = false;
 	}
 
@@ -7463,7 +7476,7 @@ static inline void update_blocked_averages(int cpu)
 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
 #ifdef CONFIG_NO_HZ_COMMON
 	rq->last_blocked_load_update_tick = jiffies;
-	if (cfs_rq_is_decayed(cfs_rq))
+	if (cfs_rq_has_blocked(cfs_rq))
 		rq->has_blocked_load = 0;
 #endif
 	rq_unlock_irqrestore(rq, &rf);
@@ -8818,6 +8831,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
 		*next_balance = next;
 }
 
+static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
 static void kick_ilb(unsigned int flags);
 
 /*
@@ -8861,7 +8875,14 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 			update_next_balance(sd, &next_balance);
 		rcu_read_unlock();
 
-		if (time_after(jiffies, next) && atomic_read(&nohz.stats_state))
+		/*
+		 * Update blocked idle load if it has not been done for a
+		 * while. Try to do it locally before entering idle but kick a
+		 * ilb if it takes too much time and might delay next local
+		 * wake up
+		 */
+		if (time_after(jiffies, next) && atomic_read(&nohz.stats_state) &&
+				!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
 			kick_ilb(NOHZ_STATS_KICK);
 
 		goto out;
@@ -9237,6 +9258,7 @@ void nohz_balance_enter_idle(int cpu)
 	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
 		return;
 
+	rq->has_blocked_load = 1;
 	if (rq->nohz_tick_stopped)
 		return;
 
@@ -9247,7 +9269,6 @@ void nohz_balance_enter_idle(int cpu)
 		return;
 
 	rq->nohz_tick_stopped = 1;
-	rq->has_blocked_load = 1;
 
 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 	atomic_inc(&nohz.nr_cpus);
@@ -9259,7 +9280,6 @@ void nohz_balance_enter_idle(int cpu)
 	 * enable the periodic update of the load of idle cpus
 	 */
 	atomic_set(&nohz.stats_state, 1);
-
 }
 #else
 static inline void nohz_balancer_kick(struct rq *rq) { }
@@ -9385,10 +9405,13 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 
 #ifdef CONFIG_NO_HZ_COMMON
 /*
- * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * Internal function that runs load balance for all idle cpus. The load balance
+ * can be a simple update of blocked load or a complete load balance with
+ * tasks movement depending of flags.
+ * For newly idle mode, we abort the loop if it takes too much time and return
+ * false to notify that the loop has not be completed and a ilb shoud be kick.
  */
-static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
 {
 	/* Earliest time when we have to do rebalance again */
 	unsigned long now = jiffies;
@@ -9396,24 +9419,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	bool has_blocked_load = false;
 	int update_next_balance = 0;
 	int this_cpu = this_rq->cpu;
-	unsigned int flags;
 	int balance_cpu;
+	int ret = false;
 	struct rq *rq;
-
-	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
-		return false;
-
-	if (idle != CPU_IDLE) {
-		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
-		return false;
-	}
-
-	/*
-	 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
-	 */
-	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
-	if (!(flags & NOHZ_KICK_MASK))
-		return false;
+	u64 curr_cost = 0;
 
 	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
 
@@ -9428,6 +9437,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	atomic_set(&nohz.stats_state, 0);
 
 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+		u64 t0, domain_cost;
+
+		t0 = sched_clock_cpu(this_cpu);
+
 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
 			continue;
 
@@ -9438,7 +9451,17 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		 */
 		if (need_resched()) {
 			has_blocked_load = true;
-			break;
+			goto abort;
+		}
+
+		/*
+		 * If the update is done while CPU becomes idle, we abort
+		 * the update when its cost is higher than the average idle
+		 * time in orde to not delay a possible wake up.
+		 */
+		if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
+			has_blocked_load = true;
+			goto abort;
 		}
 
 		rq = cpu_rq(balance_cpu);
@@ -9453,10 +9476,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		if (time_after_eq(jiffies, rq->next_balance)) {
 			struct rq_flags rf;
 
-			rq_lock_irq(rq, &rf);
+			rq_lock_irqsave(rq, &rf);
 			update_rq_clock(rq);
 			cpu_load_update_idle(rq);
-			rq_unlock_irq(rq, &rf);
+			rq_unlock_irqrestore(rq, &rf);
 
 			if (flags & NOHZ_BALANCE_KICK)
 				rebalance_domains(rq, CPU_IDLE);
@@ -9466,10 +9489,17 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 			next_balance = rq->next_balance;
 			update_next_balance = 1;
 		}
+
+		domain_cost = sched_clock_cpu(this_cpu) - t0;
+		curr_cost += domain_cost;
+
 	}
 
-	update_blocked_averages(this_cpu);
-	has_blocked_load |= this_rq->has_blocked_load;
+	/* Newly idle CPU doesn't need an update */
+	if (idle != CPU_NEWLY_IDLE) {
+		update_blocked_averages(this_cpu);
+		has_blocked_load |= this_rq->has_blocked_load;
+	}
 
 	if (flags & NOHZ_BALANCE_KICK)
 		rebalance_domains(this_rq, CPU_IDLE);
@@ -9477,6 +9507,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	WRITE_ONCE(nohz.next_stats,
 		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 
+	/* The full idle balance loop has been done */
+	ret = true;
+
+abort:
 	/* There is still blocked load, enable periodic update */
 	if (has_blocked_load)
 		atomic_set(&nohz.stats_state, 1);
@@ -9489,6 +9523,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	if (likely(update_next_balance))
 		nohz.next_balance = next_balance;
 
+	return ret;
+}
+
+/*
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+	int this_cpu = this_rq->cpu;
+	unsigned int flags;
+
+	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+		return false;
+
+	if (idle != CPU_IDLE) {
+		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+		return false;
+	}
+
+	/*
+	 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
+	 */
+	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+	if (!(flags & NOHZ_KICK_MASK))
+		return false;
+
+	_nohz_idle_balance(this_rq, flags, idle);
+
 	return true;
 }
 #else
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-24  8:25                           ` Vincent Guittot
@ 2018-01-29 18:43                             ` Dietmar Eggemann
  2018-01-30  8:00                               ` Vincent Guittot
  2018-01-29 19:31                             ` Valentin Schneider
                                               ` (2 subsequent siblings)
  3 siblings, 1 reply; 56+ messages in thread
From: Dietmar Eggemann @ 2018-01-29 18:43 UTC (permalink / raw)
  To: Vincent Guittot, Peter Zijlstra, Morten Rasmussen
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Morten Rasmussen

On 01/24/2018 09:25 AM, Vincent Guittot wrote:
> Hi,
> 
> Le Thursday 18 Jan 2018 à 10:38:07 (+0000), Morten Rasmussen a écrit :
>> On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:
>>> Le Wednesday 03 Jan 2018 à 10:16:00 (+0100), Vincent Guittot a écrit :

[...]

>>>
>>> Hi Peter,
>>>
>>> With the patch below on top of your branch, the blocked loads are updated and
>>> decayed regularly. The main differences are:
>>> - It doesn't use a timer to trig ilb but the tick and when a cpu becomes idle.
>>>    The main drawback of this solution is that the load is blocked when the
>>>    system is fully idle with the advantage of not waking up a fully idle
>>>    system. We have to wait for the next tick or newly idle event for updating
>>>    blocked load when the system leaves idle stat which can be up to a tick long.
>>>    If this is too long, we can check for kicking ilb when task wakes up so the
>>>    blocked load will be updated as soon as the system leaves idle state.
>>>    The main advantage is that we don't wake up a fully idle system every 32ms to
>>>    update blocked load that will be not used.
>>> - I'm working on one more improvement to use nohz_idle_balance in the newly
>>>    idle case when the system is not overloaded and
>>>    (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we can try to
>>>    use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it exceed
>>>    this_rq->avg_idle. This will remove some calls to kick_ilb and some wake up
>>>    of an idle cpus.
>>
>> This sound like what I meant in my other reply :-)
>>
>> It seems pointless to have a timer to update PELT if the system is
>> completely idle, and when it isn't we can piggy back other events to
>> make the updates happen.
> 
> The patch below implements what has been described above. It calls part of
> nohz_idle_balance when a cpu becomes idle and kick a ilb if it takes too much
> time. This removes part of ilb that are kicked on an idle cpu for updating
> the blocked load but the ratio really depends on when the tick happens compared
> to a cpu becoming idle and the 32ms boundary. I have an additionnal patch that
> enables to update the blocked loads when a cpu becomes idle 1 period before
> kicking an ilb and there is far less ilb because we give more chance to the
> newly idle case (time_after is replaced by time_after_eq in idle_balance()).
> 
> The patch also uses a function cfs_rq_has_blocked, which only checks the
> util/load_avg, instead of the cfs_rq_is_decayed which check *_sum too. This
> reduce significantly the number of update of blocked load. the *_avg will be
> fully decayed in around 300~400ms but it's far longer for the *_sum which have
> a higher resolution and we can easily reach almost seconds. But only the *_avg
> are used to make decision so keeping some blocked *_sum is acceptable.
> 
> ---
>   kernel/sched/fair.c | 121 +++++++++++++++++++++++++++++++++++++++-------------
>   1 file changed, 92 insertions(+), 29 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 898785d..ed90303 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7356,6 +7356,17 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>   	return true;
>   }
>   
> +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
> +{
> +	if (cfs_rq->avg.load_avg)
> +		return true;
> +
> +	if (cfs_rq->avg.util_avg)
> +		return true;
> +
> +	return false;
> +}
> +

Can we not change cfs_rq_is_decayed() to use avg.foo_avg instead of 
avg.foo_sum ?

>   #ifdef CONFIG_FAIR_GROUP_SCHED
>   
>   static void update_blocked_averages(int cpu)
> @@ -7393,7 +7404,9 @@ static void update_blocked_averages(int cpu)
>   		 */
>   		if (cfs_rq_is_decayed(cfs_rq))
>   			list_del_leaf_cfs_rq(cfs_rq);
> -		else
> +
> +		/* Don't need periodic decay once load/util_avg are null */
> +		if (cfs_rq_has_blocked(cfs_rq))
>   			done = false;
>   	}
>   
> @@ -7463,7 +7476,7 @@ static inline void update_blocked_averages(int cpu)
>   	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>   #ifdef CONFIG_NO_HZ_COMMON
>   	rq->last_blocked_load_update_tick = jiffies;
> -	if (cfs_rq_is_decayed(cfs_rq))
> +	if (cfs_rq_has_blocked(cfs_rq))

Schouldn't this be !cfs_rq_has_blocked(cfs_rq) ?

>   		rq->has_blocked_load = 0;
>   #endif
>   	rq_unlock_irqrestore(rq, &rf);

[...]

> @@ -9438,7 +9451,17 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   		 */
>   		if (need_resched()) {
>   			has_blocked_load = true;
> -			break;
> +			goto abort;
> +		}
> +
> +		/*
> +		 * If the update is done while CPU becomes idle, we abort
> +		 * the update when its cost is higher than the average idle
> +		 * time in orde to not delay a possible wake up.
> +		 */
> +		if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
> +			has_blocked_load = true;
> +			goto abort;
>   		}
>   
>   		rq = cpu_rq(balance_cpu);
> @@ -9453,10 +9476,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   		if (time_after_eq(jiffies, rq->next_balance)) {
>   			struct rq_flags rf;
>   
> -			rq_lock_irq(rq, &rf);
> +			rq_lock_irqsave(rq, &rf);
>   			update_rq_clock(rq);
>   			cpu_load_update_idle(rq);
> -			rq_unlock_irq(rq, &rf);
> +			rq_unlock_irqrestore(rq, &rf);
>   
>   			if (flags & NOHZ_BALANCE_KICK)
>   				rebalance_domains(rq, CPU_IDLE);
> @@ -9466,10 +9489,17 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   			next_balance = rq->next_balance;
>   			update_next_balance = 1;
>   		}

Why do you do this cpu_load_update_idle(rq) even this was called with 
CPU_NEWLY_IDLE? Wouldn't it be sufficient to jump to the curr_cost 
calculation in this case?

> +
> +		domain_cost = sched_clock_cpu(this_cpu) - t0;
> +		curr_cost += domain_cost;
> +
>   	}
>   
> -	update_blocked_averages(this_cpu);
> -	has_blocked_load |= this_rq->has_blocked_load;
> +	/* Newly idle CPU doesn't need an update */
> +	if (idle != CPU_NEWLY_IDLE) {
> +		update_blocked_averages(this_cpu);
> +		has_blocked_load |= this_rq->has_blocked_load;
> +	}
>   
>   	if (flags & NOHZ_BALANCE_KICK)
>   		rebalance_domains(this_rq, CPU_IDLE);

[...]

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-24  8:25                           ` Vincent Guittot
  2018-01-29 18:43                             ` Dietmar Eggemann
@ 2018-01-29 19:31                             ` Valentin Schneider
  2018-01-30  8:32                               ` Vincent Guittot
  2018-02-01 18:16                               ` Peter Zijlstra
  2018-02-01 16:57                             ` Peter Zijlstra
  2018-02-01 18:10                             ` Peter Zijlstra
  3 siblings, 2 replies; 56+ messages in thread
From: Valentin Schneider @ 2018-01-29 19:31 UTC (permalink / raw)
  To: Vincent Guittot, Peter Zijlstra, Morten Rasmussen
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

Hi Vincent, Peter,

I've been running some tests on your patches (Peter's base + the 2 from 
Vincent). The results themselves are hosted at [1].
The base of those tests is the same: a task ("accumulator") is ran for 5 
seconds (arbitrary value) to accumulate some load, then goes to sleep 
for .5 seconds.

I've set up 3 test scenarios:

Update by nohz_balance_kick()
-----------------------------
Right before the "accumulator" task goes to sleep, a CPU-hogging task 
(100% utilization) is spawned on another CPU. It won't go idle so the 
only way to update the blocked load generated by "accumulator" is to 
kick an ILB (NOHZ_STATS_KICK).

The test shows that this is behaving nicely - we keep kicking an ILB 
every ~36ms (see next test for comments on that) until there is no more 
blocked load. I did however notice some interesting scenarios: after the 
load has been fully decayed, a tiny background task can spawn and end in 
less than a scheduling period. However, it still goes through 
nohz_balance_enter_idle(), and thus sets nohz.stats_state, which will 
later cause an ILB kick.

This makes me wonder if it's worth kicking ILBs for such tiny load 
values - perhaps it could be worth having a margin to set 
rq->has_blocked_load ?

Furthermore, this tiny task will cause the ILB to iterate over all of 
the idle CPUs, although only one has stale load. For load update via 
NEWLY_IDLE load_balance() we use:

static bool update_nohz_stats(struct rq *rq)
{
     if (!rq->has_blocked_load)
      return false;
     [...]
}

But for load update via _nohz_idle_balance(), we iterate through all of 
the nohz CPUS and unconditionally call update_blocked_averages(). This 
could be avoided by remembering which CPUs have stale load before going 
idle. Initially I thought that was what nohz.stats_state was for, but it 
isn't.
With Vincent's patches it's only ever set to either 0 or 1, but we could 
use it as a CPU mask, and use it to skip nohz CPUs that don't have stale 
load in _nohz_idle_balance() (when NOHZ_STATS_KICK).

Update by idle_balance()
------------------------
Right before the "accumulator" task goes to sleep, a tiny periodic 
(period=32ms) task is spawned on another CPU. It's expected that it will 
update the blocked load in idle_balance(), either by running 
_nohz_idle_balance() locally or kicking an ILB (The overload flag 
shouldn't be set in this test case, so we shouldn't go through the 
NEWLY_IDLE load_balance()).

This also seems to be working fine, but I'm noticing a delay between 
load updates that is closer to 64ms than 32ms. After digging into it I 
found out that the time checks done in idle_balance() and 
nohz_balancer_kick() are time_after(jiffies, next_stats), but IMHO they 
should be time_after_eq(jiffies, next_stats) to have 32ms-based updates. 
This also explains the 36ms periodicity of the updates in the test above.


No update (idle system)
-----------------------
Nothing special here, just making sure nothing happens when the system 
is fully idle. On a sidenote, that's relatively hard to achieve - I had 
to switch over to Juno because my HiKey960 gets interrupts every 16ms. 
The Juno still gets woken up every now and then but it's a bit quieter.


[1]: https://gist.github.com/valschneider/a8da7bb8e11fb1ec63a419710f56c0a0


On 01/24/2018 08:25 AM, Vincent Guittot wrote:
> Hi,
>
> Le Thursday 18 Jan 2018 à 10:38:07 (+0000), Morten Rasmussen a écrit :
>> On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:
>>> Le Wednesday 03 Jan 2018 à 10:16:00 (+0100), Vincent Guittot a écrit :
>>>> Hi Peter,
>>>>
>>>> On 22 December 2017 at 21:42, Peter Zijlstra <peterz@infradead.org> wrote:
>>>>> On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
>>>>>> Right; but I figured we'd try and do it 'right' and see how horrible it
>>>>>> is before we try and do funny things.
>>>>> So now it should have a 32ms tick for up to .5s when the system goes
>>>>> completely idle.
>>>>>
>>>>> No idea how bad that is..
>>>> I have tested your branch but the timer doesn't seem to fire correctly
>>>> because i can still see blocked load in the use case i have run.
>>>> I haven't found the reason yet
>>> Hi Peter,
>>>
>>> With the patch below on top of your branch, the blocked loads are updated and
>>> decayed regularly. The main differences are:
>>> - It doesn't use a timer to trig ilb but the tick and when a cpu becomes idle.
>>>    The main drawback of this solution is that the load is blocked when the
>>>    system is fully idle with the advantage of not waking up a fully idle
>>>    system. We have to wait for the next tick or newly idle event for updating
>>>    blocked load when the system leaves idle stat which can be up to a tick long.
>>>    If this is too long, we can check for kicking ilb when task wakes up so the
>>>    blocked load will be updated as soon as the system leaves idle state.
>>>    The main advantage is that we don't wake up a fully idle system every 32ms to
>>>    update blocked load that will be not used.
>>> - I'm working on one more improvement to use nohz_idle_balance in the newly
>>>    idle case when the system is not overloaded and
>>>    (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we can try to
>>>    use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it exceed
>>>    this_rq->avg_idle. This will remove some calls to kick_ilb and some wake up
>>>    of an idle cpus.
>> This sound like what I meant in my other reply :-)
>>
>> It seems pointless to have a timer to update PELT if the system is
>> completely idle, and when it isn't we can piggy back other events to
>> make the updates happen.
> The patch below implements what has been described above. It calls part of
> nohz_idle_balance when a cpu becomes idle and kick a ilb if it takes too much
> time. This removes part of ilb that are kicked on an idle cpu for updating
> the blocked load but the ratio really depends on when the tick happens compared
> to a cpu becoming idle and the 32ms boundary. I have an additionnal patch that
> enables to update the blocked loads when a cpu becomes idle 1 period before
> kicking an ilb and there is far less ilb because we give more chance to the
> newly idle case (time_after is replaced by time_after_eq in idle_balance()).
>
> The patch also uses a function cfs_rq_has_blocked, which only checks the
> util/load_avg, instead of the cfs_rq_is_decayed which check *_sum too. This
> reduce significantly the number of update of blocked load. the *_avg will be
> fully decayed in around 300~400ms but it's far longer for the *_sum which have
> a higher resolution and we can easily reach almost seconds. But only the *_avg
> are used to make decision so keeping some blocked *_sum is acceptable.
>
> ---
>   kernel/sched/fair.c | 121 +++++++++++++++++++++++++++++++++++++++-------------
>   1 file changed, 92 insertions(+), 29 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 898785d..ed90303 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7356,6 +7356,17 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>   	return true;
>   }
>   
> +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
> +{
> +	if (cfs_rq->avg.load_avg)
> +		return true;
> +
> +	if (cfs_rq->avg.util_avg)
> +		return true;
> +
> +	return false;
> +}
> +
>   #ifdef CONFIG_FAIR_GROUP_SCHED
>   
>   static void update_blocked_averages(int cpu)
> @@ -7393,7 +7404,9 @@ static void update_blocked_averages(int cpu)
>   		 */
>   		if (cfs_rq_is_decayed(cfs_rq))
>   			list_del_leaf_cfs_rq(cfs_rq);
> -		else
> +
> +		/* Don't need periodic decay once load/util_avg are null */
> +		if (cfs_rq_has_blocked(cfs_rq))
>   			done = false;
>   	}
>   
> @@ -7463,7 +7476,7 @@ static inline void update_blocked_averages(int cpu)
>   	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>   #ifdef CONFIG_NO_HZ_COMMON
>   	rq->last_blocked_load_update_tick = jiffies;
> -	if (cfs_rq_is_decayed(cfs_rq))
> +	if (cfs_rq_has_blocked(cfs_rq))
>   		rq->has_blocked_load = 0;
>   #endif
>   	rq_unlock_irqrestore(rq, &rf);
> @@ -8818,6 +8831,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
>   		*next_balance = next;
>   }
>   
> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
>   static void kick_ilb(unsigned int flags);
>   
>   /*
> @@ -8861,7 +8875,14 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>   			update_next_balance(sd, &next_balance);
>   		rcu_read_unlock();
>   
> -		if (time_after(jiffies, next) && atomic_read(&nohz.stats_state))
> +		/*
> +		 * Update blocked idle load if it has not been done for a
> +		 * while. Try to do it locally before entering idle but kick a
> +		 * ilb if it takes too much time and might delay next local
> +		 * wake up
> +		 */
> +		if (time_after(jiffies, next) && atomic_read(&nohz.stats_state) &&
> +				!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
>   			kick_ilb(NOHZ_STATS_KICK);
>   
>   		goto out;
> @@ -9237,6 +9258,7 @@ void nohz_balance_enter_idle(int cpu)
>   	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
>   		return;
>   
> +	rq->has_blocked_load = 1;
>   	if (rq->nohz_tick_stopped)
>   		return;
>   
> @@ -9247,7 +9269,6 @@ void nohz_balance_enter_idle(int cpu)
>   		return;
>   
>   	rq->nohz_tick_stopped = 1;
> -	rq->has_blocked_load = 1;
>   
>   	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
>   	atomic_inc(&nohz.nr_cpus);
> @@ -9259,7 +9280,6 @@ void nohz_balance_enter_idle(int cpu)
>   	 * enable the periodic update of the load of idle cpus
>   	 */
>   	atomic_set(&nohz.stats_state, 1);
> -
>   }
>   #else
>   static inline void nohz_balancer_kick(struct rq *rq) { }
> @@ -9385,10 +9405,13 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
>   
>   #ifdef CONFIG_NO_HZ_COMMON
>   /*
> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
> + * Internal function that runs load balance for all idle cpus. The load balance
> + * can be a simple update of blocked load or a complete load balance with
> + * tasks movement depending of flags.
> + * For newly idle mode, we abort the loop if it takes too much time and return
> + * false to notify that the loop has not be completed and a ilb shoud be kick.
>    */
> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
>   {
>   	/* Earliest time when we have to do rebalance again */
>   	unsigned long now = jiffies;
> @@ -9396,24 +9419,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   	bool has_blocked_load = false;
>   	int update_next_balance = 0;
>   	int this_cpu = this_rq->cpu;
> -	unsigned int flags;
>   	int balance_cpu;
> +	int ret = false;
>   	struct rq *rq;
> -
> -	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
> -		return false;
> -
> -	if (idle != CPU_IDLE) {
> -		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> -		return false;
> -	}
> -
> -	/*
> -	 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
> -	 */
> -	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> -	if (!(flags & NOHZ_KICK_MASK))
> -		return false;
> +	u64 curr_cost = 0;
>   
>   	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>   
> @@ -9428,6 +9437,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   	atomic_set(&nohz.stats_state, 0);
>   
>   	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
> +		u64 t0, domain_cost;
> +
> +		t0 = sched_clock_cpu(this_cpu);
> +
>   		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>   			continue;
>   
> @@ -9438,7 +9451,17 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   		 */
>   		if (need_resched()) {
>   			has_blocked_load = true;
> -			break;
> +			goto abort;
> +		}
> +
> +		/*
> +		 * If the update is done while CPU becomes idle, we abort
> +		 * the update when its cost is higher than the average idle
> +		 * time in orde to not delay a possible wake up.
> +		 */
> +		if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
> +			has_blocked_load = true;
> +			goto abort;
>   		}
>   
>   		rq = cpu_rq(balance_cpu);
> @@ -9453,10 +9476,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   		if (time_after_eq(jiffies, rq->next_balance)) {
>   			struct rq_flags rf;
>   
> -			rq_lock_irq(rq, &rf);
> +			rq_lock_irqsave(rq, &rf);
>   			update_rq_clock(rq);
>   			cpu_load_update_idle(rq);
> -			rq_unlock_irq(rq, &rf);
> +			rq_unlock_irqrestore(rq, &rf);
>   
>   			if (flags & NOHZ_BALANCE_KICK)
>   				rebalance_domains(rq, CPU_IDLE);
> @@ -9466,10 +9489,17 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   			next_balance = rq->next_balance;
>   			update_next_balance = 1;
>   		}
> +
> +		domain_cost = sched_clock_cpu(this_cpu) - t0;
> +		curr_cost += domain_cost;
> +
>   	}
>   
> -	update_blocked_averages(this_cpu);
> -	has_blocked_load |= this_rq->has_blocked_load;
> +	/* Newly idle CPU doesn't need an update */
> +	if (idle != CPU_NEWLY_IDLE) {
> +		update_blocked_averages(this_cpu);
> +		has_blocked_load |= this_rq->has_blocked_load;
> +	}
>   
>   	if (flags & NOHZ_BALANCE_KICK)
>   		rebalance_domains(this_rq, CPU_IDLE);
> @@ -9477,6 +9507,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   	WRITE_ONCE(nohz.next_stats,
>   		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>   
> +	/* The full idle balance loop has been done */
> +	ret = true;
> +
> +abort:
>   	/* There is still blocked load, enable periodic update */
>   	if (has_blocked_load)
>   		atomic_set(&nohz.stats_state, 1);
> @@ -9489,6 +9523,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   	if (likely(update_next_balance))
>   		nohz.next_balance = next_balance;
>   
> +	return ret;
> +}
> +
> +/*
> + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
> + * rebalancing for all the cpus for whom scheduler ticks are stopped.
> + */
> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> +{
> +	int this_cpu = this_rq->cpu;
> +	unsigned int flags;
> +
> +	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
> +		return false;
> +
> +	if (idle != CPU_IDLE) {
> +		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> +		return false;
> +	}
> +
> +	/*
> +	 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
> +	 */
> +	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> +	if (!(flags & NOHZ_KICK_MASK))
> +		return false;
> +
> +	_nohz_idle_balance(this_rq, flags, idle);
> +
>   	return true;
>   }
>   #else

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-29 18:43                             ` Dietmar Eggemann
@ 2018-01-30  8:00                               ` Vincent Guittot
  0 siblings, 0 replies; 56+ messages in thread
From: Vincent Guittot @ 2018-01-30  8:00 UTC (permalink / raw)
  To: Dietmar Eggemann
  Cc: Peter Zijlstra, Morten Rasmussen, Ingo Molnar, linux-kernel,
	Brendan Jackman, Morten Rasmussen

On 29 January 2018 at 19:43, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
> On 01/24/2018 09:25 AM, Vincent Guittot wrote:
>>
>> Hi,
>>
>> Le Thursday 18 Jan 2018 à 10:38:07 (+0000), Morten Rasmussen a écrit :
>>>
>>> On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:
>>>>
>>>> Le Wednesday 03 Jan 2018 à 10:16:00 (+0100), Vincent Guittot a écrit :
>
>
> [...]
>
>
>>>>
>>>> Hi Peter,
>>>>
>>>> With the patch below on top of your branch, the blocked loads are
>>>> updated and
>>>> decayed regularly. The main differences are:
>>>> - It doesn't use a timer to trig ilb but the tick and when a cpu becomes
>>>> idle.
>>>>    The main drawback of this solution is that the load is blocked when
>>>> the
>>>>    system is fully idle with the advantage of not waking up a fully idle
>>>>    system. We have to wait for the next tick or newly idle event for
>>>> updating
>>>>    blocked load when the system leaves idle stat which can be up to a
>>>> tick long.
>>>>    If this is too long, we can check for kicking ilb when task wakes up
>>>> so the
>>>>    blocked load will be updated as soon as the system leaves idle state.
>>>>    The main advantage is that we don't wake up a fully idle system every
>>>> 32ms to
>>>>    update blocked load that will be not used.
>>>> - I'm working on one more improvement to use nohz_idle_balance in the
>>>> newly
>>>>    idle case when the system is not overloaded and
>>>>    (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we
>>>> can try to
>>>>    use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it
>>>> exceed
>>>>    this_rq->avg_idle. This will remove some calls to kick_ilb and some
>>>> wake up
>>>>    of an idle cpus.
>>>
>>>
>>> This sound like what I meant in my other reply :-)
>>>
>>> It seems pointless to have a timer to update PELT if the system is
>>> completely idle, and when it isn't we can piggy back other events to
>>> make the updates happen.
>>
>>
>> The patch below implements what has been described above. It calls part of
>> nohz_idle_balance when a cpu becomes idle and kick a ilb if it takes too
>> much
>> time. This removes part of ilb that are kicked on an idle cpu for updating
>> the blocked load but the ratio really depends on when the tick happens
>> compared
>> to a cpu becoming idle and the 32ms boundary. I have an additionnal patch
>> that
>> enables to update the blocked loads when a cpu becomes idle 1 period
>> before
>> kicking an ilb and there is far less ilb because we give more chance to
>> the
>> newly idle case (time_after is replaced by time_after_eq in
>> idle_balance()).
>>
>> The patch also uses a function cfs_rq_has_blocked, which only checks the
>> util/load_avg, instead of the cfs_rq_is_decayed which check *_sum too.
>> This
>> reduce significantly the number of update of blocked load. the *_avg will
>> be
>> fully decayed in around 300~400ms but it's far longer for the *_sum which
>> have
>> a higher resolution and we can easily reach almost seconds. But only the
>> *_avg
>> are used to make decision so keeping some blocked *_sum is acceptable.
>>
>> ---
>>   kernel/sched/fair.c | 121
>> +++++++++++++++++++++++++++++++++++++++-------------
>>   1 file changed, 92 insertions(+), 29 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 898785d..ed90303 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -7356,6 +7356,17 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq
>> *cfs_rq)
>>         return true;
>>   }
>>   +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
>> +{
>> +       if (cfs_rq->avg.load_avg)
>> +               return true;
>> +
>> +       if (cfs_rq->avg.util_avg)
>> +               return true;
>> +
>> +       return false;
>> +}
>> +
>
>
> Can we not change cfs_rq_is_decayed() to use avg.foo_avg instead of
> avg.foo_sum ?

I don't think so because the *_sum are used to keep coherency bewteen
cfs_rq and cgroups when task migrates and are enqueued/dequeued so wwe
can't remove it until the *_sum are null otherwise the all cfs_rq and
group will be out of sync

>
>>   #ifdef CONFIG_FAIR_GROUP_SCHED
>>     static void update_blocked_averages(int cpu)
>> @@ -7393,7 +7404,9 @@ static void update_blocked_averages(int cpu)
>>                  */
>>                 if (cfs_rq_is_decayed(cfs_rq))
>>                         list_del_leaf_cfs_rq(cfs_rq);
>> -               else
>> +
>> +               /* Don't need periodic decay once load/util_avg are null
>> */
>> +               if (cfs_rq_has_blocked(cfs_rq))
>>                         done = false;
>>         }
>>   @@ -7463,7 +7476,7 @@ static inline void update_blocked_averages(int
>> cpu)
>>         update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>>   #ifdef CONFIG_NO_HZ_COMMON
>>         rq->last_blocked_load_update_tick = jiffies;
>> -       if (cfs_rq_is_decayed(cfs_rq))
>> +       if (cfs_rq_has_blocked(cfs_rq))
>
>
> Schouldn't this be !cfs_rq_has_blocked(cfs_rq) ?

yes. I copy/pasted too quickly from sched_group_fair to not sched_group_fair

>
>>                 rq->has_blocked_load = 0;
>>   #endif
>>         rq_unlock_irqrestore(rq, &rf);
>
>
> [...]
>
>
>> @@ -9438,7 +9451,17 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>>                  */
>>                 if (need_resched()) {
>>                         has_blocked_load = true;
>> -                       break;
>> +                       goto abort;
>> +               }
>> +
>> +               /*
>> +                * If the update is done while CPU becomes idle, we abort
>> +                * the update when its cost is higher than the average
>> idle
>> +                * time in orde to not delay a possible wake up.
>> +                */
>> +               if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle <
>> curr_cost) {
>> +                       has_blocked_load = true;
>> +                       goto abort;
>>                 }
>>                 rq = cpu_rq(balance_cpu);
>> @@ -9453,10 +9476,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>>                 if (time_after_eq(jiffies, rq->next_balance)) {
>>                         struct rq_flags rf;
>>   -                     rq_lock_irq(rq, &rf);
>> +                       rq_lock_irqsave(rq, &rf);
>>                         update_rq_clock(rq);
>>                         cpu_load_update_idle(rq);
>> -                       rq_unlock_irq(rq, &rf);
>> +                       rq_unlock_irqrestore(rq, &rf);
>>                         if (flags & NOHZ_BALANCE_KICK)
>>                                 rebalance_domains(rq, CPU_IDLE);
>> @@ -9466,10 +9489,17 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>>                         next_balance = rq->next_balance;
>>                         update_next_balance = 1;
>>                 }
>
>
> Why do you do this cpu_load_update_idle(rq) even this was called with
> CPU_NEWLY_IDLE? Wouldn't it be sufficient to jump to the curr_cost
> calculation in this case?

just to keep thing similar to what happen with kick_ilb and that's an
occasion to also update the cpu_load

>
>> +
>> +               domain_cost = sched_clock_cpu(this_cpu) - t0;
>> +               curr_cost += domain_cost;
>> +
>>         }
>>   -     update_blocked_averages(this_cpu);
>> -       has_blocked_load |= this_rq->has_blocked_load;
>> +       /* Newly idle CPU doesn't need an update */
>> +       if (idle != CPU_NEWLY_IDLE) {
>> +               update_blocked_averages(this_cpu);
>> +               has_blocked_load |= this_rq->has_blocked_load;
>> +       }
>>         if (flags & NOHZ_BALANCE_KICK)
>>                 rebalance_domains(this_rq, CPU_IDLE);
>
>
> [...]

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-29 19:31                             ` Valentin Schneider
@ 2018-01-30  8:32                               ` Vincent Guittot
  2018-01-30 11:41                                 ` Valentin Schneider
  2018-02-01 18:16                               ` Peter Zijlstra
  1 sibling, 1 reply; 56+ messages in thread
From: Vincent Guittot @ 2018-01-30  8:32 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Peter Zijlstra, Morten Rasmussen, Ingo Molnar, linux-kernel,
	Brendan Jackman, Dietmar Eggemann, Morten Rasmussen

On 29 January 2018 at 20:31, Valentin Schneider
<valentin.schneider@arm.com> wrote:
> Hi Vincent, Peter,
>
> I've been running some tests on your patches (Peter's base + the 2 from
> Vincent). The results themselves are hosted at [1].
> The base of those tests is the same: a task ("accumulator") is ran for 5
> seconds (arbitrary value) to accumulate some load, then goes to sleep for .5
> seconds.
>
> I've set up 3 test scenarios:
>
> Update by nohz_balance_kick()
> -----------------------------
> Right before the "accumulator" task goes to sleep, a CPU-hogging task (100%
> utilization) is spawned on another CPU. It won't go idle so the only way to
> update the blocked load generated by "accumulator" is to kick an ILB
> (NOHZ_STATS_KICK).
>
> The test shows that this is behaving nicely - we keep kicking an ILB every
> ~36ms (see next test for comments on that) until there is no more blocked
> load. I did however notice some interesting scenarios: after the load has
> been fully decayed, a tiny background task can spawn and end in less than a
> scheduling period. However, it still goes through nohz_balance_enter_idle(),
> and thus sets nohz.stats_state, which will later cause an ILB kick.
>
> This makes me wonder if it's worth kicking ILBs for such tiny load values -
> perhaps it could be worth having a margin to set rq->has_blocked_load ?

So it's difficult to know what will be the load/utilization on the
cfs_rq once the cpu wakes up. Even if it's for a really short time,
that's doesn't mean that the load/utilization is small because it can
be the migration of a big task that just have a very short wakes up
this time.
That's why I don't make any assumption on the utilization/load value
when a cpu goes to sleep

>
> Furthermore, this tiny task will cause the ILB to iterate over all of the
> idle CPUs, although only one has stale load. For load update via NEWLY_IDLE
> load_balance() we use:
>
> static bool update_nohz_stats(struct rq *rq)
> {
>     if (!rq->has_blocked_load)
>      return false;
>     [...]
> }
>
> But for load update via _nohz_idle_balance(), we iterate through all of the
> nohz CPUS and unconditionally call update_blocked_averages(). This could be
> avoided by remembering which CPUs have stale load before going idle.
> Initially I thought that was what nohz.stats_state was for, but it isn't.
> With Vincent's patches it's only ever set to either 0 or 1, but we could use
> it as a CPU mask, and use it to skip nohz CPUs that don't have stale load in
> _nohz_idle_balance() (when NOHZ_STATS_KICK).

I have studied a way to keep track of how many cpus still have blocked
load to try to minimize the number of useless ilb kick but this add
more atomic operations which can impact the system throughput with
heavy load and lot of very small wake up. that's why i have propose
this solution which is more simple. But it's probably just a matter of
where we want to "waste" time. Either we accept to spent a bit more
time to check the state of idle CPUs or we accept to kick ilb from
time to time for no good reason.

>
> Update by idle_balance()
> ------------------------
> Right before the "accumulator" task goes to sleep, a tiny periodic
> (period=32ms) task is spawned on another CPU. It's expected that it will
> update the blocked load in idle_balance(), either by running
> _nohz_idle_balance() locally or kicking an ILB (The overload flag shouldn't
> be set in this test case, so we shouldn't go through the NEWLY_IDLE
> load_balance()).
>
> This also seems to be working fine, but I'm noticing a delay between load
> updates that is closer to 64ms than 32ms. After digging into it I found out
> that the time checks done in idle_balance() and nohz_balancer_kick() are
> time_after(jiffies, next_stats), but IMHO they should be
> time_after_eq(jiffies, next_stats) to have 32ms-based updates. This also
> explains the 36ms periodicity of the updates in the test above.

I have use the 32ms as a minimum value between update. We must use the
time_after()  if we want to have at least 32ms between each update. We
will have a 36ms period if the previous update was triggered by the
tick (just after in fact) but there will be only 32ms if the last
update was done during an idle_balance that happens just before the
tick. With  time_after_eq,  the update period will between 28 and
32ms.

Then, I mention a possible optimization by using time_after_eq in the
idle_balance() so a newly_idle cpu will have more chance (between 0
and 4ms for hz250) to do the update before a ilb is kicked

Thanks,
Vincent

>
>
> No update (idle system)
> -----------------------
> Nothing special here, just making sure nothing happens when the system is
> fully idle. On a sidenote, that's relatively hard to achieve - I had to
> switch over to Juno because my HiKey960 gets interrupts every 16ms. The Juno
> still gets woken up every now and then but it's a bit quieter.
>
>
> [1]: https://gist.github.com/valschneider/a8da7bb8e11fb1ec63a419710f56c0a0
>
>
>
> On 01/24/2018 08:25 AM, Vincent Guittot wrote:
>>
>> Hi,
>>
>> Le Thursday 18 Jan 2018 à 10:38:07 (+0000), Morten Rasmussen a écrit :
>>>
>>> On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:
>>>>
>>>> Le Wednesday 03 Jan 2018 à 10:16:00 (+0100), Vincent Guittot a écrit :
>>>>>
>>>>> Hi Peter,
>>>>>
>>>>> On 22 December 2017 at 21:42, Peter Zijlstra <peterz@infradead.org>
>>>>> wrote:
>>>>>>
>>>>>> On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
>>>>>>>
>>>>>>> Right; but I figured we'd try and do it 'right' and see how horrible
>>>>>>> it
>>>>>>> is before we try and do funny things.
>>>>>>
>>>>>> So now it should have a 32ms tick for up to .5s when the system goes
>>>>>> completely idle.
>>>>>>
>>>>>> No idea how bad that is..
>>>>>
>>>>> I have tested your branch but the timer doesn't seem to fire correctly
>>>>> because i can still see blocked load in the use case i have run.
>>>>> I haven't found the reason yet
>>>>
>>>> Hi Peter,
>>>>
>>>> With the patch below on top of your branch, the blocked loads are
>>>> updated and
>>>> decayed regularly. The main differences are:
>>>> - It doesn't use a timer to trig ilb but the tick and when a cpu becomes
>>>> idle.
>>>>    The main drawback of this solution is that the load is blocked when
>>>> the
>>>>    system is fully idle with the advantage of not waking up a fully idle
>>>>    system. We have to wait for the next tick or newly idle event for
>>>> updating
>>>>    blocked load when the system leaves idle stat which can be up to a
>>>> tick long.
>>>>    If this is too long, we can check for kicking ilb when task wakes up
>>>> so the
>>>>    blocked load will be updated as soon as the system leaves idle state.
>>>>    The main advantage is that we don't wake up a fully idle system every
>>>> 32ms to
>>>>    update blocked load that will be not used.
>>>> - I'm working on one more improvement to use nohz_idle_balance in the
>>>> newly
>>>>    idle case when the system is not overloaded and
>>>>    (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we
>>>> can try to
>>>>    use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it
>>>> exceed
>>>>    this_rq->avg_idle. This will remove some calls to kick_ilb and some
>>>> wake up
>>>>    of an idle cpus.
>>>
>>> This sound like what I meant in my other reply :-)
>>>
>>> It seems pointless to have a timer to update PELT if the system is
>>> completely idle, and when it isn't we can piggy back other events to
>>> make the updates happen.
>>
>> The patch below implements what has been described above. It calls part of
>> nohz_idle_balance when a cpu becomes idle and kick a ilb if it takes too
>> much
>> time. This removes part of ilb that are kicked on an idle cpu for updating
>> the blocked load but the ratio really depends on when the tick happens
>> compared
>> to a cpu becoming idle and the 32ms boundary. I have an additionnal patch
>> that
>> enables to update the blocked loads when a cpu becomes idle 1 period
>> before
>> kicking an ilb and there is far less ilb because we give more chance to
>> the
>> newly idle case (time_after is replaced by time_after_eq in
>> idle_balance()).
>>
>> The patch also uses a function cfs_rq_has_blocked, which only checks the
>> util/load_avg, instead of the cfs_rq_is_decayed which check *_sum too.
>> This
>> reduce significantly the number of update of blocked load. the *_avg will
>> be
>> fully decayed in around 300~400ms but it's far longer for the *_sum which
>> have
>> a higher resolution and we can easily reach almost seconds. But only the
>> *_avg
>> are used to make decision so keeping some blocked *_sum is acceptable.
>>
>> ---
>>   kernel/sched/fair.c | 121
>> +++++++++++++++++++++++++++++++++++++++-------------
>>   1 file changed, 92 insertions(+), 29 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 898785d..ed90303 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -7356,6 +7356,17 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq
>> *cfs_rq)
>>         return true;
>>   }
>>   +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
>> +{
>> +       if (cfs_rq->avg.load_avg)
>> +               return true;
>> +
>> +       if (cfs_rq->avg.util_avg)
>> +               return true;
>> +
>> +       return false;
>> +}
>> +
>>   #ifdef CONFIG_FAIR_GROUP_SCHED
>>     static void update_blocked_averages(int cpu)
>> @@ -7393,7 +7404,9 @@ static void update_blocked_averages(int cpu)
>>                  */
>>                 if (cfs_rq_is_decayed(cfs_rq))
>>                         list_del_leaf_cfs_rq(cfs_rq);
>> -               else
>> +
>> +               /* Don't need periodic decay once load/util_avg are null
>> */
>> +               if (cfs_rq_has_blocked(cfs_rq))
>>                         done = false;
>>         }
>>   @@ -7463,7 +7476,7 @@ static inline void update_blocked_averages(int
>> cpu)
>>         update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>>   #ifdef CONFIG_NO_HZ_COMMON
>>         rq->last_blocked_load_update_tick = jiffies;
>> -       if (cfs_rq_is_decayed(cfs_rq))
>> +       if (cfs_rq_has_blocked(cfs_rq))
>>                 rq->has_blocked_load = 0;
>>   #endif
>>         rq_unlock_irqrestore(rq, &rf);
>> @@ -8818,6 +8831,7 @@ update_next_balance(struct sched_domain *sd,
>> unsigned long *next_balance)
>>                 *next_balance = next;
>>   }
>>   +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>> enum cpu_idle_type idle);
>>   static void kick_ilb(unsigned int flags);
>>     /*
>> @@ -8861,7 +8875,14 @@ static int idle_balance(struct rq *this_rq, struct
>> rq_flags *rf)
>>                         update_next_balance(sd, &next_balance);
>>                 rcu_read_unlock();
>>   -             if (time_after(jiffies, next) &&
>> atomic_read(&nohz.stats_state))
>> +               /*
>> +                * Update blocked idle load if it has not been done for a
>> +                * while. Try to do it locally before entering idle but
>> kick a
>> +                * ilb if it takes too much time and might delay next
>> local
>> +                * wake up
>> +                */
>> +               if (time_after(jiffies, next) &&
>> atomic_read(&nohz.stats_state) &&
>> +                               !_nohz_idle_balance(this_rq,
>> NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
>>                         kick_ilb(NOHZ_STATS_KICK);
>>                 goto out;
>> @@ -9237,6 +9258,7 @@ void nohz_balance_enter_idle(int cpu)
>>         if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
>>                 return;
>>   +     rq->has_blocked_load = 1;
>>         if (rq->nohz_tick_stopped)
>>                 return;
>>   @@ -9247,7 +9269,6 @@ void nohz_balance_enter_idle(int cpu)
>>                 return;
>>         rq->nohz_tick_stopped = 1;
>> -       rq->has_blocked_load = 1;
>>         cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
>>         atomic_inc(&nohz.nr_cpus);
>> @@ -9259,7 +9280,6 @@ void nohz_balance_enter_idle(int cpu)
>>          * enable the periodic update of the load of idle cpus
>>          */
>>         atomic_set(&nohz.stats_state, 1);
>> -
>>   }
>>   #else
>>   static inline void nohz_balancer_kick(struct rq *rq) { }
>> @@ -9385,10 +9405,13 @@ static void rebalance_domains(struct rq *rq, enum
>> cpu_idle_type idle)
>>     #ifdef CONFIG_NO_HZ_COMMON
>>   /*
>> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
>> + * Internal function that runs load balance for all idle cpus. The load
>> balance
>> + * can be a simple update of blocked load or a complete load balance with
>> + * tasks movement depending of flags.
>> + * For newly idle mode, we abort the loop if it takes too much time and
>> return
>> + * false to notify that the loop has not be completed and a ilb shoud be
>> kick.
>>    */
>> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type
>> idle)
>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>> enum cpu_idle_type idle)
>>   {
>>         /* Earliest time when we have to do rebalance again */
>>         unsigned long now = jiffies;
>> @@ -9396,24 +9419,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>>         bool has_blocked_load = false;
>>         int update_next_balance = 0;
>>         int this_cpu = this_rq->cpu;
>> -       unsigned int flags;
>>         int balance_cpu;
>> +       int ret = false;
>>         struct rq *rq;
>> -
>> -       if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>> -               return false;
>> -
>> -       if (idle != CPU_IDLE) {
>> -               atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> -               return false;
>> -       }
>> -
>> -       /*
>> -        * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>> -        */
>> -       flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> -       if (!(flags & NOHZ_KICK_MASK))
>> -               return false;
>> +       u64 curr_cost = 0;
>>         SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>   @@ -9428,6 +9437,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>>         atomic_set(&nohz.stats_state, 0);
>>         for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>> +               u64 t0, domain_cost;
>> +
>> +               t0 = sched_clock_cpu(this_cpu);
>> +
>>                 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>>                         continue;
>>   @@ -9438,7 +9451,17 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>>                  */
>>                 if (need_resched()) {
>>                         has_blocked_load = true;
>> -                       break;
>> +                       goto abort;
>> +               }
>> +
>> +               /*
>> +                * If the update is done while CPU becomes idle, we abort
>> +                * the update when its cost is higher than the average
>> idle
>> +                * time in orde to not delay a possible wake up.
>> +                */
>> +               if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle <
>> curr_cost) {
>> +                       has_blocked_load = true;
>> +                       goto abort;
>>                 }
>>                 rq = cpu_rq(balance_cpu);
>> @@ -9453,10 +9476,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>>                 if (time_after_eq(jiffies, rq->next_balance)) {
>>                         struct rq_flags rf;
>>   -                     rq_lock_irq(rq, &rf);
>> +                       rq_lock_irqsave(rq, &rf);
>>                         update_rq_clock(rq);
>>                         cpu_load_update_idle(rq);
>> -                       rq_unlock_irq(rq, &rf);
>> +                       rq_unlock_irqrestore(rq, &rf);
>>                         if (flags & NOHZ_BALANCE_KICK)
>>                                 rebalance_domains(rq, CPU_IDLE);
>> @@ -9466,10 +9489,17 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>>                         next_balance = rq->next_balance;
>>                         update_next_balance = 1;
>>                 }
>> +
>> +               domain_cost = sched_clock_cpu(this_cpu) - t0;
>> +               curr_cost += domain_cost;
>> +
>>         }
>>   -     update_blocked_averages(this_cpu);
>> -       has_blocked_load |= this_rq->has_blocked_load;
>> +       /* Newly idle CPU doesn't need an update */
>> +       if (idle != CPU_NEWLY_IDLE) {
>> +               update_blocked_averages(this_cpu);
>> +               has_blocked_load |= this_rq->has_blocked_load;
>> +       }
>>         if (flags & NOHZ_BALANCE_KICK)
>>                 rebalance_domains(this_rq, CPU_IDLE);
>> @@ -9477,6 +9507,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>>         WRITE_ONCE(nohz.next_stats,
>>                 now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>>   +     /* The full idle balance loop has been done */
>> +       ret = true;
>> +
>> +abort:
>>         /* There is still blocked load, enable periodic update */
>>         if (has_blocked_load)
>>                 atomic_set(&nohz.stats_state, 1);
>> @@ -9489,6 +9523,35 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>>         if (likely(update_next_balance))
>>                 nohz.next_balance = next_balance;
>>   +     return ret;
>> +}
>> +
>> +/*
>> + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>> + * rebalancing for all the cpus for whom scheduler ticks are stopped.
>> + */
>> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type
>> idle)
>> +{
>> +       int this_cpu = this_rq->cpu;
>> +       unsigned int flags;
>> +
>> +       if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>> +               return false;
>> +
>> +       if (idle != CPU_IDLE) {
>> +               atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> +               return false;
>> +       }
>> +
>> +       /*
>> +        * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>> +        */
>> +       flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> +       if (!(flags & NOHZ_KICK_MASK))
>> +               return false;
>> +
>> +       _nohz_idle_balance(this_rq, flags, idle);
>> +
>>         return true;
>>   }
>>   #else
>
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-30  8:32                               ` Vincent Guittot
@ 2018-01-30 11:41                                 ` Valentin Schneider
  2018-01-30 13:05                                   ` Vincent Guittot
  2018-02-05 22:18                                   ` Valentin Schneider
  0 siblings, 2 replies; 56+ messages in thread
From: Valentin Schneider @ 2018-01-30 11:41 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Peter Zijlstra, Morten Rasmussen, Ingo Molnar, linux-kernel,
	Brendan Jackman, Dietmar Eggemann, Morten Rasmussen

(Resending because I snuck in some HTML... Apologies)

On 01/30/2018 08:32 AM, Vincent Guittot wrote:
> On 29 January 2018 at 20:31, Valentin Schneider
> <valentin.schneider@arm.com> wrote:
>> Hi Vincent, Peter,
>>
>> I've been running some tests on your patches (Peter's base + the 2 from
>> Vincent). The results themselves are hosted at [1].
>> The base of those tests is the same: a task ("accumulator") is ran for 5
>> seconds (arbitrary value) to accumulate some load, then goes to sleep for .5
>> seconds.
>>
>> I've set up 3 test scenarios:
>>
>> Update by nohz_balance_kick()
>> -----------------------------
>> Right before the "accumulator" task goes to sleep, a CPU-hogging task (100%
>> utilization) is spawned on another CPU. It won't go idle so the only way to
>> update the blocked load generated by "accumulator" is to kick an ILB
>> (NOHZ_STATS_KICK).
>>
>> The test shows that this is behaving nicely - we keep kicking an ILB every
>> ~36ms (see next test for comments on that) until there is no more blocked
>> load. I did however notice some interesting scenarios: after the load has
>> been fully decayed, a tiny background task can spawn and end in less than a
>> scheduling period. However, it still goes through nohz_balance_enter_idle(),
>> and thus sets nohz.stats_state, which will later cause an ILB kick.
>>
>> This makes me wonder if it's worth kicking ILBs for such tiny load values -
>> perhaps it could be worth having a margin to set rq->has_blocked_load ?
> 
> So it's difficult to know what will be the load/utilization on the
> cfs_rq once the cpu wakes up. Even if it's for a really short time,
> that's doesn't mean that the load/utilization is small because it can
> be the migration of a big task that just have a very short wakes up
> this time.
> That's why I don't make any assumption on the utilization/load value
> when a cpu goes to sleep
> 

Right, hadn't thought about those kind of migrations.

>>
>> Furthermore, this tiny task will cause the ILB to iterate over all of the
>> idle CPUs, although only one has stale load. For load update via NEWLY_IDLE
>> load_balance() we use:
>>
>> static bool update_nohz_stats(struct rq *rq)
>> {
>>      if (!rq->has_blocked_load)
>>       return false;
>>      [...]
>> }
>>
>> But for load update via _nohz_idle_balance(), we iterate through all of the
>> nohz CPUS and unconditionally call update_blocked_averages(). This could be
>> avoided by remembering which CPUs have stale load before going idle.
>> Initially I thought that was what nohz.stats_state was for, but it isn't.
>> With Vincent's patches it's only ever set to either 0 or 1, but we could use
>> it as a CPU mask, and use it to skip nohz CPUs that don't have stale load in
>> _nohz_idle_balance() (when NOHZ_STATS_KICK).
> 
> I have studied a way to keep track of how many cpus still have blocked
> load to try to minimize the number of useless ilb kick but this add
> more atomic operations which can impact the system throughput with
> heavy load and lot of very small wake up. that's why i have propose
> this solution which is more simple. But it's probably just a matter of
> where we want to "waste" time. Either we accept to spent a bit more
> time to check the state of idle CPUs or we accept to kick ilb from
> time to time for no good reason.
> 

Agreed. I have the feeling that spending more time doing atomic ops 
could be worth it - I'll try to test this out and see if it's actually 
relevant.

>>
>> Update by idle_balance()
>> ------------------------
>> Right before the "accumulator" task goes to sleep, a tiny periodic
>> (period=32ms) task is spawned on another CPU. It's expected that it will
>> update the blocked load in idle_balance(), either by running
>> _nohz_idle_balance() locally or kicking an ILB (The overload flag shouldn't
>> be set in this test case, so we shouldn't go through the NEWLY_IDLE
>> load_balance()).
>>
>> This also seems to be working fine, but I'm noticing a delay between load
>> updates that is closer to 64ms than 32ms. After digging into it I found out
>> that the time checks done in idle_balance() and nohz_balancer_kick() are
>> time_after(jiffies, next_stats), but IMHO they should be
>> time_after_eq(jiffies, next_stats) to have 32ms-based updates. This also
>> explains the 36ms periodicity of the updates in the test above.
> 
> I have use the 32ms as a minimum value between update. We must use the
> time_after()  if we want to have at least 32ms between each update. We
> will have a 36ms period if the previous update was triggered by the
> tick (just after in fact) but there will be only 32ms if the last
> update was done during an idle_balance that happens just before the
> tick. With  time_after_eq,  the update period will between 28 and
> 32ms.
> 
> Then, I mention a possible optimization by using time_after_eq in the
> idle_balance() so a newly_idle cpu will have more chance (between 0
> and 4ms for hz250) to do the update before a ilb is kicked
> 

IIUC with time_after() the update period should be within ]32, 36] ms, 
but it looks like I'm always on that upper bound in my tests.

When evaluating whether we need to kick_ilb() for load updates, we'll 
always be right after the tick (excluding the case in idle_balance), 
which explains why we wait for an extra tick in the "update by 
nohz_balancer_kick()" test case.

The tricky part is that, as you say, the update by idle_balance() can 
happen anywhere between [0-4[ ms after a tick (or before, depending on 
how you see it), so using time_after_eq could make the update period < 
32ms - and this also impacts a load update by nohz_balance_kick() if the 
previous update was done by idle_balance()... This is what causes the 
update period to be closer to 64ms in my test case, but it's somewhat 
artificial because I only have a 32ms-periodic task running - if there 
was any other task running the period could remain in that ]32, 36] ms 
interval.

Did I get that right ?

> Thanks,
> Vincent
> 
>>
>>
>> No update (idle system)
>> -----------------------
>> Nothing special here, just making sure nothing happens when the system is
>> fully idle. On a sidenote, that's relatively hard to achieve - I had to
>> switch over to Juno because my HiKey960 gets interrupts every 16ms. The Juno
>> still gets woken up every now and then but it's a bit quieter.
>>
>>
>> [1]: https://gist.github.com/valschneider/a8da7bb8e11fb1ec63a419710f56c0a0
>>
>>
>>
>> On 01/24/2018 08:25 AM, Vincent Guittot wrote:
>>>
>>> Hi,
>>>
>>> Le Thursday 18 Jan 2018 à 10:38:07 (+0000), Morten Rasmussen a écrit :
>>>>
>>>> On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:
>>>>>
>>>>> Le Wednesday 03 Jan 2018 à 10:16:00 (+0100), Vincent Guittot a écrit :
>>>>>>
>>>>>> Hi Peter,
>>>>>>
>>>>>> On 22 December 2017 at 21:42, Peter Zijlstra <peterz@infradead.org>
>>>>>> wrote:
>>>>>>>
>>>>>>> On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
>>>>>>>>
>>>>>>>> Right; but I figured we'd try and do it 'right' and see how horrible
>>>>>>>> it
>>>>>>>> is before we try and do funny things.
>>>>>>>
>>>>>>> So now it should have a 32ms tick for up to .5s when the system goes
>>>>>>> completely idle.
>>>>>>>
>>>>>>> No idea how bad that is..
>>>>>>
>>>>>> I have tested your branch but the timer doesn't seem to fire correctly
>>>>>> because i can still see blocked load in the use case i have run.
>>>>>> I haven't found the reason yet
>>>>>
>>>>> Hi Peter,
>>>>>
>>>>> With the patch below on top of your branch, the blocked loads are
>>>>> updated and
>>>>> decayed regularly. The main differences are:
>>>>> - It doesn't use a timer to trig ilb but the tick and when a cpu becomes
>>>>> idle.
>>>>>     The main drawback of this solution is that the load is blocked when
>>>>> the
>>>>>     system is fully idle with the advantage of not waking up a fully idle
>>>>>     system. We have to wait for the next tick or newly idle event for
>>>>> updating
>>>>>     blocked load when the system leaves idle stat which can be up to a
>>>>> tick long.
>>>>>     If this is too long, we can check for kicking ilb when task wakes up
>>>>> so the
>>>>>     blocked load will be updated as soon as the system leaves idle state.
>>>>>     The main advantage is that we don't wake up a fully idle system every
>>>>> 32ms to
>>>>>     update blocked load that will be not used.
>>>>> - I'm working on one more improvement to use nohz_idle_balance in the
>>>>> newly
>>>>>     idle case when the system is not overloaded and
>>>>>     (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we
>>>>> can try to
>>>>>     use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it
>>>>> exceed
>>>>>     this_rq->avg_idle. This will remove some calls to kick_ilb and some
>>>>> wake up
>>>>>     of an idle cpus.
>>>>
>>>> This sound like what I meant in my other reply :-)
>>>>
>>>> It seems pointless to have a timer to update PELT if the system is
>>>> completely idle, and when it isn't we can piggy back other events to
>>>> make the updates happen.
>>>
>>> The patch below implements what has been described above. It calls part of
>>> nohz_idle_balance when a cpu becomes idle and kick a ilb if it takes too
>>> much
>>> time. This removes part of ilb that are kicked on an idle cpu for updating
>>> the blocked load but the ratio really depends on when the tick happens
>>> compared
>>> to a cpu becoming idle and the 32ms boundary. I have an additionnal patch
>>> that
>>> enables to update the blocked loads when a cpu becomes idle 1 period
>>> before
>>> kicking an ilb and there is far less ilb because we give more chance to
>>> the
>>> newly idle case (time_after is replaced by time_after_eq in
>>> idle_balance()).
>>>
>>> The patch also uses a function cfs_rq_has_blocked, which only checks the
>>> util/load_avg, instead of the cfs_rq_is_decayed which check *_sum too.
>>> This
>>> reduce significantly the number of update of blocked load. the *_avg will
>>> be
>>> fully decayed in around 300~400ms but it's far longer for the *_sum which
>>> have
>>> a higher resolution and we can easily reach almost seconds. But only the
>>> *_avg
>>> are used to make decision so keeping some blocked *_sum is acceptable.
>>>
>>> ---
>>>    kernel/sched/fair.c | 121
>>> +++++++++++++++++++++++++++++++++++++++-------------
>>>    1 file changed, 92 insertions(+), 29 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 898785d..ed90303 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -7356,6 +7356,17 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq
>>> *cfs_rq)
>>>          return true;
>>>    }
>>>    +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
>>> +{
>>> +       if (cfs_rq->avg.load_avg)
>>> +               return true;
>>> +
>>> +       if (cfs_rq->avg.util_avg)
>>> +               return true;
>>> +
>>> +       return false;
>>> +}
>>> +
>>>    #ifdef CONFIG_FAIR_GROUP_SCHED
>>>      static void update_blocked_averages(int cpu)
>>> @@ -7393,7 +7404,9 @@ static void update_blocked_averages(int cpu)
>>>                   */
>>>                  if (cfs_rq_is_decayed(cfs_rq))
>>>                          list_del_leaf_cfs_rq(cfs_rq);
>>> -               else
>>> +
>>> +               /* Don't need periodic decay once load/util_avg are null
>>> */
>>> +               if (cfs_rq_has_blocked(cfs_rq))
>>>                          done = false;
>>>          }
>>>    @@ -7463,7 +7476,7 @@ static inline void update_blocked_averages(int
>>> cpu)
>>>          update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>>>    #ifdef CONFIG_NO_HZ_COMMON
>>>          rq->last_blocked_load_update_tick = jiffies;
>>> -       if (cfs_rq_is_decayed(cfs_rq))
>>> +       if (cfs_rq_has_blocked(cfs_rq))
>>>                  rq->has_blocked_load = 0;
>>>    #endif
>>>          rq_unlock_irqrestore(rq, &rf);
>>> @@ -8818,6 +8831,7 @@ update_next_balance(struct sched_domain *sd,
>>> unsigned long *next_balance)
>>>                  *next_balance = next;
>>>    }
>>>    +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>>> enum cpu_idle_type idle);
>>>    static void kick_ilb(unsigned int flags);
>>>      /*
>>> @@ -8861,7 +8875,14 @@ static int idle_balance(struct rq *this_rq, struct
>>> rq_flags *rf)
>>>                          update_next_balance(sd, &next_balance);
>>>                  rcu_read_unlock();
>>>    -             if (time_after(jiffies, next) &&
>>> atomic_read(&nohz.stats_state))
>>> +               /*
>>> +                * Update blocked idle load if it has not been done for a
>>> +                * while. Try to do it locally before entering idle but
>>> kick a
>>> +                * ilb if it takes too much time and might delay next
>>> local
>>> +                * wake up
>>> +                */
>>> +               if (time_after(jiffies, next) &&
>>> atomic_read(&nohz.stats_state) &&
>>> +                               !_nohz_idle_balance(this_rq,
>>> NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
>>>                          kick_ilb(NOHZ_STATS_KICK);
>>>                  goto out;
>>> @@ -9237,6 +9258,7 @@ void nohz_balance_enter_idle(int cpu)
>>>          if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
>>>                  return;
>>>    +     rq->has_blocked_load = 1;
>>>          if (rq->nohz_tick_stopped)
>>>                  return;
>>>    @@ -9247,7 +9269,6 @@ void nohz_balance_enter_idle(int cpu)
>>>                  return;
>>>          rq->nohz_tick_stopped = 1;
>>> -       rq->has_blocked_load = 1;
>>>          cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
>>>          atomic_inc(&nohz.nr_cpus);
>>> @@ -9259,7 +9280,6 @@ void nohz_balance_enter_idle(int cpu)
>>>           * enable the periodic update of the load of idle cpus
>>>           */
>>>          atomic_set(&nohz.stats_state, 1);
>>> -
>>>    }
>>>    #else
>>>    static inline void nohz_balancer_kick(struct rq *rq) { }
>>> @@ -9385,10 +9405,13 @@ static void rebalance_domains(struct rq *rq, enum
>>> cpu_idle_type idle)
>>>      #ifdef CONFIG_NO_HZ_COMMON
>>>    /*
>>> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>>> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
>>> + * Internal function that runs load balance for all idle cpus. The load
>>> balance
>>> + * can be a simple update of blocked load or a complete load balance with
>>> + * tasks movement depending of flags.
>>> + * For newly idle mode, we abort the loop if it takes too much time and
>>> return
>>> + * false to notify that the loop has not be completed and a ilb shoud be
>>> kick.
>>>     */
>>> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type
>>> idle)
>>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>>> enum cpu_idle_type idle)
>>>    {
>>>          /* Earliest time when we have to do rebalance again */
>>>          unsigned long now = jiffies;
>>> @@ -9396,24 +9419,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>>> enum cpu_idle_type idle)
>>>          bool has_blocked_load = false;
>>>          int update_next_balance = 0;
>>>          int this_cpu = this_rq->cpu;
>>> -       unsigned int flags;
>>>          int balance_cpu;
>>> +       int ret = false;
>>>          struct rq *rq;
>>> -
>>> -       if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>>> -               return false;
>>> -
>>> -       if (idle != CPU_IDLE) {
>>> -               atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> -               return false;
>>> -       }
>>> -
>>> -       /*
>>> -        * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>>> -        */
>>> -       flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> -       if (!(flags & NOHZ_KICK_MASK))
>>> -               return false;
>>> +       u64 curr_cost = 0;
>>>          SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>>    @@ -9428,6 +9437,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>>> enum cpu_idle_type idle)
>>>          atomic_set(&nohz.stats_state, 0);
>>>          for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>>> +               u64 t0, domain_cost;
>>> +
>>> +               t0 = sched_clock_cpu(this_cpu);
>>> +
>>>                  if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>>>                          continue;
>>>    @@ -9438,7 +9451,17 @@ static bool nohz_idle_balance(struct rq *this_rq,
>>> enum cpu_idle_type idle)
>>>                   */
>>>                  if (need_resched()) {
>>>                          has_blocked_load = true;
>>> -                       break;
>>> +                       goto abort;
>>> +               }
>>> +
>>> +               /*
>>> +                * If the update is done while CPU becomes idle, we abort
>>> +                * the update when its cost is higher than the average
>>> idle
>>> +                * time in orde to not delay a possible wake up.
>>> +                */
>>> +               if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle <
>>> curr_cost) {
>>> +                       has_blocked_load = true;
>>> +                       goto abort;
>>>                  }
>>>                  rq = cpu_rq(balance_cpu);
>>> @@ -9453,10 +9476,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>>> enum cpu_idle_type idle)
>>>                  if (time_after_eq(jiffies, rq->next_balance)) {
>>>                          struct rq_flags rf;
>>>    -                     rq_lock_irq(rq, &rf);
>>> +                       rq_lock_irqsave(rq, &rf);
>>>                          update_rq_clock(rq);
>>>                          cpu_load_update_idle(rq);
>>> -                       rq_unlock_irq(rq, &rf);
>>> +                       rq_unlock_irqrestore(rq, &rf);
>>>                          if (flags & NOHZ_BALANCE_KICK)
>>>                                  rebalance_domains(rq, CPU_IDLE);
>>> @@ -9466,10 +9489,17 @@ static bool nohz_idle_balance(struct rq *this_rq,
>>> enum cpu_idle_type idle)
>>>                          next_balance = rq->next_balance;
>>>                          update_next_balance = 1;
>>>                  }
>>> +
>>> +               domain_cost = sched_clock_cpu(this_cpu) - t0;
>>> +               curr_cost += domain_cost;
>>> +
>>>          }
>>>    -     update_blocked_averages(this_cpu);
>>> -       has_blocked_load |= this_rq->has_blocked_load;
>>> +       /* Newly idle CPU doesn't need an update */
>>> +       if (idle != CPU_NEWLY_IDLE) {
>>> +               update_blocked_averages(this_cpu);
>>> +               has_blocked_load |= this_rq->has_blocked_load;
>>> +       }
>>>          if (flags & NOHZ_BALANCE_KICK)
>>>                  rebalance_domains(this_rq, CPU_IDLE);
>>> @@ -9477,6 +9507,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>>> enum cpu_idle_type idle)
>>>          WRITE_ONCE(nohz.next_stats,
>>>                  now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>>>    +     /* The full idle balance loop has been done */
>>> +       ret = true;
>>> +
>>> +abort:
>>>          /* There is still blocked load, enable periodic update */
>>>          if (has_blocked_load)
>>>                  atomic_set(&nohz.stats_state, 1);
>>> @@ -9489,6 +9523,35 @@ static bool nohz_idle_balance(struct rq *this_rq,
>>> enum cpu_idle_type idle)
>>>          if (likely(update_next_balance))
>>>                  nohz.next_balance = next_balance;
>>>    +     return ret;
>>> +}
>>> +
>>> +/*
>>> + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>>> + * rebalancing for all the cpus for whom scheduler ticks are stopped.
>>> + */
>>> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type
>>> idle)
>>> +{
>>> +       int this_cpu = this_rq->cpu;
>>> +       unsigned int flags;
>>> +
>>> +       if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>>> +               return false;
>>> +
>>> +       if (idle != CPU_IDLE) {
>>> +               atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> +               return false;
>>> +       }
>>> +
>>> +       /*
>>> +        * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>>> +        */
>>> +       flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> +       if (!(flags & NOHZ_KICK_MASK))
>>> +               return false;
>>> +
>>> +       _nohz_idle_balance(this_rq, flags, idle);
>>> +
>>>          return true;
>>>    }
>>>    #else
>>
>>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-30 11:41                                 ` Valentin Schneider
@ 2018-01-30 13:05                                   ` Vincent Guittot
  2018-02-05 22:18                                   ` Valentin Schneider
  1 sibling, 0 replies; 56+ messages in thread
From: Vincent Guittot @ 2018-01-30 13:05 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Peter Zijlstra, Morten Rasmussen, Ingo Molnar, linux-kernel,
	Brendan Jackman, Dietmar Eggemann, Morten Rasmussen

On 30 January 2018 at 12:41, Valentin Schneider
<valentin.schneider@arm.com> wrote:
> (Resending because I snuck in some HTML... Apologies)
>
> On 01/30/2018 08:32 AM, Vincent Guittot wrote:
>>
>> On 29 January 2018 at 20:31, Valentin Schneider
>> <valentin.schneider@arm.com> wrote:
>>>
>>> Hi Vincent, Peter,
>>>
>>> I've been running some tests on your patches (Peter's base + the 2 from
>>> Vincent). The results themselves are hosted at [1].
>>> The base of those tests is the same: a task ("accumulator") is ran for 5
>>> seconds (arbitrary value) to accumulate some load, then goes to sleep for
>>> .5
>>> seconds.
>>>
>>> I've set up 3 test scenarios:
>>>
>>> Update by nohz_balance_kick()
>>> -----------------------------
>>> Right before the "accumulator" task goes to sleep, a CPU-hogging task
>>> (100%
>>> utilization) is spawned on another CPU. It won't go idle so the only way
>>> to
>>> update the blocked load generated by "accumulator" is to kick an ILB
>>> (NOHZ_STATS_KICK).
>>>
>>> The test shows that this is behaving nicely - we keep kicking an ILB
>>> every
>>> ~36ms (see next test for comments on that) until there is no more blocked
>>> load. I did however notice some interesting scenarios: after the load has
>>> been fully decayed, a tiny background task can spawn and end in less than
>>> a
>>> scheduling period. However, it still goes through
>>> nohz_balance_enter_idle(),
>>> and thus sets nohz.stats_state, which will later cause an ILB kick.
>>>
>>> This makes me wonder if it's worth kicking ILBs for such tiny load values
>>> -
>>> perhaps it could be worth having a margin to set rq->has_blocked_load ?
>>
>>
>> So it's difficult to know what will be the load/utilization on the
>> cfs_rq once the cpu wakes up. Even if it's for a really short time,
>> that's doesn't mean that the load/utilization is small because it can
>> be the migration of a big task that just have a very short wakes up
>> this time.
>> That's why I don't make any assumption on the utilization/load value
>> when a cpu goes to sleep
>>
>
> Right, hadn't thought about those kind of migrations.
>
>>>
>>> Furthermore, this tiny task will cause the ILB to iterate over all of the
>>> idle CPUs, although only one has stale load. For load update via
>>> NEWLY_IDLE
>>> load_balance() we use:
>>>
>>> static bool update_nohz_stats(struct rq *rq)
>>> {
>>>      if (!rq->has_blocked_load)
>>>       return false;
>>>      [...]
>>> }
>>>
>>> But for load update via _nohz_idle_balance(), we iterate through all of
>>> the
>>> nohz CPUS and unconditionally call update_blocked_averages(). This could
>>> be
>>> avoided by remembering which CPUs have stale load before going idle.
>>> Initially I thought that was what nohz.stats_state was for, but it isn't.
>>> With Vincent's patches it's only ever set to either 0 or 1, but we could
>>> use
>>> it as a CPU mask, and use it to skip nohz CPUs that don't have stale load
>>> in
>>> _nohz_idle_balance() (when NOHZ_STATS_KICK).
>>
>>
>> I have studied a way to keep track of how many cpus still have blocked
>> load to try to minimize the number of useless ilb kick but this add
>> more atomic operations which can impact the system throughput with
>> heavy load and lot of very small wake up. that's why i have propose
>> this solution which is more simple. But it's probably just a matter of
>> where we want to "waste" time. Either we accept to spent a bit more
>> time to check the state of idle CPUs or we accept to kick ilb from
>> time to time for no good reason.
>>
>
> Agreed. I have the feeling that spending more time doing atomic ops could be
> worth it - I'll try to test this out and see if it's actually relevant.
>
>>>
>>> Update by idle_balance()
>>> ------------------------
>>> Right before the "accumulator" task goes to sleep, a tiny periodic
>>> (period=32ms) task is spawned on another CPU. It's expected that it will
>>> update the blocked load in idle_balance(), either by running
>>> _nohz_idle_balance() locally or kicking an ILB (The overload flag
>>> shouldn't
>>> be set in this test case, so we shouldn't go through the NEWLY_IDLE
>>> load_balance()).
>>>
>>> This also seems to be working fine, but I'm noticing a delay between load
>>> updates that is closer to 64ms than 32ms. After digging into it I found
>>> out
>>> that the time checks done in idle_balance() and nohz_balancer_kick() are
>>> time_after(jiffies, next_stats), but IMHO they should be
>>> time_after_eq(jiffies, next_stats) to have 32ms-based updates. This also
>>> explains the 36ms periodicity of the updates in the test above.
>>
>>
>> I have use the 32ms as a minimum value between update. We must use the
>> time_after()  if we want to have at least 32ms between each update. We
>> will have a 36ms period if the previous update was triggered by the
>> tick (just after in fact) but there will be only 32ms if the last
>> update was done during an idle_balance that happens just before the
>> tick. With  time_after_eq,  the update period will between 28 and
>> 32ms.
>>
>> Then, I mention a possible optimization by using time_after_eq in the
>> idle_balance() so a newly_idle cpu will have more chance (between 0
>> and 4ms for hz250) to do the update before a ilb is kicked
>>
>
> IIUC with time_after() the update period should be within ]32, 36] ms, but
> it looks like I'm always on that upper bound in my tests.
>
> When evaluating whether we need to kick_ilb() for load updates, we'll always
> be right after the tick (excluding the case in idle_balance), which explains
> why we wait for an extra tick in the "update by nohz_balancer_kick()" test
> case.
>
> The tricky part is that, as you say, the update by idle_balance() can happen
> anywhere between [0-4[ ms after a tick (or before, depending on how you see
> it), so using time_after_eq could make the update period < 32ms - and this
> also impacts a load update by nohz_balance_kick() if the previous update was
> done by idle_balance()... This is what causes the update period to be closer
> to 64ms in my test case, but it's somewhat artificial because I only have a
> 32ms-periodic task running - if there was any other task running the period
> could remain in that ]32, 36] ms interval.
>
> Did I get that right ?

yes

>
>> Thanks,
>> Vincent
>>
>>>
>>>
>>> No update (idle system)
>>> -----------------------
>>> Nothing special here, just making sure nothing happens when the system is
>>> fully idle. On a sidenote, that's relatively hard to achieve - I had to
>>> switch over to Juno because my HiKey960 gets interrupts every 16ms. The
>>> Juno
>>> still gets woken up every now and then but it's a bit quieter.
>>>
>>>
>>> [1]:
>>> https://gist.github.com/valschneider/a8da7bb8e11fb1ec63a419710f56c0a0
>>>
>>>
>>>


[snip]

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-15  8:26                       ` Vincent Guittot
  2018-01-18 10:38                         ` Morten Rasmussen
  2018-01-22  9:40                         ` Dietmar Eggemann
@ 2018-02-01 16:52                         ` Peter Zijlstra
  2018-02-01 17:25                           ` Vincent Guittot
  2 siblings, 1 reply; 56+ messages in thread
From: Peter Zijlstra @ 2018-02-01 16:52 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:

Would've probably been easier to read if you'd not included the revert
of that timer patch...

> @@ -9258,21 +9255,11 @@ void nohz_balance_enter_idle(int cpu)
>  	set_cpu_sd_state_idle(cpu);
>  
>  	/*
> -	 * implies a barrier such that if the stats_state update is observed
> -	 * the above updates are also visible. Pairs with stuff in
> -	 * update_sd_lb_stats() and nohz_idle_balance().
> +	 * Each time a cpu enter idle, we assume that it has blocked load and
> +	 * enable the periodic update of the load of idle cpus
>  	 */
> -	val = atomic_read(&nohz.stats_state);
> -	do {
> -		new = val + 2;
> -		new |= 1;
> -	} while (!atomic_try_cmpxchg(&nohz.stats_state, &val, new));
> +	atomic_set(&nohz.stats_state, 1);
>  

> @@ -9422,7 +9408,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  		return false;
>  	}
>  
> -	stats_seq = atomic_read(&nohz.stats_state);
>  	/*
>  	 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>  	 */
> @@ -9432,6 +9417,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  
>  	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>  
> +	/*
> +	 * We assume there will be no idle load after this update and clear
> +	 * the stats state. If a cpu enters idle in the mean time, it will
> +	 * set the stats state and trig another update of idle load.
> +	 * Because a cpu that becomes idle, is added to idle_cpus_mask before
> +	 * setting the stats state, we are sure to not clear the state and not
> +	 * check the load of an idle cpu.
> +	 */
> +	atomic_set(&nohz.stats_state, 0);
> +
>  	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>  		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>  			continue;
> @@ -9441,8 +9436,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  		 * work being done for other cpus. Next load
>  		 * balancing owner will pick it up.
>  		 */
> -		if (need_resched())
> +		if (need_resched()) {
> +			has_blocked_load = true;
>  			break;
> +		}
>  
>  		rq = cpu_rq(balance_cpu);
>  
> @@ -9477,12 +9474,12 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  	if (flags & NOHZ_BALANCE_KICK)
>  		rebalance_domains(this_rq, CPU_IDLE);
>  
> -	if (has_blocked_load ||
> -	    !atomic_try_cmpxchg(&nohz.stats_state, &stats_seq, 0)) {
> -		WRITE_ONCE(nohz.next_stats,
> -				now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> -		mod_timer(&nohz.timer, nohz.next_stats);
> -	}
> +	WRITE_ONCE(nohz.next_stats,
> +		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> +
> +	/* There is still blocked load, enable periodic update */
> +	if (has_blocked_load)
> +		atomic_set(&nohz.stats_state, 1);
>  
>  	/*
>  	 * next_balance will be updated only when there is a need.

After this there is no point for stats_state to be atomic anymore. Also
a better name.

Maybe if I drop the last two patches (and you re-introduce the bits
from: Subject: sched: Optimize nohz stats, that you do need) this all
becomes more readable?

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-18 10:38                         ` Morten Rasmussen
  2018-01-24  8:25                           ` Vincent Guittot
@ 2018-02-01 16:55                           ` Peter Zijlstra
  1 sibling, 0 replies; 56+ messages in thread
From: Peter Zijlstra @ 2018-02-01 16:55 UTC (permalink / raw)
  To: Morten Rasmussen
  Cc: Vincent Guittot, Ingo Molnar, linux-kernel, Brendan Jackman,
	Dietmar Eggemann, Morten Rasmussen

On Thu, Jan 18, 2018 at 10:38:07AM +0000, Morten Rasmussen wrote:
> It seems pointless to have a timer to update PELT if the system is
> completely idle, and when it isn't we can piggy back other events to
> make the updates happen.

Only if we do that update before making decisions based on the values.
The thing I was bothered by in the earlier patches was that wakeup would
use whatever current value and async kick something to go update.

That just seems wrong.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-24  8:25                           ` Vincent Guittot
  2018-01-29 18:43                             ` Dietmar Eggemann
  2018-01-29 19:31                             ` Valentin Schneider
@ 2018-02-01 16:57                             ` Peter Zijlstra
  2018-02-01 17:26                               ` Vincent Guittot
  2018-02-01 18:10                             ` Peter Zijlstra
  3 siblings, 1 reply; 56+ messages in thread
From: Peter Zijlstra @ 2018-02-01 16:57 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Morten Rasmussen, Ingo Molnar, linux-kernel, Brendan Jackman,
	Dietmar Eggemann, Morten Rasmussen

On Wed, Jan 24, 2018 at 09:25:36AM +0100, Vincent Guittot wrote:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 898785d..ed90303 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7356,6 +7356,17 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>  	return true;
>  }
>  
> +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
> +{
> +	if (cfs_rq->avg.load_avg)
> +		return true;
> +
> +	if (cfs_rq->avg.util_avg)
> +		return true;
> +
> +	return false;
> +}
> +
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  
>  static void update_blocked_averages(int cpu)
> @@ -7393,7 +7404,9 @@ static void update_blocked_averages(int cpu)
>  		 */
>  		if (cfs_rq_is_decayed(cfs_rq))
>  			list_del_leaf_cfs_rq(cfs_rq);
> -		else
> +
> +		/* Don't need periodic decay once load/util_avg are null */
> +		if (cfs_rq_has_blocked(cfs_rq))
>  			done = false;
>  	}
>  
> @@ -7463,7 +7476,7 @@ static inline void update_blocked_averages(int cpu)
>  	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>  #ifdef CONFIG_NO_HZ_COMMON
>  	rq->last_blocked_load_update_tick = jiffies;
> -	if (cfs_rq_is_decayed(cfs_rq))
> +	if (cfs_rq_has_blocked(cfs_rq))
>  		rq->has_blocked_load = 0;
>  #endif
>  	rq_unlock_irqrestore(rq, &rf);

OK makes sense; would've been even better as a separate patch :-)

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-02-01 16:52                         ` Peter Zijlstra
@ 2018-02-01 17:25                           ` Vincent Guittot
  0 siblings, 0 replies; 56+ messages in thread
From: Vincent Guittot @ 2018-02-01 17:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Brendan Jackman, Dietmar Eggemann,
	Morten Rasmussen

On 1 February 2018 at 17:52, Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:
>
> Would've probably been easier to read if you'd not included the revert
> of that timer patch...
>
>> @@ -9258,21 +9255,11 @@ void nohz_balance_enter_idle(int cpu)
>>       set_cpu_sd_state_idle(cpu);
>>
>>       /*
>> -      * implies a barrier such that if the stats_state update is observed
>> -      * the above updates are also visible. Pairs with stuff in
>> -      * update_sd_lb_stats() and nohz_idle_balance().
>> +      * Each time a cpu enter idle, we assume that it has blocked load and
>> +      * enable the periodic update of the load of idle cpus
>>        */
>> -     val = atomic_read(&nohz.stats_state);
>> -     do {
>> -             new = val + 2;
>> -             new |= 1;
>> -     } while (!atomic_try_cmpxchg(&nohz.stats_state, &val, new));
>> +     atomic_set(&nohz.stats_state, 1);
>>
>
>> @@ -9422,7 +9408,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>               return false;
>>       }
>>
>> -     stats_seq = atomic_read(&nohz.stats_state);
>>       /*
>>        * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>>        */
>> @@ -9432,6 +9417,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>
>>       SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>
>> +     /*
>> +      * We assume there will be no idle load after this update and clear
>> +      * the stats state. If a cpu enters idle in the mean time, it will
>> +      * set the stats state and trig another update of idle load.
>> +      * Because a cpu that becomes idle, is added to idle_cpus_mask before
>> +      * setting the stats state, we are sure to not clear the state and not
>> +      * check the load of an idle cpu.
>> +      */
>> +     atomic_set(&nohz.stats_state, 0);
>> +
>>       for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>>               if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>>                       continue;
>> @@ -9441,8 +9436,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>                * work being done for other cpus. Next load
>>                * balancing owner will pick it up.
>>                */
>> -             if (need_resched())
>> +             if (need_resched()) {
>> +                     has_blocked_load = true;
>>                       break;
>> +             }
>>
>>               rq = cpu_rq(balance_cpu);
>>
>> @@ -9477,12 +9474,12 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>       if (flags & NOHZ_BALANCE_KICK)
>>               rebalance_domains(this_rq, CPU_IDLE);
>>
>> -     if (has_blocked_load ||
>> -         !atomic_try_cmpxchg(&nohz.stats_state, &stats_seq, 0)) {
>> -             WRITE_ONCE(nohz.next_stats,
>> -                             now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> -             mod_timer(&nohz.timer, nohz.next_stats);
>> -     }
>> +     WRITE_ONCE(nohz.next_stats,
>> +             now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> +
>> +     /* There is still blocked load, enable periodic update */
>> +     if (has_blocked_load)
>> +             atomic_set(&nohz.stats_state, 1);
>>
>>       /*
>>        * next_balance will be updated only when there is a need.
>
> After this there is no point for stats_state to be atomic anymore. Also
> a better name.

Ok

>
> Maybe if I drop the last two patches (and you re-introduce the bits
> from: Subject: sched: Optimize nohz stats, that you do need) this all
> becomes more readable?

Yes. we can do like that

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-02-01 16:57                             ` Peter Zijlstra
@ 2018-02-01 17:26                               ` Vincent Guittot
  0 siblings, 0 replies; 56+ messages in thread
From: Vincent Guittot @ 2018-02-01 17:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Morten Rasmussen, Ingo Molnar, linux-kernel, Brendan Jackman,
	Dietmar Eggemann, Morten Rasmussen

On 1 February 2018 at 17:57, Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, Jan 24, 2018 at 09:25:36AM +0100, Vincent Guittot wrote:
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 898785d..ed90303 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -7356,6 +7356,17 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>>       return true;
>>  }
>>
>> +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
>> +{
>> +     if (cfs_rq->avg.load_avg)
>> +             return true;
>> +
>> +     if (cfs_rq->avg.util_avg)
>> +             return true;
>> +
>> +     return false;
>> +}
>> +
>>  #ifdef CONFIG_FAIR_GROUP_SCHED
>>
>>  static void update_blocked_averages(int cpu)
>> @@ -7393,7 +7404,9 @@ static void update_blocked_averages(int cpu)
>>                */
>>               if (cfs_rq_is_decayed(cfs_rq))
>>                       list_del_leaf_cfs_rq(cfs_rq);
>> -             else
>> +
>> +             /* Don't need periodic decay once load/util_avg are null */
>> +             if (cfs_rq_has_blocked(cfs_rq))
>>                       done = false;
>>       }
>>
>> @@ -7463,7 +7476,7 @@ static inline void update_blocked_averages(int cpu)
>>       update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>>  #ifdef CONFIG_NO_HZ_COMMON
>>       rq->last_blocked_load_update_tick = jiffies;
>> -     if (cfs_rq_is_decayed(cfs_rq))
>> +     if (cfs_rq_has_blocked(cfs_rq))
>>               rq->has_blocked_load = 0;
>>  #endif
>>       rq_unlock_irqrestore(rq, &rf);
>
> OK makes sense; would've been even better as a separate patch :-)

Yes  i will make a separate patch for that

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-24  8:25                           ` Vincent Guittot
                                               ` (2 preceding siblings ...)
  2018-02-01 16:57                             ` Peter Zijlstra
@ 2018-02-01 18:10                             ` Peter Zijlstra
  2018-02-01 19:11                               ` Vincent Guittot
  2018-02-06  8:32                               ` [PATCH 1/3] sched: Stop nohz stats when decayed Vincent Guittot
  3 siblings, 2 replies; 56+ messages in thread
From: Peter Zijlstra @ 2018-02-01 18:10 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Morten Rasmussen, Ingo Molnar, linux-kernel, Brendan Jackman,
	Dietmar Eggemann, Morten Rasmussen

On Wed, Jan 24, 2018 at 09:25:36AM +0100, Vincent Guittot wrote:
> @@ -8861,7 +8875,14 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>  			update_next_balance(sd, &next_balance);
>  		rcu_read_unlock();
>  
> -		if (time_after(jiffies, next) && atomic_read(&nohz.stats_state))
> +		/*
> +		 * Update blocked idle load if it has not been done for a
> +		 * while. Try to do it locally before entering idle but kick a
> +		 * ilb if it takes too much time and might delay next local
> +		 * wake up
> +		 */
> +		if (time_after(jiffies, next) && atomic_read(&nohz.stats_state) &&
> +				!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
>  			kick_ilb(NOHZ_STATS_KICK);
>  
>  		goto out;

This I really dislike. We're here because avg_idle is _really_ low, we
really should not then call _nohz_idle_balance().

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-29 19:31                             ` Valentin Schneider
  2018-01-30  8:32                               ` Vincent Guittot
@ 2018-02-01 18:16                               ` Peter Zijlstra
  1 sibling, 0 replies; 56+ messages in thread
From: Peter Zijlstra @ 2018-02-01 18:16 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Vincent Guittot, Morten Rasmussen, Ingo Molnar, linux-kernel,
	Brendan Jackman, Dietmar Eggemann, Morten Rasmussen

On Mon, Jan 29, 2018 at 07:31:07PM +0000, Valentin Schneider wrote:

> But for load update via _nohz_idle_balance(), we iterate through all of the
> nohz CPUS and unconditionally call update_blocked_averages(). This could be
> avoided by remembering which CPUs have stale load before going idle.
> Initially I thought that was what nohz.stats_state was for, but it isn't.
> With Vincent's patches it's only ever set to either 0 or 1, but we could use
> it as a CPU mask, and use it to skip nohz CPUs that don't have stale load in
> _nohz_idle_balance() (when NOHZ_STATS_KICK).

Yes, you'd need to allocate a second cpumask, worse you need atomic
bitops to set and clear bits there.

That all _might_ be worth it... dunno.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-02-01 18:10                             ` Peter Zijlstra
@ 2018-02-01 19:11                               ` Vincent Guittot
  2018-02-06  8:32                               ` [PATCH 1/3] sched: Stop nohz stats when decayed Vincent Guittot
  1 sibling, 0 replies; 56+ messages in thread
From: Vincent Guittot @ 2018-02-01 19:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Morten Rasmussen, Ingo Molnar, linux-kernel, Brendan Jackman,
	Dietmar Eggemann, Morten Rasmussen

On 1 February 2018 at 19:10, Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, Jan 24, 2018 at 09:25:36AM +0100, Vincent Guittot wrote:
>> @@ -8861,7 +8875,14 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>>                       update_next_balance(sd, &next_balance);
>>               rcu_read_unlock();
>>
>> -             if (time_after(jiffies, next) && atomic_read(&nohz.stats_state))
>> +             /*
>> +              * Update blocked idle load if it has not been done for a
>> +              * while. Try to do it locally before entering idle but kick a
>> +              * ilb if it takes too much time and might delay next local
>> +              * wake up
>> +              */
>> +             if (time_after(jiffies, next) && atomic_read(&nohz.stats_state) &&
>> +                             !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
>>                       kick_ilb(NOHZ_STATS_KICK);
>>
>>               goto out;
>
> This I really dislike. We're here because avg_idle is _really_ low, we
> really should not then call _nohz_idle_balance().

Yes. In fact I was targeting the case were (this_rq->avg_idle >=
sysctl_sched_migration_cost) and the system is not overloaded

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-01-30 11:41                                 ` Valentin Schneider
  2018-01-30 13:05                                   ` Vincent Guittot
@ 2018-02-05 22:18                                   ` Valentin Schneider
  2018-02-06  9:22                                     ` Vincent Guittot
  1 sibling, 1 reply; 56+ messages in thread
From: Valentin Schneider @ 2018-02-05 22:18 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Peter Zijlstra, Morten Rasmussen, Ingo Molnar, linux-kernel,
	Brendan Jackman, Dietmar Eggemann, Morten Rasmussen

On 01/30/2018 11:41 AM, Valentin Schneider wrote:
> [...]
>> I have studied a way to keep track of how many cpus still have blocked
>> load to try to minimize the number of useless ilb kick but this add
>> more atomic operations which can impact the system throughput with
>> heavy load and lot of very small wake up. that's why i have propose
>> this solution which is more simple. But it's probably just a matter of
>> where we want to "waste" time. Either we accept to spent a bit more
>> time to check the state of idle CPUs or we accept to kick ilb from
>> time to time for no good reason.
>>
> 
> Agreed. I have the feeling that spending more time doing atomic ops could be worth it - I'll try to test this out and see if it's actually relevant.
> 

I gave this a spin, still using Vincent's patches with the included patch on top. Nothing too clever, just seeing how replacing nohz.stats_state with a cpumask would go.

I've replaced nohz.stats_state by nohz.stale_cpus_mask. I kept changes minimal - there are some places where I think nohz.idle_cpus_mask could be substituted by nohz.stale_cpus_mask. Speaking about that, I was about to write a comment regarding the fact that nohz.stale_cpus_mask should be a subset of nohz.idle_cpus_mask, but I realized it's actually not true:

In the current implementation (cpumask or stats_state flag), an idle CPU is defined as having blocked load as soon as it goes through nohz_balance_enter_idle(), and that flag is cleared when we go through _nohz_idle_balance() (and newly idle load balance in my cpumask implementation).
However we can imagine a scenario where a CPU goes idle, is flagged as having blocked load, then it wakes up and goes through its periodic balance code and updates that load. Yet, the flag (or cpumask) won't have been updated.
So I think there could be a discussion on whether the flag should be cleared on nohz_balance_exit_idle() if we assume periodic balance now takes care of this. It could cause issues if we have a weird scenario where a CPU keeps going online/idle but never stays online long enough for a tick though.
Alternatively we could clear that flag when going through the first periodic balance after idling, but then that's a bit weird because we're using a nohz structure in a non-nohz context.


Anyway, I tried to get some profiling done with the cpumask but there's something wrong with my setup, I would only get nonsense numbers (for both baseline & my patch), so I added start/end trace_printks to _nohz_idle_balance(). It's ugly, inaccurate and unorthodox but it still gives a rough idea of how the cpumask impacts stuff.
I ran 20 iterations of my "nohz update" test case (a task accumulates load, goes to sleep, and another always-running task keeps kicking an ILB to decay that blocked load) and the time saved by skipping CPUs is in the ballpark of 20%. Notebook is at [1].

I'll try to get a proper function profiling working for when Vincent posts his "v2".

[1]: https://gist.github.com/valschneider/6f203143bee1e149f24c44e9582a9eff

---
 kernel/sched/fair.c | 72 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 38 insertions(+), 34 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3ada92b..8bcf465 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5404,8 +5404,8 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
 
 static struct {
 	cpumask_var_t idle_cpus_mask;
+	cpumask_var_t stale_cpus_mask;
 	atomic_t nr_cpus;
-	atomic_t stats_state;
 	unsigned long next_balance;     /* in jiffy units */
 	unsigned long next_stats;
 } nohz ____cacheline_aligned;
@@ -6968,7 +6968,6 @@ enum fbq_type { regular, remote, all };
 #define LBF_DST_PINNED  0x04
 #define LBF_SOME_PINNED	0x08
 #define LBF_NOHZ_STATS	0x10
-#define LBF_NOHZ_AGAIN	0x20
 
 struct lb_env {
 	struct sched_domain	*sd;
@@ -7829,25 +7828,25 @@ group_type group_classify(struct sched_group *group,
 	return group_other;
 }
 
-static bool update_nohz_stats(struct rq *rq)
+static void update_nohz_stats(struct rq *rq)
 {
 #ifdef CONFIG_NO_HZ_COMMON
 	unsigned int cpu = rq->cpu;
 
-	if (!rq->has_blocked_load)
-		return false;
-
-	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
-		return false;
+	if (!cpumask_test_cpu(cpu, nohz.stale_cpus_mask))
+		return;
 
 	if (!time_after(jiffies, rq->last_blocked_load_update_tick))
-		return true;
+		return;
 
 	update_blocked_averages(cpu);
 
-	return rq->has_blocked_load;
+	if (rq->has_blocked_load)
+		return;
+
+	cpumask_clear_cpu(cpu, nohz.stale_cpus_mask);
 #else
-	return false;
+	return;
 #endif
 }
 
@@ -7873,8 +7872,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
-		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
-			env->flags |= LBF_NOHZ_AGAIN;
+		if (env->flags & LBF_NOHZ_STATS)
+			update_nohz_stats(rq);
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
@@ -8032,7 +8031,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		prefer_sibling = 1;
 
 #ifdef CONFIG_NO_HZ_COMMON
-	if (env->idle == CPU_NEWLY_IDLE && atomic_read(&nohz.stats_state)) {
+	if (env->idle == CPU_NEWLY_IDLE &&
+	    cpumask_intersects(sched_domain_span(env->sd), nohz.stale_cpus_mask)) {
 		env->flags |= LBF_NOHZ_STATS;
 	}
 #endif
@@ -8091,8 +8091,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	} while (sg != env->sd->groups);
 
 #ifdef CONFIG_NO_HZ_COMMON
-	if ((env->flags & LBF_NOHZ_AGAIN) &&
-	    cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
+	/*
+	 * All nohz CPUs with blocked load were visited but some haven't fully
+	 * decayed. Visit them again later.
+	 */
+	if ((env->flags & LBF_NOHZ_STATS) &&
+	    cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)) &&
+	    !cpumask_empty(nohz.stale_cpus_mask)) {
 
 		WRITE_ONCE(nohz.next_stats,
 				jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
@@ -8897,7 +8902,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 		 * ilb if it takes too much time and might delay next local
 		 * wake up
 		 */
-		if (time_after(jiffies, next) && atomic_read(&nohz.stats_state) &&
+		if (time_after(jiffies, next) && !cpumask_empty(nohz.stale_cpus_mask) &&
 				!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
 			kick_ilb(NOHZ_STATS_KICK);
 
@@ -9153,7 +9158,7 @@ static void nohz_balancer_kick(struct rq *rq)
 	if (likely(!atomic_read(&nohz.nr_cpus)))
 		return;
 
-	if (time_after(now, nohz.next_stats) && atomic_read(&nohz.stats_state))
+	if (time_after(now, nohz.next_stats) && !cpumask_empty(nohz.stale_cpus_mask))
 		flags = NOHZ_STATS_KICK;
 
 	if (time_before(now, nohz.next_balance))
@@ -9292,10 +9297,10 @@ void nohz_balance_enter_idle(int cpu)
 	set_cpu_sd_state_idle(cpu);
 
 	/*
-	 * Each time a cpu enter idle, we assume that it has blocked load and
-	 * enable the periodic update of the load of idle cpus
+	 * Each time a cpu enters idle, we assume that it has blocked load and
+	 * thus enable the periodic update of its load
 	 */
-	atomic_set(&nohz.stats_state, 1);
+	cpumask_set_cpu(cpu, nohz.stale_cpus_mask);
 }
 #else
 static inline void nohz_balancer_kick(struct rq *rq) { }
@@ -9432,7 +9437,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_
 	/* Earliest time when we have to do rebalance again */
 	unsigned long now = jiffies;
 	unsigned long next_balance = now + 60*HZ;
-	bool has_blocked_load = false;
 	int update_next_balance = 0;
 	int this_cpu = this_rq->cpu;
 	int balance_cpu;
@@ -9450,7 +9454,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_
 	 * setting the stats state, we are sure to not clear the state and not
 	 * check the load of an idle cpu.
 	 */
-	atomic_set(&nohz.stats_state, 0);
 
 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
 		u64 t0, domain_cost;
@@ -9460,30 +9463,31 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_
 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
 			continue;
 
+		if ((flags & NOHZ_KICK_MASK) == NOHZ_STATS_KICK &&
+		    !cpumask_test_cpu(balance_cpu, nohz.stale_cpus_mask))
+			continue;
+
 		/*
 		 * If this cpu gets work to do, stop the load balancing
 		 * work being done for other cpus. Next load
 		 * balancing owner will pick it up.
 		 */
-		if (need_resched()) {
-			has_blocked_load = true;
+		if (need_resched())
 			goto abort;
-		}
 
 		/*
 		 * If the update is done while CPU becomes idle, we abort
 		 * the update when its cost is higher than the average idle
 		 * time in orde to not delay a possible wake up.
 		 */
-		if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
-			has_blocked_load = true;
+		if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost)
 			goto abort;
-		}
 
 		rq = cpu_rq(balance_cpu);
 
 		update_blocked_averages(rq->cpu);
-		has_blocked_load |= rq->has_blocked_load;
+		if (!rq->has_blocked_load)
+			cpumask_clear_cpu(balance_cpu, nohz.stale_cpus_mask);
 
 		/*
 		 * If time for next balance is due,
@@ -9514,7 +9518,9 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_
 	/* Newly idle CPU doesn't need an update */
 	if (idle != CPU_NEWLY_IDLE) {
 		update_blocked_averages(this_cpu);
-		has_blocked_load |= this_rq->has_blocked_load;
+		if (!this_rq->has_blocked_load)
+			cpumask_clear_cpu(this_cpu, nohz.stale_cpus_mask);
+
 	}
 
 	if (flags & NOHZ_BALANCE_KICK)
@@ -9527,9 +9533,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_
 	ret = true;
 
 abort:
-	/* There is still blocked load, enable periodic update */
-	if (has_blocked_load)
-		atomic_set(&nohz.stats_state, 1);
 
 	/*
 	 * next_balance will be updated only when there is a need.
@@ -10190,6 +10193,7 @@ __init void init_sched_fair_class(void)
 #ifdef CONFIG_NO_HZ_COMMON
 	nohz.next_balance = jiffies;
 	nohz.next_stats = jiffies;
+	zalloc_cpumask_var(&nohz.stale_cpus_mask, GFP_NOWAIT);
 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
 #endif
 #endif /* SMP */
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH 1/3] sched: Stop nohz stats when decayed
  2018-02-01 18:10                             ` Peter Zijlstra
  2018-02-01 19:11                               ` Vincent Guittot
@ 2018-02-06  8:32                               ` Vincent Guittot
  2018-02-06  8:32                                 ` [PATCH 2/3] sched: reduce the periodic update duration Vincent Guittot
                                                   ` (3 more replies)
  1 sibling, 4 replies; 56+ messages in thread
From: Vincent Guittot @ 2018-02-06  8:32 UTC (permalink / raw)
  To: peterz, mingo, linux-kernel, valentin.schneider
  Cc: morten.rasmussen, brendan.jackman, dietmar.eggemann, Vincent Guittot

Stopped the periodic update of blocked load when all idle CPUs have fully
decayed. We introduce a new nohz.has_blocked that reflect if some idle
CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
is set everytime that a Idle CPU can have blocked load and it is then clear
when no more blocked load has been detected during an update. We don't need
atomic operation but only to make cure of the right ordering when updating
nohz.idle_cpus_mask and nohz.has_blocked.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/fair.c  | 94 +++++++++++++++++++++++++++++++++++++++++-----------
 kernel/sched/sched.h |  1 +
 2 files changed, 75 insertions(+), 20 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7af1fa9..279f4b2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5383,8 +5383,9 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
 static struct {
 	cpumask_var_t idle_cpus_mask;
 	atomic_t nr_cpus;
+	int has_blocked;		/* Idle CPUS has blocked load */
 	unsigned long next_balance;     /* in jiffy units */
-	unsigned long next_stats;
+	unsigned long next_blocked;	/* Next update of blocked load in jiffies */
 } nohz ____cacheline_aligned;
 
 #endif /* CONFIG_NO_HZ_COMMON */
@@ -6951,6 +6952,7 @@ enum fbq_type { regular, remote, all };
 #define LBF_DST_PINNED  0x04
 #define LBF_SOME_PINNED	0x08
 #define LBF_NOHZ_STATS	0x10
+#define LBF_NOHZ_AGAIN	0x20
 
 struct lb_env {
 	struct sched_domain	*sd;
@@ -7335,8 +7337,6 @@ static void attach_tasks(struct lb_env *env)
 	rq_unlock(env->dst_rq, &rf);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 {
 	if (cfs_rq->load.weight)
@@ -7354,11 +7354,14 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 	return true;
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct cfs_rq *cfs_rq, *pos;
 	struct rq_flags rf;
+	bool done = true;
 
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
@@ -7388,10 +7391,14 @@ static void update_blocked_averages(int cpu)
 		 */
 		if (cfs_rq_is_decayed(cfs_rq))
 			list_del_leaf_cfs_rq(cfs_rq);
+		else
+			done = false;
 	}
 
 #ifdef CONFIG_NO_HZ_COMMON
 	rq->last_blocked_load_update_tick = jiffies;
+	if (done)
+		rq->has_blocked_load = 0;
 #endif
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -7454,6 +7461,8 @@ static inline void update_blocked_averages(int cpu)
 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
 #ifdef CONFIG_NO_HZ_COMMON
 	rq->last_blocked_load_update_tick = jiffies;
+	if (cfs_rq_is_decayed(cfs_rq))
+		rq->has_blocked_load = 0;
 #endif
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -7789,18 +7798,25 @@ group_type group_classify(struct sched_group *group,
 	return group_other;
 }
 
-static void update_nohz_stats(struct rq *rq)
+static bool update_nohz_stats(struct rq *rq)
 {
 #ifdef CONFIG_NO_HZ_COMMON
 	unsigned int cpu = rq->cpu;
 
+	if (!rq->has_blocked_load)
+		return false;
+
 	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
-		return;
+		return false;
 
 	if (!time_after(jiffies, rq->last_blocked_load_update_tick))
-		return;
+		return true;
 
 	update_blocked_averages(cpu);
+
+	return rq->has_blocked_load;
+#else
+	return false;
 #endif
 }
 
@@ -7826,8 +7842,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
-		if (env->flags & LBF_NOHZ_STATS)
-			update_nohz_stats(rq);
+		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
+			env->flags |= LBF_NOHZ_AGAIN;
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
@@ -7979,18 +7995,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	struct sg_lb_stats *local = &sds->local_stat;
 	struct sg_lb_stats tmp_sgs;
 	int load_idx, prefer_sibling = 0;
+	int has_blocked = READ_ONCE(nohz.has_blocked);
 	bool overload = false;
 
 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;
 
 #ifdef CONFIG_NO_HZ_COMMON
-	if (env->idle == CPU_NEWLY_IDLE) {
+	if (env->idle == CPU_NEWLY_IDLE && has_blocked)
 		env->flags |= LBF_NOHZ_STATS;
-
-		if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
-			nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
-	}
 #endif
 
 	load_idx = get_sd_load_idx(env->sd, env->idle);
@@ -8046,6 +8059,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		sg = sg->next;
 	} while (sg != env->sd->groups);
 
+#ifdef CONFIG_NO_HZ_COMMON
+	if ((env->flags & LBF_NOHZ_AGAIN) &&
+	    cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
+
+		WRITE_ONCE(nohz.next_blocked,
+				jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
+	}
+#endif
+
 	if (env->sd->flags & SD_NUMA)
 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 
@@ -9069,6 +9091,8 @@ static void nohz_balancer_kick(struct rq *rq)
 	struct sched_domain *sd;
 	int nr_busy, i, cpu = rq->cpu;
 	unsigned int flags = 0;
+	unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
+	unsigned long next = READ_ONCE(nohz.next_blocked);
 
 	if (unlikely(rq->idle_balance))
 		return;
@@ -9086,7 +9110,7 @@ static void nohz_balancer_kick(struct rq *rq)
 	if (likely(!atomic_read(&nohz.nr_cpus)))
 		return;
 
-	if (time_after(now, nohz.next_stats))
+	if (time_after(now, next) && has_blocked)
 		flags = NOHZ_STATS_KICK;
 
 	if (time_before(now, nohz.next_balance))
@@ -9207,13 +9231,15 @@ void nohz_balance_enter_idle(int cpu)
 	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
 		return;
 
+	rq->has_blocked_load = 1;
+
 	if (rq->nohz_tick_stopped)
-		return;
+		goto out;
 
 	/*
 	 * If we're a completely isolated CPU, we don't play.
 	 */
-	if (on_null_domain(cpu_rq(cpu)))
+	if (on_null_domain(rq))
 		return;
 
 	rq->nohz_tick_stopped = 1;
@@ -9222,6 +9248,13 @@ void nohz_balance_enter_idle(int cpu)
 	atomic_inc(&nohz.nr_cpus);
 
 	set_cpu_sd_state_idle(cpu);
+
+out:
+	/*
+	 * Each time a cpu enter idle, we assume that it has blocked load and
+	 * enable the periodic update of the load of idle cpus
+	 */
+	WRITE_ONCE(nohz.has_blocked, 1);
 }
 #else
 static inline void nohz_balancer_kick(struct rq *rq) { }
@@ -9374,6 +9407,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 
 	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
 
+	/*
+	 * We assume there will be no idle load after this update and clear
+	 * the stats state. If a cpu enters idle in the mean time, it will
+	 * set the stats state and trig another update of idle load.
+	 * Because a cpu that becomes idle, is added to idle_cpus_mask before
+	 * setting the stats state, we are sure to not clear the state and not
+	 * check the load of an idle cpu.
+	 */
+	WRITE_ONCE(nohz.has_blocked, 0);
+
 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
 			continue;
@@ -9383,11 +9426,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		 * work being done for other cpus. Next load
 		 * balancing owner will pick it up.
 		 */
-		if (need_resched())
-			break;
+		if (need_resched()) {
+			has_blocked_load = true;
+			goto abort;
+		}
 
 		rq = cpu_rq(balance_cpu);
 
+		update_blocked_averages(rq->cpu);
+		has_blocked_load |= rq->has_blocked_load;
+
 		/*
 		 * If time for next balance is due,
 		 * do the balance.
@@ -9400,7 +9448,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 			cpu_load_update_idle(rq);
 			rq_unlock_irq(rq, &rf);
 
-			update_blocked_averages(rq->cpu);
 			if (flags & NOHZ_BALANCE_KICK)
 				rebalance_domains(rq, CPU_IDLE);
 		}
@@ -9415,7 +9462,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	if (flags & NOHZ_BALANCE_KICK)
 		rebalance_domains(this_rq, CPU_IDLE);
 
-	nohz.next_stats = next_stats;
+	WRITE_ONCE(nohz.next_blocked,
+		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+
+abort:
+	/* There is still blocked load, enable periodic update */
+	if (has_blocked_load)
+		WRITE_ONCE(nohz.has_blocked, 1);
 
 	/*
 	 * next_balance will be updated only when there is a need.
@@ -10046,6 +10099,7 @@ __init void init_sched_fair_class(void)
 
 #ifdef CONFIG_NO_HZ_COMMON
 	nohz.next_balance = jiffies;
+	nohz.next_blocked = jiffies;
 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
 #endif
 #endif /* SMP */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e200045..ad9b929 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -723,6 +723,7 @@ struct rq {
 #ifdef CONFIG_SMP
 	unsigned long last_load_update_tick;
 	unsigned long last_blocked_load_update_tick;
+	unsigned int has_blocked_load;
 #endif /* CONFIG_SMP */
 	unsigned int nohz_tick_stopped;
 	atomic_t nohz_flags;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH 2/3] sched: reduce the periodic update duration
  2018-02-06  8:32                               ` [PATCH 1/3] sched: Stop nohz stats when decayed Vincent Guittot
@ 2018-02-06  8:32                                 ` Vincent Guittot
  2018-02-06  8:32                                 ` [PATCH 3/3] sched: update blocked load when newly idle Vincent Guittot
                                                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 56+ messages in thread
From: Vincent Guittot @ 2018-02-06  8:32 UTC (permalink / raw)
  To: peterz, mingo, linux-kernel, valentin.schneider
  Cc: morten.rasmussen, brendan.jackman, dietmar.eggemann, Vincent Guittot

Instead of using the cfs_rq_is_decayed() which monitors all *_avg
and *_sum, we create a cfs_rq_has_blocked() which only takes care of
util_avg and load_avg. We are only interested by these 2 values which are
decaying faster than the *_sum so we can stop the periodic update earlier.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/fair.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 279f4b2..6998528 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7337,6 +7337,19 @@ static void attach_tasks(struct lb_env *env)
 	rq_unlock(env->dst_rq, &rf);
 }
 
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->avg.load_avg)
+		return true;
+
+	if (cfs_rq->avg.util_avg)
+		return true;
+
+	return false;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 {
 	if (cfs_rq->load.weight)
@@ -7354,8 +7367,6 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 	return true;
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -7391,7 +7402,9 @@ static void update_blocked_averages(int cpu)
 		 */
 		if (cfs_rq_is_decayed(cfs_rq))
 			list_del_leaf_cfs_rq(cfs_rq);
-		else
+
+		/* Don't need periodic decay once load/util_avg are null */
+		if (cfs_rq_has_blocked(cfs_rq))
 			done = false;
 	}
 
@@ -7461,7 +7474,7 @@ static inline void update_blocked_averages(int cpu)
 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
 #ifdef CONFIG_NO_HZ_COMMON
 	rq->last_blocked_load_update_tick = jiffies;
-	if (cfs_rq_is_decayed(cfs_rq))
+	if (!cfs_rq_has_blocked(cfs_rq))
 		rq->has_blocked_load = 0;
 #endif
 	rq_unlock_irqrestore(rq, &rf);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH 3/3] sched: update blocked load when newly idle
  2018-02-06  8:32                               ` [PATCH 1/3] sched: Stop nohz stats when decayed Vincent Guittot
  2018-02-06  8:32                                 ` [PATCH 2/3] sched: reduce the periodic update duration Vincent Guittot
@ 2018-02-06  8:32                                 ` Vincent Guittot
  2018-02-06 14:32                                   ` Valentin Schneider
  2018-02-06  8:55                                 ` [PATCH 1/3] sched: Stop nohz stats when decayed Vincent Guittot
  2018-02-06 14:16                                 ` Valentin Schneider
  3 siblings, 1 reply; 56+ messages in thread
From: Vincent Guittot @ 2018-02-06  8:32 UTC (permalink / raw)
  To: peterz, mingo, linux-kernel, valentin.schneider
  Cc: morten.rasmussen, brendan.jackman, dietmar.eggemann, Vincent Guittot

When NEWLY_IDLE load balance is not triggered, we might need to update the
blocked load anyway. We can kick an ilb so an idle CPU will take care of
updating blocked load or we can try to update them locally before entering
idle. In the latter case, we reuse part of the nohz_idle_balance.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/fair.c | 102 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 84 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6998528..256defe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8829,6 +8829,9 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
 		*next_balance = next;
 }
 
+static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
+static void kick_ilb(unsigned int flags);
+
 /*
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
@@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 
 	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
 	    !this_rq->rd->overload) {
+		unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
+		unsigned long next = READ_ONCE(nohz.next_blocked);
+
 		rcu_read_lock();
 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
 		if (sd)
 			update_next_balance(sd, &next_balance);
 		rcu_read_unlock();
 
+		/*
+		 * Update blocked idle load if it has not been done for a
+		 * while. Try to do it locally before entering idle but kick a
+		 * ilb if it takes too much time and/or might delay next local
+		 * wake up
+		 */
+		if (has_blocked && time_after_eq(jiffies, next) &&
+				(this_rq->avg_idle < sysctl_sched_migration_cost ||
+				!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
+			kick_ilb(NOHZ_STATS_KICK);
+
 		goto out;
 	}
 
@@ -9393,30 +9410,24 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 
 #ifdef CONFIG_NO_HZ_COMMON
 /*
- * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * Internal function that runs load balance for all idle cpus. The load balance
+ * can be a simple update of blocked load or a complete load balance with
+ * tasks movement depending of flags.
+ * For newly idle mode, we abort the loop if it takes too much time and return
+ * false to notify that the loop has not be completed and a ilb should be kick.
  */
-static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
 {
 	/* Earliest time when we have to do rebalance again */
 	unsigned long now = jiffies;
 	unsigned long next_balance = now + 60*HZ;
-	unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
+	bool has_blocked_load = false;
 	int update_next_balance = 0;
 	int this_cpu = this_rq->cpu;
-	unsigned int flags;
 	int balance_cpu;
+	int ret = false;
 	struct rq *rq;
-
-	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
-		return false;
-
-	if (idle != CPU_IDLE) {
-		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
-		return false;
-	}
-
-	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+	u64 curr_cost = 0;
 
 	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
 
@@ -9431,6 +9442,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	WRITE_ONCE(nohz.has_blocked, 0);
 
 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+		u64 t0, domain_cost;
+
+		t0 = sched_clock_cpu(this_cpu);
+
 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
 			continue;
 
@@ -9444,6 +9459,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 			goto abort;
 		}
 
+		/*
+		 * If the update is done while CPU becomes idle, we abort
+		 * the update when its cost is higher than the average idle
+		 * time in orde to not delay a possible wake up.
+		 */
+		if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
+			has_blocked_load = true;
+			goto abort;
+		}
+
 		rq = cpu_rq(balance_cpu);
 
 		update_blocked_averages(rq->cpu);
@@ -9456,10 +9481,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		if (time_after_eq(jiffies, rq->next_balance)) {
 			struct rq_flags rf;
 
-			rq_lock_irq(rq, &rf);
+			rq_lock_irqsave(rq, &rf);
 			update_rq_clock(rq);
 			cpu_load_update_idle(rq);
-			rq_unlock_irq(rq, &rf);
+			rq_unlock_irqrestore(rq, &rf);
 
 			if (flags & NOHZ_BALANCE_KICK)
 				rebalance_domains(rq, CPU_IDLE);
@@ -9469,15 +9494,27 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 			next_balance = rq->next_balance;
 			update_next_balance = 1;
 		}
+
+		domain_cost = sched_clock_cpu(this_cpu) - t0;
+		curr_cost += domain_cost;
+
+	}
+
+	/* Newly idle CPU doesn't need an update */
+	if (idle != CPU_NEWLY_IDLE) {
+		update_blocked_averages(this_cpu);
+		has_blocked_load |= this_rq->has_blocked_load;
 	}
 
-	update_blocked_averages(this_cpu);
 	if (flags & NOHZ_BALANCE_KICK)
 		rebalance_domains(this_rq, CPU_IDLE);
 
 	WRITE_ONCE(nohz.next_blocked,
 		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 
+	/* The full idle balance loop has been done */
+	ret = true;
+
 abort:
 	/* There is still blocked load, enable periodic update */
 	if (has_blocked_load)
@@ -9491,6 +9528,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	if (likely(update_next_balance))
 		nohz.next_balance = next_balance;
 
+	return ret;
+}
+
+/*
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+	int this_cpu = this_rq->cpu;
+	unsigned int flags;
+
+	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+		return false;
+
+	if (idle != CPU_IDLE) {
+		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+		return false;
+	}
+
+	/*
+	 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
+	 */
+	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+	if (!(flags & NOHZ_KICK_MASK))
+		return false;
+
+	_nohz_idle_balance(this_rq, flags, idle);
+
 	return true;
 }
 #else
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 56+ messages in thread

* Re: [PATCH 1/3] sched: Stop nohz stats when decayed
  2018-02-06  8:32                               ` [PATCH 1/3] sched: Stop nohz stats when decayed Vincent Guittot
  2018-02-06  8:32                                 ` [PATCH 2/3] sched: reduce the periodic update duration Vincent Guittot
  2018-02-06  8:32                                 ` [PATCH 3/3] sched: update blocked load when newly idle Vincent Guittot
@ 2018-02-06  8:55                                 ` Vincent Guittot
  2018-02-06 14:16                                 ` Valentin Schneider
  3 siblings, 0 replies; 56+ messages in thread
From: Vincent Guittot @ 2018-02-06  8:55 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, linux-kernel, Valentin Schneider
  Cc: Morten Rasmussen, Brendan Jackman, Dietmar Eggemann, Vincent Guittot

Hi Peter,

On 6 February 2018 at 09:32, Vincent Guittot <vincent.guittot@linaro.org> wrote:
> Stopped the periodic update of blocked load when all idle CPUs have fully
> decayed. We introduce a new nohz.has_blocked that reflect if some idle
> CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
> is set everytime that a Idle CPU can have blocked load and it is then clear
> when no more blocked load has been detected during an update. We don't need
> atomic operation but only to make cure of the right ordering when updating
> nohz.idle_cpus_mask and nohz.has_blocked.
>
> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---

This patchset applies on your testing branch after removing the 2 last commits:
56eb4679 ("sched: Clean up nohz enter/exit")

>  kernel/sched/fair.c  | 94 +++++++++++++++++++++++++++++++++++++++++-----------
>  kernel/sched/sched.h |  1 +
>  2 files changed, 75 insertions(+), 20 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7af1fa9..279f4b2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5383,8 +5383,9 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
>  static struct {
>         cpumask_var_t idle_cpus_mask;
>         atomic_t nr_cpus;
> +       int has_blocked;                /* Idle CPUS has blocked load */
>         unsigned long next_balance;     /* in jiffy units */
> -       unsigned long next_stats;
> +       unsigned long next_blocked;     /* Next update of blocked load in jiffies */
>  } nohz ____cacheline_aligned;
>
>  #endif /* CONFIG_NO_HZ_COMMON */
> @@ -6951,6 +6952,7 @@ enum fbq_type { regular, remote, all };
>  #define LBF_DST_PINNED  0x04
>  #define LBF_SOME_PINNED        0x08
>  #define LBF_NOHZ_STATS 0x10
> +#define LBF_NOHZ_AGAIN 0x20
>
>  struct lb_env {
>         struct sched_domain     *sd;
> @@ -7335,8 +7337,6 @@ static void attach_tasks(struct lb_env *env)
>         rq_unlock(env->dst_rq, &rf);
>  }
>
> -#ifdef CONFIG_FAIR_GROUP_SCHED
> -
>  static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>  {
>         if (cfs_rq->load.weight)
> @@ -7354,11 +7354,14 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>         return true;
>  }
>
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +
>  static void update_blocked_averages(int cpu)
>  {
>         struct rq *rq = cpu_rq(cpu);
>         struct cfs_rq *cfs_rq, *pos;
>         struct rq_flags rf;
> +       bool done = true;
>
>         rq_lock_irqsave(rq, &rf);
>         update_rq_clock(rq);
> @@ -7388,10 +7391,14 @@ static void update_blocked_averages(int cpu)
>                  */
>                 if (cfs_rq_is_decayed(cfs_rq))
>                         list_del_leaf_cfs_rq(cfs_rq);
> +               else
> +                       done = false;
>         }
>
>  #ifdef CONFIG_NO_HZ_COMMON
>         rq->last_blocked_load_update_tick = jiffies;
> +       if (done)
> +               rq->has_blocked_load = 0;
>  #endif
>         rq_unlock_irqrestore(rq, &rf);
>  }
> @@ -7454,6 +7461,8 @@ static inline void update_blocked_averages(int cpu)
>         update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>  #ifdef CONFIG_NO_HZ_COMMON
>         rq->last_blocked_load_update_tick = jiffies;
> +       if (cfs_rq_is_decayed(cfs_rq))
> +               rq->has_blocked_load = 0;
>  #endif
>         rq_unlock_irqrestore(rq, &rf);
>  }
> @@ -7789,18 +7798,25 @@ group_type group_classify(struct sched_group *group,
>         return group_other;
>  }
>
> -static void update_nohz_stats(struct rq *rq)
> +static bool update_nohz_stats(struct rq *rq)
>  {
>  #ifdef CONFIG_NO_HZ_COMMON
>         unsigned int cpu = rq->cpu;
>
> +       if (!rq->has_blocked_load)
> +               return false;
> +
>         if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
> -               return;
> +               return false;
>
>         if (!time_after(jiffies, rq->last_blocked_load_update_tick))
> -               return;
> +               return true;
>
>         update_blocked_averages(cpu);
> +
> +       return rq->has_blocked_load;
> +#else
> +       return false;
>  #endif
>  }
>
> @@ -7826,8 +7842,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>         for_each_cpu_and(i, sched_group_span(group), env->cpus) {
>                 struct rq *rq = cpu_rq(i);
>
> -               if (env->flags & LBF_NOHZ_STATS)
> -                       update_nohz_stats(rq);
> +               if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
> +                       env->flags |= LBF_NOHZ_AGAIN;
>
>                 /* Bias balancing toward cpus of our domain */
>                 if (local_group)
> @@ -7979,18 +7995,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>         struct sg_lb_stats *local = &sds->local_stat;
>         struct sg_lb_stats tmp_sgs;
>         int load_idx, prefer_sibling = 0;
> +       int has_blocked = READ_ONCE(nohz.has_blocked);
>         bool overload = false;
>
>         if (child && child->flags & SD_PREFER_SIBLING)
>                 prefer_sibling = 1;
>
>  #ifdef CONFIG_NO_HZ_COMMON
> -       if (env->idle == CPU_NEWLY_IDLE) {
> +       if (env->idle == CPU_NEWLY_IDLE && has_blocked)
>                 env->flags |= LBF_NOHZ_STATS;
> -
> -               if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
> -                       nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
> -       }
>  #endif
>
>         load_idx = get_sd_load_idx(env->sd, env->idle);
> @@ -8046,6 +8059,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>                 sg = sg->next;
>         } while (sg != env->sd->groups);
>
> +#ifdef CONFIG_NO_HZ_COMMON
> +       if ((env->flags & LBF_NOHZ_AGAIN) &&
> +           cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
> +
> +               WRITE_ONCE(nohz.next_blocked,
> +                               jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
> +       }
> +#endif
> +
>         if (env->sd->flags & SD_NUMA)
>                 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
>
> @@ -9069,6 +9091,8 @@ static void nohz_balancer_kick(struct rq *rq)
>         struct sched_domain *sd;
>         int nr_busy, i, cpu = rq->cpu;
>         unsigned int flags = 0;
> +       unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
> +       unsigned long next = READ_ONCE(nohz.next_blocked);
>
>         if (unlikely(rq->idle_balance))
>                 return;
> @@ -9086,7 +9110,7 @@ static void nohz_balancer_kick(struct rq *rq)
>         if (likely(!atomic_read(&nohz.nr_cpus)))
>                 return;
>
> -       if (time_after(now, nohz.next_stats))
> +       if (time_after(now, next) && has_blocked)
>                 flags = NOHZ_STATS_KICK;
>
>         if (time_before(now, nohz.next_balance))
> @@ -9207,13 +9231,15 @@ void nohz_balance_enter_idle(int cpu)
>         if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
>                 return;
>
> +       rq->has_blocked_load = 1;
> +
>         if (rq->nohz_tick_stopped)
> -               return;
> +               goto out;
>
>         /*
>          * If we're a completely isolated CPU, we don't play.
>          */
> -       if (on_null_domain(cpu_rq(cpu)))
> +       if (on_null_domain(rq))
>                 return;
>
>         rq->nohz_tick_stopped = 1;
> @@ -9222,6 +9248,13 @@ void nohz_balance_enter_idle(int cpu)
>         atomic_inc(&nohz.nr_cpus);
>
>         set_cpu_sd_state_idle(cpu);
> +
> +out:
> +       /*
> +        * Each time a cpu enter idle, we assume that it has blocked load and
> +        * enable the periodic update of the load of idle cpus
> +        */
> +       WRITE_ONCE(nohz.has_blocked, 1);
>  }
>  #else
>  static inline void nohz_balancer_kick(struct rq *rq) { }
> @@ -9374,6 +9407,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>
>         SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>
> +       /*
> +        * We assume there will be no idle load after this update and clear
> +        * the stats state. If a cpu enters idle in the mean time, it will
> +        * set the stats state and trig another update of idle load.
> +        * Because a cpu that becomes idle, is added to idle_cpus_mask before
> +        * setting the stats state, we are sure to not clear the state and not
> +        * check the load of an idle cpu.
> +        */
> +       WRITE_ONCE(nohz.has_blocked, 0);
> +
>         for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>                 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>                         continue;
> @@ -9383,11 +9426,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>                  * work being done for other cpus. Next load
>                  * balancing owner will pick it up.
>                  */
> -               if (need_resched())
> -                       break;
> +               if (need_resched()) {
> +                       has_blocked_load = true;
> +                       goto abort;
> +               }
>
>                 rq = cpu_rq(balance_cpu);
>
> +               update_blocked_averages(rq->cpu);
> +               has_blocked_load |= rq->has_blocked_load;
> +
>                 /*
>                  * If time for next balance is due,
>                  * do the balance.
> @@ -9400,7 +9448,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>                         cpu_load_update_idle(rq);
>                         rq_unlock_irq(rq, &rf);
>
> -                       update_blocked_averages(rq->cpu);
>                         if (flags & NOHZ_BALANCE_KICK)
>                                 rebalance_domains(rq, CPU_IDLE);
>                 }
> @@ -9415,7 +9462,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>         if (flags & NOHZ_BALANCE_KICK)
>                 rebalance_domains(this_rq, CPU_IDLE);
>
> -       nohz.next_stats = next_stats;
> +       WRITE_ONCE(nohz.next_blocked,
> +               now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> +
> +abort:
> +       /* There is still blocked load, enable periodic update */
> +       if (has_blocked_load)
> +               WRITE_ONCE(nohz.has_blocked, 1);
>
>         /*
>          * next_balance will be updated only when there is a need.
> @@ -10046,6 +10099,7 @@ __init void init_sched_fair_class(void)
>
>  #ifdef CONFIG_NO_HZ_COMMON
>         nohz.next_balance = jiffies;
> +       nohz.next_blocked = jiffies;
>         zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
>  #endif
>  #endif /* SMP */
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e200045..ad9b929 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -723,6 +723,7 @@ struct rq {
>  #ifdef CONFIG_SMP
>         unsigned long last_load_update_tick;
>         unsigned long last_blocked_load_update_tick;
> +       unsigned int has_blocked_load;
>  #endif /* CONFIG_SMP */
>         unsigned int nohz_tick_stopped;
>         atomic_t nohz_flags;
> --
> 2.7.4
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK
  2018-02-05 22:18                                   ` Valentin Schneider
@ 2018-02-06  9:22                                     ` Vincent Guittot
  0 siblings, 0 replies; 56+ messages in thread
From: Vincent Guittot @ 2018-02-06  9:22 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Peter Zijlstra, Morten Rasmussen, Ingo Molnar, linux-kernel,
	Brendan Jackman, Dietmar Eggemann, Morten Rasmussen

On 5 February 2018 at 23:18, Valentin Schneider
<valentin.schneider@arm.com> wrote:
> On 01/30/2018 11:41 AM, Valentin Schneider wrote:
>> [...]
>>> I have studied a way to keep track of how many cpus still have blocked
>>> load to try to minimize the number of useless ilb kick but this add
>>> more atomic operations which can impact the system throughput with
>>> heavy load and lot of very small wake up. that's why i have propose
>>> this solution which is more simple. But it's probably just a matter of
>>> where we want to "waste" time. Either we accept to spent a bit more
>>> time to check the state of idle CPUs or we accept to kick ilb from
>>> time to time for no good reason.
>>>
>>
>> Agreed. I have the feeling that spending more time doing atomic ops could be worth it - I'll try to test this out and see if it's actually relevant.
>>
>
> I gave this a spin, still using Vincent's patches with the included patch on top. Nothing too clever, just seeing how replacing nohz.stats_state with a cpumask would go.
>
> I've replaced nohz.stats_state by nohz.stale_cpus_mask. I kept changes minimal - there are some places where I think nohz.idle_cpus_mask could be substituted by nohz.stale_cpus_mask. Speaking about that, I was about to write a comment regarding the fact that nohz.stale_cpus_mask should be a subset of nohz.idle_cpus_mask, but I realized it's actually not true:
>
> In the current implementation (cpumask or stats_state flag), an idle CPU is defined as having blocked load as soon as it goes through nohz_balance_enter_idle(), and that flag is cleared when we go through _nohz_idle_balance() (and newly idle load balance in my cpumask implementation).
> However we can imagine a scenario where a CPU goes idle, is flagged as having blocked load, then it wakes up and goes through its periodic balance code and updates that load. Yet, the flag (or cpumask) won't have been updated.
> So I think there could be a discussion on whether the flag should be cleared on nohz_balance_exit_idle() if we assume periodic balance now takes care of this. It could cause issues if we have a weird scenario where a CPU keeps going online/idle but never stays online long enough for a tick though.
> Alternatively we could clear that flag when going through the first periodic balance after idling, but then that's a bit weird because we're using a nohz structure in a non-nohz context.
>
>
> Anyway, I tried to get some profiling done with the cpumask but there's something wrong with my setup, I would only get nonsense numbers (for both baseline & my patch), so I added start/end trace_printks to _nohz_idle_balance(). It's ugly, inaccurate and unorthodox but it still gives a rough idea of how the cpumask impacts stuff.
> I ran 20 iterations of my "nohz update" test case (a task accumulates load, goes to sleep, and another always-running task keeps kicking an ILB to decay that blocked load) and the time saved by skipping CPUs is in the ballpark of 20%. Notebook is at [1].

Even if saving time in the ILB is interesting, we have to check the
impact of bitmask operation on the wake up latency of the CPUs when
the system is heavily used and generate a lot of sleep/wakeup on CPUs


>
> I'll try to get a proper function profiling working for when Vincent posts his "v2".
>
> [1]: https://gist.github.com/valschneider/6f203143bee1e149f24c44e9582a9eff
>
> ---

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH 1/3] sched: Stop nohz stats when decayed
  2018-02-06  8:32                               ` [PATCH 1/3] sched: Stop nohz stats when decayed Vincent Guittot
                                                   ` (2 preceding siblings ...)
  2018-02-06  8:55                                 ` [PATCH 1/3] sched: Stop nohz stats when decayed Vincent Guittot
@ 2018-02-06 14:16                                 ` Valentin Schneider
  2018-02-06 14:31                                   ` Vincent Guittot
  3 siblings, 1 reply; 56+ messages in thread
From: Valentin Schneider @ 2018-02-06 14:16 UTC (permalink / raw)
  To: Vincent Guittot, peterz, mingo, linux-kernel
  Cc: morten.rasmussen, brendan.jackman, dietmar.eggemann

Hi Vincent,

On 02/06/2018 08:32 AM, Vincent Guittot wrote:
> Stopped the periodic update of blocked load when all idle CPUs have fully
> decayed. We introduce a new nohz.has_blocked that reflect if some idle
> CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
> is set everytime that a Idle CPU can have blocked load and it is then clear
> when no more blocked load has been detected during an update. We don't need
> atomic operation but only to make cure of the right ordering when updating
> nohz.idle_cpus_mask and nohz.has_blocked.
> 
> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>  kernel/sched/fair.c  | 94 +++++++++++++++++++++++++++++++++++++++++-----------
>  kernel/sched/sched.h |  1 +
>  2 files changed, 75 insertions(+), 20 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7af1fa9..279f4b2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5383,8 +5383,9 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
>  static struct {
>  	cpumask_var_t idle_cpus_mask;
>  	atomic_t nr_cpus;
> +	int has_blocked;		/* Idle CPUS has blocked load */
>  	unsigned long next_balance;     /* in jiffy units */
> -	unsigned long next_stats;
> +	unsigned long next_blocked;	/* Next update of blocked load in jiffies */
>  } nohz ____cacheline_aligned;
>  
>  #endif /* CONFIG_NO_HZ_COMMON */
> @@ -6951,6 +6952,7 @@ enum fbq_type { regular, remote, all };
>  #define LBF_DST_PINNED  0x04
>  #define LBF_SOME_PINNED	0x08
>  #define LBF_NOHZ_STATS	0x10
> +#define LBF_NOHZ_AGAIN	0x20
>  
>  struct lb_env {
>  	struct sched_domain	*sd;
> @@ -7335,8 +7337,6 @@ static void attach_tasks(struct lb_env *env)
>  	rq_unlock(env->dst_rq, &rf);
>  }
>  
> -#ifdef CONFIG_FAIR_GROUP_SCHED
> -
>  static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>  {
>  	if (cfs_rq->load.weight)
> @@ -7354,11 +7354,14 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>  	return true;
>  }
>  
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +
>  static void update_blocked_averages(int cpu)
>  {
>  	struct rq *rq = cpu_rq(cpu);
>  	struct cfs_rq *cfs_rq, *pos;
>  	struct rq_flags rf;
> +	bool done = true;
>  
>  	rq_lock_irqsave(rq, &rf);
>  	update_rq_clock(rq);
> @@ -7388,10 +7391,14 @@ static void update_blocked_averages(int cpu)
>  		 */
>  		if (cfs_rq_is_decayed(cfs_rq))
>  			list_del_leaf_cfs_rq(cfs_rq);
> +		else
> +			done = false;
>  	}
>  
>  #ifdef CONFIG_NO_HZ_COMMON
>  	rq->last_blocked_load_update_tick = jiffies;
> +	if (done)
> +		rq->has_blocked_load = 0;
>  #endif
>  	rq_unlock_irqrestore(rq, &rf);
>  }
> @@ -7454,6 +7461,8 @@ static inline void update_blocked_averages(int cpu)
>  	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>  #ifdef CONFIG_NO_HZ_COMMON
>  	rq->last_blocked_load_update_tick = jiffies;
> +	if (cfs_rq_is_decayed(cfs_rq))
> +		rq->has_blocked_load = 0;
>  #endif
>  	rq_unlock_irqrestore(rq, &rf);
>  }
> @@ -7789,18 +7798,25 @@ group_type group_classify(struct sched_group *group,
>  	return group_other;
>  }
>  
> -static void update_nohz_stats(struct rq *rq)
> +static bool update_nohz_stats(struct rq *rq)
>  {
>  #ifdef CONFIG_NO_HZ_COMMON
>  	unsigned int cpu = rq->cpu;
>  
> +	if (!rq->has_blocked_load)
> +		return false;
> +
>  	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
> -		return;
> +		return false;
>  
>  	if (!time_after(jiffies, rq->last_blocked_load_update_tick))

I forgot to ask this on the initial thread - what's the point of this condition ? At first glance I would have said that this would make more sense:

if (!time_after(jiffies, rq->last_blocked_load_update_tick + msecs_to_jiffies(LOAD_AVG_PERIOD))
	return false;

But maybe it's simply there to skip an update if it has already been done in the same jiffy interval ?

> -		return;
> +		return true;
>  
>  	update_blocked_averages(cpu);
> +
> +	return rq->has_blocked_load;
> +#else
> +	return false;
>  #endif
>  }
>  
> @@ -7826,8 +7842,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>  	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
>  		struct rq *rq = cpu_rq(i);
>  
> -		if (env->flags & LBF_NOHZ_STATS)
> -			update_nohz_stats(rq);
> +		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
> +			env->flags |= LBF_NOHZ_AGAIN;
>  
>  		/* Bias balancing toward cpus of our domain */
>  		if (local_group)
> @@ -7979,18 +7995,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>  	struct sg_lb_stats *local = &sds->local_stat;
>  	struct sg_lb_stats tmp_sgs;
>  	int load_idx, prefer_sibling = 0;
> +	int has_blocked = READ_ONCE(nohz.has_blocked);
>  	bool overload = false;
>  
>  	if (child && child->flags & SD_PREFER_SIBLING)
>  		prefer_sibling = 1;
>  
>  #ifdef CONFIG_NO_HZ_COMMON
> -	if (env->idle == CPU_NEWLY_IDLE) {
> +	if (env->idle == CPU_NEWLY_IDLE && has_blocked)
>  		env->flags |= LBF_NOHZ_STATS;
> -
> -		if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
> -			nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
> -	}
>  #endif
>  
>  	load_idx = get_sd_load_idx(env->sd, env->idle);
> @@ -8046,6 +8059,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>  		sg = sg->next;
>  	} while (sg != env->sd->groups);
>  
> +#ifdef CONFIG_NO_HZ_COMMON
> +	if ((env->flags & LBF_NOHZ_AGAIN) &&
> +	    cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
> +
> +		WRITE_ONCE(nohz.next_blocked,
> +				jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
> +	}
> +#endif
> +
>  	if (env->sd->flags & SD_NUMA)
>  		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
>  
> @@ -9069,6 +9091,8 @@ static void nohz_balancer_kick(struct rq *rq)
>  	struct sched_domain *sd;
>  	int nr_busy, i, cpu = rq->cpu;
>  	unsigned int flags = 0;
> +	unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
> +	unsigned long next = READ_ONCE(nohz.next_blocked);

What about something slightly more explicit, e.g. next_stats/next_blocked ? There's also nohz.next_balance referenced here so IMO it's best to keep things clear.

>  
>  	if (unlikely(rq->idle_balance))
>  		return;
> @@ -9086,7 +9110,7 @@ static void nohz_balancer_kick(struct rq *rq)
>  	if (likely(!atomic_read(&nohz.nr_cpus)))
>  		return;
>  
> -	if (time_after(now, nohz.next_stats))
> +	if (time_after(now, next) && has_blocked)
>  		flags = NOHZ_STATS_KICK;
>  
>  	if (time_before(now, nohz.next_balance))
> @@ -9207,13 +9231,15 @@ void nohz_balance_enter_idle(int cpu)
>  	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
>  		return;
>  
> +	rq->has_blocked_load = 1;
> +
>  	if (rq->nohz_tick_stopped)
> -		return;
> +		goto out;
>  
>  	/*
>  	 * If we're a completely isolated CPU, we don't play.
>  	 */
> -	if (on_null_domain(cpu_rq(cpu)))
> +	if (on_null_domain(rq))
>  		return;
>  
>  	rq->nohz_tick_stopped = 1;
> @@ -9222,6 +9248,13 @@ void nohz_balance_enter_idle(int cpu)
>  	atomic_inc(&nohz.nr_cpus);
>  
>  	set_cpu_sd_state_idle(cpu);
> +
> +out:
> +	/*
> +	 * Each time a cpu enter idle, we assume that it has blocked load and
> +	 * enable the periodic update of the load of idle cpus
> +	 */
> +	WRITE_ONCE(nohz.has_blocked, 1);
>  }
>  #else
>  static inline void nohz_balancer_kick(struct rq *rq) { }
> @@ -9374,6 +9407,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  
>  	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>  
> +	/*
> +	 * We assume there will be no idle load after this update and clear
> +	 * the stats state. If a cpu enters idle in the mean time, it will

s/stats state/has_blocked flag/ (repeated a few times in the comment block)

> +	 * set the stats state and trig another update of idle load.
> +	 * Because a cpu that becomes idle, is added to idle_cpus_mask before
> +	 * setting the stats state, we are sure to not clear the state and not
> +	 * check the load of an idle cpu.
> +	 */
> +	WRITE_ONCE(nohz.has_blocked, 0);
> +
>  	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>  		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>  			continue;
> @@ -9383,11 +9426,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  		 * work being done for other cpus. Next load
>  		 * balancing owner will pick it up.
>  		 */
> -		if (need_resched())
> -			break;
> +		if (need_resched()) {
> +			has_blocked_load = true;
> +			goto abort;
> +		}
>  
>  		rq = cpu_rq(balance_cpu);
>  
> +		update_blocked_averages(rq->cpu);
> +		has_blocked_load |= rq->has_blocked_load;
> +
>  		/*
>  		 * If time for next balance is due,
>  		 * do the balance.
> @@ -9400,7 +9448,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  			cpu_load_update_idle(rq);
>  			rq_unlock_irq(rq, &rf);
>  
> -			update_blocked_averages(rq->cpu);
>  			if (flags & NOHZ_BALANCE_KICK)
>  				rebalance_domains(rq, CPU_IDLE);
>  		}
> @@ -9415,7 +9462,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  	if (flags & NOHZ_BALANCE_KICK)
>  		rebalance_domains(this_rq, CPU_IDLE);
>  
> -	nohz.next_stats = next_stats;
> +	WRITE_ONCE(nohz.next_blocked,
> +		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> +
> +abort:
> +	/* There is still blocked load, enable periodic update */
> +	if (has_blocked_load)
> +		WRITE_ONCE(nohz.has_blocked, 1);
>  
>  	/*
>  	 * next_balance will be updated only when there is a need.
> @@ -10046,6 +10099,7 @@ __init void init_sched_fair_class(void)
>  
>  #ifdef CONFIG_NO_HZ_COMMON
>  	nohz.next_balance = jiffies;
> +	nohz.next_blocked = jiffies;
>  	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
>  #endif
>  #endif /* SMP */
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e200045..ad9b929 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -723,6 +723,7 @@ struct rq {
>  #ifdef CONFIG_SMP
>  	unsigned long last_load_update_tick;
>  	unsigned long last_blocked_load_update_tick;
> +	unsigned int has_blocked_load;
>  #endif /* CONFIG_SMP */
>  	unsigned int nohz_tick_stopped;
>  	atomic_t nohz_flags;
> 

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH 1/3] sched: Stop nohz stats when decayed
  2018-02-06 14:16                                 ` Valentin Schneider
@ 2018-02-06 14:31                                   ` Vincent Guittot
  0 siblings, 0 replies; 56+ messages in thread
From: Vincent Guittot @ 2018-02-06 14:31 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Morten Rasmussen,
	Brendan Jackman, Dietmar Eggemann

On 6 February 2018 at 15:16, Valentin Schneider
<valentin.schneider@arm.com> wrote:
> Hi Vincent,
>
> On 02/06/2018 08:32 AM, Vincent Guittot wrote:
>> Stopped the periodic update of blocked load when all idle CPUs have fully
>> decayed. We introduce a new nohz.has_blocked that reflect if some idle
>> CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
>> is set everytime that a Idle CPU can have blocked load and it is then clear
>> when no more blocked load has been detected during an update. We don't need
>> atomic operation but only to make cure of the right ordering when updating
>> nohz.idle_cpus_mask and nohz.has_blocked.
>>
>> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>> ---
>>  kernel/sched/fair.c  | 94 +++++++++++++++++++++++++++++++++++++++++-----------
>>  kernel/sched/sched.h |  1 +
>>  2 files changed, 75 insertions(+), 20 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 7af1fa9..279f4b2 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5383,8 +5383,9 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
>>  static struct {
>>       cpumask_var_t idle_cpus_mask;
>>       atomic_t nr_cpus;
>> +     int has_blocked;                /* Idle CPUS has blocked load */
>>       unsigned long next_balance;     /* in jiffy units */
>> -     unsigned long next_stats;
>> +     unsigned long next_blocked;     /* Next update of blocked load in jiffies */
>>  } nohz ____cacheline_aligned;
>>
>>  #endif /* CONFIG_NO_HZ_COMMON */
>> @@ -6951,6 +6952,7 @@ enum fbq_type { regular, remote, all };
>>  #define LBF_DST_PINNED  0x04
>>  #define LBF_SOME_PINNED      0x08
>>  #define LBF_NOHZ_STATS       0x10
>> +#define LBF_NOHZ_AGAIN       0x20
>>
>>  struct lb_env {
>>       struct sched_domain     *sd;
>> @@ -7335,8 +7337,6 @@ static void attach_tasks(struct lb_env *env)
>>       rq_unlock(env->dst_rq, &rf);
>>  }
>>
>> -#ifdef CONFIG_FAIR_GROUP_SCHED
>> -
>>  static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>>  {
>>       if (cfs_rq->load.weight)
>> @@ -7354,11 +7354,14 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>>       return true;
>>  }
>>
>> +#ifdef CONFIG_FAIR_GROUP_SCHED
>> +
>>  static void update_blocked_averages(int cpu)
>>  {
>>       struct rq *rq = cpu_rq(cpu);
>>       struct cfs_rq *cfs_rq, *pos;
>>       struct rq_flags rf;
>> +     bool done = true;
>>
>>       rq_lock_irqsave(rq, &rf);
>>       update_rq_clock(rq);
>> @@ -7388,10 +7391,14 @@ static void update_blocked_averages(int cpu)
>>                */
>>               if (cfs_rq_is_decayed(cfs_rq))
>>                       list_del_leaf_cfs_rq(cfs_rq);
>> +             else
>> +                     done = false;
>>       }
>>
>>  #ifdef CONFIG_NO_HZ_COMMON
>>       rq->last_blocked_load_update_tick = jiffies;
>> +     if (done)
>> +             rq->has_blocked_load = 0;
>>  #endif
>>       rq_unlock_irqrestore(rq, &rf);
>>  }
>> @@ -7454,6 +7461,8 @@ static inline void update_blocked_averages(int cpu)
>>       update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>>  #ifdef CONFIG_NO_HZ_COMMON
>>       rq->last_blocked_load_update_tick = jiffies;
>> +     if (cfs_rq_is_decayed(cfs_rq))
>> +             rq->has_blocked_load = 0;
>>  #endif
>>       rq_unlock_irqrestore(rq, &rf);
>>  }
>> @@ -7789,18 +7798,25 @@ group_type group_classify(struct sched_group *group,
>>       return group_other;
>>  }
>>
>> -static void update_nohz_stats(struct rq *rq)
>> +static bool update_nohz_stats(struct rq *rq)
>>  {
>>  #ifdef CONFIG_NO_HZ_COMMON
>>       unsigned int cpu = rq->cpu;
>>
>> +     if (!rq->has_blocked_load)
>> +             return false;
>> +
>>       if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
>> -             return;
>> +             return false;
>>
>>       if (!time_after(jiffies, rq->last_blocked_load_update_tick))
>
> I forgot to ask this on the initial thread - what's the point of this condition ? At first glance I would have said that this would make more sense:
>
> if (!time_after(jiffies, rq->last_blocked_load_update_tick + msecs_to_jiffies(LOAD_AVG_PERIOD))
>         return false;
>
> But maybe it's simply there to skip an update if it has already been done in the same jiffy interval ?

That's exactly the purpose.

>
>> -             return;
>> +             return true;
>>
>>       update_blocked_averages(cpu);
>> +
>> +     return rq->has_blocked_load;
>> +#else
>> +     return false;
>>  #endif
>>  }
>>
>> @@ -7826,8 +7842,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>>       for_each_cpu_and(i, sched_group_span(group), env->cpus) {
>>               struct rq *rq = cpu_rq(i);
>>
>> -             if (env->flags & LBF_NOHZ_STATS)
>> -                     update_nohz_stats(rq);
>> +             if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
>> +                     env->flags |= LBF_NOHZ_AGAIN;
>>
>>               /* Bias balancing toward cpus of our domain */
>>               if (local_group)
>> @@ -7979,18 +7995,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>>       struct sg_lb_stats *local = &sds->local_stat;
>>       struct sg_lb_stats tmp_sgs;
>>       int load_idx, prefer_sibling = 0;
>> +     int has_blocked = READ_ONCE(nohz.has_blocked);
>>       bool overload = false;
>>
>>       if (child && child->flags & SD_PREFER_SIBLING)
>>               prefer_sibling = 1;
>>
>>  #ifdef CONFIG_NO_HZ_COMMON
>> -     if (env->idle == CPU_NEWLY_IDLE) {
>> +     if (env->idle == CPU_NEWLY_IDLE && has_blocked)
>>               env->flags |= LBF_NOHZ_STATS;
>> -
>> -             if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
>> -                     nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
>> -     }
>>  #endif
>>
>>       load_idx = get_sd_load_idx(env->sd, env->idle);
>> @@ -8046,6 +8059,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>>               sg = sg->next;
>>       } while (sg != env->sd->groups);
>>
>> +#ifdef CONFIG_NO_HZ_COMMON
>> +     if ((env->flags & LBF_NOHZ_AGAIN) &&
>> +         cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
>> +
>> +             WRITE_ONCE(nohz.next_blocked,
>> +                             jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> +     }
>> +#endif
>> +
>>       if (env->sd->flags & SD_NUMA)
>>               env->fbq_type = fbq_classify_group(&sds->busiest_stat);
>>
>> @@ -9069,6 +9091,8 @@ static void nohz_balancer_kick(struct rq *rq)
>>       struct sched_domain *sd;
>>       int nr_busy, i, cpu = rq->cpu;
>>       unsigned int flags = 0;
>> +     unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
>> +     unsigned long next = READ_ONCE(nohz.next_blocked);
>
> What about something slightly more explicit, e.g. next_stats/next_blocked ? There's also nohz.next_balance referenced here so IMO it's best to keep things clear.

ok

>
>>
>>       if (unlikely(rq->idle_balance))
>>               return;
>> @@ -9086,7 +9110,7 @@ static void nohz_balancer_kick(struct rq *rq)
>>       if (likely(!atomic_read(&nohz.nr_cpus)))
>>               return;
>>
>> -     if (time_after(now, nohz.next_stats))
>> +     if (time_after(now, next) && has_blocked)
>>               flags = NOHZ_STATS_KICK;
>>
>>       if (time_before(now, nohz.next_balance))
>> @@ -9207,13 +9231,15 @@ void nohz_balance_enter_idle(int cpu)
>>       if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
>>               return;
>>
>> +     rq->has_blocked_load = 1;
>> +
>>       if (rq->nohz_tick_stopped)
>> -             return;
>> +             goto out;
>>
>>       /*
>>        * If we're a completely isolated CPU, we don't play.
>>        */
>> -     if (on_null_domain(cpu_rq(cpu)))
>> +     if (on_null_domain(rq))
>>               return;
>>
>>       rq->nohz_tick_stopped = 1;
>> @@ -9222,6 +9248,13 @@ void nohz_balance_enter_idle(int cpu)
>>       atomic_inc(&nohz.nr_cpus);
>>
>>       set_cpu_sd_state_idle(cpu);
>> +
>> +out:
>> +     /*
>> +      * Each time a cpu enter idle, we assume that it has blocked load and
>> +      * enable the periodic update of the load of idle cpus
>> +      */
>> +     WRITE_ONCE(nohz.has_blocked, 1);
>>  }
>>  #else
>>  static inline void nohz_balancer_kick(struct rq *rq) { }
>> @@ -9374,6 +9407,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>
>>       SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>
>> +     /*
>> +      * We assume there will be no idle load after this update and clear
>> +      * the stats state. If a cpu enters idle in the mean time, it will
>
> s/stats state/has_blocked flag/ (repeated a few times in the comment block)

yes. I will update others as well

>
>> +      * set the stats state and trig another update of idle load.
>> +      * Because a cpu that becomes idle, is added to idle_cpus_mask before
>> +      * setting the stats state, we are sure to not clear the state and not
>> +      * check the load of an idle cpu.
>> +      */
>> +     WRITE_ONCE(nohz.has_blocked, 0);
>> +
>>       for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>>               if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>>                       continue;
>> @@ -9383,11 +9426,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>                * work being done for other cpus. Next load
>>                * balancing owner will pick it up.
>>                */
>> -             if (need_resched())
>> -                     break;
>> +             if (need_resched()) {
>> +                     has_blocked_load = true;
>> +                     goto abort;
>> +             }
>>
>>               rq = cpu_rq(balance_cpu);
>>
>> +             update_blocked_averages(rq->cpu);
>> +             has_blocked_load |= rq->has_blocked_load;
>> +
>>               /*
>>                * If time for next balance is due,
>>                * do the balance.
>> @@ -9400,7 +9448,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>                       cpu_load_update_idle(rq);
>>                       rq_unlock_irq(rq, &rf);
>>
>> -                     update_blocked_averages(rq->cpu);
>>                       if (flags & NOHZ_BALANCE_KICK)
>>                               rebalance_domains(rq, CPU_IDLE);
>>               }
>> @@ -9415,7 +9462,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>       if (flags & NOHZ_BALANCE_KICK)
>>               rebalance_domains(this_rq, CPU_IDLE);
>>
>> -     nohz.next_stats = next_stats;
>> +     WRITE_ONCE(nohz.next_blocked,
>> +             now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> +
>> +abort:
>> +     /* There is still blocked load, enable periodic update */
>> +     if (has_blocked_load)
>> +             WRITE_ONCE(nohz.has_blocked, 1);
>>
>>       /*
>>        * next_balance will be updated only when there is a need.
>> @@ -10046,6 +10099,7 @@ __init void init_sched_fair_class(void)
>>
>>  #ifdef CONFIG_NO_HZ_COMMON
>>       nohz.next_balance = jiffies;
>> +     nohz.next_blocked = jiffies;
>>       zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
>>  #endif
>>  #endif /* SMP */
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index e200045..ad9b929 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -723,6 +723,7 @@ struct rq {
>>  #ifdef CONFIG_SMP
>>       unsigned long last_load_update_tick;
>>       unsigned long last_blocked_load_update_tick;
>> +     unsigned int has_blocked_load;
>>  #endif /* CONFIG_SMP */
>>       unsigned int nohz_tick_stopped;
>>       atomic_t nohz_flags;
>>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH 3/3] sched: update blocked load when newly idle
  2018-02-06  8:32                                 ` [PATCH 3/3] sched: update blocked load when newly idle Vincent Guittot
@ 2018-02-06 14:32                                   ` Valentin Schneider
  2018-02-06 16:17                                     ` Vincent Guittot
  0 siblings, 1 reply; 56+ messages in thread
From: Valentin Schneider @ 2018-02-06 14:32 UTC (permalink / raw)
  To: Vincent Guittot, peterz, mingo, linux-kernel
  Cc: morten.rasmussen, brendan.jackman, dietmar.eggemann

Hi Vincent,

On 02/06/2018 08:32 AM, Vincent Guittot wrote:
> When NEWLY_IDLE load balance is not triggered, we might need to update the
> blocked load anyway. We can kick an ilb so an idle CPU will take care of
> updating blocked load or we can try to update them locally before entering
> idle. In the latter case, we reuse part of the nohz_idle_balance.
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>  kernel/sched/fair.c | 102 ++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 84 insertions(+), 18 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6998528..256defe 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8829,6 +8829,9 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
>  		*next_balance = next;
>  }
>  
> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
> +static void kick_ilb(unsigned int flags);
> +
>  /*
>   * idle_balance is called by schedule() if this_cpu is about to become
>   * idle. Attempts to pull tasks from other CPUs.
> @@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>  
>  	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>  	    !this_rq->rd->overload) {
> +		unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
> +		unsigned long next = READ_ONCE(nohz.next_blocked);

Ditto on 'next' - there's next_balance referenced in here so it'd be nice to make clear which is which.

> +
>  		rcu_read_lock();
>  		sd = rcu_dereference_check_sched_domain(this_rq->sd);
>  		if (sd)
>  			update_next_balance(sd, &next_balance);
>  		rcu_read_unlock();
>  
> +		/*
> +		 * Update blocked idle load if it has not been done for a
> +		 * while. Try to do it locally before entering idle but kick a
> +		 * ilb if it takes too much time and/or might delay next local
> +		 * wake up
> +		 */
> +		if (has_blocked && time_after_eq(jiffies, next) &&
> +				(this_rq->avg_idle < sysctl_sched_migration_cost ||
> +				!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))

"this_rq->avg_idle < sysctl_sched_migration_cost" is used twice now, how about storing it in an "idle_too_short" variable ?

> +			kick_ilb(NOHZ_STATS_KICK);
> +
>  		goto out;
>  	}
>  
> @@ -9393,30 +9410,24 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
>  
>  #ifdef CONFIG_NO_HZ_COMMON
>  /*
> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
> + * Internal function that runs load balance for all idle cpus. The load balance
> + * can be a simple update of blocked load or a complete load balance with
> + * tasks movement depending of flags.
> + * For newly idle mode, we abort the loop if it takes too much time and return
> + * false to notify that the loop has not be completed and a ilb should be kick.
>   */
> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
>  {
>  	/* Earliest time when we have to do rebalance again */
>  	unsigned long now = jiffies;
>  	unsigned long next_balance = now + 60*HZ;
> -	unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
> +	bool has_blocked_load = false;
>  	int update_next_balance = 0;
>  	int this_cpu = this_rq->cpu;
> -	unsigned int flags;
>  	int balance_cpu;
> +	int ret = false;
>  	struct rq *rq;
> -
> -	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
> -		return false;
> -
> -	if (idle != CPU_IDLE) {
> -		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> -		return false;
> -	}
> -
> -	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> +	u64 curr_cost = 0;
>  
>  	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>  
> @@ -9431,6 +9442,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  	WRITE_ONCE(nohz.has_blocked, 0);
>  
>  	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
> +		u64 t0, domain_cost;
> +
> +		t0 = sched_clock_cpu(this_cpu);
> +
>  		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>  			continue;
>  
> @@ -9444,6 +9459,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  			goto abort;
>  		}
>  
> +		/*
> +		 * If the update is done while CPU becomes idle, we abort
> +		 * the update when its cost is higher than the average idle
> +		 * time in orde to not delay a possible wake up.
> +		 */
> +		if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
> +			has_blocked_load = true;
> +			goto abort;
> +		}
> +
>  		rq = cpu_rq(balance_cpu);
>  
>  		update_blocked_averages(rq->cpu);
> @@ -9456,10 +9481,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  		if (time_after_eq(jiffies, rq->next_balance)) {
>  			struct rq_flags rf;
>  
> -			rq_lock_irq(rq, &rf);
> +			rq_lock_irqsave(rq, &rf);
>  			update_rq_clock(rq);
>  			cpu_load_update_idle(rq);
> -			rq_unlock_irq(rq, &rf);
> +			rq_unlock_irqrestore(rq, &rf);
>  
>  			if (flags & NOHZ_BALANCE_KICK)
>  				rebalance_domains(rq, CPU_IDLE);
> @@ -9469,15 +9494,27 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  			next_balance = rq->next_balance;
>  			update_next_balance = 1;
>  		}
> +
> +		domain_cost = sched_clock_cpu(this_cpu) - t0;
> +		curr_cost += domain_cost;
> +
> +	}
> +
> +	/* Newly idle CPU doesn't need an update */
> +	if (idle != CPU_NEWLY_IDLE) {
> +		update_blocked_averages(this_cpu);
> +		has_blocked_load |= this_rq->has_blocked_load;
>  	}
>  
> -	update_blocked_averages(this_cpu);
>  	if (flags & NOHZ_BALANCE_KICK)
>  		rebalance_domains(this_rq, CPU_IDLE);
>  
>  	WRITE_ONCE(nohz.next_blocked,
>  		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>  
> +	/* The full idle balance loop has been done */
> +	ret = true;
> +
>  abort:
>  	/* There is still blocked load, enable periodic update */
>  	if (has_blocked_load)
> @@ -9491,6 +9528,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  	if (likely(update_next_balance))
>  		nohz.next_balance = next_balance;
>  
> +	return ret;
> +}
> +
> +/*
> + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
> + * rebalancing for all the cpus for whom scheduler ticks are stopped.
> + */
> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> +{
> +	int this_cpu = this_rq->cpu;
> +	unsigned int flags;
> +
> +	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
> +		return false;
> +
> +	if (idle != CPU_IDLE) {
> +		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> +		return false;
> +	}
> +
> +	/*
> +	 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
> +	 */
> +	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> +	if (!(flags & NOHZ_KICK_MASK))
> +		return false;
> +
> +	_nohz_idle_balance(this_rq, flags, idle);
> +
>  	return true;
>  }
>  #else
> 

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH 3/3] sched: update blocked load when newly idle
  2018-02-06 14:32                                   ` Valentin Schneider
@ 2018-02-06 16:17                                     ` Vincent Guittot
  2018-02-06 16:32                                       ` Valentin Schneider
  0 siblings, 1 reply; 56+ messages in thread
From: Vincent Guittot @ 2018-02-06 16:17 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Morten Rasmussen,
	Brendan Jackman, Dietmar Eggemann

On 6 February 2018 at 15:32, Valentin Schneider
<valentin.schneider@arm.com> wrote:
> Hi Vincent,
>
> On 02/06/2018 08:32 AM, Vincent Guittot wrote:
>> When NEWLY_IDLE load balance is not triggered, we might need to update the
>> blocked load anyway. We can kick an ilb so an idle CPU will take care of
>> updating blocked load or we can try to update them locally before entering
>> idle. In the latter case, we reuse part of the nohz_idle_balance.
>>
>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>> ---
>>  kernel/sched/fair.c | 102 ++++++++++++++++++++++++++++++++++++++++++----------
>>  1 file changed, 84 insertions(+), 18 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 6998528..256defe 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -8829,6 +8829,9 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
>>               *next_balance = next;
>>  }
>>
>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
>> +static void kick_ilb(unsigned int flags);
>> +
>>  /*
>>   * idle_balance is called by schedule() if this_cpu is about to become
>>   * idle. Attempts to pull tasks from other CPUs.
>> @@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>>
>>       if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>>           !this_rq->rd->overload) {
>> +             unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
>> +             unsigned long next = READ_ONCE(nohz.next_blocked);
>
> Ditto on 'next' - there's next_balance referenced in here so it'd be nice to make clear which is which.
>
>> +
>>               rcu_read_lock();
>>               sd = rcu_dereference_check_sched_domain(this_rq->sd);
>>               if (sd)
>>                       update_next_balance(sd, &next_balance);
>>               rcu_read_unlock();
>>
>> +             /*
>> +              * Update blocked idle load if it has not been done for a
>> +              * while. Try to do it locally before entering idle but kick a
>> +              * ilb if it takes too much time and/or might delay next local
>> +              * wake up
>> +              */
>> +             if (has_blocked && time_after_eq(jiffies, next) &&
>> +                             (this_rq->avg_idle < sysctl_sched_migration_cost ||
>> +                             !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
>
> "this_rq->avg_idle < sysctl_sched_migration_cost" is used twice now, how about storing it in an "idle_too_short" variable ?

In fact it's already the 3rd time
Why do you want it to be stored in an "idle_too_short" variable ?

>
>> +                     kick_ilb(NOHZ_STATS_KICK);
>> +
>>               goto out;
>>       }
>>
>> @@ -9393,30 +9410,24 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
>>
>>  #ifdef CONFIG_NO_HZ_COMMON
>>  /*
>> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
>> + * Internal function that runs load balance for all idle cpus. The load balance
>> + * can be a simple update of blocked load or a complete load balance with
>> + * tasks movement depending of flags.
>> + * For newly idle mode, we abort the loop if it takes too much time and return
>> + * false to notify that the loop has not be completed and a ilb should be kick.
>>   */
>> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
>>  {
>>       /* Earliest time when we have to do rebalance again */
>>       unsigned long now = jiffies;
>>       unsigned long next_balance = now + 60*HZ;
>> -     unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
>> +     bool has_blocked_load = false;
>>       int update_next_balance = 0;
>>       int this_cpu = this_rq->cpu;
>> -     unsigned int flags;
>>       int balance_cpu;
>> +     int ret = false;
>>       struct rq *rq;
>> -
>> -     if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>> -             return false;
>> -
>> -     if (idle != CPU_IDLE) {
>> -             atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> -             return false;
>> -     }
>> -
>> -     flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> +     u64 curr_cost = 0;
>>
>>       SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>
>> @@ -9431,6 +9442,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>       WRITE_ONCE(nohz.has_blocked, 0);
>>
>>       for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>> +             u64 t0, domain_cost;
>> +
>> +             t0 = sched_clock_cpu(this_cpu);
>> +
>>               if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>>                       continue;
>>
>> @@ -9444,6 +9459,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>                       goto abort;
>>               }
>>
>> +             /*
>> +              * If the update is done while CPU becomes idle, we abort
>> +              * the update when its cost is higher than the average idle
>> +              * time in orde to not delay a possible wake up.
>> +              */
>> +             if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
>> +                     has_blocked_load = true;
>> +                     goto abort;
>> +             }
>> +
>>               rq = cpu_rq(balance_cpu);
>>
>>               update_blocked_averages(rq->cpu);
>> @@ -9456,10 +9481,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>               if (time_after_eq(jiffies, rq->next_balance)) {
>>                       struct rq_flags rf;
>>
>> -                     rq_lock_irq(rq, &rf);
>> +                     rq_lock_irqsave(rq, &rf);
>>                       update_rq_clock(rq);
>>                       cpu_load_update_idle(rq);
>> -                     rq_unlock_irq(rq, &rf);
>> +                     rq_unlock_irqrestore(rq, &rf);
>>
>>                       if (flags & NOHZ_BALANCE_KICK)
>>                               rebalance_domains(rq, CPU_IDLE);
>> @@ -9469,15 +9494,27 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>                       next_balance = rq->next_balance;
>>                       update_next_balance = 1;
>>               }
>> +
>> +             domain_cost = sched_clock_cpu(this_cpu) - t0;
>> +             curr_cost += domain_cost;
>> +
>> +     }
>> +
>> +     /* Newly idle CPU doesn't need an update */
>> +     if (idle != CPU_NEWLY_IDLE) {
>> +             update_blocked_averages(this_cpu);
>> +             has_blocked_load |= this_rq->has_blocked_load;
>>       }
>>
>> -     update_blocked_averages(this_cpu);
>>       if (flags & NOHZ_BALANCE_KICK)
>>               rebalance_domains(this_rq, CPU_IDLE);
>>
>>       WRITE_ONCE(nohz.next_blocked,
>>               now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>>
>> +     /* The full idle balance loop has been done */
>> +     ret = true;
>> +
>>  abort:
>>       /* There is still blocked load, enable periodic update */
>>       if (has_blocked_load)
>> @@ -9491,6 +9528,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>       if (likely(update_next_balance))
>>               nohz.next_balance = next_balance;
>>
>> +     return ret;
>> +}
>> +
>> +/*
>> + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>> + * rebalancing for all the cpus for whom scheduler ticks are stopped.
>> + */
>> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> +{
>> +     int this_cpu = this_rq->cpu;
>> +     unsigned int flags;
>> +
>> +     if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>> +             return false;
>> +
>> +     if (idle != CPU_IDLE) {
>> +             atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> +             return false;
>> +     }
>> +
>> +     /*
>> +      * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>> +      */
>> +     flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> +     if (!(flags & NOHZ_KICK_MASK))
>> +             return false;
>> +
>> +     _nohz_idle_balance(this_rq, flags, idle);
>> +
>>       return true;
>>  }
>>  #else
>>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH 3/3] sched: update blocked load when newly idle
  2018-02-06 16:17                                     ` Vincent Guittot
@ 2018-02-06 16:32                                       ` Valentin Schneider
  0 siblings, 0 replies; 56+ messages in thread
From: Valentin Schneider @ 2018-02-06 16:32 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Morten Rasmussen,
	Brendan Jackman, Dietmar Eggemann

On 02/06/2018 04:17 PM, Vincent Guittot wrote:
> On 6 February 2018 at 15:32, Valentin Schneider
> <valentin.schneider@arm.com> wrote:
>> Hi Vincent,
>>
>> On 02/06/2018 08:32 AM, Vincent Guittot wrote:
>>> When NEWLY_IDLE load balance is not triggered, we might need to update the
>>> blocked load anyway. We can kick an ilb so an idle CPU will take care of
>>> updating blocked load or we can try to update them locally before entering
>>> idle. In the latter case, we reuse part of the nohz_idle_balance.
>>>
>>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>>> ---
>>>  kernel/sched/fair.c | 102 ++++++++++++++++++++++++++++++++++++++++++----------
>>>  1 file changed, 84 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 6998528..256defe 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -8829,6 +8829,9 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
>>>               *next_balance = next;
>>>  }
>>>
>>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
>>> +static void kick_ilb(unsigned int flags);
>>> +
>>>  /*
>>>   * idle_balance is called by schedule() if this_cpu is about to become
>>>   * idle. Attempts to pull tasks from other CPUs.
>>> @@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>>>
>>>       if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>>>           !this_rq->rd->overload) {
>>> +             unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
>>> +             unsigned long next = READ_ONCE(nohz.next_blocked);
>>
>> Ditto on 'next' - there's next_balance referenced in here so it'd be nice to make clear which is which.
>>
>>> +
>>>               rcu_read_lock();
>>>               sd = rcu_dereference_check_sched_domain(this_rq->sd);
>>>               if (sd)
>>>                       update_next_balance(sd, &next_balance);
>>>               rcu_read_unlock();
>>>
>>> +             /*
>>> +              * Update blocked idle load if it has not been done for a
>>> +              * while. Try to do it locally before entering idle but kick a
>>> +              * ilb if it takes too much time and/or might delay next local
>>> +              * wake up
>>> +              */
>>> +             if (has_blocked && time_after_eq(jiffies, next) &&
>>> +                             (this_rq->avg_idle < sysctl_sched_migration_cost ||
>>> +                             !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
>>
>> "this_rq->avg_idle < sysctl_sched_migration_cost" is used twice now, how about storing it in an "idle_too_short" variable ?
> 
> In fact it's already the 3rd time
> Why do you want it to be stored in an "idle_too_short" variable ?

I meant that locally in idle_balance() to not write the same thing twice. TBH that's me being nitpicky (and liking explicit variables).

> 
>>
>>> +                     kick_ilb(NOHZ_STATS_KICK);
>>> +
>>>               goto out;
>>>       }
>>>
>>> @@ -9393,30 +9410,24 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
>>>
>>>  #ifdef CONFIG_NO_HZ_COMMON
>>>  /*
>>> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>>> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
>>> + * Internal function that runs load balance for all idle cpus. The load balance
>>> + * can be a simple update of blocked load or a complete load balance with
>>> + * tasks movement depending of flags.
>>> + * For newly idle mode, we abort the loop if it takes too much time and return
>>> + * false to notify that the loop has not be completed and a ilb should be kick.
>>>   */
>>> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
>>>  {
>>>       /* Earliest time when we have to do rebalance again */
>>>       unsigned long now = jiffies;
>>>       unsigned long next_balance = now + 60*HZ;
>>> -     unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
>>> +     bool has_blocked_load = false;
>>>       int update_next_balance = 0;
>>>       int this_cpu = this_rq->cpu;
>>> -     unsigned int flags;
>>>       int balance_cpu;
>>> +     int ret = false;
>>>       struct rq *rq;
>>> -
>>> -     if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>>> -             return false;
>>> -
>>> -     if (idle != CPU_IDLE) {
>>> -             atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> -             return false;
>>> -     }
>>> -
>>> -     flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> +     u64 curr_cost = 0;
>>>
>>>       SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>>
>>> @@ -9431,6 +9442,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>>       WRITE_ONCE(nohz.has_blocked, 0);
>>>
>>>       for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>>> +             u64 t0, domain_cost;
>>> +
>>> +             t0 = sched_clock_cpu(this_cpu);
>>> +
>>>               if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>>>                       continue;
>>>
>>> @@ -9444,6 +9459,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>>                       goto abort;
>>>               }
>>>
>>> +             /*
>>> +              * If the update is done while CPU becomes idle, we abort
>>> +              * the update when its cost is higher than the average idle
>>> +              * time in orde to not delay a possible wake up.
>>> +              */
>>> +             if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
>>> +                     has_blocked_load = true;
>>> +                     goto abort;
>>> +             }
>>> +
>>>               rq = cpu_rq(balance_cpu);
>>>
>>>               update_blocked_averages(rq->cpu);
>>> @@ -9456,10 +9481,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>>               if (time_after_eq(jiffies, rq->next_balance)) {
>>>                       struct rq_flags rf;
>>>
>>> -                     rq_lock_irq(rq, &rf);
>>> +                     rq_lock_irqsave(rq, &rf);
>>>                       update_rq_clock(rq);
>>>                       cpu_load_update_idle(rq);
>>> -                     rq_unlock_irq(rq, &rf);
>>> +                     rq_unlock_irqrestore(rq, &rf);
>>>
>>>                       if (flags & NOHZ_BALANCE_KICK)
>>>                               rebalance_domains(rq, CPU_IDLE);
>>> @@ -9469,15 +9494,27 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>>                       next_balance = rq->next_balance;
>>>                       update_next_balance = 1;
>>>               }
>>> +
>>> +             domain_cost = sched_clock_cpu(this_cpu) - t0;
>>> +             curr_cost += domain_cost;
>>> +
>>> +     }
>>> +
>>> +     /* Newly idle CPU doesn't need an update */
>>> +     if (idle != CPU_NEWLY_IDLE) {
>>> +             update_blocked_averages(this_cpu);
>>> +             has_blocked_load |= this_rq->has_blocked_load;
>>>       }
>>>
>>> -     update_blocked_averages(this_cpu);
>>>       if (flags & NOHZ_BALANCE_KICK)
>>>               rebalance_domains(this_rq, CPU_IDLE);
>>>
>>>       WRITE_ONCE(nohz.next_blocked,
>>>               now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>>>
>>> +     /* The full idle balance loop has been done */
>>> +     ret = true;
>>> +
>>>  abort:
>>>       /* There is still blocked load, enable periodic update */
>>>       if (has_blocked_load)
>>> @@ -9491,6 +9528,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>>       if (likely(update_next_balance))
>>>               nohz.next_balance = next_balance;
>>>
>>> +     return ret;
>>> +}
>>> +
>>> +/*
>>> + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>>> + * rebalancing for all the cpus for whom scheduler ticks are stopped.
>>> + */
>>> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> +{
>>> +     int this_cpu = this_rq->cpu;
>>> +     unsigned int flags;
>>> +
>>> +     if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>>> +             return false;
>>> +
>>> +     if (idle != CPU_IDLE) {
>>> +             atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> +             return false;
>>> +     }
>>> +
>>> +     /*
>>> +      * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>>> +      */
>>> +     flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> +     if (!(flags & NOHZ_KICK_MASK))
>>> +             return false;
>>> +
>>> +     _nohz_idle_balance(this_rq, flags, idle);
>>> +
>>>       return true;
>>>  }
>>>  #else
>>>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH 3/3] sched: update blocked load when newly idle
  2018-02-14 14:40   ` Valentin Schneider
@ 2018-02-14 14:43     ` Vincent Guittot
  0 siblings, 0 replies; 56+ messages in thread
From: Vincent Guittot @ 2018-02-14 14:43 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Morten Rasmussen,
	Brendan Jackman, Dietmar Eggemann

On 14 February 2018 at 15:40, Valentin Schneider
<valentin.schneider@arm.com> wrote:
> On 02/13/2018 10:31 AM, Vincent Guittot wrote:
>> When NEWLY_IDLE load balance is not triggered, we might need to update the
>> blocked load anyway. We can kick an ilb so an idle CPU will take care of
>> updating blocked load or we can try to update them locally before entering
>> idle. In the latter case, we reuse part of the nohz_idle_balance.
>>
>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>> ---
>>  kernel/sched/fair.c | 324 +++++++++++++++++++++++++++++++---------------------
>>  1 file changed, 193 insertions(+), 131 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 9183fee..cb1ab5c 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>>
>> [...]
>>
>>  /*
>> + * idle_balance is called by schedule() if this_cpu is about to become
>> + * idle. Attempts to pull tasks from other CPUs.
>> + */
>> +static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>> +{
>> +     unsigned long next_balance = jiffies + HZ;
>> +     int this_cpu = this_rq->cpu;
>> +     struct sched_domain *sd;
>> +     int pulled_task = 0;
>> +     u64 curr_cost = 0;
>> +
>> +     /*
>> +      * We must set idle_stamp _before_ calling idle_balance(), such that we
>> +      * measure the duration of idle_balance() as idle time.
>> +      */
>> +     this_rq->idle_stamp = rq_clock(this_rq);
>> +
>> +     /*
>> +      * Do not pull tasks towards !active CPUs...
>> +      */
>> +     if (!cpu_active(this_cpu))
>> +             return 0;
>> +
>> +     /*
>> +      * This is OK, because current is on_cpu, which avoids it being picked
>> +      * for load-balance and preemption/IRQs are still disabled avoiding
>> +      * further scheduler activity on it and we're being very careful to
>> +      * re-start the picking loop.
>> +      */
>> +     rq_unpin_lock(this_rq, rf);
>> +
>> +     if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>> +         !this_rq->rd->overload) {
>> +#ifdef CONFIG_NO_HZ_COMMON
>> +             unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
>> +             unsigned long next_blocked = READ_ONCE(nohz.next_blocked);
>> +#endif
>> +             rcu_read_lock();
>> +             sd = rcu_dereference_check_sched_domain(this_rq->sd);
>> +             if (sd)
>> +                     update_next_balance(sd, &next_balance);
>> +             rcu_read_unlock();
>> +
>> +#ifdef CONFIG_NO_HZ_COMMON
>> +             /*
>> +              * This CPU doesn't want to be disturbed by scheduler
>> +              * houskeeping
>
> Typo here (houskeeping)
>
>> +              */
>> +             if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
>> +                     goto out;
>> +
>> +             /* Will wake up very soon. No time for fdoing anything else*/
>
> Typo here (fdoing)
>
>> +             if (this_rq->avg_idle < sysctl_sched_migration_cost)
>> +                     goto out;
>> +
>> +             /* Don't need to update blocked load of idle CPUs*/
>> +             if (!has_blocked || time_after_eq(jiffies, next_blocked))
>> +                     goto out;
>
> My "stats update via NEWLY_IDLE" test case started misbehaving with this
> version: we skip most NEWLY_IDLE stats updates. AFAICT this is the culprit.
>
> I believe this time check should be time_before(jiffies, next_blocked)
> (or time_before_eq depending on what you want to guarantee with the jiffy
> interval stuff).

argh.. I have completely mess up the conditions when reordering them

Thanks for spotting this

>
>> +
>> +             raw_spin_unlock(&this_rq->lock);
>> +             /*
>> +              * This CPU is going to be idle and blocked load of idle CPUs
>> +              * need to be updated. Run the ilb locally as it is a good
>> +              * candidate for ilb instead of waking up another idle CPU.
>> +              * Kick an normal ilb if we failed to do the update.
>> +              */
>> +             if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
>> +                     kick_ilb(NOHZ_STATS_KICK);
>> +             raw_spin_lock(&this_rq->lock);
>> +#endif
>> +             goto out;
>> +     }
>> +

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH 3/3] sched: update blocked load when newly idle
  2018-02-13 10:31 ` [PATCH 3/3] sched: update blocked load when newly idle Vincent Guittot
@ 2018-02-14 14:40   ` Valentin Schneider
  2018-02-14 14:43     ` Vincent Guittot
  0 siblings, 1 reply; 56+ messages in thread
From: Valentin Schneider @ 2018-02-14 14:40 UTC (permalink / raw)
  To: Vincent Guittot, peterz, mingo, linux-kernel
  Cc: morten.rasmussen, brendan.jackman, dietmar.eggemann

On 02/13/2018 10:31 AM, Vincent Guittot wrote:
> When NEWLY_IDLE load balance is not triggered, we might need to update the
> blocked load anyway. We can kick an ilb so an idle CPU will take care of
> updating blocked load or we can try to update them locally before entering
> idle. In the latter case, we reuse part of the nohz_idle_balance.
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>  kernel/sched/fair.c | 324 +++++++++++++++++++++++++++++++---------------------
>  1 file changed, 193 insertions(+), 131 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 9183fee..cb1ab5c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
>
> [...]
>
>  /*
> + * idle_balance is called by schedule() if this_cpu is about to become
> + * idle. Attempts to pull tasks from other CPUs.
> + */
> +static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
> +{
> +	unsigned long next_balance = jiffies + HZ;
> +	int this_cpu = this_rq->cpu;
> +	struct sched_domain *sd;
> +	int pulled_task = 0;
> +	u64 curr_cost = 0;
> +
> +	/*
> +	 * We must set idle_stamp _before_ calling idle_balance(), such that we
> +	 * measure the duration of idle_balance() as idle time.
> +	 */
> +	this_rq->idle_stamp = rq_clock(this_rq);
> +
> +	/*
> +	 * Do not pull tasks towards !active CPUs...
> +	 */
> +	if (!cpu_active(this_cpu))
> +		return 0;
> +
> +	/*
> +	 * This is OK, because current is on_cpu, which avoids it being picked
> +	 * for load-balance and preemption/IRQs are still disabled avoiding
> +	 * further scheduler activity on it and we're being very careful to
> +	 * re-start the picking loop.
> +	 */
> +	rq_unpin_lock(this_rq, rf);
> +
> +	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
> +	    !this_rq->rd->overload) {
> +#ifdef CONFIG_NO_HZ_COMMON
> +		unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
> +		unsigned long next_blocked = READ_ONCE(nohz.next_blocked);
> +#endif
> +		rcu_read_lock();
> +		sd = rcu_dereference_check_sched_domain(this_rq->sd);
> +		if (sd)
> +			update_next_balance(sd, &next_balance);
> +		rcu_read_unlock();
> +
> +#ifdef CONFIG_NO_HZ_COMMON
> +		/*
> +		 * This CPU doesn't want to be disturbed by scheduler
> +		 * houskeeping

Typo here (houskeeping)

> +		 */
> +		if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
> +			goto out;
> +
> +		/* Will wake up very soon. No time for fdoing anything else*/

Typo here (fdoing)

> +		if (this_rq->avg_idle < sysctl_sched_migration_cost)
> +			goto out;
> +
> +		/* Don't need to update blocked load of idle CPUs*/
> +		if (!has_blocked || time_after_eq(jiffies, next_blocked))
> +			goto out;

My "stats update via NEWLY_IDLE" test case started misbehaving with this
version: we skip most NEWLY_IDLE stats updates. AFAICT this is the culprit.

I believe this time check should be time_before(jiffies, next_blocked)
(or time_before_eq depending on what you want to guarantee with the jiffy
interval stuff).

> +
> +		raw_spin_unlock(&this_rq->lock);
> +		/*
> +		 * This CPU is going to be idle and blocked load of idle CPUs
> +		 * need to be updated. Run the ilb locally as it is a good
> +		 * candidate for ilb instead of waking up another idle CPU.
> +		 * Kick an normal ilb if we failed to do the update.
> +		 */
> +		if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
> +			kick_ilb(NOHZ_STATS_KICK);
> +		raw_spin_lock(&this_rq->lock);
> +#endif
> +		goto out;
> +	}
> +

^ permalink raw reply	[flat|nested] 56+ messages in thread

* [PATCH 3/3] sched: update blocked load when newly idle
  2018-02-13 10:31 [PATCH v4 0/3] sched: Update blocked load Vincent Guittot
@ 2018-02-13 10:31 ` Vincent Guittot
  2018-02-14 14:40   ` Valentin Schneider
  0 siblings, 1 reply; 56+ messages in thread
From: Vincent Guittot @ 2018-02-13 10:31 UTC (permalink / raw)
  To: peterz, mingo, linux-kernel, valentin.schneider
  Cc: morten.rasmussen, brendan.jackman, dietmar.eggemann, Vincent Guittot

When NEWLY_IDLE load balance is not triggered, we might need to update the
blocked load anyway. We can kick an ilb so an idle CPU will take care of
updating blocked load or we can try to update them locally before entering
idle. In the latter case, we reuse part of the nohz_idle_balance.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/fair.c | 324 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 193 insertions(+), 131 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9183fee..cb1ab5c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8832,120 +8832,6 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
 }
 
 /*
- * idle_balance is called by schedule() if this_cpu is about to become
- * idle. Attempts to pull tasks from other CPUs.
- */
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
-{
-	unsigned long next_balance = jiffies + HZ;
-	int this_cpu = this_rq->cpu;
-	struct sched_domain *sd;
-	int pulled_task = 0;
-	u64 curr_cost = 0;
-
-	/*
-	 * We must set idle_stamp _before_ calling idle_balance(), such that we
-	 * measure the duration of idle_balance() as idle time.
-	 */
-	this_rq->idle_stamp = rq_clock(this_rq);
-
-	/*
-	 * Do not pull tasks towards !active CPUs...
-	 */
-	if (!cpu_active(this_cpu))
-		return 0;
-
-	/*
-	 * This is OK, because current is on_cpu, which avoids it being picked
-	 * for load-balance and preemption/IRQs are still disabled avoiding
-	 * further scheduler activity on it and we're being very careful to
-	 * re-start the picking loop.
-	 */
-	rq_unpin_lock(this_rq, rf);
-
-	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
-	    !this_rq->rd->overload) {
-		rcu_read_lock();
-		sd = rcu_dereference_check_sched_domain(this_rq->sd);
-		if (sd)
-			update_next_balance(sd, &next_balance);
-		rcu_read_unlock();
-
-		goto out;
-	}
-
-	raw_spin_unlock(&this_rq->lock);
-
-	update_blocked_averages(this_cpu);
-	rcu_read_lock();
-	for_each_domain(this_cpu, sd) {
-		int continue_balancing = 1;
-		u64 t0, domain_cost;
-
-		if (!(sd->flags & SD_LOAD_BALANCE))
-			continue;
-
-		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
-			update_next_balance(sd, &next_balance);
-			break;
-		}
-
-		if (sd->flags & SD_BALANCE_NEWIDLE) {
-			t0 = sched_clock_cpu(this_cpu);
-
-			pulled_task = load_balance(this_cpu, this_rq,
-						   sd, CPU_NEWLY_IDLE,
-						   &continue_balancing);
-
-			domain_cost = sched_clock_cpu(this_cpu) - t0;
-			if (domain_cost > sd->max_newidle_lb_cost)
-				sd->max_newidle_lb_cost = domain_cost;
-
-			curr_cost += domain_cost;
-		}
-
-		update_next_balance(sd, &next_balance);
-
-		/*
-		 * Stop searching for tasks to pull if there are
-		 * now runnable tasks on this rq.
-		 */
-		if (pulled_task || this_rq->nr_running > 0)
-			break;
-	}
-	rcu_read_unlock();
-
-	raw_spin_lock(&this_rq->lock);
-
-	if (curr_cost > this_rq->max_idle_balance_cost)
-		this_rq->max_idle_balance_cost = curr_cost;
-
-	/*
-	 * While browsing the domains, we released the rq lock, a task could
-	 * have been enqueued in the meantime. Since we're not going idle,
-	 * pretend we pulled a task.
-	 */
-	if (this_rq->cfs.h_nr_running && !pulled_task)
-		pulled_task = 1;
-
-out:
-	/* Move the next balance forward */
-	if (time_after(this_rq->next_balance, next_balance))
-		this_rq->next_balance = next_balance;
-
-	/* Is there a task of a high priority class? */
-	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
-		pulled_task = -1;
-
-	if (pulled_task)
-		this_rq->idle_stamp = 0;
-
-	rq_repin_lock(this_rq, rf);
-
-	return pulled_task;
-}
-
-/*
  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
  * running tasks off the busiest CPU onto idle CPUs. It requires at
  * least 1 task to be running on each physical CPU where possible, and
@@ -9413,10 +9299,14 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 
 #ifdef CONFIG_NO_HZ_COMMON
 /*
- * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * Internal function that runs load balance for all idle cpus. The load balance
+ * can be a simple update of blocked load or a complete load balance with
+ * tasks movement depending of flags.
+ * The function returns false if the loop has stopped before running
+ * through all idle CPUs.
  */
-static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
+			       enum cpu_idle_type idle)
 {
 	/* Earliest time when we have to do rebalance again */
 	unsigned long now = jiffies;
@@ -9424,20 +9314,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	bool has_blocked_load = false;
 	int update_next_balance = 0;
 	int this_cpu = this_rq->cpu;
-	unsigned int flags;
 	int balance_cpu;
+	int ret = false;
 	struct rq *rq;
 
-	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
-		return false;
-
-	if (idle != CPU_IDLE) {
-		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
-		return false;
-	}
-
-	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
-
 	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
 
 	/*
@@ -9482,10 +9362,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		if (time_after_eq(jiffies, rq->next_balance)) {
 			struct rq_flags rf;
 
-			rq_lock_irq(rq, &rf);
+			rq_lock_irqsave(rq, &rf);
 			update_rq_clock(rq);
 			cpu_load_update_idle(rq);
-			rq_unlock_irq(rq, &rf);
+			rq_unlock_irqrestore(rq, &rf);
 
 			if (flags & NOHZ_BALANCE_KICK)
 				rebalance_domains(rq, CPU_IDLE);
@@ -9497,13 +9377,21 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		}
 	}
 
-	update_blocked_averages(this_cpu);
+	/* Newly idle CPU doesn't need an update */
+	if (idle != CPU_NEWLY_IDLE) {
+		update_blocked_averages(this_cpu);
+		has_blocked_load |= this_rq->has_blocked_load;
+	}
+
 	if (flags & NOHZ_BALANCE_KICK)
 		rebalance_domains(this_rq, CPU_IDLE);
 
 	WRITE_ONCE(nohz.next_blocked,
 		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 
+	/* The full idle balance loop has been done */
+	ret = true;
+
 abort:
 	/* There is still blocked load, enable periodic update */
 	if (has_blocked_load)
@@ -9517,6 +9405,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	if (likely(update_next_balance))
 		nohz.next_balance = next_balance;
 
+	return ret;
+}
+
+/*
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+	int this_cpu = this_rq->cpu;
+	unsigned int flags;
+
+	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+		return false;
+
+	if (idle != CPU_IDLE) {
+		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+		return false;
+	}
+
+	/*
+	 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
+	 */
+	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+	if (!(flags & NOHZ_KICK_MASK))
+		return false;
+
+	_nohz_idle_balance(this_rq, flags, idle);
+
 	return true;
 }
 #else
@@ -9527,6 +9444,151 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 #endif
 
 /*
+ * idle_balance is called by schedule() if this_cpu is about to become
+ * idle. Attempts to pull tasks from other CPUs.
+ */
+static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
+{
+	unsigned long next_balance = jiffies + HZ;
+	int this_cpu = this_rq->cpu;
+	struct sched_domain *sd;
+	int pulled_task = 0;
+	u64 curr_cost = 0;
+
+	/*
+	 * We must set idle_stamp _before_ calling idle_balance(), such that we
+	 * measure the duration of idle_balance() as idle time.
+	 */
+	this_rq->idle_stamp = rq_clock(this_rq);
+
+	/*
+	 * Do not pull tasks towards !active CPUs...
+	 */
+	if (!cpu_active(this_cpu))
+		return 0;
+
+	/*
+	 * This is OK, because current is on_cpu, which avoids it being picked
+	 * for load-balance and preemption/IRQs are still disabled avoiding
+	 * further scheduler activity on it and we're being very careful to
+	 * re-start the picking loop.
+	 */
+	rq_unpin_lock(this_rq, rf);
+
+	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
+	    !this_rq->rd->overload) {
+#ifdef CONFIG_NO_HZ_COMMON
+		unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
+		unsigned long next_blocked = READ_ONCE(nohz.next_blocked);
+#endif
+		rcu_read_lock();
+		sd = rcu_dereference_check_sched_domain(this_rq->sd);
+		if (sd)
+			update_next_balance(sd, &next_balance);
+		rcu_read_unlock();
+
+#ifdef CONFIG_NO_HZ_COMMON
+		/*
+		 * This CPU doesn't want to be disturbed by scheduler
+		 * houskeeping
+		 */
+		if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
+			goto out;
+
+		/* Will wake up very soon. No time for fdoing anything else*/
+		if (this_rq->avg_idle < sysctl_sched_migration_cost)
+			goto out;
+
+		/* Don't need to update blocked load of idle CPUs*/
+		if (!has_blocked || time_after_eq(jiffies, next_blocked))
+			goto out;
+
+		raw_spin_unlock(&this_rq->lock);
+		/*
+		 * This CPU is going to be idle and blocked load of idle CPUs
+		 * need to be updated. Run the ilb locally as it is a good
+		 * candidate for ilb instead of waking up another idle CPU.
+		 * Kick an normal ilb if we failed to do the update.
+		 */
+		if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
+			kick_ilb(NOHZ_STATS_KICK);
+		raw_spin_lock(&this_rq->lock);
+#endif
+		goto out;
+	}
+
+	raw_spin_unlock(&this_rq->lock);
+
+	update_blocked_averages(this_cpu);
+	rcu_read_lock();
+	for_each_domain(this_cpu, sd) {
+		int continue_balancing = 1;
+		u64 t0, domain_cost;
+
+		if (!(sd->flags & SD_LOAD_BALANCE))
+			continue;
+
+		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
+			update_next_balance(sd, &next_balance);
+			break;
+		}
+
+		if (sd->flags & SD_BALANCE_NEWIDLE) {
+			t0 = sched_clock_cpu(this_cpu);
+
+			pulled_task = load_balance(this_cpu, this_rq,
+						   sd, CPU_NEWLY_IDLE,
+						   &continue_balancing);
+
+			domain_cost = sched_clock_cpu(this_cpu) - t0;
+			if (domain_cost > sd->max_newidle_lb_cost)
+				sd->max_newidle_lb_cost = domain_cost;
+
+			curr_cost += domain_cost;
+		}
+
+		update_next_balance(sd, &next_balance);
+
+		/*
+		 * Stop searching for tasks to pull if there are
+		 * now runnable tasks on this rq.
+		 */
+		if (pulled_task || this_rq->nr_running > 0)
+			break;
+	}
+	rcu_read_unlock();
+
+	raw_spin_lock(&this_rq->lock);
+
+	if (curr_cost > this_rq->max_idle_balance_cost)
+		this_rq->max_idle_balance_cost = curr_cost;
+
+	/*
+	 * While browsing the domains, we released the rq lock, a task could
+	 * have been enqueued in the meantime. Since we're not going idle,
+	 * pretend we pulled a task.
+	 */
+	if (this_rq->cfs.h_nr_running && !pulled_task)
+		pulled_task = 1;
+
+out:
+	/* Move the next balance forward */
+	if (time_after(this_rq->next_balance, next_balance))
+		this_rq->next_balance = next_balance;
+
+	/* Is there a task of a high priority class? */
+	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+		pulled_task = -1;
+
+	if (pulled_task)
+		this_rq->idle_stamp = 0;
+
+	rq_repin_lock(this_rq, rf);
+
+	return pulled_task;
+}
+
+/*
  * run_rebalance_domains is triggered when needed from the scheduler tick.
  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
  */
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 56+ messages in thread

end of thread, other threads:[~2018-02-14 14:44 UTC | newest]

Thread overview: 56+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-12-21 10:21 [RFC PATCH 0/5] sched: On remote stats updates Peter Zijlstra
2017-12-21 10:21 ` [RFC PATCH 1/5] sched: Convert nohz_flags to atomic_t Peter Zijlstra
2017-12-21 10:21 ` [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK Peter Zijlstra
2017-12-21 16:23   ` Vincent Guittot
2017-12-21 16:56     ` Vincent Guittot
2017-12-22  7:59       ` Peter Zijlstra
2017-12-22  8:05         ` Vincent Guittot
2017-12-22  8:29           ` Peter Zijlstra
2017-12-22  9:12             ` Peter Zijlstra
2017-12-22 14:31               ` Peter Zijlstra
2017-12-22 14:34                 ` Vincent Guittot
2017-12-22 14:32               ` Vincent Guittot
2017-12-22 18:56                 ` Peter Zijlstra
2017-12-22 20:42                   ` Peter Zijlstra
2018-01-02 15:44                     ` Morten Rasmussen
2018-01-15  9:43                       ` Peter Zijlstra
2018-01-18 10:32                         ` Morten Rasmussen
2018-01-03  9:16                     ` Vincent Guittot
2018-01-15  8:26                       ` Vincent Guittot
2018-01-18 10:38                         ` Morten Rasmussen
2018-01-24  8:25                           ` Vincent Guittot
2018-01-29 18:43                             ` Dietmar Eggemann
2018-01-30  8:00                               ` Vincent Guittot
2018-01-29 19:31                             ` Valentin Schneider
2018-01-30  8:32                               ` Vincent Guittot
2018-01-30 11:41                                 ` Valentin Schneider
2018-01-30 13:05                                   ` Vincent Guittot
2018-02-05 22:18                                   ` Valentin Schneider
2018-02-06  9:22                                     ` Vincent Guittot
2018-02-01 18:16                               ` Peter Zijlstra
2018-02-01 16:57                             ` Peter Zijlstra
2018-02-01 17:26                               ` Vincent Guittot
2018-02-01 18:10                             ` Peter Zijlstra
2018-02-01 19:11                               ` Vincent Guittot
2018-02-06  8:32                               ` [PATCH 1/3] sched: Stop nohz stats when decayed Vincent Guittot
2018-02-06  8:32                                 ` [PATCH 2/3] sched: reduce the periodic update duration Vincent Guittot
2018-02-06  8:32                                 ` [PATCH 3/3] sched: update blocked load when newly idle Vincent Guittot
2018-02-06 14:32                                   ` Valentin Schneider
2018-02-06 16:17                                     ` Vincent Guittot
2018-02-06 16:32                                       ` Valentin Schneider
2018-02-06  8:55                                 ` [PATCH 1/3] sched: Stop nohz stats when decayed Vincent Guittot
2018-02-06 14:16                                 ` Valentin Schneider
2018-02-06 14:31                                   ` Vincent Guittot
2018-02-01 16:55                           ` [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK Peter Zijlstra
2018-01-22  9:40                         ` Dietmar Eggemann
2018-01-22 10:23                           ` Vincent Guittot
2018-02-01 16:52                         ` Peter Zijlstra
2018-02-01 17:25                           ` Vincent Guittot
2017-12-22  7:56     ` Peter Zijlstra
2017-12-22  8:04       ` Vincent Guittot
2017-12-21 10:21 ` [RFC PATCH 3/5] sched: Restructure nohz_balance_kick Peter Zijlstra
2017-12-21 10:21 ` [RFC PATCH 4/5] sched: Add nohz stats balancing Peter Zijlstra
2017-12-21 10:21 ` [RFC PATCH 5/5] sched: Update blocked load from NEWIDLE Peter Zijlstra
2018-02-13 10:31 [PATCH v4 0/3] sched: Update blocked load Vincent Guittot
2018-02-13 10:31 ` [PATCH 3/3] sched: update blocked load when newly idle Vincent Guittot
2018-02-14 14:40   ` Valentin Schneider
2018-02-14 14:43     ` Vincent Guittot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.