[PATCH v3 0/3] sched: Update blocked load

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v3 0/3] sched: Update blocked load
@ 2018-02-12  8:07 Vincent Guittot
  2018-02-12  8:07 ` [PATCH v3 1/3] sched: Stop nohz stats when decayed Vincent Guittot
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Vincent Guittot @ 2018-02-12  8:07 UTC (permalink / raw)
  To: peterz, mingo, linux-kernel, valentin.schneider
  Cc: morten.rasmussen, brendan.jackman, dietmar.eggemann, Vincent Guittot

This patchset applies on top of Peter's sched/esting branch minus the last 2 commit:
56eb46798b33 ("sched: Clean up nohz enter/exit")

v3:
- add memory barrier
- add comments

v2:
- minor naming updates

Vincent Guittot (3):
  sched: Stop nohz stats when decayed
  sched: reduce the periodic update duration
  sched: update blocked load when newly idle

 kernel/sched/fair.c  | 227 +++++++++++++++++++++++++++++++++++++++++++--------
 kernel/sched/sched.h |   1 +
 2 files changed, 193 insertions(+), 35 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v3 1/3] sched: Stop nohz stats when decayed
  2018-02-12  8:07 [PATCH v3 0/3] sched: Update blocked load Vincent Guittot
@ 2018-02-12  8:07 ` Vincent Guittot
  2018-02-12  9:41   ` Peter Zijlstra
  2018-02-12 10:45   ` Peter Zijlstra
  2018-02-12  8:07 ` [PATCH v3 2/3] sched: reduce the periodic update duration Vincent Guittot
  2018-02-12  8:07 ` [PATCH v3 3/3] sched: update blocked load when newly idle Vincent Guittot
  2 siblings, 2 replies; 13+ messages in thread
From: Vincent Guittot @ 2018-02-12  8:07 UTC (permalink / raw)
  To: peterz, mingo, linux-kernel, valentin.schneider
  Cc: morten.rasmussen, brendan.jackman, dietmar.eggemann, Vincent Guittot

Stopped the periodic update of blocked load when all idle CPUs have fully
decayed. We introduce a new nohz.has_blocked that reflect if some idle
CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
is set everytime that a Idle CPU can have blocked load and it is then clear
when no more blocked load has been detected during an update. We don't need
atomic operation but only to make cure of the right ordering when updating
nohz.idle_cpus_mask and nohz.has_blocked.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/fair.c  | 118 ++++++++++++++++++++++++++++++++++++++++++---------
 kernel/sched/sched.h |   1 +
 2 files changed, 99 insertions(+), 20 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7af1fa9..e228d3d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5383,8 +5383,9 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
 static struct {
 	cpumask_var_t idle_cpus_mask;
 	atomic_t nr_cpus;
+	int has_blocked;		/* Idle CPUS has blocked load */
 	unsigned long next_balance;     /* in jiffy units */
-	unsigned long next_stats;
+	unsigned long next_blocked;	/* Next update of blocked load in jiffies */
 } nohz ____cacheline_aligned;
 
 #endif /* CONFIG_NO_HZ_COMMON */
@@ -6951,6 +6952,7 @@ enum fbq_type { regular, remote, all };
 #define LBF_DST_PINNED  0x04
 #define LBF_SOME_PINNED	0x08
 #define LBF_NOHZ_STATS	0x10
+#define LBF_NOHZ_AGAIN	0x20
 
 struct lb_env {
 	struct sched_domain	*sd;
@@ -7335,8 +7337,6 @@ static void attach_tasks(struct lb_env *env)
 	rq_unlock(env->dst_rq, &rf);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 {
 	if (cfs_rq->load.weight)
@@ -7354,11 +7354,14 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 	return true;
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct cfs_rq *cfs_rq, *pos;
 	struct rq_flags rf;
+	bool done = true;
 
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
@@ -7388,10 +7391,14 @@ static void update_blocked_averages(int cpu)
 		 */
 		if (cfs_rq_is_decayed(cfs_rq))
 			list_del_leaf_cfs_rq(cfs_rq);
+		else
+			done = false;
 	}
 
 #ifdef CONFIG_NO_HZ_COMMON
 	rq->last_blocked_load_update_tick = jiffies;
+	if (done)
+		rq->has_blocked_load = 0;
 #endif
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -7454,6 +7461,8 @@ static inline void update_blocked_averages(int cpu)
 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
 #ifdef CONFIG_NO_HZ_COMMON
 	rq->last_blocked_load_update_tick = jiffies;
+	if (cfs_rq_is_decayed(cfs_rq))
+		rq->has_blocked_load = 0;
 #endif
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -7789,18 +7798,25 @@ group_type group_classify(struct sched_group *group,
 	return group_other;
 }
 
-static void update_nohz_stats(struct rq *rq)
+static bool update_nohz_stats(struct rq *rq)
 {
 #ifdef CONFIG_NO_HZ_COMMON
 	unsigned int cpu = rq->cpu;
 
+	if (!rq->has_blocked_load)
+		return false;
+
 	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
-		return;
+		return false;
 
 	if (!time_after(jiffies, rq->last_blocked_load_update_tick))
-		return;
+		return true;
 
 	update_blocked_averages(cpu);
+
+	return rq->has_blocked_load;
+#else
+	return false;
 #endif
 }
 
@@ -7826,8 +7842,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
-		if (env->flags & LBF_NOHZ_STATS)
-			update_nohz_stats(rq);
+		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
+			env->flags |= LBF_NOHZ_AGAIN;
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
@@ -7979,18 +7995,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	struct sg_lb_stats *local = &sds->local_stat;
 	struct sg_lb_stats tmp_sgs;
 	int load_idx, prefer_sibling = 0;
+	int has_blocked = READ_ONCE(nohz.has_blocked);
 	bool overload = false;
 
 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;
 
 #ifdef CONFIG_NO_HZ_COMMON
-	if (env->idle == CPU_NEWLY_IDLE) {
+	if (env->idle == CPU_NEWLY_IDLE && has_blocked)
 		env->flags |= LBF_NOHZ_STATS;
-
-		if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
-			nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
-	}
 #endif
 
 	load_idx = get_sd_load_idx(env->sd, env->idle);
@@ -8046,6 +8059,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		sg = sg->next;
 	} while (sg != env->sd->groups);
 
+#ifdef CONFIG_NO_HZ_COMMON
+	if ((env->flags & LBF_NOHZ_AGAIN) &&
+	    cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
+
+		WRITE_ONCE(nohz.next_blocked,
+				jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
+	}
+#endif
+
 	if (env->sd->flags & SD_NUMA)
 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 
@@ -9069,6 +9091,8 @@ static void nohz_balancer_kick(struct rq *rq)
 	struct sched_domain *sd;
 	int nr_busy, i, cpu = rq->cpu;
 	unsigned int flags = 0;
+	unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
+	unsigned long next_blocked = READ_ONCE(nohz.next_blocked);
 
 	if (unlikely(rq->idle_balance))
 		return;
@@ -9086,7 +9110,7 @@ static void nohz_balancer_kick(struct rq *rq)
 	if (likely(!atomic_read(&nohz.nr_cpus)))
 		return;
 
-	if (time_after(now, nohz.next_stats))
+	if (time_after(now, next_blocked) && has_blocked)
 		flags = NOHZ_STATS_KICK;
 
 	if (time_before(now, nohz.next_balance))
@@ -9207,13 +9231,26 @@ void nohz_balance_enter_idle(int cpu)
 	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
 		return;
 
+	/*
+	 * Can be safely set without rq->lock held
+	 * If a clear happens, it will have seen last additions because
+	 * rq->lock is held during the check and the clear
+	 */
+	rq->has_blocked_load = 1;
+
+	/*
+	 * The tick is still stopped but load could have been added in the
+	 * meantime. We set the nohz.has_blocked flag to trig a check the
+	 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
+	 * of nohz.has_blocked can only happen after checking the new load
+	 */
 	if (rq->nohz_tick_stopped)
-		return;
+		goto out;
 
 	/*
 	 * If we're a completely isolated CPU, we don't play.
 	 */
-	if (on_null_domain(cpu_rq(cpu)))
+	if (on_null_domain(rq))
 		return;
 
 	rq->nohz_tick_stopped = 1;
@@ -9222,6 +9259,20 @@ void nohz_balance_enter_idle(int cpu)
 	atomic_inc(&nohz.nr_cpus);
 
 	set_cpu_sd_state_idle(cpu);
+
+	/*
+         * Ensures that if nohz_idle_balance() fails to observe our
+         * @idle_cpus_mask store, it must observe the @has_blocked
+         * store.
+         */
+        smp_mb__after_atomic();
+
+out:
+	/*
+	 * Each time a cpu enter idle, we assume that it has blocked load and
+	 * enable the periodic update of the load of idle cpus
+	 */
+	WRITE_ONCE(nohz.has_blocked, 1);
 }
 #else
 static inline void nohz_balancer_kick(struct rq *rq) { }
@@ -9374,6 +9425,22 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 
 	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
 
+	/*
+	 * We assume there will be no idle load after this update and clear
+	 * the has_blocked flag. If a cpu enters idle in the mean time, it will
+	 * set the has_blocked flag and trig another update of idle load.
+	 * Because a cpu that becomes idle, is added to idle_cpus_mask before
+	 * setting the flag, we are sure to not clear the state and not
+	 * check the load of an idle cpu.
+	 */
+	WRITE_ONCE(nohz.has_blocked, 0);
+
+	/*
+         * Ensures that if we miss the CPU, we must see the has_blocked
+         * store from nohz_balance_enter_idle().
+         */
+        smp_mb();
+
 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
 			continue;
@@ -9383,11 +9450,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		 * work being done for other cpus. Next load
 		 * balancing owner will pick it up.
 		 */
-		if (need_resched())
-			break;
+		if (need_resched()) {
+			has_blocked_load = true;
+			goto abort;
+		}
 
 		rq = cpu_rq(balance_cpu);
 
+		update_blocked_averages(rq->cpu);
+		has_blocked_load |= rq->has_blocked_load;
+
 		/*
 		 * If time for next balance is due,
 		 * do the balance.
@@ -9400,7 +9472,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 			cpu_load_update_idle(rq);
 			rq_unlock_irq(rq, &rf);
 
-			update_blocked_averages(rq->cpu);
 			if (flags & NOHZ_BALANCE_KICK)
 				rebalance_domains(rq, CPU_IDLE);
 		}
@@ -9415,7 +9486,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	if (flags & NOHZ_BALANCE_KICK)
 		rebalance_domains(this_rq, CPU_IDLE);
 
-	nohz.next_stats = next_stats;
+	WRITE_ONCE(nohz.next_blocked,
+		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+
+abort:
+	/* There is still blocked load, enable periodic update */
+	if (has_blocked_load)
+		WRITE_ONCE(nohz.has_blocked, 1);
 
 	/*
 	 * next_balance will be updated only when there is a need.
@@ -10046,6 +10123,7 @@ __init void init_sched_fair_class(void)
 
 #ifdef CONFIG_NO_HZ_COMMON
 	nohz.next_balance = jiffies;
+	nohz.next_blocked = jiffies;
 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
 #endif
 #endif /* SMP */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e200045..ad9b929 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -723,6 +723,7 @@ struct rq {
 #ifdef CONFIG_SMP
 	unsigned long last_load_update_tick;
 	unsigned long last_blocked_load_update_tick;
+	unsigned int has_blocked_load;
 #endif /* CONFIG_SMP */
 	unsigned int nohz_tick_stopped;
 	atomic_t nohz_flags;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v3 2/3] sched: reduce the periodic update duration
  2018-02-12  8:07 [PATCH v3 0/3] sched: Update blocked load Vincent Guittot
  2018-02-12  8:07 ` [PATCH v3 1/3] sched: Stop nohz stats when decayed Vincent Guittot
@ 2018-02-12  8:07 ` Vincent Guittot
  2018-02-12  8:07 ` [PATCH v3 3/3] sched: update blocked load when newly idle Vincent Guittot
  2 siblings, 0 replies; 13+ messages in thread
From: Vincent Guittot @ 2018-02-12  8:07 UTC (permalink / raw)
  To: peterz, mingo, linux-kernel, valentin.schneider
  Cc: morten.rasmussen, brendan.jackman, dietmar.eggemann, Vincent Guittot

Instead of using the cfs_rq_is_decayed() which monitors all *_avg
and *_sum, we create a cfs_rq_has_blocked() which only takes care of
util_avg and load_avg. We are only interested by these 2 values which are
decaying faster than the *_sum so we can stop the periodic update earlier.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/fair.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e228d3d..7566424 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7337,6 +7337,19 @@ static void attach_tasks(struct lb_env *env)
 	rq_unlock(env->dst_rq, &rf);
 }
 
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->avg.load_avg)
+		return true;
+
+	if (cfs_rq->avg.util_avg)
+		return true;
+
+	return false;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 {
 	if (cfs_rq->load.weight)
@@ -7354,8 +7367,6 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 	return true;
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -7391,7 +7402,9 @@ static void update_blocked_averages(int cpu)
 		 */
 		if (cfs_rq_is_decayed(cfs_rq))
 			list_del_leaf_cfs_rq(cfs_rq);
-		else
+
+		/* Don't need periodic decay once load/util_avg are null */
+		if (cfs_rq_has_blocked(cfs_rq))
 			done = false;
 	}
 
@@ -7461,7 +7474,7 @@ static inline void update_blocked_averages(int cpu)
 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
 #ifdef CONFIG_NO_HZ_COMMON
 	rq->last_blocked_load_update_tick = jiffies;
-	if (cfs_rq_is_decayed(cfs_rq))
+	if (!cfs_rq_has_blocked(cfs_rq))
 		rq->has_blocked_load = 0;
 #endif
 	rq_unlock_irqrestore(rq, &rf);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v3 3/3] sched: update blocked load when newly idle
  2018-02-12  8:07 [PATCH v3 0/3] sched: Update blocked load Vincent Guittot
  2018-02-12  8:07 ` [PATCH v3 1/3] sched: Stop nohz stats when decayed Vincent Guittot
  2018-02-12  8:07 ` [PATCH v3 2/3] sched: reduce the periodic update duration Vincent Guittot
@ 2018-02-12  8:07 ` Vincent Guittot
  2018-02-12 12:04   ` Peter Zijlstra
  2 siblings, 1 reply; 13+ messages in thread
From: Vincent Guittot @ 2018-02-12  8:07 UTC (permalink / raw)
  To: peterz, mingo, linux-kernel, valentin.schneider
  Cc: morten.rasmussen, brendan.jackman, dietmar.eggemann, Vincent Guittot

When NEWLY_IDLE load balance is not triggered, we might need to update the
blocked load anyway. We can kick an ilb so an idle CPU will take care of
updating blocked load or we can try to update them locally before entering
idle. In the latter case, we reuse part of the nohz_idle_balance.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/fair.c | 102 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 84 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7566424..3c11ef6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8829,6 +8829,9 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
 		*next_balance = next;
 }
 
+static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
+static void kick_ilb(unsigned int flags);
+
 /*
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
@@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 
 	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
 	    !this_rq->rd->overload) {
+		unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
+		unsigned long next_blocked = READ_ONCE(nohz.next_blocked);
+
 		rcu_read_lock();
 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
 		if (sd)
 			update_next_balance(sd, &next_balance);
 		rcu_read_unlock();
 
+		/*
+		 * Update blocked idle load if it has not been done for a
+		 * while. Try to do it locally before entering idle but kick a
+		 * ilb if it takes too much time and/or might delay next local
+		 * wake up
+		 */
+		if (has_blocked && time_after_eq(jiffies, next_blocked) &&
+				(this_rq->avg_idle < sysctl_sched_migration_cost ||
+				!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
+			kick_ilb(NOHZ_STATS_KICK);
+
 		goto out;
 	}
 
@@ -9411,30 +9428,24 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 
 #ifdef CONFIG_NO_HZ_COMMON
 /*
- * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * Internal function that runs load balance for all idle cpus. The load balance
+ * can be a simple update of blocked load or a complete load balance with
+ * tasks movement depending of flags.
+ * For newly idle mode, we abort the loop if it takes too much time and return
+ * false to notify that the loop has not be completed and a ilb should be kick.
  */
-static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
 {
 	/* Earliest time when we have to do rebalance again */
 	unsigned long now = jiffies;
 	unsigned long next_balance = now + 60*HZ;
-	unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
+	bool has_blocked_load = false;
 	int update_next_balance = 0;
 	int this_cpu = this_rq->cpu;
-	unsigned int flags;
 	int balance_cpu;
+	int ret = false;
 	struct rq *rq;
-
-	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
-		return false;
-
-	if (idle != CPU_IDLE) {
-		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
-		return false;
-	}
-
-	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+	u64 curr_cost = 0;
 
 	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
 
@@ -9455,6 +9466,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
         smp_mb();
 
 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+		u64 t0, domain_cost;
+
+		t0 = sched_clock_cpu(this_cpu);
+
 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
 			continue;
 
@@ -9468,6 +9483,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 			goto abort;
 		}
 
+		/*
+		 * If the update is done while CPU becomes idle, we abort
+		 * the update when its cost is higher than the average idle
+		 * time in orde to not delay a possible wake up.
+		 */
+		if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
+			has_blocked_load = true;
+			goto abort;
+		}
+
 		rq = cpu_rq(balance_cpu);
 
 		update_blocked_averages(rq->cpu);
@@ -9480,10 +9505,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		if (time_after_eq(jiffies, rq->next_balance)) {
 			struct rq_flags rf;
 
-			rq_lock_irq(rq, &rf);
+			rq_lock_irqsave(rq, &rf);
 			update_rq_clock(rq);
 			cpu_load_update_idle(rq);
-			rq_unlock_irq(rq, &rf);
+			rq_unlock_irqrestore(rq, &rf);
 
 			if (flags & NOHZ_BALANCE_KICK)
 				rebalance_domains(rq, CPU_IDLE);
@@ -9493,15 +9518,27 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 			next_balance = rq->next_balance;
 			update_next_balance = 1;
 		}
+
+		domain_cost = sched_clock_cpu(this_cpu) - t0;
+		curr_cost += domain_cost;
+
+	}
+
+	/* Newly idle CPU doesn't need an update */
+	if (idle != CPU_NEWLY_IDLE) {
+		update_blocked_averages(this_cpu);
+		has_blocked_load |= this_rq->has_blocked_load;
 	}
 
-	update_blocked_averages(this_cpu);
 	if (flags & NOHZ_BALANCE_KICK)
 		rebalance_domains(this_rq, CPU_IDLE);
 
 	WRITE_ONCE(nohz.next_blocked,
 		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 
+	/* The full idle balance loop has been done */
+	ret = true;
+
 abort:
 	/* There is still blocked load, enable periodic update */
 	if (has_blocked_load)
@@ -9515,6 +9552,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	if (likely(update_next_balance))
 		nohz.next_balance = next_balance;
 
+	return ret;
+}
+
+/*
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+	int this_cpu = this_rq->cpu;
+	unsigned int flags;
+
+	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+		return false;
+
+	if (idle != CPU_IDLE) {
+		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+		return false;
+	}
+
+	/*
+	 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
+	 */
+	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+	if (!(flags & NOHZ_KICK_MASK))
+		return false;
+
+	_nohz_idle_balance(this_rq, flags, idle);
+
 	return true;
 }
 #else
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 1/3] sched: Stop nohz stats when decayed
  2018-02-12  8:07 ` [PATCH v3 1/3] sched: Stop nohz stats when decayed Vincent Guittot
@ 2018-02-12  9:41   ` Peter Zijlstra
  2018-02-12  9:52     ` Vincent Guittot
  2018-02-12 10:45   ` Peter Zijlstra
  1 sibling, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2018-02-12  9:41 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: mingo, linux-kernel, valentin.schneider, morten.rasmussen,
	brendan.jackman, dietmar.eggemann

On Mon, Feb 12, 2018 at 09:07:52AM +0100, Vincent Guittot wrote:
> @@ -9222,6 +9259,20 @@ void nohz_balance_enter_idle(int cpu)
>  	atomic_inc(&nohz.nr_cpus);
>  
>  	set_cpu_sd_state_idle(cpu);
> +
> +	/*
> +         * Ensures that if nohz_idle_balance() fails to observe our
> +         * @idle_cpus_mask store, it must observe the @has_blocked
> +         * store.
> +         */
> +        smp_mb__after_atomic();
> +
> +out:
> +	/*
> +	 * Each time a cpu enter idle, we assume that it has blocked load and
> +	 * enable the periodic update of the load of idle cpus
> +	 */
> +	WRITE_ONCE(nohz.has_blocked, 1);
>  }
>  #else
>  static inline void nohz_balancer_kick(struct rq *rq) { }

I moved the barrier up one statement, such that its right after the
atomic_inc(). Otherwise people will get all itchy about __after_atomic()
semantics and we really only care about the cpumask_set_cpu() vs
WRITE_ONCE() ordering, so it doesn't really matter _where_ that barrier
lands in between.

> @@ -9374,6 +9425,22 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  
>  	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>  
> +	/*
> +	 * We assume there will be no idle load after this update and clear
> +	 * the has_blocked flag. If a cpu enters idle in the mean time, it will
> +	 * set the has_blocked flag and trig another update of idle load.
> +	 * Because a cpu that becomes idle, is added to idle_cpus_mask before
> +	 * setting the flag, we are sure to not clear the state and not
> +	 * check the load of an idle cpu.
> +	 */
> +	WRITE_ONCE(nohz.has_blocked, 0);
> +
> +	/*
> +         * Ensures that if we miss the CPU, we must see the has_blocked
> +         * store from nohz_balance_enter_idle().
> +         */
> +        smp_mb();
> +
>  	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>  		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>  			continue;

Fixed that white space damage for you ;-)

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 1/3] sched: Stop nohz stats when decayed
  2018-02-12  9:41   ` Peter Zijlstra
@ 2018-02-12  9:52     ` Vincent Guittot
  0 siblings, 0 replies; 13+ messages in thread
From: Vincent Guittot @ 2018-02-12  9:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Valentin Schneider, Morten Rasmussen,
	Brendan Jackman, Dietmar Eggemann

On 12 February 2018 at 10:41, Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Feb 12, 2018 at 09:07:52AM +0100, Vincent Guittot wrote:
>> @@ -9222,6 +9259,20 @@ void nohz_balance_enter_idle(int cpu)
>>       atomic_inc(&nohz.nr_cpus);
>>
>>       set_cpu_sd_state_idle(cpu);
>> +
>> +     /*
>> +         * Ensures that if nohz_idle_balance() fails to observe our
>> +         * @idle_cpus_mask store, it must observe the @has_blocked
>> +         * store.
>> +         */
>> +        smp_mb__after_atomic();
>> +
>> +out:
>> +     /*
>> +      * Each time a cpu enter idle, we assume that it has blocked load and
>> +      * enable the periodic update of the load of idle cpus
>> +      */
>> +     WRITE_ONCE(nohz.has_blocked, 1);
>>  }
>>  #else
>>  static inline void nohz_balancer_kick(struct rq *rq) { }
>
> I moved the barrier up one statement, such that its right after the
> atomic_inc(). Otherwise people will get all itchy about __after_atomic()
> semantics and we really only care about the cpumask_set_cpu() vs
> WRITE_ONCE() ordering, so it doesn't really matter _where_ that barrier
> lands in between.

ok. make sense

>
>> @@ -9374,6 +9425,22 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>
>>       SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>
>> +     /*
>> +      * We assume there will be no idle load after this update and clear
>> +      * the has_blocked flag. If a cpu enters idle in the mean time, it will
>> +      * set the has_blocked flag and trig another update of idle load.
>> +      * Because a cpu that becomes idle, is added to idle_cpus_mask before
>> +      * setting the flag, we are sure to not clear the state and not
>> +      * check the load of an idle cpu.
>> +      */
>> +     WRITE_ONCE(nohz.has_blocked, 0);
>> +
>> +     /*
>> +         * Ensures that if we miss the CPU, we must see the has_blocked
>> +         * store from nohz_balance_enter_idle().
>> +         */
>> +        smp_mb();
>> +
>>       for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>>               if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>>                       continue;
>
> Fixed that white space damage for you ;-)

Thanks

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 1/3] sched: Stop nohz stats when decayed
  2018-02-12  8:07 ` [PATCH v3 1/3] sched: Stop nohz stats when decayed Vincent Guittot
  2018-02-12  9:41   ` Peter Zijlstra
@ 2018-02-12 10:45   ` Peter Zijlstra
  2018-02-12 11:02     ` Vincent Guittot
  1 sibling, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2018-02-12 10:45 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: mingo, linux-kernel, valentin.schneider, morten.rasmussen,
	brendan.jackman, dietmar.eggemann

On Mon, Feb 12, 2018 at 09:07:52AM +0100, Vincent Guittot wrote:
> @@ -9383,11 +9450,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  		 * work being done for other cpus. Next load
>  		 * balancing owner will pick it up.
>  		 */
> -		if (need_resched())
> -			break;
> +		if (need_resched()) {
> +			has_blocked_load = true;
> +			goto abort;
> +		}
>  
>  		rq = cpu_rq(balance_cpu);
>  
> +		update_blocked_averages(rq->cpu);

Does that want to be update_nohz_stats() ?

> +		has_blocked_load |= rq->has_blocked_load;
> +
>  		/*
>  		 * If time for next balance is due,
>  		 * do the balance.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 1/3] sched: Stop nohz stats when decayed
  2018-02-12 10:45   ` Peter Zijlstra
@ 2018-02-12 11:02     ` Vincent Guittot
  0 siblings, 0 replies; 13+ messages in thread
From: Vincent Guittot @ 2018-02-12 11:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Valentin Schneider, Morten Rasmussen,
	Brendan Jackman, Dietmar Eggemann

On 12 February 2018 at 11:45, Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Feb 12, 2018 at 09:07:52AM +0100, Vincent Guittot wrote:
>> @@ -9383,11 +9450,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>                * work being done for other cpus. Next load
>>                * balancing owner will pick it up.
>>                */
>> -             if (need_resched())
>> -                     break;
>> +             if (need_resched()) {
>> +                     has_blocked_load = true;
>> +                     goto abort;
>> +             }
>>
>>               rq = cpu_rq(balance_cpu);
>>
>> +             update_blocked_averages(rq->cpu);
>
> Does that want to be update_nohz_stats() ?

I was about to say yesy but the test if (!time_after(jiffies,
rq->last_blocked_load_update_tick)) worry me a bit because it can skip
the update and the chance to clear nohz.has_blocked.
As an example the tick is 10ms long on arm 32bits so we can easily
reach the 32ms decay within the current tick even if we have already
updated it

>
>> +             has_blocked_load |= rq->has_blocked_load;
>> +
>>               /*
>>                * If time for next balance is due,
>>                * do the balance.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 3/3] sched: update blocked load when newly idle
  2018-02-12  8:07 ` [PATCH v3 3/3] sched: update blocked load when newly idle Vincent Guittot
@ 2018-02-12 12:04   ` Peter Zijlstra
  2018-02-12 14:34     ` Vincent Guittot
  0 siblings, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2018-02-12 12:04 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: mingo, linux-kernel, valentin.schneider, morten.rasmussen,
	brendan.jackman, dietmar.eggemann

On Mon, Feb 12, 2018 at 09:07:54AM +0100, Vincent Guittot wrote:
> @@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>  
>  	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>  	    !this_rq->rd->overload) {
> +		unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
> +		unsigned long next_blocked = READ_ONCE(nohz.next_blocked);
> +
>  		rcu_read_lock();
>  		sd = rcu_dereference_check_sched_domain(this_rq->sd);
>  		if (sd)
>  			update_next_balance(sd, &next_balance);
>  		rcu_read_unlock();
>  
> +		/*
> +		 * Update blocked idle load if it has not been done for a
> +		 * while. Try to do it locally before entering idle but kick a
> +		 * ilb if it takes too much time and/or might delay next local
> +		 * wake up
> +		 */
> +		if (has_blocked && time_after_eq(jiffies, next_blocked) &&
> +				(this_rq->avg_idle < sysctl_sched_migration_cost ||
> +				!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
> +			kick_ilb(NOHZ_STATS_KICK);
> +
>  		goto out;
>  	}

So I really hate this one, also I suspect its broken, because we do this
check before dropping rq->lock and _nohz_idle_balance() will take
rq->lock.

Aside from the above being an unreadable mess, I dislike that it breaks
the various isolation crud, we should not touch CPUs outside of our
domain.

Maybe something like the below? (unfinished)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7839,7 +7839,7 @@ group_type group_classify(struct sched_g
 	return group_other;
 }
 
-static bool update_nohz_stats(struct rq *rq)
+static bool update_nohz_stats(struct rq *rq, bool force)
 {
 #ifdef CONFIG_NO_HZ_COMMON
 	unsigned int cpu = rq->cpu;
@@ -7850,7 +7850,7 @@ static bool update_nohz_stats(struct rq
 	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
 		return false;
 
-	if (!time_after(jiffies, rq->last_blocked_load_update_tick))
+	if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
 		return true;
 
 	update_blocked_averages(cpu);
@@ -7883,7 +7883,7 @@ static inline void update_sg_lb_stats(st
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
-		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
+		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
 			env->flags |= LBF_NOHZ_AGAIN;
 
 		/* Bias balancing toward cpus of our domain */
@@ -8857,6 +8857,29 @@ update_next_balance(struct sched_domain
 		*next_balance = next;
 }
 
+static int nohz_age(struct sched_domain *sd)
+{
+	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
+	bool has_blocked_load;
+
+	WRITE_ONCE(nohz.has_blocked, 0);
+
+	smp_mb();
+
+	cpumask_and(cpus, sched_domain_span(sd), nohz.idle_cpus_mask);
+
+	has_blocked_load = cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(sd));
+
+	for_each_cpu(cpu, cpus) {
+		struct rq *rq = cpu_rq(cpu);
+
+		has_blocked_load |= update_nohz_stats(rq, true);
+	}
+
+	if (has_blocked_load)
+		WRITE_ONCE(nohz.has_blocked, 1);
+}
+
 /*
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
@@ -8868,6 +8891,7 @@ static int idle_balance(struct rq *this_
 	struct sched_domain *sd;
 	int pulled_task = 0;
 	u64 curr_cost = 0;
+	bool nohz_blocked = false;
 
 	/*
 	 * We must set idle_stamp _before_ calling idle_balance(), such that we
@@ -8889,8 +8913,8 @@ static int idle_balance(struct rq *this_
 	 */
 	rq_unpin_lock(this_rq, rf);
 
-	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
-	    !this_rq->rd->overload) {
+	if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+short_idle:
 		rcu_read_lock();
 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
 		if (sd)
@@ -8900,6 +8924,18 @@ static int idle_balance(struct rq *this_
 		goto out;
 	}
 
+	if (!this_rq->rd->overload) {
+#ifdef CONFIG_NO_HZ_COMMON
+		unsigned int has_blocked = READ_ONCE(nohz.has_blocked);
+		unsigned long next_blocked = READ_ONE(nohz.next_blocked);
+
+		if (has_blocked && time_after_eq(jiffies, next_blocked))
+			nohz_blocked = true;
+		else
+#endif
+		goto short_idle;
+	}
+
 	raw_spin_unlock(&this_rq->lock);
 
 	update_blocked_averages(this_cpu);
@@ -8919,9 +8955,13 @@ static int idle_balance(struct rq *this_
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
 			t0 = sched_clock_cpu(this_cpu);
 
-			pulled_task = load_balance(this_cpu, this_rq,
-						   sd, CPU_NEWLY_IDLE,
-						   &continue_balancing);
+			if (nohz_blocked) {
+				nohz_age(sd);
+			} else {
+				pulled_task = load_balance(this_cpu, this_rq,
+						sd, CPU_NEWLY_IDLE,
+						&continue_balancing);
+			}
 
 			domain_cost = sched_clock_cpu(this_cpu) - t0;
 			if (domain_cost > sd->max_newidle_lb_cost)
@@ -9497,8 +9537,7 @@ static bool nohz_idle_balance(struct rq
 
 		rq = cpu_rq(balance_cpu);
 
-		update_blocked_averages(rq->cpu);
-		has_blocked_load |= rq->has_blocked_load;
+		has_blocked_load |= update_nohz_stats(rq, true);
 
 		/*
 		 * If time for next balance is due,

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 3/3] sched: update blocked load when newly idle
  2018-02-12 12:04   ` Peter Zijlstra
@ 2018-02-12 14:34     ` Vincent Guittot
  2018-02-12 15:38       ` Peter Zijlstra
  0 siblings, 1 reply; 13+ messages in thread
From: Vincent Guittot @ 2018-02-12 14:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, linux-kernel, valentin.schneider, morten.rasmussen,
	brendan.jackman, dietmar.eggemann

Le Monday 12 Feb 2018 à 13:04:11 (+0100), Peter Zijlstra a écrit :
> On Mon, Feb 12, 2018 at 09:07:54AM +0100, Vincent Guittot wrote:
> > @@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
> >  
> >  	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
> >  	    !this_rq->rd->overload) {
> > +		unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
> > +		unsigned long next_blocked = READ_ONCE(nohz.next_blocked);
> > +
> >  		rcu_read_lock();
> >  		sd = rcu_dereference_check_sched_domain(this_rq->sd);
> >  		if (sd)
> >  			update_next_balance(sd, &next_balance);
> >  		rcu_read_unlock();
> >  
> > +		/*
> > +		 * Update blocked idle load if it has not been done for a
> > +		 * while. Try to do it locally before entering idle but kick a
> > +		 * ilb if it takes too much time and/or might delay next local
> > +		 * wake up
> > +		 */
> > +		if (has_blocked && time_after_eq(jiffies, next_blocked) &&
> > +				(this_rq->avg_idle < sysctl_sched_migration_cost ||
> > +				!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
> > +			kick_ilb(NOHZ_STATS_KICK);
> > +
> >  		goto out;
> >  	}
> 
> So I really hate this one, also I suspect its broken, because we do this
> check before dropping rq->lock and _nohz_idle_balance() will take
> rq->lock.

yes. it will take both newly idle rq and idle rq lock

>
> 
> Aside from the above being an unreadable mess, I dislike that it breaks
> the various isolation crud, we should not touch CPUs outside of our
> domain.
>
> 
> Maybe something like the below? (unfinished)
>

good catch. I completely miss the isolation stuff.
But isn't already the case when kicking ilb ? I mean that an idle CPU touches
all idle CPUs and some can be outside its domain during ilb.

The goal of this call to _nohz_idle_balance was to consider this cpu as idle
because that's what will happen as it will not try to pull other task.
We save the kick of an already idle cpu for doing the update of blocked load

Shouldn't we test housekeeping_cpu(cpu, HK_FLAG_SCHED) instead if we want to
make sure that an isolated/full nohz CPU will not be used for updating blocked
load of CPUs outside its domain ?

Is something below more readable:
 
 		/*
-		 * Update blocked idle load if it has not been done for a
-		 * while. Try to do it locally before entering idle but kick a
-		 * ilb if it takes too much time and/or might delay next local
-		 * wake up
+		 * This CPU doesn't want to be disturbed by scheduler
+		 * houskeeping
 		 */
-		if (has_blocked && time_after_eq(jiffies, next_blocked) &&
-				(this_rq->avg_idle < sysctl_sched_migration_cost ||
-				!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
+		if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
+			goto out;
+
+		/* Will wake up very soon. No time for doing anything else*/
+		if (this_rq->avg_idle < sysctl_sched_migration_cost)
+			goto out;
+
+		/* Don't need to update blocked load of idle CPUs*/
+		if (!has_blocked || time_after_eq(jiffies, next_blocked)
+			goto out;
+
+		raw_spin_unlock(&this_rq->lock);
+		/*
+		 * This CPU is going to be idle and blocked load of idle CPUs
+		 * need to be updated. Run the ilb locally as it is a good
+		 * candidate for ilb instead of waking up another idle CPU.
+		 * Kick an normal ilb if we failed to do the update.
+		 */
+		if !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
 			kick_ilb(NOHZ_STATS_KICK);
+		raw_spin_lock(&this_rq->lock);
 
 		goto out;

> 
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7839,7 +7839,7 @@ group_type group_classify(struct sched_g
>  	return group_other;
>  }
>  
> -static bool update_nohz_stats(struct rq *rq)
> +static bool update_nohz_stats(struct rq *rq, bool force)
>  {
>  #ifdef CONFIG_NO_HZ_COMMON
>  	unsigned int cpu = rq->cpu;
> @@ -7850,7 +7850,7 @@ static bool update_nohz_stats(struct rq
>  	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
>  		return false;
>  
> -	if (!time_after(jiffies, rq->last_blocked_load_update_tick))
> +	if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))

This fix the concern raised on the other thread, isn't it ?

>  		return true;
>  
>  	update_blocked_averages(cpu);
> @@ -7883,7 +7883,7 @@ static inline void update_sg_lb_stats(st
>  	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
>  		struct rq *rq = cpu_rq(i);
>  
> -		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
> +		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
>  			env->flags |= LBF_NOHZ_AGAIN;
>  
>  		/* Bias balancing toward cpus of our domain */
> @@ -8857,6 +8857,29 @@ update_next_balance(struct sched_domain
>  		*next_balance = next;
>  }
>  
> +static int nohz_age(struct sched_domain *sd)
> +{
> +	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
> +	bool has_blocked_load;
> +
> +	WRITE_ONCE(nohz.has_blocked, 0);
> +
> +	smp_mb();
> +
> +	cpumask_and(cpus, sched_domain_span(sd), nohz.idle_cpus_mask);
> +
> +	has_blocked_load = cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(sd));
> +
> +	for_each_cpu(cpu, cpus) {
> +		struct rq *rq = cpu_rq(cpu);
> +
> +		has_blocked_load |= update_nohz_stats(rq, true);
> +	}
> +
> +	if (has_blocked_load)
> +		WRITE_ONCE(nohz.has_blocked, 1);
> +}
> +

we duplicate what is done in nohe_idle_balance

>  /*
>   * idle_balance is called by schedule() if this_cpu is about to become
>   * idle. Attempts to pull tasks from other CPUs.
> @@ -8868,6 +8891,7 @@ static int idle_balance(struct rq *this_
>  	struct sched_domain *sd;
>  	int pulled_task = 0;
>  	u64 curr_cost = 0;
> +	bool nohz_blocked = false;
>  
>  	/*
>  	 * We must set idle_stamp _before_ calling idle_balance(), such that we
> @@ -8889,8 +8913,8 @@ static int idle_balance(struct rq *this_
>  	 */
>  	rq_unpin_lock(this_rq, rf);
>  
> -	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
> -	    !this_rq->rd->overload) {
> +	if (this_rq->avg_idle < sysctl_sched_migration_cost) {
> +short_idle:
>  		rcu_read_lock();
>  		sd = rcu_dereference_check_sched_domain(this_rq->sd);
>  		if (sd)
> @@ -8900,6 +8924,18 @@ static int idle_balance(struct rq *this_
>  		goto out;
>  	}
>  
> +	if (!this_rq->rd->overload) {
> +#ifdef CONFIG_NO_HZ_COMMON
> +		unsigned int has_blocked = READ_ONCE(nohz.has_blocked);
> +		unsigned long next_blocked = READ_ONE(nohz.next_blocked);
> +
> +		if (has_blocked && time_after_eq(jiffies, next_blocked))
> +			nohz_blocked = true;
> +		else
> +#endif
> +		goto short_idle;
> +	}
> +
>  	raw_spin_unlock(&this_rq->lock);
>  
>  	update_blocked_averages(this_cpu);
> @@ -8919,9 +8955,13 @@ static int idle_balance(struct rq *this_
>  		if (sd->flags & SD_BALANCE_NEWIDLE) {
>  			t0 = sched_clock_cpu(this_cpu);
>  
> -			pulled_task = load_balance(this_cpu, this_rq,
> -						   sd, CPU_NEWLY_IDLE,
> -						   &continue_balancing);
> +			if (nohz_blocked) {
> +				nohz_age(sd);

Do we really need to loop all sched_domain of newly idle CPU and call
nohz_age for each level ?
Can't we only call  nohz_age with the widest/last sched_domain level ?

Furthermore, we use sd->max_newidle_lb_cost to decide to abort the loop.
But this is updated with full load balancing which is longer than just
updating blocked load.
This will increase the chance to abort before reaching the last level.

> +			} else {
> +				pulled_task = load_balance(this_cpu, this_rq,
> +						sd, CPU_NEWLY_IDLE,
> +						&continue_balancing);
> +			}
>  
>  			domain_cost = sched_clock_cpu(this_cpu) - t0;
>  			if (domain_cost > sd->max_newidle_lb_cost)

We have to kick an ilb if we must abort before looping all levels and all
idle CPUs otherwise we can have situation where the load of some idle CPus
could stay blocked

> @@ -9497,8 +9537,7 @@ static bool nohz_idle_balance(struct rq
>  
>  		rq = cpu_rq(balance_cpu);
>  
> -		update_blocked_averages(rq->cpu);
> -		has_blocked_load |= rq->has_blocked_load;
> +		has_blocked_load |= update_nohz_stats(rq, true);
>  
>  		/*
>  		 * If time for next balance is due,

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 3/3] sched: update blocked load when newly idle
  2018-02-12 14:34     ` Vincent Guittot
@ 2018-02-12 15:38       ` Peter Zijlstra
  2018-02-12 16:06         ` Vincent Guittot
  2018-03-01 16:09         ` Frederic Weisbecker
  0 siblings, 2 replies; 13+ messages in thread
From: Peter Zijlstra @ 2018-02-12 15:38 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: mingo, linux-kernel, valentin.schneider, morten.rasmussen,
	brendan.jackman, dietmar.eggemann, Frederic Weisbecker

On Mon, Feb 12, 2018 at 03:34:44PM +0100, Vincent Guittot wrote:
> Le Monday 12 Feb 2018 à 13:04:11 (+0100), Peter Zijlstra a écrit :
> > On Mon, Feb 12, 2018 at 09:07:54AM +0100, Vincent Guittot wrote:

> > So I really hate this one, also I suspect its broken, because we do this
> > check before dropping rq->lock and _nohz_idle_balance() will take
> > rq->lock.
> 
> yes. it will take both newly idle rq and idle rq lock

Right, can't do that, there's ordering rules for multiple RQ locks etc..

> 
> >
> > 
> > Aside from the above being an unreadable mess, I dislike that it breaks
> > the various isolation crud, we should not touch CPUs outside of our
> > domain.
> >
> > 
> > Maybe something like the below? (unfinished)
> >
> 
> good catch. I completely miss the isolation stuff.
> But isn't already the case when kicking ilb ? I mean that an idle CPU touches
> all idle CPUs and some can be outside its domain during ilb.

> Shouldn't we test housekeeping_cpu(cpu, HK_FLAG_SCHED) instead if we want to
> make sure that an isolated/full nohz CPU will not be used for updating blocked
> load of CPUs outside its domain ?

I _thought_ we had some 'housekeeping' crud in the ilb selection logic,
but now I can't find it. Frederic?

> Is something below more readable:
>  
>  		/*
> +		 * This CPU doesn't want to be disturbed by scheduler
> +		 * houskeeping
>  		 */
> +		if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
> +			goto out;
> +
> +		/* Will wake up very soon. No time for doing anything else*/
> +		if (this_rq->avg_idle < sysctl_sched_migration_cost)
> +			goto out;
> +
> +		/* Don't need to update blocked load of idle CPUs*/
> +		if (!has_blocked || time_after_eq(jiffies, next_blocked)
> +			goto out;
> +
> +		raw_spin_unlock(&this_rq->lock);
> +		/*
> +		 * This CPU is going to be idle and blocked load of idle CPUs
> +		 * need to be updated. Run the ilb locally as it is a good
> +		 * candidate for ilb instead of waking up another idle CPU.
> +		 * Kick an normal ilb if we failed to do the update.
> +		 */
> +		if !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
>  			kick_ilb(NOHZ_STATS_KICK);
> +		raw_spin_lock(&this_rq->lock);
>  
>  		goto out;

It is, but I think you're still doing that avg_idle thing twice now,
right?

> > @@ -7850,7 +7850,7 @@ static bool update_nohz_stats(struct rq
> >  	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
> >  		return false;
> >  
> > -	if (!time_after(jiffies, rq->last_blocked_load_update_tick))
> > +	if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
> 
> This fix the concern raised on the other thread, isn't it ?

Yes.

> > +static int nohz_age(struct sched_domain *sd)
> > +{
> > +	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
> > +	bool has_blocked_load;
> > +
> > +	WRITE_ONCE(nohz.has_blocked, 0);
> > +
> > +	smp_mb();
> > +
> > +	cpumask_and(cpus, sched_domain_span(sd), nohz.idle_cpus_mask);
> > +
> > +	has_blocked_load = cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(sd));
> > +
> > +	for_each_cpu(cpu, cpus) {
> > +		struct rq *rq = cpu_rq(cpu);
> > +
> > +		has_blocked_load |= update_nohz_stats(rq, true);
> > +	}
> > +
> > +	if (has_blocked_load)
> > +		WRITE_ONCE(nohz.has_blocked, 1);
> > +}
> > +
> 
> we duplicate what is done in nohe_idle_balance

In parts yes.. I was too lazy to combine :-)

> > @@ -8919,9 +8955,13 @@ static int idle_balance(struct rq *this_
> >  		if (sd->flags & SD_BALANCE_NEWIDLE) {
> >  			t0 = sched_clock_cpu(this_cpu);
> >  
> > -			pulled_task = load_balance(this_cpu, this_rq,
> > -						   sd, CPU_NEWLY_IDLE,
> > -						   &continue_balancing);
> > +			if (nohz_blocked) {
> > +				nohz_age(sd);
> 
> Do we really need to loop all sched_domain of newly idle CPU and call
> nohz_age for each level ?
> Can't we only call  nohz_age with the widest/last sched_domain level ?

Yeah, dunno. I went back and forth on that a bit. The largest is
rq->rd->span. The reason I settled on this variant in the end is that it
keeps locality. When short idle, it will only scan nearby CPUs instead
of reaching half-way across the machine.

> Furthermore, we use sd->max_newidle_lb_cost to decide to abort the loop.
> But this is updated with full load balancing which is longer than just
> updating blocked load.
> This will increase the chance to abort before reaching the last level.

Yes.. I figured we'd take that hit :/

> > +			} else {
> > +				pulled_task = load_balance(this_cpu, this_rq,
> > +						sd, CPU_NEWLY_IDLE,
> > +						&continue_balancing);
> > +			}
> >  
> >  			domain_cost = sched_clock_cpu(this_cpu) - t0;
> >  			if (domain_cost > sd->max_newidle_lb_cost)
> 
> We have to kick an ilb if we must abort before looping all levels and all
> idle CPUs otherwise we can have situation where the load of some idle CPus
> could stay blocked

Yes, like said, was unfinished, I gave up before I got to that.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 3/3] sched: update blocked load when newly idle
  2018-02-12 15:38       ` Peter Zijlstra
@ 2018-02-12 16:06         ` Vincent Guittot
  2018-03-01 16:09         ` Frederic Weisbecker
  1 sibling, 0 replies; 13+ messages in thread
From: Vincent Guittot @ 2018-02-12 16:06 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Valentin Schneider, Morten Rasmussen,
	Brendan Jackman, Dietmar Eggemann, Frederic Weisbecker

On 12 February 2018 at 16:38, Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Feb 12, 2018 at 03:34:44PM +0100, Vincent Guittot wrote:
>> Le Monday 12 Feb 2018 à 13:04:11 (+0100), Peter Zijlstra a écrit :
>> > On Mon, Feb 12, 2018 at 09:07:54AM +0100, Vincent Guittot wrote:
>
>> > So I really hate this one, also I suspect its broken, because we do this
>> > check before dropping rq->lock and _nohz_idle_balance() will take
>> > rq->lock.
>>
>> yes. it will take both newly idle rq and idle rq lock
>
> Right, can't do that, there's ordering rules for multiple RQ locks etc..
>
>>
>> >
>> >
>> > Aside from the above being an unreadable mess, I dislike that it breaks
>> > the various isolation crud, we should not touch CPUs outside of our
>> > domain.
>> >
>> >
>> > Maybe something like the below? (unfinished)
>> >
>>
>> good catch. I completely miss the isolation stuff.
>> But isn't already the case when kicking ilb ? I mean that an idle CPU touches
>> all idle CPUs and some can be outside its domain during ilb.
>
>> Shouldn't we test housekeeping_cpu(cpu, HK_FLAG_SCHED) instead if we want to
>> make sure that an isolated/full nohz CPU will not be used for updating blocked
>> load of CPUs outside its domain ?
>
> I _thought_ we had some 'housekeeping' crud in the ilb selection logic,
> but now I can't find it. Frederic?
>
>> Is something below more readable:
>>
>>               /*
>> +              * This CPU doesn't want to be disturbed by scheduler
>> +              * houskeeping
>>                */
>> +             if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
>> +                     goto out;
>> +
>> +             /* Will wake up very soon. No time for doing anything else*/
>> +             if (this_rq->avg_idle < sysctl_sched_migration_cost)
>> +                     goto out;
>> +
>> +             /* Don't need to update blocked load of idle CPUs*/
>> +             if (!has_blocked || time_after_eq(jiffies, next_blocked)
>> +                     goto out;
>> +
>> +             raw_spin_unlock(&this_rq->lock);
>> +             /*
>> +              * This CPU is going to be idle and blocked load of idle CPUs
>> +              * need to be updated. Run the ilb locally as it is a good
>> +              * candidate for ilb instead of waking up another idle CPU.
>> +              * Kick an normal ilb if we failed to do the update.
>> +              */
>> +             if !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
>>                       kick_ilb(NOHZ_STATS_KICK);
>> +             raw_spin_lock(&this_rq->lock);
>>
>>               goto out;
>
> It is, but I think you're still doing that avg_idle thing twice now,
> right?

yes the goal was to try to not exceed idle time but I wonder if it is
really needed because the need_resched() in the
"for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
" will abort the loop if something is schedule on this_cpu just like
for a normal ilb().
So I think that we can remove this test with avg_idle.

>
>> > @@ -7850,7 +7850,7 @@ static bool update_nohz_stats(struct rq
>> >     if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
>> >             return false;
>> >
>> > -   if (!time_after(jiffies, rq->last_blocked_load_update_tick))
>> > +   if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
>>
>> This fix the concern raised on the other thread, isn't it ?
>
> Yes.
>
>> > +static int nohz_age(struct sched_domain *sd)
>> > +{
>> > +   struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
>> > +   bool has_blocked_load;
>> > +
>> > +   WRITE_ONCE(nohz.has_blocked, 0);
>> > +
>> > +   smp_mb();
>> > +
>> > +   cpumask_and(cpus, sched_domain_span(sd), nohz.idle_cpus_mask);
>> > +
>> > +   has_blocked_load = cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(sd));
>> > +
>> > +   for_each_cpu(cpu, cpus) {
>> > +           struct rq *rq = cpu_rq(cpu);
>> > +
>> > +           has_blocked_load |= update_nohz_stats(rq, true);
>> > +   }
>> > +
>> > +   if (has_blocked_load)
>> > +           WRITE_ONCE(nohz.has_blocked, 1);
>> > +}
>> > +
>>
>> we duplicate what is done in nohe_idle_balance
>
> In parts yes.. I was too lazy to combine :-)
>
>> > @@ -8919,9 +8955,13 @@ static int idle_balance(struct rq *this_
>> >             if (sd->flags & SD_BALANCE_NEWIDLE) {
>> >                     t0 = sched_clock_cpu(this_cpu);
>> >
>> > -                   pulled_task = load_balance(this_cpu, this_rq,
>> > -                                              sd, CPU_NEWLY_IDLE,
>> > -                                              &continue_balancing);
>> > +                   if (nohz_blocked) {
>> > +                           nohz_age(sd);
>>
>> Do we really need to loop all sched_domain of newly idle CPU and call
>> nohz_age for each level ?
>> Can't we only call  nohz_age with the widest/last sched_domain level ?
>
> Yeah, dunno. I went back and forth on that a bit. The largest is
> rq->rd->span. The reason I settled on this variant in the end is that it
> keeps locality. When short idle, it will only scan nearby CPUs instead
> of reaching half-way across the machine.
>
>> Furthermore, we use sd->max_newidle_lb_cost to decide to abort the loop.
>> But this is updated with full load balancing which is longer than just
>> updating blocked load.
>> This will increase the chance to abort before reaching the last level.
>
> Yes.. I figured we'd take that hit :/
>
>> > +                   } else {
>> > +                           pulled_task = load_balance(this_cpu, this_rq,
>> > +                                           sd, CPU_NEWLY_IDLE,
>> > +                                           &continue_balancing);
>> > +                   }
>> >
>> >                     domain_cost = sched_clock_cpu(this_cpu) - t0;
>> >                     if (domain_cost > sd->max_newidle_lb_cost)
>>
>> We have to kick an ilb if we must abort before looping all levels and all
>> idle CPUs otherwise we can have situation where the load of some idle CPus
>> could stay blocked
>
> Yes, like said, was unfinished, I gave up before I got to that.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 3/3] sched: update blocked load when newly idle
  2018-02-12 15:38       ` Peter Zijlstra
  2018-02-12 16:06         ` Vincent Guittot
@ 2018-03-01 16:09         ` Frederic Weisbecker
  1 sibling, 0 replies; 13+ messages in thread
From: Frederic Weisbecker @ 2018-03-01 16:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Vincent Guittot, mingo, linux-kernel, valentin.schneider,
	morten.rasmussen, brendan.jackman, dietmar.eggemann,
	Frederic Weisbecker

On Mon, Feb 12, 2018 at 04:38:05PM +0100, Peter Zijlstra wrote:
> On Mon, Feb 12, 2018 at 03:34:44PM +0100, Vincent Guittot wrote:
> > > Aside from the above being an unreadable mess, I dislike that it breaks
> > > the various isolation crud, we should not touch CPUs outside of our
> > > domain.
> > >
> > > 
> > > Maybe something like the below? (unfinished)
> > >
> > 
> > good catch. I completely miss the isolation stuff.
> > But isn't already the case when kicking ilb ? I mean that an idle CPU touches
> > all idle CPUs and some can be outside its domain during ilb.
> 
> > Shouldn't we test housekeeping_cpu(cpu, HK_FLAG_SCHED) instead if we want to
> > make sure that an isolated/full nohz CPU will not be used for updating blocked
> > load of CPUs outside its domain ?
> 
> I _thought_ we had some 'housekeeping' crud in the ilb selection logic,
> but now I can't find it. Frederic?

I think you're referring to nohz_balance_idle(). The call is still there but HK_FLAG_SCHED
is unused for now. I initially turned it on by default on nohz_full but some people
complained. I don't recall why exactly. Anyway I'm waiting for a suitable interface
to use it.

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2018-03-01 16:09 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-02-12  8:07 [PATCH v3 0/3] sched: Update blocked load Vincent Guittot
2018-02-12  8:07 ` [PATCH v3 1/3] sched: Stop nohz stats when decayed Vincent Guittot
2018-02-12  9:41   ` Peter Zijlstra
2018-02-12  9:52     ` Vincent Guittot
2018-02-12 10:45   ` Peter Zijlstra
2018-02-12 11:02     ` Vincent Guittot
2018-02-12  8:07 ` [PATCH v3 2/3] sched: reduce the periodic update duration Vincent Guittot
2018-02-12  8:07 ` [PATCH v3 3/3] sched: update blocked load when newly idle Vincent Guittot
2018-02-12 12:04   ` Peter Zijlstra
2018-02-12 14:34     ` Vincent Guittot
2018-02-12 15:38       ` Peter Zijlstra
2018-02-12 16:06         ` Vincent Guittot
2018-03-01 16:09         ` Frederic Weisbecker

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.