linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs
@ 2017-04-26  0:40 Tejun Heo
  2017-04-26  0:43 ` [2/2] sched/fair: Fix O(# total cgroups) in load balance path Tejun Heo
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Tejun Heo @ 2017-04-26  0:40 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra
  Cc: "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com"

Currently, rq->leaf_cfs_rq_list is a traversal ordered list of all
live cfs_rqs which have ever been active on the CPU; unfortunately,
this makes update_blocked_averages() O(total number of CPU cgroups)
which isn't scalable at all.

The next patch will make rq->leaf_cfs_rq_list only contain the cfs_rqs
which are currently active.  In preparation, this patch converts users
which need to traverse all cfs_rqs to use task_groups list instead.

task_groups list is protected by its own lock.  While it allows RCU
protected traversal and the order of operations guarantees that all
online cfs_rqs will be visited, holding rq->lock won't protect against
iterating an already unregistered cfs_rq.  To avoid operating on an
already dead cfs_rq, a new state variable cfs_rq->online is added
which is protected by rq->lock and tracks whether the cfs_rq is
registered.

As clearing of cfs_rq->online should be protected by rq->lock, the
locking avoidance optimization in unregister_fair_sched_group() is
removed.  Given that the optimization is meaningful only when removing
cgroups which are created but never used and the general heaviness of
cgroup removal, the impact is unlikely to be noticeable.

Another change is that print_cfs_stats() would now print cfs_rqs in a
different order.  If this matters, we can improve it later so that it
first prints cfs_rqs on leaf_cfs_rq_list and then follow up with the
rest.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Turner <pjt@google.com>
Cc: Chris Mason <clm@fb.com>
---
Hello,

This is another set of CPU controller performance fix patches and
based on top of -next as of today.

Thanks.

 kernel/sched/fair.c  |   45 +++++++++++++++++++++++++--------------------
 kernel/sched/sched.h |    1 +
 2 files changed, 26 insertions(+), 20 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4644,23 +4644,32 @@ static void destroy_cfs_bandwidth(struct
 
 static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+		if (!cfs_rq->online)
+			continue;
 
 		raw_spin_lock(&cfs_b->lock);
 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
 		raw_spin_unlock(&cfs_b->lock);
 	}
+	rcu_read_unlock();
 }
 
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		if (!cfs_rq->runtime_enabled)
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+		if (!cfs_rq->online || !cfs_rq->runtime_enabled)
 			continue;
 
 		/*
@@ -4677,6 +4686,7 @@ static void __maybe_unused unthrottle_of
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 	}
+	rcu_read_unlock();
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */
@@ -9345,6 +9355,7 @@ void online_fair_sched_group(struct task
 		se = tg->se[i];
 
 		raw_spin_lock_irq(&rq->lock);
+		se->my_q->online = 1;
 		update_rq_clock(rq);
 		attach_entity_cfs_rq(se);
 		sync_throttle(tg, i);
@@ -9355,24 +9366,18 @@ void online_fair_sched_group(struct task
 void unregister_fair_sched_group(struct task_group *tg)
 {
 	unsigned long flags;
-	struct rq *rq;
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+
 		if (tg->se[cpu])
 			remove_entity_load_avg(tg->se[cpu]);
 
-		/*
-		 * Only empty task groups can be destroyed; so we can speculatively
-		 * check on_list without danger of it being re-added.
-		 */
-		if (!tg->cfs_rq[cpu]->on_list)
-			continue;
-
-		rq = cpu_rq(cpu);
-
 		raw_spin_lock_irqsave(&rq->lock, flags);
-		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+		list_del_leaf_cfs_rq(cfs_rq);
+		cfs_rq->online = 0;
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 }
@@ -9523,11 +9528,11 @@ const struct sched_class fair_sched_clas
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
 
 	rcu_read_lock();
-	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
-		print_cfs_rq(m, cpu, cfs_rq);
+	list_for_each_entry_rcu(tg, &task_groups, list)
+		print_cfs_rq(m, cpu, tg->cfs_rq[cpu]);
 	rcu_read_unlock();
 }
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -448,6 +448,7 @@ struct cfs_rq {
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
+	int online;	/* online state, protected by rq->lock */
 
 	/*
 	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [2/2] sched/fair: Fix O(# total cgroups) in load balance path
  2017-04-26  0:40 [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs Tejun Heo
@ 2017-04-26  0:43 ` Tejun Heo
  2017-05-01 16:11   ` Peter Zijlstra
                     ` (2 more replies)
  2017-05-01 16:24 ` [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs Peter Zijlstra
                   ` (2 subsequent siblings)
  3 siblings, 3 replies; 14+ messages in thread
From: Tejun Heo @ 2017-04-26  0:43 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra
  Cc: "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com"

Currently, rq->leaf_cfs_rq_list is a traversal ordered list of all
live cfs_rqs which have ever been active on the CPU; unfortunately,
this makes update_blocked_averages() O(# total cgroups) which isn't
scalable at all.

This shows up as a small CPU consumption and scheduling latency
increase in the load balancing path in systems with CPU controller
enabled across most cgroups.  In an edge case where temporary cgroups
were leaking, this caused the kernel to consume a good several tens of
percents of CPU cycles running update_blocked_averages(), each run
taking multiple millisecs.

This patch fixes the issue by taking empty and fully decayed cfs_rqs
off the rq->leaf_cfs_rq_list.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Turner <pjt@google.com>
Cc: Chris Mason <clm@fb.com>
---
 kernel/sched/fair.c |   19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(
 }
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
+	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
+				 leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
@@ -463,7 +464,7 @@ static inline void list_del_leaf_cfs_rq(
 {
 }
 
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -6983,7 +6984,7 @@ static void attach_tasks(struct lb_env *
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 	struct rq_flags rf;
 
 	rq_lock_irqsave(rq, &rf);
@@ -6993,7 +6994,7 @@ static void update_blocked_averages(int
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 		struct sched_entity *se;
 
 		/* throttled entities do not contribute to load */
@@ -7007,6 +7008,14 @@ static void update_blocked_averages(int
 		se = cfs_rq->tg->se[cpu];
 		if (se && !skip_blocked_update(se))
 			update_load_avg(se, 0);
+
+		/*
+		 * There can be a lot of idle CPU cgroups.  Don't let fully
+		 * decayed cfs_rqs linger on the list.
+		 */
+		if (!cfs_rq->load.weight && !cfs_rq->avg.load_sum &&
+		    !cfs_rq->avg.util_sum && !cfs_rq->runnable_load_sum)
+			list_del_leaf_cfs_rq(cfs_rq);
 	}
 	rq_unlock_irqrestore(rq, &rf);
 }

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [2/2] sched/fair: Fix O(# total cgroups) in load balance path
  2017-04-26  0:43 ` [2/2] sched/fair: Fix O(# total cgroups) in load balance path Tejun Heo
@ 2017-05-01 16:11   ` Peter Zijlstra
  2017-05-01 19:11     ` Tejun Heo
  2017-05-15  9:13   ` [tip:sched/core] sched/fair: Fix O(nr_cgroups) " tip-bot for Tejun Heo
  2017-05-15 10:13   ` tip-bot for Tejun Heo
  2 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2017-05-01 16:11 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com"

On Tue, Apr 25, 2017 at 05:43:50PM -0700, Tejun Heo wrote:
> @@ -7007,6 +7008,14 @@ static void update_blocked_averages(int
>  		se = cfs_rq->tg->se[cpu];
>  		if (se && !skip_blocked_update(se))
>  			update_load_avg(se, 0);
> +
> +		/*
> +		 * There can be a lot of idle CPU cgroups.  Don't let fully
> +		 * decayed cfs_rqs linger on the list.
> +		 */
> +		if (!cfs_rq->load.weight && !cfs_rq->avg.load_sum &&
> +		    !cfs_rq->avg.util_sum && !cfs_rq->runnable_load_sum)
> +			list_del_leaf_cfs_rq(cfs_rq);
>  	}
>  	rq_unlock_irqrestore(rq, &rf);
>  }

Right this is a 'known' issue and we recently talked about this.

I think you got the condition right, we want to wait for all the stuff
to be decayed out before taking it off the list.

The only 'problem', which Vincent mentioned in that other thread, is that
NOHZ idle doesn't guarantee decay -- then again, you don't want to go
wake a CPU just to decay this crud either. And if we're idle, the list
being long doesn't matter either.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs
  2017-04-26  0:40 [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs Tejun Heo
  2017-04-26  0:43 ` [2/2] sched/fair: Fix O(# total cgroups) in load balance path Tejun Heo
@ 2017-05-01 16:24 ` Peter Zijlstra
  2017-05-01 16:59 ` Peter Zijlstra
  2017-05-04 13:31 ` Peter Zijlstra
  3 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2017-05-01 16:24 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com"

On Tue, Apr 25, 2017 at 05:40:39PM -0700, Tejun Heo wrote:

> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4644,23 +4644,32 @@ static void destroy_cfs_bandwidth(struct
>  
>  static void __maybe_unused update_runtime_enabled(struct rq *rq)
>  {
> -	struct cfs_rq *cfs_rq;
> +	struct task_group *tg;
>  
> -	for_each_leaf_cfs_rq(rq, cfs_rq) {
> -		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(tg, &task_groups, list) {
> +		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
> +		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> +
> +		if (!cfs_rq->online)
> +			continue;
>  
>  		raw_spin_lock(&cfs_b->lock);
>  		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
>  		raw_spin_unlock(&cfs_b->lock);
>  	}
> +	rcu_read_unlock();
>  }
>  
>  static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
>  {
> -	struct cfs_rq *cfs_rq;
> +	struct task_group *tg;
>  
> -	for_each_leaf_cfs_rq(rq, cfs_rq) {
> -		if (!cfs_rq->runtime_enabled)
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(tg, &task_groups, list) {
> +		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> +
> +		if (!cfs_rq->online || !cfs_rq->runtime_enabled)
>  			continue;
>  
>  		/*
> @@ -4677,6 +4686,7 @@ static void __maybe_unused unthrottle_of
>  		if (cfs_rq_throttled(cfs_rq))
>  			unthrottle_cfs_rq(cfs_rq);
>  	}
> +	rcu_read_unlock();
>  }

Note that both these are called with rq->lock held. I don't think we
need to actually add rcu_read_lock() here, as it will fully serialize
against your ->online = 0 that holds rq->lock.

Also, arguably you can keep using for_each_leaf_cfs_rq() for unthrottle,
because I don't think we can (should?) remove a throttled group from the
leaf list -- its not empty after all.

Then again, this is CPU hotplug code and nobody cares about its
performance much, and its better to be safe than sorry, so yes, use
task_groups list to make sure to reach all groups.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs
  2017-04-26  0:40 [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs Tejun Heo
  2017-04-26  0:43 ` [2/2] sched/fair: Fix O(# total cgroups) in load balance path Tejun Heo
  2017-05-01 16:24 ` [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs Peter Zijlstra
@ 2017-05-01 16:59 ` Peter Zijlstra
  2017-05-01 17:02   ` Peter Zijlstra
  2017-05-04 13:31 ` Peter Zijlstra
  3 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2017-05-01 16:59 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com",
	Paul McKenney

On Tue, Apr 25, 2017 at 05:40:39PM -0700, Tejun Heo wrote:
> @@ -9355,24 +9366,18 @@ void online_fair_sched_group(struct task
>  void unregister_fair_sched_group(struct task_group *tg)
>  {
>  	unsigned long flags;
> -	struct rq *rq;
>  	int cpu;
>  
>  	for_each_possible_cpu(cpu) {
> +		struct rq *rq = cpu_rq(cpu);
> +		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
> +
>  		if (tg->se[cpu])
>  			remove_entity_load_avg(tg->se[cpu]);
>  
> -		/*
> -		 * Only empty task groups can be destroyed; so we can speculatively
> -		 * check on_list without danger of it being re-added.
> -		 */
> -		if (!tg->cfs_rq[cpu]->on_list)
> -			continue;
> -
> -		rq = cpu_rq(cpu);
> -
>  		raw_spin_lock_irqsave(&rq->lock, flags);
> -		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
> +		list_del_leaf_cfs_rq(cfs_rq);
> +		cfs_rq->online = 0;
>  		raw_spin_unlock_irqrestore(&rq->lock, flags);
>  	}
>  }

It would be nice to be able to retain that on_list check and avoid
taking all those rq->lock's.

Since per the previous mail, those online/offline loops are protected by
rq->lock, all we need is to ensure an rcu_sched GP passes between here
and sched_free_group().

Which I think we can do differently, see below. Then we don't need the
->online thing and can keep using the ->on_list check.


> @@ -9523,11 +9528,11 @@ const struct sched_class fair_sched_clas
>  #ifdef CONFIG_SCHED_DEBUG
>  void print_cfs_stats(struct seq_file *m, int cpu)
>  {
> -	struct cfs_rq *cfs_rq;
> +	struct task_group *tg;
>  
>  	rcu_read_lock();
> -	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
> -		print_cfs_rq(m, cpu, cfs_rq);
> +	list_for_each_entry_rcu(tg, &task_groups, list)
> +		print_cfs_rq(m, cpu, tg->cfs_rq[cpu]);
>  	rcu_read_unlock();
>  }

If you have a gazillion empty groups, do we really want to print them?



diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 74d9c3a1feee..8bad2c371b18 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -58,6 +58,11 @@ static inline void cond_synchronize_sched(unsigned long oldstate)
 	might_sleep();
 }
 
+static bool void should_synchornize_sched(unsigned long oldstate)
+{
+	return true;
+}
+
 extern void rcu_barrier_bh(void);
 extern void rcu_barrier_sched(void);
 
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 0bacb6b2af69..9918b632db27 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -78,6 +78,7 @@ unsigned long get_state_synchronize_rcu(void);
 void cond_synchronize_rcu(unsigned long oldstate);
 unsigned long get_state_synchronize_sched(void);
 void cond_synchronize_sched(unsigned long oldstate);
+bool should_synchronize_sched(unsigned long oldstate);
 
 extern unsigned long rcutorture_testseq;
 extern unsigned long rcutorture_vernum;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 91fff49d5869..59eb0d115217 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3423,6 +3423,18 @@ void cond_synchronize_sched(unsigned long oldstate)
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_sched);
 
+bool should_synchronize_sched(unsigned long oldstate)
+{
+	unsigned long newstate;
+
+	/*
+	 * Ensure that this load happens before any RCU-destructive
+	 * actions the caller might carry out after we return.
+	 */
+	newstate = smp_load_acquire(&rcu_sched_state.completed);
+	return ULONG_CMP_GE(oldstate, newstate);
+}
+
 /*
  * Check to see if there is any immediate RCU-related work to be done
  * by the current CPU, for the specified type of RCU, returning 1 if so.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ecad8e2d2a29..d7630b34d197 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6362,7 +6362,7 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
-static void sched_free_group(struct task_group *tg)
+static void __sched_free_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
@@ -6370,6 +6370,22 @@ static void sched_free_group(struct task_group *tg)
 	kmem_cache_free(task_group_cache, tg);
 }
 
+static void sched_free_group_rcu_sched(struct rcu_head *rhp)
+{
+	/* Now it should be safe to free those cfs_rqs: */
+	__sched_free_group(container_of(rhp, struct task_group, rcu));
+}
+
+static void sched_free_group(struct task_group *tg)
+{
+	if (should_synchronize_sched(tg->rcu_stamp)) {
+		call_rcu_sched(&tg->rcu, sched_free_group_rcu_sched);
+		return;
+	}
+
+	__sched_free_group(tg);
+}
+
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(struct task_group *parent)
 {
@@ -6434,6 +6450,8 @@ void sched_offline_group(struct task_group *tg)
 	list_del_rcu(&tg->list);
 	list_del_rcu(&tg->siblings);
 	spin_unlock_irqrestore(&task_group_lock, flags);
+
+	tg->rcu_stamp = get_state_synchronize_sched();
 }
 
 static void sched_change_group(struct task_struct *tsk, int type)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2e5e9e64c86..1dbab563e798 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -302,6 +302,7 @@ struct task_group {
 #endif
 
 	struct rcu_head rcu;
+	long rcu_stamp;
 	struct list_head list;
 
 	struct task_group *parent;

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs
  2017-05-01 16:59 ` Peter Zijlstra
@ 2017-05-01 17:02   ` Peter Zijlstra
  2017-05-01 19:07     ` Tejun Heo
  0 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2017-05-01 17:02 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com",
	Paul McKenney

On Mon, May 01, 2017 at 06:59:39PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 25, 2017 at 05:40:39PM -0700, Tejun Heo wrote:
> > @@ -9355,24 +9366,18 @@ void online_fair_sched_group(struct task
> >  void unregister_fair_sched_group(struct task_group *tg)
> >  {
> >  	unsigned long flags;
> > -	struct rq *rq;
> >  	int cpu;
> >  
> >  	for_each_possible_cpu(cpu) {
> > +		struct rq *rq = cpu_rq(cpu);
> > +		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
> > +
> >  		if (tg->se[cpu])
> >  			remove_entity_load_avg(tg->se[cpu]);
> >  
> > -		/*
> > -		 * Only empty task groups can be destroyed; so we can speculatively
> > -		 * check on_list without danger of it being re-added.
> > -		 */
> > -		if (!tg->cfs_rq[cpu]->on_list)
> > -			continue;
> > -
> > -		rq = cpu_rq(cpu);
> > -
> >  		raw_spin_lock_irqsave(&rq->lock, flags);
> > -		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
> > +		list_del_leaf_cfs_rq(cfs_rq);
> > +		cfs_rq->online = 0;
> >  		raw_spin_unlock_irqrestore(&rq->lock, flags);
> >  	}
> >  }
> 
> It would be nice to be able to retain that on_list check and avoid
> taking all those rq->lock's.
> 
> Since per the previous mail, those online/offline loops are protected by
> rq->lock, all we need is to ensure an rcu_sched GP passes between here
> and sched_free_group().
> 
> Which I think we can do differently, see below. Then we don't need the
> ->online thing and can keep using the ->on_list check.

n/m, I need to stop staring at a screen. Wrapping those two sites in
rcu_read_lock() achieves the very same.

So we want the rcu_read_lock() to serialize against sched_free_group,
but don't need the new ->online thing and can retain the ->on_list
stuff. Or I've completely lost the plot (which is entirely possible...)

I'll stare at this again tomorrow

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs
  2017-05-01 17:02   ` Peter Zijlstra
@ 2017-05-01 19:07     ` Tejun Heo
  0 siblings, 0 replies; 14+ messages in thread
From: Tejun Heo @ 2017-05-01 19:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com",
	Paul McKenney

Hello,

On Mon, May 01, 2017 at 07:02:41PM +0200, Peter Zijlstra wrote:
> n/m, I need to stop staring at a screen. Wrapping those two sites in
> rcu_read_lock() achieves the very same.
> 
> So we want the rcu_read_lock() to serialize against sched_free_group,
> but don't need the new ->online thing and can retain the ->on_list
> stuff. Or I've completely lost the plot (which is entirely possible...)
> 
> I'll stare at this again tomorrow

So, the rcu_read_lock() thing protects against sched_free_group() and
thanks to the order of operations, all online cfs_rq's are guaranteed
to be visbile in the two callbacks; however, nothing prevents the code
paths from seeing already dead cfs_rqs, which *may* be okay if the
code paths are safe to run on dead and unlinked cfs_rqs, but it's
still nasty and fragile.

The new ->online condition which is synchronized by rq->lock
guarantees that both functions only process live ones.  We need
something synchronized by rq->lock to guarantee this whether that's a
new list entry or a flag like ->online.  It'd be better to encapsulate
and document this iteration.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [2/2] sched/fair: Fix O(# total cgroups) in load balance path
  2017-05-01 16:11   ` Peter Zijlstra
@ 2017-05-01 19:11     ` Tejun Heo
  0 siblings, 0 replies; 14+ messages in thread
From: Tejun Heo @ 2017-05-01 19:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com"

Hello, Peter.

On Mon, May 01, 2017 at 06:11:58PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 25, 2017 at 05:43:50PM -0700, Tejun Heo wrote:
> > @@ -7007,6 +7008,14 @@ static void update_blocked_averages(int
> >  		se = cfs_rq->tg->se[cpu];
> >  		if (se && !skip_blocked_update(se))
> >  			update_load_avg(se, 0);
> > +
> > +		/*
> > +		 * There can be a lot of idle CPU cgroups.  Don't let fully
> > +		 * decayed cfs_rqs linger on the list.
> > +		 */
> > +		if (!cfs_rq->load.weight && !cfs_rq->avg.load_sum &&
> > +		    !cfs_rq->avg.util_sum && !cfs_rq->runnable_load_sum)
> > +			list_del_leaf_cfs_rq(cfs_rq);
> >  	}
> >  	rq_unlock_irqrestore(rq, &rf);
> >  }
> 
> Right this is a 'known' issue and we recently talked about this.
> 
> I think you got the condition right, we want to wait for all the stuff
> to be decayed out before taking it off the list.
> 
> The only 'problem', which Vincent mentioned in that other thread, is that
> NOHZ idle doesn't guarantee decay -- then again, you don't want to go
> wake a CPU just to decay this crud either. And if we're idle, the list
> being long doesn't matter either.

The list staying long is fine as long as nobody walks it; however, the
list can be *really* long, e.g. hundreds of thousands long, so walking
it repeatedly won't be a good idea even if the system is idle.  As
long as NOHZ decays and trims the list when it ends up walking the
list, and AFAICS it does, it should be fine.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs
  2017-04-26  0:40 [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs Tejun Heo
                   ` (2 preceding siblings ...)
  2017-05-01 16:59 ` Peter Zijlstra
@ 2017-05-04 13:31 ` Peter Zijlstra
  2017-05-04 17:31   ` Tejun Heo
  2017-05-15  9:12   ` [tip:sched/core] " tip-bot for Peter Zijlstra
  3 siblings, 2 replies; 14+ messages in thread
From: Peter Zijlstra @ 2017-05-04 13:31 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com"

On Tue, Apr 25, 2017 at 05:40:39PM -0700, Tejun Heo wrote:

> task_groups list is protected by its own lock.  While it allows RCU
> protected traversal and the order of operations guarantees that all
> online cfs_rqs will be visited, holding rq->lock won't protect against
> iterating an already unregistered cfs_rq.

So I think the below is sufficient.

Yes we can hit an (almost) dead cfs_rq, but poking the bandwidth
variables thereof is harmless.

unthrottle_cfs_rq() also stops doing anything much when it finds the
cfs_rq is empty, which must be the case if we're removing it.

I don't know Paul's opinion on RCU GPs happening while stop_machine(),
but just in case he feels that's fair game, I did add the
rcu_read_lock() thingies.

The lockdep assert is mostly documentation, to more easily see
it is indeed held when we get there.

I left print_cfs_stats using the leaf list, no point in printing stuff
that's empty.

This way we can avoid taking all RQ locks on cgroup destruction.

---
 kernel/sched/fair.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cd6c3f957d44..c897ad6d714c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4644,22 +4644,32 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 
 static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+	lockdep_assert_held(&rq->lock);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
 		raw_spin_lock(&cfs_b->lock);
 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
 		raw_spin_unlock(&cfs_b->lock);
 	}
+	rcu_read_unlock();
 }
 
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
+
+	lockdep_assert_held(&rq->lock);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
 		if (!cfs_rq->runtime_enabled)
 			continue;
 
@@ -4677,6 +4687,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 	}
+	rcu_read_unlock();
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs
  2017-05-04 13:31 ` Peter Zijlstra
@ 2017-05-04 17:31   ` Tejun Heo
  2017-05-15  9:12   ` [tip:sched/core] " tip-bot for Peter Zijlstra
  1 sibling, 0 replies; 14+ messages in thread
From: Tejun Heo @ 2017-05-04 17:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com"

Hello, Peter.

On Thu, May 04, 2017 at 03:31:22PM +0200, Peter Zijlstra wrote:
> Yes we can hit an (almost) dead cfs_rq, but poking the bandwidth
> variables thereof is harmless.
> 
> unthrottle_cfs_rq() also stops doing anything much when it finds the
> cfs_rq is empty, which must be the case if we're removing it.

Yeah, if you're okay with calling the functions on dead cfs_rq's, just
wrapping with rcu_read_lock should be enough.

> I don't know Paul's opinion on RCU GPs happening while stop_machine(),
> but just in case he feels that's fair game, I did add the
> rcu_read_lock() thingies.
> 
> The lockdep assert is mostly documentation, to more easily see
> it is indeed held when we get there.
> 
> I left print_cfs_stats using the leaf list, no point in printing stuff
> that's empty.
> 
> This way we can avoid taking all RQ locks on cgroup destruction.

Looks good to me.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [tip:sched/core] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs
  2017-05-04 13:31 ` Peter Zijlstra
  2017-05-04 17:31   ` Tejun Heo
@ 2017-05-15  9:12   ` tip-bot for Peter Zijlstra
  1 sibling, 0 replies; 14+ messages in thread
From: tip-bot for Peter Zijlstra @ 2017-05-15  9:12 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: clm, efault, peterz, linux-kernel, tj, hpa, torvalds, tglx, mingo, pjt

Commit-ID:  502ce005ab95d5d9481768649dbab808845b24d7
Gitweb:     http://git.kernel.org/tip/502ce005ab95d5d9481768649dbab808845b24d7
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 4 May 2017 15:31:22 +0200
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 15 May 2017 10:15:35 +0200

sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs

In order to allow leaf_cfs_rq_list to remove entries switch the
bandwidth hotplug code over to the task_groups list.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170504133122.a6qjlj3hlblbjxux@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index eede181..c8fa777 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4642,24 +4642,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	hrtimer_cancel(&cfs_b->slack_timer);
 }
 
+/*
+ * Both these cpu hotplug callbacks race against unregister_fair_sched_group()
+ *
+ * The race is harmless, since modifying bandwidth settings of unhooked group
+ * bits doesn't do much.
+ */
+
+/* cpu online callback */
 static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+	lockdep_assert_held(&rq->lock);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
 		raw_spin_lock(&cfs_b->lock);
 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
 		raw_spin_unlock(&cfs_b->lock);
 	}
+	rcu_read_unlock();
 }
 
+/* cpu offline callback */
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
+
+	lockdep_assert_held(&rq->lock);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
 		if (!cfs_rq->runtime_enabled)
 			continue;
 
@@ -4677,6 +4696,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 	}
+	rcu_read_unlock();
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [tip:sched/core] sched/fair: Fix O(nr_cgroups) in load balance path
  2017-04-26  0:43 ` [2/2] sched/fair: Fix O(# total cgroups) in load balance path Tejun Heo
  2017-05-01 16:11   ` Peter Zijlstra
@ 2017-05-15  9:13   ` tip-bot for Tejun Heo
  2017-05-15  9:43     ` Peter Zijlstra
  2017-05-15 10:13   ` tip-bot for Tejun Heo
  2 siblings, 1 reply; 14+ messages in thread
From: tip-bot for Tejun Heo @ 2017-05-15  9:13 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: efault, pjt, tj, hpa, torvalds, clm, vincent.guittot, mingo,
	peterz, linux-kernel, tglx

Commit-ID:  d517a114fc478a618eeee86c3fdf22db68b6d831
Gitweb:     http://git.kernel.org/tip/d517a114fc478a618eeee86c3fdf22db68b6d831
Author:     Tejun Heo <tj@kernel.org>
AuthorDate: Tue, 25 Apr 2017 17:43:50 -0700
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 15 May 2017 10:15:36 +0200

sched/fair: Fix O(nr_cgroups) in load balance path

Currently, rq->leaf_cfs_rq_list is a traversal ordered list of all
live cfs_rqs which have ever been active on the CPU; unfortunately,
this makes update_blocked_averages() O(# total cgroups) which isn't
scalable at all.

This shows up as a small CPU consumption and scheduling latency
increase in the load balancing path in systems with CPU controller
enabled across most cgroups.  In an edge case where temporary cgroups
were leaking, this caused the kernel to consume good several tens of
percents of CPU cycles running update_blocked_averages(), each run
taking multiple millisecs.

This patch fixes the issue by taking empty and fully decayed cfs_rqs
off the rq->leaf_cfs_rq_list.

Signed-off-by: Tejun Heo <tj@kernel.org>
[peterz: added cfs_rq_is_decayed()]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Chris Mason <clm@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170426004350.GB3222@wtj.duckdns.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 40 +++++++++++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c8fa777..ca08ae2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
+	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
+				 leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
@@ -463,7 +464,7 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
 
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -6953,10 +6954,28 @@ static void attach_tasks(struct lb_env *env)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->load.weight)
+		return false;
+
+	if (cfs_rq->avg.load_sum)
+		return false;
+
+	if (cfs_rq->avg.util_sum)
+		return false;
+
+	if (cfs_rq->runnable_load_sum)
+		return false;
+
+	return true;
+}
+
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 	struct rq_flags rf;
 
 	rq_lock_irqsave(rq, &rf);
@@ -6966,7 +6985,7 @@ static void update_blocked_averages(int cpu)
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 		struct sched_entity *se;
 
 		/* throttled entities do not contribute to load */
@@ -6980,6 +6999,13 @@ static void update_blocked_averages(int cpu)
 		se = cfs_rq->tg->se[cpu];
 		if (se && !skip_blocked_update(se))
 			update_load_avg(se, 0);
+
+		/*
+		 * There can be a lot of idle CPU cgroups.  Don't let fully
+		 * decayed cfs_rqs linger on the list.
+		 */
+		if (cfs_rq_is_decayed(cfs_rq))
+			list_del_leaf_cfs_rq(cfs_rq);
 	}
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -9503,10 +9529,10 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 
 	rcu_read_lock();
-	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
 		print_cfs_rq(m, cpu, cfs_rq);
 	rcu_read_unlock();
 }

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [tip:sched/core] sched/fair: Fix O(nr_cgroups) in load balance path
  2017-05-15  9:13   ` [tip:sched/core] sched/fair: Fix O(nr_cgroups) " tip-bot for Tejun Heo
@ 2017-05-15  9:43     ` Peter Zijlstra
  0 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2017-05-15  9:43 UTC (permalink / raw)
  To: pjt, efault, tj, hpa, torvalds, clm, mingo, vincent.guittot,
	linux-kernel, tglx
  Cc: linux-tip-commits

On Mon, May 15, 2017 at 02:13:17AM -0700, tip-bot for Tejun Heo wrote:
> @@ -463,7 +464,7 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
>  {
>  }
>  
> -#define for_each_leaf_cfs_rq(rq, cfs_rq) \
> +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
>  		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
>  
>  static inline struct sched_entity *parent_entity(struct sched_entity *se)


That generates a build warn on CGROUP=n SCHED_DEBUG=y

This seems to cure..

Ingo, could you fold or append?

---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ca08ae2a7877..219fe58e3023 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -465,7 +465,7 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
-		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [tip:sched/core] sched/fair: Fix O(nr_cgroups) in load balance path
  2017-04-26  0:43 ` [2/2] sched/fair: Fix O(# total cgroups) in load balance path Tejun Heo
  2017-05-01 16:11   ` Peter Zijlstra
  2017-05-15  9:13   ` [tip:sched/core] sched/fair: Fix O(nr_cgroups) " tip-bot for Tejun Heo
@ 2017-05-15 10:13   ` tip-bot for Tejun Heo
  2 siblings, 0 replies; 14+ messages in thread
From: tip-bot for Tejun Heo @ 2017-05-15 10:13 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: mingo, efault, pjt, vincent.guittot, clm, linux-kernel, peterz,
	torvalds, tj, hpa, tglx

Commit-ID:  a9e7f6544b9cebdae54d29f87a7ba2a83c0471b5
Gitweb:     http://git.kernel.org/tip/a9e7f6544b9cebdae54d29f87a7ba2a83c0471b5
Author:     Tejun Heo <tj@kernel.org>
AuthorDate: Tue, 25 Apr 2017 17:43:50 -0700
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 15 May 2017 12:07:44 +0200

sched/fair: Fix O(nr_cgroups) in load balance path

Currently, rq->leaf_cfs_rq_list is a traversal ordered list of all
live cfs_rqs which have ever been active on the CPU; unfortunately,
this makes update_blocked_averages() O(# total cgroups) which isn't
scalable at all.

This shows up as a small CPU consumption and scheduling latency
increase in the load balancing path in systems with CPU controller
enabled across most cgroups.  In an edge case where temporary cgroups
were leaking, this caused the kernel to consume good several tens of
percents of CPU cycles running update_blocked_averages(), each run
taking multiple millisecs.

This patch fixes the issue by taking empty and fully decayed cfs_rqs
off the rq->leaf_cfs_rq_list.

Signed-off-by: Tejun Heo <tj@kernel.org>
[ Added cfs_rq_is_decayed() ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Chris Mason <clm@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170426004350.GB3222@wtj.duckdns.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c8fa777..219fe58 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
+	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
+				 leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
@@ -463,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
 
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
+		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
@@ -6953,10 +6954,28 @@ static void attach_tasks(struct lb_env *env)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->load.weight)
+		return false;
+
+	if (cfs_rq->avg.load_sum)
+		return false;
+
+	if (cfs_rq->avg.util_sum)
+		return false;
+
+	if (cfs_rq->runnable_load_sum)
+		return false;
+
+	return true;
+}
+
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 	struct rq_flags rf;
 
 	rq_lock_irqsave(rq, &rf);
@@ -6966,7 +6985,7 @@ static void update_blocked_averages(int cpu)
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 		struct sched_entity *se;
 
 		/* throttled entities do not contribute to load */
@@ -6980,6 +6999,13 @@ static void update_blocked_averages(int cpu)
 		se = cfs_rq->tg->se[cpu];
 		if (se && !skip_blocked_update(se))
 			update_load_avg(se, 0);
+
+		/*
+		 * There can be a lot of idle CPU cgroups.  Don't let fully
+		 * decayed cfs_rqs linger on the list.
+		 */
+		if (cfs_rq_is_decayed(cfs_rq))
+			list_del_leaf_cfs_rq(cfs_rq);
 	}
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -9503,10 +9529,10 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 
 	rcu_read_lock();
-	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
 		print_cfs_rq(m, cpu, cfs_rq);
 	rcu_read_unlock();
 }

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2017-05-15 10:17 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-26  0:40 [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs Tejun Heo
2017-04-26  0:43 ` [2/2] sched/fair: Fix O(# total cgroups) in load balance path Tejun Heo
2017-05-01 16:11   ` Peter Zijlstra
2017-05-01 19:11     ` Tejun Heo
2017-05-15  9:13   ` [tip:sched/core] sched/fair: Fix O(nr_cgroups) " tip-bot for Tejun Heo
2017-05-15  9:43     ` Peter Zijlstra
2017-05-15 10:13   ` tip-bot for Tejun Heo
2017-05-01 16:24 ` [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs Peter Zijlstra
2017-05-01 16:59 ` Peter Zijlstra
2017-05-01 17:02   ` Peter Zijlstra
2017-05-01 19:07     ` Tejun Heo
2017-05-04 13:31 ` Peter Zijlstra
2017-05-04 17:31   ` Tejun Heo
2017-05-15  9:12   ` [tip:sched/core] " tip-bot for Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).