* [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs
@ 2017-04-26  0:40 Tejun Heo
  2017-04-26  0:43 ` [2/2] sched/fair: Fix O(# total cgroups) in load balance path Tejun Heo
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Tejun Heo @ 2017-04-26  0:40 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra
  Cc: "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com"

Currently, rq->leaf_cfs_rq_list is a traversal ordered list of all
live cfs_rqs which have ever been active on the CPU; unfortunately,
this makes update_blocked_averages() O(total number of CPU cgroups)
which isn't scalable at all.

The next patch will make rq->leaf_cfs_rq_list only contain the cfs_rqs
which are currently active.  In preparation, this patch converts users
which need to traverse all cfs_rqs to use task_groups list instead.

task_groups list is protected by its own lock.  While it allows RCU
protected traversal and the order of operations guarantees that all
online cfs_rqs will be visited, holding rq->lock won't protect against
iterating an already unregistered cfs_rq.  To avoid operating on an
already dead cfs_rq, a new state variable cfs_rq->online is added
which is protected by rq->lock and tracks whether the cfs_rq is
registered.

As clearing of cfs_rq->online should be protected by rq->lock, the
lock-avoidance optimization in unregister_fair_sched_group() is
removed.  Given that the optimization only matters when removing
cgroups which were created but never used, and given the general
heaviness of cgroup removal, the impact is unlikely to be noticeable.

Another change is that print_cfs_stats() would now print cfs_rqs in a
different order.  If this matters, we can improve it later so that it
first prints the cfs_rqs on leaf_cfs_rq_list and then follows up with
the rest.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Turner <pjt@google.com>
Cc: Chris Mason <clm@fb.com>
---
Hello,

This is another set of CPU controller performance fix patches,
based on top of -next as of today.

Thanks.

 kernel/sched/fair.c  |   45 +++++++++++++++++++++++++--------------------
 kernel/sched/sched.h |    1 +
 2 files changed, 26 insertions(+), 20 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4644,23 +4644,32 @@ static void destroy_cfs_bandwidth(struct
 
 static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+		if (!cfs_rq->online)
+			continue;
 
 		raw_spin_lock(&cfs_b->lock);
 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
 		raw_spin_unlock(&cfs_b->lock);
 	}
+	rcu_read_unlock();
 }
 
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		if (!cfs_rq->runtime_enabled)
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+		if (!cfs_rq->online || !cfs_rq->runtime_enabled)
 			continue;
 
 		/*
@@ -4677,6 +4686,7 @@ static void __maybe_unused unthrottle_of
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 	}
+	rcu_read_unlock();
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */
@@ -9345,6 +9355,7 @@ void online_fair_sched_group(struct task
 		se = tg->se[i];
 
 		raw_spin_lock_irq(&rq->lock);
+		se->my_q->online = 1;
 		update_rq_clock(rq);
 		attach_entity_cfs_rq(se);
 		sync_throttle(tg, i);
@@ -9355,24 +9366,18 @@ void online_fair_sched_group(struct task
 void unregister_fair_sched_group(struct task_group *tg)
 {
 	unsigned long flags;
-	struct rq *rq;
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+
 		if (tg->se[cpu])
 			remove_entity_load_avg(tg->se[cpu]);
 
-		/*
-		 * Only empty task groups can be destroyed; so we can speculatively
-		 * check on_list without danger of it being re-added.
-		 */
-		if (!tg->cfs_rq[cpu]->on_list)
-			continue;
-
-		rq = cpu_rq(cpu);
-
 		raw_spin_lock_irqsave(&rq->lock, flags);
-		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+		list_del_leaf_cfs_rq(cfs_rq);
+		cfs_rq->online = 0;
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 }
@@ -9523,11 +9528,11 @@ const struct sched_class fair_sched_clas
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
 
 	rcu_read_lock();
-	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
-		print_cfs_rq(m, cpu, cfs_rq);
+	list_for_each_entry_rcu(tg, &task_groups, list)
+		print_cfs_rq(m, cpu, tg->cfs_rq[cpu]);
 	rcu_read_unlock();
 }
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -448,6 +448,7 @@ struct cfs_rq {
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
+	int online;	/* online state, protected by rq->lock */
 
 	/*
 	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in


* [2/2] sched/fair: Fix O(# total cgroups) in load balance path
  2017-04-26  0:40 [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs Tejun Heo
@ 2017-04-26  0:43 ` Tejun Heo
  2017-05-01 16:11   ` Peter Zijlstra
                     ` (2 more replies)
  2017-05-01 16:24 ` [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs Peter Zijlstra
                   ` (2 subsequent siblings)
  3 siblings, 3 replies; 14+ messages in thread
From: Tejun Heo @ 2017-04-26  0:43 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra
  Cc: "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com"

Currently, rq->leaf_cfs_rq_list is a traversal ordered list of all
live cfs_rqs which have ever been active on the CPU; unfortunately,
this makes update_blocked_averages() O(# total cgroups) which isn't
scalable at all.

This shows up as a small increase in CPU consumption and scheduling
latency in the load balancing path on systems with the CPU controller
enabled across most cgroups.  In an edge case where temporary cgroups
were leaking, this caused the kernel to consume a good several tens of
percent of its CPU cycles running update_blocked_averages(), each run
taking multiple milliseconds.

This patch fixes the issue by taking empty and fully decayed cfs_rqs
off the rq->leaf_cfs_rq_list.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Turner <pjt@google.com>
Cc: Chris Mason <clm@fb.com>
---
 kernel/sched/fair.c |   19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(
 }
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
+	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
+				 leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
@@ -463,7 +464,7 @@ static inline void list_del_leaf_cfs_rq(
 {
 }
 
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -6983,7 +6984,7 @@ static void attach_tasks(struct lb_env *
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 	struct rq_flags rf;
 
 	rq_lock_irqsave(rq, &rf);
@@ -6993,7 +6994,7 @@ static void update_blocked_averages(int
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 		struct sched_entity *se;
 
 		/* throttled entities do not contribute to load */
@@ -7007,6 +7008,14 @@ static void update_blocked_averages(int
 		se = cfs_rq->tg->se[cpu];
 		if (se && !skip_blocked_update(se))
 			update_load_avg(se, 0);
+
+		/*
+		 * There can be a lot of idle CPU cgroups.  Don't let fully
+		 * decayed cfs_rqs linger on the list.
+		 */
+		if (!cfs_rq->load.weight && !cfs_rq->avg.load_sum &&
+		    !cfs_rq->avg.util_sum && !cfs_rq->runnable_load_sum)
+			list_del_leaf_cfs_rq(cfs_rq);
 	}
 	rq_unlock_irqrestore(rq, &rf);
 }


* Re: [2/2] sched/fair: Fix O(# total cgroups) in load balance path
  2017-04-26  0:43 ` [2/2] sched/fair: Fix O(# total cgroups) in load balance path Tejun Heo
@ 2017-05-01 16:11   ` Peter Zijlstra
  2017-05-01 19:11     ` Tejun Heo
  2017-05-15  9:13   ` [tip:sched/core] sched/fair: Fix O(nr_cgroups) " tip-bot for Tejun Heo
  2017-05-15 10:13   ` tip-bot for Tejun Heo
  2 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2017-05-01 16:11 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com"

On Tue, Apr 25, 2017 at 05:43:50PM -0700, Tejun Heo wrote:
> @@ -7007,6 +7008,14 @@ static void update_blocked_averages(int
>  		se = cfs_rq->tg->se[cpu];
>  		if (se && !skip_blocked_update(se))
>  			update_load_avg(se, 0);
> +
> +		/*
> +		 * There can be a lot of idle CPU cgroups.  Don't let fully
> +		 * decayed cfs_rqs linger on the list.
> +		 */
> +		if (!cfs_rq->load.weight && !cfs_rq->avg.load_sum &&
> +		    !cfs_rq->avg.util_sum && !cfs_rq->runnable_load_sum)
> +			list_del_leaf_cfs_rq(cfs_rq);
>  	}
>  	rq_unlock_irqrestore(rq, &rf);
>  }

Right this is a 'known' issue and we recently talked about this.

I think you got the condition right, we want to wait for all the stuff
to be decayed out before taking it off the list.

The only 'problem', which Vincent mentioned in that other thread, is that
NOHZ idle doesn't guarantee decay -- then again, you don't want to go
wake a CPU just to decay this crud either. And if we're idle, the list
being long doesn't matter either.
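
FWIW, the check might read a little nicer factored into a small
helper -- a rough sketch only:

static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
{
	/* nothing queued on the cfs_rq and all PELT sums fully decayed */
	return !cfs_rq->load.weight && !cfs_rq->avg.load_sum &&
	       !cfs_rq->avg.util_sum && !cfs_rq->runnable_load_sum;
}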


* Re: [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs
  2017-04-26  0:40 [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs Tejun Heo
  2017-04-26  0:43 ` [2/2] sched/fair: Fix O(# total cgroups) in load balance path Tejun Heo
@ 2017-05-01 16:24 ` Peter Zijlstra
  2017-05-01 16:59 ` Peter Zijlstra
  2017-05-04 13:31 ` Peter Zijlstra
  3 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2017-05-01 16:24 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com"

On Tue, Apr 25, 2017 at 05:40:39PM -0700, Tejun Heo wrote:

> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4644,23 +4644,32 @@ static void destroy_cfs_bandwidth(struct
>  
>  static void __maybe_unused update_runtime_enabled(struct rq *rq)
>  {
> -	struct cfs_rq *cfs_rq;
> +	struct task_group *tg;
>  
> -	for_each_leaf_cfs_rq(rq, cfs_rq) {
> -		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(tg, &task_groups, list) {
> +		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
> +		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> +
> +		if (!cfs_rq->online)
> +			continue;
>  
>  		raw_spin_lock(&cfs_b->lock);
>  		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
>  		raw_spin_unlock(&cfs_b->lock);
>  	}
> +	rcu_read_unlock();
>  }
>  
>  static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
>  {
> -	struct cfs_rq *cfs_rq;
> +	struct task_group *tg;
>  
> -	for_each_leaf_cfs_rq(rq, cfs_rq) {
> -		if (!cfs_rq->runtime_enabled)
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(tg, &task_groups, list) {
> +		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> +
> +		if (!cfs_rq->online || !cfs_rq->runtime_enabled)
>  			continue;
>  
>  		/*
> @@ -4677,6 +4686,7 @@ static void __maybe_unused unthrottle_of
>  		if (cfs_rq_throttled(cfs_rq))
>  			unthrottle_cfs_rq(cfs_rq);
>  	}
> +	rcu_read_unlock();
>  }

Note that both these are called with rq->lock held. I don't think we
need to actually add rcu_read_lock() here, as it will fully serialize
against your ->online = 0 that holds rq->lock.

Also, arguably you can keep using for_each_leaf_cfs_rq() for unthrottle,
because I don't think we can (should?) remove a throttled group from the
leaf list -- it's not empty after all.

Then again, this is CPU hotplug code and nobody cares about its
performance much, and it's better to be safe than sorry, so yes, use
task_groups list to make sure to reach all groups.


* Re: [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs
  2017-04-26  0:40 [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs Tejun Heo
  2017-04-26  0:43 ` [2/2] sched/fair: Fix O(# total cgroups) in load balance path Tejun Heo
  2017-05-01 16:24 ` [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs Peter Zijlstra
@ 2017-05-01 16:59 ` Peter Zijlstra
  2017-05-01 17:02   ` Peter Zijlstra
  2017-05-04 13:31 ` Peter Zijlstra
  3 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2017-05-01 16:59 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com",
	Paul McKenney

On Tue, Apr 25, 2017 at 05:40:39PM -0700, Tejun Heo wrote:
> @@ -9355,24 +9366,18 @@ void online_fair_sched_group(struct task
>  void unregister_fair_sched_group(struct task_group *tg)
>  {
>  	unsigned long flags;
> -	struct rq *rq;
>  	int cpu;
>  
>  	for_each_possible_cpu(cpu) {
> +		struct rq *rq = cpu_rq(cpu);
> +		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
> +
>  		if (tg->se[cpu])
>  			remove_entity_load_avg(tg->se[cpu]);
>  
> -		/*
> -		 * Only empty task groups can be destroyed; so we can speculatively
> -		 * check on_list without danger of it being re-added.
> -		 */
> -		if (!tg->cfs_rq[cpu]->on_list)
> -			continue;
> -
> -		rq = cpu_rq(cpu);
> -
>  		raw_spin_lock_irqsave(&rq->lock, flags);
> -		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
> +		list_del_leaf_cfs_rq(cfs_rq);
> +		cfs_rq->online = 0;
>  		raw_spin_unlock_irqrestore(&rq->lock, flags);
>  	}
>  }

It would be nice to be able to retain that on_list check and avoid
taking all those rq->lock's.

Since per the previous mail, those online/offline loops are protected by
rq->lock, all we need is to ensure an rcu_sched GP passes between here
and sched_free_group().

Which I think we can do differently, see below. Then we don't need the
->online thing and can keep using the ->on_list check.


> @@ -9523,11 +9528,11 @@ const struct sched_class fair_sched_clas
>  #ifdef CONFIG_SCHED_DEBUG
>  void print_cfs_stats(struct seq_file *m, int cpu)
>  {
> -	struct cfs_rq *cfs_rq;
> +	struct task_group *tg;
>  
>  	rcu_read_lock();
> -	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
> -		print_cfs_rq(m, cpu, cfs_rq);
> +	list_for_each_entry_rcu(tg, &task_groups, list)
> +		print_cfs_rq(m, cpu, tg->cfs_rq[cpu]);
>  	rcu_read_unlock();
>  }

If you have a gazillion empty groups, do we really want to print them?



diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 74d9c3a1feee..8bad2c371b18 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -58,6 +58,11 @@ static inline void cond_synchronize_sched(unsigned long oldstate)
 	might_sleep();
 }
 
+static inline bool should_synchronize_sched(unsigned long oldstate)
+{
+	return true;
+}
+
 extern void rcu_barrier_bh(void);
 extern void rcu_barrier_sched(void);
 
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 0bacb6b2af69..9918b632db27 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -78,6 +78,7 @@ unsigned long get_state_synchronize_rcu(void);
 void cond_synchronize_rcu(unsigned long oldstate);
 unsigned long get_state_synchronize_sched(void);
 void cond_synchronize_sched(unsigned long oldstate);
+bool should_synchronize_sched(unsigned long oldstate);
 
 extern unsigned long rcutorture_testseq;
 extern unsigned long rcutorture_vernum;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 91fff49d5869..59eb0d115217 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3423,6 +3423,18 @@ void cond_synchronize_sched(unsigned long oldstate)
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_sched);
 
+bool should_synchronize_sched(unsigned long oldstate)
+{
+	unsigned long newstate;
+
+	/*
+	 * Ensure that this load happens before any RCU-destructive
+	 * actions the caller might carry out after we return.
+	 */
+	newstate = smp_load_acquire(&rcu_sched_state.completed);
+	return ULONG_CMP_GE(oldstate, newstate);
+}
+
 /*
  * Check to see if there is any immediate RCU-related work to be done
  * by the current CPU, for the specified type of RCU, returning 1 if so.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ecad8e2d2a29..d7630b34d197 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6362,7 +6362,7 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
-static void sched_free_group(struct task_group *tg)
+static void __sched_free_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
@@ -6370,6 +6370,22 @@ static void sched_free_group(struct task_group *tg)
 	kmem_cache_free(task_group_cache, tg);
 }
 
+static void sched_free_group_rcu_sched(struct rcu_head *rhp)
+{
+	/* Now it should be safe to free those cfs_rqs: */
+	__sched_free_group(container_of(rhp, struct task_group, rcu));
+}
+
+static void sched_free_group(struct task_group *tg)
+{
+	if (should_synchronize_sched(tg->rcu_stamp)) {
+		call_rcu_sched(&tg->rcu, sched_free_group_rcu_sched);
+		return;
+	}
+
+	__sched_free_group(tg);
+}
+
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(struct task_group *parent)
 {
@@ -6434,6 +6450,8 @@ void sched_offline_group(struct task_group *tg)
 	list_del_rcu(&tg->list);
 	list_del_rcu(&tg->siblings);
 	spin_unlock_irqrestore(&task_group_lock, flags);
+
+	tg->rcu_stamp = get_state_synchronize_sched();
 }
 
 static void sched_change_group(struct task_struct *tsk, int type)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2e5e9e64c86..1dbab563e798 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -302,6 +302,7 @@ struct task_group {
 #endif
 
 	struct rcu_head rcu;
+	unsigned long rcu_stamp;
 	struct list_head list;
 
 	struct task_group *parent;


* Re: [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs
  2017-05-01 16:59 ` Peter Zijlstra
@ 2017-05-01 17:02   ` Peter Zijlstra
  2017-05-01 19:07     ` Tejun Heo
  0 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2017-05-01 17:02 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com",
	Paul McKenney

On Mon, May 01, 2017 at 06:59:39PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 25, 2017 at 05:40:39PM -0700, Tejun Heo wrote:
> > @@ -9355,24 +9366,18 @@ void online_fair_sched_group(struct task
> >  void unregister_fair_sched_group(struct task_group *tg)
> >  {
> >  	unsigned long flags;
> > -	struct rq *rq;
> >  	int cpu;
> >  
> >  	for_each_possible_cpu(cpu) {
> > +		struct rq *rq = cpu_rq(cpu);
> > +		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
> > +
> >  		if (tg->se[cpu])
> >  			remove_entity_load_avg(tg->se[cpu]);
> >  
> > -		/*
> > -		 * Only empty task groups can be destroyed; so we can speculatively
> > -		 * check on_list without danger of it being re-added.
> > -		 */
> > -		if (!tg->cfs_rq[cpu]->on_list)
> > -			continue;
> > -
> > -		rq = cpu_rq(cpu);
> > -
> >  		raw_spin_lock_irqsave(&rq->lock, flags);
> > -		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
> > +		list_del_leaf_cfs_rq(cfs_rq);
> > +		cfs_rq->online = 0;
> >  		raw_spin_unlock_irqrestore(&rq->lock, flags);
> >  	}
> >  }
> 
> It would be nice to be able to retain that on_list check and avoid
> taking all those rq->lock's.
> 
> Since per the previous mail, those online/offline loops are protected by
> rq->lock, all we need is to ensure an rcu_sched GP passes between here
> and sched_free_group().
> 
> Which I think we can do differently, see below. Then we don't need the
> ->online thing and can keep using the ->on_list check.

n/m, I need to stop staring at a screen. Wrapping those two sites in
rcu_read_lock() achieves the very same.

So we want the rcu_read_lock() to serialize against sched_free_group,
but don't need the new ->online thing and can retain the ->on_list
stuff. Or I've completely lost the plot (which is entirely possible...)

I'll stare at this again tomorrow


* Re: [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs
  2017-05-01 17:02   ` Peter Zijlstra
@ 2017-05-01 19:07     ` Tejun Heo
  0 siblings, 0 replies; 14+ messages in thread
From: Tejun Heo @ 2017-05-01 19:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com",
	Paul McKenney

Hello,

On Mon, May 01, 2017 at 07:02:41PM +0200, Peter Zijlstra wrote:
> n/m, I need to stop staring at a screen. Wrapping those two sites in
> rcu_read_lock() achieves the very same.
> 
> So we want the rcu_read_lock() to serialize against sched_free_group,
> but don't need the new ->online thing and can retain the ->on_list
> stuff. Or I've completely lost the plot (which is entirely possible...)
> 
> I'll stare at this again tomorrow

So, the rcu_read_lock() thing protects against sched_free_group() and
thanks to the order of operations, all online cfs_rq's are guaranteed
to be visible in the two callbacks; however, nothing prevents the code
paths from seeing already dead cfs_rqs, which *may* be okay if the
code paths are safe to run on dead and unlinked cfs_rqs, but it's
still nasty and fragile.

The new ->online condition, which is synchronized by rq->lock,
guarantees that both functions only process live ones.  We need
something synchronized by rq->lock to guarantee this, whether that's a
new list entry or a flag like ->online.  It'd be better to encapsulate
and document this iteration.
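
Something along these lines, perhaps -- just a sketch with a made-up
name, to show the shape such an iterator could take:

/*
 * Walk every cfs_rq of @rq whose group is still registered.  The
 * task_groups list is protected by RCU and ->online by rq->lock, so
 * the caller must hold both rcu_read_lock() and rq->lock.
 */
#define for_each_online_cfs_rq(rq, tg, cfs_rq)				\
	list_for_each_entry_rcu(tg, &task_groups, list)			\
		for (cfs_rq = tg->cfs_rq[cpu_of(rq)];			\
		     cfs_rq && cfs_rq->online; cfs_rq = NULL)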

Thanks.

-- 
tejun


* Re: [2/2] sched/fair: Fix O(# total cgroups) in load balance path
  2017-05-01 16:11   ` Peter Zijlstra
@ 2017-05-01 19:11     ` Tejun Heo
  0 siblings, 0 replies; 14+ messages in thread
From: Tejun Heo @ 2017-05-01 19:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com"

Hello, Peter.

On Mon, May 01, 2017 at 06:11:58PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 25, 2017 at 05:43:50PM -0700, Tejun Heo wrote:
> > @@ -7007,6 +7008,14 @@ static void update_blocked_averages(int
> >  		se = cfs_rq->tg->se[cpu];
> >  		if (se && !skip_blocked_update(se))
> >  			update_load_avg(se, 0);
> > +
> > +		/*
> > +		 * There can be a lot of idle CPU cgroups.  Don't let fully
> > +		 * decayed cfs_rqs linger on the list.
> > +		 */
> > +		if (!cfs_rq->load.weight && !cfs_rq->avg.load_sum &&
> > +		    !cfs_rq->avg.util_sum && !cfs_rq->runnable_load_sum)
> > +			list_del_leaf_cfs_rq(cfs_rq);
> >  	}
> >  	rq_unlock_irqrestore(rq, &rf);
> >  }
> 
> Right this is a 'known' issue and we recently talked about this.
> 
> I think you got the condition right, we want to wait for all the stuff
> to be decayed out before taking it off the list.
> 
> The only 'problem', which Vincent mentioned in that other thread, is that
> NOHZ idle doesn't guarantee decay -- then again, you don't want to go
> wake a CPU just to decay this crud either. And if we're idle, the list
> being long doesn't matter either.

The list staying long is fine as long as nobody walks it; however, the
list can be *really* long, e.g. hundreds of thousands of entries, so
walking it repeatedly won't be a good idea even if the system is idle.
As long as NOHZ decays and trims the list when it ends up walking the
list, and AFAICS it does, it should be fine.

Thanks.

-- 
tejun


* Re: [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs
  2017-04-26  0:40 [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs Tejun Heo
                   ` (2 preceding siblings ...)
  2017-05-01 16:59 ` Peter Zijlstra
@ 2017-05-04 13:31 ` Peter Zijlstra
  2017-05-04 17:31   ` Tejun Heo
  2017-05-15  9:12   ` [tip:sched/core] " tip-bot for Peter Zijlstra
  3 siblings, 2 replies; 14+ messages in thread
From: Peter Zijlstra @ 2017-05-04 13:31 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com"

On Tue, Apr 25, 2017 at 05:40:39PM -0700, Tejun Heo wrote:

> task_groups list is protected by its own lock.  While it allows RCU
> protected traversal and the order of operations guarantees that all
> online cfs_rqs will be visited, holding rq->lock won't protect against
> iterating an already unregistered cfs_rq.

So I think the below is sufficient.

Yes we can hit an (almost) dead cfs_rq, but poking the bandwidth
variables thereof is harmless.

unthrottle_cfs_rq() also stops doing anything much when it finds the
cfs_rq is empty, which must be the case if we're removing it.

I don't know Paul's opinion on RCU GPs happening while stop_machine(),
but just in case he feels that's fair game, I did add the
rcu_read_lock() thingies.

The lockdep assert is mostly documentation, to more easily see
it is indeed held when we get there.

I left print_cfs_stats using the leaf list, no point in printing stuff
that's empty.

This way we can avoid taking all RQ locks on cgroup destruction.

---
 kernel/sched/fair.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cd6c3f957d44..c897ad6d714c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4644,22 +4644,32 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 
 static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+	lockdep_assert_held(&rq->lock);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
 		raw_spin_lock(&cfs_b->lock);
 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
 		raw_spin_unlock(&cfs_b->lock);
 	}
+	rcu_read_unlock();
 }
 
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
+
+	lockdep_assert_held(&rq->lock);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
 		if (!cfs_rq->runtime_enabled)
 			continue;
 
@@ -4677,6 +4687,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 	}
+	rcu_read_unlock();
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */


* Re: [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs
  2017-05-04 13:31 ` Peter Zijlstra
@ 2017-05-04 17:31   ` Tejun Heo
  2017-05-15  9:12   ` [tip:sched/core] " tip-bot for Peter Zijlstra
  1 sibling, 0 replies; 14+ messages in thread
From: Tejun Heo @ 2017-05-04 17:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, "linux-kernel@vger.kernel.org",
	Linus Torvalds, Mike Galbraith, Paul Turner, Chris Mason,
	"kernel-team@fb.com"

Hello, Peter.

On Thu, May 04, 2017 at 03:31:22PM +0200, Peter Zijlstra wrote:
> Yes we can hit an (almost) dead cfs_rq, but poking the bandwidth
> variables thereof is harmless.
> 
> unthrottle_cfs_rq() also stops doing anything much when it finds the
> cfs_rq is empty, which must be the case if we're removing it.

Yeah, if you're okay with calling the functions on dead cfs_rq's, just
wrapping with rcu_read_lock should be enough.

> I don't know Paul's opinion on RCU GPs happening while stop_machine(),
> but just in case he feels that's fair game, I did add the
> rcu_read_lock() thingies.
> 
> The lockdep assert is mostly documentation, to more easily see
> it is indeed held when we get there.
> 
> I left print_cfs_stats using the leaf list, no point in printing stuff
> that's empty.
> 
> This way we can avoid taking all RQ locks on cgroup destruction.

Looks good to me.

Thanks.

-- 
tejun


* [tip:sched/core] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs
  2017-05-04 13:31 ` Peter Zijlstra
  2017-05-04 17:31   ` Tejun Heo
@ 2017-05-15  9:12   ` tip-bot for Peter Zijlstra
  1 sibling, 0 replies; 14+ messages in thread
From: tip-bot for Peter Zijlstra @ 2017-05-15  9:12 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: clm, efault, peterz, linux-kernel, tj, hpa, torvalds, tglx, mingo, pjt

Commit-ID:  502ce005ab95d5d9481768649dbab808845b24d7
Gitweb:     http://git.kernel.org/tip/502ce005ab95d5d9481768649dbab808845b24d7
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 4 May 2017 15:31:22 +0200
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 15 May 2017 10:15:35 +0200

sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs

In order to allow leaf_cfs_rq_list to remove entries, switch the
bandwidth hotplug code over to the task_groups list.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170504133122.a6qjlj3hlblbjxux@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index eede181..c8fa777 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4642,24 +4642,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	hrtimer_cancel(&cfs_b->slack_timer);
 }
 
+/*
+ * Both these cpu hotplug callbacks race against unregister_fair_sched_group()
+ *
+ * The race is harmless, since modifying bandwidth settings of unhooked group
+ * bits doesn't do much.
+ */
+
+/* cpu online callback */
 static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+	lockdep_assert_held(&rq->lock);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
 		raw_spin_lock(&cfs_b->lock);
 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
 		raw_spin_unlock(&cfs_b->lock);
 	}
+	rcu_read_unlock();
 }
 
+/* cpu offline callback */
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
+
+	lockdep_assert_held(&rq->lock);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
 		if (!cfs_rq->runtime_enabled)
 			continue;
 
@@ -4677,6 +4696,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 	}
+	rcu_read_unlock();
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */


* [tip:sched/core] sched/fair: Fix O(nr_cgroups) in load balance path
  2017-04-26  0:43 ` [2/2] sched/fair: Fix O(# total cgroups) in load balance path Tejun Heo
  2017-05-01 16:11   ` Peter Zijlstra
@ 2017-05-15  9:13   ` tip-bot for Tejun Heo
  2017-05-15  9:43     ` Peter Zijlstra
  2017-05-15 10:13   ` tip-bot for Tejun Heo
  2 siblings, 1 reply; 14+ messages in thread
From: tip-bot for Tejun Heo @ 2017-05-15  9:13 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: efault, pjt, tj, hpa, torvalds, clm, vincent.guittot, mingo,
	peterz, linux-kernel, tglx

Commit-ID:  d517a114fc478a618eeee86c3fdf22db68b6d831
Gitweb:     http://git.kernel.org/tip/d517a114fc478a618eeee86c3fdf22db68b6d831
Author:     Tejun Heo <tj@kernel.org>
AuthorDate: Tue, 25 Apr 2017 17:43:50 -0700
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 15 May 2017 10:15:36 +0200

sched/fair: Fix O(nr_cgroups) in load balance path

Currently, rq->leaf_cfs_rq_list is a traversal ordered list of all
live cfs_rqs which have ever been active on the CPU; unfortunately,
this makes update_blocked_averages() O(# total cgroups) which isn't
scalable at all.

This shows up as a small increase in CPU consumption and scheduling
latency in the load balancing path on systems with the CPU controller
enabled across most cgroups.  In an edge case where temporary cgroups
were leaking, this caused the kernel to consume a good several tens of
percent of its CPU cycles running update_blocked_averages(), each run
taking multiple milliseconds.

This patch fixes the issue by taking empty and fully decayed cfs_rqs
off the rq->leaf_cfs_rq_list.

Signed-off-by: Tejun Heo <tj@kernel.org>
[peterz: added cfs_rq_is_decayed()]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Chris Mason <clm@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170426004350.GB3222@wtj.duckdns.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 40 +++++++++++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c8fa777..ca08ae2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
+	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
+				 leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
@@ -463,7 +464,7 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
 
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -6953,10 +6954,28 @@ static void attach_tasks(struct lb_env *env)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->load.weight)
+		return false;
+
+	if (cfs_rq->avg.load_sum)
+		return false;
+
+	if (cfs_rq->avg.util_sum)
+		return false;
+
+	if (cfs_rq->runnable_load_sum)
+		return false;
+
+	return true;
+}
+
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 	struct rq_flags rf;
 
 	rq_lock_irqsave(rq, &rf);
@@ -6966,7 +6985,7 @@ static void update_blocked_averages(int cpu)
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 		struct sched_entity *se;
 
 		/* throttled entities do not contribute to load */
@@ -6980,6 +6999,13 @@ static void update_blocked_averages(int cpu)
 		se = cfs_rq->tg->se[cpu];
 		if (se && !skip_blocked_update(se))
 			update_load_avg(se, 0);
+
+		/*
+		 * There can be a lot of idle CPU cgroups.  Don't let fully
+		 * decayed cfs_rqs linger on the list.
+		 */
+		if (cfs_rq_is_decayed(cfs_rq))
+			list_del_leaf_cfs_rq(cfs_rq);
 	}
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -9503,10 +9529,10 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 
 	rcu_read_lock();
-	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
 		print_cfs_rq(m, cpu, cfs_rq);
 	rcu_read_unlock();
 }


* Re: [tip:sched/core] sched/fair: Fix O(nr_cgroups) in load balance path
  2017-05-15  9:13   ` [tip:sched/core] sched/fair: Fix O(nr_cgroups) " tip-bot for Tejun Heo
@ 2017-05-15  9:43     ` Peter Zijlstra
  0 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2017-05-15  9:43 UTC (permalink / raw)
  To: pjt, efault, tj, hpa, torvalds, clm, mingo, vincent.guittot,
	linux-kernel, tglx
  Cc: linux-tip-commits

On Mon, May 15, 2017 at 02:13:17AM -0700, tip-bot for Tejun Heo wrote:
> @@ -463,7 +464,7 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
>  {
>  }
>  
> -#define for_each_leaf_cfs_rq(rq, cfs_rq) \
> +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
>  		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
>  
>  static inline struct sched_entity *parent_entity(struct sched_entity *se)


That generates a build warning on a CGROUP=n, SCHED_DEBUG=y build.

This seems to cure it.

Ingo, could you fold or append?

---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ca08ae2a7877..219fe58e3023 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -465,7 +465,7 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
-		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {


* [tip:sched/core] sched/fair: Fix O(nr_cgroups) in load balance path
  2017-04-26  0:43 ` [2/2] sched/fair: Fix O(# total cgroups) in load balance path Tejun Heo
  2017-05-01 16:11   ` Peter Zijlstra
  2017-05-15  9:13   ` [tip:sched/core] sched/fair: Fix O(nr_cgroups) " tip-bot for Tejun Heo
@ 2017-05-15 10:13   ` tip-bot for Tejun Heo
  2 siblings, 0 replies; 14+ messages in thread
From: tip-bot for Tejun Heo @ 2017-05-15 10:13 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: mingo, efault, pjt, vincent.guittot, clm, linux-kernel, peterz,
	torvalds, tj, hpa, tglx

Commit-ID:  a9e7f6544b9cebdae54d29f87a7ba2a83c0471b5
Gitweb:     http://git.kernel.org/tip/a9e7f6544b9cebdae54d29f87a7ba2a83c0471b5
Author:     Tejun Heo <tj@kernel.org>
AuthorDate: Tue, 25 Apr 2017 17:43:50 -0700
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 15 May 2017 12:07:44 +0200

sched/fair: Fix O(nr_cgroups) in load balance path

Currently, rq->leaf_cfs_rq_list is a traversal ordered list of all
live cfs_rqs which have ever been active on the CPU; unfortunately,
this makes update_blocked_averages() O(# total cgroups) which isn't
scalable at all.

This shows up as a small increase in CPU consumption and scheduling
latency in the load balancing path on systems with the CPU controller
enabled across most cgroups.  In an edge case where temporary cgroups
were leaking, this caused the kernel to consume a good several tens of
percent of its CPU cycles running update_blocked_averages(), each run
taking multiple milliseconds.

This patch fixes the issue by taking empty and fully decayed cfs_rqs
off the rq->leaf_cfs_rq_list.

Signed-off-by: Tejun Heo <tj@kernel.org>
[ Added cfs_rq_is_decayed() ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Chris Mason <clm@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170426004350.GB3222@wtj.duckdns.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c8fa777..219fe58 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
+	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
+				 leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
@@ -463,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
 
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
+		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
@@ -6953,10 +6954,28 @@ static void attach_tasks(struct lb_env *env)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->load.weight)
+		return false;
+
+	if (cfs_rq->avg.load_sum)
+		return false;
+
+	if (cfs_rq->avg.util_sum)
+		return false;
+
+	if (cfs_rq->runnable_load_sum)
+		return false;
+
+	return true;
+}
+
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 	struct rq_flags rf;
 
 	rq_lock_irqsave(rq, &rf);
@@ -6966,7 +6985,7 @@ static void update_blocked_averages(int cpu)
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 		struct sched_entity *se;
 
 		/* throttled entities do not contribute to load */
@@ -6980,6 +6999,13 @@ static void update_blocked_averages(int cpu)
 		se = cfs_rq->tg->se[cpu];
 		if (se && !skip_blocked_update(se))
 			update_load_avg(se, 0);
+
+		/*
+		 * There can be a lot of idle CPU cgroups.  Don't let fully
+		 * decayed cfs_rqs linger on the list.
+		 */
+		if (cfs_rq_is_decayed(cfs_rq))
+			list_del_leaf_cfs_rq(cfs_rq);
 	}
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -9503,10 +9529,10 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 
 	rcu_read_lock();
-	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
 		print_cfs_rq(m, cpu, cfs_rq);
 	rcu_read_unlock();
 }

