linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: tip-bot for Tejun Heo <tipbot@zytor.com>
To: linux-tip-commits@vger.kernel.org
Cc: efault@gmx.de, pjt@google.com, tj@kernel.org, hpa@zytor.com,
	torvalds@linux-foundation.org, clm@fb.com,
	vincent.guittot@linaro.org, mingo@kernel.org,
	peterz@infradead.org, linux-kernel@vger.kernel.org,
	tglx@linutronix.de
Subject: [tip:sched/core] sched/fair: Fix O(nr_cgroups) in load balance path
Date: Mon, 15 May 2017 02:13:17 -0700	[thread overview]
Message-ID: <tip-d517a114fc478a618eeee86c3fdf22db68b6d831@git.kernel.org> (raw)
In-Reply-To: <20170426004350.GB3222@wtj.duckdns.org>

Commit-ID:  d517a114fc478a618eeee86c3fdf22db68b6d831
Gitweb:     http://git.kernel.org/tip/d517a114fc478a618eeee86c3fdf22db68b6d831
Author:     Tejun Heo <tj@kernel.org>
AuthorDate: Tue, 25 Apr 2017 17:43:50 -0700
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 15 May 2017 10:15:36 +0200

sched/fair: Fix O(nr_cgroups) in load balance path

Currently, rq->leaf_cfs_rq_list is a traversal ordered list of all
live cfs_rqs which have ever been active on the CPU; unfortunately,
this makes update_blocked_averages() O(# total cgroups) which isn't
scalable at all.

This shows up as a small CPU consumption and scheduling latency
increase in the load balancing path in systems with CPU controller
enabled across most cgroups.  In an edge case where temporary cgroups
were leaking, this caused the kernel to consume good several tens of
percents of CPU cycles running update_blocked_averages(), each run
taking multiple millisecs.

This patch fixes the issue by taking empty and fully decayed cfs_rqs
off the rq->leaf_cfs_rq_list.

Signed-off-by: Tejun Heo <tj@kernel.org>
[peterz: added cfs_rq_is_decayed()]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Chris Mason <clm@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170426004350.GB3222@wtj.duckdns.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 40 +++++++++++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c8fa777..ca08ae2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
+	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
+				 leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
@@ -463,7 +464,7 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
 
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -6953,10 +6954,28 @@ static void attach_tasks(struct lb_env *env)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->load.weight)
+		return false;
+
+	if (cfs_rq->avg.load_sum)
+		return false;
+
+	if (cfs_rq->avg.util_sum)
+		return false;
+
+	if (cfs_rq->runnable_load_sum)
+		return false;
+
+	return true;
+}
+
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 	struct rq_flags rf;
 
 	rq_lock_irqsave(rq, &rf);
@@ -6966,7 +6985,7 @@ static void update_blocked_averages(int cpu)
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 		struct sched_entity *se;
 
 		/* throttled entities do not contribute to load */
@@ -6980,6 +6999,13 @@ static void update_blocked_averages(int cpu)
 		se = cfs_rq->tg->se[cpu];
 		if (se && !skip_blocked_update(se))
 			update_load_avg(se, 0);
+
+		/*
+		 * There can be a lot of idle CPU cgroups.  Don't let fully
+		 * decayed cfs_rqs linger on the list.
+		 */
+		if (cfs_rq_is_decayed(cfs_rq))
+			list_del_leaf_cfs_rq(cfs_rq);
 	}
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -9503,10 +9529,10 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 
 	rcu_read_lock();
-	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
 		print_cfs_rq(m, cpu, cfs_rq);
 	rcu_read_unlock();
 }

  parent reply	other threads:[~2017-05-15  9:18 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-04-26  0:40 [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs Tejun Heo
2017-04-26  0:43 ` [2/2] sched/fair: Fix O(# total cgroups) in load balance path Tejun Heo
2017-05-01 16:11   ` Peter Zijlstra
2017-05-01 19:11     ` Tejun Heo
2017-05-15  9:13   ` tip-bot for Tejun Heo [this message]
2017-05-15  9:43     ` [tip:sched/core] sched/fair: Fix O(nr_cgroups) " Peter Zijlstra
2017-05-15 10:13   ` tip-bot for Tejun Heo
2017-05-01 16:24 ` [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs Peter Zijlstra
2017-05-01 16:59 ` Peter Zijlstra
2017-05-01 17:02   ` Peter Zijlstra
2017-05-01 19:07     ` Tejun Heo
2017-05-04 13:31 ` Peter Zijlstra
2017-05-04 17:31   ` Tejun Heo
2017-05-15  9:12   ` [tip:sched/core] " tip-bot for Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=tip-d517a114fc478a618eeee86c3fdf22db68b6d831@git.kernel.org \
    --to=tipbot@zytor.com \
    --cc=clm@fb.com \
    --cc=efault@gmx.de \
    --cc=hpa@zytor.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-tip-commits@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=peterz@infradead.org \
    --cc=pjt@google.com \
    --cc=tglx@linutronix.de \
    --cc=tj@kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=vincent.guittot@linaro.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).