* [PATCH RFC] sched: move h_load calculation to task_h_load
@ 2013-07-13  8:47 Vladimir Davydov
  2013-07-15  8:28 ` Peter Zijlstra
  0 siblings, 1 reply; 7+ messages in thread
From: Vladimir Davydov @ 2013-07-13  8:47 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra; +Cc: linux-kernel, devel

The bad thing about update_h_load(), which computes the hierarchical
load factor for task groups, is that it is called for each task group
in the system before every load balancer run, and since rebalance can
be triggered very often, this function can eat a lot of cpu time if
there are many cpu cgroups in the system.

Although the situation was improved significantly by commit a35b646
('sched, cgroup: Reduce rq->lock hold times for large cgroup
hierarchies'), the problem can still arise under some kinds of load,
e.g. when cpus are switching from idle to busy and back very frequently.

For instance, when I start 1000 processes that wake up every
millisecond on my 8-cpu host, 'top' and 'perf top' show:

Cpu(s): 17.8%us, 24.3%sy,  0.0%ni, 57.9%id,  0.0%wa,  0.0%hi,  0.0%si
Events: 243K cycles
  7.57%  [kernel]               [k] __schedule
  7.08%  [kernel]               [k] timerqueue_add
  6.13%  libc-2.12.so           [.] usleep

Then if I create 10000 *idle* cpu cgroups (no processes in them), cpu
usage increases significantly although the 'wakers' are still executing
in the root cpu cgroup:

Cpu(s): 19.1%us, 48.7%sy,  0.0%ni, 31.6%id,  0.0%wa,  0.0%hi,  0.7%si
Events: 230K cycles
 24.56%  [kernel]            [k] tg_load_down
  5.76%  [kernel]            [k] __schedule

This happens because this particular kind of load triggers 'new idle'
rebalance very frequently, which requires calling update_h_load(),
which, in turn, calls tg_load_down() for every *idle* cpu cgroup even
though this is absolutely useless, because idle cpu cgroups have no
tasks to pull.

This patch tries to improve the situation by making the h_load
calculation proceed only when h_load is really necessary. To achieve
this, it replaces update_h_load() with update_cfs_rq_h_load(), which
computes h_load only for a given cfs_rq and all its ascendants, and
makes the load balancer call this function whenever it considers
whether a task should be pulled, i.e. it moves the h_load calculation
directly into task_h_load(). To avoid updating the h_load of the same
cfs_rq multiple times (in case several tasks in the same cgroup are
considered during the same balance run), the patch records the time of
the last h_load update for each cfs_rq and stops the calculation as
soon as it finds h_load to be up to date.
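
The quantity being propagated is, roughly, the following recurrence
(just a sketch of what the code below computes, where child_se is the
child group's sched_entity on the given cpu):

    h_load(root cfs_rq)  = rq->avg.load_avg_contrib
    h_load(child cfs_rq) = h_load(parent cfs_rq) *
                           child_se->avg.load_avg_contrib /
                           (parent cfs_rq->runnable_load_avg + 1)

    task_h_load(p) = p->se.avg.load_avg_contrib * cfs_rq->h_load /
                     (cfs_rq->runnable_load_avg + 1)

so the h_load of a cfs_rq depends only on its ancestors, which is what
makes the lazy, per-chain update possible.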

The benefit is that h_load is computed only for those cfs_rq's that
really need it; in particular, all idle task groups are skipped.
Although this in fact moves the h_load calculation under the rq lock,
it should not affect latency much, because the amount of work done
under the rq lock while trying to pull tasks is limited by
sched_nr_migrate.
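
To put rough numbers on it (illustrative only, assuming the default
sysctl_sched_nr_migrate value of 32, the setup above, and a rebalance
run that gets past the existing jiffy throttle):

    before: update_h_load() walks the whole task group tree, i.e.
            ~10000 tg_load_down() calls, almost all of them for
            empty cgroups;
    after:  at most ~sched_nr_migrate (32) candidate tasks are
            examined, each updating h_load only along its own
            ancestor chain; with all wakers in the root cgroup this
            amounts to a single trivial update per balance run.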

With the patch applied, using the setup described above (1000 wakers
in the root cgroup and 10000 idle cgroups), I get:

Cpu(s): 16.9%us, 24.8%sy,  0.0%ni, 58.4%id,  0.0%wa,  0.0%hi,  0.0%si
Events: 242K cycles
  7.57%  [kernel]                  [k] __schedule
  6.70%  [kernel]                  [k] timerqueue_add
  5.93%  libc-2.12.so              [.] usleep

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
---
 kernel/sched/fair.c  |   56 ++++++++++++++++++++++----------------------------
 kernel/sched/sched.h |    7 +++----
 2 files changed, 28 insertions(+), 35 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f77f9c5..de90690 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4171,47 +4171,46 @@ static void update_blocked_averages(int cpu)
 }
 
 /*
- * Compute the cpu's hierarchical load factor for each task group.
+ * Compute the hierarchical load factor for cfs_rq and all its ascendants.
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
  */
-static int tg_load_down(struct task_group *tg, void *data)
+static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
 {
+	struct rq *rq = rq_of(cfs_rq);
+	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
 	unsigned long load;
-	long cpu = (long)data;
 
-	if (!tg->parent) {
-		load = cpu_rq(cpu)->avg.load_avg_contrib;
-	} else {
-		load = tg->parent->cfs_rq[cpu]->h_load;
-		load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
-				tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
+	cfs_rq->h_load_next = NULL;
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		cfs_rq->h_load_next = se;
+		if (cfs_rq->last_h_load_update == rq->clock)
+			break;
 	}
 
-	tg->cfs_rq[cpu]->h_load = load;
-
-	return 0;
-}
-
-static void update_h_load(long cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long now = jiffies;
-
-	if (rq->h_load_throttle == now)
-		return;
-
-	rq->h_load_throttle = now;
+	if (!se) {
+		cfs_rq->h_load = rq->avg.load_avg_contrib;
+		cfs_rq->last_h_load_update = rq->clock;
+	}
 
-	rcu_read_lock();
-	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-	rcu_read_unlock();
+	while ((se = cfs_rq->h_load_next) != NULL) {
+		load = cfs_rq->h_load;
+		load = div64_ul(load * se->avg.load_avg_contrib,
+				cfs_rq->runnable_load_avg + 1);
+		cfs_rq = group_cfs_rq(se);
+		cfs_rq->h_load = load;
+		cfs_rq->last_h_load_update = rq->clock;
+	}
 }
 
 static unsigned long task_h_load(struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
 
+	if (cfs_rq->last_h_load_update != rq_of(cfs_rq)->clock)
+		update_cfs_rq_h_load(cfs_rq);
+
 	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
 			cfs_rq->runnable_load_avg + 1);
 }
@@ -4220,10 +4219,6 @@ static inline void update_blocked_averages(int cpu)
 {
 }
 
-static inline void update_h_load(long cpu)
-{
-}
-
 static unsigned long task_h_load(struct task_struct *p)
 {
 	return p->se.avg.load_avg_contrib;
@@ -5108,7 +5103,6 @@ redo:
 		env.src_rq    = busiest;
 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
-		update_h_load(env.src_cpu);
 more_balance:
 		local_irq_save(flags);
 		double_rq_lock(env.dst_rq, busiest);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef0a7b2..5e129ef 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -285,7 +285,6 @@ struct cfs_rq {
 	/* Required to track per-cpu representation of a task_group */
 	u32 tg_runnable_contrib;
 	unsigned long tg_load_contrib;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 	/*
 	 *   h_load = weight * f(tg)
@@ -294,6 +293,9 @@ struct cfs_rq {
 	 * this group.
 	 */
 	unsigned long h_load;
+	u64 last_h_load_update;
+	struct sched_entity *h_load_next;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -429,9 +431,6 @@ struct rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
 	struct list_head leaf_cfs_rq_list;
-#ifdef CONFIG_SMP
-	unsigned long h_load_throttle;
-#endif /* CONFIG_SMP */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
-- 
1.7.10.4



* Re: [PATCH RFC] sched: move h_load calculation to task_h_load
  2013-07-13  8:47 [PATCH RFC] sched: move h_load calculation to task_h_load Vladimir Davydov
@ 2013-07-15  8:28 ` Peter Zijlstra
  2013-07-15 10:00   ` Vladimir Davydov
  0 siblings, 1 reply; 7+ messages in thread
From: Peter Zijlstra @ 2013-07-15  8:28 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: Ingo Molnar, linux-kernel, devel, pjt

On Sat, Jul 13, 2013 at 12:47:15PM +0400, Vladimir Davydov wrote:

> ---
>  kernel/sched/fair.c  |   56 ++++++++++++++++++++++----------------------------
>  kernel/sched/sched.h |    7 +++----
>  2 files changed, 28 insertions(+), 35 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index f77f9c5..de90690 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4171,47 +4171,46 @@ static void update_blocked_averages(int cpu)
>  }
>  
>  /*
> + * Compute the hierarchical load factor for cfs_rq and all its ascendants.
>   * This needs to be done in a top-down fashion because the load of a child
>   * group is a fraction of its parents load.
>   */
> +static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
>  {
> +	struct rq *rq = rq_of(cfs_rq);
> +	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
>  	unsigned long load;
>  
> +	cfs_rq->h_load_next = NULL;
> +	for_each_sched_entity(se) {
> +		cfs_rq = cfs_rq_of(se);
> +		cfs_rq->h_load_next = se;
> +		if (cfs_rq->last_h_load_update == rq->clock)
> +			break;
>  	}
>  
> +	if (!se) {
> +		cfs_rq->h_load = rq->avg.load_avg_contrib;
> +		cfs_rq->last_h_load_update = rq->clock;
> +	}
>  
> +	while ((se = cfs_rq->h_load_next) != NULL) {
> +		load = cfs_rq->h_load;
> +		load = div64_ul(load * se->avg.load_avg_contrib,
> +				cfs_rq->runnable_load_avg + 1);
> +		cfs_rq = group_cfs_rq(se);
> +		cfs_rq->h_load = load;
> +		cfs_rq->last_h_load_update = rq->clock;
> +	}
>  }
>  
>  static unsigned long task_h_load(struct task_struct *p)
>  {
>  	struct cfs_rq *cfs_rq = task_cfs_rq(p);
>  
> +	if (cfs_rq->last_h_load_update != rq_of(cfs_rq)->clock)
> +		update_cfs_rq_h_load(cfs_rq);
> +
>  	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
>  			cfs_rq->runnable_load_avg + 1);
>  }

OK, fair enough. It does somewhat rely on us getting the single
rq->clock update thing right, but that should be ok.

But yeah, when you have stupidly many cgroups, you quickly need far fewer
h_load instances than there are cgroups.


* Re: [PATCH RFC] sched: move h_load calculation to task_h_load
  2013-07-15  8:28 ` Peter Zijlstra
@ 2013-07-15 10:00   ` Vladimir Davydov
  2013-07-15 10:59     ` Peter Zijlstra
  0 siblings, 1 reply; 7+ messages in thread
From: Vladimir Davydov @ 2013-07-15 10:00 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, linux-kernel, devel, pjt

On 07/15/2013 12:28 PM, Peter Zijlstra wrote:
> OK, fair enough. It does somewhat rely on us getting the single
> rq->clock update thing right, but that should be ok.

Frankly, I doubt that rq->clock is the right thing to use here, because
it can be updated very frequently under some conditions, so that
cfs_rq->h_load can get updated several times during the same balance
run. Perhaps jiffies would suit that purpose better. It would work as an
h_load update throttle, similar to how it works now (I mean
rq->h_load_throttle in update_h_load()).
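
Something like this at the top of update_cfs_rq_h_load() is what I
have in mind (just a sketch):

    unsigned long now = jiffies;    /* advances once per tick (1/HZ) */

    if (cfs_rq->last_h_load_update == now)
        return;                     /* already refreshed this jiffy */

whereas rq->clock is a nanosecond counter that may be bumped by every
update_rq_clock() call, so two task_h_load() calls in the same balance
run could see different values and recompute h_load for nothing.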

If there is anything else you don't like or have thoughts on, please
share - I'll try to improve.

Thank you for your feedback.

> But yeah, when you have stupidly many cgroups, you quickly need far fewer
> h_load instances than there are cgroups.


* Re: [PATCH RFC] sched: move h_load calculation to task_h_load
  2013-07-15 10:00   ` Vladimir Davydov
@ 2013-07-15 10:59     ` Peter Zijlstra
  2013-07-15 13:49       ` [PATCH v2] " Vladimir Davydov
  0 siblings, 1 reply; 7+ messages in thread
From: Peter Zijlstra @ 2013-07-15 10:59 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: Ingo Molnar, linux-kernel, devel, pjt

On Mon, Jul 15, 2013 at 02:00:42PM +0400, Vladimir Davydov wrote:
> On 07/15/2013 12:28 PM, Peter Zijlstra wrote:
> >OK, fair enough. It does somewhat rely on us getting the single
> >rq->clock update thing right, but that should be ok.
> 
> Frankly, I doubt that rq->clock is the right thing to use here, because it
> can be updated very frequently under some conditions, so that cfs_rq->h_load
> can get updated several times during the same balance run. Perhaps jiffies
> would suit that purpose better. It would work as an h_load update throttle,
> similar to how it works now (I mean rq->h_load_throttle in update_h_load()).
> 
> If there is anything else you don't like or have thoughts on, please
> share - I'll try to improve.

Yeah, go with jiffies, that makes more sense.

Other than that, when I initially saw your patch I thought about 'fixing'
walk_tg_tree() to ignore groups that have no tasks, but that's not going to
help for the case where all groups have 1 (or more) tasks that are inactive.

We also cannot use cfs_rq::h_nr_running because that's per cpu and not system
wide.

So yeah, after a bit of a ponder I agreed that your approach was the most
feasible.

So unless someone else has a bright idea, I think what you propose is a good
solution.


* [PATCH v2] sched: move h_load calculation to task_h_load
  2013-07-15 10:59     ` Peter Zijlstra
@ 2013-07-15 13:49       ` Vladimir Davydov
  2013-07-16 15:50         ` Peter Zijlstra
  2013-07-24  3:56         ` [tip:perf/core] sched: Move h_load calculation to task_h_load() tip-bot for Vladimir Davydov
  0 siblings, 2 replies; 7+ messages in thread
From: Vladimir Davydov @ 2013-07-15 13:49 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra; +Cc: pjt, linux-kernel, devel

The bad thing about update_h_load(), which computes the hierarchical
load factor for task groups, is that it is called for each task group
in the system before every load balancer run, and since rebalance can
be triggered very often, this function can eat a lot of cpu time if
there are many cpu cgroups in the system.

Although the situation was improved significantly by commit a35b646
('sched, cgroup: Reduce rq->lock hold times for large cgroup
hierarchies'), the problem can still arise under some kinds of load,
e.g. when cpus are switching from idle to busy and back very frequently.

For instance, when I start 1000 processes that wake up every
millisecond on my 8-cpu host, 'top' and 'perf top' show:

Cpu(s): 17.8%us, 24.3%sy,  0.0%ni, 57.9%id,  0.0%wa,  0.0%hi,  0.0%si
Events: 243K cycles
  7.57%  [kernel]               [k] __schedule
  7.08%  [kernel]               [k] timerqueue_add
  6.13%  libc-2.12.so           [.] usleep

Then if I create 10000 *idle* cpu cgroups (no processes in them), cpu
usage increases significantly although the 'wakers' are still executing
in the root cpu cgroup:

Cpu(s): 19.1%us, 48.7%sy,  0.0%ni, 31.6%id,  0.0%wa,  0.0%hi,  0.7%si
Events: 230K cycles
 24.56%  [kernel]            [k] tg_load_down
  5.76%  [kernel]            [k] __schedule

This happens because this particular kind of load triggers 'new idle'
rebalance very frequently, which requires calling update_h_load(),
which, in turn, calls tg_load_down() for every *idle* cpu cgroup even
though this is absolutely useless, because idle cpu cgroups have no
tasks to pull.

This patch tries to improve the situation by making the h_load
calculation proceed only when h_load is really necessary. To achieve
this, it replaces update_h_load() with update_cfs_rq_h_load(), which
computes h_load only for a given cfs_rq and all its ascendants, and
makes the load balancer call this function whenever it considers
whether a task should be pulled, i.e. it moves the h_load calculation
directly into task_h_load(). To avoid updating the h_load of the same
cfs_rq multiple times (in case several tasks in the same cgroup are
considered during the same balance run), the patch records the time of
the last h_load update for each cfs_rq and stops the calculation as
soon as it finds h_load to be up to date.

The benefit is that h_load is computed only for those cfs_rq's that
really need it; in particular, all idle task groups are skipped.
Although this in fact moves the h_load calculation under the rq lock,
it should not affect latency much, because the amount of work done
under the rq lock while trying to pull tasks is limited by
sched_nr_migrate.

With the patch applied, using the setup described above (1000 wakers
in the root cgroup and 10000 idle cgroups), I get:

Cpu(s): 16.9%us, 24.8%sy,  0.0%ni, 58.4%id,  0.0%wa,  0.0%hi,  0.0%si
Events: 242K cycles
  7.57%  [kernel]                  [k] __schedule
  6.70%  [kernel]                  [k] timerqueue_add
  5.93%  libc-2.12.so              [.] usleep

Changes in v2:
 * use jiffies instead of rq->clock for last_h_load_update.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
---
 kernel/sched/fair.c  |   58 +++++++++++++++++++++++---------------------------
 kernel/sched/sched.h |    7 +++---
 2 files changed, 30 insertions(+), 35 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f77f9c5..12d0137 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4171,47 +4171,48 @@ static void update_blocked_averages(int cpu)
 }
 
 /*
- * Compute the cpu's hierarchical load factor for each task group.
+ * Compute the hierarchical load factor for cfs_rq and all its ascendants.
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
  */
-static int tg_load_down(struct task_group *tg, void *data)
+static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
 {
-	unsigned long load;
-	long cpu = (long)data;
-
-	if (!tg->parent) {
-		load = cpu_rq(cpu)->avg.load_avg_contrib;
-	} else {
-		load = tg->parent->cfs_rq[cpu]->h_load;
-		load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
-				tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
-	}
-
-	tg->cfs_rq[cpu]->h_load = load;
-
-	return 0;
-}
-
-static void update_h_load(long cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
+	struct rq *rq = rq_of(cfs_rq);
+	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
 	unsigned long now = jiffies;
+	unsigned long load;
 
-	if (rq->h_load_throttle == now)
+	if (cfs_rq->last_h_load_update == now)
 		return;
 
-	rq->h_load_throttle = now;
+	cfs_rq->h_load_next = NULL;
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		cfs_rq->h_load_next = se;
+		if (cfs_rq->last_h_load_update == now)
+			break;
+	}
 
-	rcu_read_lock();
-	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-	rcu_read_unlock();
+	if (!se) {
+		cfs_rq->h_load = rq->avg.load_avg_contrib;
+		cfs_rq->last_h_load_update = now;
+	}
+
+	while ((se = cfs_rq->h_load_next) != NULL) {
+		load = cfs_rq->h_load;
+		load = div64_ul(load * se->avg.load_avg_contrib,
+				cfs_rq->runnable_load_avg + 1);
+		cfs_rq = group_cfs_rq(se);
+		cfs_rq->h_load = load;
+		cfs_rq->last_h_load_update = now;
+	}
 }
 
 static unsigned long task_h_load(struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
 
+	update_cfs_rq_h_load(cfs_rq);
 	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
 			cfs_rq->runnable_load_avg + 1);
 }
@@ -4220,10 +4221,6 @@ static inline void update_blocked_averages(int cpu)
 {
 }
 
-static inline void update_h_load(long cpu)
-{
-}
-
 static unsigned long task_h_load(struct task_struct *p)
 {
 	return p->se.avg.load_avg_contrib;
@@ -5108,7 +5105,6 @@ redo:
 		env.src_rq    = busiest;
 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
-		update_h_load(env.src_cpu);
 more_balance:
 		local_irq_save(flags);
 		double_rq_lock(env.dst_rq, busiest);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef0a7b2..5e129ef 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -285,7 +285,6 @@ struct cfs_rq {
 	/* Required to track per-cpu representation of a task_group */
 	u32 tg_runnable_contrib;
 	unsigned long tg_load_contrib;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 	/*
 	 *   h_load = weight * f(tg)
@@ -294,6 +293,9 @@ struct cfs_rq {
 	 * this group.
 	 */
 	unsigned long h_load;
+	u64 last_h_load_update;
+	struct sched_entity *h_load_next;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -429,9 +431,6 @@ struct rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
 	struct list_head leaf_cfs_rq_list;
-#ifdef CONFIG_SMP
-	unsigned long h_load_throttle;
-#endif /* CONFIG_SMP */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
-- 
1.7.10.4



* Re: [PATCH v2] sched: move h_load calculation to task_h_load
  2013-07-15 13:49       ` [PATCH v2] " Vladimir Davydov
@ 2013-07-16 15:50         ` Peter Zijlstra
  2013-07-24  3:56         ` [tip:perf/core] sched: Move h_load calculation to task_h_load() tip-bot for Vladimir Davydov
  1 sibling, 0 replies; 7+ messages in thread
From: Peter Zijlstra @ 2013-07-16 15:50 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: Ingo Molnar, pjt, linux-kernel, devel

On Mon, Jul 15, 2013 at 05:49:19PM +0400, Vladimir Davydov wrote:
> The bad thing about update_h_load(), which computes the hierarchical
> load factor for task groups, is that it is called for each task group
> in the system before every load balancer run, and since rebalance can
> be triggered very often, this function can eat a lot of cpu time if
> there are many cpu cgroups in the system.
> 
> Although the situation was improved significantly by commit a35b646
> ('sched, cgroup: Reduce rq->lock hold times for large cgroup
> hierarchies'), the problem can still arise under some kinds of load,
> e.g. when cpus are switching from idle to busy and back very frequently.
> 
> For instance, when I start 1000 processes that wake up every
> millisecond on my 8-cpu host, 'top' and 'perf top' show:
> 
> Cpu(s): 17.8%us, 24.3%sy,  0.0%ni, 57.9%id,  0.0%wa,  0.0%hi,  0.0%si
> Events: 243K cycles
>   7.57%  [kernel]               [k] __schedule
>   7.08%  [kernel]               [k] timerqueue_add
>   6.13%  libc-2.12.so           [.] usleep
> 
> Then if I create 10000 *idle* cpu cgroups (no processes in them), cpu
> usage increases significantly although the 'wakers' are still executing
> in the root cpu cgroup:
> 
> Cpu(s): 19.1%us, 48.7%sy,  0.0%ni, 31.6%id,  0.0%wa,  0.0%hi,  0.7%si
> Events: 230K cycles
>  24.56%  [kernel]            [k] tg_load_down
>   5.76%  [kernel]            [k] __schedule
> 
> This happens because this particular kind of load triggers 'new idle'
> rebalance very frequently, which requires calling update_h_load(),
> which, in turn, calls tg_load_down() for every *idle* cpu cgroup even
> though this is absolutely useless, because idle cpu cgroups have no
> tasks to pull.
> 
> This patch tries to improve the situation by making the h_load
> calculation proceed only when h_load is really necessary. To achieve
> this, it replaces update_h_load() with update_cfs_rq_h_load(), which
> computes h_load only for a given cfs_rq and all its ascendants, and
> makes the load balancer call this function whenever it considers
> whether a task should be pulled, i.e. it moves the h_load calculation
> directly into task_h_load(). To avoid updating the h_load of the same
> cfs_rq multiple times (in case several tasks in the same cgroup are
> considered during the same balance run), the patch records the time of
> the last h_load update for each cfs_rq and stops the calculation as
> soon as it finds h_load to be up to date.
> 
> The benefit is that h_load is computed only for those cfs_rq's that
> really need it; in particular, all idle task groups are skipped.
> Although this in fact moves the h_load calculation under the rq lock,
> it should not affect latency much, because the amount of work done
> under the rq lock while trying to pull tasks is limited by
> sched_nr_migrate.
> 
> With the patch applied, using the setup described above (1000 wakers
> in the root cgroup and 10000 idle cgroups), I get:
> 
> Cpu(s): 16.9%us, 24.8%sy,  0.0%ni, 58.4%id,  0.0%wa,  0.0%hi,  0.0%si
> Events: 242K cycles
>   7.57%  [kernel]                  [k] __schedule
>   6.70%  [kernel]                  [k] timerqueue_add
>   5.93%  libc-2.12.so              [.] usleep
> 
> Changes in v2:
>  * use jiffies instead of rq->clock for last_h_load_update.
> 
> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>

Thanks!


* [tip:perf/core] sched: Move h_load calculation to task_h_load()
  2013-07-15 13:49       ` [PATCH v2] " Vladimir Davydov
  2013-07-16 15:50         ` Peter Zijlstra
@ 2013-07-24  3:56         ` tip-bot for Vladimir Davydov
  1 sibling, 0 replies; 7+ messages in thread
From: tip-bot for Vladimir Davydov @ 2013-07-24  3:56 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, a.p.zijlstra, tglx, vdavydov

Commit-ID:  685207963be973fbb73550db6edaf920a283e1a7
Gitweb:     http://git.kernel.org/tip/685207963be973fbb73550db6edaf920a283e1a7
Author:     Vladimir Davydov <vdavydov@parallels.com>
AuthorDate: Mon, 15 Jul 2013 17:49:19 +0400
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Tue, 23 Jul 2013 12:18:41 +0200

sched: Move h_load calculation to task_h_load()

The bad thing about update_h_load(), which computes the hierarchical
load factor for task groups, is that it is called for each task group
in the system before every load balancer run, and since rebalance can
be triggered very often, this function can eat a lot of cpu time if
there are many cpu cgroups in the system.

Although the situation was improved significantly by commit a35b646
('sched, cgroup: Reduce rq->lock hold times for large cgroup
hierarchies'), the problem can still arise under some kinds of load,
e.g. when cpus are switching from idle to busy and back very frequently.

For instance, when I start 1000 processes that wake up every
millisecond on my 8-cpu host, 'top' and 'perf top' show:

Cpu(s): 17.8%us, 24.3%sy,  0.0%ni, 57.9%id,  0.0%wa,  0.0%hi,  0.0%si
Events: 243K cycles
  7.57%  [kernel]               [k] __schedule
  7.08%  [kernel]               [k] timerqueue_add
  6.13%  libc-2.12.so           [.] usleep

Then if I create 10000 *idle* cpu cgroups (no processes in them), cpu
usage increases significantly although the 'wakers' are still executing
in the root cpu cgroup:

Cpu(s): 19.1%us, 48.7%sy,  0.0%ni, 31.6%id,  0.0%wa,  0.0%hi,  0.7%si
Events: 230K cycles
 24.56%  [kernel]            [k] tg_load_down
  5.76%  [kernel]            [k] __schedule

This happens because this particular kind of load triggers 'new idle'
rebalance very frequently, which requires calling update_h_load(),
which, in turn, calls tg_load_down() for every *idle* cpu cgroup even
though this is absolutely useless, because idle cpu cgroups have no
tasks to pull.

This patch tries to improve the situation by making the h_load
calculation proceed only when h_load is really necessary. To achieve
this, it replaces update_h_load() with update_cfs_rq_h_load(), which
computes h_load only for a given cfs_rq and all its ascendants, and
makes the load balancer call this function whenever it considers
whether a task should be pulled, i.e. it moves the h_load calculation
directly into task_h_load(). To avoid updating the h_load of the same
cfs_rq multiple times (in case several tasks in the same cgroup are
considered during the same balance run), the patch records the time of
the last h_load update for each cfs_rq and stops the calculation as
soon as it finds h_load to be up to date.

The benefit is that h_load is computed only for those cfs_rq's that
really need it; in particular, all idle task groups are skipped.
Although this in fact moves the h_load calculation under the rq lock,
it should not affect latency much, because the amount of work done
under the rq lock while trying to pull tasks is limited by
sched_nr_migrate.

With the patch applied, using the setup described above (1000 wakers
in the root cgroup and 10000 idle cgroups), I get:

Cpu(s): 16.9%us, 24.8%sy,  0.0%ni, 58.4%id,  0.0%wa,  0.0%hi,  0.0%si
Events: 242K cycles
  7.57%  [kernel]                  [k] __schedule
  6.70%  [kernel]                  [k] timerqueue_add
  5.93%  libc-2.12.so              [.] usleep

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1373896159-1278-1-git-send-email-vdavydov@parallels.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c  | 58 ++++++++++++++++++++++++----------------------------
 kernel/sched/sched.h |  7 +++----
 2 files changed, 30 insertions(+), 35 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bb456f4..765d87a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4171,47 +4171,48 @@ static void update_blocked_averages(int cpu)
 }
 
 /*
- * Compute the cpu's hierarchical load factor for each task group.
+ * Compute the hierarchical load factor for cfs_rq and all its ascendants.
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
  */
-static int tg_load_down(struct task_group *tg, void *data)
+static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
 {
-	unsigned long load;
-	long cpu = (long)data;
-
-	if (!tg->parent) {
-		load = cpu_rq(cpu)->avg.load_avg_contrib;
-	} else {
-		load = tg->parent->cfs_rq[cpu]->h_load;
-		load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
-				tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
-	}
-
-	tg->cfs_rq[cpu]->h_load = load;
-
-	return 0;
-}
-
-static void update_h_load(long cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
+	struct rq *rq = rq_of(cfs_rq);
+	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
 	unsigned long now = jiffies;
+	unsigned long load;
 
-	if (rq->h_load_throttle == now)
+	if (cfs_rq->last_h_load_update == now)
 		return;
 
-	rq->h_load_throttle = now;
+	cfs_rq->h_load_next = NULL;
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		cfs_rq->h_load_next = se;
+		if (cfs_rq->last_h_load_update == now)
+			break;
+	}
 
-	rcu_read_lock();
-	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-	rcu_read_unlock();
+	if (!se) {
+		cfs_rq->h_load = rq->avg.load_avg_contrib;
+		cfs_rq->last_h_load_update = now;
+	}
+
+	while ((se = cfs_rq->h_load_next) != NULL) {
+		load = cfs_rq->h_load;
+		load = div64_ul(load * se->avg.load_avg_contrib,
+				cfs_rq->runnable_load_avg + 1);
+		cfs_rq = group_cfs_rq(se);
+		cfs_rq->h_load = load;
+		cfs_rq->last_h_load_update = now;
+	}
 }
 
 static unsigned long task_h_load(struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
 
+	update_cfs_rq_h_load(cfs_rq);
 	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
 			cfs_rq->runnable_load_avg + 1);
 }
@@ -4220,10 +4221,6 @@ static inline void update_blocked_averages(int cpu)
 {
 }
 
-static inline void update_h_load(long cpu)
-{
-}
-
 static unsigned long task_h_load(struct task_struct *p)
 {
 	return p->se.avg.load_avg_contrib;
@@ -5108,7 +5105,6 @@ redo:
 		env.src_rq    = busiest;
 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
-		update_h_load(env.src_cpu);
 more_balance:
 		local_irq_save(flags);
 		double_rq_lock(env.dst_rq, busiest);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef0a7b2..5e129ef 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -285,7 +285,6 @@ struct cfs_rq {
 	/* Required to track per-cpu representation of a task_group */
 	u32 tg_runnable_contrib;
 	unsigned long tg_load_contrib;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 	/*
 	 *   h_load = weight * f(tg)
@@ -294,6 +293,9 @@ struct cfs_rq {
 	 * this group.
 	 */
 	unsigned long h_load;
+	u64 last_h_load_update;
+	struct sched_entity *h_load_next;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -429,9 +431,6 @@ struct rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
 	struct list_head leaf_cfs_rq_list;
-#ifdef CONFIG_SMP
-	unsigned long h_load_throttle;
-#endif /* CONFIG_SMP */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED


Thread overview: 7+ messages
2013-07-13  8:47 [PATCH RFC] sched: move h_load calculation to task_h_load Vladimir Davydov
2013-07-15  8:28 ` Peter Zijlstra
2013-07-15 10:00   ` Vladimir Davydov
2013-07-15 10:59     ` Peter Zijlstra
2013-07-15 13:49       ` [PATCH v2] " Vladimir Davydov
2013-07-16 15:50         ` Peter Zijlstra
2013-07-24  3:56         ` [tip:perf/core] sched: Move h_load calculation to task_h_load() tip-bot for Vladimir Davydov
