* [PATCH v4] sched: Fast idling of CPU when system is partially loaded
From: Tim Chen @ 2014-06-23 19:16 UTC
  To: Ingo Molnar, Peter Zijlstra
  Cc: Andrew Morton, Davidlohr Bueso, Alex Shi, Andi Kleen,
	Michel Lespinasse, Rik van Riel, Peter Hurley, Thomas Gleixner,
	Paul E.McKenney, Jason Low, linux-kernel

Thanks to Jason, Andi and Peter for the reviews.  I've updated
the code with the simplified logic Peter suggested.

When a system is lightly loaded (i.e. no more than 1 job per cpu),
attempting to pull a job to a cpu before putting it to idle is unnecessary
and can be skipped.  This patch adds an indicator so the scheduler can know
when there is no more than 1 active job on any CPU in the system and
skip the needless job pulls.

On a 4 socket machine with a request/response kind of workload from
clients, we saw about 0.13 msec of delay whenever we went through a full
load balance to try to pull jobs from all the other cpus.  Since only
0.1 msec was spent on processing the request and generating a response,
the load balance overhead exceeded the actual work being done.  This
overhead can be skipped much of the time for lightly loaded systems.

With this patch, we tested with a netperf request/response workload that
has the server busy with half the cpus in a 4 socket system.  We found
the patch eliminated 75% of the load balance attempts before idling a cpu.

The overhead of setting/clearing the indicator is low, as we already gather
the necessary info while we call add_nr_running and update_sd_lb_stats.
We switch to full load balancing immediately if any cpu gets more than
one job on its run queue in add_nr_running.  We clear the indicator,
to avoid load balancing, when we detect that no cpu has more than one
job while we scan the run queues in update_sg_lb_stats.  We are aggressive
in turning on the load balancing and opportunistic in skipping it.
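
For illustration only (not part of the patch): a minimal user-space C
sketch of the indicator logic described above, assuming a toy rq array
and a single boolean stand in for the kernel's run queues and
root_domain->overload.  The real code is in the diff below.

/*
 * Toy model of the overload indicator: set eagerly when any run queue
 * gains a second job, cleared only after a full scan finds no queue
 * with more than one job, and checked before doing an (expensive)
 * idle balance.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS	4

struct toy_rq { unsigned int nr_running; };

static struct toy_rq rqs[NR_CPUS];
static bool rd_overload;		/* stands in for root_domain->overload */

/* stands in for add_nr_running(): aggressive about turning the flag on */
static void add_job(int cpu)
{
	rqs[cpu].nr_running++;
	if (rqs[cpu].nr_running >= 2)
		rd_overload = true;
}

/* stands in for update_sd_lb_stats(): clear only after scanning every cpu */
static void rescan_overload(void)
{
	bool overload = false;
	int i;

	for (i = 0; i < NR_CPUS; i++)
		if (rqs[i].nr_running > 1)
			overload = true;
	rd_overload = overload;
}

/* stands in for the early exit added to idle_balance() */
static void maybe_idle_balance(int cpu)
{
	if (!rd_overload) {
		printf("cpu%d: no cpu is overloaded, skip pulling jobs\n", cpu);
		return;
	}
	printf("cpu%d: some cpu is overloaded, try to pull a job\n", cpu);
}

int main(void)
{
	add_job(0);			/* one job per cpu: not overloaded */
	add_job(1);
	maybe_idle_balance(2);		/* skips the pull attempt */

	add_job(0);			/* cpu0 now has two jobs: flag turns on */
	maybe_idle_balance(2);		/* goes through with the pull attempt */

	rqs[0].nr_running = 1;		/* load drains away */
	rescan_overload();		/* periodic stats scan clears the flag */
	maybe_idle_balance(2);		/* back to skipping */
	return 0;
}

This mirrors the asymmetry in the patch: the flag is turned on from the
fast path and only turned off from the periodic load balance statistics
pass.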

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Acked-by: Jason Low <jason.low2@hp.com>
---
Change Log:
v4:
1. Remove an unnecessary check of the root domain before updating the
local copy of the overload indicator.
 
v3:
1. Simplify the logic that calls update_sg_lb_stats from update_sd_lb_stats.

v2:
1. Move the check of whether load balance is required into idle_balance.
2. Use more direct access to the root domain.

 kernel/sched/fair.c  | 21 ++++++++++++++++++---
 kernel/sched/sched.h | 10 ++++++++--
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fea7d33..7dfe2ad 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5867,7 +5867,8 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
  */
 static inline void update_sg_lb_stats(struct lb_env *env,
 			struct sched_group *group, int load_idx,
-			int local_group, struct sg_lb_stats *sgs)
+			int local_group, struct sg_lb_stats *sgs,
+			bool *overload)
 {
 	unsigned long load;
 	int i;
@@ -5885,6 +5886,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 		sgs->group_load += load;
 		sgs->sum_nr_running += rq->nr_running;
+
+		if (rq->nr_running > 1)
+			*overload = true;
+
 #ifdef CONFIG_NUMA_BALANCING
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -5995,6 +6000,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats tmp_sgs;
 	int load_idx, prefer_sibling = 0;
+	bool overload = false;
 
 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;
@@ -6015,7 +6021,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 				update_group_capacity(env->sd, env->dst_cpu);
 		}
 
-		update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
+		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
+						&overload);
 
 		if (local_group)
 			goto next_group;
@@ -6049,6 +6056,13 @@ next_group:
 
 	if (env->sd->flags & SD_NUMA)
 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+
+	if (!env->sd->parent) {
+		/* update overload indicator if we are at root domain */
+		if (env->dst_rq->rd->overload != overload)
+			env->dst_rq->rd->overload = overload;
+	}
+
 }
 
 /**
@@ -6767,7 +6781,8 @@ static int idle_balance(struct rq *this_rq)
 	 */
 	this_rq->idle_stamp = rq_clock(this_rq);
 
-	if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
+	    !this_rq->rd->overload) {
 		rcu_read_lock();
 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
 		if (sd)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 31cc02e..6d25f1d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -477,6 +477,9 @@ struct root_domain {
 	cpumask_var_t span;
 	cpumask_var_t online;
 
+	/* Indicate more than one runnable task for any CPU */
+	bool overload;
+
 	/*
 	 * The bit corresponding to a CPU gets set here if such CPU has more
 	 * than one runnable -deadline task (as it is below for RT tasks).
@@ -1218,15 +1221,18 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 
 	rq->nr_running = prev_nr + count;
 
-#ifdef CONFIG_NO_HZ_FULL
 	if (prev_nr < 2 && rq->nr_running >= 2) {
+		if (!rq->rd->overload)
+			rq->rd->overload = true;
+
+#ifdef CONFIG_NO_HZ_FULL
 		if (tick_nohz_full_cpu(rq->cpu)) {
 			/* Order rq->nr_running write against the IPI */
 			smp_wmb();
 			smp_send_reschedule(rq->cpu);
 		}
-       }
 #endif
+	}
 }
 
 static inline void sub_nr_running(struct rq *rq, unsigned count)
-- 
1.7.11.7




* Re: [PATCH v4] sched: Fast idling of CPU when system is partially loaded
From: Rik van Riel @ 2014-06-23 19:22 UTC
  To: Tim Chen, Ingo Molnar, Peter Zijlstra
  Cc: Andrew Morton, Davidlohr Bueso, Alex Shi, Andi Kleen,
	Michel Lespinasse, Peter Hurley, Thomas Gleixner,
	Paul E.McKenney, Jason Low, linux-kernel


On 06/23/2014 03:16 PM, Tim Chen wrote:
> Thanks to Jason, Andi and Peter for the reviews.  I've updated the
> code with the simplified logic Peter suggested.
> 
> When a system is lightly loaded (i.e. no more than 1 job per cpu),
> attempting to pull a job to a cpu before putting it to idle is
> unnecessary and can be skipped.  This patch adds an indicator so
> the scheduler can know when there is no more than 1 active job on
> any CPU in the system and skip the needless job pulls.
> 
> On a 4 socket machine with a request/response kind of workload
> from clients, we saw about 0.13 msec of delay whenever we went
> through a full load balance to try to pull jobs from all the other
> cpus.  Since only 0.1 msec was spent on processing the request and
> generating a response, the load balance overhead exceeded the
> actual work being done.  This overhead can be skipped much of the
> time for lightly loaded systems.
> 
> With this patch, we tested with a netperf request/response workload
> that has the server busy with half the cpus in a 4 socket system.
> We found the patch eliminated 75% of the load balance attempts
> before idling a cpu.
> 
> The overhead of setting/clearing the indicator is low, as we already
> gather the necessary info while we call add_nr_running and
> update_sd_lb_stats.  We switch to full load balancing immediately
> if any cpu gets more than one job on its run queue in
> add_nr_running.  We clear the indicator, to avoid load balancing,
> when we detect that no cpu has more than one job while we scan the
> run queues in update_sg_lb_stats.  We are aggressive in turning on
> the load balancing and opportunistic in skipping it.
> 
> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> Acked-by: Jason Low <jason.low2@hp.com>

Acked-by: Rik van Riel <riel@redhat.com>


-- 
All rights reversed


* Re: [PATCH v4] sched: Fast idling of CPU when system is partially loaded
From: Tim Chen @ 2014-06-24 16:15 UTC
  To: Peter Zijlstra
  Cc: Ingo Molnar, Andrew Morton, Davidlohr Bueso, Alex Shi,
	Andi Kleen, Michel Lespinasse, Rik van Riel, Peter Hurley,
	Thomas Gleixner, Paul E.McKenney, Jason Low, linux-kernel

On Mon, 2014-06-23 at 12:16 -0700, Tim Chen wrote:
> Thanks to Jason, Andi and Peter for the reviews.  I've updated
> the code with the simplified logic Peter suggested.
> 
> When a system is lightly loaded (i.e. no more than 1 job per cpu),
> attempting to pull a job to a cpu before putting it to idle is unnecessary
> and can be skipped.  This patch adds an indicator so the scheduler can know
> when there is no more than 1 active job on any CPU in the system and
> skip the needless job pulls.
> 
> On a 4 socket machine with a request/response kind of workload from
> clients, we saw about 0.13 msec of delay whenever we went through a full
> load balance to try to pull jobs from all the other cpus.  Since only
> 0.1 msec was spent on processing the request and generating a response,
> the load balance overhead exceeded the actual work being done.  This
> overhead can be skipped much of the time for lightly loaded systems.
> 
> With this patch, we tested with a netperf request/response workload that
> has the server busy with half the cpus in a 4 socket system.  We found
> the patch eliminated 75% of the load balance attempts before idling a cpu.
> 
> The overhead of setting/clearing the indicator is low, as we already gather
> the necessary info while we call add_nr_running and update_sd_lb_stats.
> We switch to full load balancing immediately if any cpu gets more than
> one job on its run queue in add_nr_running.  We clear the indicator,
> to avoid load balancing, when we detect that no cpu has more than one
> job while we scan the run queues in update_sg_lb_stats.  We are aggressive
> in turning on the load balancing and opportunistic in skipping it.
> 
> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> Acked-by: Jason Low <jason.low2@hp.com>

Peter,

I need to fix up the code that updates the indicator so it is only
compiled under the CONFIG_SMP flag, since the root domain only exists
on SMP builds.

Also attached a complete updated patch.

Thanks.

Tim

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6d25f1d..d051712 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1222,9 +1222,10 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 	rq->nr_running = prev_nr + count;
 
 	if (prev_nr < 2 && rq->nr_running >= 2) {
+#ifdef CONFIG_SMP
 		if (!rq->rd->overload)
 			rq->rd->overload = true;
-
+#endif
 #ifdef CONFIG_NO_HZ_FULL
 		if (tick_nohz_full_cpu(rq->cpu)) {
 			/* Order rq->nr_running write against the IPI */



The complete updated patch is attached below:
---
From 8716a50c85f98a92d2240da923ef4ae9a9719bbe Mon Sep 17 00:00:00 2001
Message-Id: <8716a50c85f98a92d2240da923ef4ae9a9719bbe.1403625949.git.tim.c.chen@linux.intel.com>
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Thu, 12 Jun 2014 11:28:38 -0700
Subject: [PATCH v5] sched: Fast idling of CPU when system is partially loaded
To: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>, Davidlohr Bueso <davidlohr@hp.com>, Alex Shi <alex.shi@linaro.org>, Andi Kleen <andi@firstfloor.org>, Michel Lespinasse <walken@google.com>, Rik van Riel <riel@redhat.com>, Peter Hurley <peter@hurleysoftware.com>, Thomas Gleixner <tglx@linutronix.de>, Paul E.McKenney <paulmck@linux.vnet.ibm.com>, Jason Low <jason.low2@hp.com>, linux-kernel@vger.kernel.org

When a system is lightly loaded (i.e. no more than 1 job per cpu),
attempting to pull a job to a cpu before putting it to idle is unnecessary
and can be skipped.  This patch adds an indicator so the scheduler can know
when there is no more than 1 active job on any CPU in the system and
skip the needless job pulls.

On a 4 socket machine with a request/response kind of workload from
clients, we saw about 0.13 msec of delay whenever we went through a full
load balance to try to pull jobs from all the other cpus.  Since only
0.1 msec was spent on processing the request and generating a response,
the load balance overhead exceeded the actual work being done.  This
overhead can be skipped much of the time for lightly loaded systems.

With this patch, we tested with a netperf request/response workload that
has the server busy with half the cpus in a 4 socket system.  We found
the patch eliminated 75% of the load balance attempts before idling a cpu.

The overhead of setting/clearing the indicator is low, as we already gather
the necessary info while we call add_nr_running and update_sd_lb_stats.
We switch to full load balancing immediately if any cpu gets more than
one job on its run queue in add_nr_running.  We clear the indicator,
to avoid load balancing, when we detect that no cpu has more than one
job while we scan the run queues in update_sg_lb_stats.  We are aggressive
in turning on the load balancing and opportunistic in skipping it.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Acked-by: Jason Low <jason.low2@hp.com>
Acked-by: Rik van Riel <riel@redhat.com>
---
 kernel/sched/fair.c  | 21 ++++++++++++++++++---
 kernel/sched/sched.h | 11 +++++++++--
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fea7d33..7dfe2ad 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5867,7 +5867,8 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
  */
 static inline void update_sg_lb_stats(struct lb_env *env,
 			struct sched_group *group, int load_idx,
-			int local_group, struct sg_lb_stats *sgs)
+			int local_group, struct sg_lb_stats *sgs,
+			bool *overload)
 {
 	unsigned long load;
 	int i;
@@ -5885,6 +5886,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 		sgs->group_load += load;
 		sgs->sum_nr_running += rq->nr_running;
+
+		if (rq->nr_running > 1)
+			*overload = true;
+
 #ifdef CONFIG_NUMA_BALANCING
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -5995,6 +6000,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats tmp_sgs;
 	int load_idx, prefer_sibling = 0;
+	bool overload = false;
 
 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;
@@ -6015,7 +6021,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 				update_group_capacity(env->sd, env->dst_cpu);
 		}
 
-		update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
+		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
+						&overload);
 
 		if (local_group)
 			goto next_group;
@@ -6049,6 +6056,13 @@ next_group:
 
 	if (env->sd->flags & SD_NUMA)
 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+
+	if (!env->sd->parent) {
+		/* update overload indicator if we are at root domain */
+		if (env->dst_rq->rd->overload != overload)
+			env->dst_rq->rd->overload = overload;
+	}
+
 }
 
 /**
@@ -6767,7 +6781,8 @@ static int idle_balance(struct rq *this_rq)
 	 */
 	this_rq->idle_stamp = rq_clock(this_rq);
 
-	if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
+	    !this_rq->rd->overload) {
 		rcu_read_lock();
 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
 		if (sd)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 31cc02e..d051712 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -477,6 +477,9 @@ struct root_domain {
 	cpumask_var_t span;
 	cpumask_var_t online;
 
+	/* Indicate more than one runnable task for any CPU */
+	bool overload;
+
 	/*
 	 * The bit corresponding to a CPU gets set here if such CPU has more
 	 * than one runnable -deadline task (as it is below for RT tasks).
@@ -1218,15 +1221,19 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 
 	rq->nr_running = prev_nr + count;
 
-#ifdef CONFIG_NO_HZ_FULL
 	if (prev_nr < 2 && rq->nr_running >= 2) {
+#ifdef CONFIG_SMP
+		if (!rq->rd->overload)
+			rq->rd->overload = true;
+#endif
+#ifdef CONFIG_NO_HZ_FULL
 		if (tick_nohz_full_cpu(rq->cpu)) {
 			/* Order rq->nr_running write against the IPI */
 			smp_wmb();
 			smp_send_reschedule(rq->cpu);
 		}
-       }
 #endif
+	}
 }
 
 static inline void sub_nr_running(struct rq *rq, unsigned count)
-- 
1.7.11.7






* Re: [PATCH v4] sched: Fast idling of CPU when system is partially loaded
From: Peter Zijlstra @ 2014-06-24 20:36 UTC
  To: Tim Chen
  Cc: Ingo Molnar, Andrew Morton, Davidlohr Bueso, Alex Shi,
	Andi Kleen, Michel Lespinasse, Rik van Riel, Peter Hurley,
	Thomas Gleixner, Paul E.McKenney, Jason Low, linux-kernel


> I need to fix up the code that updates the indicator so it is only
> compiled under the CONFIG_SMP flag.
> 
Yep, saw the bot email, fixed the patch and will push out a new stack
shortly..



* [tip:sched/core] sched/fair: Implement fast idling of CPUs when the system is partially loaded
From: tip-bot for Tim Chen @ 2014-07-05 10:44 UTC
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, torvalds, peterz, tim.c.chen, peter,
	jason.low2, alex.shi, riel, paulmck, akpm, tglx, walken,
	davidlohr

Commit-ID:  4486edd12b5ac8a9af7a5e16e4b9eeb3b8339c10
Gitweb:     http://git.kernel.org/tip/4486edd12b5ac8a9af7a5e16e4b9eeb3b8339c10
Author:     Tim Chen <tim.c.chen@linux.intel.com>
AuthorDate: Mon, 23 Jun 2014 12:16:49 -0700
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Sat, 5 Jul 2014 11:17:32 +0200

sched/fair: Implement fast idling of CPUs when the system is partially loaded

When a system is lightly loaded (i.e. no more than 1 job per cpu),
attempting to pull a job to a cpu before putting it to idle is unnecessary
and can be skipped.  This patch adds an indicator so the scheduler can know
when there is no more than 1 active job on any CPU in the system and
skip the needless job pulls.

On a 4 socket machine with a request/response kind of workload from
clients, we saw about 0.13 msec of delay whenever we went through a full
load balance to try to pull jobs from all the other cpus.  Since only
0.1 msec was spent on processing the request and generating a response,
the load balance overhead exceeded the actual work being done.  This
overhead can be skipped much of the time for lightly loaded systems.

With this patch, we tested with a netperf request/response workload that
has the server busy with half the cpus in a 4 socket system.  We found
the patch eliminated 75% of the load balance attempts before idling a cpu.

The overhead of setting/clearing the indicator is low, as we already gather
the necessary info while we call add_nr_running() and update_sd_lb_stats().
We switch to full load balancing immediately if any cpu gets more than
one job on its run queue in add_nr_running().  We clear the indicator,
to avoid load balancing, when we detect that no cpu has more than one
job while we scan the run queues in update_sg_lb_stats().  We are aggressive
in turning on the load balancing and opportunistic in skipping it.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Jason Low <jason.low2@hp.com>
Cc: "Paul E.McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <davidlohr@hp.com>
Cc: Alex Shi <alex.shi@linaro.org>
Cc: Michel Lespinasse <walken@google.com>
Cc: Peter Hurley <peter@hurleysoftware.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1403551009.2970.613.camel@schen9-DESK
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c  | 21 ++++++++++++++++++---
 kernel/sched/sched.h | 12 ++++++++++--
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ef5eac7..e3ff3d1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5866,7 +5866,8 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
  */
 static inline void update_sg_lb_stats(struct lb_env *env,
 			struct sched_group *group, int load_idx,
-			int local_group, struct sg_lb_stats *sgs)
+			int local_group, struct sg_lb_stats *sgs,
+			bool *overload)
 {
 	unsigned long load;
 	int i;
@@ -5884,6 +5885,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 		sgs->group_load += load;
 		sgs->sum_nr_running += rq->nr_running;
+
+		if (rq->nr_running > 1)
+			*overload = true;
+
 #ifdef CONFIG_NUMA_BALANCING
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -5994,6 +5999,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats tmp_sgs;
 	int load_idx, prefer_sibling = 0;
+	bool overload = false;
 
 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;
@@ -6014,7 +6020,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 				update_group_capacity(env->sd, env->dst_cpu);
 		}
 
-		update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
+		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
+						&overload);
 
 		if (local_group)
 			goto next_group;
@@ -6048,6 +6055,13 @@ next_group:
 
 	if (env->sd->flags & SD_NUMA)
 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+
+	if (!env->sd->parent) {
+		/* update overload indicator if we are at root domain */
+		if (env->dst_rq->rd->overload != overload)
+			env->dst_rq->rd->overload = overload;
+	}
+
 }
 
 /**
@@ -6766,7 +6780,8 @@ static int idle_balance(struct rq *this_rq)
 	 */
 	this_rq->idle_stamp = rq_clock(this_rq);
 
-	if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
+	    !this_rq->rd->overload) {
 		rcu_read_lock();
 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
 		if (sd)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eb85676..0191ed5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -477,6 +477,9 @@ struct root_domain {
 	cpumask_var_t span;
 	cpumask_var_t online;
 
+	/* Indicate more than one runnable task for any CPU */
+	bool overload;
+
 	/*
 	 * The bit corresponding to a CPU gets set here if such CPU has more
 	 * than one runnable -deadline task (as it is below for RT tasks).
@@ -1218,8 +1221,13 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 
 	rq->nr_running = prev_nr + count;
 
-#ifdef CONFIG_NO_HZ_FULL
 	if (prev_nr < 2 && rq->nr_running >= 2) {
+#ifdef CONFIG_SMP
+		if (!rq->rd->overload)
+			rq->rd->overload = true;
+#endif
+
+#ifdef CONFIG_NO_HZ_FULL
 		if (tick_nohz_full_cpu(rq->cpu)) {
 			/*
 			 * Tick is needed if more than one task runs on a CPU.
@@ -1231,8 +1239,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 			 */
 			tick_nohz_full_kick_cpu(rq->cpu);
 		}
-       }
 #endif
+	}
 }
 
 static inline void sub_nr_running(struct rq *rq, unsigned count)

