All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mike Galbraith <bitbucket@online.de>
To: Michael Wang <wangyun@linux.vnet.ibm.com>
Cc: linux-kernel@vger.kernel.org, mingo@redhat.com,
	peterz@infradead.org, mingo@kernel.org, a.p.zijlstra@chello.nl
Subject: Re: [RFC PATCH 0/2] sched: simplify the select_task_rq_fair()
Date: Wed, 23 Jan 2013 10:32:35 +0100	[thread overview]
Message-ID: <1358933555.5752.132.camel@marge.simpson.net> (raw)
In-Reply-To: <1358932694.5752.126.camel@marge.simpson.net>

On Wed, 2013-01-23 at 10:18 +0100, Mike Galbraith wrote: 
> On Wed, 2013-01-23 at 17:00 +0800, Michael Wang wrote: 
> > On 01/23/2013 04:49 PM, Mike Galbraith wrote:
> > > On Wed, 2013-01-23 at 16:30 +0800, Michael Wang wrote: 
> > >> On 01/23/2013 04:20 PM, Mike Galbraith wrote:
> > >>> On Wed, 2013-01-23 at 15:10 +0800, Michael Wang wrote: 
> > >>>> On 01/23/2013 02:28 PM, Mike Galbraith wrote:
> > >>>
> > >>>>> Abbreviated test run:
> > >>>>> Tasks    jobs/min  jti  jobs/min/task      real       cpu
> > >>>>>   640   158044.01   81       246.9438     24.54    577.66   Wed Jan 23 07:14:33 2013
> > >>>>>  1280    50434.33   39        39.4018    153.80   5737.57   Wed Jan 23 07:17:07 2013
> > >>>>>  2560    47214.07   34        18.4430    328.58  12715.56   Wed Jan 23 07:22:36 2013
> > >>>>
> > >>>> So it still doesn't work... and not going into the balance path while
> > >>>> waking up would fix it; looks like that's the only choice if no error in
> > >>>> the balance path can be found... the benchmark wins again, I'm feeling bad...
> > >>>>
> > >>>> I will conclude the info we collected and make a v3 later.
> > >>>
> > >>> FWIW, I hacked virgin to do full balance if an idle CPU was not found,
> > >>> leaving the preference to wake cache affine intact though, turned on
> > >>> WAKE_BALANCE in all domains, and it did not collapse.  In fact, the high
> > >>> load end, where the idle search will frequently be a waste of cycles,
> > >>> actually improved a bit.  Things that make ya go hmmm.
> > >>
> > >> Oh, does that mean the old balance path is good while the new one is really
> > >> broken? I mean, comparing this with the previous results, could we say
> > >> that all the collapse was just caused by the change of balance path?
> > > 
> > > That's a good supposition.  I'll see if it holds.
> > 
> > I just noticed that there is no sd supporting the WAKE flag at all
> > according to your debug info, is there?
> 
> There is, I turned it on in all domains.

For your patches, I had to turn it on at birth, but doing that, and
restoring the full balance path to original form killed the collapse.

Tasks    jobs/min  jti  jobs/min/task      real       cpu
  640   152452.83   97       238.2075     25.44    613.48   Wed Jan 23 10:22:12 2013
 1280   190491.16   97       148.8212     40.72   1223.74   Wed Jan 23 10:22:53 2013
 2560   219397.54   95        85.7022     70.71   2422.46   Wed Jan 23 10:24:04 2013

---
 include/linux/topology.h |    6 ++---
 kernel/sched/core.c      |   41 ++++++++++++++++++++++++++++++-------
 kernel/sched/fair.c      |   52 +++++++++++++++++++++++++++++------------------
 3 files changed, 70 insertions(+), 29 deletions(-)

--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -95,7 +95,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_BALANCE_NEWIDLE			\
 				| 1*SD_BALANCE_EXEC			\
 				| 1*SD_BALANCE_FORK			\
-				| 0*SD_BALANCE_WAKE			\
+				| 1*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
 				| 1*SD_SHARE_CPUPOWER			\
 				| 1*SD_SHARE_PKG_RESOURCES		\
@@ -126,7 +126,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_BALANCE_NEWIDLE			\
 				| 1*SD_BALANCE_EXEC			\
 				| 1*SD_BALANCE_FORK			\
-				| 0*SD_BALANCE_WAKE			\
+				| 1*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 1*SD_SHARE_PKG_RESOURCES		\
@@ -156,7 +156,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_BALANCE_NEWIDLE			\
 				| 1*SD_BALANCE_EXEC			\
 				| 1*SD_BALANCE_FORK			\
-				| 0*SD_BALANCE_WAKE			\
+				| 1*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 0*SD_SHARE_PKG_RESOURCES		\
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5609,11 +5609,39 @@ static void update_top_cache_domain(int
 static int sbm_max_level;
 DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_balance_map, sbm_array);
 
+static void debug_sched_balance_map(int cpu)
+{
+	int i, type, level = 0;
+	struct sched_balance_map *sbm = &per_cpu(sbm_array, cpu);
+
+	printk("WYT: sbm of cpu %d\n", cpu);
+
+	for (type = 0; type < SBM_MAX_TYPE; type++) {
+		if (type == SBM_EXEC_TYPE)
+			printk("WYT: \t exec map\n");
+		else if (type == SBM_FORK_TYPE)
+			printk("WYT: \t fork map\n");
+		else if (type == SBM_WAKE_TYPE)
+			printk("WYT: \t wake map\n");
+
+		for (level = 0; level < sbm_max_level; level++) {
+			if (sbm->sd[type][level])
+				printk("WYT: \t\t sd %x, idx %d, level %d, weight %d\n", sbm->sd[type][level], level, sbm->sd[type][level]->level, sbm->sd[type][level]->span_weight);
+		}
+	}
+
+	printk("WYT: \t affine map\n");
+
+	for_each_possible_cpu(i) {
+		if (sbm->affine_map[i])
+			printk("WYT: \t\t affine with cpu %x in sd %x, weight %d\n", i, sbm->affine_map[i], sbm->affine_map[i]->span_weight);
+	}
+}
+
 static void build_sched_balance_map(int cpu)
 {
 	struct sched_balance_map *sbm = &per_cpu(sbm_array, cpu);
 	struct sched_domain *sd = cpu_rq(cpu)->sd;
-	struct sched_domain *top_sd = NULL;
 	int i, type, level = 0;
 
 	memset(sbm->top_level, 0, sizeof((*sbm).top_level));
@@ -5656,11 +5684,9 @@ static void build_sched_balance_map(int
 	 * fill the hole to get lower level sd easily.
 	 */
 	for (type = 0; type < SBM_MAX_TYPE; type++) {
-		level = sbm->top_level[type];
-		top_sd = sbm->sd[type][level];
-		if ((++level != sbm_max_level) && top_sd) {
-			for (; level < sbm_max_level; level++)
-				sbm->sd[type][level] = top_sd;
+		for (level = 1; level < sbm_max_level; level++) {
+			if (!sbm->sd[type][level])
+				sbm->sd[type][level] = sbm->sd[type][level - 1];
 		}
 	}
 }
@@ -5719,6 +5745,7 @@ cpu_attach_domain(struct sched_domain *s
 	 * destroy_sched_domains() already do the work.
 	 */
 	build_sched_balance_map(cpu);
+//MIKE	debug_sched_balance_map(cpu);
 	rcu_assign_pointer(rq->sbm, sbm);
 }
 
@@ -6220,7 +6247,7 @@ sd_numa_init(struct sched_domain_topolog
 					| 1*SD_BALANCE_NEWIDLE
 					| 0*SD_BALANCE_EXEC
 					| 0*SD_BALANCE_FORK
-					| 0*SD_BALANCE_WAKE
+					| 1*SD_BALANCE_WAKE
 					| 0*SD_WAKE_AFFINE
 					| 0*SD_SHARE_CPUPOWER
 					| 0*SD_SHARE_PKG_RESOURCES
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3312,7 +3312,7 @@ static int select_idle_sibling(struct ta
 static int
 select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 {
-	struct sched_domain *sd = NULL;
+	struct sched_domain *sd = NULL, *tmp;
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	int new_cpu = cpu;
@@ -3376,31 +3376,45 @@ select_task_rq_fair(struct task_struct *
 
 balance_path:
 	new_cpu = (sd_flag & SD_BALANCE_WAKE) ? prev_cpu : cpu;
-	sd = sbm->sd[type][sbm->top_level[type]];
+	sd = tmp = sbm->sd[type][sbm->top_level[type]];
 
 	while (sd) {
 		int load_idx = sd->forkexec_idx;
-		struct sched_group *sg = NULL;
+		struct sched_group *group;
+		int weight;
+
+		if (!(sd->flags & sd_flag)) {
+			sd = sd->child;
+			continue;
+		}
 
 		if (sd_flag & SD_BALANCE_WAKE)
 			load_idx = sd->wake_idx;
 
-		sg = find_idlest_group(sd, p, cpu, load_idx);
-		if (!sg)
-			goto next_sd;
-
-		new_cpu = find_idlest_cpu(sg, p, cpu);
-		if (new_cpu != -1)
-			cpu = new_cpu;
-next_sd:
-		if (!sd->level)
-			break;
-
-		sbm = cpu_rq(cpu)->sbm;
-		if (!sbm)
-			break;
-
-		sd = sbm->sd[type][sd->level - 1];
+		group = find_idlest_group(sd, p, cpu, load_idx);
+		if (!group) {
+			sd = sd->child;
+			continue;
+		}
+
+		new_cpu = find_idlest_cpu(group, p, cpu);
+		if (new_cpu == -1 || new_cpu == cpu) {
+			/* Now try balancing at a lower domain level of cpu */
+			sd = sd->child;
+			continue;
+		}
+
+		/* Now try balancing at a lower domain level of new_cpu */
+		cpu = new_cpu;
+		weight = sd->span_weight;
+		sd = NULL;
+		for_each_domain(cpu, tmp) {
+			if (weight <= tmp->span_weight)
+				break;
+			if (tmp->flags & sd_flag)
+				sd = tmp;
+		}
+		/* while loop will break here if sd == NULL */
 	}
 
 unlock:



  parent reply	other threads:[~2013-01-23  9:32 UTC|newest]

Thread overview: 57+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <1356588535-23251-1-git-send-email-wangyun@linux.vnet.ibm.com>
2013-01-09  9:28 ` [RFC PATCH 0/2] sched: simplify the select_task_rq_fair() Michael Wang
2013-01-12  8:01   ` Mike Galbraith
2013-01-12 10:19     ` Mike Galbraith
2013-01-14  9:21       ` Mike Galbraith
2013-01-15  3:10         ` Michael Wang
2013-01-15  4:52           ` Mike Galbraith
2013-01-15  8:26             ` Michael Wang
2013-01-17  5:55         ` Michael Wang
2013-01-20  4:09           ` Mike Galbraith
2013-01-21  2:50             ` Michael Wang
2013-01-21  4:38               ` Mike Galbraith
2013-01-21  5:07                 ` Michael Wang
2013-01-21  6:42                   ` Mike Galbraith
2013-01-21  7:09                     ` Mike Galbraith
2013-01-21  7:45                       ` Michael Wang
2013-01-21  9:09                         ` Mike Galbraith
2013-01-21  9:22                           ` Michael Wang
2013-01-21  9:44                             ` Mike Galbraith
2013-01-21 10:30                               ` Mike Galbraith
2013-01-22  3:43                               ` Michael Wang
2013-01-22  8:03                                 ` Mike Galbraith
2013-01-22  8:56                                   ` Michael Wang
2013-01-22 11:34                                     ` Mike Galbraith
2013-01-23  3:01                                       ` Michael Wang
2013-01-23  5:02                                         ` Mike Galbraith
2013-01-22 14:41                                     ` Mike Galbraith
2013-01-23  2:44                                       ` Michael Wang
2013-01-23  4:31                                         ` Mike Galbraith
2013-01-23  5:09                                           ` Michael Wang
2013-01-23  6:28                                             ` Mike Galbraith
2013-01-23  7:10                                               ` Michael Wang
2013-01-23  8:20                                                 ` Mike Galbraith
2013-01-23  8:30                                                   ` Michael Wang
2013-01-23  8:49                                                     ` Mike Galbraith
2013-01-23  9:00                                                       ` Michael Wang
2013-01-23  9:18                                                         ` Mike Galbraith
2013-01-23  9:26                                                           ` Michael Wang
2013-01-23  9:37                                                             ` Mike Galbraith
2013-01-23  9:32                                                           ` Mike Galbraith [this message]
2013-01-24  6:01                                                             ` Michael Wang
2013-01-24  6:51                                                               ` Mike Galbraith
2013-01-24  7:15                                                                 ` Michael Wang
2013-01-24  7:47                                                                   ` Mike Galbraith
2013-01-24  8:14                                                                     ` Michael Wang
2013-01-24  9:07                                                                       ` Mike Galbraith
2013-01-24  9:26                                                                         ` Michael Wang
2013-01-24 10:34                                                                           ` Mike Galbraith
2013-01-25  2:14                                                                             ` Michael Wang
2013-01-24  7:00                                                               ` Michael Wang
2013-01-21  7:34                     ` Michael Wang
2013-01-21  8:26                       ` Mike Galbraith
2013-01-21  8:46                         ` Michael Wang
2013-01-21  9:11                           ` Mike Galbraith
2013-01-15  2:46     ` Michael Wang
2013-01-11  8:15 Michael Wang
2013-01-11 10:13 ` Nikunj A Dadhania
2013-01-15  2:20   ` Michael Wang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1358933555.5752.132.camel@marge.simpson.net \
    --to=bitbucket@online.de \
    --cc=a.p.zijlstra@chello.nl \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=wangyun@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.