linux-kernel.vger.kernel.org archive mirror
* [patch 1/5] sched: remove degenerate domains
@ 2005-04-05 23:44 Nick Piggin
  2005-04-05 23:45 ` [patch 2/5] sched: NULL domains Nick Piggin
  2005-04-06  5:44 ` [patch 1/5] sched: remove degenerate domains Ingo Molnar
  0 siblings, 2 replies; 27+ messages in thread
From: Nick Piggin @ 2005-04-05 23:44 UTC (permalink / raw)
  To: Andrew Morton, linux-kernel; +Cc: Ingo Molnar, Siddha, Suresh B

[-- Attachment #1: Type: text/plain, Size: 76 bytes --]

This is Suresh's patch with some modifications.

-- 
SUSE Labs, Novell Inc.

[-- Attachment #2: sched-remove-degenerate-domains.patch --]
[-- Type: text/plain, Size: 3953 bytes --]

Remove degenerate scheduler domains during the sched-domain init.

For example on x86_64, we always have NUMA configured in. On Intel EM64T
systems, the topmost sched domain will be the NUMA domain, with only one
sched_group in it.

With fork/exec balancing (Nick's recent fixes in the -mm tree), we always end
up making wrong decisions because of this topmost domain (as it contains only
one group, find_idlest_group always returns NULL). We end up fully loading one
HT package first, and then let active load balancing kick in to correct it.

In general, this patch also makes sense without Nick's recent fixes
in -mm.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>

Modified by Nick Piggin to account for more than just sched_groups when
scanning for degenerate domains, and to allow a runqueue's sd to go NULL,
which required small changes to the smtnice code.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>


Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c	2005-04-05 16:38:21.000000000 +1000
+++ linux-2.6/kernel/sched.c	2005-04-05 18:39:09.000000000 +1000
@@ -2583,11 +2583,15 @@ out:
 #ifdef CONFIG_SCHED_SMT
 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
-	struct sched_domain *sd = this_rq->sd;
+	struct sched_domain *tmp, *sd = NULL;
 	cpumask_t sibling_map;
 	int i;
+	
+	for_each_domain(this_cpu, tmp)
+		if (tmp->flags & SD_SHARE_CPUPOWER)
+			sd = tmp;
 
-	if (!(sd->flags & SD_SHARE_CPUPOWER))
+	if (!sd)
 		return;
 
 	/*
@@ -2628,13 +2632,17 @@ static inline void wake_sleeping_depende
 
 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
-	struct sched_domain *sd = this_rq->sd;
+	struct sched_domain *tmp, *sd = NULL;
 	cpumask_t sibling_map;
 	prio_array_t *array;
 	int ret = 0, i;
 	task_t *p;
 
-	if (!(sd->flags & SD_SHARE_CPUPOWER))
+	for_each_domain(this_cpu, tmp)
+		if (tmp->flags & SD_SHARE_CPUPOWER)
+			sd = tmp;
+
+	if (!sd)
 		return 0;
 
 	/*
@@ -4604,6 +4612,11 @@ static void sched_domain_debug(struct sc
 {
 	int level = 0;
 
+	if (!sd) {
+		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+		return;
+	}
+		
 	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
 
 	do {
@@ -4809,6 +4822,50 @@ static void init_sched_domain_sysctl(voi
 }
 #endif
 
+static int __devinit sd_degenerate(struct sched_domain *sd)
+{
+	if (cpus_weight(sd->span) == 1)
+		return 1;
+
+	/* Following flags need at least 2 groups */
+	if (sd->flags & (SD_LOAD_BALANCE |
+			 SD_BALANCE_NEWIDLE |
+			 SD_BALANCE_FORK |
+			 SD_BALANCE_EXEC)) {
+		if (sd->groups != sd->groups->next)
+			return 0;
+	}
+
+	/* Following flags don't use groups */
+	if (sd->flags & (SD_WAKE_IDLE |
+			 SD_WAKE_AFFINE |
+			 SD_WAKE_BALANCE))
+		return 0;
+
+	return 1;
+}
+
+static int __devinit sd_parent_degenerate(struct sched_domain *sd,
+						struct sched_domain *parent)
+{
+	unsigned long cflags = sd->flags, pflags = parent->flags;
+
+	if (sd_degenerate(parent))
+		return 1;
+
+	if (!cpus_equal(sd->span, parent->span))
+		return 0;
+
+	/* Does parent contain flags not in child? */
+	/* WAKE_BALANCE is a subset of WAKE_AFFINE */
+	if (cflags & SD_WAKE_AFFINE)
+		pflags &= ~SD_WAKE_BALANCE;
+	if ((~sd->flags) & parent->flags)
+		return 0;
+
+	return 1;
+}
+
 /*
  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
  * hold the hotplug lock.
@@ -4819,6 +4876,19 @@ void __devinit cpu_attach_domain(struct 
 	unsigned long flags;
 	runqueue_t *rq = cpu_rq(cpu);
 	int local = 1;
+	struct sched_domain *tmp;
+
+	/* Remove the sched domains which do not contribute to scheduling. */
+	for (tmp = sd; tmp; tmp = tmp->parent) {
+		struct sched_domain *parent = tmp->parent;
+		if (!parent)
+			break;
+		if (sd_parent_degenerate(tmp, parent))
+			tmp->parent = parent->parent;
+	}
+
+	if (sd_degenerate(sd))
+		sd = sd->parent;
 
 	sched_domain_debug(sd, cpu);
 

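The pruning above can be watched in isolation. Below is a toy user-space
sketch of the same idea (hypothetical TOY_* flags and types, not kernel code;
the real sd_degenerate() also distinguishes which SD_* flags need groups, as
the patch shows):

#include <stdio.h>

#define TOY_LOAD_BALANCE 0x1	/* needs at least two groups to act on */
#define TOY_WAKE_AFFINE  0x2	/* does useful work without groups */

struct toy_domain {
	unsigned long span;		/* CPUs covered, one bit per CPU */
	unsigned long flags;
	int ngroups;			/* stand-in for the sched_group ring */
	struct toy_domain *parent;
	const char *name;
};

static int toy_degenerate(struct toy_domain *sd)
{
	if (__builtin_popcountl(sd->span) == 1)
		return 1;		/* single CPU: nothing to balance */
	if ((sd->flags & TOY_LOAD_BALANCE) && sd->ngroups > 1)
		return 0;
	if (sd->flags & TOY_WAKE_AFFINE)
		return 0;
	return 1;
}

static int toy_parent_degenerate(struct toy_domain *sd,
				 struct toy_domain *parent)
{
	if (toy_degenerate(parent))
		return 1;
	if (sd->span != parent->span)
		return 0;		/* parent covers extra CPUs */
	if (~sd->flags & parent->flags)
		return 0;		/* parent adds a flag the child lacks */
	return 1;
}

int main(void)
{
	/* A NUMA level spanning the same 4 CPUs as the SMP level below it,
	 * with a single group: exactly the EM64T case described above. */
	struct toy_domain numa = { 0xf, TOY_LOAD_BALANCE, 1, NULL, "NUMA" };
	struct toy_domain smp  = { 0xf, TOY_LOAD_BALANCE | TOY_WAKE_AFFINE,
				   4, &numa, "SMP" };
	struct toy_domain *d;

	for (d = &smp; d->parent; d = d->parent)
		if (toy_parent_degenerate(d, d->parent))
			d->parent = d->parent->parent;	/* unlink parent */

	for (d = &smp; d; d = d->parent)
		printf("%s\n", d->name);	/* prints only "SMP" */
	return 0;
}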

* [patch 2/5] sched: NULL domains
  2005-04-05 23:44 [patch 1/5] sched: remove degenerate domains Nick Piggin
@ 2005-04-05 23:45 ` Nick Piggin
  2005-04-05 23:46   ` [patch 3/5] sched: multilevel sbe and sbf Nick Piggin
  2005-04-06  5:45   ` [patch 2/5] sched: NULL domains Ingo Molnar
  2005-04-06  5:44 ` [patch 1/5] sched: remove degenerate domains Ingo Molnar
  1 sibling, 2 replies; 27+ messages in thread
From: Nick Piggin @ 2005-04-05 23:45 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Ingo Molnar, Siddha, Suresh B

[-- Attachment #1: Type: text/plain, Size: 4 bytes --]

2/5

[-- Attachment #2: sched-null-domains.patch --]
[-- Type: text/plain, Size: 2380 bytes --]

The previous patch fixed the last 2 places that directly access a
runqueue's sched-domain and assume it cannot be NULL.

We can now use a NULL domain instead of a dummy domain to signify
no balancing is to happen. No functional changes.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c	2005-04-05 16:38:40.000000000 +1000
+++ linux-2.6/kernel/sched.c	2005-04-05 18:39:08.000000000 +1000
@@ -4887,7 +4887,7 @@ void __devinit cpu_attach_domain(struct 
 			tmp->parent = parent->parent;
 	}
 
-	if (sd_degenerate(sd))
+	if (sd && sd_degenerate(sd))
 		sd = sd->parent;
 
 	sched_domain_debug(sd, cpu);
@@ -5054,7 +5054,7 @@ static void __devinit arch_init_sched_do
 	cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
 
 	/*
-	 * Set up domains. Isolated domains just stay on the dummy domain.
+	 * Set up domains. Isolated domains just stay on the NULL domain.
 	 */
 	for_each_cpu_mask(i, cpu_default_map) {
 		int group;
@@ -5167,18 +5167,11 @@ static void __devinit arch_destroy_sched
 
 #endif /* ARCH_HAS_SCHED_DOMAIN */
 
-/*
- * Initial dummy domain for early boot and for hotplug cpu. Being static,
- * it is initialized to zero, so all balancing flags are cleared which is
- * what we want.
- */
-static struct sched_domain sched_domain_dummy;
-
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Force a reinitialization of the sched domains hierarchy.  The domains
  * and groups cannot be updated in place without racing with the balancing
- * code, so we temporarily attach all running cpus to a "dummy" domain
+ * code, so we temporarily attach all running cpus to the NULL domain
  * which will prevent rebalancing while the sched domains are recalculated.
  */
 static int update_sched_domains(struct notifier_block *nfb,
@@ -5190,7 +5183,7 @@ static int update_sched_domains(struct n
 	case CPU_UP_PREPARE:
 	case CPU_DOWN_PREPARE:
 		for_each_online_cpu(i)
-			cpu_attach_domain(&sched_domain_dummy, i);
+			cpu_attach_domain(NULL, i);
 		arch_destroy_sched_domains();
 		return NOTIFY_OK;
 
@@ -5253,7 +5246,7 @@ void __init sched_init(void)
 		rq->best_expired_prio = MAX_PRIO;
 
 #ifdef CONFIG_SMP
-		rq->sd = &sched_domain_dummy;
+		rq->sd = NULL;
 		for (j = 1; j < 3; j++)
 			rq->cpu_load[j] = 0;
 		rq->active_balance = 0;


* [patch 3/5] sched: multilevel sbe and sbf
  2005-04-05 23:45 ` [patch 2/5] sched: NULL domains Nick Piggin
@ 2005-04-05 23:46   ` Nick Piggin
  2005-04-05 23:47     ` [patch 4/5] sched: RCU sched domains Nick Piggin
  2005-04-06  5:54     ` [patch 3/5] sched: multilevel sbe and sbf Ingo Molnar
  2005-04-06  5:45   ` [patch 2/5] sched: NULL domains Ingo Molnar
  1 sibling, 2 replies; 27+ messages in thread
From: Nick Piggin @ 2005-04-05 23:46 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Ingo Molnar, Siddha, Suresh B

[-- Attachment #1: Type: text/plain, Size: 4 bytes --]

3/5

[-- Attachment #2: sched-multilevel-sbe-sbf.patch --]
[-- Type: text/plain, Size: 2753 bytes --]

The fundamental problem that Suresh has with balance on exec and fork
is that it only tries to balance at the top-level domain with the flag
set.

This was worked around by removing degenerate domains, but is still a
problem if people want to start using more complex sched-domains, especially
multilevel NUMA that ia64 is already using.

This patch makes balance on fork and exec try balancing over not just the
topmost domain with the flag set, but all the way down the domain tree.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c	2005-04-05 16:38:53.000000000 +1000
+++ linux-2.6/kernel/sched.c	2005-04-05 18:39:07.000000000 +1000
@@ -1320,21 +1320,24 @@ void fastcall wake_up_new_task(task_t * 
 			sd = tmp;
 
 	if (sd) {
+		cpumask_t span;
 		int new_cpu;
 		struct sched_group *group;
 
+again:
 		schedstat_inc(sd, sbf_cnt);
+		span = sd->span;
 		cpu = task_cpu(p);
 		group = find_idlest_group(sd, p, cpu);
 		if (!group) {
 			schedstat_inc(sd, sbf_balanced);
-			goto no_forkbalance;
+			goto nextlevel;
 		}
 
 		new_cpu = find_idlest_cpu(group, cpu);
 		if (new_cpu == -1 || new_cpu == cpu) {
 			schedstat_inc(sd, sbf_balanced);
-			goto no_forkbalance;
+			goto nextlevel;
 		}
 
 		if (cpu_isset(new_cpu, p->cpus_allowed)) {
@@ -1344,9 +1347,21 @@ void fastcall wake_up_new_task(task_t * 
 			rq = task_rq_lock(p, &flags);
 			cpu = task_cpu(p);
 		}
+
+		/* Now try balancing at a lower domain level */
+nextlevel:
+		sd = NULL;
+		for_each_domain(cpu, tmp) {
+			if (cpus_subset(span, tmp->span))
+				break;
+			if (tmp->flags & SD_BALANCE_FORK)
+				sd = tmp;
+		}
+
+		if (sd)
+			goto again;
 	}
 
-no_forkbalance:
 #endif
 	/*
 	 * We decrease the sleep average of forking parents
@@ -1712,25 +1727,41 @@ void sched_exec(void)
 			sd = tmp;
 
 	if (sd) {
+		cpumask_t span;
 		struct sched_group *group;
+again:
 		schedstat_inc(sd, sbe_cnt);
+		span = sd->span;
 		group = find_idlest_group(sd, current, this_cpu);
 		if (!group) {
 			schedstat_inc(sd, sbe_balanced);
-			goto out;
+			goto nextlevel;
 		}
 		new_cpu = find_idlest_cpu(group, this_cpu);
 		if (new_cpu == -1 || new_cpu == this_cpu) {
 			schedstat_inc(sd, sbe_balanced);
-			goto out;
+			goto nextlevel;
 		}
 
 		schedstat_inc(sd, sbe_pushed);
 		put_cpu();
 		sched_migrate_task(current, new_cpu);
-		return;
+		
+		/* Now try balancing at a lower domain level */
+		this_cpu = get_cpu();
+nextlevel:
+		sd = NULL;
+		for_each_domain(this_cpu, tmp) {
+			if (cpus_subset(span, tmp->span))
+				break;
+			if (tmp->flags & SD_BALANCE_EXEC)
+				sd = tmp;
+		}
+
+		if (sd)
+			goto again;
 	}
-out:
+
 	put_cpu();
 }
 

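The descent can be seen in miniature below: a toy user-space walk over a
hard-coded 8-CPU hierarchy (hypothetical names, not kernel code; toy_idlest()
stands in for find_idlest_group()/find_idlest_cpu() and just picks the
highest CPU in a span):

#include <stdio.h>

#define TOY_BALANCE 0x1

struct toy_domain {
	unsigned long span;		/* CPU bitmask */
	int flags;
	struct toy_domain *parent;
	const char *name;
};

/* Two SMT pairs per 4-CPU node, two nodes per 8-CPU system. */
static struct toy_domain sys   = { 0xff, TOY_BALANCE, NULL,   "system" };
static struct toy_domain node0 = { 0x0f, TOY_BALANCE, &sys,   "node0"  };
static struct toy_domain node1 = { 0xf0, TOY_BALANCE, &sys,   "node1"  };
static struct toy_domain smt0  = { 0x03, TOY_BALANCE, &node0, "smt01"  };
static struct toy_domain smt1  = { 0x0c, TOY_BALANCE, &node0, "smt23"  };
static struct toy_domain smt2  = { 0x30, TOY_BALANCE, &node1, "smt45"  };
static struct toy_domain smt3  = { 0xc0, TOY_BALANCE, &node1, "smt67"  };

static struct toy_domain *base[8] = {
	&smt0, &smt0, &smt1, &smt1, &smt2, &smt2, &smt3, &smt3
};

static int toy_idlest(unsigned long span)
{
	int cpu = 0;
	while (span >>= 1)
		cpu++;			/* highest CPU in the span */
	return cpu;
}

int main(void)
{
	struct toy_domain *tmp, *sd = NULL;
	int cpu = 0;

	/* Topmost domain with the flag set... */
	for (tmp = base[cpu]; tmp; tmp = tmp->parent)
		if (tmp->flags & TOY_BALANCE)
			sd = tmp;

	/* ...then rebalance at every level on the way down, restarting
	 * the walk from the CPU we just moved to. */
	while (sd) {
		unsigned long span = sd->span;

		cpu = toy_idlest(span);
		printf("%-6s: moved to cpu %d\n", sd->name, cpu);

		sd = NULL;
		for (tmp = base[cpu]; tmp; tmp = tmp->parent) {
			if ((span & tmp->span) == span)
				break;	/* cpus_subset(span, tmp->span) */
			if (tmp->flags & TOY_BALANCE)
				sd = tmp;
		}
	}
	return 0;
}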

* [patch 4/5] sched: RCU sched domains
  2005-04-05 23:46   ` [patch 3/5] sched: multilevel sbe and sbf Nick Piggin
@ 2005-04-05 23:47     ` Nick Piggin
  2005-04-05 23:49       ` [patch 5/5] sched: consolidate sbe sbf Nick Piggin
  2005-04-06  6:18       ` [patch 4/5] sched: RCU sched domains Ingo Molnar
  2005-04-06  5:54     ` [patch 3/5] sched: multilevel sbe and sbf Ingo Molnar
  1 sibling, 2 replies; 27+ messages in thread
From: Nick Piggin @ 2005-04-05 23:47 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Ingo Molnar, Siddha, Suresh B

[-- Attachment #1: Type: text/plain, Size: 4 bytes --]

4/5

[-- Attachment #2: sched-rcu-domains.patch --]
[-- Type: text/plain, Size: 3445 bytes --]

One of the problems with the multilevel balance-on-fork/exec is that it
needs to jump through hoops to satisfy sched-domain's locking semantics
(that is, you may traverse your own domain when not preemptable, and
you may traverse others' domains when holding their runqueue lock).

balance-on-exec had to potentially migrate between more than one CPU before
finding a final CPU to migrate to, and balance-on-fork needed to potentially
take multiple runqueue locks.

So bite the bullet and make sched-domains go completely RCU. This actually
simplifies the code quite a bit.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c	2005-04-05 16:39:14.000000000 +1000
+++ linux-2.6/kernel/sched.c	2005-04-05 18:39:05.000000000 +1000
@@ -825,22 +825,12 @@ inline int task_curr(const task_t *p)
 }
 
 #ifdef CONFIG_SMP
-enum request_type {
-	REQ_MOVE_TASK,
-	REQ_SET_DOMAIN,
-};
-
 typedef struct {
 	struct list_head list;
-	enum request_type type;
 
-	/* For REQ_MOVE_TASK */
 	task_t *task;
 	int dest_cpu;
 
-	/* For REQ_SET_DOMAIN */
-	struct sched_domain *sd;
-
 	struct completion done;
 } migration_req_t;
 
@@ -862,7 +852,6 @@ static int migrate_task(task_t *p, int d
 	}
 
 	init_completion(&req->done);
-	req->type = REQ_MOVE_TASK;
 	req->task = p;
 	req->dest_cpu = dest_cpu;
 	list_add(&req->list, &rq->migration_queue);
@@ -4365,17 +4354,9 @@ static int migration_thread(void * data)
 		req = list_entry(head->next, migration_req_t, list);
 		list_del_init(head->next);
 
-		if (req->type == REQ_MOVE_TASK) {
-			spin_unlock(&rq->lock);
-			__migrate_task(req->task, cpu, req->dest_cpu);
-			local_irq_enable();
-		} else if (req->type == REQ_SET_DOMAIN) {
-			rq->sd = req->sd;
-			spin_unlock_irq(&rq->lock);
-		} else {
-			spin_unlock_irq(&rq->lock);
-			WARN_ON(1);
-		}
+		spin_unlock(&rq->lock);
+		__migrate_task(req->task, cpu, req->dest_cpu);
+		local_irq_enable();
 
 		complete(&req->done);
 	}
@@ -4606,7 +4587,6 @@ static int migration_call(struct notifie
 			migration_req_t *req;
 			req = list_entry(rq->migration_queue.next,
 					 migration_req_t, list);
-			BUG_ON(req->type != REQ_MOVE_TASK);
 			list_del_init(&req->list);
 			complete(&req->done);
 		}
@@ -4903,10 +4883,7 @@ static int __devinit sd_parent_degenerat
  */
 void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
 {
-	migration_req_t req;
-	unsigned long flags;
 	runqueue_t *rq = cpu_rq(cpu);
-	int local = 1;
 	struct sched_domain *tmp;
 
 	/* Remove the sched domains which do not contribute to scheduling. */
@@ -4923,24 +4900,7 @@ void __devinit cpu_attach_domain(struct 
 
 	sched_domain_debug(sd, cpu);
 
-	spin_lock_irqsave(&rq->lock, flags);
-
-	if (cpu == smp_processor_id() || !cpu_online(cpu)) {
-		rq->sd = sd;
-	} else {
-		init_completion(&req.done);
-		req.type = REQ_SET_DOMAIN;
-		req.sd = sd;
-		list_add(&req.list, &rq->migration_queue);
-		local = 0;
-	}
-
-	spin_unlock_irqrestore(&rq->lock, flags);
-
-	if (!local) {
-		wake_up_process(rq->migration_thread);
-		wait_for_completion(&req.done);
-	}
+	rq->sd = sd;
 }
 
 /* cpus with isolated domains */
@@ -5215,6 +5175,7 @@ static int update_sched_domains(struct n
 	case CPU_DOWN_PREPARE:
 		for_each_online_cpu(i)
 			cpu_attach_domain(NULL, i);
+		synchronize_kernel();
 		arch_destroy_sched_domains();
 		return NOTIFY_OK;
 

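The lifetime rule this establishes (an old domain tree may only be torn down
once every CPU has left its non-preemptible read side) can be sketched in
user space. A deliberately crude model follows, with a global reader count
standing in for the grace period; real RCU tracks quiescent states instead,
but synchronize() plays the role of synchronize_kernel() above:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct dom { int id; struct dom *parent; };

static _Atomic(struct dom *) base_sd;	/* rq->sd stand-in */
static atomic_int readers;

static void read_side(void)
{
	struct dom *d;

	atomic_fetch_add(&readers, 1);		/* "preempt_disable()" */
	for (d = atomic_load(&base_sd); d; d = d->parent)
		(void)d->id;			/* traverse the tree */
	atomic_fetch_sub(&readers, 1);		/* "preempt_enable()" */
}

static void synchronize(void)
{
	/* Wait for every reader that might still see the old tree.
	 * (Crude: a global count can starve; real RCU does not.) */
	while (atomic_load(&readers))
		;
}

static void *reader_thread(void *unused)
{
	(void)unused;
	for (int i = 0; i < 100000; i++)
		read_side();
	return NULL;
}

int main(void)
{
	struct dom *old_sd = calloc(1, sizeof(*old_sd));
	struct dom *new_sd = calloc(1, sizeof(*new_sd));
	pthread_t t;

	atomic_store(&base_sd, old_sd);
	pthread_create(&t, NULL, reader_thread, NULL);

	new_sd->id = 1;
	atomic_store(&base_sd, new_sd);	/* cpu_attach_domain() analogue */
	synchronize();			/* synchronize_kernel() analogue */
	free(old_sd);			/* arch_destroy_sched_domains() */
	puts("old tree freed safely");

	pthread_join(t, NULL);
	free(new_sd);
	return 0;
}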

* [patch 5/5] sched: consolidate sbe sbf
  2005-04-05 23:47     ` [patch 4/5] sched: RCU sched domains Nick Piggin
@ 2005-04-05 23:49       ` Nick Piggin
  2005-04-06  6:27         ` Ingo Molnar
  2005-04-06  6:18       ` [patch 4/5] sched: RCU sched domains Ingo Molnar
  1 sibling, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2005-04-05 23:49 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Ingo Molnar, Siddha, Suresh B

[-- Attachment #1: Type: text/plain, Size: 116 bytes --]

5/5

Any ideas about what to do with schedstats?
Do we really need balance on exec and fork as separate
statistics?

[-- Attachment #2: sched-consolidate-sbe-sbf.patch --]
[-- Type: text/plain, Size: 5228 bytes --]

Consolidate balance-on-exec with balance-on-fork. This is made easy
by the sched-domains RCU patches.

As well as the general goodness of code reduction, this allows
the runqueues to be unlocked during balance-on-fork.

schedstats is a problem. Maybe just have balance-on-event instead
of distinguishing fork and exec?

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c	2005-04-05 18:39:14.000000000 +1000
+++ linux-2.6/kernel/sched.c	2005-04-05 18:40:18.000000000 +1000
@@ -1013,8 +1013,57 @@ static int find_idlest_cpu(struct sched_
 	return idlest;
 }
 
+/*
+ * sched_balance_self: balance the current task (running on cpu) in domains
+ * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, ie. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+static int sched_balance_self(int cpu, int flag)
+{
+	struct task_struct *t = current;
+	struct sched_domain *tmp, *sd = NULL;
 
-#endif
+	for_each_domain(cpu, tmp)
+		if (tmp->flags & flag)
+			sd = tmp;
+
+	while (sd) {
+		cpumask_t span;
+		struct sched_group *group;
+		int new_cpu;
+
+		span = sd->span;
+		group = find_idlest_group(sd, t, cpu);
+		if (!group)
+			goto nextlevel;
+
+		new_cpu = find_idlest_cpu(group, cpu);
+		if (new_cpu == -1 || new_cpu == cpu)
+			goto nextlevel;
+
+		/* Now try balancing at a lower domain level */
+		cpu = new_cpu;
+nextlevel:
+		sd = NULL;
+		for_each_domain(cpu, tmp) {
+			if (cpus_subset(span, tmp->span))
+				break;
+			if (tmp->flags & flag)
+				sd = tmp;
+		}
+		/* while loop will break here if sd == NULL */
+	}
+
+	return cpu;
+}
+
+#endif /* CONFIG_SMP */
 
 /*
  * wake_idle() will wake a task on an idle cpu if task->cpu is
@@ -1295,63 +1344,22 @@ void fastcall wake_up_new_task(task_t * 
 	int this_cpu, cpu;
 	runqueue_t *rq, *this_rq;
 #ifdef CONFIG_SMP
-	struct sched_domain *tmp, *sd = NULL;
-#endif
+	int new_cpu;
 
+	cpu = task_cpu(p);
+	preempt_disable();
+	new_cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+	preempt_enable();
+	if (new_cpu != cpu)
+		set_task_cpu(p, new_cpu);
+#endif
+	
+	cpu = task_cpu(p);
 	rq = task_rq_lock(p, &flags);
-	BUG_ON(p->state != TASK_RUNNING);
 	this_cpu = smp_processor_id();
-	cpu = task_cpu(p);
-
-#ifdef CONFIG_SMP
-	for_each_domain(cpu, tmp)
-		if (tmp->flags & SD_BALANCE_FORK)
-			sd = tmp;
-
-	if (sd) {
-		cpumask_t span;
-		int new_cpu;
-		struct sched_group *group;
-
-again:
-		schedstat_inc(sd, sbf_cnt);
-		span = sd->span;
-		cpu = task_cpu(p);
-		group = find_idlest_group(sd, p, cpu);
-		if (!group) {
-			schedstat_inc(sd, sbf_balanced);
-			goto nextlevel;
-		}
 
-		new_cpu = find_idlest_cpu(group, cpu);
-		if (new_cpu == -1 || new_cpu == cpu) {
-			schedstat_inc(sd, sbf_balanced);
-			goto nextlevel;
-		}
-
-		if (cpu_isset(new_cpu, p->cpus_allowed)) {
-			schedstat_inc(sd, sbf_pushed);
-			set_task_cpu(p, new_cpu);
-			task_rq_unlock(rq, &flags);
-			rq = task_rq_lock(p, &flags);
-			cpu = task_cpu(p);
-		}
-
-		/* Now try balancing at a lower domain level */
-nextlevel:
-		sd = NULL;
-		for_each_domain(cpu, tmp) {
-			if (cpus_subset(span, tmp->span))
-				break;
-			if (tmp->flags & SD_BALANCE_FORK)
-				sd = tmp;
-		}
-
-		if (sd)
-			goto again;
-	}
+	BUG_ON(p->state != TASK_RUNNING);
 
-#endif
 	/*
 	 * We decrease the sleep average of forking parents
 	 * and children as well, to keep max-interactive tasks
@@ -1699,59 +1707,17 @@ out:
 	task_rq_unlock(rq, &flags);
 }
 
-/*
- * sched_exec(): find the highest-level, exec-balance-capable
- * domain and try to migrate the task to the least loaded CPU.
- *
- * execve() is a valuable balancing opportunity, because at this point
- * the task has the smallest effective memory and cache footprint.
+/* 
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
  */
 void sched_exec(void)
 {
-	struct sched_domain *tmp, *sd = NULL;
 	int new_cpu, this_cpu = get_cpu();
-
-	for_each_domain(this_cpu, tmp)
-		if (tmp->flags & SD_BALANCE_EXEC)
-			sd = tmp;
-
-	if (sd) {
-		cpumask_t span;
-		struct sched_group *group;
-again:
-		schedstat_inc(sd, sbe_cnt);
-		span = sd->span;
-		group = find_idlest_group(sd, current, this_cpu);
-		if (!group) {
-			schedstat_inc(sd, sbe_balanced);
-			goto nextlevel;
-		}
-		new_cpu = find_idlest_cpu(group, this_cpu);
-		if (new_cpu == -1 || new_cpu == this_cpu) {
-			schedstat_inc(sd, sbe_balanced);
-			goto nextlevel;
-		}
-
-		schedstat_inc(sd, sbe_pushed);
-		put_cpu();
-		sched_migrate_task(current, new_cpu);
-		
-		/* Now try balancing at a lower domain level */
-		this_cpu = get_cpu();
-nextlevel:
-		sd = NULL;
-		for_each_domain(this_cpu, tmp) {
-			if (cpus_subset(span, tmp->span))
-				break;
-			if (tmp->flags & SD_BALANCE_EXEC)
-				sd = tmp;
-		}
-
-		if (sd)
-			goto again;
-	}
-
+	new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
 	put_cpu();
+	if (new_cpu != this_cpu)
+		sched_migrate_task(current, new_cpu);
 }
 
 /*


* Re: [patch 1/5] sched: remove degenerate domains
  2005-04-05 23:44 [patch 1/5] sched: remove degenerate domains Nick Piggin
  2005-04-05 23:45 ` [patch 2/5] sched: NULL domains Nick Piggin
@ 2005-04-06  5:44 ` Ingo Molnar
  2005-04-06  7:10   ` Siddha, Suresh B
  2005-04-06  7:49   ` Nick Piggin
  1 sibling, 2 replies; 27+ messages in thread
From: Ingo Molnar @ 2005-04-06  5:44 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, linux-kernel, Siddha, Suresh B


* Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> This is Suresh's patch with some modifications.

> Remove degenerate scheduler domains during the sched-domain init.

actually, i'd suggest to not do this patch. The point of booting with a 
CONFIG_NUMA kernel on a non-NUMA box is mostly for testing, and the 
'degenerate' toplevel domain exposed conceptual bugs in the 
sched-domains code. In that sense removing such 'unnecessary' domains 
inhibits debuggability to a certain degree. If we had this patch earlier 
we'd have experienced the wrong decisions taken by the scheduler only on 
the much rarer 'really NUMA' boxes.

is there any case where we'd want to simplify the domain tree? One more 
domain level is just one (and very minor) aspect of CONFIG_NUMA - i'd 
not want to run a CONFIG_NUMA kernel on a non-NUMA box, even if the 
domain tree got optimized. Hm?

	Ingo


* Re: [patch 2/5] sched: NULL domains
  2005-04-05 23:45 ` [patch 2/5] sched: NULL domains Nick Piggin
  2005-04-05 23:46   ` [patch 3/5] sched: multilevel sbe and sbf Nick Piggin
@ 2005-04-06  5:45   ` Ingo Molnar
  2005-04-06  5:48     ` Ingo Molnar
  1 sibling, 1 reply; 27+ messages in thread
From: Ingo Molnar @ 2005-04-06  5:45 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, linux-kernel, Siddha, Suresh B


* Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> 2/5

> The previous patch fixed the last 2 places that directly access a
> runqueue's sched-domain and assume it cannot be NULL.
> 
> We can now use a NULL domain instead of a dummy domain to signify
> no balancing is to happen. No functional changes.
> 
> Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>

Acked-by: Ingo Molnar <mingo@elte.hu>

	Ingo


* Re: [patch 2/5] sched: NULL domains
  2005-04-06  5:45   ` [patch 2/5] sched: NULL domains Ingo Molnar
@ 2005-04-06  5:48     ` Ingo Molnar
  2005-04-06  7:51       ` Nick Piggin
  0 siblings, 1 reply; 27+ messages in thread
From: Ingo Molnar @ 2005-04-06  5:48 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, linux-kernel, Siddha, Suresh B


* Ingo Molnar <mingo@elte.hu> wrote:

> 
> * Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
> > 2/5
> 
> > The previous patch fixed the last 2 places that directly access a
> > runqueue's sched-domain and assume it cannot be NULL.
> > 
> > We can now use a NULL domain instead of a dummy domain to signify
> > no balancing is to happen. No functional changes.
> > 
> > Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
> 
> Acked-by: Ingo Molnar <mingo@elte.hu>

if the previous 'remove degenerate domains' patch would go away then 
this patch needs to be merged/modified. (and most of the others as well)

	Ingo


* Re: [patch 3/5] sched: multilevel sbe and sbf
  2005-04-05 23:46   ` [patch 3/5] sched: multilevel sbe and sbf Nick Piggin
  2005-04-05 23:47     ` [patch 4/5] sched: RCU sched domains Nick Piggin
@ 2005-04-06  5:54     ` Ingo Molnar
  2005-04-06  7:53       ` Nick Piggin
  1 sibling, 1 reply; 27+ messages in thread
From: Ingo Molnar @ 2005-04-06  5:54 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, linux-kernel, Siddha, Suresh B


* Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> 3/5

> The fundamental problem that Suresh has with balance on exec and fork
> is that it only tries to balance at the top-level domain with the flag
> set.
> 
> This was worked around by removing degenerate domains, but is still a
> problem if people want to start using more complex sched-domains, especially
> multilevel NUMA that ia64 is already using.
> 
> This patch makes balance on fork and exec try balancing over not just the
> topmost domain with the flag set, but all the way down the domain tree.
> 
> Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>

Acked-by: Ingo Molnar <mingo@elte.hu>

note that no matter how much scheduler logic we add, in the end 
cross-scheduling of tasks between nodes on NUMA will always have a 
permanent penalty (i.e. the 'migration cost' is 'infinity' in the long 
run), so the primary focus _has to be_ on 'get it right initially'. When 
tasks must spill over to other nodes, that will always remain a special case.  
So balance-on-fork/exec/[clone] definitely needs to be aware of the full 
domain tree picture.

	Ingo


* Re: [patch 4/5] sched: RCU sched domains
  2005-04-05 23:47     ` [patch 4/5] sched: RCU sched domains Nick Piggin
  2005-04-05 23:49       ` [patch 5/5] sched: consolidate sbe sbf Nick Piggin
@ 2005-04-06  6:18       ` Ingo Molnar
  2005-04-06  8:01         ` Nick Piggin
  1 sibling, 1 reply; 27+ messages in thread
From: Ingo Molnar @ 2005-04-06  6:18 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, linux-kernel, Siddha, Suresh B


* Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> 4/5

> One of the problems with the multilevel balance-on-fork/exec is that 
> it needs to jump through hoops to satisfy sched-domain's locking 
> semantics (that is, you may traverse your own domain when not 
> preemptable, and you may traverse others' domains when holding their 
> runqueue lock).
> 
> balance-on-exec had to potentially migrate between more than one CPU 
> before finding a final CPU to migrate to, and balance-on-fork needed 
> to potentially take multiple runqueue locks.
> 
> So bite the bullet and make sched-domains go completely RCU. This 
> actually simplifies the code quite a bit.
> 
> Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>

i like it conceptually, so:

Acked-by: Ingo Molnar <mingo@elte.hu>

from now on, all domain-tree readonly uses have to be rcu_read_lock()-ed 
(or otherwise have to be in a non-preemptible section). But there's a 
bug in show_schedstat() which does a for_each_domain() from within a 
preemptible section. (It was a bug with the current hotplug logic too i 
think.)

At a minimum i think we need the fix+comment below.

	Ingo

Signed-off-by: Ingo Molnar <mingo@elte.hu>

--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -260,6 +260,10 @@ struct runqueue {
 
 static DEFINE_PER_CPU(struct runqueue, runqueues);
 
+/*
+ * The domain tree (rq->sd) is RCU locked. I.e. it may only be accessed
+ * from within an rcu_read_lock() [or otherwise preempt-disabled] section.
+ */
 #define for_each_domain(cpu, domain) \
 	for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
 
@@ -338,6 +342,7 @@ static int show_schedstat(struct seq_fil
 
 #ifdef CONFIG_SMP
 		/* domain-specific stats */
+		rcu_read_lock();
 		for_each_domain(cpu, sd) {
 			enum idle_type itype;
 			char mask_str[NR_CPUS];
@@ -361,6 +366,7 @@ static int show_schedstat(struct seq_fil
 			    sd->sbe_pushed, sd->sbe_attempts,
 			    sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
 		}
+		rcu_read_unlock();
 #endif
 	}
 	return 0;
 


* Re: [patch 5/5] sched: consolidate sbe sbf
  2005-04-05 23:49       ` [patch 5/5] sched: consolidate sbe sbf Nick Piggin
@ 2005-04-06  6:27         ` Ingo Molnar
  2005-04-06  8:09           ` Nick Piggin
  0 siblings, 1 reply; 27+ messages in thread
From: Ingo Molnar @ 2005-04-06  6:27 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, linux-kernel, Siddha, Suresh B


* Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> 5/5
> 
> Any ideas about what to do with schedstats?
> Do we really need balance on exec and fork as separate
> statistics?

> Consolidate balance-on-exec with balance-on-fork. This is made easy
> by the sched-domains RCU patches.
> 
> As well as the general goodness of code reduction, this allows
> the runqueues to be unlocked during balance-on-fork.
> 
> schedstats is a problem. Maybe just have balance-on-event instead
> of distinguishing fork and exec?
> 
> Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>

looks good.

 Acked-by: Ingo Molnar <mingo@elte.hu>

while the code is now consolidated, i think we still need the separate 
fork/exec stats for schedstat. We have 3 conceptual ways to start off a 
new context: fork(), pthread_create() and execve(), and applications use 
them in different patterns. We have different flags and tuning 
parameters for two of them (which might have to become 3, i'm not 
entirely convinced we'll be able to ignore the 'process vs. thread' 
condition in wake_up_new_task(), STREAM is a really bad example in that 
sense), so we need 2 (or 3) separate stats.

	Ingo


* Re: [patch 1/5] sched: remove degenerate domains
  2005-04-06  5:44 ` [patch 1/5] sched: remove degenerate domains Ingo Molnar
@ 2005-04-06  7:10   ` Siddha, Suresh B
  2005-04-06  7:13     ` Ingo Molnar
  2005-04-06  7:49   ` Nick Piggin
  1 sibling, 1 reply; 27+ messages in thread
From: Siddha, Suresh B @ 2005-04-06  7:10 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Nick Piggin, Andrew Morton, linux-kernel, Siddha, Suresh B

On Wed, Apr 06, 2005 at 07:44:12AM +0200, Ingo Molnar wrote:
> 
> * Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
> > This is Suresh's patch with some modifications.
> 
> > Remove degenerate scheduler domains during the sched-domain init.
> 
> actually, i'd suggest to not do this patch. The point of booting with a 
> CONFIG_NUMA kernel on a non-NUMA box is mostly for testing, and the 

Not really. All of the x86_64 kernels are NUMA enabled, and most Intel x86_64
systems today are non-NUMA.

> 'degenerate' toplevel domain exposed conceptual bugs in the 
> sched-domains code. In that sense removing such 'unnecessary' domains 
> inhibits debuggability to a certain degree. If we had this patch earlier 
> we'd have experienced the wrong decisions taken by the scheduler only on 
> the much rarer 'really NUMA' boxes.
> 
> is there any case where we'd want to simplify the domain tree? One more 
> domain level is just one (and very minor) aspect of CONFIG_NUMA - i'd 
> not want to run a CONFIG_NUMA kernel on a non-NUMA box, even if the 
> domain tree got optimized. Hm?
> 

Ingo, pardon me! Actually I used the NUMA domain as an excuse to push the
domain degenerate patch... As I mentioned earlier, we should remove the SMT
domain on a non-HT capable system.

Similarly I am working on adding a new core domain for dual-core systems!
All these domains are unnecessary and cause performance issues on
non multi-threading/multi-core capable CPUs! Agreed that the performance 
impact will be minor but still...

thanks,
suresh


* Re: [patch 1/5] sched: remove degenerate domains
  2005-04-06  7:10   ` Siddha, Suresh B
@ 2005-04-06  7:13     ` Ingo Molnar
  2005-04-06  8:12       ` Nick Piggin
  0 siblings, 1 reply; 27+ messages in thread
From: Ingo Molnar @ 2005-04-06  7:13 UTC (permalink / raw)
  To: Siddha, Suresh B; +Cc: Nick Piggin, Andrew Morton, linux-kernel


* Siddha, Suresh B <suresh.b.siddha@intel.com> wrote:

> Similarly I am working on adding a new core domain for dual-core 
> systems! All these domains are unnecessary and cause performance 
> issues on non multi-threading/multi-core capable CPUs! Agreed that 
> the performance impact will be minor but still...

ok, let's keep it then. It may in fact simplify the domain setup code: we 
could generate the 'most generic' layout for a given arch all the time, 
and then optimize it automatically. I.e. in theory we could have just a 
single domain-setup routine, which would e.g. generate the NUMA domains 
on SMP too, which would then be optimized away.

	Ingo


* Re: [patch 1/5] sched: remove degenerate domains
  2005-04-06  5:44 ` [patch 1/5] sched: remove degenerate domains Ingo Molnar
  2005-04-06  7:10   ` Siddha, Suresh B
@ 2005-04-06  7:49   ` Nick Piggin
  2005-04-07  7:00     ` Ingo Molnar
  1 sibling, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2005-04-06  7:49 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, linux-kernel, Siddha, Suresh B

Ingo Molnar wrote:
> * Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
> 
>>This is Suresh's patch with some modifications.
> 
> 
>>Remove degenerate scheduler domains during the sched-domain init.
> 
> 
> actually, i'd suggest to not do this patch. The point of booting with a 
> CONFIG_NUMA kernel on a non-NUMA box is mostly for testing, and the 
> 'degenerate' toplevel domain exposed conceptual bugs in the 
> sched-domains code. In that sense removing such 'unnecessary' domains 
> inhibits debuggability to a certain degree. If we had this patch earlier 
> we'd have experienced the wrong decisions taken by the scheduler only on 
> the much rarer 'really NUMA' boxes.
> 

True. Although I'd imagine it may be something distros would want.
For example, a generic x86-64 kernel for both AMD and Intel systems
could easily have SMT and NUMA turned on.

I agree with the downside of exercising less code paths though.

What about putting it in as a config option (defaulting to off for 2.6)
under the embedded config menu?

> is there any case where we'd want to simplify the domain tree? One more 
> domain level is just one (and very minor) aspect of CONFIG_NUMA - i'd 
> not want to run a CONFIG_NUMA kernel on a non-NUMA box, even if the 
> domain tree got optimized. Hm?
> 

I guess there is the SMT issue too, and even booting an SMP kernel
on a UP system. Also small ia64 NUMA systems will probably have one
redundant NUMA level.

If/when topologies get more complex (for example, the recent Altix
discussions we had with Paul), it will be generally easier to set
up all levels in a generic way, then weed them out using something
like this, rather than put the logic in the domain setup code.

Nick

-- 
SUSE Labs, Novell Inc.



* Re: [patch 2/5] sched: NULL domains
  2005-04-06  5:48     ` Ingo Molnar
@ 2005-04-06  7:51       ` Nick Piggin
  0 siblings, 0 replies; 27+ messages in thread
From: Nick Piggin @ 2005-04-06  7:51 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, linux-kernel, Siddha, Suresh B

Ingo Molnar wrote:
> * Ingo Molnar <mingo@elte.hu> wrote:
> 
> 
>>* Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>>
>>
>>>2/5
>>
>>>The previous patch fixed the last 2 places that directly access a
>>>runqueue's sched-domain and assume it cannot be NULL.
>>>
>>>We can now use a NULL domain instead of a dummy domain to signify
>>>no balancing is to happen. No functional changes.
>>>
>>>Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
>>
>>Acked-by: Ingo Molnar <mingo@elte.hu>
   ^^^

Thanks.

> 
> 
> if the previous 'remove degenerate domains' patch would go away then 
> this patch needs to be merged/modified. (and most of the others as well)
> 

I probably should respin this so it goes in *first* anyway,
rather than doing half in the remove-degenerate-domains patch
and half here.

-- 
SUSE Labs, Novell Inc.



* Re: [patch 3/5] sched: multilevel sbe and sbf
  2005-04-06  5:54     ` [patch 3/5] sched: multilevel sbe and sbf Ingo Molnar
@ 2005-04-06  7:53       ` Nick Piggin
  0 siblings, 0 replies; 27+ messages in thread
From: Nick Piggin @ 2005-04-06  7:53 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, linux-kernel, Siddha, Suresh B

Ingo Molnar wrote:

> Acked-by: Ingo Molnar <mingo@elte.hu>
> 
> note that no matter how much scheduler logic we add, in the end 
> cross-scheduling of tasks between nodes on NUMA will always have a 
> permanent penalty (i.e. the 'migration cost' is 'infinity' in the long 
> run), so the primary focus _has to be_ on 'get it right initially'. When 
> tasks must spill over to other nodes, that will always remain a special case.  
> So balance-on-fork/exec/[clone] definitely needs to be aware of the full 
> domain tree picture.
> 

Yes, well put. I imagine this will only become more important as the
push towards multiprocessing machines grows, along with the need for
higher memory bandwidth and lower latency to CPUs.


-- 
SUSE Labs, Novell Inc.



* Re: [patch 4/5] sched: RCU sched domains
  2005-04-06  6:18       ` [patch 4/5] sched: RCU sched domains Ingo Molnar
@ 2005-04-06  8:01         ` Nick Piggin
  2005-04-07  7:11           ` Ingo Molnar
  0 siblings, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2005-04-06  8:01 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, linux-kernel, Siddha, Suresh B

Ingo Molnar wrote:
> * Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
> 
>>4/5
> 
> 
>>One of the problems with the multilevel balance-on-fork/exec is that 
>>it needs to jump through hoops to satisfy sched-domain's locking 
>>semantics (that is, you may traverse your own domain when not 
>>preemptable, and you may traverse others' domains when holding their 
>>runqueue lock).
>>
>>balance-on-exec had to potentially migrate between more than one CPU 
>>before finding a final CPU to migrate to, and balance-on-fork needed 
>>to potentially take multiple runqueue locks.
>>
>>So bite the bullet and make sched-domains go completely RCU. This 
>>actually simplifies the code quite a bit.
>>
>>Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
> 
> 
> i like it conceptually, so:
> 
> Acked-by: Ingo Molnar <mingo@elte.hu>
> 

Oh good, thanks.

> from now on, all domain-tree readonly uses have to be rcu_read_lock()-ed 
> (or otherwise have to be in a non-preemptible section). But there's a 
> bug in show_schedstat() which does a for_each_domain() from within a 
> preemptible section. (It was a bug with the current hotplug logic too i 
> think.)
> 

Ah, thanks. That looks like a bug in the code with the locking
we have now too...

> At a minimum i think we need the fix+comment below.
> 

Well if we say "this is actually RCU", then yes. And we should
probably change the preempt_{dis|en}ables in other places to
rcu_read_lock.

OTOH, if we say we just want all running threads to process through
a preemption stage, then this would just be a preempt_disable/enable
pair.

In practice that makes no difference yet, but it looks like you and
Paul are working to distinguish these two cases in the RCU code, to
accommodate your low latency RCU stuff?

I'd prefer the latter (ie. just disable preempt, and use
synchronize_sched), but I'm not too sure of what is going on with
your low latency RCU work...?

> 	Ingo
> 
> Signed-off-by: Ingo Molnar <mingo@elte.hu>
> 

Thanks for catching that. I may just push it through first as a fix
to the current 2.6 schedstats code (using preempt_disable), and
afterwards we can change it to rcu_read_lock if that is required.

-- 
SUSE Labs, Novell Inc.



* Re: [patch 5/5] sched: consolidate sbe sbf
  2005-04-06  6:27         ` Ingo Molnar
@ 2005-04-06  8:09           ` Nick Piggin
  2005-04-06  8:16             ` Nick Piggin
  2005-04-07  7:15             ` Ingo Molnar
  0 siblings, 2 replies; 27+ messages in thread
From: Nick Piggin @ 2005-04-06  8:09 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, linux-kernel, Siddha, Suresh B

Ingo Molnar wrote:
> * Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
> 
>>5/5
>>
>>Any ideas about what to do with schedstats?
>>Do we really need balance on exec and fork as separate
>>statistics?
> 
> 
>>Consolidate balance-on-exec with balance-on-fork. This is made easy
>>by the sched-domains RCU patches.
>>
>>As well as the general goodness of code reduction, this allows
>>the runqueues to be unlocked during balance-on-fork.
>>
>>schedstats is a problem. Maybe just have balance-on-event instead
>>of distinguishing fork and exec?
>>
>>Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
> 
> 
> looks good.
> 

One problem I just noticed, sorry. This is doing set_cpus_allowed
without holding the runqueue lock and without checking the hard
affinity mask either.

We could just do a set_cpus_allowed, or take the lock, set_cpus_allowed,
and take the new lock, but that's probably a bit heavy if we can avoid it.
In the interests of speed in this fast path, do you think we can do this
in sched_fork, before the task has even been put on the tasklist?

That would avoid all locking problems. Passing clone_flags into sched_fork
would not be a problem if we want to distinguish fork() and clone(CLONE_VM).

Yes? I'll cut a new patch to do just that.

>  Acked-by: Ingo Molnar <mingo@elte.hu>
> 
> while the code is now consolidated, i think we still need the separate 
> fork/exec stats for schedstat.

This makes it a bit harder, then, to get good stats in the sched-domain
(which is really what we want). It would basically mean doing:
if (balance fork)
	schedstat_inc(sbf_cnt);
else if (balance exec)
	schedstat_inc(sbe_cnt);
etc.

That should all get optimised out by the compiler, but still a bit ugly.
Any ideas?
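
One possible shape, as a toy sketch (hypothetical names, nothing from
the patches): key the counters by balance event, so the separate
fork/exec numbers survive while the consolidated path keeps a single
increment site.

#include <stdio.h>

enum sb_event { SB_FORK, SB_EXEC, SB_NR };

struct toy_stats {
	unsigned long cnt[SB_NR];	/* sbf_cnt / sbe_cnt folded together */
	unsigned long balanced[SB_NR];
};

static struct toy_stats stats;

static int toy_balance_self(int cpu, enum sb_event ev)
{
	stats.cnt[ev]++;		/* one site serves both events */
	/* ... find idlest group/cpu; pretend nothing better was found: */
	stats.balanced[ev]++;
	return cpu;
}

int main(void)
{
	toy_balance_self(0, SB_FORK);
	toy_balance_self(0, SB_EXEC);
	printf("fork: %lu, exec: %lu\n",
	       stats.cnt[SB_FORK], stats.cnt[SB_EXEC]);
	return 0;
}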


-- 
SUSE Labs, Novell Inc.



* Re: [patch 1/5] sched: remove degenerate domains
  2005-04-06  7:13     ` Ingo Molnar
@ 2005-04-06  8:12       ` Nick Piggin
  0 siblings, 0 replies; 27+ messages in thread
From: Nick Piggin @ 2005-04-06  8:12 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Siddha, Suresh B, Andrew Morton, linux-kernel

Ingo Molnar wrote:
> * Siddha, Suresh B <suresh.b.siddha@intel.com> wrote:
> 
> 
>>Similarly I am working on adding a new core domain for dual-core 
>>systems! All these domains are unnecessary and cause performance 
>>issues on non multi-threading/multi-core capable CPUs! Agreed that 
>>the performance impact will be minor but still...
> 
> 
> ok, let's keep it then. It may in fact simplify the domain setup code: we 
> could generate the 'most generic' layout for a given arch all the time, 
> and then optimize it automatically. I.e. in theory we could have just a 
> single domain-setup routine, which would e.g. generate the NUMA domains 
> on SMP too, which would then be optimized away.
> 

Yep, exactly. Even so, Andrew: please ignore this patch series
and I'll redo it for you when we all agree on everything.

Thanks.

-- 
SUSE Labs, Novell Inc.



* Re: [patch 5/5] sched: consolidate sbe sbf
  2005-04-06  8:09           ` Nick Piggin
@ 2005-04-06  8:16             ` Nick Piggin
  2005-04-07  7:17               ` Ingo Molnar
  2005-04-07  7:15             ` Ingo Molnar
  1 sibling, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2005-04-06  8:16 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, linux-kernel, Siddha, Suresh B

Nick Piggin wrote:

> 
> One problem I just noticed, sorry. This is doing set_cpus_allowed
> without holding the runqueue lock and without checking the hard
> affinity mask either.
> 

Err, that is to say set_task_cpu, not set_cpus_allowed.

-- 
SUSE Labs, Novell Inc.



* Re: [patch 1/5] sched: remove degenerate domains
  2005-04-06  7:49   ` Nick Piggin
@ 2005-04-07  7:00     ` Ingo Molnar
  0 siblings, 0 replies; 27+ messages in thread
From: Ingo Molnar @ 2005-04-07  7:00 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, linux-kernel, Siddha, Suresh B


* Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> [...] Although I'd imagine it may be something distros would want. For 
> example, a generic x86-64 kernel for both AMD and Intel systems could 
> easily have SMT and NUMA turned on.

yes, that's true - in fact reducing the number of separate kernel 
packages is of utmost importance to all distributions. (I'm not sure we 
are there yet with CONFIG_NUMA, but small steps won't hurt.)

> I agree with the downside of exercising less code paths though.

if we make CONFIG_NUMA good enough on small boxes so that distributors 
can turn it on, then in the long run the loss could be offset by the win 
the extra QA gives.

> >is there any case where we'd want to simplify the domain tree? One more 
> >domain level is just one (and very minor) aspect of CONFIG_NUMA - i'd 
> >not want to run a CONFIG_NUMA kernel on a non-NUMA box, even if the 
> >domain tree got optimized. Hm?
> 
> I guess there is the SMT issue too, and even booting an SMP kernel on 
> a UP system. Also small ia64 NUMA systems will probably have one 
> redundant NUMA level.

i think most factors in not running an SMP kernel on a UP box are not 
due to scheduler overhead: the biggest cost is spinlock overhead. Someone 
should try a little prototype: use the 'alternate instructions' 
framework to patch calls to the spinlock functions into NOPs, and 
benchmark the resulting kernel against UP. If it's "good enough", 
distros will use it. Having just a single binary kernel RPM that 
supports everything from NUMA through SMP to UP is the holy grail of 
distros. (especially the ones that offer commercial support and 
services.)

this is probably not possible on x86 - e.g. it would probably be 
expensive (in terms of runtime cost) to make the PAE/non-PAE decision 
at runtime (the distro boot kernel needs to be non-PAE). But for newer 
arches like x64 it should be easier.

> If/when topologies get more complex (for example, the recent Altix 
> discussions we had with Paul), it will be generally easier to set up 
> all levels in a generic way, then weed them out using something like 
> this, rather than put the logic in the domain setup code.

ok. That should also make it easier to put more of the arch domain setup 
code into sched.c. E.g. i'm still uneasy about having so much 
scheduler code in arch/ia64/kernel/domain.c, and all the ripple effects. 
(the #ifdefs, include file impact, etc.)

	Ingo


* Re: [patch 4/5] sched: RCU sched domains
  2005-04-06  8:01         ` Nick Piggin
@ 2005-04-07  7:11           ` Ingo Molnar
  2005-04-07  7:58             ` Nick Piggin
  0 siblings, 1 reply; 27+ messages in thread
From: Ingo Molnar @ 2005-04-07  7:11 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, linux-kernel, Siddha, Suresh B


* Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> > At a minimum i think we need the fix+comment below.
> 
> Well if we say "this is actually RCU", then yes. And we should 
> probably change the preempt_{dis|en}ables in other places to 
> rcu_read_lock.
> 
> OTOH, if we say we just want all running threads to process through a 
> preemption stage, then this would just be a preempt_disable/enable 
> pair.
> 
> In practice that makes no difference yet, but it looks like you and 
> Paul are working to distinguish these two cases in the RCU code, to 
> accommodate your low latency RCU stuff?

it doesn't impact PREEMPT_RCU/PREEMPT_RT directly, because the scheduler 
itself always needs to be non-preemptible.

those few places where we currently do preempt_disable(), which should 
thus be rcu_read_lock(), are never in codepaths that can take a lot of 
time.

but yes, in principle you are right, but in this particular (and 
special) case it's not a big issue. We should document the RCU read-lock 
dependencies cleanly and make all rcu-read-lock cases truly 
rcu_read_lock(), but it's not a pressing issue even considering possible 
future features like PREEMPT_RT.

the only danger in this area is to PREEMPT_RT: it is a bug on PREEMPT_RT 
if kernel code has an implicit 'spinlock means preempt-off and thus 
RCU-read-lock' assumption. Most of the time these get discovered via 
PREEMPT_DEBUG. (preempt_disable() disables preemption on PREEMPT_RT too, 
so that is not a problem either.)

	Ingo


* Re: [patch 5/5] sched: consolidate sbe sbf
  2005-04-06  8:09           ` Nick Piggin
  2005-04-06  8:16             ` Nick Piggin
@ 2005-04-07  7:15             ` Ingo Molnar
  1 sibling, 0 replies; 27+ messages in thread
From: Ingo Molnar @ 2005-04-07  7:15 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, linux-kernel, Siddha, Suresh B


* Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> We could just do a set_cpus_allowed, or take the lock, 
> set_cpus_allowed, and take the new lock, but that's probably a bit 
> heavy if we can avoid it. In the interests of speed in this fast path, 
> do you think we can do this in sched_fork, before the task has even 
> been put on the tasklist?

yeah, that shouldn't be a problem. Technically we set cpus_allowed up 
under the tasklist lock just to be non-preemptible and to copy the 
parent's _current_ affinity to the child. But sched_fork() is called 
just before, and if the parent's affinity changed between the two 
calls, so what? I'd move all of this code into sched_fork().

> That would avoid all locking problems. Passing clone_flags into 
> sched_fork would not be a problem if we want to distinguish fork() and 
> clone(CLONE_VM).

sure, that was the plan all along with sched_fork() anyway. (i think the 
initial versions had the flag)

> Yes? I'll cut a new patch to do just that.

sure, fine by me.

	Ingo


* Re: [patch 5/5] sched: consolidate sbe sbf
  2005-04-06  8:16             ` Nick Piggin
@ 2005-04-07  7:17               ` Ingo Molnar
  0 siblings, 0 replies; 27+ messages in thread
From: Ingo Molnar @ 2005-04-07  7:17 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, linux-kernel, Siddha, Suresh B


* Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> Nick Piggin wrote:
> 
> >
> >One problem I just noticed, sorry. This is doing set_cpus_allowed
> >without holding the runqueue lock and without checking the hard
> >affinity mask either.
> >
> 
> Err, that is to say set_task_cpu, not set_cpus_allowed.

yes. The whole cpus_allowed+set_task_cpu() section in copy_process() 
should move into sched_fork().

	Ingo


* Re: [patch 4/5] sched: RCU sched domains
  2005-04-07  7:11           ` Ingo Molnar
@ 2005-04-07  7:58             ` Nick Piggin
  2005-04-11 22:15               ` Paul E. McKenney
  0 siblings, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2005-04-07  7:58 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, linux-kernel, Siddha, Suresh B

Ingo Molnar wrote:
> * Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
> 
>>>At a minimum i think we need the fix+comment below.
>>
>>Well if we say "this is actually RCU", then yes. And we should 
>>probably change the preempt_{dis|en}ables in other places to 
>>rcu_read_lock.
>>
>>OTOH, if we say we just want all running threads to process through a 
>>preemption stage, then this would just be a preempt_disable/enable 
>>pair.
>>
>>In practice that makes no difference yet, but it looks like you and 
>>Paul are working to distinguish these two cases in the RCU code, to 
>>accommodate your low latency RCU stuff?
> 
> 
> >it doesn't impact PREEMPT_RCU/PREEMPT_RT directly, because the scheduler 
> itself always needs to be non-preemptible.
> 
> those few places where we currently do preempt_disable(), which should 
> >thus be rcu_read_lock(), are never in codepaths that can take a lot of 
> time.
> 
> but yes, in principle you are right, but in this particular (and 
> special) case it's not a big issue. We should document the RCU read-lock 
> dependencies cleanly and make all rcu-read-lock cases truly 
> rcu_read_lock(), but it's not a pressing issue even considering possible 
> future features like PREEMPT_RT.
> 
> the only danger in this area is to PREEMPT_RT: it is a bug on PREEMPT_RT 
> if kernel code has an implicit 'spinlock means preempt-off and thus 
> RCU-read-lock' assumption. Most of the time these get discovered via 
> PREEMPT_DEBUG. (preempt_disable() disables preemption on PREEMPT_RT too, 
> so that is not a problem either.)
> 

OK thanks for the good explanation. So I'll keep it as is for now,
and whatever needs cleaning up later can be worked out as it comes
up.

-- 
SUSE Labs, Novell Inc.



* Re: [patch 4/5] sched: RCU sched domains
  2005-04-07  7:58             ` Nick Piggin
@ 2005-04-11 22:15               ` Paul E. McKenney
  2005-04-12  0:03                 ` Nick Piggin
  0 siblings, 1 reply; 27+ messages in thread
From: Paul E. McKenney @ 2005-04-11 22:15 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Ingo Molnar, Andrew Morton, linux-kernel, Siddha, Suresh B

On Thu, Apr 07, 2005 at 05:58:40PM +1000, Nick Piggin wrote:
> Ingo Molnar wrote:
> >* Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> >
> >
> >>>At a minimum i think we need the fix+comment below.
> >>
> >>Well if we say "this is actually RCU", then yes. And we should 
> >>probably change the preempt_{dis|en}ables in other places to 
> >>rcu_read_lock.
> >>
> >>OTOH, if we say we just want all running threads to process through a 
> >>preemption stage, then this would just be a preempt_disable/enable 
> >>pair.
> >>
> >>In practice that makes no difference yet, but it looks like you and 
> >>Paul are working to distinguish these two cases in the RCU code, to 
> >>accommodate your low latency RCU stuff?
> >
> >
> >it doesn't impact PREEMPT_RCU/PREEMPT_RT directly, because the scheduler 
> >itself always needs to be non-preemptible.
> >
> >those few places where we currently do preempt_disable(), which should 
> >thus be rcu_read_lock(), are never in codepaths that can take a lot of 
> >time.
> >
> >but yes, in principle you are right, but in this particular (and 
> >special) case it's not a big issue. We should document the RCU read-lock 
> >dependencies cleanly and make all rcu-read-lock cases truly 
> >rcu_read_lock(), but it's not a pressing issue even considering possible 
> >future features like PREEMPT_RT.
> >
> >the only danger in this area is to PREEMPT_RT: it is a bug on PREEMPT_RT 
> >if kernel code has an implicit 'spinlock means preempt-off and thus 
> >RCU-read-lock' assumption. Most of the time these get discovered via 
> >PREEMPT_DEBUG. (preempt_disable() disables preemption on PREEMPT_RT too, 
> >so that is not a problem either.)
> >
> 
> OK thanks for the good explanation. So I'll keep it as is for now,
> and whatever needs cleaning up later can be worked out as it comes
> up.

Looking forward to the split of synchronize_kernel() into synchronize_rcu()
and synchronize_sched(), the two choices are:

o	Use synchronize_rcu(), but insert rcu_read_lock()/rcu_read_unlock()
	pairs on the read side.

o	Use synchronize_sched(), and make sure all read-side code is
	under preempt_disable().

Either way, there may also need to be some rcu_dereference()s when picking
up pointers and rcu_assign_pointer()s when updating the pointers.
For example, if traversing the domain parent list is to be RCU protected,
the for_each_domain() macro should change to something like:

#define for_each_domain(cpu, domain) \
	for (domain = cpu_rq(cpu)->sd; domain; domain = rcu_dereference(domain->parent))

							Thanx, Paul


* Re: [patch 4/5] sched: RCU sched domains
  2005-04-11 22:15               ` Paul E. McKenney
@ 2005-04-12  0:03                 ` Nick Piggin
  0 siblings, 0 replies; 27+ messages in thread
From: Nick Piggin @ 2005-04-12  0:03 UTC (permalink / raw)
  To: paulmck; +Cc: Ingo Molnar, Andrew Morton, linux-kernel, Siddha, Suresh B

Paul E. McKenney wrote:
> On Thu, Apr 07, 2005 at 05:58:40PM +1000, Nick Piggin wrote:
> 

>>
>>OK thanks for the good explanation. So I'll keep it as is for now,
>>and whatever needs cleaning up later can be worked out as it comes
>>up.
> 
> 
> Looking forward to the split of synchronize_kernel() into synchronize_rcu()
> and synchronize_sched(), the two choices are:
> 
> o	Use synchronize_rcu(), but insert rcu_read_lock()/rcu_read_unlock()
> 	pairs on the read side.
> 
> o	Use synchronize_sched(), and make sure all read-side code is
> 	under preempt_disable().
> 

Yep, I think we'll go for the second option initially (because that
pretty closely matches the homebrew locking scheme that it used to
use).

> Either way, there may also need to be some rcu_dereference()s when picking
> up pointer and rcu_assign_pointer()s when updating the pointers.
> For example, if traversing the domain parent list is to be RCU protected,
> the for_each_domain() macro should change to something like:
> 

Yes, I think you're right, because there are no barriers or synchronisation
when attaching a new domain. Just a small point though:

> #define for_each_domain(cpu, domain) \
> 	for (domain = cpu_rq(cpu)->sd; domain; domain = rcu_dereference(domain->parent))
> 

This should probably be done like so?

#define for_each_domain(cpu, domain) \
	for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)

And I think it would be wise to use rcu_assign_pointer in the update too.
Thanks Paul.
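
For what it's worth, the pair maps onto release/consume ordering:
rcu_assign_pointer() is a release store and rcu_dereference() a
dependency-ordered load. A toy user-space C11 analogue (illustrative
names only, not kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct dom {
	int span;
	int flags;
};

static _Atomic(struct dom *) current_sd;

static void *reader(void *unused)
{
	struct dom *d;

	(void)unused;
	/* rcu_dereference() analogue: dependency-ordered load. */
	while (!(d = atomic_load_explicit(&current_sd,
					  memory_order_consume)))
		;	/* spin until the updater publishes */
	/* Initialization below is guaranteed visible here. */
	printf("span=%#x flags=%#x\n", d->span, d->flags);
	return NULL;
}

int main(void)
{
	static struct dom d;
	pthread_t t;

	pthread_create(&t, NULL, reader, NULL);

	d.span = 0xf;
	d.flags = 0x1;
	/* rcu_assign_pointer() analogue: release store publishes
	 * the fully initialized structure. */
	atomic_store_explicit(&current_sd, &d, memory_order_release);

	pthread_join(t, NULL);
	return 0;
}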

-- 
SUSE Labs, Novell Inc.


