* [rfc][patch] sched: remove smpnice
@ 2006-02-07 14:28 Nick Piggin
  2006-02-07 14:57 ` Con Kolivas
  0 siblings, 1 reply; 33+ messages in thread
From: Nick Piggin @ 2006-02-07 14:28 UTC (permalink / raw)
  To: Andrew Morton, Ingo Molnar, Con Kolivas, Steven Rostedt,
	Peter Williams, Suresh B, Linux Kernel Mailing List
  Cc: Linus Torvalds

I'd like to get some comments on removing smpnice for 2.6.16. I don't
think the code is quite ready, which is why I asked for Peter's additions
to also be merged before I acked it (although it turned out that it still
isn't quite ready with his additions either).

Basically I have had similar observations to Suresh in that it does not
play nicely with the rest of the balancing infrastructure (and raised
similar concerns in my review).

The samples (group of 4) I got for "maximum recorded imbalance" on a 2x2
SMP+HT Xeon are as follows:

           | Following boot | hackbench 20        | hackbench 40
-----------+----------------+---------------------+---------------------
2.6.16-rc2 | 30,37,100,112  | 5600,5530,6020,6090 | 6390,7090,8760,8470
+nosmpnice |  3, 2,  4,  2  |   28, 150, 294, 132 |  348, 348, 294, 347

Hackbench raw performance is down around 15% with smpnice (but that in
itself isn't a huge deal because it is just a benchmark). However, the
samples show that the imbalance passed into move_tasks is increased by
about a factor of 10-30. I think this would also go some way to
explaining latency blips turning up in the balancing code (though I
haven't actually measured that).
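
For reference, a minimal sketch of the kind of instrumentation that can
collect such numbers (hypothetical, not the exact patch used for the
measurements above):

/*
 * Hypothetical instrumentation sketch: track the largest imbalance value
 * that load_balance() hands to move_tasks(), and report it from time to
 * time (e.g. via printk or a schedstats field).
 */
static unsigned long max_recorded_imbalance;

static inline void record_imbalance(unsigned long imbalance)
{
	if (imbalance > max_recorded_imbalance)
		max_recorded_imbalance = imbalance;
}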

We'll probably have to revert this in the SUSE kernel.

The other option for 2.6.16 would be to fast track Peter's stuff, which
I could put some time into... but that seems a bit risky at this stage
of the game.

I'd like to hear any other suggestions though. Patch included to aid
discussion at this stage, rather than to encourage any rash decisions.

Nick
--

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -215,7 +215,6 @@ struct runqueue {
 	 */
 	unsigned long nr_running;
 #ifdef CONFIG_SMP
-	unsigned long prio_bias;
 	unsigned long cpu_load[3];
 #endif
 	unsigned long long nr_switches;
@@ -671,68 +670,13 @@ static int effective_prio(task_t *p)
 	return prio;
 }
 
-#ifdef CONFIG_SMP
-static inline void inc_prio_bias(runqueue_t *rq, int prio)
-{
-	rq->prio_bias += MAX_PRIO - prio;
-}
-
-static inline void dec_prio_bias(runqueue_t *rq, int prio)
-{
-	rq->prio_bias -= MAX_PRIO - prio;
-}
-
-static inline void inc_nr_running(task_t *p, runqueue_t *rq)
-{
-	rq->nr_running++;
-	if (rt_task(p)) {
-		if (p != rq->migration_thread)
-			/*
-			 * The migration thread does the actual balancing. Do
-			 * not bias by its priority as the ultra high priority
-			 * will skew balancing adversely.
-			 */
-			inc_prio_bias(rq, p->prio);
-	} else
-		inc_prio_bias(rq, p->static_prio);
-}
-
-static inline void dec_nr_running(task_t *p, runqueue_t *rq)
-{
-	rq->nr_running--;
-	if (rt_task(p)) {
-		if (p != rq->migration_thread)
-			dec_prio_bias(rq, p->prio);
-	} else
-		dec_prio_bias(rq, p->static_prio);
-}
-#else
-static inline void inc_prio_bias(runqueue_t *rq, int prio)
-{
-}
-
-static inline void dec_prio_bias(runqueue_t *rq, int prio)
-{
-}
-
-static inline void inc_nr_running(task_t *p, runqueue_t *rq)
-{
-	rq->nr_running++;
-}
-
-static inline void dec_nr_running(task_t *p, runqueue_t *rq)
-{
-	rq->nr_running--;
-}
-#endif
-
 /*
  * __activate_task - move a task to the runqueue.
  */
 static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task(p, rq->active);
-	inc_nr_running(p, rq);
+	rq->nr_running++;
 }
 
 /*
@@ -741,7 +685,7 @@ static inline void __activate_task(task_
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task_head(p, rq->active);
-	inc_nr_running(p, rq);
+	rq->nr_running++;
 }
 
 static int recalc_task_prio(task_t *p, unsigned long long now)
@@ -865,7 +809,7 @@ static void activate_task(task_t *p, run
  */
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	dec_nr_running(p, rq);
+	rq->nr_running--;
 	dequeue_task(p, p->array);
 	p->array = NULL;
 }
@@ -1009,61 +953,27 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static unsigned long __source_load(int cpu, int type, enum idle_type idle)
+static inline unsigned long source_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long running = rq->nr_running;
-	unsigned long source_load, cpu_load = rq->cpu_load[type-1],
-		load_now = running * SCHED_LOAD_SCALE;
-
+	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
 	if (type == 0)
-		source_load = load_now;
-	else
-		source_load = min(cpu_load, load_now);
-
-	if (running > 1 || (idle == NOT_IDLE && running))
-		/*
-		 * If we are busy rebalancing the load is biased by
-		 * priority to create 'nice' support across cpus. When
-		 * idle rebalancing we should only bias the source_load if
-		 * there is more than one task running on that queue to
-		 * prevent idle rebalance from trying to pull tasks from a
-		 * queue with only one running task.
-		 */
-		source_load = source_load * rq->prio_bias / running;
+		return load_now;
 
-	return source_load;
-}
-
-static inline unsigned long source_load(int cpu, int type)
-{
-	return __source_load(cpu, type, NOT_IDLE);
+	return min(rq->cpu_load[type-1], load_now);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu
  */
-static inline unsigned long __target_load(int cpu, int type, enum idle_type idle)
+static inline unsigned long target_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long running = rq->nr_running;
-	unsigned long target_load, cpu_load = rq->cpu_load[type-1],
-		load_now = running * SCHED_LOAD_SCALE;
-
+	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
 	if (type == 0)
-		target_load = load_now;
-	else
-		target_load = max(cpu_load, load_now);
+		return load_now;
 
-	if (running > 1 || (idle == NOT_IDLE && running))
-		target_load = target_load * rq->prio_bias / running;
-
-	return target_load;
-}
-
-static inline unsigned long target_load(int cpu, int type)
-{
-	return __target_load(cpu, type, NOT_IDLE);
+	return max(rq->cpu_load[type-1], load_now);
 }
 
 /*
@@ -1532,7 +1442,7 @@ void fastcall wake_up_new_task(task_t *p
 				list_add_tail(&p->run_list, &current->run_list);
 				p->array = current->array;
 				p->array->nr_active++;
-				inc_nr_running(p, rq);
+				rq->nr_running++;
 			}
 			set_need_resched();
 		} else
@@ -1877,9 +1787,9 @@ void pull_task(runqueue_t *src_rq, prio_
 	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
 	dequeue_task(p, src_array);
-	dec_nr_running(p, src_rq);
+	src_rq->nr_running--;
 	set_task_cpu(p, this_cpu);
-	inc_nr_running(p, this_rq);
+	this_rq->nr_running++;
 	enqueue_task(p, this_array);
 	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
 				+ this_rq->timestamp_last_tick;
@@ -2058,9 +1968,9 @@ find_busiest_group(struct sched_domain *
 
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = __target_load(i, load_idx, idle);
+				load = target_load(i, load_idx);
 			else
-				load = __source_load(i, load_idx, idle);
+				load = source_load(i, load_idx);
 
 			avg_load += load;
 		}
@@ -2173,7 +2083,7 @@ static runqueue_t *find_busiest_queue(st
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
-		load = __source_load(i, 0, idle);
+		load = source_load(i, 0);
 
 		if (load > max_load) {
 			max_load = load;
@@ -3576,10 +3486,8 @@ void set_user_nice(task_t *p, long nice)
 		goto out_unlock;
 	}
 	array = p->array;
-	if (array) {
+	if (array)
 		dequeue_task(p, array);
-		dec_prio_bias(rq, p->static_prio);
-	}
 
 	old_prio = p->prio;
 	new_prio = NICE_TO_PRIO(nice);
@@ -3589,7 +3497,6 @@ void set_user_nice(task_t *p, long nice)
 
 	if (array) {
 		enqueue_task(p, array);
-		inc_prio_bias(rq, p->static_prio);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:


* Re: [rfc][patch] sched: remove smpnice
  2006-02-07 14:28 [rfc][patch] sched: remove smpnice Nick Piggin
@ 2006-02-07 14:57 ` Con Kolivas
  2006-02-07 15:05   ` Nick Piggin
  2006-02-07 22:15   ` Andrew Morton
  0 siblings, 2 replies; 33+ messages in thread
From: Con Kolivas @ 2006-02-07 14:57 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, Ingo Molnar, Steven Rostedt, Peter Williams,
	Suresh B, Linux Kernel Mailing List, Linus Torvalds

On Wednesday 08 February 2006 01:28, Nick Piggin wrote:
> I'd like to get some comments on removing smpnice for 2.6.16. I don't
> think the code is quite ready, which is why I asked for Peter's additions
> to also be merged before I acked it (although it turned out that it still
> isn't quite ready with his additions either).
>
> Basically I have had similar observations to Suresh in that it does not
> play nicely with the rest of the balancing infrastructure (and raised
> similar concerns in my review).
>
> The samples (group of 4) I got for "maximum recorded imbalance" on a 2x2
> SMP+HT Xeon are as follows:
>
>            | Following boot | hackbench 20        | hackbench 40
> -----------+----------------+---------------------+---------------------
> 2.6.16-rc2 | 30,37,100,112  | 5600,5530,6020,6090 | 6390,7090,8760,8470
> +nosmpnice |  3, 2,  4,  2  |   28, 150, 294, 132 |  348, 348, 294, 347
>
> Hackbench raw performance is down around 15% with smpnice (but that in
> itself isn't a huge deal because it is just a benchmark). However, the
> samples show that the imbalance passed into move_tasks is increased by
> about a factor of 10-30. I think this would also go some way to
> explaining latency blips turning up in the balancing code (though I
> haven't actually measured that).
>
> We'll probably have to revert this in the SUSE kernel.
>
> The other option for 2.6.16 would be to fast track Peter's stuff, which
> I could put some time into... but that seems a bit risky at this stage
> of the game.
>
> I'd like to hear any other suggestions though. Patch included to aid
> discussion at this stage, rather than to encourage any rash decisions.

I see the demonstrable imbalance but I was wondering if there is a real 
world benchmark that is currently affected?

Cheers,
Con


* Re: [rfc][patch] sched: remove smpnice
  2006-02-07 14:57 ` Con Kolivas
@ 2006-02-07 15:05   ` Nick Piggin
  2006-02-07 22:15   ` Andrew Morton
  1 sibling, 0 replies; 33+ messages in thread
From: Nick Piggin @ 2006-02-07 15:05 UTC (permalink / raw)
  To: Con Kolivas
  Cc: Nick Piggin, Andrew Morton, Ingo Molnar, Steven Rostedt,
	Peter Williams, Suresh B, Linux Kernel Mailing List,
	Linus Torvalds

On Wed, Feb 08, 2006 at 01:57:06AM +1100, Con Kolivas wrote:
> On Wednesday 08 February 2006 01:28, Nick Piggin wrote:
> >
> > I'd like to hear any other suggestions though. Patch included to aid
> > discussion at this stage, rather than to encourage any rash decisions.
> 
> I see the demonstrable imbalance but I was wondering if there is a real 
> world benchmark that is currently affected?
> 

Other than the hackbench lock latency that Ingo believes is quite
important (and I wouldn't disagree at all if it turns out to be a
regression), I have not had much time to test things out. I don't
think we can chance it in the SUSE kernel.

The other thing is that it causes headaches when trying to analyse
and review proposed scheduler changes (eg. Suresh's power saving
work).

Nick



* Re: [rfc][patch] sched: remove smpnice
  2006-02-07 14:57 ` Con Kolivas
  2006-02-07 15:05   ` Nick Piggin
@ 2006-02-07 22:15   ` Andrew Morton
  2006-02-07 23:11     ` Con Kolivas
  2006-02-07 23:20     ` Peter Williams
  1 sibling, 2 replies; 33+ messages in thread
From: Andrew Morton @ 2006-02-07 22:15 UTC (permalink / raw)
  To: Con Kolivas
  Cc: npiggin, mingo, rostedt, pwil3058, suresh.b.siddha, linux-kernel,
	torvalds

Con Kolivas <kernel@kolivas.org> wrote:
>
> On Wednesday 08 February 2006 01:28, Nick Piggin wrote:
> > I'd like to get some comments on removing smpnice for 2.6.16. I don't
> > think the code is quite ready, which is why I asked for Peter's additions
> > to also be merged before I acked it (although it turned out that it still
> > isn't quite ready with his additions either).
> >
> > Basically I have had similar observations to Suresh in that it does not
> > play nicely with the rest of the balancing infrastructure (and raised
> > similar concerns in my review).
> >
> > The samples (group of 4) I got for "maximum recorded imbalance" on a 2x2
> > SMP+HT Xeon are as follows:
> >
> >            | Following boot | hackbench 20        | hackbench 40
> > -----------+----------------+---------------------+---------------------
> > 2.6.16-rc2 | 30,37,100,112  | 5600,5530,6020,6090 | 6390,7090,8760,8470
> > +nosmpnice |  3, 2,  4,  2  |   28, 150, 294, 132 |  348, 348, 294, 347
> >
> > Hackbench raw performance is down around 15% with smpnice (but that in
> > itself isn't a huge deal because it is just a benchmark). However, the
> > samples show that the imbalance passed into move_tasks is increased by
> > about a factor of 10-30. I think this would also go some way to
> > explaining latency blips turning up in the balancing code (though I
> > haven't actually measured that).
> >
> > We'll probably have to revert this in the SUSE kernel.
> >
> > The other option for 2.6.16 would be to fast track Peter's stuff, which
> > I could put some time into... but that seems a bit risky at this stage
> > of the game.
> >
> > I'd like to hear any other suggestions though. Patch included to aid
> > discussion at this stage, rather than to encourage any rash decisions.
> 
> I see the demonstrable imbalance but I was wondering if there is a real 
> world benchmark that is currently affected?
> 

Well was any real-world workload (or benchmark) improved by the smpnice
change?

Because if we have one workload which slowed and none which sped up,
it's a pretty easy decision..


* Re: [rfc][patch] sched: remove smpnice
  2006-02-07 22:15   ` Andrew Morton
@ 2006-02-07 23:11     ` Con Kolivas
  2006-02-07 23:36       ` Andrew Morton
  2006-02-07 23:20     ` Peter Williams
  1 sibling, 1 reply; 33+ messages in thread
From: Con Kolivas @ 2006-02-07 23:11 UTC (permalink / raw)
  To: Andrew Morton
  Cc: npiggin, mingo, rostedt, pwil3058, suresh.b.siddha, linux-kernel,
	torvalds

On Wed, 8 Feb 2006 09:15 am, Andrew Morton wrote:
> Con Kolivas <kernel@kolivas.org> wrote:
> > On Wednesday 08 February 2006 01:28, Nick Piggin wrote:
> > > I'd like to get some comments on removing smpnice for 2.6.16. I don't
> > > think the code is quite ready, which is why I asked for Peter's
> > > additions to also be merged before I acked it (although it turned out
> > > that it still isn't quite ready with his additions either).
> > >
> > > Basically I have had similar observations to Suresh in that it does not
> > > play nicely with the rest of the balancing infrastructure (and raised
> > > similar concerns in my review).
> > >
> > > The samples (group of 4) I got for "maximum recorded imbalance" on a
> > > 2x2 SMP+HT Xeon are as follows:
> > >
> > >            | Following boot | hackbench 20        | hackbench 40
> > > -----------+----------------+---------------------+---------------------
> > > 2.6.16-rc2 | 30,37,100,112  | 5600,5530,6020,6090 | 6390,7090,8760,8470
> > > +nosmpnice |  3, 2,  4,  2  |   28, 150, 294, 132 |  348, 348, 294, 347
> > >
> > > Hackbench raw performance is down around 15% with smpnice (but that in
> > > itself isn't a huge deal because it is just a benchmark). However, the
> > > samples show that the imbalance passed into move_tasks is increased by
> > > about a factor of 10-30. I think this would also go some way to
> > > explaining latency blips turning up in the balancing code (though I
> > > haven't actually measured that).
> > >
> > > We'll probably have to revert this in the SUSE kernel.
> > >
> > > The other option for 2.6.16 would be to fast track Peter's stuff, which
> > > I could put some time into... but that seems a bit risky at this stage
> > > of the game.
> > >
> > > I'd like to hear any other suggestions though. Patch included to aid
> > > discussion at this stage, rather than to encourage any rash decisions.
> >
> > I see the demonstrable imbalance but I was wondering if there is a
> > real world benchmark that is currently affected?
>
> Well was any real-world workload (or benchmark) improved by the smpnice
> change?

No benchmark improved, but 'nice' handling moved from completely not working 
on SMP to quite effective cpu resource modification according to nice. On UP, 
a nice 19 task gets about 5% of the cpu relative to a nice 0 task. On SMP 
machines without smp nice, if you have more tasks than cpus (the 5 tasks on 
4 cpus example) you often get nice 19 tasks receiving more cpu than nice 0 
tasks: a nice 19 task would get 100% of one cpu while two nice 0 tasks would 
each get 50% of another. With smp nice the nice 19 task received between 
5-30% of one cpu, depending on its sleep/wake pattern.
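
For reference, the ~5% UP figure falls out of the timeslice scaling in
kernel/sched.c; a simplified sketch, assuming the 2.6-era SCALE_PRIO()
formula, non-negative nice only, and ignoring the MIN_TIMESLICE floor
and HZ scaling:

#define MAX_PRIO	140
#define MAX_USER_PRIO	40
#define NICE_TO_PRIO(n)	(120 + (n))
#define DEF_TIMESLICE	100	/* ms */

/* timeslice for a nice >= 0 task */
static int timeslice_ms(int nice)
{
	return DEF_TIMESLICE * (MAX_PRIO - NICE_TO_PRIO(nice)) /
		(MAX_USER_PRIO / 2);
}

/* timeslice_ms(0) == 100ms, timeslice_ms(19) == 5ms: roughly a 5% share */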

> > Because if we have one workload which slowed and none which sped up,
> it's a pretty easy decision..

The resource allocation fairness was improved with smp nice but no benchmark 
directly sped up per se.

Cheers,
Con


* Re: [rfc][patch] sched: remove smpnice
  2006-02-07 22:15   ` Andrew Morton
  2006-02-07 23:11     ` Con Kolivas
@ 2006-02-07 23:20     ` Peter Williams
  2006-02-07 23:29       ` Con Kolivas
  2006-02-07 23:36       ` Martin Bligh
  1 sibling, 2 replies; 33+ messages in thread
From: Peter Williams @ 2006-02-07 23:20 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Con Kolivas, npiggin, mingo, rostedt, suresh.b.siddha,
	linux-kernel, torvalds, Martin Bligh

Andrew Morton wrote:
> Con Kolivas <kernel@kolivas.org> wrote:
> 
>>On Wednesday 08 February 2006 01:28, Nick Piggin wrote:
>>
>>>I'd like to get some comments on removing smpnice for 2.6.16. I don't
>>>think the code is quite ready, which is why I asked for Peter's additions
>>>to also be merged before I acked it (although it turned out that it still
>>>isn't quite ready with his additions either).

I think the issues have now been resolved (see Martin J Bligh's message 
of 2006/01/29 11:52 "Re: -mm seems significantly slower than mainline on 
kernbench thread").

>>>
>>>Basically I have had similar observations to Suresh in that it does not
>>>play nicely with the rest of the balancing infrastructure (and raised
>>>similar concerns in my review).
>>>
>>>The samples (group of 4) I got for "maximum recorded imbalance" on a 2x2
>>>SMP+HT Xeon are as follows:
>>>
>>>           | Following boot | hackbench 20        | hackbench 40
>>>-----------+----------------+---------------------+---------------------
>>>2.6.16-rc2 | 30,37,100,112  | 5600,5530,6020,6090 | 6390,7090,8760,8470
>>>+nosmpnice |  3, 2,  4,  2  |   28, 150, 294, 132 |  348, 348, 294, 347
>>>
>>>Hackbench raw performance is down around 15% with smpnice (but that in
>>>itself isn't a huge deal because it is just a benchmark). However, the
>>>samples show that the imbalance passed into move_tasks is increased by
>>>about a factor of 10-30. I think this would also go some way to
>>>explaining latency blips turning up in the balancing code (though I
>>>haven't actually measured that).
>>>
>>>We'll probably have to revert this in the SUSE kernel.
>>>
>>>The other option for 2.6.16 would be to fast track Peter's stuff, which
>>>I could put some time into...

See below.  Should only need testing with hackbench.

>>>but that seems a bit risky at this stage
>>>of the game.
>>>
>>>I'd like to hear any other suggestions though. Patch included to aid
>>>discussion at this stage, rather than to encourage any rash decisions.
>>
>>I see the demonstrable imbalance but I was wondering if there is a real 
>>world benchmark that is currently affected?
>>
> 
> 
> Well was any real-world workload (or benchmark) improved by the smpnice
> change?
> 
> Because if we have one workload which slowed and none which sped up,
> it's a pretty easy decision..

Not really as the "smp nice" mechanism isn't meant to speed things up. 
It's meant to ensure that nice still works more or less as expected on 
SMP systems.  So the real question is "Does it achieve this without 
undue cost or unacceptable side effects?".  Even though there are no 
free lunches, the extra cost of this mechanism when there are no non-zero 
niced tasks is expected to be almost too small to measure.  Similarly, 
there should be no side effects unless there are non-zero niced tasks, 
and those side effects should be predictable (or, at least, explainable) 
and due solely to the fact that nice is being better enforced.

I think that the problems with excessive idling with the early versions 
of my modifications to Con's patch showed that the load balancing code 
is fairly sensitive to the average load per normal task not being 
approximately 1.  My latest patches restore this state of affairs and 
kernbench testing indicates that the excessive idling has gone away (see 
Martin J Bligh's message of 2006/01/29 11:52 "Re: -mm seems 
significantly slower than mainline on kernbench thread").

My testing indicates that niceness works well on dual CPU SMP systems 
but it would be nice if those who have commented (from time to time) on 
the need for this (I forget their names -- Con?) could also test it. 
Some testing of this on larger systems would also be appreciated.  (A 
quick test is to start 4 (reasonably CPU intensive, e.g. 80%) tasks per 
CPU with nice values of 0, 5, 10 and 15 and then observe their CPU 
usage rates after the system has had a chance to settle down.  Their 
usage rates should be in accord with their nice values. The aspin 
program in the simloads package 
<http://prdownloads.sourceforge.net/cpuse/simloads-0.1.1.tar.gz?download> 
is suitable for the purpose as it has mechanisms for setting cpu demand 
and niceness.  E.g. "aspin -c 0.08 -p 0.02 -n 4 -N 10 &" would start 4 
copies of aspin with "nice" == 10 that would loop for 6 minutes and 
during each loop it would run until it consumed 0.08 seconds of CPU and 
then sleep until 0.02 seconds have elapsed giving a demand of about 80%. 
  If you do this on a system without the patches you may be lucky and 
have the tasks distributed among the CPUs so that their usage rates 
match their nice values, but more often than not they will be distributed 
in ways that cause their usage rates to not match their nice values. 
Users complain when this results in heavily niced tasks still getting 
higher CPU usage rates than tasks with nice of 0.)
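
For anyone without simloads handy, a minimal stand-alone sketch of the
same kind of ~80% demand loop (an approximation of the aspin behaviour
described above, not the tool itself):

#include <time.h>

/* burn roughly "secs" seconds of CPU time */
static void burn_cpu(double secs)
{
	clock_t start = clock();

	while ((double)(clock() - start) / CLOCKS_PER_SEC < secs)
		;	/* spin */
}

int main(void)
{
	struct timespec pause = { .tv_sec = 0, .tv_nsec = 20 * 1000 * 1000 };

	for (;;) {
		burn_cpu(0.08);			/* ~0.08s of CPU demand... */
		nanosleep(&pause, NULL);	/* ...then ~0.02s of sleep */
	}
	return 0;
}

Run several copies at different nice values (e.g. "nice -n 10 ./a.out &")
and compare their CPU usage once the system settles.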

Finally, the issue of whether the unacceptable side effects with 
hackbench described by Nick are still present when my patch is applied 
on top of Con's patch needs to be tested.  I think that this is the only 
issue that would prevent my patch going into the main line as the other 
issues have been resolved.

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce


* Re: [rfc][patch] sched: remove smpnice
  2006-02-07 23:20     ` Peter Williams
@ 2006-02-07 23:29       ` Con Kolivas
  2006-02-07 23:36       ` Martin Bligh
  1 sibling, 0 replies; 33+ messages in thread
From: Con Kolivas @ 2006-02-07 23:29 UTC (permalink / raw)
  To: Peter Williams
  Cc: Andrew Morton, npiggin, mingo, rostedt, suresh.b.siddha,
	linux-kernel, torvalds, Martin Bligh

On Wed, 8 Feb 2006 10:20 am, Peter Williams wrote:
> Finally, the issue of whether the unacceptable side effects with
> hackbench described by Nick are still present when my patch is applied
> on top of Con's patch needs to be tested.  I think that this is the only
> issue that would prevent my patch going into the main line as the other
> issues have been resolved.

Since whatever we do is a compromise, I vote we fast-track Peter's patches. See 
if they fix the hackbench regression and take it from there. They have had 
reasonable testing in -mm, and the last changes he made did not significantly 
change the code but fixed balancing decisions. 

Cheers,
Con


* Re: [rfc][patch] sched: remove smpnice
  2006-02-07 23:20     ` Peter Williams
  2006-02-07 23:29       ` Con Kolivas
@ 2006-02-07 23:36       ` Martin Bligh
  1 sibling, 0 replies; 33+ messages in thread
From: Martin Bligh @ 2006-02-07 23:36 UTC (permalink / raw)
  To: Peter Williams
  Cc: Andrew Morton, Con Kolivas, npiggin, mingo, rostedt,
	suresh.b.siddha, linux-kernel, torvalds


> I think that the problems with excessive idling with the early versions 
> of my modifications to Con's patch showed that the load balancing code 
> is fairly sensitive to the average load per normal task not being 
> approximately 1.  My latest patches restore this state of affairs and 
> kernbench testing indicates that the excessive idling has gone away (see 
> Martin J Bligh's message of 2006/01/29 11:52 "Re: -mm seems 
> significantly slower than mainline on kernbench thread").

I *think* the latest slowdown in -mm was due to some TSC side effects 
from John's patches - see his other patch earlier today to fix (oops,
I forgot to reply to that ..)

So AFAICS, all issues with Peter's stuff were fixed.


* Re: [rfc][patch] sched: remove smpnice
  2006-02-07 23:11     ` Con Kolivas
@ 2006-02-07 23:36       ` Andrew Morton
  2006-02-08  3:28         ` Nick Piggin
                           ` (2 more replies)
  0 siblings, 3 replies; 33+ messages in thread
From: Andrew Morton @ 2006-02-07 23:36 UTC (permalink / raw)
  To: Con Kolivas
  Cc: npiggin, mingo, rostedt, pwil3058, suresh.b.siddha, linux-kernel,
	torvalds

Con Kolivas <kernel@kolivas.org> wrote:
>
> > Well was any real-world workload (or benchmark) improved by the smpnice
> > change?
> 
> No benchmark improved but 'nice' handling moved from completely not working on 
> SMP to having quite effective cpu resource modification according to nice. 
> nice 19 vs nice 0 has 5% of the cpu on UP. On SMP machines without smp nice 
> if you have more tasks than cpus (the 5 tasks on 4 cpu example) you often get 
> nice 19 tasks getting more cpu resources than nice 0 tasks; a nice 19 task 
> would get 100% of one cpu and two nice 0 tasks would get 50% of a cpu. With 
> smp nice the nice 19 task received between 5-30% of one cpu depending on 
> their sleep/wake pattern.

Oh, is that why my machine goes paralytic during `nice -19 make -j 128'.

> > Because if we have one workload which slowed and none which sped up,
> > it's a pretty easy decision..
> 
> The resource allocation fairness was improved with smp nice but no benchmark 
> directly sped up per se.
> 

In that case I think we're better off fixing both problems rather than
fixing neither.

Suresh, Martin, Ingo, Nick and Con: please drop everything, triple-check
and test this:



From: Peter Williams <pwil3058@bigpond.net.au>

This is a modified version of Con Kolivas's patch to add "nice" support to
load balancing across physical CPUs on SMP systems.

The principal modifications to Con's mechanism are: changing
move_tasks() so that it endeavours to move a specified amount of biased
load rather than a specified number of tasks, changing find_busiest_group()
so that the value returned in "imbalance" is in terms of biased load rather
than number of tasks, and changing rebalance_tick() to calculate "cpu_load"
for each run queue as biased load rather than plain load.  To be more
precise, because of the special case of active_load_balance() wanting to
move exactly 1 task, move_tasks() actually moves up to a given number of
tasks or up to a given amount of biased load.

Because these changes mean that tasks' biased prio is evaluated much more
often than in the original implementation, a "bias_prio" field has been
added to the task structure to hold its value, meaning that it only needs
to be calculated when the task's nice or scheduling class is changed.  This
change facilitates considerable simplification of much of the code.
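
As a rough illustration of the conversion (using the NICE_TO_BIAS_PRIO(),
weighted_load() and biased_load() definitions from the patch below, and
assuming SCHED_LOAD_SCALE == 128):

#define SCHED_LOAD_SCALE	128UL
#define NICE_TO_BIAS_PRIO(nice)	(20 - (nice))

/* bias -> scaled weighted load, and back, as in the patch below */
#define weighted_load(bias)	((bias) * SCHED_LOAD_SCALE / NICE_TO_BIAS_PRIO(0))
#define biased_load(wload)	((wload) * NICE_TO_BIAS_PRIO(0) / SCHED_LOAD_SCALE)

/*
 * A run queue holding one nice 0 task (bias 20) and one nice 10 task
 * (bias 10) has prio_bias == 30, i.e. weighted_load(30) == 192, which is
 * 1.5 * SCHED_LOAD_SCALE; biased_load(192) == 30 converts an imbalance
 * expressed in weighted load back into units that move_tasks() consumes.
 */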

Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: "Martin J. Bligh" <mbligh@google.com>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 include/linux/sched.h |    3 
 kernel/sched.c        |  198 ++++++++++++++++++++++------------------
 2 files changed, 112 insertions(+), 89 deletions(-)

diff -puN include/linux/sched.h~sched-modified-nice-support-for-smp-load-balancing include/linux/sched.h
--- 25/include/linux/sched.h~sched-modified-nice-support-for-smp-load-balancing	Tue Feb  7 15:33:17 2006
+++ 25-akpm/include/linux/sched.h	Tue Feb  7 15:33:29 2006
@@ -704,6 +704,9 @@ struct task_struct {
 #endif
 #endif
 	int prio, static_prio;
+#ifdef CONFIG_SMP
+	int bias_prio;	/* load "weight" factor for load balancing purposes */
+#endif
 	struct list_head run_list;
 	prio_array_t *array;
 
diff -puN kernel/sched.c~sched-modified-nice-support-for-smp-load-balancing kernel/sched.c
--- 25/kernel/sched.c~sched-modified-nice-support-for-smp-load-balancing	Tue Feb  7 15:33:17 2006
+++ 25-akpm/kernel/sched.c	Tue Feb  7 15:33:33 2006
@@ -63,6 +63,14 @@
 #define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
 
 /*
+ * Priority bias for load balancing ranges from 1 (nice==19) to 139 (RT
+ * priority of 100).
+ */
+#define NICE_TO_BIAS_PRIO(nice)	(20 - (nice))
+#define PRIO_TO_BIAS_PRIO(prio)	NICE_TO_BIAS_PRIO(PRIO_TO_NICE(prio))
+#define RTPRIO_TO_BIAS_PRIO(rp)	(40 + (rp))
+
+/*
  * 'User priority' is the nice value converted to something we
  * can work with better when scaling various scheduler parameters,
  * it's a [ 0 ... 39 ] range.
@@ -670,46 +678,72 @@ static int effective_prio(task_t *p)
 }
 
 #ifdef CONFIG_SMP
-static inline void inc_prio_bias(runqueue_t *rq, int prio)
+/*
+ * To aid in avoiding the subversion of "niceness" due to uneven distribution
+ * of tasks with abnormal "nice" values across CPUs the contribution that
+ * each task makes to its run queue's load is weighted according to its
+ * scheduling class and "nice" value.  The bias_prio field holds the value
+ * used to calculate the weight for each task.
+ */
+static inline void set_bias_prio(task_t *p)
+{
+	if (rt_task(p)) {
+		if (p == task_rq(p)->migration_thread)
+			/*
+			 * The migration thread does the actual balancing. Do
+			 * not bias by its priority as the ultra high priority
+			 * will skew balancing adversely.
+			 */
+			p->bias_prio = 0;
+		else
+			p->bias_prio = RTPRIO_TO_BIAS_PRIO(p->rt_priority);
+	} else
+		p->bias_prio = PRIO_TO_BIAS_PRIO(p->static_prio);
+}
+
+static inline void inc_prio_bias(runqueue_t *rq, const task_t *p)
 {
-	rq->prio_bias += MAX_PRIO - prio;
+	rq->prio_bias += p->bias_prio;
 }
 
-static inline void dec_prio_bias(runqueue_t *rq, int prio)
+static inline void dec_prio_bias(runqueue_t *rq, const task_t *p)
 {
-	rq->prio_bias -= MAX_PRIO - prio;
+	rq->prio_bias -= p->bias_prio;
 }
 
 static inline void inc_nr_running(task_t *p, runqueue_t *rq)
 {
 	rq->nr_running++;
-	if (rt_task(p)) {
-		if (p != rq->migration_thread)
-			/*
-			 * The migration thread does the actual balancing. Do
-			 * not bias by its priority as the ultra high priority
-			 * will skew balancing adversely.
-			 */
-			inc_prio_bias(rq, p->prio);
-	} else
-		inc_prio_bias(rq, p->static_prio);
+	inc_prio_bias(rq, p);
 }
 
 static inline void dec_nr_running(task_t *p, runqueue_t *rq)
 {
 	rq->nr_running--;
-	if (rt_task(p)) {
-		if (p != rq->migration_thread)
-			dec_prio_bias(rq, p->prio);
-	} else
-		dec_prio_bias(rq, p->static_prio);
+	dec_prio_bias(rq, p);
+}
+
+/* convert biased priority to scaled weighted load */
+static inline unsigned long weighted_load(unsigned long bias)
+{
+	return (bias * SCHED_LOAD_SCALE) / NICE_TO_BIAS_PRIO(0);
+}
+
+/* convert scaled weighted load to unscaled biased load */
+static inline unsigned long biased_load(unsigned long wload)
+{
+	return (wload * NICE_TO_BIAS_PRIO(0)) / SCHED_LOAD_SCALE;
 }
 #else
-static inline void inc_prio_bias(runqueue_t *rq, int prio)
+static inline void set_bias_prio(task_t *p)
 {
 }
 
-static inline void dec_prio_bias(runqueue_t *rq, int prio)
+static inline void inc_prio_bias(runqueue_t *rq, const task_t *p)
+{
+}
+
+static inline void dec_prio_bias(runqueue_t *rq, const task_t *p)
 {
 }
 
@@ -1002,66 +1036,36 @@ void kick_process(task_t *p)
 }
 
 /*
- * Return a low guess at the load of a migration-source cpu.
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
  *
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static unsigned long __source_load(int cpu, int type, enum idle_type idle)
+static unsigned long source_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long running = rq->nr_running;
-	unsigned long source_load, cpu_load = rq->cpu_load[type-1],
-		load_now = running * SCHED_LOAD_SCALE;
+	unsigned long load_now = weighted_load(rq->prio_bias);
 
 	if (type == 0)
-		source_load = load_now;
-	else
-		source_load = min(cpu_load, load_now);
+		return load_now;
 
-	if (running > 1 || (idle == NOT_IDLE && running))
-		/*
-		 * If we are busy rebalancing the load is biased by
-		 * priority to create 'nice' support across cpus. When
-		 * idle rebalancing we should only bias the source_load if
-		 * there is more than one task running on that queue to
-		 * prevent idle rebalance from trying to pull tasks from a
-		 * queue with only one running task.
-		 */
-		source_load = source_load * rq->prio_bias / running;
-
-	return source_load;
-}
-
-static inline unsigned long source_load(int cpu, int type)
-{
-	return __source_load(cpu, type, NOT_IDLE);
+	return min(rq->cpu_load[type-1], load_now);
 }
 
 /*
- * Return a high guess at the load of a migration-target cpu
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
  */
-static inline unsigned long __target_load(int cpu, int type, enum idle_type idle)
+static inline unsigned long target_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long running = rq->nr_running;
-	unsigned long target_load, cpu_load = rq->cpu_load[type-1],
-		load_now = running * SCHED_LOAD_SCALE;
+	unsigned long load_now = weighted_load(rq->prio_bias);
 
 	if (type == 0)
-		target_load = load_now;
-	else
-		target_load = max(cpu_load, load_now);
-
-	if (running > 1 || (idle == NOT_IDLE && running))
-		target_load = target_load * rq->prio_bias / running;
+		return load_now;
 
-	return target_load;
-}
-
-static inline unsigned long target_load(int cpu, int type)
-{
-	return __target_load(cpu, type, NOT_IDLE);
+	return max(rq->cpu_load[type-1], load_now);
 }
 
 /*
@@ -1322,7 +1326,7 @@ static int try_to_wake_up(task_t *p, uns
 			 * of the current CPU:
 			 */
 			if (sync)
-				tl -= SCHED_LOAD_SCALE;
+				tl -= weighted_load(p->bias_prio);
 
 			if ((tl <= load &&
 				tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
@@ -1925,22 +1929,23 @@ int can_migrate_task(task_t *p, runqueue
 }
 
 /*
- * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
- * as part of a balancing operation within "domain". Returns the number of
- * tasks moved.
+ * move_tasks tries to move up to max_nr_move tasks and max_bias_move biased
+ * load from busiest to this_rq, as part of a balancing operation within
+ * "domain". Returns the number of tasks moved.
  *
  * Called with both runqueues locked.
  */
 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
-		      unsigned long max_nr_move, struct sched_domain *sd,
-		      enum idle_type idle, int *all_pinned)
+		      unsigned long max_nr_move, long max_bias_move,
+		      struct sched_domain *sd, enum idle_type idle,
+		      int *all_pinned)
 {
 	prio_array_t *array, *dst_array;
 	struct list_head *head, *curr;
 	int idx, pulled = 0, pinned = 0;
 	task_t *tmp;
 
-	if (max_nr_move == 0)
+	if (max_nr_move == 0 || max_bias_move == 0)
 		goto out;
 
 	pinned = 1;
@@ -1983,7 +1988,8 @@ skip_queue:
 
 	curr = curr->prev;
 
-	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
+	if (tmp->bias_prio > max_bias_move ||
+	    !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -1997,9 +2003,13 @@ skip_queue:
 
 	pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
 	pulled++;
+	max_bias_move -= tmp->bias_prio;
 
-	/* We only want to steal up to the prescribed number of tasks. */
-	if (pulled < max_nr_move) {
+	/*
+	 * We only want to steal up to the prescribed number of tasks
+	 * and the prescribed amount of biased load.
+	 */
+	if (pulled < max_nr_move && max_bias_move > 0) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -2020,7 +2030,7 @@ out:
 
 /*
  * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the number of tasks which should be
+ * domain. It calculates and returns the amount of biased load which should be
  * moved to restore balance via the imbalance parameter.
  */
 static struct sched_group *
@@ -2056,9 +2066,9 @@ find_busiest_group(struct sched_domain *
 
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = __target_load(i, load_idx, idle);
+				load = target_load(i, load_idx);
 			else
-				load = __source_load(i, load_idx, idle);
+				load = source_load(i, load_idx);
 
 			avg_load += load;
 		}
@@ -2113,7 +2123,7 @@ find_busiest_group(struct sched_domain *
 		unsigned long tmp;
 
 		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
-			*imbalance = 1;
+			*imbalance = NICE_TO_BIAS_PRIO(0);
 			return busiest;
 		}
 
@@ -2146,12 +2156,15 @@ find_busiest_group(struct sched_domain *
 		if (pwr_move <= pwr_now)
 			goto out_balanced;
 
-		*imbalance = 1;
+		*imbalance = NICE_TO_BIAS_PRIO(0);
 		return busiest;
 	}
 
-	/* Get rid of the scaling factor, rounding down as we divide */
-	*imbalance = *imbalance / SCHED_LOAD_SCALE;
+	/*
+	 * Get rid of the scaling factor, rounding down as we divide and
+	 * converting to biased load for use by move_tasks()
+	 */
+	*imbalance = biased_load(*imbalance);
 	return busiest;
 
 out_balanced:
@@ -2163,15 +2176,14 @@ out_balanced:
 /*
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
-static runqueue_t *find_busiest_queue(struct sched_group *group,
-	enum idle_type idle)
+static runqueue_t *find_busiest_queue(struct sched_group *group)
 {
 	unsigned long load, max_load = 0;
 	runqueue_t *busiest = NULL;
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
-		load = __source_load(i, 0, idle);
+		load = source_load(i, 0);
 
 		if (load > max_load) {
 			max_load = load;
@@ -2188,6 +2200,7 @@ static runqueue_t *find_busiest_queue(st
  */
 #define MAX_PINNED_INTERVAL	512
 
+#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
@@ -2215,7 +2228,7 @@ static int load_balance(int this_cpu, ru
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, idle);
+	busiest = find_busiest_queue(group);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
@@ -2235,6 +2248,7 @@ static int load_balance(int this_cpu, ru
 		 */
 		double_rq_lock(this_rq, busiest);
 		nr_moved = move_tasks(this_rq, this_cpu, busiest,
+					minus_1_or_zero(busiest->nr_running),
 					imbalance, sd, idle, &all_pinned);
 		double_rq_unlock(this_rq, busiest);
 
@@ -2338,7 +2352,7 @@ static int load_balance_newidle(int this
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, NEWLY_IDLE);
+	busiest = find_busiest_queue(group);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
 		goto out_balanced;
@@ -2353,6 +2367,7 @@ static int load_balance_newidle(int this
 		/* Attempt to move tasks */
 		double_lock_balance(this_rq, busiest);
 		nr_moved = move_tasks(this_rq, this_cpu, busiest,
+					minus_1_or_zero(busiest->nr_running),
 					imbalance, sd, NEWLY_IDLE, NULL);
 		spin_unlock(&busiest->lock);
 	}
@@ -2433,7 +2448,8 @@ static void active_load_balance(runqueue
 
 	schedstat_inc(sd, alb_cnt);
 
-	if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
+	if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
+			RTPRIO_TO_BIAS_PRIO(100), sd, SCHED_IDLE, NULL))
 		schedstat_inc(sd, alb_pushed);
 	else
 		schedstat_inc(sd, alb_failed);
@@ -2461,7 +2477,8 @@ static void rebalance_tick(int this_cpu,
 	struct sched_domain *sd;
 	int i;
 
-	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
+	/* weight load according to scheduling class and "nice" value */
+	this_load = weighted_load(this_rq->prio_bias);
 	/* Update our load */
 	for (i = 0; i < 3; i++) {
 		unsigned long new_load = this_load;
@@ -3573,18 +3590,19 @@ void set_user_nice(task_t *p, long nice)
 	array = p->array;
 	if (array) {
 		dequeue_task(p, array);
-		dec_prio_bias(rq, p->static_prio);
+		dec_prio_bias(rq, p);
 	}
 
 	old_prio = p->prio;
 	new_prio = NICE_TO_PRIO(nice);
 	delta = new_prio - old_prio;
 	p->static_prio = NICE_TO_PRIO(nice);
+	set_bias_prio(p);
 	p->prio += delta;
 
 	if (array) {
 		enqueue_task(p, array);
-		inc_prio_bias(rq, p->static_prio);
+		inc_prio_bias(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -3720,6 +3738,7 @@ static void __setscheduler(struct task_s
 		if (policy == SCHED_BATCH)
 			p->sleep_avg = 0;
 	}
+	set_bias_prio(p);
 }
 
 /**
@@ -6143,6 +6162,7 @@ void __init sched_init(void)
 		}
 	}
 
+	set_bias_prio(&init_task);
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
_



* Re: [rfc][patch] sched: remove smpnice
  2006-02-07 23:36       ` Andrew Morton
@ 2006-02-08  3:28         ` Nick Piggin
  2006-02-08 14:56         ` Ingo Molnar
  2006-02-10  7:01         ` Siddha, Suresh B
  2 siblings, 0 replies; 33+ messages in thread
From: Nick Piggin @ 2006-02-08  3:28 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Con Kolivas, npiggin, mingo, rostedt, pwil3058, suresh.b.siddha,
	linux-kernel, torvalds

Andrew Morton wrote:

> In that case I think we're better off fixing both problems rather than
> fixing neither.
> 
> Suresh, Martin, Ingo, Nick and Con: please drop everything, triple-check
> and test this:
> 

OK, great. I was under the impression that this still had problems
but Peter and Martin seem to agree they've been resolved.

> 
> 
> From: Peter Williams <pwil3058@bigpond.net.au>
> 

...

I'll give it some testing.

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 


* Re: [rfc][patch] sched: remove smpnice
  2006-02-07 23:36       ` Andrew Morton
  2006-02-08  3:28         ` Nick Piggin
@ 2006-02-08 14:56         ` Ingo Molnar
  2006-02-10  7:01         ` Siddha, Suresh B
  2 siblings, 0 replies; 33+ messages in thread
From: Ingo Molnar @ 2006-02-08 14:56 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Con Kolivas, npiggin, rostedt, pwil3058, suresh.b.siddha,
	linux-kernel, torvalds


* Andrew Morton <akpm@osdl.org> wrote:

> In that case I think we're better off fixing both problems rather than 
> fixing neither.
> 
> Suresh, Martin, Ingo, Nick and Con: please drop everything, 
> triple-check and test this:

Peter's latest patch looks good to me, code-wise.

i also did some testing. Find below the test results of running 12 types 
of benchmarks (running each benchmark at least 3 times), on 4 separate 
kernels, on 3 separate boxes (altogether giving 12 separate bootups and 
432 test results).

the 4 kernels i used are:

         v2.6.15: vanilla 2.6.15
            -rc2: v2.6.16-rc2-git4
  -rc2-nosmpnice: v2.6.16-rc2-git4 + Nick's nosmpnice patch
   -rc2-smpnice2: v2.6.16-rc2-git4 + Peter's latest smpnice patch

[ Test method: the same set of 4 kernel images was used on each of the 
  boxes. The same .config was used to build all the kernel images: 
  SMP+SMT, no debugging and preempt-off. Each kernel instance did 3 
  tests of every test-row, and the table contains the fastest, "MIN" 
  result. (the precise method: there were at least 3 tests done, and 
  the test-scripts detected 3 consecutive test-results that are within a 
  given spread. I.e. outliers automatically cause a restart of that 
  test.) ]
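
a sketch of the acceptance rule described above (assumed form only, not
the actual test scripts); the reported "MIN" is then the smallest of the
accepted results:

#include <stdbool.h>

/* accept a test once its last three results agree to within "spread" */
static bool three_results_stable(double a, double b, double c, double spread)
{
	double lo = a, hi = a;

	if (b < lo) lo = b; else if (b > hi) hi = b;
	if (c < lo) lo = c; else if (c > hi) hi = c;

	return hi - lo <= spread * lo;
}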

here are the numbers (time units, smaller is better, percentage is 
relative to the baseline v2.6.15 column):

2/4-way HT P4-Xeon box:                (smaller is better)
======================
   MIN           v2.6.15             -rc2   -rc2-nosmpnice    -rc2-smpnice2
---------------------------------------------------------------------------
        ctx-2:      3.51      4.13 ( 17%)      4.68 ( 33%)      3.65 (  3%)
       ctx-20:      4.44      4.72 (  6%)      4.41 (  0%)      4.40 (  0%)
      ctx-200:      8.15      8.58 (  5%)      8.54 (  4%)      8.06 ( -1%)
         mmap:    784.00    756.00 ( -3%)    768.00 ( -2%)    763.00 ( -2%)
       select:     69.17     70.09 (  1%)     69.04 (  0%)     69.24 (  0%)
    proc-exec:    153.77    156.03 (  1%)    158.14 (  2%)    158.11 (  2%)
    proc-fork:    136.66    137.83 (  0%)    138.78 (  1%)    139.79 (  2%)
 syscall-open:      5.02      4.66 ( -7%)      4.82 ( -4%)      4.77 ( -4%)
 hackbench-10:      0.77      0.82 (  6%)      0.85 ( 10%)      0.79 (  2%)
 hackbench-20:      1.56      1.49 ( -4%)      1.38 (-11%)      1.42 ( -8%)
 hackbench-50:      4.20      4.02 ( -4%)      3.57 (-15%)      3.48 (-17%)
       volano:     18.53     20.07 (  8%)     19.09 (  3%)     19.33 (  4%)

as can be seen, the -rc2 slowdown is gone on this box. -nosmpnice and 
-smpnice2 are equivalent, within noise => good.

1/2-way dual-core Athlon64 box:        (smaller is better)
==============================
   MIN           v2.6.15             -rc2   -rc2-nosmpnice    -rc2-smpnice2
---------------------------------------------------------------------------
        ctx-2:      1.10      1.22 ( 10%)      1.33 ( 20%)      1.23 ( 11%)
       ctx-20:      1.36      1.38 (  1%)      1.32 ( -2%)      1.34 ( -1%)
      ctx-200:      2.99      3.36 ( 12%)      3.82 ( 27%)      3.70 ( 23%)
         mmap:    371.00    336.00 ( -9%)    332.00 (-10%)    426.00 ( 14%)
       select:     19.04     18.94 (  0%)     18.13 ( -4%)     18.43 ( -3%)
    proc-exec:    984.00    998.50 (  1%)   1017.83 (  3%)   1004.83 (  2%)
    proc-fork:     87.98     92.11 (  4%)     90.56 (  2%)     93.38 (  6%)
 syscall-open:      3.22      3.31 (  2%)      3.48 (  8%)      3.66 ( 13%)
 hackbench-10:      0.61      0.63 (  3%)      0.60 (  0%)      0.60 ( -1%)
 hackbench-20:      1.14      1.20 (  5%)      1.17 (  2%)      1.15 (  0%)
 hackbench-50:      2.72      2.88 (  6%)      2.82 (  3%)      2.78 (  2%)
       volano:      9.68     10.26 (  6%)     10.15 (  4%)      9.98 (  3%)

here the fluctuation of the numbers is higher (caching artifacts on this 
box prevent a less noisy measurement), but it can be seen that most of 
the -rc2 slowdown is gone, and in fact -smpnice2 seems to be a small net 
win over -nosmpnice => good.

1-way P4 box:                          (smaller is better)
============
   MIN           v2.6.15             -rc2   -rc2-nosmpnice    -rc2-smpnice2
---------------------------------------------------------------------------
         mmap:    889.00    859.00 ( -3%)    862.00 ( -3%)    855.00 ( -3%)
        ctx-2:      2.26      2.27 (  0%)      2.36 (  4%)      2.25 (  0%)
       select:     78.98     79.13 (  0%)     77.30 ( -2%)     79.09 (  0%)
       ctx-20:      2.57      2.65 (  3%)      2.60 (  1%)      2.60 (  1%)
      ctx-200:      7.58      7.66 (  1%)      7.66 (  1%)      7.50 ( -1%)
    proc-exec:    173.28    172.28 (  0%)    172.40 (  0%)    172.28 (  0%)
    proc-fork:    155.38    153.38 ( -1%)    155.60 (  0%)    153.33 ( -1%)
 syscall-open:      5.30      5.32 (  0%)      5.37 (  1%)      5.32 (  0%)
 hackbench-10:      1.92      1.90 ( -1%)      1.89 ( -1%)      1.85 ( -4%)
 hackbench-20:      3.88      3.64 ( -6%)      3.65 ( -6%)      3.61 ( -6%)
 hackbench-50:      9.75      9.48 ( -2%)      9.28 ( -4%)      9.24 ( -5%)
       volano:     28.18     28.99 (  2%)     28.40 (  0%)     27.63 ( -1%)

this confirms that smpnice has no effect on an UP box => good.

all in all, my conclusion is that Peter's patch fixes the smpnice 
slowdown on a wide range of boxes:

Acked-by: Ingo Molnar <mingo@elte.hu>

	Ingo


* Re: [rfc][patch] sched: remove smpnice
  2006-02-07 23:36       ` Andrew Morton
  2006-02-08  3:28         ` Nick Piggin
  2006-02-08 14:56         ` Ingo Molnar
@ 2006-02-10  7:01         ` Siddha, Suresh B
  2006-02-10  7:17           ` Andrew Morton
  2006-02-13 14:12           ` Con Kolivas
  2 siblings, 2 replies; 33+ messages in thread
From: Siddha, Suresh B @ 2006-02-10  7:01 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Con Kolivas, npiggin, mingo, rostedt, pwil3058, suresh.b.siddha,
	linux-kernel, torvalds

On Tue, Feb 07, 2006 at 03:36:17PM -0800, Andrew Morton wrote:
> Suresh, Martin, Ingo, Nick and Con: please drop everything, triple-check
> and test this:
> 
> From: Peter Williams <pwil3058@bigpond.net.au>
> 
> This is a modified version of Con Kolivas's patch to add "nice" support to
> load balancing across physical CPUs on SMP systems.

I have couple of issues with this patch.

a) on a lightly loaded system, this will result in a higher priority job
hopping around from one processor to another. This is because the code in
find_busiest_group() assumes that SCHED_LOAD_SCALE represents a unit process
load, and with the nice_to_bias calculations this is no longer true (in the
presence of non nice-0 tasks).

My testing showed that 178.galgel in SPECfp2000 is down by ~10% when run with 
nice -20 on a 4P(8-way with HT) system compared to a nice-0 run.
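
To illustrate the arithmetic behind (a), assuming the NICE_TO_BIAS_PRIO()
and weighted_load() definitions from Peter's patch and
SCHED_LOAD_SCALE == 128:

#define SCHED_LOAD_SCALE	128UL
#define NICE_TO_BIAS_PRIO(nice)	(20 - (nice))
#define weighted_load(bias)	((bias) * SCHED_LOAD_SCALE / NICE_TO_BIAS_PRIO(0))

/*
 * weighted_load(NICE_TO_BIAS_PRIO(0))   == 128 == 1 * SCHED_LOAD_SCALE
 * weighted_load(NICE_TO_BIAS_PRIO(-20)) == 256 == 2 * SCHED_LOAD_SCALE
 *
 * A CPU running a single nice -20 task therefore reports the load of two
 * nice 0 tasks, while find_busiest_group() still treats SCHED_LOAD_SCALE
 * as "one task" when computing the imbalance, hence the hopping.
 */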

b) On a lightly loaded system, this can result in HT scheduler optimizations
being disabled in the presence of low priority tasks... in this case, they (the
low priority ones) can end up running on the same package, even in the presence
of other idle packages. Though this is not as serious as "a" above...

thanks,
suresh


* Re: [rfc][patch] sched: remove smpnice
  2006-02-10  7:01         ` Siddha, Suresh B
@ 2006-02-10  7:17           ` Andrew Morton
  2006-02-10  7:23             ` Con Kolivas
                               ` (2 more replies)
  2006-02-13 14:12           ` Con Kolivas
  1 sibling, 3 replies; 33+ messages in thread
From: Andrew Morton @ 2006-02-10  7:17 UTC (permalink / raw)
  To: Siddha, Suresh B
  Cc: kernel, npiggin, mingo, rostedt, pwil3058, suresh.b.siddha,
	linux-kernel, torvalds

"Siddha, Suresh B" <suresh.b.siddha@intel.com> wrote:
>
> On Tue, Feb 07, 2006 at 03:36:17PM -0800, Andrew Morton wrote:
> > Suresh, Martin, Ingo, Nick and Con: please drop everything, triple-check
> > and test this:
> > 
> > From: Peter Williams <pwil3058@bigpond.net.au>
> > 
> > This is a modified version of Con Kolivas's patch to add "nice" support to
> > load balancing across physical CPUs on SMP systems.
> 
> I have couple of issues with this patch.
> 
> a) on a lightly loaded system, this will result in higher priority job hopping 
> around from one processor to another processor.. This is because of the 
> code in find_busiest_group() which assumes that SCHED_LOAD_SCALE represents 
> a unit process load and with nice_to_bias calculations this is no longer 
> true(in the presence of non nice-0 tasks)
> 
> My testing showed that 178.galgel in SPECfp2000 is down by ~10% when run with 
> nice -20 on a 4P(8-way with HT) system compared to a nice-0 run.
> 
> b) On a lightly loaded system, this can result in HT scheduler optimizations
> being disabled in presence of low priority tasks... in this case, they(low
> priority ones) can end up running on the same package, even in the presence 
> of other idle packages.. Though this is not as serious as "a" above...
> 

Thanks very much for discovering those things.

That rather leaves us in a pickle wrt 2.6.16.

It looks like we back out smpnice after all?

Whatever we do, time is pressing.


* Re: [rfc][patch] sched: remove smpnice
  2006-02-10  7:17           ` Andrew Morton
@ 2006-02-10  7:23             ` Con Kolivas
  2006-02-10  9:06             ` Ingo Molnar
  2006-02-11  1:27             ` Peter Williams
  2 siblings, 0 replies; 33+ messages in thread
From: Con Kolivas @ 2006-02-10  7:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Siddha, Suresh B, npiggin, mingo, rostedt, pwil3058,
	linux-kernel, torvalds

On Friday 10 February 2006 18:17, Andrew Morton wrote:
> "Siddha, Suresh B" <suresh.b.siddha@intel.com> wrote:
> > On Tue, Feb 07, 2006 at 03:36:17PM -0800, Andrew Morton wrote:
> > > Suresh, Martin, Ingo, Nick and Con: please drop everything,
> > > triple-check and test this:
> > >
> > > From: Peter Williams <pwil3058@bigpond.net.au>
> > >
> > > This is a modified version of Con Kolivas's patch to add "nice" support
> > > to load balancing across physical CPUs on SMP systems.
> >
> > I have couple of issues with this patch.
> >
> > a) on a lightly loaded system, this will result in higher priority job
> > hopping around from one processor to another processor.. This is because
> > of the code in find_busiest_group() which assumes that SCHED_LOAD_SCALE
> > represents a unit process load and with nice_to_bias calculations this is
> > no longer true(in the presence of non nice-0 tasks)
> >
> > My testing showed that 178.galgel in SPECfp2000 is down by ~10% when run
> > with nice -20 on a 4P(8-way with HT) system compared to a nice-0 run.
> >
> > b) On a lightly loaded system, this can result in HT scheduler
> > optimizations being disabled in presence of low priority tasks... in this
> > case, they(low priority ones) can end up running on the same package,
> > even in the presence of other idle packages.. Though this is not as
> > serious as "a" above...
>
> Thanks very much for discovering those things.
>
> That rather leaves us in a pickle wrt 2.6.16.
>
> It looks like we back out smpnice after all?

Give it the arse.

> Whatever we do, time is pressing.

We did without smp nice from 2.6.0 till 2.6.14; we can do without it again for 
some more time. Put it back in -mm for more tweaking, and hopefully this added 
attention will get it more testing before it is pushed.

Cheers,
Con


* Re: [rfc][patch] sched: remove smpnice
  2006-02-10  7:17           ` Andrew Morton
  2006-02-10  7:23             ` Con Kolivas
@ 2006-02-10  9:06             ` Ingo Molnar
  2006-02-11  1:27             ` Peter Williams
  2 siblings, 0 replies; 33+ messages in thread
From: Ingo Molnar @ 2006-02-10  9:06 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Siddha, Suresh B, kernel, npiggin, rostedt, pwil3058,
	linux-kernel, torvalds


* Andrew Morton <akpm@osdl.org> wrote:

> Thanks very much for discovering those things.
> 
> That rather leaves us in a pickle wrt 2.6.16.
> 
> It looks like we back out smpnice after all?
> 
> Whatever we do, time is pressing.

oh well, i think it's strike 3 now and let's apply the reverter for 
2.6.16. We should keep it in -mm though.

	Ingo


* Re: [rfc][patch] sched: remove smpnice
  2006-02-10  7:17           ` Andrew Morton
  2006-02-10  7:23             ` Con Kolivas
  2006-02-10  9:06             ` Ingo Molnar
@ 2006-02-11  1:27             ` Peter Williams
  2006-02-11  2:00               ` Andrew Morton
                                 ` (3 more replies)
  2 siblings, 4 replies; 33+ messages in thread
From: Peter Williams @ 2006-02-11  1:27 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Siddha, Suresh B, kernel, npiggin, mingo, rostedt, linux-kernel,
	torvalds

Andrew Morton wrote:
> "Siddha, Suresh B" <suresh.b.siddha@intel.com> wrote:
> 
>>On Tue, Feb 07, 2006 at 03:36:17PM -0800, Andrew Morton wrote:
>>
>>>Suresh, Martin, Ingo, Nick and Con: please drop everything, triple-check
>>>and test this:
>>>
>>>From: Peter Williams <pwil3058@bigpond.net.au>
>>>
>>>This is a modified version of Con Kolivas's patch to add "nice" support to
>>>load balancing across physical CPUs on SMP systems.
>>
>>I have a couple of issues with this patch.
>>
>>a) on a lightly loaded system, this will result in higher priority job hopping 
>>around from one processor to another processor.. This is because of the 
>>code in find_busiest_group() which assumes that SCHED_LOAD_SCALE represents 
>>a unit process load and with nice_to_bias calculations this is no longer 
>>true(in the presence of non nice-0 tasks)
>>
>>My testing showed that 178.galgel in SPECfp2000 is down by ~10% when run with 
>>nice -20 on a 4P(8-way with HT) system compared to a nice-0 run.

This is a bit of a surprise.  Surely, even with this mod, a task 
shouldn't be moved if it's the only runnable one on its CPU.  And if it 
isn't the only runnable one on its CPU, isn't actually on the CPU and 
isn't cache hot, then moving it to another (presumably) idle CPU 
should be a gain?

Presumably the delay waiting for the current task to exit the CPU is 
less than the time taken to move the task to the new CPU?  I'd guess 
that this means the task about to be moved is either: a) higher 
priority than the current task on the CPU and waiting for it to be 
preempted off, or b) equal priority to the current task (or at least 
the next one due to be scheduled), waiting for the current task to 
surrender the CPU, where that surrender is going to happen pretty 
quickly due to the current task's natural behaviour?

Is it normal to run enough -20 tasks to cause this problem to manifest?

>>
>>b) On a lightly loaded system, this can result in HT scheduler optimizations
>>being disabled in presence of low priority tasks... in this case, they(low
>>priority ones) can end up running on the same package, even in the presence 
>>of other idle packages.. Though this is not as serious as "a" above...
>>

I think that this issue comes under the heading of "Result of better 
nice enforcement", which is the purpose of the patch :-).  I wouldn't 
call this HT disablement, or do I misunderstand the issue?

The only way that I can see load balancing subverting the HT scheduling 
mechanisms is if (say) there are 2 CPUs with 2 HT channels each and all 
of the high priority tasks end up sharing the 2 channels of one CPU 
while all of the low priority tasks share the 2 channels of the other 
one.  This scenario is far more likely to happen without the smpnice 
patches than with them.

> 
> 
> Thanks very much for discovering those things.
> 
> That rather leaves us in a pickle wrt 2.6.16.
> 
> It looks like we back out smpnice after all?
> 
> Whatever we do, time is pressing.

I don't think either of these issues warrant abandoning smpnice.  The 
first is highly unlikely to occur on real systems and the second is just 
an example of the patch doing its job (maybe too officiously).  I don't 
think users would notice either on real systems.

Even if you pull it from 2.6.16 rather than upgrading it with my patch 
can you please leave both in -mm?

I think that there are a few inexpensive things that can be tried before 
we go as far as sophisticated solutions (such as guesstimating how long a 
task will have to wait for CPU access if we don't move it).  E.g. with 
this patch move_tasks() takes two arguments: a) the maximum number of 
tasks to be moved and b) the maximum amount of biased load to be moved.  
For normal use it is just passed max(nr_running - 1, 0) as the first of 
these arguments and the move is controlled by the second, but we could 
modify find_busiest_group() to give us values for both arguments (a toy 
sketch of the dual limit follows).  Other options include modifying the 
function that maps nice to bias_prio so that the weights aren't quite so 
heavy.  Leaving the patches in -mm would allow some of these options to 
be tested.
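
To illustrate the dual limit, here is a minimal, self-contained sketch in
plain C (the names and data structures are hypothetical and purely for
discussion; this is not the actual scheduler code):

#include <stddef.h>

struct toy_task { unsigned long bias_prio; struct toy_task *next; };

/*
 * Move tasks from src to dst until either max_nr_move tasks or
 * max_bias_move worth of biased load has been moved, whichever
 * limit is hit first.
 */
static size_t toy_move_tasks(struct toy_task **src, struct toy_task **dst,
			     size_t max_nr_move, unsigned long max_bias_move)
{
	size_t moved = 0;
	unsigned long bias_moved = 0;

	while (*src && moved < max_nr_move &&
	       bias_moved + (*src)->bias_prio <= max_bias_move) {
		struct toy_task *t = *src;

		*src = t->next;		/* unlink from the source queue */
		t->next = *dst;		/* push onto the destination queue */
		*dst = t;
		bias_moved += t->bias_prio;
		moved++;
	}
	return moved;
}

Passing max(nr_running - 1, 0) as the first limit then corresponds to never
emptying the source queue completely.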

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [rfc][patch] sched: remove smpnice
  2006-02-11  1:27             ` Peter Williams
@ 2006-02-11  2:00               ` Andrew Morton
  2006-02-12  1:13                 ` Peter Williams
  2006-02-11  3:36               ` Peter Williams
                                 ` (2 subsequent siblings)
  3 siblings, 1 reply; 33+ messages in thread
From: Andrew Morton @ 2006-02-11  2:00 UTC (permalink / raw)
  To: Peter Williams
  Cc: suresh.b.siddha, kernel, npiggin, mingo, rostedt, linux-kernel, torvalds

Peter Williams <pwil3058@bigpond.net.au> wrote:
>
> I don't think either of these issues warrant abandoning smpnice.  The 
>  first is highly unlikely to occur on real systems and the second is just 
>  an example of the patch doing its job (maybe too officiously).  I don't 
>  think users would notice either on real systems.
> 
>  Even if you pull it from 2.6.16 rather than upgrading it with my patch 
>  can you please leave both in -mm?

Yes, I have done that.  I currently have:

sched-restore-smpnice.patch
sched-modified-nice-support-for-smp-load-balancing.patch
sched-cleanup_task_activated.patch
sched-alter_uninterruptible_sleep_interactivity.patch
sched-make_task_noninteractive_use_sleep_type.patch
sched-dont_decrease_idle_sleep_avg.patch
sched-include_noninteractive_sleep_in_idle_detect.patch
sched-new-sched-domain-for-representing-multi-core.patch
sched-fix-group-power-for-allnodes_domains.patch


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [rfc][patch] sched: remove smpnice
  2006-02-11  1:27             ` Peter Williams
  2006-02-11  2:00               ` Andrew Morton
@ 2006-02-11  3:36               ` Peter Williams
  2006-02-11  4:04               ` Peter Williams
  2006-02-14  9:07               ` Siddha, Suresh B
  3 siblings, 0 replies; 33+ messages in thread
From: Peter Williams @ 2006-02-11  3:36 UTC (permalink / raw)
  To: Siddha, Suresh B
  Cc: Andrew Morton, kernel, npiggin, mingo, rostedt, linux-kernel, torvalds

Peter Williams wrote:
> Andrew Morton wrote:
> 
>> "Siddha, Suresh B" <suresh.b.siddha@intel.com> wrote:
>>
>>> On Tue, Feb 07, 2006 at 03:36:17PM -0800, Andrew Morton wrote:
>>>
>>>> Suresh, Martin, Ingo, Nick and Con: please drop everything, 
>>>> triple-check
>>>> and test this:
>>>>
>>>> From: Peter Williams <pwil3058@bigpond.net.au>
>>>>
>>>> This is a modified version of Con Kolivas's patch to add "nice" 
>>>> support to
>>>> load balancing across physical CPUs on SMP systems.
>>>
>>>
>>> I have a couple of issues with this patch.
>>>
>>> a) on a lightly loaded system, this will result in higher priority 
>>> job hopping around from one processor to another processor.. This is 
>>> because of the code in find_busiest_group() which assumes that 
>>> SCHED_LOAD_SCALE represents a unit process load and with nice_to_bias 
>>> calculations this is no longer true(in the presence of non nice-0 tasks)
>>>
>>> My testing showed that 178.galgel in SPECfp2000 is down by ~10% when 
>>> run with nice -20 on a 4P(8-way with HT) system compared to a nice-0 
>>> run.
> 
> 
> This is a bit of a surprise.  Surely, even with this mod, a task 
> shouldn't be moved if it's the only runnable one on its CPU.  If it 
> isn't the only runnable one on its CPU, it's not actually on the CPU and 
> it's not cache hot then moving it to another (presumably) idle CPU 
> should be a gain?
> 
> Presumably the delay waiting for the current task to exit the CPU is 
> less than the time taken to move the task to the new CPU?  I'd guess 
> that this means that the task about to be moved is either: a) higher 
> priority than the current task on the CPU and is waiting for it to be 
> preempted off or b) it's equal priority (or at least next one due to be 
> scheduled) to the current task, waiting for the current task to 
> surrender the CPU and that surrender is going to happen pretty quickly 
> due to the current task's natural behaviour?

After a little lie down :-), I now think that this problem has been 
misdiagnosed and that the actual problem is that movement of high priority 
tasks on lightly loaded systems is suppressed by this patch rather than 
the patch causing such tasks to hop from CPU to CPU.

The reason that I think this is that the implementation of biased_load() 
makes it a likely outcome.  As a shortcut to converting weighted load to 
biased load it assumes that the average weighted load per runnable task 
is 1 (or, equivalently, that the average biased prio per runnable task is 
NICE_TO_BIAS_PRIO(p)), and this means that if there's only one task to 
move and its nice value is less than zero (i.e. it's high priority) then 
the biased load to be moved that is calculated will be smaller than that 
task's bias_prio, causing it NOT to be moved by move_tasks().
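
To put numbers on that, a toy calculation (the constants here are
assumptions made purely for illustration, not values taken from the patch):

#include <stdio.h>

/* Illustrative constants only: NICE_TO_BIAS_PRIO(nice) is assumed to work
 * out to 20 - nice, and SCHED_LOAD_SCALE is assumed to be 128. */
#define TOY_SCHED_LOAD_SCALE	128UL
#define TOY_NICE_TO_BIAS(nice)	(20UL - (nice))

int main(void)
{
	/* the only task available to move is a nice -20 task */
	unsigned long task_bias = TOY_NICE_TO_BIAS(-20);	/* 40 */
	/* a raw imbalance worth roughly one task's load */
	unsigned long wload = TOY_SCHED_LOAD_SCALE;
	/* the biased_load() shortcut scales by the nice-0 bias */
	unsigned long imbalance = wload * TOY_NICE_TO_BIAS(0) /
				  TOY_SCHED_LOAD_SCALE;

	printf("imbalance=%lu task_bias=%lu -> %s\n", imbalance, task_bias,
	       imbalance >= task_bias ? "task can be moved" : "move suppressed");
	return 0;
}

With those numbers the computed biased imbalance (20) is below the task's
bias (40), so the move is suppressed, which is the behaviour hypothesised
above.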

Do you have any direct evidence to support your "hopping" hypothesis or 
is my hypothesis equally likely?

If my hypothesis holds, there is a relatively simple fix that would 
involve modifying biased_load() to take into account rq->prio_bias and 
rq->nr_running during its calculations.  Basically, in that function, 
(wload * NICE_TO_BIAS_PRIO(0)) would be replaced by (wload * 
rq->prio_bias / rq->nr_running), which would, in turn, require rq to be 
passed in as an argument.
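
In code, the change being described would look roughly like this (a sketch
only; the rq-aware variant and its zero-nr_running guard are illustrative,
not the actual patch):

/* the helper as it stands in the -mm patch: a shortcut that scales the
 * weighted load by the nice-0 bias */
static inline unsigned long biased_load(unsigned long wload)
{
	return (wload * NICE_TO_BIAS_PRIO(0)) / SCHED_LOAD_SCALE;
}

/* proposed rq-aware variant (illustrative only): scale by the queue's own
 * average bias per runnable task instead of the nice-0 value */
static inline unsigned long biased_load_rq(unsigned long wload, runqueue_t *rq)
{
	unsigned long n = rq->nr_running;

	return n ? (wload * rq->prio_bias / n) / SCHED_LOAD_SCALE
		 : biased_load(wload);
}

which is essentially the substitution described above, with rq supplied by
the caller.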

If there is direct evidence of hopping then, because of my hypothesis 
above, I would shift the suspicion from find_busiest_group() to 
try_to_wake_up().

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [rfc][patch] sched: remove smpnice
  2006-02-11  1:27             ` Peter Williams
  2006-02-11  2:00               ` Andrew Morton
  2006-02-11  3:36               ` Peter Williams
@ 2006-02-11  4:04               ` Peter Williams
  2006-02-14  9:07               ` Siddha, Suresh B
  3 siblings, 0 replies; 33+ messages in thread
From: Peter Williams @ 2006-02-11  4:04 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Siddha, Suresh B, kernel, npiggin, mingo, rostedt, linux-kernel,
	torvalds

Peter Williams wrote:
> Andrew Morton wrote:
> 
>> "Siddha, Suresh B" <suresh.b.siddha@intel.com> wrote:
>>
>>> b) On a lightly loaded system, this can result in HT scheduler 
>>> optimizations
>>> being disabled in presence of low priority tasks... in this case, 
>>> they(low
>>> priority ones) can end up running on the same package, even in the 
>>> presence of other idle packages.. Though this is not as serious as 
>>> "a" above...
>>>
> 
> I think that this issue comes under the heading of "Result of better 
> nice enforcement" which is the purpose of the patch :-).

On the assumption that this enforcement is considered to be too 
vigorous, I think that it is also amenable to a fix based on a new 
biased_load() function: replacing the (*imbalance < SCHED_LOAD_SCALE) 
test with (biased_load(*imbalance, busiest) == 0) and (possibly) making 
some modifications within the if statement's body (most notably replacing 
the NICE_TO_BIAS_PRIO(0) expressions with (busiest->prio_bias / 
busiest->nr_running) or something similar).

This change would cause no change in functionality in the case where all 
tasks are nice==0.

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [rfc][patch] sched: remove smpnice
  2006-02-11  2:00               ` Andrew Morton
@ 2006-02-12  1:13                 ` Peter Williams
  2006-02-12 23:10                   ` Peter Williams
  0 siblings, 1 reply; 33+ messages in thread
From: Peter Williams @ 2006-02-12  1:13 UTC (permalink / raw)
  To: Andrew Morton
  Cc: suresh.b.siddha, kernel, npiggin, mingo, rostedt, linux-kernel, torvalds

[-- Attachment #1: Type: text/plain, Size: 2388 bytes --]

Andrew Morton wrote:
> Peter Williams <pwil3058@bigpond.net.au> wrote:
> 
>>I don't think either of these issues warrant abandoning smpnice.  The 
>> first is highly unlikely to occur on real systems and the second is just 
>> an example of the patch doing its job (maybe too officiously).  I don't 
>> think users would notice either on real systems.
>>
>> Even if you pull it from 2.6.16 rather than upgrading it with my patch 
>> can you please leave both in -mm?
> 
> 
> Yes, I have done that.  I currently have:
> 
> sched-restore-smpnice.patch
> sched-modified-nice-support-for-smp-load-balancing.patch
> sched-cleanup_task_activated.patch
> sched-alter_uninterruptible_sleep_interactivity.patch
> sched-make_task_noninteractive_use_sleep_type.patch
> sched-dont_decrease_idle_sleep_avg.patch
> sched-include_noninteractive_sleep_in_idle_detect.patch
> sched-new-sched-domain-for-representing-multi-core.patch
> sched-fix-group-power-for-allnodes_domains.patch

OK.  Having slept on these problems I am now of the opinion that they 
are caused by the use of NICE_TO_BIAS_PRIO(0) to set *imbalance 
inside the (*imbalance < SCHED_LOAD_SCALE) if statement in 
find_busiest_group().  What is happening here is that, even though the 
imbalance is less than one (average) task, sometimes the decision is made 
to move a task anyway, but with the current version this decision can be 
subverted in two ways: 1) if the task to be moved has a nice value less 
than zero, the value of *imbalance that is set will be too small for 
move_tasks() to move it; and 2) if there are a number of tasks with nice 
values greater than zero on the "busiest" group, more than one of them 
may be moved, as the value of *imbalance that is set may be big enough to 
include more than one of these tasks.

The fix for this problem is to replace NICE_TO_BIAS_PRIO(0) with the 
"average bias prio per runnable task" on "busiest".  This will 
(generally) result in a larger value for *imbalance in case 1. above and 
a smaller one in case 2. and alleviate both problems.  A patch to apply 
this fix is attached.

Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>

Could you please add this patch to -mm so that it can be tested?

Thanks
Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

[-- Attachment #2: fix-smpnice-light-load-problem --]
[-- Type: text/plain, Size: 1320 bytes --]

Index: MM-2.6.X/kernel/sched.c
===================================================================
--- MM-2.6.X.orig/kernel/sched.c	2006-02-12 11:24:48.000000000 +1100
+++ MM-2.6.X/kernel/sched.c	2006-02-12 11:35:40.000000000 +1100
@@ -735,6 +735,19 @@ static inline unsigned long biased_load(
 {
 	return (wload * NICE_TO_BIAS_PRIO(0)) / SCHED_LOAD_SCALE;
 }
+
+/* get the average biased load per runnable task for a run queue */
+static inline unsigned long avg_biased_load(runqueue_t *rq)
+{
+	/*
+	 * When I'm convinced that this won't be called with a zero nr_running
+	 * and that it can't change during the call this can be simplified.
+	 * For the time being and for proof of concept let's play it safe.
+	 */
+	unsigned long n = rq->nr_running;
+
+	return n ? rq->prio_bias / n : 0;
+}
 #else
 static inline void set_bias_prio(task_t *p)
 {
@@ -2116,7 +2129,7 @@ find_busiest_group(struct sched_domain *
 		unsigned long tmp;
 
 		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
-			*imbalance = NICE_TO_BIAS_PRIO(0);
+			*imbalance = avg_biased_load(busiest);
 			return busiest;
 		}
 
@@ -2149,7 +2162,7 @@ find_busiest_group(struct sched_domain *
 		if (pwr_move <= pwr_now)
 			goto out_balanced;
 
-		*imbalance = NICE_TO_BIAS_PRIO(0);
+		*imbalance = avg_biased_load(busiest);
 		return busiest;
 	}
 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [rfc][patch] sched: remove smpnice
  2006-02-12  1:13                 ` Peter Williams
@ 2006-02-12 23:10                   ` Peter Williams
  2006-02-13  1:06                     ` Peter Williams
  0 siblings, 1 reply; 33+ messages in thread
From: Peter Williams @ 2006-02-12 23:10 UTC (permalink / raw)
  To: Peter Williams
  Cc: Andrew Morton, suresh.b.siddha, kernel, npiggin, mingo, rostedt,
	linux-kernel, torvalds

Peter Williams wrote:
> Andrew Morton wrote:
> 
>> Peter Williams <pwil3058@bigpond.net.au> wrote:
>>
>>> I don't think either of these issues warrant abandoning smpnice.  The 
>>> first is highly unlikely to occur on real systems and the second is 
>>> just an example of the patch doing its job (maybe too officiously).  
>>> I don't think users would notice either on real systems.
>>>
>>> Even if you pull it from 2.6.16 rather than upgrading it with my 
>>> patch can you please leave both in -mm?
>>
>>
>>
>> Yes, I have done that.  I currently have:
>>
>> sched-restore-smpnice.patch
>> sched-modified-nice-support-for-smp-load-balancing.patch
>> sched-cleanup_task_activated.patch
>> sched-alter_uninterruptible_sleep_interactivity.patch
>> sched-make_task_noninteractive_use_sleep_type.patch
>> sched-dont_decrease_idle_sleep_avg.patch
>> sched-include_noninteractive_sleep_in_idle_detect.patch
>> sched-new-sched-domain-for-representing-multi-core.patch
>> sched-fix-group-power-for-allnodes_domains.patch
> 
> 
> OK.  Having slept on these problems I am now of the opinion that the 
> problems are caused by the use of NICE_TO_BIAS_PRIO(0) to set *imbalance 
> inside the (*imbalance < SCHED_LOAD_SCALE) if statement in 
> find_busiest_group().  What is happening here is that even though the 
> imbalance is less than one (average) task sometimes the decision is made 
> to move a task anyway but with the current version this decision can be 
> subverted in two ways: 1) if the task to be moved has a nice value less 
> than zero the value of *imbalance that is set will be too small for 
> move_tasks() to move it; and 2) if there are a number of tasks with nice 
> values greater than zero on the "busiest" more than one of them may be 
> moved as the value of *imbalance that is set may be big enough to 
> include more than one of these tasks.
> 
> The fix for this problem is to replace NICE_TO_BIAS_PRIO(0) with the 
> "average bias prio per runnable task" on "busiest".  This will 
> (generally) result in a larger value for *imbalance in case 1. above and 
> a smaller one in case 2. and alleviate both problems.  A patch to apply 
> this fix is attached.
> 
> Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
> 
> Could you please add this patch to -mm so that it can be tested?
> 
> Thanks
> Peter
> 
> 
> ------------------------------------------------------------------------
> 
> Index: MM-2.6.X/kernel/sched.c
> ===================================================================
> --- MM-2.6.X.orig/kernel/sched.c	2006-02-12 11:24:48.000000000 +1100
> +++ MM-2.6.X/kernel/sched.c	2006-02-12 11:35:40.000000000 +1100
> @@ -735,6 +735,19 @@ static inline unsigned long biased_load(
>  {
>  	return (wload * NICE_TO_BIAS_PRIO(0)) / SCHED_LOAD_SCALE;
>  }
> +
> +/* get the average biased load per runnable task for a run queue */
> +static inline unsigned long avg_biased_load(runqueue_t *rq)
> +{
> +	/*
> +	 * When I'm convinced that this won't be called with a zero nr_running
> +	 * and that it can't change during the call this can be simplified.
> +	 * For the time being and for proof of concept let's play it safe.
> +	 */
> +	unsigned long n = rq->nr_running;
> +
> +	return n ? rq->prio_bias / n : 0;
> +}
>  #else
>  static inline void set_bias_prio(task_t *p)
>  {
> @@ -2116,7 +2129,7 @@ find_busiest_group(struct sched_domain *
>  		unsigned long tmp;
>  
>  		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
> -			*imbalance = NICE_TO_BIAS_PRIO(0);
> +			*imbalance = avg_biased_load(busiest);
>  			return busiest;
>  		}
>  
> @@ -2149,7 +2162,7 @@ find_busiest_group(struct sched_domain *
>  		if (pwr_move <= pwr_now)
>  			goto out_balanced;
>  
> -		*imbalance = NICE_TO_BIAS_PRIO(0);
> +		*imbalance = avg_biased_load(busiest);
>  		return busiest;
>  	}
>  

Can we pull this one, please?  I've mistakenly assumed that busiest was 
a run queue when it's actually a sched group. :-(

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [rfc][patch] sched: remove smpnice
  2006-02-12 23:10                   ` Peter Williams
@ 2006-02-13  1:06                     ` Peter Williams
  2006-02-14  0:37                       ` Peter Williams
  0 siblings, 1 reply; 33+ messages in thread
From: Peter Williams @ 2006-02-13  1:06 UTC (permalink / raw)
  To: Andrew Morton
  Cc: suresh.b.siddha, kernel, npiggin, mingo, rostedt, linux-kernel, torvalds

[-- Attachment #1: Type: text/plain, Size: 3226 bytes --]

Peter Williams wrote:
> Peter Williams wrote:
> 
>> Andrew Morton wrote:
>>
>>> Peter Williams <pwil3058@bigpond.net.au> wrote:
>>>
>>>> I don't think either of these issues warrant abandoning smpnice.  
>>>> The first is highly unlikely to occur on real systems and the second 
>>>> is just an example of the patch doing its job (maybe too 
>>>> officiously).  I don't think users would notice either on real systems.
>>>>
>>>> Even if you pull it from 2.6.16 rather than upgrading it with my 
>>>> patch can you please leave both in -mm?
>>>
>>>
>>>
>>>
>>> Yes, I have done that.  I currently have:
>>>
>>> sched-restore-smpnice.patch
>>> sched-modified-nice-support-for-smp-load-balancing.patch
>>> sched-cleanup_task_activated.patch
>>> sched-alter_uninterruptible_sleep_interactivity.patch
>>> sched-make_task_noninteractive_use_sleep_type.patch
>>> sched-dont_decrease_idle_sleep_avg.patch
>>> sched-include_noninteractive_sleep_in_idle_detect.patch
>>> sched-new-sched-domain-for-representing-multi-core.patch
>>> sched-fix-group-power-for-allnodes_domains.patch
>>
>>
>>
>> OK.  Having slept on these problems I am now of the opinion that the 
>> problems are caused by the use of NICE_TO_BIAS_PRIO(0) to set 
>> *imbalance inside the (*imbalance < SCHED_LOAD_SCALE) if statement in 
>> find_busiest_group().  What is happening here is that even though the 
>> imbalance is less than one (average) task sometimes the decision is 
>> made to move a task anyway but with the current version this decision 
>> can be subverted in two ways: 1) if the task to be moved has a nice 
>> value less than zero the value of *imbalance that is set will be too 
>> small for move_tasks() to move it; and 2) if there are a number of 
>> tasks with nice values greater than zero on the "busiest" more than 
>> one of them may be moved as the value of *imbalance that is set may be 
>> big enough to include more than one of these tasks.
>>
>> The fix for this problem is to replace NICE_TO_BIAS_PRIO(0) with the 
>> "average bias prio per runnable task" on "busiest".  This will 
>> (generally) result in a larger value for *imbalance in case 1. above 
>> and a smaller one in case 2. and alleviate both problems.  A patch to 
>> apply this fix is attached.
>>
>> Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
>>
>> Could you please add this patch to -mm so that it can be tested?
>>
>> [old patch removed]
> 
> 
> Can we pull this one, please?  I've mistakenly assumed that busiest was 
> a run queue when it's actually a sched group. :-(

Here's a fixed version that takes into account the fact that busiest is 
a group, not a run queue.  This patch also includes "improvements" to the 
logic at the end of find_busiest_group() to take account of the fact 
that the loads are weighted.  Hopefully, the comments in the code 
explain these changes.

Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>

Because this is a more substantial change it may be advisable to wait 
for comments from Ingo and Nick before including this in -mm for testing.

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

[-- Attachment #2: fix-smpnice-light-load-problem-v2 --]
[-- Type: text/plain, Size: 3456 bytes --]

Index: MM-2.6.X/kernel/sched.c
===================================================================
--- MM-2.6.X.orig/kernel/sched.c	2006-02-13 10:23:12.000000000 +1100
+++ MM-2.6.X/kernel/sched.c	2006-02-13 11:46:41.000000000 +1100
@@ -2033,9 +2033,11 @@ find_busiest_group(struct sched_domain *
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
 	unsigned long max_pull;
+	unsigned long avg_bias_prio, busiest_prio_bias, busiest_nr_running;
 	int load_idx;
 
 	max_load = this_load = total_load = total_pwr = 0;
+	busiest_prio_bias = busiest_nr_running = 0;
 	if (idle == NOT_IDLE)
 		load_idx = sd->busy_idx;
 	else if (idle == NEWLY_IDLE)
@@ -2047,13 +2049,16 @@ find_busiest_group(struct sched_domain *
 		unsigned long load;
 		int local_group;
 		int i;
+		unsigned long sum_prio_bias, sum_nr_running;
 
 		local_group = cpu_isset(this_cpu, group->cpumask);
 
 		/* Tally up the load of all CPUs in the group */
-		avg_load = 0;
+		sum_prio_bias = sum_nr_running = avg_load = 0;
 
 		for_each_cpu_mask(i, group->cpumask) {
+			runqueue_t *rq = cpu_rq(i);
+
 			if (*sd_idle && !idle_cpu(i))
 				*sd_idle = 0;
 
@@ -2064,6 +2069,8 @@ find_busiest_group(struct sched_domain *
 				load = source_load(i, load_idx);
 
 			avg_load += load;
+			sum_prio_bias += rq->prio_bias;
+			sum_nr_running += rq->nr_running;
 		}
 
 		total_load += avg_load;
@@ -2078,6 +2085,8 @@ find_busiest_group(struct sched_domain *
 		} else if (avg_load > max_load) {
 			max_load = avg_load;
 			busiest = group;
+			busiest_prio_bias = sum_prio_bias;
+			busiest_nr_running = sum_nr_running;
 		}
 		group = group->next;
 	} while (group != sd->groups);
@@ -2111,12 +2120,25 @@ find_busiest_group(struct sched_domain *
 				(avg_load - this_load) * this->cpu_power)
 			/ SCHED_LOAD_SCALE;
 
-	if (*imbalance < SCHED_LOAD_SCALE) {
+	/* assume that busiest_nr_running > 0 */
+	avg_bias_prio = busiest_prio_bias / busiest_nr_running;
+	/*
+	 * Get rid of the scaling factor, rounding down as we divide and
+	 * converting to biased load for use by move_tasks()
+	 */
+	*imbalance = biased_load(*imbalance);
+	/*
+	 * if *imbalance is less than the average runnable task biased prio
+	 * there is no guarantee that any tasks will be moved so we'll have
+	 * a think about bumping its value to force at least one task to be
+	 * moved
+	 */
+	if (*imbalance < avg_bias_prio) {
 		unsigned long pwr_now = 0, pwr_move = 0;
 		unsigned long tmp;
 
-		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
-			*imbalance = NICE_TO_BIAS_PRIO(0);
+		if (biased_load(max_load - this_load) >= avg_bias_prio*2) {
+			*imbalance = avg_bias_prio;
 			return busiest;
 		}
 
@@ -2146,18 +2168,19 @@ find_busiest_group(struct sched_domain *
 		pwr_move /= SCHED_LOAD_SCALE;
 
 		/* Move if we gain throughput */
-		if (pwr_move <= pwr_now)
-			goto out_balanced;
+		if (pwr_move <= pwr_now) {
+			/* or if there's a reasonable chance that *imbalance
+			 * is big enough to cause a move
+			 */
+			if (*imbalance <= avg_bias_prio / 2)
+				goto out_balanced;
+			else
+				return busiest;
+		}
 
-		*imbalance = NICE_TO_BIAS_PRIO(0);
-		return busiest;
+		*imbalance = avg_bias_prio;
 	}
 
-	/*
-	 * Get rid of the scaling factor, rounding down as we divide and
-	 * converting to biased load for use by move_tasks()
-	 */
-	*imbalance = biased_load(*imbalance);
 	return busiest;
 
 out_balanced:

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [rfc][patch] sched: remove smpnice
  2006-02-10  7:01         ` Siddha, Suresh B
  2006-02-10  7:17           ` Andrew Morton
@ 2006-02-13 14:12           ` Con Kolivas
  1 sibling, 0 replies; 33+ messages in thread
From: Con Kolivas @ 2006-02-13 14:12 UTC (permalink / raw)
  To: Siddha, Suresh B
  Cc: Andrew Morton, npiggin, mingo, rostedt, pwil3058, linux-kernel, torvalds

On Friday 10 February 2006 18:01, Siddha, Suresh B wrote:
> On Tue, Feb 07, 2006 at 03:36:17PM -0800, Andrew Morton wrote:
> > Suresh, Martin, Ingo, Nick and Con: please drop everything, triple-check
> > and test this:
> >
> > From: Peter Williams <pwil3058@bigpond.net.au>
> >
> > This is a modified version of Con Kolivas's patch to add "nice" support
> > to load balancing across physical CPUs on SMP systems.
>
> I have a couple of issues with this patch.
>
> a) on a lightly loaded system, this will result in higher priority job
> hopping around from one processor to another processor.. This is because of
> the code in find_busiest_group() which assumes that SCHED_LOAD_SCALE
> represents a unit process load and with nice_to_bias calculations this is
> no longer true(in the presence of non nice-0 tasks)
>
> My testing showed that 178.galgel in SPECfp2000 is down by ~10% when run
> with nice -20 on a 4P(8-way with HT) system compared to a nice-0 run.

While I voted to remove smp nice till this regression is fixed, can I just 
point out a mildly amusing catch-22 in this. Nice levels are _broken_ on smp 
without smp nice, yet it's only by using nice levels that you get this 
performance problem. I'm not arguing for changing our decision based on this.

Cheers,
Con

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [rfc][patch] sched: remove smpnice
  2006-02-13  1:06                     ` Peter Williams
@ 2006-02-14  0:37                       ` Peter Williams
  2006-02-14  8:53                         ` Siddha, Suresh B
  0 siblings, 1 reply; 33+ messages in thread
From: Peter Williams @ 2006-02-14  0:37 UTC (permalink / raw)
  To: Andrew Morton
  Cc: suresh.b.siddha, kernel, npiggin, mingo, rostedt, linux-kernel, torvalds

[-- Attachment #1: Type: text/plain, Size: 7785 bytes --]

Peter Williams wrote:
> Peter Williams wrote:
> 
>> Peter Williams wrote:
>>
>>> Andrew Morton wrote:
>>>
>>>> Peter Williams <pwil3058@bigpond.net.au> wrote:
>>>>
>>>>> I don't think either of these issues warrant abandoning smpnice.  
>>>>> The first is highly unlikely to occur on real systems and the 
>>>>> second is just an example of the patch doing its job (maybe too 
>>>>> officiously).  I don't think users would notice either on real 
>>>>> systems.
>>>>>
>>>>> Even if you pull it from 2.6.16 rather than upgrading it with my 
>>>>> patch can you please leave both in -mm?
>>>>
>>>>
>>>>
>>>>
>>>>
>>>> Yes, I have done that.  I currently have:
>>>>
>>>> sched-restore-smpnice.patch
>>>> sched-modified-nice-support-for-smp-load-balancing.patch
>>>> sched-cleanup_task_activated.patch
>>>> sched-alter_uninterruptible_sleep_interactivity.patch
>>>> sched-make_task_noninteractive_use_sleep_type.patch
>>>> sched-dont_decrease_idle_sleep_avg.patch
>>>> sched-include_noninteractive_sleep_in_idle_detect.patch
>>>> sched-new-sched-domain-for-representing-multi-core.patch
>>>> sched-fix-group-power-for-allnodes_domains.patch
>>>
>>>
>>>
>>>
>>> OK.  Having slept on these problems I am now of the opinion that the 
>>> problems are caused by the use of NICE_TO_BIAS_PRIO(0) to set 
>>> *imbalance inside the (*imbalance < SCHED_LOAD_SCALE) if statement in 
>>> find_busiest_group().  What is happening here is that even though the 
>>> imbalance is less than one (average) task sometimes the decision is 
>>> made to move a task anyway but with the current version this decision 
>>> can be subverted in two ways: 1) if the task to be moved has a nice 
>>> value less than zero the value of *imbalance that is set will be too 
>>> small for move_tasks() to move it; and 2) if there are a number of 
>>> tasks with nice values greater than zero on the "busiest" more than 
>>> one of them may be moved as the value of *imbalance that is set may 
>>> be big enough to include more than one of these tasks.
>>>
>>> The fix for this problem is to replace NICE_TO_BIAS_PRIO(0) with the 
>>> "average bias prio per runnable task" on "busiest".  This will 
>>> (generally) result in a larger value for *imbalance in case 1. above 
>>> and a smaller one in case 2. and alleviate both problems.  A patch to 
>>> apply this fix is attached.
>>>
>>> Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
>>>
>>> Could you please add this patch to -mm so that it can be tested?
>>>
>>> [old patch removed]
>>
>>
>>
>> Can we pull this one, please?  I've mistakenly assumed that busiest 
>> was a run queue when it's actually a sched group. :-(
> 
> 
> Here's a fixed version that takes into account the fact that busiest is 
> a group not a run queue.  This patch also includes "improvements" to the 
> logic at the end of find_busiest_group() to take account of the fact 
> that the loads are weighted.  Hopefully, the comments in the code 
> explain these changes.
> 
> Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
> 
> Because this is a more substantial change it may be advisable to wait 
> for comments from Ingo and Nick before including this in -mm for testing.
> 
> Peter
> 
> 
> ------------------------------------------------------------------------
> 
> Index: MM-2.6.X/kernel/sched.c
> ===================================================================
> --- MM-2.6.X.orig/kernel/sched.c	2006-02-13 10:23:12.000000000 +1100
> +++ MM-2.6.X/kernel/sched.c	2006-02-13 11:46:41.000000000 +1100
> @@ -2033,9 +2033,11 @@ find_busiest_group(struct sched_domain *
>  	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
>  	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
>  	unsigned long max_pull;
> +	unsigned long avg_bias_prio, busiest_prio_bias, busiest_nr_running;
>  	int load_idx;
>  
>  	max_load = this_load = total_load = total_pwr = 0;
> +	busiest_prio_bias = busiest_nr_running = 0;
>  	if (idle == NOT_IDLE)
>  		load_idx = sd->busy_idx;
>  	else if (idle == NEWLY_IDLE)
> @@ -2047,13 +2049,16 @@ find_busiest_group(struct sched_domain *
>  		unsigned long load;
>  		int local_group;
>  		int i;
> +		unsigned long sum_prio_bias, sum_nr_running;
>  
>  		local_group = cpu_isset(this_cpu, group->cpumask);
>  
>  		/* Tally up the load of all CPUs in the group */
> -		avg_load = 0;
> +		sum_prio_bias = sum_nr_running = avg_load = 0;
>  
>  		for_each_cpu_mask(i, group->cpumask) {
> +			runqueue_t *rq = cpu_rq(i);
> +
>  			if (*sd_idle && !idle_cpu(i))
>  				*sd_idle = 0;
>  
> @@ -2064,6 +2069,8 @@ find_busiest_group(struct sched_domain *
>  				load = source_load(i, load_idx);
>  
>  			avg_load += load;
> +			sum_prio_bias += rq->prio_bias;
> +			sum_nr_running += rq->nr_running;
>  		}
>  
>  		total_load += avg_load;
> @@ -2078,6 +2085,8 @@ find_busiest_group(struct sched_domain *
>  		} else if (avg_load > max_load) {
>  			max_load = avg_load;
>  			busiest = group;
> +			busiest_prio_bias = sum_prio_bias;
> +			busiest_nr_running = sum_nr_running;
>  		}
>  		group = group->next;
>  	} while (group != sd->groups);
> @@ -2111,12 +2120,25 @@ find_busiest_group(struct sched_domain *
>  				(avg_load - this_load) * this->cpu_power)
>  			/ SCHED_LOAD_SCALE;
>  
> -	if (*imbalance < SCHED_LOAD_SCALE) {
> +	/* assume that busiest_nr_running > 0 */
> +	avg_bias_prio = busiest_prio_bias / busiest_nr_running;
> +	/*
> +	 * Get rid of the scaling factor, rounding down as we divide and
> +	 * converting to biased load for use by move_tasks()
> +	 */
> +	*imbalance = biased_load(*imbalance);
> +	/*
> +	 * if *imbalance is less than the average runnable task biased prio
> +	 * there is no guarantee that any tasks will be moved so we'll have
> +	 * a think about bumping its value to force at least one task to be
> +	 * moved
> +	 */
> +	if (*imbalance < avg_bias_prio) {
>  		unsigned long pwr_now = 0, pwr_move = 0;
>  		unsigned long tmp;
>  
> -		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
> -			*imbalance = NICE_TO_BIAS_PRIO(0);
> +		if (biased_load(max_load - this_load) >= avg_bias_prio*2) {
> +			*imbalance = avg_bias_prio;
>  			return busiest;
>  		}
>  
> @@ -2146,18 +2168,19 @@ find_busiest_group(struct sched_domain *
>  		pwr_move /= SCHED_LOAD_SCALE;
>  
>  		/* Move if we gain throughput */
> -		if (pwr_move <= pwr_now)
> -			goto out_balanced;
> +		if (pwr_move <= pwr_now) {
> +			/* or if there's a reasonable chance that *imbalance
> +			 * is big enough to cause a move
> +			 */
> +			if (*imbalance <= avg_bias_prio / 2)
> +				goto out_balanced;
> +			else
> +				return busiest;
> +		}
>  
> -		*imbalance = NICE_TO_BIAS_PRIO(0);
> -		return busiest;
> +		*imbalance = avg_bias_prio;
>  	}
>  
> -	/*
> -	 * Get rid of the scaling factor, rounding down as we divide and
> -	 * converting to biased load for use by move_tasks()
> -	 */
> -	*imbalance = biased_load(*imbalance);
>  	return busiest;
>  
>  out_balanced:

Attached is an alternative patch for this problem that adds less 
overhead.  It takes advantage of the fact that biased_load(max_load) for 
busiest group is APPROXIMATELY equal to the sum of the prio_biases for 
the run queues in the group.  This has a number of advantages:

1. less work is done in the do loop
2. there is no need to convert *imbalance to biased load before 
comparison to avg_bias_prio as they are in the same units.

There is one possible disadvantage: the approximation may be sufficiently 
bad to break the functionality.

In summary, if this version works it should be used in preference to v2 
as it involves less overhead.

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

[-- Attachment #2: fix-smpnice-light-load-problem-v3 --]
[-- Type: text/plain, Size: 2917 bytes --]

Index: MM-2.6.X/kernel/sched.c
===================================================================
--- MM-2.6.X.orig/kernel/sched.c	2006-02-13 10:23:12.000000000 +1100
+++ MM-2.6.X/kernel/sched.c	2006-02-14 09:47:17.000000000 +1100
@@ -2033,9 +2033,11 @@ find_busiest_group(struct sched_domain *
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
 	unsigned long max_pull;
+	unsigned long avg_load_per_task, busiest_nr_running;
 	int load_idx;
 
 	max_load = this_load = total_load = total_pwr = 0;
+	busiest_nr_running = 0;
 	if (idle == NOT_IDLE)
 		load_idx = sd->busy_idx;
 	else if (idle == NEWLY_IDLE)
@@ -2047,11 +2049,12 @@ find_busiest_group(struct sched_domain *
 		unsigned long load;
 		int local_group;
 		int i;
+		unsigned long sum_nr_running;
 
 		local_group = cpu_isset(this_cpu, group->cpumask);
 
 		/* Tally up the load of all CPUs in the group */
-		avg_load = 0;
+		sum_nr_running = avg_load = 0;
 
 		for_each_cpu_mask(i, group->cpumask) {
 			if (*sd_idle && !idle_cpu(i))
@@ -2064,6 +2067,7 @@ find_busiest_group(struct sched_domain *
 				load = source_load(i, load_idx);
 
 			avg_load += load;
+			sum_nr_running += cpu_rq(i)->nr_running;
 		}
 
 		total_load += avg_load;
@@ -2078,6 +2082,7 @@ find_busiest_group(struct sched_domain *
 		} else if (avg_load > max_load) {
 			max_load = avg_load;
 			busiest = group;
+			busiest_nr_running = sum_nr_running;
 		}
 		group = group->next;
 	} while (group != sd->groups);
@@ -2111,12 +2116,20 @@ find_busiest_group(struct sched_domain *
 				(avg_load - this_load) * this->cpu_power)
 			/ SCHED_LOAD_SCALE;
 
-	if (*imbalance < SCHED_LOAD_SCALE) {
+	/* Don't assume that busiest_nr_running > 0 */
+	avg_load_per_task = busiest_nr_running ? max_load / busiest_nr_running : avg_load;
+	/*
+	 * if *imbalance is less than the average load per runnable task
+	 * there is no guarantee that any tasks will be moved so we'll have
+	 * a think about bumping its value to force at least one task to be
+	 * moved
+	 */
+	if (*imbalance < avg_load_per_task) {
 		unsigned long pwr_now = 0, pwr_move = 0;
 		unsigned long tmp;
 
-		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
-			*imbalance = NICE_TO_BIAS_PRIO(0);
+		if (max_load - this_load >= avg_load_per_task*2) {
+			*imbalance = biased_load(avg_load_per_task);
 			return busiest;
 		}
 
@@ -2146,11 +2159,14 @@ find_busiest_group(struct sched_domain *
 		pwr_move /= SCHED_LOAD_SCALE;
 
 		/* Move if we gain throughput */
-		if (pwr_move <= pwr_now)
-			goto out_balanced;
-
-		*imbalance = NICE_TO_BIAS_PRIO(0);
-		return busiest;
+		if (pwr_move <= pwr_now) {
+			/* or if there's a reasonable chance that *imbalance
+			 * is big enough to cause a move
+			 */
+			if (*imbalance <= avg_load_per_task / 2)
+				goto out_balanced;
+		} else
+			*imbalance = avg_load_per_task;
 	}
 
 	/*

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [rfc][patch] sched: remove smpnice
  2006-02-14  0:37                       ` Peter Williams
@ 2006-02-14  8:53                         ` Siddha, Suresh B
  0 siblings, 0 replies; 33+ messages in thread
From: Siddha, Suresh B @ 2006-02-14  8:53 UTC (permalink / raw)
  To: Peter Williams
  Cc: Andrew Morton, suresh.b.siddha, kernel, npiggin, mingo, rostedt,
	linux-kernel, torvalds

Actually Peter, I noticed this issue (that you are trying to fix in this
appended patch) too but wasn't much bothered as active load balance
will finally be called with RTPRIO_TO_BIAS_PRIO(100).

But the hopping issue of the high priority process is still there...
And the reason is that the code still assumes that a unit load is
represented by SCHED_LOAD_SCALE.. See the code near the comment
"however we may be able to increase total CPU power used by moving them".
Basically the idle cpu thinks there is an imbalance and kicks active load
balance on the processor with the high priority process. And this goes on
forever.

Please be careful with this area... HT scheduler optimizations depend a 
lot on these code pieces..

Nick and Ingo, I will be on vacation starting from the middle of next week.
So please review and provide feedback to Peter.

thanks,
suresh


On Tue, Feb 14, 2006 at 11:37:24AM +1100, Peter Williams wrote:
> Peter Williams wrote:
> > Peter Williams wrote:
> > 
> >> Peter Williams wrote:
> >>
> >>> Andrew Morton wrote:
> >>>
> >>>> Peter Williams <pwil3058@bigpond.net.au> wrote:
> >>>>
> >>>>> I don't think either of these issues warrant abandoning smpnice.  
> >>>>> The first is highly unlikely to occur on real systems and the 
> >>>>> second is just an example of the patch doing its job (maybe too 
> >>>>> officiously).  I don't think users would notice either on real 
> >>>>> systems.
> >>>>>
> >>>>> Even if you pull it from 2.6.16 rather than upgrading it with my 
> >>>>> patch can you please leave both in -mm?
> >>>>
> >>>>
> >>>>
> >>>>
> >>>>
> >>>> Yes, I have done that.  I currently have:
> >>>>
> >>>> sched-restore-smpnice.patch
> >>>> sched-modified-nice-support-for-smp-load-balancing.patch
> >>>> sched-cleanup_task_activated.patch
> >>>> sched-alter_uninterruptible_sleep_interactivity.patch
> >>>> sched-make_task_noninteractive_use_sleep_type.patch
> >>>> sched-dont_decrease_idle_sleep_avg.patch
> >>>> sched-include_noninteractive_sleep_in_idle_detect.patch
> >>>> sched-new-sched-domain-for-representing-multi-core.patch
> >>>> sched-fix-group-power-for-allnodes_domains.patch
> >>>
> >>>
> >>>
> >>>
> >>> OK.  Having slept on these problems I am now of the opinion that the 
> >>> problems are caused by the use of NICE_TO_BIAS_PRIO(0) to set 
> >>> *imbalance inside the (*imbalance < SCHED_LOAD_SCALE) if statement in 
> >>> find_busiest_group().  What is happening here is that even though the 
> >>> imbalance is less than one (average) task sometimes the decision is 
> >>> made to move a task anyway but with the current version this decision 
> >>> can be subverted in two ways: 1) if the task to be moved has a nice 
> >>> value less than zero the value of *imbalance that is set will be too 
> >>> small for move_tasks() to move it; and 2) if there are a number of 
> >>> tasks with nice values greater than zero on the "busiest" more than 
> >>> one of them may be moved as the value of *imbalance that is set may 
> >>> be big enough to include more than one of these tasks.
> >>>
> >>> The fix for this problem is to replace NICE_TO_BIAS_PRIO(0) with the 
> >>> "average bias prio per runnable task" on "busiest".  This will 
> >>> (generally) result in a larger value for *imbalance in case 1. above 
> >>> and a smaller one in case 2. and alleviate both problems.  A patch to 
> >>> apply this fix is attached.
> >>>
> >>> Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
> >>>
> >>> Could you please add this patch to -mm so that it can be tested?
> >>>
> >>> [old patch removed]
> >>
> >>
> >>
> >> Can we pull this one, please?  I've mistakenly assumed that busiest 
> >> was a run queue when it's actually a sched group. :-(
> > 
> > 
> > Here's a fixed version that takes into account the fact that busiest is 
> > a group not a run queue.  This patch also includes "improvements" to the 
> > logic at the end of find_busiest_group() to take account of the fact 
> > that the loads are weighted.  Hopefully, the comments in the code 
> > explain these changes.
> > 
> > Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
> > 
> > Because this is a more substantial change it may be advisable to wait 
> > for comments from Ingo and Nick before including this in -mm for testing.
> > 
> > Peter
> > 
> > 
> > ------------------------------------------------------------------------
> > 
> > Index: MM-2.6.X/kernel/sched.c
> > ===================================================================
> > --- MM-2.6.X.orig/kernel/sched.c	2006-02-13 10:23:12.000000000 +1100
> > +++ MM-2.6.X/kernel/sched.c	2006-02-13 11:46:41.000000000 +1100
> > @@ -2033,9 +2033,11 @@ find_busiest_group(struct sched_domain *
> >  	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
> >  	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
> >  	unsigned long max_pull;
> > +	unsigned long avg_bias_prio, busiest_prio_bias, busiest_nr_running;
> >  	int load_idx;
> >  
> >  	max_load = this_load = total_load = total_pwr = 0;
> > +	busiest_prio_bias = busiest_nr_running = 0;
> >  	if (idle == NOT_IDLE)
> >  		load_idx = sd->busy_idx;
> >  	else if (idle == NEWLY_IDLE)
> > @@ -2047,13 +2049,16 @@ find_busiest_group(struct sched_domain *
> >  		unsigned long load;
> >  		int local_group;
> >  		int i;
> > +		unsigned long sum_prio_bias, sum_nr_running;
> >  
> >  		local_group = cpu_isset(this_cpu, group->cpumask);
> >  
> >  		/* Tally up the load of all CPUs in the group */
> > -		avg_load = 0;
> > +		sum_prio_bias = sum_nr_running = avg_load = 0;
> >  
> >  		for_each_cpu_mask(i, group->cpumask) {
> > +			runqueue_t *rq = cpu_rq(i);
> > +
> >  			if (*sd_idle && !idle_cpu(i))
> >  				*sd_idle = 0;
> >  
> > @@ -2064,6 +2069,8 @@ find_busiest_group(struct sched_domain *
> >  				load = source_load(i, load_idx);
> >  
> >  			avg_load += load;
> > +			sum_prio_bias += rq->prio_bias;
> > +			sum_nr_running += rq->nr_running;
> >  		}
> >  
> >  		total_load += avg_load;
> > @@ -2078,6 +2085,8 @@ find_busiest_group(struct sched_domain *
> >  		} else if (avg_load > max_load) {
> >  			max_load = avg_load;
> >  			busiest = group;
> > +			busiest_prio_bias = sum_prio_bias;
> > +			busiest_nr_running = sum_nr_running;
> >  		}
> >  		group = group->next;
> >  	} while (group != sd->groups);
> > @@ -2111,12 +2120,25 @@ find_busiest_group(struct sched_domain *
> >  				(avg_load - this_load) * this->cpu_power)
> >  			/ SCHED_LOAD_SCALE;
> >  
> > -	if (*imbalance < SCHED_LOAD_SCALE) {
> > +	/* assume that busiest_nr_running > 0 */
> > +	avg_bias_prio = busiest_prio_bias / busiest_nr_running;
> > +	/*
> > +	 * Get rid of the scaling factor, rounding down as we divide and
> > +	 * converting to biased load for use by move_tasks()
> > +	 */
> > +	*imbalance = biased_load(*imbalance);
> > +	/*
> > +	 * if *imbalance is less than the average runnable task biased prio
> > +	 * there is no guarantee that any tasks will be moved so we'll have
> > +	 * a think about bumping its value to force at least one task to be
> > +	 * moved
> > +	 */
> > +	if (*imbalance < avg_bias_prio) {
> >  		unsigned long pwr_now = 0, pwr_move = 0;
> >  		unsigned long tmp;
> >  
> > -		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
> > -			*imbalance = NICE_TO_BIAS_PRIO(0);
> > +		if (biased_load(max_load - this_load) >= avg_bias_prio*2) {
> > +			*imbalance = avg_bias_prio;
> >  			return busiest;
> >  		}
> >  
> > @@ -2146,18 +2168,19 @@ find_busiest_group(struct sched_domain *
> >  		pwr_move /= SCHED_LOAD_SCALE;
> >  
> >  		/* Move if we gain throughput */
> > -		if (pwr_move <= pwr_now)
> > -			goto out_balanced;
> > +		if (pwr_move <= pwr_now) {
> > +			/* or if there's a reasonable chance that *imbalance
> > +			 * is big enough to cause a move
> > +			 */
> > +			if (*imbalance <= avg_bias_prio / 2)
> > +				goto out_balanced;
> > +			else
> > +				return busiest;
> > +		}
> >  
> > -		*imbalance = NICE_TO_BIAS_PRIO(0);
> > -		return busiest;
> > +		*imbalance = avg_bias_prio;
> >  	}
> >  
> > -	/*
> > -	 * Get rid of the scaling factor, rounding down as we divide and
> > -	 * converting to biased load for use by move_tasks()
> > -	 */
> > -	*imbalance = biased_load(*imbalance);
> >  	return busiest;
> >  
> >  out_balanced:
> 
> Attached is an alternative patch for this problem that adds less 
> overhead.  It takes advantage of the fact that biased_load(max_load) for 
> busiest group is APPROXIMATELY equal to the sum of the prio_biases for 
> the run queues in the group.  This has a number of advantages:
> 
> 1. less work is done in the do loop
> 2. there is no need to convert *imbalance to biased load before 
> comparison to avg_bias_prio as they are in the same units.
> 
> There is one possible disadvantage and that is that the approximation is 
> sufficiently bad to break the functionality.
> 
> In summary, if this version works it should be used in preference to v2 
> as it involves less overhead.
> 
> Peter
> -- 
> Peter Williams                                   pwil3058@bigpond.net.au
> 
> "Learning, n. The kind of ignorance distinguishing the studious."
>   -- Ambrose Bierce

> Index: MM-2.6.X/kernel/sched.c
> ===================================================================
> --- MM-2.6.X.orig/kernel/sched.c	2006-02-13 10:23:12.000000000 +1100
> +++ MM-2.6.X/kernel/sched.c	2006-02-14 09:47:17.000000000 +1100
> @@ -2033,9 +2033,11 @@ find_busiest_group(struct sched_domain *
>  	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
>  	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
>  	unsigned long max_pull;
> +	unsigned long avg_load_per_task, busiest_nr_running;
>  	int load_idx;
>  
>  	max_load = this_load = total_load = total_pwr = 0;
> +	busiest_nr_running = 0;
>  	if (idle == NOT_IDLE)
>  		load_idx = sd->busy_idx;
>  	else if (idle == NEWLY_IDLE)
> @@ -2047,11 +2049,12 @@ find_busiest_group(struct sched_domain *
>  		unsigned long load;
>  		int local_group;
>  		int i;
> +		unsigned long sum_nr_running;
>  
>  		local_group = cpu_isset(this_cpu, group->cpumask);
>  
>  		/* Tally up the load of all CPUs in the group */
> -		avg_load = 0;
> +		sum_nr_running = avg_load = 0;
>  
>  		for_each_cpu_mask(i, group->cpumask) {
>  			if (*sd_idle && !idle_cpu(i))
> @@ -2064,6 +2067,7 @@ find_busiest_group(struct sched_domain *
>  				load = source_load(i, load_idx);
>  
>  			avg_load += load;
> +			sum_nr_running += cpu_rq(i)->nr_running;
>  		}
>  
>  		total_load += avg_load;
> @@ -2078,6 +2082,7 @@ find_busiest_group(struct sched_domain *
>  		} else if (avg_load > max_load) {
>  			max_load = avg_load;
>  			busiest = group;
> +			busiest_nr_running = sum_nr_running;
>  		}
>  		group = group->next;
>  	} while (group != sd->groups);
> @@ -2111,12 +2116,20 @@ find_busiest_group(struct sched_domain *
>  				(avg_load - this_load) * this->cpu_power)
>  			/ SCHED_LOAD_SCALE;
>  
> -	if (*imbalance < SCHED_LOAD_SCALE) {
> +	/* Don't assume that busiest_nr_running > 0 */
> +	avg_load_per_task = busiest_nr_running ? max_load / busiest_nr_running : avg_load;
> +	/*
> +	 * if *imbalance is less than the average load per runnable task
> +	 * there is no guarantee that any tasks will be moved so we'll have
> +	 * a think about bumping its value to force at least one task to be
> +	 * moved
> +	 */
> +	if (*imbalance < avg_load_per_task) {
>  		unsigned long pwr_now = 0, pwr_move = 0;
>  		unsigned long tmp;
>  
> -		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
> -			*imbalance = NICE_TO_BIAS_PRIO(0);
> +		if (max_load - this_load >= avg_load_per_task*2) {
> +			*imbalance = biased_load(avg_load_per_task);
>  			return busiest;
>  		}
>  
> @@ -2146,11 +2159,14 @@ find_busiest_group(struct sched_domain *
>  		pwr_move /= SCHED_LOAD_SCALE;
>  
>  		/* Move if we gain throughput */
> -		if (pwr_move <= pwr_now)
> -			goto out_balanced;
> -
> -		*imbalance = NICE_TO_BIAS_PRIO(0);
> -		return busiest;
> +		if (pwr_move <= pwr_now) {
> +			/* or if there's a reasonable chance that *imbalance
> +			 * is big enough to cause a move
> +			 */
> +			if (*imbalance <= avg_load_per_task / 2)
> +				goto out_balanced;
> +		} else
> +			*imbalance = avg_load_per_task;
>  	}
>  
>  	/*


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [rfc][patch] sched: remove smpnice
  2006-02-11  1:27             ` Peter Williams
                                 ` (2 preceding siblings ...)
  2006-02-11  4:04               ` Peter Williams
@ 2006-02-14  9:07               ` Siddha, Suresh B
  2006-02-14 22:40                 ` Peter Williams
  3 siblings, 1 reply; 33+ messages in thread
From: Siddha, Suresh B @ 2006-02-14  9:07 UTC (permalink / raw)
  To: Peter Williams
  Cc: Andrew Morton, Siddha, Suresh B, kernel, npiggin, mingo, rostedt,
	linux-kernel, torvalds

On Fri, Feb 10, 2006 at 05:27:06PM -0800, Peter Williams wrote:
>> "Siddha, Suresh B" <suresh.b.siddha@intel.com> wrote:
>>>My testing showed that 178.galgel in SPECfp2000 is down by ~10% when run
>>>with nice -20 on a 4P(8-way with HT) system compared to a nice-0 run.
>
>Is it normal to run enough -20 tasks to cause this problem to manifest?

On a 4P(8-way with HT), if you run a -20 task(a simple infinite loop)
it hops from one processor to another processor... you can observe it
using top.

find_busiest_group() thinks there is an imbalance and ultimately the
idle cpu kicks active load balance on busy cpu, resulting in the hopping.

>>>
>>>b) On a lightly loaded system, this can result in HT scheduler
>>>optimizations being disabled in presence of low priority tasks... in this
>>>case, they(low priority ones) can end up running on the same package, even
>>>in the presence of other idle packages.. Though this is not as serious as
>>>"a" above...
>>>
>
>I think that this issue comes under the heading of "Result of better 
>nice enforcement" which is the purpose of the patch :-).  I wouldn't 
>call this HT disablement or do I misunderstand the issue.
>
>The only way that I can see load balancing subverting the HT 
>scheduling 
>mechanisms is if (say) there are 2 CPUs with 2 HT channels 
>each and all 
>of the high priority tasks end up sharing the 2 channels of one CPU 
>while all of the low priority tasks share the 2 channels of the other 
>one.  This scenario is far more likely to happen without the smpnice 
>patches than with them.

I agree with you.. But let's take a DP system with HT, now if there are
only two low priority tasks running, ideally we should be running them
on two different packages. With this patch, we may end up running on the
same logical processor.. leave alone running on the same package..
As these are low priority tasks, it might be ok.. But...

thanks,
suresh

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [rfc][patch] sched: remove smpnice
  2006-02-14  9:07               ` Siddha, Suresh B
@ 2006-02-14 22:40                 ` Peter Williams
  2006-02-14 23:44                   ` Paul Jackson
  2006-02-15  7:07                   ` Siddha, Suresh B
  0 siblings, 2 replies; 33+ messages in thread
From: Peter Williams @ 2006-02-14 22:40 UTC (permalink / raw)
  To: Siddha, Suresh B
  Cc: Andrew Morton, kernel, npiggin, mingo, rostedt, linux-kernel, torvalds

Siddha, Suresh B wrote:
> On Fri, Feb 10, 2006 at 05:27:06PM -0800, Peter Williams wrote:
> 
>>>"Siddha, Suresh B" <suresh.b.siddha@intel.com> wrote:
>>>
>>>>My testing showed that 178.galgel in SPECfp2000 is down by 
>>
>>~10% when run with 
>>
>>>>nice -20 on a 4P(8-way with HT) system compared to a nice-0 run.
>>
>>Is it normal to run enough -20 tasks to cause this problem to manifest?
> 
> 
> On a 4P(8-way with HT), if you run a -20 task(a simple infinite loop)
> it hops from one processor to another processor... you can observe it
> using top.

How do you get top to display which CPU tasks are running on?

> 
> find_busiest_group() thinks there is an imbalance and ultimately the
> idle cpu kicks active load balance on busy cpu, resulting in the hopping.

I'm still having trouble getting my head around this.  A task shouldn't
be moved unless there's at least one other task on its current CPU, it
(the task to be moved) is cache cold, and it is not the currently active
task.  In these circumstances, moving the task to an idle CPU should be
a "good thing" unless the time taken for the move is longer than the
time that will pass before the task becomes the running task on its
current CPU.

For a nice==-20 task to be hopping from CPU to CPU there must be higher
(or equal) priority tasks running on each of the CPUs that it gets
booted off, at the time it gets booted off.
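
For concreteness, a condensed sketch of that test, with invented types
and an assumed migration cost; this is not the kernel's actual
can_migrate_task() logic:

/* toy_pull.c: the conditions described above, as a self-contained toy.
 * All types, names and costs here are invented for illustration.
 */
#include <stdbool.h>

struct toy_task {
	bool cache_hot;		/* still warm in the source CPU's cache? */
	bool running;		/* currently executing on the source CPU? */
};

struct toy_cpu {
	unsigned int nr_running;
	const struct toy_task *curr;
};

/* Assumed, made-up cost of a cross-CPU move, in nanoseconds. */
static const long migration_cost_ns = 100000;

bool worth_pulling(const struct toy_task *p, const struct toy_cpu *src,
		   long expected_wait_ns)
{
	if (src->nr_running < 2)		/* p is alone: nothing to gain */
		return false;
	if (p == src->curr || p->running)	/* can't pull the running task */
		return false;
	if (p->cache_hot)			/* would discard its cache footprint */
		return false;
	/* Only a win if the move costs less than waiting for the CPU here. */
	return migration_cost_ns < expected_wait_ns;
}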

> 
> 
>>>> b) On a lightly loaded system, this can result in HT scheduler
>>>> optimizations being disabled in the presence of low priority
>>>> tasks... in this case, they (the low priority ones) can end up
>>>> running on the same package, even in the presence of other idle
>>>> packages...  Though this is not as serious as "a" above...
>>
>> I think that this issue comes under the heading of "Result of better
>> nice enforcement" which is the purpose of the patch :-).  I wouldn't
>> call this HT disablement, or do I misunderstand the issue?
>>
>> The only way that I can see load balancing subverting the HT
>> scheduling mechanisms is if (say) there are 2 CPUs with 2 HT channels
>> each and all of the high priority tasks end up sharing the 2 channels
>> of one CPU while all of the low priority tasks share the 2 channels
>> of the other one.  This scenario is far more likely to happen without
>> the smpnice patches than with them.
> 
> 
> I agree with you.  But let's take a DP system with HT: if there are
> only two low priority tasks running, ideally we should be running them
> on two different packages.  With this patch, we may end up running them
> on the same logical processor, let alone on the same package.  As these
> are low priority tasks it might be OK, but...

Yes, this is an issue but it's not restricted to HT systems (except for 
the same package part).  The latest patch that I've posted addresses 
(part of) this problem by replacing SCHED_LOAD_SCALE with the average 
load per runnable task in the code at the end of find_busiest_group() 
which handles the case where imbalance is small.  This should enable 
load balancing to occur even if all runnable tasks are low priority.
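
In outline, the check amounts to something like the sketch below; the
names are invented and it condenses, rather than reproduces, the hunk
quoted earlier in the thread:

/* avg_imbalance.c: sketch of replacing the SCHED_LOAD_SCALE comparison
 * with the average weighted load per runnable task.  Invented names;
 * not the literal patch.
 */
#include <stdbool.h>

/* Decide whether a small imbalance is still worth acting on and, if so,
 * scale it up to one average task's weight so move_tasks() will actually
 * move something. */
bool worth_balancing(unsigned long *imbalance,
		     unsigned long sum_weighted_load,
		     unsigned long nr_running)
{
	unsigned long avg_load_per_task;

	if (!nr_running)
		return false;
	avg_load_per_task = sum_weighted_load / nr_running;

	/* Compare against what a runnable task actually weighs on average
	 * rather than against SCHED_LOAD_SCALE, so a group made up only of
	 * low priority (lightly weighted) tasks can still be balanced. */
	if (*imbalance <= avg_load_per_task / 2)
		return false;

	*imbalance = avg_load_per_task;
	return true;
}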

The issue of the two tasks ending up on the same package isn't (as far 
as I can see) caused by the smpnice load balancing changes.

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce


* Re: [rfc][patch] sched: remove smpnice
  2006-02-14 22:40                 ` Peter Williams
@ 2006-02-14 23:44                   ` Paul Jackson
  2006-02-15  0:09                     ` Peter Williams
  2006-02-15  7:07                   ` Siddha, Suresh B
  1 sibling, 1 reply; 33+ messages in thread
From: Paul Jackson @ 2006-02-14 23:44 UTC (permalink / raw)
  To: Peter Williams
  Cc: suresh.b.siddha, akpm, kernel, npiggin, mingo, rostedt,
	linux-kernel, torvalds

Peter wrote:
> In these circumstances, moving the task 
> to an idle CPU should be a "good thing" unless the time taken for the 
> move is longer than the time that will pass before the task becomes the 
> running task on its current CPU.

Even then, it's not always a "good thing".

The less of the cache-memory hierarchy the two CPUs share, the greater
the penalty to the task for memory accesses after the move.

At one extreme, two hyperthreads on the same core share essentially all
the memory hierarchy, so have no such penalty.

At the other extreme, two CPUs at opposite ends of a big NUMA box have,
so far as performance is concerned, quite different views of the memory
hierarchy.  A task moved to a far away CPU will be cache cold for
several layers of core, package, board, and perhaps router hierarchy,
and have slower access to its main memory pages.
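
A toy illustration of the same point, with made-up penalty figures
standing in for real hardware numbers:

/* toy_hierarchy.c: the further apart two CPUs sit in the cache/memory
 * hierarchy, the more a migrated task pays afterwards.  The levels and
 * the per-miss penalties below are invented for illustration only.
 */
#include <stdio.h>

int main(void)
{
	static const struct {
		const char *between;
		unsigned extra_ns_per_miss;	/* assumed, not measured */
	} move[] = {
		{ "HT siblings on one core",		0 },
		{ "cores in the same package",		40 },
		{ "packages on the same board",		150 },
		{ "nodes several router hops apart",	600 },
	};
	unsigned i;

	for (i = 0; i < sizeof(move) / sizeof(move[0]); i++)
		printf("%-35s ~%u ns extra per cache miss after the move\n",
		       move[i].between, move[i].extra_ns_per_miss);
	return 0;
}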


-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401


* Re: [rfc][patch] sched: remove smpnice
  2006-02-14 23:44                   ` Paul Jackson
@ 2006-02-15  0:09                     ` Peter Williams
  2006-02-15  1:00                       ` Paul Jackson
  0 siblings, 1 reply; 33+ messages in thread
From: Peter Williams @ 2006-02-15  0:09 UTC (permalink / raw)
  To: Paul Jackson
  Cc: suresh.b.siddha, akpm, kernel, npiggin, mingo, rostedt,
	linux-kernel, torvalds

Paul Jackson wrote:
> Peter wrote:
> 
>>In these circumstances, moving the task 
>>to an idle CPU should be a "good thing" unless the time taken for the 
>>move is longer than the time that will pass before the task becomes the 
>>running task on its current CPU.
> 
> 
> Even then, it's not always a "good thing".
> 
> The less of the cache-memory hierarchy the two CPUs share, the greater
> the penalty to the task for memory accesses after the move.
> 
> At one extreme, two hyperthreads on the same core share essentially all
> the memory hierarchy, so have no such penalty.
> 
> At the other extreme, two CPUs at opposite ends of a big NUMA box have,
> so far as performance is concerned, quite different views of the memory
> hierarchy.  A task moved to a far away CPU will be cache cold for
> several layers of core, package, board, and perhaps router hierarchy,
> and have slower access to its main memory pages.

This will complicate things IF we end up having to introduce an "is it 
worth moving this particular task" test to move_tasks() in addition to 
the "cache hot" test.  E.g. "will it take longer for this task to do 
something useful on its new CPU than if we leave it here?" would 
obviously have to take into account any delay in accessing memory as a 
result of the move.  Hopefully it won't come to that :-).

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce


* Re: [rfc][patch] sched: remove smpnice
  2006-02-15  0:09                     ` Peter Williams
@ 2006-02-15  1:00                       ` Paul Jackson
  0 siblings, 0 replies; 33+ messages in thread
From: Paul Jackson @ 2006-02-15  1:00 UTC (permalink / raw)
  To: Peter Williams
  Cc: suresh.b.siddha, akpm, kernel, npiggin, mingo, rostedt,
	linux-kernel, torvalds

> Hopefully it won't come to that :-).

Agreed.  Sched_setaffinity and cpusets can help here,
by restricting the choices available to the scheduler.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401


* Re: [rfc][patch] sched: remove smpnice
  2006-02-14 22:40                 ` Peter Williams
  2006-02-14 23:44                   ` Paul Jackson
@ 2006-02-15  7:07                   ` Siddha, Suresh B
  2006-02-15 22:36                     ` Peter Williams
  1 sibling, 1 reply; 33+ messages in thread
From: Siddha, Suresh B @ 2006-02-15  7:07 UTC (permalink / raw)
  To: Peter Williams
  Cc: Siddha, Suresh B, Andrew Morton, kernel, npiggin, mingo, rostedt,
	linux-kernel, torvalds

On Wed, Feb 15, 2006 at 09:40:32AM +1100, Peter Williams wrote:
> Siddha, Suresh B wrote:
> > On a 4P(8-way with HT), if you run a -20 task(a simple infinite loop)
> > it hops from one processor to another processor... you can observe it
> > using top.
> 
> How do you get top to display which CPU tasks are running on?

In the interactive mode, you can select the "last used cpu" field to
display, or you can use /proc/pid/stat.
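
For the /proc route, a small self-contained example; per proc(5) the
"processor" (last-run CPU) value is field 39, and the comm field has to
be skipped explicitly because it may contain spaces:

/* lastcpu.c: print the CPU a task last ran on, read from /proc/<pid>/stat.
 * Usage: ./lastcpu <pid>
 */
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	char path[64], buf[1024];
	FILE *f;
	char *p;
	int field, cpu = -1;

	if (argc != 2)
		return 1;
	snprintf(path, sizeof(path), "/proc/%s/stat", argv[1]);
	f = fopen(path, "r");
	if (!f || !fgets(buf, sizeof(buf), f))
		return 1;
	fclose(f);

	/* Skip past the comm field (field 2), which may contain spaces. */
	p = strrchr(buf, ')');
	if (!p)
		return 1;
	/* Walk forward to field 39, the "processor" field in proc(5). */
	for (field = 2; p && field < 39; field++)
		p = strchr(p + 1, ' ');
	if (p)
		sscanf(p, " %d", &cpu);
	printf("last ran on CPU %d\n", cpu);
	return 0;
}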

> > 
> > find_busiest_group() thinks there is an imbalance and ultimately the
> > idle cpu kicks active load balance on busy cpu, resulting in the hopping.
> 
> I'm still having trouble getting my head around this.  A task shouldn't 
> be moved unless there's at least one other task on its current CPU, it 

Because of the high priority task, the weighted load of that cpu
will be > SCHED_LOAD_SCALE.  Because of this, an idle cpu in
find_busiest_group() thinks that there is an imbalance.  This is due to
the code near the comment "however we may be able to increase
total CPU power used by ...".  That piece of code assumes that a unit load
is represented by SCHED_LOAD_SCALE (which is no longer true with smpnice)
and finally results in "pwr_move > pwr_now".  This will make the idle cpu
try to pull that process from the busiest cpu, and the process will
ultimately move with the help of active load balance...
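
A simplified, from-memory paraphrase of that arithmetic, reduced to one
busy and one idle CPU of equal cpu_power (not the verbatim kernel code),
shows how a single task whose weighted load exceeds SCHED_LOAD_SCALE
flips the comparison:

/* pwr_toy.c: why one "heavy" task makes pwr_move come out above pwr_now
 * when the code treats SCHED_LOAD_SCALE as one task's worth of load.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL
#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	unsigned long power = SCHED_LOAD_SCALE;		/* both CPUs, equal power */
	unsigned long max_load = 2 * SCHED_LOAD_SCALE;	/* one weighted -20 task */
	unsigned long this_load = 0;			/* the idle CPU */
	unsigned long pwr_now, pwr_move, tmp;

	pwr_now  = power * MIN(SCHED_LOAD_SCALE, max_load);
	pwr_now += power * MIN(SCHED_LOAD_SCALE, this_load);
	pwr_now /= SCHED_LOAD_SCALE;

	/* one SCHED_LOAD_SCALE worth of load shifted from busy to idle */
	tmp = SCHED_LOAD_SCALE * SCHED_LOAD_SCALE / power;
	pwr_move  = power * MIN(SCHED_LOAD_SCALE, max_load - tmp);
	pwr_move += power * MIN(SCHED_LOAD_SCALE, this_load + tmp);
	pwr_move /= SCHED_LOAD_SCALE;

	printf("pwr_now=%lu pwr_move=%lu -> %s\n", pwr_now, pwr_move,
	       pwr_move > pwr_now ? "idle CPU decides to pull the task"
				  : "looks balanced");
	return 0;
}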

> > I agree with you.  But let's take a DP system with HT: if there are
> > only two low priority tasks running, ideally we should be running them
> > on two different packages.  With this patch, we may end up running them
> > on the same logical processor, let alone on the same package.  As these
> > are low priority tasks it might be OK, but...
> 
> Yes, this is an issue but it's not restricted to HT systems (except for 

Agreed.

> the same package part).  The latest patch that I've posted addresses 
> (part of) this problem by replacing SCHED_LOAD_SCALE with the average 
> load per runnable task in the code at the end of find_busiest_group() 
> which handles the case where imbalance is small.  This should enable 
> load balancing to occur even if all runnable tasks are low priority.

Yes. We need to fix the code I mentioned above too.... And please make sure
it doesn't break HT optimizations as this piece of code is mainly used
for implementing HT optimizations..

thanks,
suresh


* Re: [rfc][patch] sched: remove smpnice
  2006-02-15  7:07                   ` Siddha, Suresh B
@ 2006-02-15 22:36                     ` Peter Williams
  2006-02-15 23:29                       ` Peter Williams
  0 siblings, 1 reply; 33+ messages in thread
From: Peter Williams @ 2006-02-15 22:36 UTC (permalink / raw)
  To: Siddha, Suresh B
  Cc: Andrew Morton, kernel, npiggin, mingo, rostedt, linux-kernel, torvalds

Siddha, Suresh B wrote:
> On Wed, Feb 15, 2006 at 09:40:32AM +1100, Peter Williams wrote:
> 
>>Siddha, Suresh B wrote:
>>
>>>On a 4P(8-way with HT), if you run a -20 task(a simple infinite loop)
>>>it hops from one processor to another processor... you can observe it
>>>using top.
>>
>>How do you get top to display which CPU tasks are running on?
> 
> 
> In the interactive mode, you can select the "last used cpu" field to display.
> or you can use /proc/pid/stat
> 
> 
>>>find_busiest_group() thinks there is an imbalance and ultimately the
>>>idle cpu kicks active load balance on busy cpu, resulting in the hopping.
>>
>>I'm still having trouble getting my head around this.  A task shouldn't 
>>be moved unless there's at least one other task on its current CPU, it 
> 
> 
> Because of the high priority task, the weighted load of that cpu
> will be > SCHED_LOAD_SCALE.  Because of this, an idle cpu in
> find_busiest_group() thinks that there is an imbalance.  This is due to
> the code near the comment "however we may be able to increase
> total CPU power used by ...".  That piece of code assumes that a unit load
> is represented by SCHED_LOAD_SCALE (which is no longer true with smpnice)
> and finally results in "pwr_move > pwr_now".  This will make the idle cpu
> try to pull that process from the busiest cpu, and the process will
> ultimately move with the help of active load balance...

But as I pointed out, with the code as it is in the recent -mm kernels, 
the amount of biased load (i.e. NICE_TO_BIAS_PRIO(0)) that 
find_busiest_group() sets *imbalance to in these circumstances is too 
small for a task with nice less than zero to be moved i.e. move_tasks() 
will skip it.  Or are you just referring to the vanilla kernels?

> 
> 
>>> I agree with you.  But let's take a DP system with HT: if there are
>>> only two low priority tasks running, ideally we should be running them
>>> on two different packages.  With this patch, we may end up running them
>>> on the same logical processor, let alone on the same package.  As these
>>> are low priority tasks it might be OK, but...
>>
>>Yes, this is an issue but it's not restricted to HT systems (except for 
> 
> 
> Agreed.
> 
> 
>>the same package part).  The latest patch that I've posted addresses 
>>(part of) this problem by replacing SCHED_LOAD_SCALE with the average 
>>load per runnable task in the code at the end of find_busiest_group() 
>>which handles the case where imbalance is small.  This should enable 
>>load balancing to occur even if all runnable tasks are low priority.
> 
> 
> Yes. We need to fix the code I mentioned above too.... And please make sure
> it doesn't break HT optimizations as this piece of code is mainly used
> for implementing HT optimizations..

OK, but I wouldn't call them optimizations (to say something is optimal 
is a big call :-)) as the HT code is more about enforcing "nice" (at the 
expense of throughput) than it is anything else.  By the way, I think 
that one avenue that should be considered for HT systems is to have a 
single run queue for each package and feed the logical CPUs in the 
package from that.  This would greatly improve the likelihood that the 
tasks selected to run on the logical CPUs within the package are of 
approximately the same priority and, consequently, reduce the need to 
idle one of them.  The downside of this is the extra complexity that it 
would introduce.

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce


* Re: [rfc][patch] sched: remove smpnice
  2006-02-15 22:36                     ` Peter Williams
@ 2006-02-15 23:29                       ` Peter Williams
  0 siblings, 0 replies; 33+ messages in thread
From: Peter Williams @ 2006-02-15 23:29 UTC (permalink / raw)
  To: Peter Williams
  Cc: Siddha, Suresh B, Andrew Morton, kernel, npiggin, mingo, rostedt,
	linux-kernel, torvalds

Peter Williams wrote:
> Siddha, Suresh B wrote:
> 
>> On Wed, Feb 15, 2006 at 09:40:32AM +1100, Peter Williams wrote:
>>
>>> Siddha, Suresh B wrote:
>>>
>>>> On a 4P(8-way with HT), if you run a -20 task(a simple infinite loop)
>>>> it hops from one processor to another processor... you can observe it
>>>> using top.
>>>
>>>
>>> How do you get top to display which CPU tasks are running on?
>>
>>
>>
>> In the interactive mode, you can select the "last used cpu" field to 
>> display.

Thanks, this works, and I'm now convinced that there is hopping
occurring, but I disagree on the cause (see below).

>> or you can use /proc/pid/stat
>>
>>
>>>> find_busiest_group() thinks there is an imbalance and ultimately the
>>>> idle cpu kicks active load balance on busy cpu, resulting in the 
>>>> hopping.
>>>
>>>
>>> I'm still having trouble getting my head around this.  A task 
>>> shouldn't be moved unless there's at least one other task on its 
>>> current CPU, it 
>>
>>
>>
>> Because of the high priority task, the weighted load of that cpu
>> will be > SCHED_LOAD_SCALE.  Because of this, an idle cpu in
>> find_busiest_group() thinks that there is an imbalance.  This is due to
>> the code near the comment "however we may be able to increase
>> total CPU power used by ...".  That piece of code assumes that a unit load
>> is represented by SCHED_LOAD_SCALE (which is no longer true with smpnice)
>> and finally results in "pwr_move > pwr_now".  This will make the idle cpu
>> try to pull that process from the busiest cpu, and the process will
>> ultimately move with the help of active load balance...
> 
> 
> But as I pointed out, with the code as it is in the recent -mm kernels, 
> the amount of biased load (i.e. NICE_TO_BIAS_PRIO(0)) that 
> find_busiest_group() sets *imbalance to in these circumstances is too 
> small for a task with nice less than zero to be moved i.e. move_tasks() 
> will skip it.  Or are you just referring to the vanilla kernels?

The upshot of this is that the code near "however we may" etc. never 
gets executed.  So what must be happening is that *imbalance is greater 
than SCHED_LOAD_SCALE so it goes out unchanged (as the fact that the 
problem occurs with hard spinners lets try_to_wake_up() off the hook). 
But, as I also said in another e-mail, there must be another task with 
equal or higher priority than the task (that's doing the hopping) on its 
CPU for it to be booted off.  I would surmise that the prime candidate 
here would be the migration thread for the CPU in question as it's a 
real time task and seems to run fairly frequently.

The obvious way to handle this is to NOT move tasks from a run queue 
where the migration thread is running unless the number of running tasks 
on that CPU is greater than 2.  I'll try to come up with a patch that 
does this without causing too much complexity.
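
Roughly, and with invented names, the proposed rule would look like:

/* toy_donor.c: sketch of the proposed rule only; types and names are
 * invented for illustration, not taken from the kernel.
 */
#include <stdbool.h>

struct toy_rq {
	unsigned long nr_running;
	bool migration_thread_running;	/* per-CPU migration thread runnable? */
};

/* Don't treat a runqueue as having surplus tasks if the "extra" runnable
 * task is just its own migration thread. */
bool ok_to_pull_from(const struct toy_rq *src)
{
	if (src->migration_thread_running)
		return src->nr_running > 2;
	return src->nr_running > 1;
}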

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce


end of thread

Thread overview: 33+ messages
2006-02-07 14:28 [rfc][patch] sched: remove smpnice Nick Piggin
2006-02-07 14:57 ` Con Kolivas
2006-02-07 15:05   ` Nick Piggin
2006-02-07 22:15   ` Andrew Morton
2006-02-07 23:11     ` Con Kolivas
2006-02-07 23:36       ` Andrew Morton
2006-02-08  3:28         ` Nick Piggin
2006-02-08 14:56         ` Ingo Molnar
2006-02-10  7:01         ` Siddha, Suresh B
2006-02-10  7:17           ` Andrew Morton
2006-02-10  7:23             ` Con Kolivas
2006-02-10  9:06             ` Ingo Molnar
2006-02-11  1:27             ` Peter Williams
2006-02-11  2:00               ` Andrew Morton
2006-02-12  1:13                 ` Peter Williams
2006-02-12 23:10                   ` Peter Williams
2006-02-13  1:06                     ` Peter Williams
2006-02-14  0:37                       ` Peter Williams
2006-02-14  8:53                         ` Siddha, Suresh B
2006-02-11  3:36               ` Peter Williams
2006-02-11  4:04               ` Peter Williams
2006-02-14  9:07               ` Siddha, Suresh B
2006-02-14 22:40                 ` Peter Williams
2006-02-14 23:44                   ` Paul Jackson
2006-02-15  0:09                     ` Peter Williams
2006-02-15  1:00                       ` Paul Jackson
2006-02-15  7:07                   ` Siddha, Suresh B
2006-02-15 22:36                     ` Peter Williams
2006-02-15 23:29                       ` Peter Williams
2006-02-13 14:12           ` Con Kolivas
2006-02-07 23:20     ` Peter Williams
2006-02-07 23:29       ` Con Kolivas
2006-02-07 23:36       ` Martin Bligh
