linux-kernel.vger.kernel.org archive mirror
* [PATCH] implement nice support across physical cpus on SMP
@ 2005-05-07 13:42 Con Kolivas
  2005-05-07 17:59 ` Carlos Carvalho
  2005-05-09 11:24 ` Markus Törnqvist
  0 siblings, 2 replies; 21+ messages in thread
From: Con Kolivas @ 2005-05-07 13:42 UTC (permalink / raw)
  To: linux kernel mailing list; +Cc: ck, Ingo Molnar, Andrew Morton, Carlos Carvalho


[-- Attachment #1.1: Type: text/plain, Size: 350 bytes --]

SMP balancing is currently designed purely with throughput in mind. This 
working patch implements a mechanism for supporting 'nice' across physical 
cpus without impacting throughput.

This is a version for stable kernel 2.6.11.*

Carlos, if you could test this with your test case it would be appreciated.

Ingo, comments?

Cheers,
Con

[-- Attachment #1.2: cross_cpu_smp_nice_support.diff --]
[-- Type: text/x-diff, Size: 7775 bytes --]

This patch implements 'nice' support across physical cpus on SMP.

It introduces an extra runqueue variable prio_bias which is the sum of the
(inverted) static priorities of all the tasks on the runqueue. This is then used
to bias busy rebalancing between runqueues to obtain good distribution of tasks
of different nice values. By biasing the balancing only during busy rebalancing
we can avoid having any significant loss of throughput by not affecting the
carefully tuned idle balancing already in place. If all tasks are running at the
same nice level this code should also have minimal effect. The code is optimised
out in the !CONFIG_SMP case.
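
As a purely illustrative aside (not part of the patch), the weight each task
contributes to prio_bias follows from the usual 2.6 priority macros, assuming
MAX_PRIO = 140 and NICE_TO_PRIO(nice) = 120 + nice. A tiny user-space sketch of
that mapping:

#include <stdio.h>

/* Assumed copies of the 2.6 priority macros, for illustration only. */
#define MAX_RT_PRIO		100
#define MAX_PRIO		(MAX_RT_PRIO + 40)		/* 140 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)	/* 120 + nice */

int main(void)
{
	int levels[] = { -20, -10, 0, 10, 19 };
	int i;

	/* prio_bias accumulates (MAX_PRIO - static_prio) per runnable task. */
	for (i = 0; i < 5; i++)
		printf("nice %3d -> static_prio %3d -> bias weight %2d\n",
		       levels[i], NICE_TO_PRIO(levels[i]),
		       MAX_PRIO - NICE_TO_PRIO(levels[i]));
	return 0;
}

So a nice 0 task adds 20 to its runqueue's prio_bias, a nice 19 task adds only
1, and a nice -20 task adds 40; a runqueue full of heavily niced tasks therefore
looks much "lighter" to the busy rebalancer.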

Signed-off-by: Con Kolivas <kernel@kolivas.org>

Index: linux-2.6.11-smpnice/kernel/sched.c
===================================================================
--- linux-2.6.11-smpnice.orig/kernel/sched.c	2005-03-02 19:30:30.000000000 +1100
+++ linux-2.6.11-smpnice/kernel/sched.c	2005-05-07 23:25:15.000000000 +1000
@@ -204,6 +204,7 @@ struct runqueue {
 	 */
 	unsigned long nr_running;
 #ifdef CONFIG_SMP
+	unsigned long prio_bias;
 	unsigned long cpu_load;
 #endif
 	unsigned long long nr_switches;
@@ -628,13 +629,45 @@ static int effective_prio(task_t *p)
 	return prio;
 }
 
+#ifdef CONFIG_SMP
+static inline void inc_prio_bias(runqueue_t *rq, int static_prio)
+{
+	rq->prio_bias += MAX_PRIO - static_prio;
+}
+
+static inline void dec_prio_bias(runqueue_t *rq, int static_prio)
+{
+	rq->prio_bias -= MAX_PRIO - static_prio;
+}
+#else
+static inline void inc_prio_bias(runqueue_t *rq, int static_prio)
+{
+}
+
+static inline void dec_prio_bias(runqueue_t *rq, int static_prio)
+{
+}
+#endif
+
+static inline void inc_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running++;
+	inc_prio_bias(rq, p->static_prio);
+}
+
+static inline void dec_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running--;
+	dec_prio_bias(rq, p->static_prio);
+}
+
 /*
  * __activate_task - move a task to the runqueue.
  */
 static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task(p, rq->active);
-	rq->nr_running++;
+	inc_nr_running(p, rq);
 }
 
 /*
@@ -643,7 +676,7 @@ static inline void __activate_task(task_
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task_head(p, rq->active);
-	rq->nr_running++;
+	inc_nr_running(p, rq);
 }
 
 static void recalc_task_prio(task_t *p, unsigned long long now)
@@ -761,7 +794,7 @@ static void activate_task(task_t *p, run
  */
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	rq->nr_running--;
+	dec_nr_running(p, rq);
 	dequeue_task(p, p->array);
 	p->array = NULL;
 }
@@ -909,23 +942,37 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static inline unsigned long source_load(int cpu)
+static inline unsigned long source_load(int cpu, enum idle_type idle)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+	unsigned long cpu_load = rq->cpu_load,
+		load_now = rq->nr_running * SCHED_LOAD_SCALE;
 
-	return min(rq->cpu_load, load_now);
+	if (idle == NOT_IDLE) {
+		/*
+		 * If we are balancing busy runqueues the load is biased by
+		 * priority to create 'nice' support across cpus.
+		 */
+		cpu_load += rq->prio_bias;
+		load_now += rq->prio_bias;
+	}
+	return min(cpu_load, load_now);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu
  */
-static inline unsigned long target_load(int cpu)
+static inline unsigned long target_load(int cpu, enum idle_type idle)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+	unsigned long cpu_load = rq->cpu_load,
+		load_now = rq->nr_running * SCHED_LOAD_SCALE;
 
-	return max(rq->cpu_load, load_now);
+	if (idle == NOT_IDLE) {
+		cpu_load += rq->prio_bias;
+		load_now += rq->prio_bias;
+	}
+	return max(cpu_load, load_now);
 }
 
 #endif
@@ -1015,8 +1062,8 @@ static int try_to_wake_up(task_t * p, un
 	if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
 		goto out_set_cpu;
 
-	load = source_load(cpu);
-	this_load = target_load(this_cpu);
+	load = source_load(cpu, SCHED_IDLE);
+	this_load = target_load(this_cpu, SCHED_IDLE);
 
 	/*
 	 * If sync wakeup then subtract the (maximum possible) effect of
@@ -1240,7 +1287,7 @@ void fastcall wake_up_new_task(task_t * 
 				list_add_tail(&p->run_list, &current->run_list);
 				p->array = current->array;
 				p->array->nr_active++;
-				rq->nr_running++;
+				inc_nr_running(p, rq);
 			}
 			set_need_resched();
 		} else
@@ -1524,7 +1571,7 @@ static int find_idlest_cpu(struct task_s
 	cpus_and(mask, sd->span, p->cpus_allowed);
 
 	for_each_cpu_mask(i, mask) {
-		load = target_load(i);
+		load = target_load(i, SCHED_IDLE);
 
 		if (load < min_load) {
 			min_cpu = i;
@@ -1537,7 +1584,7 @@ static int find_idlest_cpu(struct task_s
 	}
 
 	/* add +1 to account for the new task */
-	this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
+	this_load = source_load(this_cpu, SCHED_IDLE) + SCHED_LOAD_SCALE;
 
 	/*
 	 * Would with the addition of the new task to the
@@ -1630,9 +1677,9 @@ void pull_task(runqueue_t *src_rq, prio_
 	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
 	dequeue_task(p, src_array);
-	src_rq->nr_running--;
+	dec_nr_running(p, src_rq);
 	set_task_cpu(p, this_cpu);
-	this_rq->nr_running++;
+	inc_nr_running(p, this_rq);
 	enqueue_task(p, this_array);
 	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
 				+ this_rq->timestamp_last_tick;
@@ -1790,9 +1837,9 @@ find_busiest_group(struct sched_domain *
 		for_each_cpu_mask(i, group->cpumask) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = target_load(i);
+				load = target_load(i, idle);
 			else
-				load = source_load(i);
+				load = source_load(i, idle);
 
 			nr_cpus++;
 			avg_load += load;
@@ -1904,14 +1951,14 @@ out_balanced:
 /*
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
-static runqueue_t *find_busiest_queue(struct sched_group *group)
+static runqueue_t *find_busiest_queue(struct sched_group *group, enum idle_type idle)
 {
 	unsigned long load, max_load = 0;
 	runqueue_t *busiest = NULL;
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
-		load = source_load(i);
+		load = source_load(i, idle);
 
 		if (load > max_load) {
 			max_load = load;
@@ -1945,7 +1992,7 @@ static int load_balance(int this_cpu, ru
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group);
+	busiest = find_busiest_queue(group, idle);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
@@ -2049,7 +2096,7 @@ static int load_balance_newidle(int this
 		goto out;
 	}
 
-	busiest = find_busiest_queue(group);
+	busiest = find_busiest_queue(group, NEWLY_IDLE);
 	if (!busiest || busiest == this_rq) {
 		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
 		goto out;
@@ -3245,7 +3292,9 @@ void set_user_nice(task_t *p, long nice)
 	 * not SCHED_NORMAL:
 	 */
 	if (rt_task(p)) {
+		dec_prio_bias(rq, p->static_prio);
 		p->static_prio = NICE_TO_PRIO(nice);
+		inc_prio_bias(rq, p->static_prio);
 		goto out_unlock;
 	}
 	array = p->array;
@@ -3255,7 +3304,9 @@ void set_user_nice(task_t *p, long nice)
 	old_prio = p->prio;
 	new_prio = NICE_TO_PRIO(nice);
 	delta = new_prio - old_prio;
+	dec_prio_bias(rq, p->static_prio);
 	p->static_prio = NICE_TO_PRIO(nice);
+	inc_prio_bias(rq, p->static_prio);
 	p->prio += delta;
 
 	if (array) {

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]


* Re: [PATCH] implement nice support across physical cpus on SMP
  2005-05-07 13:42 [PATCH] implement nice support across physical cpus on SMP Con Kolivas
@ 2005-05-07 17:59 ` Carlos Carvalho
  2005-05-07 21:45   ` Con Kolivas
  2005-05-09 11:24 ` Markus Törnqvist
  1 sibling, 1 reply; 21+ messages in thread
From: Carlos Carvalho @ 2005-05-07 17:59 UTC (permalink / raw)
  To: Con Kolivas; +Cc: linux kernel mailing list, ck, Ingo Molnar, Andrew Morton

Con Kolivas (kernel@kolivas.org) wrote on 7 May 2005 23:42:
 >SMP balancing is currently designed purely with throughput in mind. This 
 >working patch implements a mechanism for supporting 'nice' across physical 
 >cpus without impacting throughput.
 >
 >This is a version for stable kernel 2.6.11.*
 >
 >Carlos, if you could test this with your test case it would be appreciated.

Unfortunately it doesn't seem to have any effect:

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
  184 user1    39  19  7220 5924  520 R 99.9  1.1 209:40.68 mi41
  266 user2    25   0  1760  480  420 R 50.5  0.1  86:36.31 xdipole1
  227 user3    25   0  155m  62m  640 R 49.5 12.3  95:07.89 b170-se.x

Note that the nice 19 job monopolizes one processor while the other
two nice 0 ones share a single processor.

This is really a showstopper for this kind of application :-(


* Re: [PATCH] implement nice support across physical cpus on SMP
  2005-05-07 17:59 ` Carlos Carvalho
@ 2005-05-07 21:45   ` Con Kolivas
  0 siblings, 0 replies; 21+ messages in thread
From: Con Kolivas @ 2005-05-07 21:45 UTC (permalink / raw)
  To: Carlos Carvalho; +Cc: linux kernel mailing list, ck, Ingo Molnar, Andrew Morton

[-- Attachment #1: Type: text/plain, Size: 1204 bytes --]

On Sun, 8 May 2005 03:59, Carlos Carvalho wrote:
> Con Kolivas (kernel@kolivas.org) wrote on 7 May 2005 23:42:
>  >SMP balancing is currently designed purely with throughput in mind. This
>  >working patch implements a mechanism for supporting 'nice' across
>  > physical cpus without impacting throughput.
>  >
>  >This is a version for stable kernel 2.6.11.*
>  >
>  >Carlos, if you could test this with your test case it would be
>  > appreciated.
>
> Unfortunately it doesn't seem to have any effect:
>
>   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
>   184 user1    39  19  7220 5924  520 R 99.9  1.1 209:40.68 mi41
>   266 user2    25   0  1760  480  420 R 50.5  0.1  86:36.31 xdipole1
>   227 user3    25   0  155m  62m  640 R 49.5 12.3  95:07.89 b170-se.x
>
> Note that the nice 19 job monopolizes one processor while the other
> two nice 0 ones share a single processor.
>
> This is really a showstopper for this kind of application :-(

Ok, back to the drawing board. I have to try and figure out why it doesn't work 
for your case. I tried it on 4x with lots of cpu-bound tasks, so I'm not sure 
why it doesn't help with yours.

Cheers,
Con

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]


* Re: [PATCH] implement nice support across physical cpus on SMP
  2005-05-07 13:42 [PATCH] implement nice support across physical cpus on SMP Con Kolivas
  2005-05-07 17:59 ` Carlos Carvalho
@ 2005-05-09 11:24 ` Markus Törnqvist
  2005-05-09 11:28   ` [ck] " Markus Törnqvist
  2005-05-09 11:47   ` Con Kolivas
  1 sibling, 2 replies; 21+ messages in thread
From: Markus Törnqvist @ 2005-05-09 11:24 UTC (permalink / raw)
  To: linux-kernel; +Cc: ck, Ingo Molnar, Andrew Morton, Carlos Carvalho

[-- Attachment #1: Type: text/plain, Size: 5081 bytes --]

I beg to differ with Mr. Carvalho's assessment of this patch;
it works like a charm, and then some.

The rest of the message is just my analysis of the situation
run on a Dell PowerEdge 2850, dual hyperthreaded Xeon EM64Ts, with
Debian Pure64 Sarge installed.

Mr. Carvalho, is the program you saw the failure with openly available, or
is it possible for me to have the code so I can try to reproduce this
anyway?

My two cents say this is going in :)

And when replying, anyone, please keep me in the Cc, as I'm not
subscribed.

The rest of this message is just the "raw" data on my experiment.

$ cat load.sh
#!/bin/sh
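# Spawn $1 busy-loop tasks; the last two are started at nice 19.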

if [ $1 ] && [ -n $1 ]; then
  count=$1
else
  count=1
fi

cur=0
while [ $cur -lt $count ]; do
  cur=$[ $cur + 1 ]
  if [ $cur -eq $[ $count-1 ] ] || [ $cur -eq $count ]; then
    nice -n 19 load_base.sh &
  else
    load_base.sh &
  fi
done

$ cat load_base.sh 
#!/bin/sh

while true; do a=1; done


$ ./load.sh 5
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND            
 3918 mjt       34   0  5660 1136  936 R 99.9  0.0   1:34.30 load_base.sh       
 3917 mjt       35   0  5660 1136  936 R 99.5  0.0   1:34.30 load_base.sh       
 3916 mjt       34   0  5660 1136  936 R 59.7  0.0   0:53.16 load_base.sh       
 3919 mjt       39  19  5660 1136  936 R  6.0  0.0   0:05.97 load_base.sh       
 3920 mjt       39  19  5660 1136  936 R  3.0  0.0   0:02.62 load_base.sh

  PID USER      PR  NI  VIRT  SHR S %CPU %MEM    TIME+  #C COMMAND              
 3917 mjt       26   0  5660  936 R 99.9  0.0   3:37.61  0 load_base.sh         
 3918 mjt       25   0  5660  936 R 99.9  0.0   3:37.60  3 load_base.sh         
 3916 mjt       26   0  5660  936 R 52.7  0.0   2:02.37  2 load_base.sh         
 3919 mjt       39  19  5660  936 R  7.0  0.0   0:13.80  1 load_base.sh         
 3920 mjt       39  19  5660  936 R  3.0  0.0   0:06.05  2 load_base.sh

top - 11:09:24 up 15:30,  2 users,  load average: 4.99, 3.55, 1.63
  PID USER      PR  NI  VIRT  SHR S %CPU %MEM    TIME+  #C COMMAND              
 3917 mjt       25   0  5660  936 R 99.6  0.0   6:11.35  0 load_base.sh         
 3918 mjt       24   0  5660  936 R 99.6  0.0   6:11.34  3 load_base.sh         
 3916 mjt       39   0  5660  936 R 65.7  0.0   3:28.95  2 load_base.sh         
 3919 mjt       39  19  5660  936 R  7.0  0.0   0:23.54  1 load_base.sh         
 3920 mjt       39  19  5660  936 R  3.0  0.0   0:10.33  2 load_base.sh

top - 11:10:57 up 15:32,  2 users,  load average: 4.99, 3.94, 1.95
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND            
 3917 mjt       22   0  5660 1136  936 R 99.5  0.0   7:51.62 load_base.sh       
 3918 mjt       21   0  5660 1136  936 R 99.5  0.0   7:51.61 load_base.sh       
 3916 mjt       39   0  5660 1136  936 R 53.7  0.0   4:25.26 load_base.sh       
 3919 mjt       39  19  5660 1136  936 R  7.0  0.0   0:29.92 load_base.sh       
 3920 mjt       39  19  5660 1136  936 R  3.0  0.0   0:13.13 load_base.sh

top - 11:12:32 up 15:33,  2 users,  load average: 4.99, 4.22, 2.24
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND         
 3917 mjt       35   0  5660 1136  936 R 99.9  0.0   9:28.56  0 load_base.sh    
 3918 mjt       34   0  5660 1136  936 R 99.5  0.0   9:28.54  3 load_base.sh    
 3916 mjt       35   0  5660 1136  936 R 61.7  0.0   5:19.77  2 load_base.sh    
 3919 mjt       39  19  5660 1136  936 R  6.0  0.0   0:36.07  1 load_base.sh    
 3920 mjt       39  19  5660 1136  936 R  3.0  0.0   0:15.82  2 load_base.sh

$ ./load.sh 7
top - 11:13:49 up 15:35,  2 users,  load average: 5.17, 4.40, 2.45
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND         
 3952 mjt       29   0  5660 1140  936 R 99.9  0.0   0:33.53  2 load_base.sh    
 3950 mjt       31   0  5660 1140  936 R 99.5  0.0   0:33.33  1 load_base.sh    
 3953 mjt       30   0  5660 1140  936 R 55.7  0.0   0:16.82  3 load_base.sh    
 3951 mjt       39   0  5660 1140  936 R 43.8  0.0   0:16.70  3 load_base.sh    
 3949 mjt       39   0  5660 1140  936 R 23.9  0.0   0:13.18  0 load_base.sh    
 3954 mjt       39  19  5660 1140  936 R  2.0  0.0   0:00.64  0 load_base.sh    
 3955 mjt       39  19  5660 1140  936 R  2.0  0.0   0:00.64  0 load_base.sh    

top - 11:14:53 up 15:36,  2 users,  load average: 6.38, 4.91, 2.76
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND         
 3950 mjt       23   0  5660 1140  936 R 99.9  0.0   1:39.67  1 load_base.sh    
 3952 mjt       21   0  5660 1140  936 R 99.9  0.0   1:39.87  2 load_base.sh    
 3951 mjt       39   0  5660 1140  936 R 52.7  0.0   0:49.91  3 load_base.sh    
 3953 mjt       22   0  5660 1140  936 R 47.8  0.0   0:49.95  3 load_base.sh    
 3949 mjt       39   0  5660 1140  936 R 43.8  0.0   0:38.70  0 load_base.sh    
 3954 mjt       39  19  5660 1140  936 R  2.0  0.0   0:01.90  0 load_base.sh    
 3955 mjt       39  19  5660 1140  936 R  2.0  0.0   0:01.90  0 load_base.sh

-- 
mjt


[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]


* Re: [ck] Re: [PATCH] implement nice support across physical cpus on SMP
  2005-05-09 11:24 ` Markus Törnqvist
@ 2005-05-09 11:28   ` [ck] " Markus Törnqvist
  2005-05-09 11:47   ` Con Kolivas
  1 sibling, 0 replies; 21+ messages in thread
From: Markus Törnqvist @ 2005-05-09 11:28 UTC (permalink / raw)
  To: linux-kernel; +Cc: ck, Andrew Morton, Carlos Carvalho

[-- Attachment #1: Type: text/plain, Size: 213 bytes --]

On Mon, May 09, 2005 at 02:24:46PM +0300, Markus Törnqvist wrote:
>The rest of the message is just my analysis of the situation

Typing faster than thinking syndrome, running late for an exam ;)

-- 
mjt


[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]


* Re: [ck] Re: [PATCH] implement nice support across physical cpus on SMP
  2005-05-09 11:24 ` Markus Törnqvist
  2005-05-09 11:28   ` [ck] " Markus Törnqvist
@ 2005-05-09 11:47   ` Con Kolivas
  2005-05-09 18:55     ` Markus Törnqvist
  2005-05-09 23:54     ` Carlos Carvalho
  1 sibling, 2 replies; 21+ messages in thread
From: Con Kolivas @ 2005-05-09 11:47 UTC (permalink / raw)
  To: ck, Ingo Molnar
  Cc: Markus Törnqvist, linux-kernel, Andrew Morton, Carlos Carvalho

[-- Attachment #1: Type: text/plain, Size: 4218 bytes --]

On Mon, 9 May 2005 21:24, Markus Törnqvist wrote:
> I beg to differ with Mr. Carvalho's assessment of this patch;
> it works like a charm, and then some.
>
> The rest of the message is just my analysis of the situation
> run on a Dell PowerEdge 2850, dual hyperthreaded Xeon EM64Ts, with
> Debian Pure64 Sarge installed.

Thanks for feedback.

>   PID USER      PR  NI  VIRT  SHR S %CPU %MEM    TIME+  #C COMMAND
>  3917 mjt       26   0  5660  936 R 99.9  0.0   3:37.61  0 load_base.sh
>  3918 mjt       25   0  5660  936 R 99.9  0.0   3:37.60  3 load_base.sh
>  3916 mjt       26   0  5660  936 R 52.7  0.0   2:02.37  2 load_base.sh
>  3919 mjt       39  19  5660  936 R  7.0  0.0   0:13.80  1 load_base.sh
>  3920 mjt       39  19  5660  936 R  3.0  0.0   0:06.05  2 load_base.sh
>
> top - 11:09:24 up 15:30,  2 users,  load average: 4.99, 3.55, 1.63
>   PID USER      PR  NI  VIRT  SHR S %CPU %MEM    TIME+  #C COMMAND
>  3917 mjt       25   0  5660  936 R 99.6  0.0   6:11.35  0 load_base.sh
>  3918 mjt       24   0  5660  936 R 99.6  0.0   6:11.34  3 load_base.sh
>  3916 mjt       39   0  5660  936 R 65.7  0.0   3:28.95  2 load_base.sh
>  3919 mjt       39  19  5660  936 R  7.0  0.0   0:23.54  1 load_base.sh
>  3920 mjt       39  19  5660  936 R  3.0  0.0   0:10.33  2 load_base.sh

These runs don't look absolutely "ideal", as one nice 19 task is bound to cpu1; 
however, since you're running hyperthreading, it would seem the SMT nice code 
is keeping that in check anyway (0:23 vs 6:11).

> top - 11:12:32 up 15:33,  2 users,  load average: 4.99, 4.22, 2.24
>   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND
>  3917 mjt       35   0  5660 1136  936 R 99.9  0.0   9:28.56  0 load_base.sh
>  3918 mjt       34   0  5660 1136  936 R 99.5  0.0   9:28.54  3 load_base.sh
>  3916 mjt       35   0  5660 1136  936 R 61.7  0.0   5:19.77  2 load_base.sh
>  3919 mjt       39  19  5660 1136  936 R  6.0  0.0   0:36.07  1 load_base.sh
>  3920 mjt       39  19  5660 1136  936 R  3.0  0.0   0:15.82  2 load_base.sh
>
> $ ./load.sh 7
> top - 11:13:49 up 15:35,  2 users,  load average: 5.17, 4.40, 2.45
>   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND
>  3952 mjt       29   0  5660 1140  936 R 99.9  0.0   0:33.53  2 load_base.sh
>  3950 mjt       31   0  5660 1140  936 R 99.5  0.0   0:33.33  1 load_base.sh
>  3953 mjt       30   0  5660 1140  936 R 55.7  0.0   0:16.82  3 load_base.sh
>  3951 mjt       39   0  5660 1140  936 R 43.8  0.0   0:16.70  3 load_base.sh
>  3949 mjt       39   0  5660 1140  936 R 23.9  0.0   0:13.18  0 load_base.sh
>  3954 mjt       39  19  5660 1140  936 R  2.0  0.0   0:00.64  0 load_base.sh
>  3955 mjt       39  19  5660 1140  936 R  2.0  0.0   0:00.64  0 load_base.sh
>
> top - 11:14:53 up 15:36,  2 users,  load average: 6.38, 4.91, 2.76
>   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND
>  3950 mjt       23   0  5660 1140  936 R 99.9  0.0   1:39.67  1 load_base.sh
>  3952 mjt       21   0  5660 1140  936 R 99.9  0.0   1:39.87  2 load_base.sh
>  3951 mjt       39   0  5660 1140  936 R 52.7  0.0   0:49.91  3 load_base.sh
>  3953 mjt       22   0  5660 1140  936 R 47.8  0.0   0:49.95  3 load_base.sh
>  3949 mjt       39   0  5660 1140  936 R 43.8  0.0   0:38.70  0 load_base.sh
>  3954 mjt       39  19  5660 1140  936 R  2.0  0.0   0:01.90  0 load_base.sh
>  3955 mjt       39  19  5660 1140  936 R  2.0  0.0   0:01.90  0 load_base.sh

These runs pretty much confirm what I found to happen. My test machine for 
this was also 4x. I can't see how the code would behave differently on 2x. 
Perhaps if I make the prio_bias multiplied instead of added to the cpu load 
it will be less affected by SCHED_LOAD_SCALE. The attached patch was 
confirmed during testing to also provide smp distribution according to nice 
on 4x. Carlos, I know your machine is in production so testing may not be 
easy for you. Please try this on top if you have time.
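
To make the SCHED_LOAD_SCALE point concrete, here is a rough back-of-the-envelope
sketch (assuming SCHED_LOAD_SCALE = 128 and the bias weights from the earlier
patch, i.e. 20 per nice 0 task and 1 per nice 19 task); it only illustrates the
orders of magnitude, not the exact balancer arithmetic:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* assumed 2.6 value */

int main(void)
{
	/* Runqueue A: two nice 0 tasks.  Runqueue B: one nice 19 task. */
	unsigned long load_a = 2 * SCHED_LOAD_SCALE, bias_a = 20 + 20;
	unsigned long load_b = 1 * SCHED_LOAD_SCALE, bias_b = 1;

	printf("additive:       A=%lu B=%lu\n", load_a + bias_a, load_b + bias_b);
	printf("multiplicative: A=%lu B=%lu\n", load_a * bias_a, load_b * bias_b);
	return 0;
}

Added in, the bias moves each load by a few dozen at most, which is small next
to the 128 that every extra runnable task contributes; multiplied in, a queue
full of nice 19 tasks looks dramatically lighter than one full of nice 0 tasks.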

Cheers,
Con

---
This patch alters the effect priority bias has on busy rebalancing by 
multiplying the cpu load by the total priority instead of adding it.

Signed-off-by: Con Kolivas <kernel@kolivas.org>

[-- Attachment #2: alter_prio_bias.diff --]
[-- Type: text/x-diff, Size: 912 bytes --]

Index: linux-2.6.11-smpnice/kernel/sched.c
===================================================================
--- linux-2.6.11-smpnice.orig/kernel/sched.c	2005-05-07 23:25:15.000000000 +1000
+++ linux-2.6.11-smpnice/kernel/sched.c	2005-05-09 21:42:02.000000000 +1000
@@ -953,8 +953,8 @@ static inline unsigned long source_load(
 		 * If we are balancing busy runqueues the load is biased by
 		 * priority to create 'nice' support across cpus.
 		 */
-		cpu_load += rq->prio_bias;
-		load_now += rq->prio_bias;
+		cpu_load *= rq->prio_bias;
+		load_now *= rq->prio_bias;
 	}
 	return min(cpu_load, load_now);
 }
@@ -969,8 +969,8 @@ static inline unsigned long target_load(
 		load_now = rq->nr_running * SCHED_LOAD_SCALE;
 
 	if (idle == NOT_IDLE) {
-		cpu_load += rq->prio_bias;
-		load_now += rq->prio_bias;
+		cpu_load *= rq->prio_bias;
+		load_now *= rq->prio_bias;
 	}
 	return max(cpu_load, load_now);
 }


* Re: [ck] Re: [PATCH] implement nice support across physical cpus on SMP
  2005-05-09 11:47   ` Con Kolivas
@ 2005-05-09 18:55     ` Markus Törnqvist
  2005-05-09 23:54     ` Carlos Carvalho
  1 sibling, 0 replies; 21+ messages in thread
From: Markus Törnqvist @ 2005-05-09 18:55 UTC (permalink / raw)
  To: Con Kolivas; +Cc: ck, Ingo Molnar, linux-kernel, Andrew Morton, Carlos Carvalho

[-- Attachment #1: Type: text/plain, Size: 10588 bytes --]

On Mon, May 09, 2005 at 09:47:05PM +1000, Con Kolivas wrote:
>
>Thanks for feedback.

For once I can give something back, it seems; thus it's my pleasure.

>> top - 11:09:24 up 15:30,  2 users,  load average: 4.99, 3.55, 1.63
>>   PID USER      PR  NI  VIRT  SHR S %CPU %MEM    TIME+  #C COMMAND
>>  3917 mjt       25   0  5660  936 R 99.6  0.0   6:11.35  0 load_base.sh
>>  3918 mjt       24   0  5660  936 R 99.6  0.0   6:11.34  3 load_base.sh
>>  3916 mjt       39   0  5660  936 R 65.7  0.0   3:28.95  2 load_base.sh
>>  3919 mjt       39  19  5660  936 R  7.0  0.0   0:23.54  1 load_base.sh
>>  3920 mjt       39  19  5660  936 R  3.0  0.0   0:10.33  2 load_base.sh
>
>These runs don't look absolutely "ideal", as one nice 19 task is bound to cpu1; 
>however, since you're running hyperthreading, it would seem the SMT nice code 
>is keeping that in check anyway (0:23 vs 6:11).

So let no one touch the SMT code as long as it works...

>These runs pretty much confirm what I found to happen. My test machine for 
>this was also 4x. I can't see how the code would behave differently on 2x. 

Would anyone, on whichever list they happen to be subscribed to (or not), care
to replicate these results on 2x and report back?

Thank you.

>Perhaps if I make the prio_bias multiplied instead of added to the cpu load 
>it will be less affected by SCHED_LOAD_SCALE. The attached patch was 
>confirmed during testing to also provide smp distribution according to nice 
>on 4x. Carlos, I know your machine is in production so testing may not be 
>easy for you. Please try this on top if you have time.

I have no idea about SCHED_LOAD_SCALE, I'm afraid, but I will give
this latest patch a run while I'm at it.

The load.sh is the same one I posted before:

$ ./load.sh 5
top - 19:41:33 up 9 min,  2 users,  load average: 0.40, 0.10, 0.03
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND        
 2798 mjt       25   0  5660 1140  936 R 99.9  0.0   0:02.04  2 load_base.sh   
 2799 mjt       25   0  5660 1140  936 R 99.9  0.0   0:02.04  3 load_base.sh   
 2797 mjt       25   0  5660 1140  936 R 51.8  0.0   0:01.16  0 load_base.sh   
 2801 mjt       39  19  5660 1140  936 R  7.0  0.0   0:00.12  1 load_base.sh   
 2800 mjt       39  19  5660 1140  936 R  3.0  0.0   0:00.05  0 load_base.sh   

top - 19:42:20 up 10 min,  2 users,  load average: 2.83, 0.78, 0.26
PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND        
 2798 mjt       35   0  5660 1140  936 R 99.5  0.0   0:48.55  2 load_base.sh   
 2799 mjt       35   0  5660 1140  936 R 99.5  0.0   0:48.55  3 load_base.sh   
 2797 mjt       34   0  5660 1140  936 R 61.7  0.0   0:27.43  0 load_base.sh   
 2801 mjt       39  19  5660 1140  936 R  6.0  0.0   0:03.11  1 load_base.sh   
 2800 mjt       39  19  5660 1140  936 R  3.0  0.0   0:01.35  0 load_base.sh   

top - 19:43:00 up 10 min,  2 users,  load average: 3.88, 1.31, 0.46
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND        
 2798 mjt       24   0  5660 1140  936 R 99.9  0.0   1:29.13  2 load_base.sh   
 2799 mjt       24   0  5660 1140  936 R 99.5  0.0   1:29.12  3 load_base.sh   
 2797 mjt       24   0  5660 1140  936 R 49.8  0.0   0:50.19  0 load_base.sh   
 2801 mjt       39  19  5660 1140  936 R  7.0  0.0   0:05.76  1 load_base.sh   
 2800 mjt       39  19  5660 1140  936 R  3.0  0.0   0:02.48  0 load_base.sh   

$ ./load.sh 7
top - 19:43:49 up 11 min,  2 users,  load average: 4.98, 1.97, 0.73
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND        
 2807 mjt       21   0  5660 1140  936 R 99.9  0.0   0:27.73  0 load_base.sh   
 2804 mjt       34   0  5660 1140  936 R 99.6  0.0   0:23.46  1 load_base.sh   
 2808 mjt       20   0  5660 1140  936 R 99.6  0.0   0:25.51  2 load_base.sh   
 2805 mjt       39   0  5660 1140  936 R 39.8  0.0   0:08.12  3 load_base.sh   
 2806 mjt       33   0  5660 1140  936 R 37.9  0.0   0:12.46  3 load_base.sh   
 2788 mjt       20   0  5168 1092  832 R  1.0  0.0   0:00.56  3 top            
 2809 mjt       39  19  5660 1140  936 R  1.0  0.0   0:00.41  3 load_base.sh   
 2810 mjt       39  19  5660 1144  936 R  1.0  0.0   0:00.40  3 load_base.sh   

top - 19:44:20 up 12 min,  2 users,  load average: 5.78, 2.45, 0.92
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND        
 2807 mjt       22   0  5660 1140  936 R 99.9  0.0   0:56.56  0 load_base.sh   
 2804 mjt       35   0  5660 1140  936 R 99.5  0.0   0:52.29  1 load_base.sh   
 2808 mjt       21   0  5660 1140  936 R 99.5  0.0   0:54.34  2 load_base.sh   
 2805 mjt       35   0  5660 1140  936 R 33.8  0.0   0:15.99  3 load_base.sh   
 2806 mjt       39   0  5660 1140  936 R 21.9  0.0   0:20.22  3 load_base.sh   
 2788 mjt       20   0  5168 1092  832 R  1.0  0.0   0:00.65  3 top            
 2809 mjt       39  19  5660 1140  936 R  1.0  0.0   0:00.80  3 load_base.sh   
 2810 mjt       39  19  5660 1144  936 R  1.0  0.0   0:00.79  3 load_base.sh   

top - 19:45:00 up 12 min,  2 users,  load average: 6.37, 3.02, 1.18
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND        
 2804 mjt       28   0  5660 1140  936 R 99.9  0.0   1:32.02  1 load_base.sh   
 2807 mjt       35   0  5660 1140  936 R 99.9  0.0   1:36.29  0 load_base.sh   
 2808 mjt       33   0  5660 1140  936 R 99.5  0.0   1:34.07  2 load_base.sh   
 2806 mjt       27   0  5660 1140  936 R 30.9  0.0   0:31.09  3 load_base.sh   
 2805 mjt       39   0  5660 1140  936 R 24.9  0.0   0:26.81  3 load_base.sh   
 2809 mjt       39  19  5660 1140  936 R  1.0  0.0   0:01.34  3 load_base.sh   
 2810 mjt       39  19  5660 1144  936 R  1.0  0.0   0:01.33  3 load_base.sh   

Then I decided to do something crazier and renice some pids, to see
what happens...

top - 19:45:45 up 13 min,  2 users,  load average: 6.70, 3.58, 1.45
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND        
 2807 mjt       37   0  5660 1140  936 R 99.9  0.0   2:21.19  0 load_base.sh   
 2804 mjt       30   0  5660 1140  936 R 99.5  0.0   2:16.92  1 load_base.sh   
 2806 mjt       36   0  5660 1140  936 R 41.8  0.0   0:43.73  2 load_base.sh   
 2805 mjt       39   0  5660 1140  936 R 21.9  0.0   0:39.13  2 load_base.sh   
 2809 mjt       39  19  5660 1140  936 R  6.0  0.0   0:02.10  3 load_base.sh   
 2808 mjt       39  10  5660 1140  936 R  2.0  0.0   2:16.01  2 load_base.sh   
 2810 mjt       39  19  5660 1144  936 R  2.0  0.0   0:01.96  2 load_base.sh   

top - 19:46:20 up 14 min,  2 users,  load average: 6.83, 3.95, 1.66
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND        
 2804 mjt       35   0  5660 1140  936 R 99.6  0.0   2:52.38  1 load_base.sh   
 2805 mjt       37   0  5660 1140  936 R 99.6  0.0   0:54.29  3 load_base.sh   
 2807 mjt       21   0  5660 1140  936 R 99.6  0.0   2:56.65  0 load_base.sh   
 2806 mjt       21   0  5660 1140  936 R 23.9  0.0   0:53.67  2 load_base.sh   
 2808 mjt       39  10  5660 1140  936 R 21.9  0.0   2:34.49  2 load_base.sh   
 2809 mjt       39  19  5660 1140  936 R  2.0  0.0   0:02.66  2 load_base.sh   
 2810 mjt       39  19  5660 1144  936 R  2.0  0.0   0:02.57  2 load_base.sh   

top - 19:47:00 up 14 min,  2 users,  load average: 6.91, 4.33, 1.88
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND        
 2805 mjt       32   0  5660 1140  936 R 99.9  0.0   1:34.05  3 load_base.sh   
 2807 mjt       37   0  5660 1140  936 R 99.9  0.0   3:36.42  0 load_base.sh   
 2804 mjt       30   0  5660 1140  936 R 99.6  0.0   3:32.15  1 load_base.sh   
 2806 mjt       36   0  5660 1140  936 R 40.8  0.0   1:07.04  2 load_base.sh   
 2808 mjt       39  10  5660 1140  936 R 21.9  0.0   2:41.09  2 load_base.sh   
 2809 mjt       39  19  5660 1140  936 R  2.0  0.0   0:03.32  2 load_base.sh   
 2810 mjt       39  19  5660 1144  936 R  1.0  0.0   0:03.23  2 load_base.sh   

And sudo renice before I call it a day

top - 19:48:27 up 16 min,  2 users,  load average: 7.21, 5.05, 2.34
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND        
 2806 mjt       28   0  5660 1140  936 R 99.9  0.0   1:57.55  2 load_base.sh   
 2807 mjt       27   0  5660 1140  936 R 99.9  0.0   5:00.99  0 load_base.sh   
 2810 mjt       26 -10  5660 1144  936 R 96.3  0.0   0:05.56  3 load_base.sh   
 2804 mjt       39   0  5660 1140  936 R 24.8  0.0   4:57.88  1 load_base.sh   
 2808 mjt       36  10  5660 1140  936 R  9.5  0.0   2:56.90  1 load_base.sh   
 2805 mjt       39   0  5660 1140  936 R  1.0  0.0   2:29.56  1 load_base.sh   

top - 19:49:10 up 16 min,  2 users,  load average: 7.65, 5.46, 2.61
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND        
 2806 mjt       24   0  5660 1140  936 R 99.9  0.0   2:39.96  2 load_base.sh   
 2807 mjt       23   0  5660 1140  936 R 99.9  0.0   5:43.40  0 load_base.sh   
 2810 mjt       26 -10  5660 1144  936 R 99.9  0.0   0:45.87  1 load_base.sh   
 2805 mjt       39   0  5660 1140  936 R 17.5  0.0   2:36.59  3 load_base.sh   
 2808 mjt       39  10  5660 1140  936 R  8.7  0.0   2:59.79  3 load_base.sh   
 2804 mjt       27   0  5660 1140  936 R  7.1  0.0   5:03.47  3 load_base.sh   

top - 19:49:45 up 17 min,  2 users,  load average: 7.36, 5.63, 2.77
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  #C COMMAND        
 2806 mjt       31   0  5660 1140  936 R 99.9  0.0   3:15.66  2 load_base.sh   
 2807 mjt       29   0  5660 1140  936 R 99.9  0.0   6:19.10  0 load_base.sh   
 2810 mjt       26 -10  5660 1144  936 R 99.9  0.0   1:16.74  3 load_base.sh   
 2804 mjt       39   0  5660 1140  936 R 17.5  0.0   5:09.32  1 load_base.sh   
 2805 mjt       39   0  5660 1140  936 R 17.5  0.0   2:45.17  1 load_base.sh   
 2808 mjt       39  10  5660 1140  936 R  8.7  0.0   3:02.87  1 load_base.sh   

Seems good enough under this very fabricated stress; hopefully someone
can tell me a good practical application with processes of different
nice levels going all over the place, so I can try something else.

But these processes are still clinging a bit to cpu 1 here; that's probably
another SMT feature.
Who tests this on SMP without SMT? Anyone? You! Over there!

This box will go into production soon and then I can maybe get some
glimpses of what happens in practice, and that's about it.
And it'll probably run everything with default values, except a light
mysql -5.

Thanks!

-- 
mjt


[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]


* Re: [ck] Re: [PATCH] implement nice support across physical cpus on SMP
  2005-05-09 11:47   ` Con Kolivas
  2005-05-09 18:55     ` Markus Törnqvist
@ 2005-05-09 23:54     ` Carlos Carvalho
  2005-05-11  2:56       ` Con Kolivas
  2005-05-11  3:04       ` [SMP NICE] [PATCH 1/2] SCHED: Implement " Con Kolivas
  1 sibling, 2 replies; 21+ messages in thread
From: Carlos Carvalho @ 2005-05-09 23:54 UTC (permalink / raw)
  To: Con Kolivas
  Cc: ck, Ingo Molnar, Markus Törnqvist, linux-kernel, Andrew Morton

Con Kolivas (kernel@kolivas.org) wrote on 9 May 2005 21:47:
 >Perhaps if I make the prio_bias multiplied instead of added to the cpu load 
 >it will be less affected by SCHED_LOAD_SCALE. The attached patch was 
 >confirmed during testing to also provide smp distribution according to nice 
 >on 4x.

It seems to work. I've tested it for a few hours on the same machine
and the 2 nice 0 processes take the bulk of the cpu time, while that
cpu bound program running at nice 19 takes only about 7%.

Maybe it's a bit early to say it's fine, but it does seem much better
than before, so I think it should go into the tree.

Thanks a lot!


* Re: [ck] Re: [PATCH] implement nice support across physical cpus on SMP
  2005-05-09 23:54     ` Carlos Carvalho
@ 2005-05-11  2:56       ` Con Kolivas
  2005-05-11  3:04       ` [SMP NICE] [PATCH 1/2] SCHED: Implement " Con Kolivas
  1 sibling, 0 replies; 21+ messages in thread
From: Con Kolivas @ 2005-05-11  2:56 UTC (permalink / raw)
  To: Carlos Carvalho
  Cc: ck, Ingo Molnar, Markus Törnqvist, linux-kernel, Andrew Morton

On Tue, 10 May 2005 09:54 am, Carlos Carvalho wrote:
> Con Kolivas (kernel@kolivas.org) wrote on 9 May 2005 21:47:
>  >Perhaps if I make the prio_bias multiplied instead of added to the cpu
>  > load it will be less affected by SCHED_LOAD_SCALE. The attached patch
>  > was confirmed during testing to also provide smp distribution according
>  > to nice on 4x.
>
> It seems to work. I've tested it for a few hours on the same machine
> and the 2 nice 0 processes take the bulk of the cpu time, while that
> cpu bound program running at nice 19 takes only about 7%.
>
> Maybe it's a bit early to say it's fine, but it does semm much better
> than before, so I think it should go into the tree.
>
> Thanks a lot!

My pleasure. Thanks for testing.

I'll roll up these patches for rc4 and make smp nice balancing a config option 
for ultimate flexibility.

Cheers,
Con


* [SMP NICE] [PATCH 1/2] SCHED: Implement nice support across physical cpus on SMP
  2005-05-09 23:54     ` Carlos Carvalho
  2005-05-11  2:56       ` Con Kolivas
@ 2005-05-11  3:04       ` Con Kolivas
  2005-05-11  3:05         ` [SMP NICE] [PATCH 2/2] SCHED: Make SMP nice a config option Con Kolivas
  2005-05-16 11:33         ` [SMP NICE] [PATCH] SCHED: Implement nice support across physical cpus on SMP Con Kolivas
  1 sibling, 2 replies; 21+ messages in thread
From: Con Kolivas @ 2005-05-11  3:04 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Carlos Carvalho, ck, Ingo Molnar, Markus Törnqvist, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 44 bytes --]

Andrew, please consider for inclusion in -mm.

[-- Attachment #2: implement_smp_nice_balancing.diff --]
[-- Type: text/x-diff, Size: 7560 bytes --]

This patch implements 'nice' support across physical cpus on SMP.

It introduces an extra runqueue variable prio_bias which is the sum of the
(inverted) static priorities of all the tasks on the runqueue. This is then used
to bias busy rebalancing between runqueues to obtain good distribution of tasks
of different nice values. By biasing the balancing only during busy rebalancing
we can avoid having any significant loss of throughput by not affecting the
carefully tuned idle balancing already in place. If all tasks are running at the
same nice level this code should also have minimal effect. The code is optimised
out in the !CONFIG_SMP case.
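
As a quick user-space sanity check of this bookkeeping (again, not part of the
patch, and assuming MAX_PRIO = 140), a toy model of the helpers behaves as
expected when tasks are queued and reniced:

#include <stdio.h>

#define MAX_PRIO 140	/* assumed 2.6 value */

/* Toy model of the two runqueue fields the patch touches. */
struct rq { unsigned long nr_running, prio_bias; };

static void inc_prio_bias(struct rq *rq, int static_prio)
{
	rq->prio_bias += MAX_PRIO - static_prio;
}

static void dec_prio_bias(struct rq *rq, int static_prio)
{
	rq->prio_bias -= MAX_PRIO - static_prio;
}

static void inc_nr_running(struct rq *rq, int static_prio)
{
	rq->nr_running++;
	inc_prio_bias(rq, static_prio);
}

int main(void)
{
	struct rq rq = { 0, 0 };

	inc_nr_running(&rq, 120);	/* nice 0 task:  weight 20 */
	inc_nr_running(&rq, 139);	/* nice 19 task: weight 1  */
	printf("%lu running, bias %lu\n", rq.nr_running, rq.prio_bias);

	/* set_user_nice() path: renice the nice 19 task to nice 0. */
	dec_prio_bias(&rq, 139);
	inc_prio_bias(&rq, 120);
	printf("%lu running, bias %lu\n", rq.nr_running, rq.prio_bias);
	return 0;
}

The bias stays in step with the queued tasks (21 after the two enqueues, 40
after the renice) without touching nr_running, which mirrors what the
set_user_nice() hunks below do.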

Signed-off-by: Con Kolivas <kernel@kolivas.org>

Index: linux-2.6.12-rc4-smpnice/kernel/sched.c
===================================================================
--- linux-2.6.12-rc4-smpnice.orig/kernel/sched.c	2005-05-08 20:18:01.000000000 +1000
+++ linux-2.6.12-rc4-smpnice/kernel/sched.c	2005-05-10 20:28:34.000000000 +1000
@@ -206,6 +206,7 @@ struct runqueue {
 	 */
 	unsigned long nr_running;
 #ifdef CONFIG_SMP
+	unsigned long prio_bias;
 	unsigned long cpu_load;
 #endif
 	unsigned long long nr_switches;
@@ -604,13 +605,45 @@ static int effective_prio(task_t *p)
 	return prio;
 }
 
+#ifdef CONFIG_SMP
+static inline void inc_prio_bias(runqueue_t *rq, int static_prio)
+{
+	rq->prio_bias += MAX_PRIO - static_prio;
+}
+
+static inline void dec_prio_bias(runqueue_t *rq, int static_prio)
+{
+	rq->prio_bias -= MAX_PRIO - static_prio;
+}
+#else
+static inline void inc_prio_bias(runqueue_t *rq, int static_prio)
+{
+}
+
+static inline void dec_prio_bias(runqueue_t *rq, int static_prio)
+{
+}
+#endif
+
+static inline void inc_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running++;
+	inc_prio_bias(rq, p->static_prio);
+}
+
+static inline void dec_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running--;
+	dec_prio_bias(rq, p->static_prio);
+}
+
 /*
  * __activate_task - move a task to the runqueue.
  */
 static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task(p, rq->active);
-	rq->nr_running++;
+	inc_nr_running(p, rq);
 }
 
 /*
@@ -619,7 +652,7 @@ static inline void __activate_task(task_
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task_head(p, rq->active);
-	rq->nr_running++;
+	inc_nr_running(p, rq);
 }
 
 static void recalc_task_prio(task_t *p, unsigned long long now)
@@ -738,7 +771,7 @@ static void activate_task(task_t *p, run
  */
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	rq->nr_running--;
+	dec_nr_running(p, rq);
 	dequeue_task(p, p->array);
 	p->array = NULL;
 }
@@ -886,23 +919,37 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static inline unsigned long source_load(int cpu)
+static inline unsigned long source_load(int cpu, enum idle_type idle)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+	unsigned long cpu_load = rq->cpu_load,
+		load_now = rq->nr_running * SCHED_LOAD_SCALE;
 
-	return min(rq->cpu_load, load_now);
+	if (idle == NOT_IDLE) {
+		/*
+		 * If we are balancing busy runqueues the load is biased by
+		 * priority to create 'nice' support across cpus.
+		 */
+		cpu_load *= rq->prio_bias;
+		load_now *= rq->prio_bias;
+	}
+	return min(cpu_load, load_now);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu
  */
-static inline unsigned long target_load(int cpu)
+static inline unsigned long target_load(int cpu, enum idle_type idle)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+	unsigned long cpu_load = rq->cpu_load,
+		load_now = rq->nr_running * SCHED_LOAD_SCALE;
 
-	return max(rq->cpu_load, load_now);
+	if (idle == NOT_IDLE) {
+		cpu_load *= rq->prio_bias;
+		load_now *= rq->prio_bias;
+	}
+	return max(cpu_load, load_now);
 }
 
 #endif
@@ -1004,8 +1051,8 @@ static int try_to_wake_up(task_t * p, un
 	if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
 		goto out_set_cpu;
 
-	load = source_load(cpu);
-	this_load = target_load(this_cpu);
+	load = source_load(cpu, SCHED_IDLE);
+	this_load = target_load(this_cpu, SCHED_IDLE);
 
 	/*
 	 * If sync wakeup then subtract the (maximum possible) effect of
@@ -1226,7 +1273,7 @@ void fastcall wake_up_new_task(task_t * 
 				list_add_tail(&p->run_list, &current->run_list);
 				p->array = current->array;
 				p->array->nr_active++;
-				rq->nr_running++;
+				inc_nr_running(p, rq);
 			}
 			set_need_resched();
 		} else
@@ -1509,7 +1556,7 @@ static int find_idlest_cpu(struct task_s
 	cpus_and(mask, sd->span, p->cpus_allowed);
 
 	for_each_cpu_mask(i, mask) {
-		load = target_load(i);
+		load = target_load(i, SCHED_IDLE);
 
 		if (load < min_load) {
 			min_cpu = i;
@@ -1522,7 +1569,7 @@ static int find_idlest_cpu(struct task_s
 	}
 
 	/* add +1 to account for the new task */
-	this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
+	this_load = source_load(this_cpu, SCHED_IDLE) + SCHED_LOAD_SCALE;
 
 	/*
 	 * Would with the addition of the new task to the
@@ -1613,9 +1660,9 @@ void pull_task(runqueue_t *src_rq, prio_
 	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
 	dequeue_task(p, src_array);
-	src_rq->nr_running--;
+	dec_nr_running(p, src_rq);
 	set_task_cpu(p, this_cpu);
-	this_rq->nr_running++;
+	inc_nr_running(p, this_rq);
 	enqueue_task(p, this_array);
 	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
 				+ this_rq->timestamp_last_tick;
@@ -1776,9 +1823,9 @@ find_busiest_group(struct sched_domain *
 		for_each_cpu_mask(i, group->cpumask) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = target_load(i);
+				load = target_load(i, idle);
 			else
-				load = source_load(i);
+				load = source_load(i, idle);
 
 			avg_load += load;
 		}
@@ -1887,14 +1934,14 @@ out_balanced:
 /*
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
-static runqueue_t *find_busiest_queue(struct sched_group *group)
+static runqueue_t *find_busiest_queue(struct sched_group *group, enum idle_type idle)
 {
 	unsigned long load, max_load = 0;
 	runqueue_t *busiest = NULL;
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
-		load = source_load(i);
+		load = source_load(i, idle);
 
 		if (load > max_load) {
 			max_load = load;
@@ -1928,7 +1975,7 @@ static int load_balance(int this_cpu, ru
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group);
+	busiest = find_busiest_queue(group, idle);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
@@ -2035,7 +2082,7 @@ static int load_balance_newidle(int this
 		goto out;
 	}
 
-	busiest = find_busiest_queue(group);
+	busiest = find_busiest_queue(group, NEWLY_IDLE);
 	if (!busiest || busiest == this_rq) {
 		schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
 		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
@@ -3196,7 +3243,9 @@ void set_user_nice(task_t *p, long nice)
 	 * not SCHED_NORMAL:
 	 */
 	if (rt_task(p)) {
+		dec_prio_bias(rq, p->static_prio);
 		p->static_prio = NICE_TO_PRIO(nice);
+		inc_prio_bias(rq, p->static_prio);
 		goto out_unlock;
 	}
 	array = p->array;
@@ -3206,7 +3255,9 @@ void set_user_nice(task_t *p, long nice)
 	old_prio = p->prio;
 	new_prio = NICE_TO_PRIO(nice);
 	delta = new_prio - old_prio;
+	dec_prio_bias(rq, p->static_prio);
 	p->static_prio = NICE_TO_PRIO(nice);
+	inc_prio_bias(rq, p->static_prio);
 	p->prio += delta;
 
 	if (array) {


* [SMP NICE] [PATCH 2/2] SCHED: Make SMP nice a config option
  2005-05-11  3:04       ` [SMP NICE] [PATCH 1/2] SCHED: Implement " Con Kolivas
@ 2005-05-11  3:05         ` Con Kolivas
  2005-05-11  7:20           ` Ingo Molnar
  2005-05-16 11:33         ` [SMP NICE] [PATCH] SCHED: Implement nice support across physical cpus on SMP Con Kolivas
  1 sibling, 1 reply; 21+ messages in thread
From: Con Kolivas @ 2005-05-11  3:05 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Carlos Carvalho, ck, Ingo Molnar, Markus Törnqvist, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1 bytes --]



[-- Attachment #2: make_smp_nice_config_option.diff --]
[-- Type: text/x-diff, Size: 15552 bytes --]

Certain configurations may not need the SMP nice balancing scheme and would
prefer SMP balancing to be based purely on throughput. Make SMP nice support
a config option which disables priority bias for SMP balancing and priority
based SMT sibling sleeps.
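
Applied on top of the previous patch, the sched.c side (only the start of which
is visible in the truncated hunk below) presumably just swaps the guard around
the bias helpers, so with CONFIG_SMP_NICE=n they compile to empty stubs and the
balancer falls back to plain nr_running-based loads. Roughly:

#ifdef CONFIG_SMP_NICE
static inline void inc_prio_bias(runqueue_t *rq, int static_prio)
{
	rq->prio_bias += MAX_PRIO - static_prio;
}

static inline void dec_prio_bias(runqueue_t *rq, int static_prio)
{
	rq->prio_bias -= MAX_PRIO - static_prio;
}
#else
/* CONFIG_SMP_NICE=n: no-op stubs, so the bias maths optimises away. */
static inline void inc_prio_bias(runqueue_t *rq, int static_prio)
{
}

static inline void dec_prio_bias(runqueue_t *rq, int static_prio)
{
}
#endif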

Signed-off-by: Con Kolivas <kernel@kolivas.org>


Index: linux-2.6.12-rc4-smpnice/arch/alpha/Kconfig
===================================================================
--- linux-2.6.12-rc4-smpnice.orig/arch/alpha/Kconfig	2005-05-11 11:45:42.000000000 +1000
+++ linux-2.6.12-rc4-smpnice/arch/alpha/Kconfig	2005-05-11 12:25:40.000000000 +1000
@@ -498,6 +498,19 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
+config SMP_NICE
+	bool "SMP support for nice levels across cpus"
+	depends on SMP
+	default y
+	---help---
+	  This option supports a degree of unbalancing of cpus according to
+	  processes' nice levels. Disabling this option on dedicated single
+	  purpose servers may improve throughput slightly but cpu resource
+	  sharing according to 'nice' across physical or logical cpus will
+	  be lost.
+
+	  If unsure say Y
+
 config HAVE_DEC_LOCK
 	bool
 	depends on SMP
Index: linux-2.6.12-rc4-smpnice/arch/arm/Kconfig
===================================================================
--- linux-2.6.12-rc4-smpnice.orig/arch/arm/Kconfig	2005-05-11 11:45:42.000000000 +1000
+++ linux-2.6.12-rc4-smpnice/arch/arm/Kconfig	2005-05-11 12:26:21.000000000 +1000
@@ -324,6 +324,19 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
+config SMP_NICE
+	bool "SMP support for nice levels across cpus"
+	depends on SMP
+	default y
+	---help---
+	  This option supports a degree of unbalancing of cpus according to
+	  processes' nice levels. Disabling this option on dedicated single
+	  purpose servers may improve throughput slightly but cpu resource
+	  sharing according to 'nice' across physical or logical cpus will
+	  be lost.
+
+	  If unsure say Y
+
 config NR_CPUS
 	int "Maximum number of CPUs (2-32)"
 	range 2 32
Index: linux-2.6.12-rc4-smpnice/arch/i386/Kconfig
===================================================================
--- linux-2.6.12-rc4-smpnice.orig/arch/i386/Kconfig	2005-05-11 11:45:42.000000000 +1000
+++ linux-2.6.12-rc4-smpnice/arch/i386/Kconfig	2005-05-11 12:26:05.000000000 +1000
@@ -487,6 +487,19 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
+config SMP_NICE
+	bool "SMP support for nice levels across cpus"
+	depends on SMP
+	default y
+	---help---
+	  This option supports a degree of unbalancing of cpus according to
+	  processes' nice levels. Disabling this option on dedicated single
+	  purpose servers may improve throughput slightly but cpu resource
+	  sharing according to 'nice' across physical or logical cpus will
+	  be lost.
+
+	  If unsure say Y
+
 config NR_CPUS
 	int "Maximum number of CPUs (2-255)"
 	range 2 255
Index: linux-2.6.12-rc4-smpnice/arch/ia64/Kconfig
===================================================================
--- linux-2.6.12-rc4-smpnice.orig/arch/ia64/Kconfig	2005-05-11 11:45:42.000000000 +1000
+++ linux-2.6.12-rc4-smpnice/arch/ia64/Kconfig	2005-05-11 12:28:03.000000000 +1000
@@ -253,6 +253,19 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
+config SMP_NICE
+	bool "SMP support for nice levels across cpus"
+	depends on SMP
+	default y
+	---help---
+	  This option supports a degree of unbalancing of cpus according to
+	  processes' nice levels. Disabling this option on dedicated single
+	  purpose servers may improve throughput slightly but cpu resource
+	  sharing according to 'nice' across physical or logical cpus will
+	  be lost.
+
+	  If unsure say Y
+
 config NR_CPUS
 	int "Maximum number of CPUs (2-512)"
 	range 2 512
Index: linux-2.6.12-rc4-smpnice/arch/m32r/Kconfig
===================================================================
--- linux-2.6.12-rc4-smpnice.orig/arch/m32r/Kconfig	2005-03-02 18:37:30.000000000 +1100
+++ linux-2.6.12-rc4-smpnice/arch/m32r/Kconfig	2005-05-11 12:27:56.000000000 +1000
@@ -241,6 +241,19 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
+config SMP_NICE
+	bool "SMP support for nice levels across cpus"
+	depends on SMP
+	default y
+	---help---
+	  This option supports a degree of unbalancing of cpus according to
+	  processes' nice levels. Disabling this option on dedicated single
+	  purpose servers may improve throughput slightly but cpu resource
+	  sharing according to 'nice' across physical or logical cpus will
+	  be lost.
+
+	  If unsure say Y
+
 config CHIP_M32700_TS1
 	bool "Workaround code for the M32700 TS1 chip's bug"
 	depends on (CHIP_M32700 && SMP)
Index: linux-2.6.12-rc4-smpnice/arch/mips/Kconfig
===================================================================
--- linux-2.6.12-rc4-smpnice.orig/arch/mips/Kconfig	2005-05-11 11:45:42.000000000 +1000
+++ linux-2.6.12-rc4-smpnice/arch/mips/Kconfig	2005-05-11 12:27:48.000000000 +1000
@@ -1438,6 +1438,19 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
+config SMP_NICE
+	bool "SMP support for nice levels across cpus"
+	depends on SMP
+	default y
+	---help---
+	  This option supports a degree of unbalancing of cpus according to
+	  processes' nice levels. Disabling this option on dedicated single
+	  purpose servers may improve throughput slightly but cpu resource
+	  sharing according to 'nice' across physical or logical cpus will
+	  be lost.
+
+	  If unsure say Y
+
 config NR_CPUS
 	int "Maximum number of CPUs (2-64)"
 	range 2 64
Index: linux-2.6.12-rc4-smpnice/arch/parisc/Kconfig
===================================================================
--- linux-2.6.12-rc4-smpnice.orig/arch/parisc/Kconfig	2005-05-11 11:45:42.000000000 +1000
+++ linux-2.6.12-rc4-smpnice/arch/parisc/Kconfig	2005-05-11 12:27:40.000000000 +1000
@@ -143,6 +143,19 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
+config SMP_NICE
+	bool "SMP support for nice levels across cpus"
+	depends on SMP
+	default y
+	---help---
+	  This option supports a degree of unbalancing of cpus according to
+	  processes' nice levels. Disabling this option on dedicated single
+	  purpose servers may improve throughput slightly but cpu resource
+	  sharing according to 'nice' across physical or logical cpus will
+	  be lost.
+
+	  If unsure say Y
+
 config HOTPLUG_CPU
 	bool
 	default y if SMP
Index: linux-2.6.12-rc4-smpnice/arch/ppc/Kconfig
===================================================================
--- linux-2.6.12-rc4-smpnice.orig/arch/ppc/Kconfig	2005-05-11 11:45:42.000000000 +1000
+++ linux-2.6.12-rc4-smpnice/arch/ppc/Kconfig	2005-05-11 12:27:32.000000000 +1000
@@ -882,6 +882,19 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
+config SMP_NICE
+	bool "SMP support for nice levels across cpus"
+	depends on SMP
+	default y
+	---help---
+	  This option supports a degree of unbalancing of cpus according to
+	  processes' nice levels. Disabling this option on dedicated single
+	  purpose servers may improve throughput slightly but cpu resource
+	  sharing according to 'nice' across physical or logical cpus will
+	  be lost.
+
+	  If unsure say Y
+
 config IRQ_ALL_CPUS
 	bool "Distribute interrupts on all CPUs by default"
 	depends on SMP
Index: linux-2.6.12-rc4-smpnice/arch/ppc64/Kconfig
===================================================================
--- linux-2.6.12-rc4-smpnice.orig/arch/ppc64/Kconfig	2005-05-11 11:45:42.000000000 +1000
+++ linux-2.6.12-rc4-smpnice/arch/ppc64/Kconfig	2005-05-11 12:27:26.000000000 +1000
@@ -185,6 +185,19 @@ config SMP
 
 	  If you don't know what to do here, say Y.
 
+config SMP_NICE
+	bool "SMP support for nice levels across cpus"
+	depends on SMP
+	default y
+	---help---
+	  This option supports a degree of unbalancing of cpus according to
+	  processes' nice levels. Disabling this option on dedicated single
+	  purpose servers may improve throughput slightly but cpu resource
+	  sharing according to 'nice' across physical or logical cpus will
+	  be lost.
+
+	  If unsure say Y
+
 config NR_CPUS
 	int "Maximum number of CPUs (2-128)"
 	range 2 128
Index: linux-2.6.12-rc4-smpnice/arch/s390/Kconfig
===================================================================
--- linux-2.6.12-rc4-smpnice.orig/arch/s390/Kconfig	2005-05-11 11:45:42.000000000 +1000
+++ linux-2.6.12-rc4-smpnice/arch/s390/Kconfig	2005-05-11 12:27:19.000000000 +1000
@@ -70,6 +70,19 @@ config SMP
 
 	  Even if you don't know what to do here, say Y.
 
+config SMP_NICE
+	bool "SMP support for nice levels across cpus"
+	depends on SMP
+	default y
+	---help---
+	  This option supports a degree of unbalancing of cpus according to
+	  processes' nice levels. Disabling this option on dedicated single
+	  purpose servers may improve throughput slightly but cpu resource
+	  sharing according to 'nice' across physical or logical cpus will
+	  be lost.
+
+	  If unsure say Y
+
 config NR_CPUS
 	int "Maximum number of CPUs (2-64)"
 	range 2 64
Index: linux-2.6.12-rc4-smpnice/arch/sh/Kconfig
===================================================================
--- linux-2.6.12-rc4-smpnice.orig/arch/sh/Kconfig	2005-05-11 11:45:42.000000000 +1000
+++ linux-2.6.12-rc4-smpnice/arch/sh/Kconfig	2005-05-11 12:27:10.000000000 +1000
@@ -605,6 +605,19 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
+config SMP_NICE
+	bool "SMP support for nice levels across cpus"
+	depends on SMP
+	default y
+	---help---
+	  This option supports a degree of unbalancing of cpus according to
+	  processes' nice levels. Disabling this option on dedicated single
+	  purpose servers may improve throughput slightly but cpu resource
+	  sharing according to 'nice' across physical or logical cpus will
+	  be lost.
+
+	  If unsure say Y
+
 config NR_CPUS
 	int "Maximum number of CPUs (2-32)"
 	range 2 32
Index: linux-2.6.12-rc4-smpnice/arch/sparc/Kconfig
===================================================================
--- linux-2.6.12-rc4-smpnice.orig/arch/sparc/Kconfig	2005-03-02 18:37:30.000000000 +1100
+++ linux-2.6.12-rc4-smpnice/arch/sparc/Kconfig	2005-05-11 12:27:00.000000000 +1000
@@ -105,6 +105,19 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
+config SMP_NICE
+	bool "SMP support for nice levels across cpus"
+	depends on SMP
+	default y
+	---help---
+	  This option supports a degree of unbalancing of cpus according to
+	  processes' nice levels. Disabling this option on dedicated
+	  single-purpose servers may improve throughput slightly, but cpu
+	  resource sharing according to 'nice' across physical or logical
+	  cpus will be lost.
+
+	  If unsure, say Y.
+
 config NR_CPUS
 	int "Maximum number of CPUs (2-32)"
 	range 2 32
Index: linux-2.6.12-rc4-smpnice/arch/sparc64/Kconfig
===================================================================
--- linux-2.6.12-rc4-smpnice.orig/arch/sparc64/Kconfig	2005-05-11 11:45:42.000000000 +1000
+++ linux-2.6.12-rc4-smpnice/arch/sparc64/Kconfig	2005-05-11 12:26:51.000000000 +1000
@@ -144,6 +144,19 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
+config SMP_NICE
+	bool "SMP support for nice levels across cpus"
+	depends on SMP
+	default y
+	---help---
+	  This option supports a degree of unbalancing of cpus according to
+	  processes' nice levels. Disabling this option on dedicated
+	  single-purpose servers may improve throughput slightly, but cpu
+	  resource sharing according to 'nice' across physical or logical
+	  cpus will be lost.
+
+	  If unsure, say Y.
+
 config PREEMPT
 	bool "Preemptible Kernel"
 	help
Index: linux-2.6.12-rc4-smpnice/arch/um/Kconfig
===================================================================
--- linux-2.6.12-rc4-smpnice.orig/arch/um/Kconfig	2005-05-11 11:45:42.000000000 +1000
+++ linux-2.6.12-rc4-smpnice/arch/um/Kconfig	2005-05-11 12:26:34.000000000 +1000
@@ -211,6 +211,19 @@ config SMP
 
 	If you don't know what to do, say N.
 
+config SMP_NICE
+	bool "SMP support for nice levels across cpus"
+	depends on SMP
+	default y
+	---help---
+	  This option supports a degree of unbalancing of cpus according to
+	  processes' nice levels. Disabling this option on dedicated
+	  single-purpose servers may improve throughput slightly, but cpu
+	  resource sharing according to 'nice' across physical or logical
+	  cpus will be lost.
+
+	  If unsure, say Y.
+
 config NR_CPUS
 	int "Maximum number of CPUs (2-32)"
 	range 2 32
Index: linux-2.6.12-rc4-smpnice/arch/x86_64/Kconfig
===================================================================
--- linux-2.6.12-rc4-smpnice.orig/arch/x86_64/Kconfig	2005-05-11 11:45:42.000000000 +1000
+++ linux-2.6.12-rc4-smpnice/arch/x86_64/Kconfig	2005-05-11 12:25:02.000000000 +1000
@@ -207,6 +207,19 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
+config SMP_NICE
+	bool "SMP support for nice levels across cpus"
+	depends on SMP
+	default y
+	---help---
+	  This option supports a degree of unbalancing of cpus according to
+	  processes' nice levels. Disabling this option on dedicated
+	  single-purpose servers may improve throughput slightly, but cpu
+	  resource sharing according to 'nice' across physical or logical
+	  cpus will be lost.
+
+	  If unsure, say Y.
+
 config PREEMPT
 	bool "Preemptible Kernel"
 	---help---
Index: linux-2.6.12-rc4-smpnice/kernel/sched.c
===================================================================
--- linux-2.6.12-rc4-smpnice.orig/kernel/sched.c	2005-05-11 12:20:45.000000000 +1000
+++ linux-2.6.12-rc4-smpnice/kernel/sched.c	2005-05-11 12:29:15.000000000 +1000
@@ -605,7 +605,7 @@ static int effective_prio(task_t *p)
 	return prio;
 }
 
-#ifdef CONFIG_SMP
+#ifdef CONFIG_SMP_NICE
 static inline void inc_prio_bias(runqueue_t *rq, int static_prio)
 {
 	rq->prio_bias += MAX_PRIO - static_prio;
@@ -615,7 +615,7 @@ static inline void dec_prio_bias(runqueu
 {
 	rq->prio_bias -= MAX_PRIO - static_prio;
 }
-#else
+#else	/* !CONFIG_SMP_NICE */
 static inline void inc_prio_bias(runqueue_t *rq, int static_prio)
 {
 }
@@ -925,6 +925,7 @@ static inline unsigned long source_load(
 	unsigned long cpu_load = rq->cpu_load,
 		load_now = rq->nr_running * SCHED_LOAD_SCALE;
 
+#ifdef CONFIG_SMP_NICE
 	if (idle == NOT_IDLE) {
 		/*
 		 * If we are balancing busy runqueues the load is biased by
@@ -933,6 +934,7 @@ static inline unsigned long source_load(
 		cpu_load *= rq->prio_bias;
 		load_now *= rq->prio_bias;
 	}
+#endif
 	return min(cpu_load, load_now);
 }
 
@@ -945,10 +947,12 @@ static inline unsigned long target_load(
 	unsigned long cpu_load = rq->cpu_load,
 		load_now = rq->nr_running * SCHED_LOAD_SCALE;
 
+#ifdef CONFIG_SMP_NICE
 	if (idle == NOT_IDLE) {
 		cpu_load *= rq->prio_bias;
 		load_now *= rq->prio_bias;
 	}
+#endif
 	return max(cpu_load, load_now);
 }
 
@@ -2255,7 +2259,7 @@ static inline void idle_balance(int cpu,
 static inline int wake_priority_sleeper(runqueue_t *rq)
 {
 	int ret = 0;
-#ifdef CONFIG_SCHED_SMT
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_SMP_NICE)
 	spin_lock(&rq->lock);
 	/*
 	 * If an SMT sibling task has been put to sleep for priority
@@ -2491,7 +2495,7 @@ out:
 	rebalance_tick(cpu, rq, NOT_IDLE);
 }
 
-#ifdef CONFIG_SCHED_SMT
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_SMP_NICE)
 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
 	struct sched_domain *sd = this_rq->sd;
@@ -2605,7 +2609,7 @@ out_unlock:
 		spin_unlock(&cpu_rq(i)->lock);
 	return ret;
 }
-#else
+#else	/* !(CONFIG_SCHED_SMT && CONFIG_SMP_NICE) */
 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
 }

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [SMP NICE] [PATCH 2/2] SCHED: Make SMP nice a config option
  2005-05-11  3:05         ` [SMP NICE] [PATCH 2/2] SCHED: Make SMP nice a config option Con Kolivas
@ 2005-05-11  7:20           ` Ingo Molnar
  2005-05-12 10:49             ` Con Kolivas
  0 siblings, 1 reply; 21+ messages in thread
From: Ingo Molnar @ 2005-05-11  7:20 UTC (permalink / raw)
  To: Con Kolivas
  Cc: Andrew Morton, Carlos Carvalho, ck, Markus Törnqvist, linux-kernel


ack on the first patch - but please don't make it a .config option!  
Either it's good enough so that everyone can use it, or it isn't.

	Ingo

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [SMP NICE] [PATCH 2/2] SCHED: Make SMP nice a config option
  2005-05-11  7:20           ` Ingo Molnar
@ 2005-05-12 10:49             ` Con Kolivas
  0 siblings, 0 replies; 21+ messages in thread
From: Con Kolivas @ 2005-05-12 10:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Carlos Carvalho, ck, Markus Törnqvist, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 348 bytes --]

On Wed, 11 May 2005 17:20, Ingo Molnar wrote:
> ack on the first patch - but please dont make it a .config option!
> Either it's good enough so that everyone can use it, or it isnt.

Makes a heck of a lot of sense to me. I guess I was just being paranoid / 
defensive for no good reason. The first patch alone should suffice.

Cheers,
Con

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [SMP NICE] [PATCH] SCHED: Implement nice support across physical cpus on SMP
  2005-05-11  3:04       ` [SMP NICE] [PATCH 1/2] SCHED: Implement " Con Kolivas
  2005-05-11  3:05         ` [SMP NICE] [PATCH 2/2] SCHED: Make SMP nice a config option Con Kolivas
@ 2005-05-16 11:33         ` Con Kolivas
  2005-05-16 18:31           ` Markus   Törnqvist
  2005-05-17 13:39           ` Carlos Carvalho
  1 sibling, 2 replies; 21+ messages in thread
From: Con Kolivas @ 2005-05-16 11:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Carlos Carvalho, ck, Ingo Molnar, Markus Törnqvist, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 526 bytes --]

On Wed, 11 May 2005 13:04, Con Kolivas wrote:
> Andrew please consider for inclusion in -mm

It looks like I missed my window of opportunity: the SMP balancing design 
has been restructured in the latest -mm again, so this patch will have to 
wait another generation. Carlos and Markus, you'll have to wait till that 
code settles down (if ever) before I (or someone else) rewrite it for it to 
get included in -mm and then mainline. The patch you currently have will 
work fine for 2.6.11* and 2.6.12*.

Cheers,
Con

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [SMP NICE] [PATCH] SCHED: Implement nice support across physical cpus on SMP
  2005-05-16 11:33         ` [SMP NICE] [PATCH] SCHED: Implement nice support across physical cpus on SMP Con Kolivas
@ 2005-05-16 18:31           ` Markus   Törnqvist
  2005-05-17 13:39           ` Carlos Carvalho
  1 sibling, 0 replies; 21+ messages in thread
From: Markus   Törnqvist @ 2005-05-16 18:31 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Andrew Morton, Carlos Carvalho, ck, Ingo Molnar, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 256 bytes --]

On Mon, May 16, 2005 at 09:33:09PM +1000, Con Kolivas wrote:
>
>It looks like I missed my window of opportunity and the SMP balancing design 
>has been restructured in latest -mm again so this patch will have to wait 

...incredible...

-- 
mjt


[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [SMP NICE] [PATCH] SCHED: Implement nice support across physical cpus on SMP
  2005-05-16 11:33         ` [SMP NICE] [PATCH] SCHED: Implement nice support across physical cpus on SMP Con Kolivas
  2005-05-16 18:31           ` Markus   Törnqvist
@ 2005-05-17 13:39           ` Carlos Carvalho
  2005-05-18 11:30             ` Markus   Törnqvist
  1 sibling, 1 reply; 21+ messages in thread
From: Carlos Carvalho @ 2005-05-17 13:39 UTC (permalink / raw)
  To: Con Kolivas
  Cc: AndrewMorton, ck, Ingo Molnar, Markus Törnqvist, linux-kernel

Con Kolivas (kernel@kolivas.org) wrote on 16 May 2005 21:33:
 >On Wed, 11 May 2005 13:04, Con Kolivas wrote:
 >> Andrew please consider for inclusion in -mm
 >
 >It looks like I missed my window of opportunity and the SMP balancing design 
 >has been restructured in latest -mm again so this patch will have to wait 
 >another generation. Carlos, Markus you'll have to wait till that code settles 
 >down (if ever) before I (or someone else) rewrites it for it to get included 
 >in -mm followed by mainline. The patch you currently have will work fine for 
 >2.6.11* and 2.6.12*

That's a pity. What's more important, however, is that this misfeature
of the scheduler should be corrected ASAP. Nice control is a
traditional UNIX characteristic, and it should have higher priority in
the patch inclusion queue than other scheduler improvements.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [SMP NICE] [PATCH] SCHED: Implement nice support across physical cpus on SMP
  2005-05-17 13:39           ` Carlos Carvalho
@ 2005-05-18 11:30             ` Markus   Törnqvist
  2005-05-18 13:45               ` Con Kolivas
  0 siblings, 1 reply; 21+ messages in thread
From: Markus   Törnqvist @ 2005-05-18 11:30 UTC (permalink / raw)
  To: Carlos Carvalho; +Cc: Con Kolivas, Andrew Morton, ck, Ingo Molnar, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 574 bytes --]

On Tue, May 17, 2005 at 10:39:28AM -0300, Carlos Carvalho wrote:
>That's a pity. What's more important however is that this misfeature
>of the scheduler should be corrected ASAP. The nice control is a
>traditional UNIX characteristic and it should have higher priority in
>the patch inclusion queue than other scheduler improvements.

Linux is not a traditional unix, but that doesn't mean the support
shouldn't exist.

My suggestion is that whoever broke the interface, rendering Con's
patch (which mingo accepted) useless, merge the patch.

Thanks!

-- 
mjt


[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [SMP NICE] [PATCH] SCHED: Implement nice support across physical cpus on SMP
  2005-05-18 11:30             ` Markus   Törnqvist
@ 2005-05-18 13:45               ` Con Kolivas
  2005-05-21  5:00                 ` Con Kolivas
  0 siblings, 1 reply; 21+ messages in thread
From: Con Kolivas @ 2005-05-18 13:45 UTC (permalink / raw)
  To: Markus Törnqvist
  Cc: Carlos Carvalho, Andrew Morton, ck, Ingo Molnar, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1379 bytes --]

On Wed, 18 May 2005 21:30, Markus Törnqvist wrote:
> On Tue, May 17, 2005 at 10:39:28AM -0300, Carlos Carvalho wrote:
> >That's a pity. What's more important however is that this misfeature
> >of the scheduler should be corrected ASAP. The nice control is a
> >traditional UNIX characteristic and it should have higher priority in
> >the patch inclusion queue than other scheduler improvements.
>
> Linux is not a traditional unix, but it doesn't mean the support
> shouldn't exist.
>
> My suggestion is that whoever broke the interface, rendering
> con's patch which mingo accepted useless, merge the patch.

Unrealistic. We are in a constant state of development, the direction of which 
is determined by who is hacking on what, when - as opposed to "we need this 
feature or fix now so let's direct all our efforts to that". Unfortunately the 
SMP balancing changes need more than one iteration of a mainline kernel 
before being incorporated, due to the potential for regression, so my guess is 
that it will be about 6 months before "SMP nice" becomes part of mainline if 
it is based on this new code. Of course my patch could go into mainline in its 
current form and the SMP balancing code in -mm could be modified with that in 
place rather than the other way around, but I just didn't get in early enough 
for that to happen ;)

Cheers,
Con

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH] SCHED: Implement nice support across physical cpus on SMP
  2005-05-18 13:45               ` Con Kolivas
@ 2005-05-21  5:00                 ` Con Kolivas
  2005-05-23  9:28                   ` [PATCH] SCHED: change_prio_bias_only_if_queued Con Kolivas
  0 siblings, 1 reply; 21+ messages in thread
From: Con Kolivas @ 2005-05-21  5:00 UTC (permalink / raw)
  To: Andrew Morton, Ingo Molnar
  Cc: ck, Markus Törnqvist, Carlos Carvalho, linux-kernel


[-- Attachment #1.1: Type: text/plain, Size: 169 bytes --]

OK, I've respun the SMP nice support for the cpu scheduler modifications that 
are in current -mm. Tested on 2.6.12-rc4-mm2 on a 4x box, and it seems to 
work fine.

Con
---


[-- Attachment #1.2: sched-implement_smp_nice_support.diff --]
[-- Type: text/x-diff, Size: 7254 bytes --]

This patch implements 'nice' support across physical cpus on SMP.

It introduces an extra runqueue variable prio_bias which is the sum of the
(inverted) static priorities of all the tasks on the runqueue. This is then used
to bias busy rebalancing between runqueues to obtain good distribution of tasks
of different nice values. By biasing the balancing only during busy rebalancing
we can avoid having any significant loss of throughput by not affecting the
carefully tuned idle balancing already in place. If all tasks are running at the
same nice level this code should also have minimal effect. The code is optimised
out in the !CONFIG_SMP case.

Signed-off-by: Con Kolivas <kernel@kolivas.org>

Index: linux-2.6.12-rc4-mm2/kernel/sched.c
===================================================================
--- linux-2.6.12-rc4-mm2.orig/kernel/sched.c	2005-05-21 11:14:49.000000000 +1000
+++ linux-2.6.12-rc4-mm2/kernel/sched.c	2005-05-21 14:25:07.000000000 +1000
@@ -208,6 +208,7 @@ struct runqueue {
 	 */
 	unsigned long nr_running;
 #ifdef CONFIG_SMP
+	unsigned long prio_bias;
 	unsigned long cpu_load[3];
 #endif
 	unsigned long long nr_switches;
@@ -657,13 +658,45 @@ static int effective_prio(task_t *p)
 	return prio;
 }
 
+#ifdef CONFIG_SMP
+static inline void inc_prio_bias(runqueue_t *rq, int static_prio)
+{
+	rq->prio_bias += MAX_PRIO - static_prio;
+}
+
+static inline void dec_prio_bias(runqueue_t *rq, int static_prio)
+{
+	rq->prio_bias -= MAX_PRIO - static_prio;
+}
+#else
+static inline void inc_prio_bias(runqueue_t *rq, int static_prio)
+{
+}
+
+static inline void dec_prio_bias(runqueue_t *rq, int static_prio)
+{
+}
+#endif
+
+static inline void inc_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running++;
+	inc_prio_bias(rq, p->static_prio);
+}
+
+static inline void dec_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running--;
+	dec_prio_bias(rq, p->static_prio);
+}
+
 /*
  * __activate_task - move a task to the runqueue.
  */
 static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task(p, rq->active);
-	rq->nr_running++;
+	inc_nr_running(p, rq);
 }
 
 /*
@@ -672,7 +705,7 @@ static inline void __activate_task(task_
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task_head(p, rq->active);
-	rq->nr_running++;
+	inc_nr_running(p, rq);
 }
 
 static void recalc_task_prio(task_t *p, unsigned long long now)
@@ -791,7 +824,7 @@ static void activate_task(task_t *p, run
  */
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	rq->nr_running--;
+	dec_nr_running(p, rq);
 	dequeue_task(p, p->array);
 	p->array = NULL;
 }
@@ -928,27 +961,54 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static inline unsigned long source_load(int cpu, int type)
+static inline unsigned long __source_load(int cpu, int type, enum idle_type idle)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+	unsigned long cpu_load = rq->cpu_load[type-1],
+		load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
+	if (idle == NOT_IDLE) {
+		/*
+		 * If we are balancing busy runqueues the load is biased by
+		 * priority to create 'nice' support across cpus.
+		 */
+		cpu_load *= rq->prio_bias;
+		load_now *= rq->prio_bias;
+	}
+
 	if (type == 0)
 		return load_now;
 
-	return min(rq->cpu_load[type-1], load_now);
+	return min(cpu_load, load_now);
+}
+
+static inline unsigned long source_load(int cpu, int type)
+{
+	return __source_load(cpu, type, NOT_IDLE);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu
  */
-static inline unsigned long target_load(int cpu, int type)
+static inline unsigned long __target_load(int cpu, int type, enum idle_type idle)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+	unsigned long cpu_load = rq->cpu_load[type-1],
+		load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
 	if (type == 0)
 		return load_now;
 
-	return max(rq->cpu_load[type-1], load_now);
+	if (idle == NOT_IDLE) {
+		cpu_load *= rq->prio_bias;
+		load_now *= rq->prio_bias;
+	}
+	return max(cpu_load, load_now);
+}
+
+static inline unsigned long target_load(int cpu, int type)
+{
+	return __target_load(cpu, type, NOT_IDLE);
 }
 
 /*
@@ -1389,7 +1449,7 @@ void fastcall wake_up_new_task(task_t * 
 				list_add_tail(&p->run_list, &current->run_list);
 				p->array = current->array;
 				p->array->nr_active++;
-				rq->nr_running++;
+				inc_nr_running(p, rq);
 			}
 			set_need_resched();
 		} else
@@ -1733,9 +1793,9 @@ void pull_task(runqueue_t *src_rq, prio_
 	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
 	dequeue_task(p, src_array);
-	src_rq->nr_running--;
+	dec_nr_running(p, src_rq);
 	set_task_cpu(p, this_cpu);
-	this_rq->nr_running++;
+	inc_nr_running(p, this_rq);
 	enqueue_task(p, this_array);
 	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
 				+ this_rq->timestamp_last_tick;
@@ -1909,9 +1969,9 @@ find_busiest_group(struct sched_domain *
 		for_each_cpu_mask(i, group->cpumask) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = target_load(i, load_idx);
+				load = __target_load(i, load_idx, idle);
 			else
-				load = source_load(i, load_idx);
+				load = __source_load(i, load_idx, idle);
 
 			avg_load += load;
 		}
@@ -2012,14 +2072,15 @@ out_balanced:
 /*
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
-static runqueue_t *find_busiest_queue(struct sched_group *group)
+static runqueue_t *find_busiest_queue(struct sched_group *group,
+	enum idle_type idle)
 {
 	unsigned long load, max_load = 0;
 	runqueue_t *busiest = NULL;
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
-		load = source_load(i, 0);
+		load = __source_load(i, 0, idle);
 
 		if (load > max_load) {
 			max_load = load;
@@ -2060,7 +2121,7 @@ static int load_balance(int this_cpu, ru
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group);
+	busiest = find_busiest_queue(group, idle);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
@@ -2168,7 +2229,7 @@ static int load_balance_newidle(int this
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group);
+	busiest = find_busiest_queue(group, NEWLY_IDLE);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
 		goto out_balanced;
@@ -3338,7 +3399,9 @@ void set_user_nice(task_t *p, long nice)
 	 * not SCHED_NORMAL:
 	 */
 	if (rt_task(p)) {
+		dec_prio_bias(rq, p->static_prio);
 		p->static_prio = NICE_TO_PRIO(nice);
+		inc_prio_bias(rq, p->static_prio);
 		goto out_unlock;
 	}
 	array = p->array;
@@ -3348,7 +3411,9 @@ void set_user_nice(task_t *p, long nice)
 	old_prio = p->prio;
 	new_prio = NICE_TO_PRIO(nice);
 	delta = new_prio - old_prio;
+	dec_prio_bias(rq, p->static_prio);
 	p->static_prio = NICE_TO_PRIO(nice);
+	inc_prio_bias(rq, p->static_prio);
 	p->prio += delta;
 
 	if (array) {

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH] SCHED: change_prio_bias_only_if_queued
  2005-05-21  5:00                 ` Con Kolivas
@ 2005-05-23  9:28                   ` Con Kolivas
  2005-05-23 10:07                     ` [PATCH] SCHED: account_rt_tasks_in_prio_bias Con Kolivas
  0 siblings, 1 reply; 21+ messages in thread
From: Con Kolivas @ 2005-05-23  9:28 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ingo Molnar, ck, Markus Törnqvist, Carlos Carvalho,
	linux-kernel, Peter Williams


[-- Attachment #1.1: Type: text/plain, Size: 338 bytes --]

On Sat, 21 May 2005 15:00, Con Kolivas wrote:
> Ok I've respun the smp nice support for the cpu scheduler modifications
> that are in current -mm. Tested on 2.6.12-rc4-mm2 on 4x and seems to work
> fine.

Thanks to Peter Williams for noting that, in set_user_nice, the prio_bias 
should only be changed if the task is actually queued.

Con
---


[-- Attachment #1.2: change_prio_bias_only_if_queued.diff --]
[-- Type: text/x-diff, Size: 1235 bytes --]

prio_bias should only be adjusted in set_user_nice if p is actually currently
queued.
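
To see why (a toy model, not kernel code): rq->prio_bias is meant to equal the
sum of MAX_PRIO - static_prio over the tasks currently queued, and a sleeping
task was already subtracted when it was deactivated, so the old unconditional
dec/inc pair debits a contribution that was never made.  Assuming MAX_PRIO =
140 and the usual nice 0 static_prio of 120:

	/* prio_bias_drift_demo.c - toy model of the problem this patch fixes */
	#include <stdio.h>

	#define MAX_PRIO	140

	int main(void)
	{
		/* One runqueue with a single queued task at static_prio 120 (nice 0). */
		unsigned long prio_bias = MAX_PRIO - 120;	/* 20 */

		/*
		 * A second nice 0 task is sleeping, so it contributed nothing to
		 * prio_bias.  Renicing it to 19 with the unconditional sequence
		 * would do:
		 */
		prio_bias -= MAX_PRIO - 120;	/* debit a contribution never made */
		prio_bias += MAX_PRIO - 139;	/* credit the new nice 19 weight */

		/*
		 * prio_bias is now 1 instead of 20: the still-queued nice 0 task
		 * has effectively been re-weighted, skewing busy rebalancing (and
		 * a larger debit could even wrap the unsigned counter).
		 */
		printf("prio_bias = %lu (expected 20)\n", prio_bias);
		return 0;
	}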

Signed-off-by: Con Kolivas <kernel@kolivas.org>

Index: linux-2.6.12-rc4-mm2/kernel/sched.c
===================================================================
--- linux-2.6.12-rc4-mm2.orig/kernel/sched.c	2005-05-21 14:25:07.000000000 +1000
+++ linux-2.6.12-rc4-mm2/kernel/sched.c	2005-05-23 19:23:10.000000000 +1000
@@ -3399,25 +3399,24 @@ void set_user_nice(task_t *p, long nice)
 	 * not SCHED_NORMAL:
 	 */
 	if (rt_task(p)) {
-		dec_prio_bias(rq, p->static_prio);
 		p->static_prio = NICE_TO_PRIO(nice);
-		inc_prio_bias(rq, p->static_prio);
 		goto out_unlock;
 	}
 	array = p->array;
-	if (array)
+	if (array) {
 		dequeue_task(p, array);
+		dec_prio_bias(rq, p->static_prio);
+	}
 
 	old_prio = p->prio;
 	new_prio = NICE_TO_PRIO(nice);
 	delta = new_prio - old_prio;
-	dec_prio_bias(rq, p->static_prio);
 	p->static_prio = NICE_TO_PRIO(nice);
-	inc_prio_bias(rq, p->static_prio);
 	p->prio += delta;
 
 	if (array) {
 		enqueue_task(p, array);
+		inc_prio_bias(rq, p->static_prio);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH] SCHED: account_rt_tasks_in_prio_bias
  2005-05-23  9:28                   ` [PATCH] SCHED: change_prio_bias_only_if_queued Con Kolivas
@ 2005-05-23 10:07                     ` Con Kolivas
  0 siblings, 0 replies; 21+ messages in thread
From: Con Kolivas @ 2005-05-23 10:07 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ingo Molnar, ck, Markus Törnqvist, Carlos Carvalho,
	linux-kernel, Peter Williams


[-- Attachment #1.1: Type: text/plain, Size: 542 bytes --]

On Mon, 23 May 2005 19:28, Con Kolivas wrote:
> On Sat, 21 May 2005 15:00, Con Kolivas wrote:
> > Ok I've respun the smp nice support for the cpu scheduler modifications
> > that are in current -mm. Tested on 2.6.12-rc4-mm2 on 4x and seems to work
> > fine.
>
> Thanks to Peter Williams for noting I should only change the prio_bias if
> the task is queued in set_user_nice.

And for completeness, real-time tasks' contribution to prio_bias should be 
based on their real-time priority level instead of their nice level.

Con
---


[-- Attachment #1.2: account_rt_tasks_in_prio_bias.diff --]
[-- Type: text/x-diff, Size: 1818 bytes --]

Real time tasks' effect on prio_bias should be based on their real time
priority level instead of their static_prio, which is based on nice.
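
As a rough illustration (a sketch assuming that, as in the 2.6 scheduler, an
rt task's p->prio is MAX_RT_PRIO - 1 - rt_priority while its static_prio still
reflects whatever nice value it carries):

	/* rt_prio_bias_demo.c - the two possible bias contributions of an rt task */
	#include <stdio.h>

	#define MAX_PRIO	140
	#define MAX_RT_PRIO	100

	int main(void)
	{
		/* A SCHED_FIFO task at rt priority 50, started from a nice 0 shell. */
		int prio = MAX_RT_PRIO - 1 - 50;	/* p->prio = 49 */
		int static_prio = 120;			/* nice 0 */

		/* Weighting by the rt priority reflects how heavy the task really is. */
		printf("bias from p->prio:        %d\n", MAX_PRIO - prio);		/* 91 */
		printf("bias from p->static_prio: %d\n", MAX_PRIO - static_prio);	/* 20 */
		return 0;
	}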

Signed-off-by: Con Kolivas <kernel@kolivas.org>

Index: linux-2.6.12-rc4-mm2/kernel/sched.c
===================================================================
--- linux-2.6.12-rc4-mm2.orig/kernel/sched.c	2005-05-23 19:23:10.000000000 +1000
+++ linux-2.6.12-rc4-mm2/kernel/sched.c	2005-05-23 19:59:45.000000000 +1000
@@ -659,21 +659,21 @@ static int effective_prio(task_t *p)
 }
 
 #ifdef CONFIG_SMP
-static inline void inc_prio_bias(runqueue_t *rq, int static_prio)
+static inline void inc_prio_bias(runqueue_t *rq, int prio)
 {
-	rq->prio_bias += MAX_PRIO - static_prio;
+	rq->prio_bias += MAX_PRIO - prio;
 }
 
-static inline void dec_prio_bias(runqueue_t *rq, int static_prio)
+static inline void dec_prio_bias(runqueue_t *rq, int prio)
 {
-	rq->prio_bias -= MAX_PRIO - static_prio;
+	rq->prio_bias -= MAX_PRIO - prio;
 }
 #else
-static inline void inc_prio_bias(runqueue_t *rq, int static_prio)
+static inline void inc_prio_bias(runqueue_t *rq, int prio)
 {
 }
 
-static inline void dec_prio_bias(runqueue_t *rq, int static_prio)
+static inline void dec_prio_bias(runqueue_t *rq, int prio)
 {
 }
 #endif
@@ -681,13 +681,19 @@ static inline void dec_prio_bias(runqueu
 static inline void inc_nr_running(task_t *p, runqueue_t *rq)
 {
 	rq->nr_running++;
-	inc_prio_bias(rq, p->static_prio);
+	if (rt_task(p))
+		inc_prio_bias(rq, p->prio);
+	else
+		inc_prio_bias(rq, p->static_prio);
 }
 
 static inline void dec_nr_running(task_t *p, runqueue_t *rq)
 {
 	rq->nr_running--;
-	dec_prio_bias(rq, p->static_prio);
+	if (rt_task(p))
+		dec_prio_bias(rq, p->prio);
+	else
+		dec_prio_bias(rq, p->static_prio);
 }
 
 /*

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2005-05-23 10:07 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-05-07 13:42 [PATCH] implement nice support across physical cpus on SMP Con Kolivas
2005-05-07 17:59 ` Carlos Carvalho
2005-05-07 21:45   ` Con Kolivas
2005-05-09 11:24 ` Markus   Törnqvist
2005-05-09 11:28   ` [ck] " Markus   Törnqvist
2005-05-09 11:47   ` Con Kolivas
2005-05-09 18:55     ` Markus   Törnqvist
2005-05-09 23:54     ` Carlos Carvalho
2005-05-11  2:56       ` Con Kolivas
2005-05-11  3:04       ` [SMP NICE] [PATCH 1/2] SCHED: Implement " Con Kolivas
2005-05-11  3:05         ` [SMP NICE] [PATCH 2/2] SCHED: Make SMP nice a config option Con Kolivas
2005-05-11  7:20           ` Ingo Molnar
2005-05-12 10:49             ` Con Kolivas
2005-05-16 11:33         ` [SMP NICE] [PATCH] SCHED: Implement nice support across physical cpus on SMP Con Kolivas
2005-05-16 18:31           ` Markus   Törnqvist
2005-05-17 13:39           ` Carlos Carvalho
2005-05-18 11:30             ` Markus   Törnqvist
2005-05-18 13:45               ` Con Kolivas
2005-05-21  5:00                 ` Con Kolivas
2005-05-23  9:28                   ` [PATCH] SCHED: change_prio_bias_only_if_queued Con Kolivas
2005-05-23 10:07                     ` [PATCH] SCHED: account_rt_tasks_in_prio_bias Con Kolivas
