linux-kernel.vger.kernel.org archive mirror
* [patch] scheduler fix for 1cpu/node case
@ 2003-07-28 19:16 Erich Focht
  2003-07-28 19:55 ` Martin J. Bligh
  2003-07-31 15:05 ` Martin J. Bligh
  0 siblings, 2 replies; 25+ messages in thread
From: Erich Focht @ 2003-07-28 19:16 UTC (permalink / raw)
  To: linux-kernel, LSE; +Cc: Martin J. Bligh, Andi Kleen, torvalds

[-- Attachment #1: Type: text/plain, Size: 1686 bytes --]

Hi,

after talking to several people at OLS about the current NUMA
scheduler the conclusion was:
(1) it sucks (for particular workloads),
(2) on x86_64 (embarrassingly simple NUMA) it's useless, goto (1).

Fact is that the current separation of local and global balancing,
where global balancing is done only in the timer interrupt at a fixed
rate is way too inflexible. A CPU going idle inside a well balanced
node will stay idle for a while even if there's a lot of work to
do. Especially in the corner case of one CPU per node this is
condemning that CPU to idleness for at least 5 ms. So x86_64 platforms
(but not only those!) suffer and wish to switch off the NUMA
scheduler while keeping NUMA memory management on.
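
(For reference, the 5 ms figure follows directly from the current tick
definitions in sched.c, assuming HZ=1000:

    IDLE_NODE_REBALANCE_TICK = IDLE_REBALANCE_TICK * 5
                             = (HZ/1000 ?: 1) * 5 jiffies
                             = 5 ms at HZ=1000 (50 ms at HZ=100),

so an idle CPU sitting alone in its node gets a chance to look outside
that node at most every 5 ms.)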

The attached patch is a simple solution which
- solves the 1 CPU / node problem,
- lets other systems behave (almost) as before,
- opens the way to other optimisations like multi-level node
  hierarchies (by tuning the retry rate)
- simplifies the NUMA scheduler and deletes more lines of code than it
  adds.

The timer interrupt based global rebalancing might appear to be a
simple and good idea, but it takes a lot of flexibility away from the
scheduler. In the patch the global rebalancing is done after a
certain number of failed attempts to locally balance. The number of
attempts is proportional to the number of CPUs in the current
node. With only 1 CPU in the current node the scheduler doesn't even
try to balance locally; it wouldn't make sense anyway. Of course one
could instead set IDLE_NODE_REBALANCE_TICK = IDLE_REBALANCE_TICK, but
this is uglier (IMHO) and only helps when all nodes have just one CPU.
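
To make the effect of the retry rule concrete (numbers read off the
attached patch, assuming HZ=1000):
- 1 CPU/node: retries = 0, so every rebalance attempt goes straight to
  find_busiest_node(); there is no local-only attempt at all.
- 4 CPUs/node: an idle CPU gets retries = 3, i.e. it widens the mask
  after ~3 failed local attempts at the 1 ms IDLE_REBALANCE_TICK; a
  busy CPU gets retries = 3 >> 1 = 1 on the 200 ms BUSY_REBALANCE_TICK
  scale.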

Please consider this for inclusion.

Thanks,
Erich




[-- Attachment #2: 1cpufix-lb-2.6.0t1.patch --]
[-- Type: text/x-diff, Size: 4255 bytes --]

diff -urNp 2.6.0t1/kernel/sched.c 2.6.0t1-1cpufix/kernel/sched.c
--- 2.6.0t1/kernel/sched.c	2003-07-14 05:37:14.000000000 +0200
+++ 2.6.0t1-1cpufix/kernel/sched.c	2003-07-28 05:32:28.000000000 +0200
@@ -164,6 +164,7 @@ struct runqueue {
 	prio_array_t *active, *expired, arrays[2];
 	int prev_cpu_load[NR_CPUS];
 #ifdef CONFIG_NUMA
+	unsigned int nr_lb_failed;
 	atomic_t *node_nr_running;
 	int prev_node_load[MAX_NUMNODES];
 #endif
@@ -856,6 +857,35 @@ static int find_busiest_node(int this_no
 	return node;
 }
 
+/*
+ * Decide whether the scheduler should balance locally (inside the same node)
+ * or globally depending on the number of failed local balance attempts.
+ * The number of failed local balance attempts depends on the number of cpus
+ * in the current node. In case it's just one, go immediately for global
+ * balancing. On a busy cpu the number of retries is smaller.
+ */
+static inline unsigned long cpus_to_balance(int this_cpu, runqueue_t *this_rq)
+{
+	int node, retries, this_node = cpu_to_node(this_cpu);
+
+	retries = nr_cpus_node(this_node) - 1;
+	if (this_rq->curr != this_rq->idle)
+		retries >>= 1;
+	if (this_rq->nr_lb_failed >= retries) {
+		node = find_busiest_node(this_node);
+		this_rq->nr_lb_failed = 0;
+		if (node >= 0)
+			return (node_to_cpu_mask(node) | (1UL << this_cpu));
+	}
+	return node_to_cpu_mask(this_node);
+}
+
+#else /* !CONFIG_NUMA */
+
+static inline unsigned long cpus_to_balance(int this_cpu, runqueue_t *this_rq)
+{
+	return cpu_online_map;
+}
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_SMP
@@ -960,6 +990,12 @@ static inline runqueue_t *find_busiest_q
 		busiest = NULL;
 	}
 out:
+#ifdef CONFIG_NUMA
+	if (!busiest)
+		this_rq->nr_lb_failed++;
+	else
+		this_rq->nr_lb_failed = 0;
+#endif
 	return busiest;
 }
 
@@ -995,7 +1031,7 @@ static inline void pull_task(runqueue_t 
  * We call this with the current runqueue locked,
  * irqs disabled.
  */
-static void load_balance(runqueue_t *this_rq, int idle, unsigned long cpumask)
+static void load_balance(runqueue_t *this_rq, int idle)
 {
 	int imbalance, idx, this_cpu = smp_processor_id();
 	runqueue_t *busiest;
@@ -1003,7 +1039,8 @@ static void load_balance(runqueue_t *thi
 	struct list_head *head, *curr;
 	task_t *tmp;
 
-	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
+	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance,
+				     cpus_to_balance(this_cpu, this_rq));
 	if (!busiest)
 		goto out;
 
@@ -1085,29 +1122,9 @@ out:
  */
 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
 #define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
-#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5)
-#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
-
-#ifdef CONFIG_NUMA
-static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
-{
-	int node = find_busiest_node(cpu_to_node(this_cpu));
-	unsigned long cpumask, this_cpumask = 1UL << this_cpu;
-
-	if (node >= 0) {
-		cpumask = node_to_cpumask(node) | this_cpumask;
-		spin_lock(&this_rq->lock);
-		load_balance(this_rq, idle, cpumask);
-		spin_unlock(&this_rq->lock);
-	}
-}
-#endif
 
 static void rebalance_tick(runqueue_t *this_rq, int idle)
 {
-#ifdef CONFIG_NUMA
-	int this_cpu = smp_processor_id();
-#endif
 	unsigned long j = jiffies;
 
 	/*
@@ -1119,24 +1136,16 @@ static void rebalance_tick(runqueue_t *t
 	 * are not balanced.)
 	 */
 	if (idle) {
-#ifdef CONFIG_NUMA
-		if (!(j % IDLE_NODE_REBALANCE_TICK))
-			balance_node(this_rq, idle, this_cpu);
-#endif
 		if (!(j % IDLE_REBALANCE_TICK)) {
 			spin_lock(&this_rq->lock);
-			load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
+			load_balance(this_rq, idle);
 			spin_unlock(&this_rq->lock);
 		}
 		return;
 	}
-#ifdef CONFIG_NUMA
-	if (!(j % BUSY_NODE_REBALANCE_TICK))
-		balance_node(this_rq, idle, this_cpu);
-#endif
 	if (!(j % BUSY_REBALANCE_TICK)) {
 		spin_lock(&this_rq->lock);
-		load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
+		load_balance(this_rq, idle);
 		spin_unlock(&this_rq->lock);
 	}
 }
@@ -1306,7 +1315,7 @@ need_resched:
 pick_next_task:
 	if (unlikely(!rq->nr_running)) {
 #ifdef CONFIG_SMP
-		load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
+		load_balance(rq, 1);
 		if (rq->nr_running)
 			goto pick_next_task;
 #endif


* Re: [patch] scheduler fix for 1cpu/node case
  2003-07-28 19:16 [patch] scheduler fix for 1cpu/node case Erich Focht
@ 2003-07-28 19:55 ` Martin J. Bligh
  2003-07-28 20:18   ` Erich Focht
  2003-07-31 15:05 ` Martin J. Bligh
  1 sibling, 1 reply; 25+ messages in thread
From: Martin J. Bligh @ 2003-07-28 19:55 UTC (permalink / raw)
  To: Erich Focht, linux-kernel, LSE; +Cc: Andi Kleen, torvalds

> after talking to several people at OLS about the current NUMA
> scheduler the conclusion was:
> (1) it sucks (for particular workloads),
> (2) on x86_64 (embarrassingly simple NUMA) it's useless, goto (1).

I really feel there's no point in a NUMA scheduler for the Hammer
style architectures. A config option to turn it off would seem like
a simpler way to go, unless people can see some advantage of the 
full NUMA code? 

The interesting thing is probably whether we want balance on exec
or not ... but that probably applies to UMA SMP as well ...
 
> Fact is that the current separation of local and global balancing,
> where global balancing is done only in the timer interrupt at a fixed
> rate is way too inflexible. A CPU going idle inside a well balanced
> node will stay idle for a while even if there's a lot of work to
> do. Especially in the corner case of one CPU per node this is
> condemning that CPU to idleness for at least 5 ms. 

Surely it'll hit the idle local balancer and rebalance within the node? 
Or are you thinking of a case with 3 tasks on a 4 cpu/node system?

> So x86_64 platforms
> (but not only those!) suffer and wish to switch off the NUMA
> scheduler while keeping NUMA memory management on.

Right - I have a patch to make it a config option (CONFIG_NUMA_SCHED) 
... I'll feed that upstream this week.
 
> The attached patch is a simple solution which
> - solves the 1 CPU / node problem,
> - lets other systems behave (almost) as before,
> - opens the way to other optimisations like multi-level node
>   hierarchies (by tuning the retry rate)
> - simplifies the NUMA scheduler and deletes more lines of code than it
>   adds.

Looks simple, I'll test it out.

> The timer interrupt based global rebalancing might appear to be a
> simple and good idea, but it takes a lot of flexibility away from the
> scheduler. In the patch the global rebalancing is done after a
> certain number of failed attempts to locally balance. The number of
> attempts is proportional to the number of CPUs in the current
> node.

Seems like a good plan.

M.



* Re: [patch] scheduler fix for 1cpu/node case
  2003-07-28 19:55 ` Martin J. Bligh
@ 2003-07-28 20:18   ` Erich Focht
  2003-07-28 20:37     ` Martin J. Bligh
  2003-07-29 14:41     ` Andi Kleen
  0 siblings, 2 replies; 25+ messages in thread
From: Erich Focht @ 2003-07-28 20:18 UTC (permalink / raw)
  To: Martin J. Bligh, linux-kernel, LSE; +Cc: Andi Kleen, torvalds

On Monday 28 July 2003 21:55, Martin J. Bligh wrote:
> > after talking to several people at OLS about the current NUMA
> > scheduler the conclusion was:
> > (1) it sucks (for particular workloads),
> > (2) on x86_64 (embarrassingly simple NUMA) it's useless, goto (1).
>
> I really feel there's no point in a NUMA scheduler for the Hammer
> style architectures. A config option to turn it off would seem like
> a simpler way to go, unless people can see some advantage of the
> full NUMA code?

But the Hammer is a NUMA architecture and a working NUMA scheduler
should be flexible enough to deal with it. And: the corner case of 1
CPU per node is possible also on any other NUMA platform, when in some
of the nodes (or even just one) only one CPU is configured in. Solving
that problem automatically gives the Hammer what it needs.

> > Fact is that the current separation of local and global balancing,
> > where global balancing is done only in the timer interrupt at a fixed
> > rate is way too inflexible. A CPU going idle inside a well balanced
> > node will stay idle for a while even if there's a lot of work to
> > do. Especially in the corner case of one CPU per node this is
> > condemning that CPU to idleness for at least 5 ms.
>
> Surely it'll hit the idle local balancer and rebalance within the node?
> Or are you thinking of a case with 3 tasks on a 4 cpu/node system?

No, no, the local load balancer just doesn't make sense now if you
have one cpu per node. It is called although it will never find
another cpu in the own cell to steal from. There just is nothing
else... (or did I misunderstand your comment?)

> > So x86_64 platforms
> > (but not only those!) suffer and wish to switch off the NUMA
> > scheduler while keeping NUMA memory management on.
>
> Right - I have a patch to make it a config option (CONFIG_NUMA_SCHED)
> ... I'll feed that upstream this week.

That's one way, but the proposed patch just solves the problem (in a
more general way, also for other NUMA cases). If you deconfigure NUMA
for a NUMA platform, we'll have problems switching it back on when
adding smarter things like node affine or homenode extensions.

> > The attached patch is a simple solution which
> > - solves the 1 CPU / node problem,
> > - lets other systems behave (almost) as before,
> > - opens the way to other optimisations like multi-level node
> >   hierarchies (by tuning the retry rate)
> > - simplifies the NUMA scheduler and deletes more lines of code than it
> >   adds.
>
> Looks simple, I'll test it out.

Great! Thanks!

Erich




* Re: [patch] scheduler fix for 1cpu/node case
  2003-07-28 20:18   ` Erich Focht
@ 2003-07-28 20:37     ` Martin J. Bligh
  2003-07-29  2:24       ` Andrew Theurer
  2003-07-29 10:08       ` Erich Focht
  2003-07-29 14:41     ` Andi Kleen
  1 sibling, 2 replies; 25+ messages in thread
From: Martin J. Bligh @ 2003-07-28 20:37 UTC (permalink / raw)
  To: Erich Focht, linux-kernel, LSE; +Cc: Andi Kleen, torvalds

> But the Hammer is a NUMA architecture and a working NUMA scheduler
> should be flexible enough to deal with it. And: the corner case of 1
> CPU per node is possible also on any other NUMA platform, when in some
> of the nodes (or even just one) only one CPU is configured in. Solving
> that problem automatically gives the Hammer what it needs.

OK, well what we have now is a "multi-cpu-per-node-numa-scheduler" if
you really want to say all that ;-)

The question is "does Hammer benefit from the additional complexity"?
I'm guessing not ... if so, then yes, it's worth fixing. If not, it
would seem better to just leave it at SMP for the scheduler stuff.
Simpler, more shared code with common systems.
 
>> > Fact is that the current separation of local and global balancing,
>> > where global balancing is done only in the timer interrupt at a fixed
>> > rate is way too inflexible. A CPU going idle inside a well balanced
>> > node will stay idle for a while even if there's a lot of work to
>> > do. Especially in the corner case of one CPU per node this is
>> > condemning that CPU to idleness for at least 5 ms.
>> 
>> Surely it'll hit the idle local balancer and rebalance within the node?
>> Or are you thinking of a case with 3 tasks on a 4 cpu/node system?
> 
> No, no, the local load balancer just doesn't make sense now if you
> have one cpu per node. It is called although it will never find
> another cpu in the own cell to steal from. There just is nothing
> else... (or did I misunderstand your comment?)

Right, I realise that the 1 cpu per node case is broken. However, doesn't
this also affect the multi-cpu per node case in the manner I suggested
above? So even if we turn off NUMA scheduler for Hammer, this still 
needs fixing, right?
 
> That's one way, but the proposed patch just solves the problem (in a
> more general way, also for other NUMA cases). If you deconfigure NUMA
> for a NUMA platform, we'll have problems switching it back on when
> adding smarter things like node affine or homenode extensions.

Yeah, maybe. I'd rather have the code in hand that needs it, but still ...
if Andi's happy that fixes it, then we're OK.

M.


* Re: [patch] scheduler fix for 1cpu/node case
  2003-07-28 20:37     ` Martin J. Bligh
@ 2003-07-29  2:24       ` Andrew Theurer
  2003-07-29 10:08         ` Erich Focht
  2003-08-13 20:49         ` Bill Davidsen
  2003-07-29 10:08       ` Erich Focht
  1 sibling, 2 replies; 25+ messages in thread
From: Andrew Theurer @ 2003-07-29  2:24 UTC (permalink / raw)
  To: Martin J. Bligh, Erich Focht, linux-kernel, LSE; +Cc: Andi Kleen, torvalds

On Monday 28 July 2003 15:37, Martin J. Bligh wrote:
> > But the Hammer is a NUMA architecture and a working NUMA scheduler
> > should be flexible enough to deal with it. And: the corner case of 1
> > CPU per node is possible also on any other NUMA platform, when in some
> > of the nodes (or even just one) only one CPU is configured in. Solving
> > that problem automatically gives the Hammer what it needs.

I am going to ask a silly question: do we have any data showing this really is 
a problem on AMD?  I would think, even if we have an idle cpu, sometimes a 
little delay on task migration (on NUMA) may not be a bad thing.   If it is 
too long, can we just make the rebalance ticks arch specific?  

I'd much rather have info related to the properties of the NUMA arch than 
something that makes decisions based on nr_cpus_node().  For example, we may 
want to inter-node balance as much or more often on ppc64 than even AMD, but 
it has 8 cpus per node.  On this patch it would has the lowest inter-node 
balance frequency, even though it probably has one of the lowest latencies 
between nodes and highest throughput interconnects.  

> OK, well what we have now is a "multi-cpu-per-node-numa-scheduler" if
> you really want to say all that ;-)
>
> The question is "does Hammer benefit from the additional complexity"?
> I'm guessing not ... if so, then yes, it's worth fixing. If not, it
> would seem better to just leave it at SMP for the scheduler stuff.
> Simpler, more shared code with common systems.
>
> >> > Fact is that the current separation of local and global balancing,
> >> > where global balancing is done only in the timer interrupt at a fixed
> >> > rate is way too inflexible. A CPU going idle inside a well balanced
> >> > node will stay idle for a while even if there's a lot of work to
> >> > do. Especially in the corner case of one CPU per node this is
> >> > condemning that CPU to idleness for at least 5 ms.
> >>
> >> Surely it'll hit the idle local balancer and rebalance within the node?
> >> Or are you thinking of a case with 3 tasks on a 4 cpu/node system?
> >
> > No, no, the local load balancer just doesn't make sense now if you
> > have one cpu per node. It is called although it will never find
> > another cpu in the own cell to steal from. There just is nothing
> > else... (or did I misunderstand your comment?)
>
> Right, I realise that the 1 cpu per node case is broken. However, doesn't
> this also affect the multi-cpu per node case in the manner I suggested
> above? So even if we turn off NUMA scheduler for Hammer, this still
> needs fixing, right?

Maybe so, but if we start making idle rebalance more aggressive, I think we 
would need to make CAN_MIGRATE more restrictive, taking memory placement of 
the tasks into account.  On AMD with interleaved memory allocation, tasks 
would move very easily, since their memory is spread out anyway.  On "home 
node" or node-local policy, we may not move a task (or maybe not on the first 
attempt), even if there is an idle cpu in another node.  
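
Something along these lines is the kind of check I mean (purely
illustrative; the per-node page count is hypothetical, nothing like it
exists in the tree today):

	/* on top of the existing CAN_MIGRATE checks: only allow a
	 * cross-node steal if the destination node already holds a fair
	 * share of the task's memory (p->node_pages[] is made up) */
	static inline int memory_allows_migration(task_t *p, int src_node, int dst_node)
	{
		return p->node_pages[dst_node] >= p->node_pages[src_node] / 2;
	}

With interleaved allocation the two counts come out about equal, so
tasks keep moving easily; with node-local allocation the check would
hold a task back, at least on the first attempt.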

>> That's one way, but the proposed patch just solves the problem (in a
>> more general way, also for other NUMA cases). If you deconfigure NUMA
>> for a NUMA platform, we'll have problems switching it back on when
>> adding smarter things like node affine or homenode extensions.
>
>Yeah, maybe. I'd rather have the code in hand that needs it, but still ...
>if Andi's happy that fixes it, then we're OK.

Personally, I'd like to see all systems use NUMA sched, non NUMA systems being 
a single node (no policy difference from non-numa sched), allowing us to 
remove all NUMA ifdefs.  I think the code would be much more readable.

-Andrew Theurer


* Re: [patch] scheduler fix for 1cpu/node case
  2003-07-28 20:37     ` Martin J. Bligh
  2003-07-29  2:24       ` Andrew Theurer
@ 2003-07-29 10:08       ` Erich Focht
  1 sibling, 0 replies; 25+ messages in thread
From: Erich Focht @ 2003-07-29 10:08 UTC (permalink / raw)
  To: Martin J. Bligh, linux-kernel, LSE; +Cc: Andi Kleen, torvalds

On Monday 28 July 2003 22:37, Martin J. Bligh wrote:
> > But the Hammer is a NUMA architecture and a working NUMA scheduler
> > should be flexible enough to deal with it. And: the corner case of 1
> > CPU per node is possible also on any other NUMA platform, when in some
> > of the nodes (or even just one) only one CPU is configured in. Solving
> > that problem automatically gives the Hammer what it needs.
>
> OK, well what we have now is a "multi-cpu-per-node-numa-scheduler" if
> you really want to say all that ;-)

And with the fix posted it will just be called "numa-scheduler". It's
a simplification, as you noticed.

> The question is "does Hammer benefit from the additional complexity"?
> I'm guessing not ... if so, then yes, it's worth fixing. If not, it
> would seem better to just leave it at SMP for the scheduler stuff.
> Simpler, more shared code with common systems.

Hammer will just go for the other nodes, as it should, when its own
node is free. Then you can benefit from NUMA improvements in the load
calculation and later from smarter distribution across the
nodes. Reverting to SMP is a quick fix, but for improvements we'll have
to get back to NUMA. So why not fix it now and stay with NUMA?

> > No, no, the local load balancer just doesn't make sense now if you
> > have one cpu per node. It is called although it will never find
> > another cpu in the own cell to steal from. There just is nothing
> > else... (or did I misunderstand your comment?)
>
> Right, I realise that the 1 cpu per node case is broken. However, doesn't
> this also affect the multi-cpu per node case in the manner I suggested
> above? So even if we turn off NUMA scheduler for Hammer, this still
> needs fixing, right?

Yes.

> > That's one way, but the proposed patch just solves the problem (in a
> > more general way, also for other NUMA cases). If you deconfigure NUMA
> > for a NUMA platform, we'll have problems switching it back on when
> > adding smarter things like node affine or homenode extensions.
>
> Yeah, maybe. I'd rather have the code in hand that needs it, but still ...
> if Andi's happy that fixes it, then we're OK.

http://marc.theaimsgroup.com/?l=linux-kernel&m=105854610325226&w=2
Andi mentioned in his talk that he intends to follow that path, too.
We'll have to experiment a while with the ideas we currently have to
get the initial load balancing / dynamic homenode issue solved; there's
no "golden way" right now. I'll post the stuff I mentioned in my talk
soon, but want to gain some experience with it first.

Regards,
Erich




* Re: [patch] scheduler fix for 1cpu/node case
  2003-07-29  2:24       ` Andrew Theurer
@ 2003-07-29 10:08         ` Erich Focht
  2003-07-29 13:33           ` [Lse-tech] " Andrew Theurer
  2003-07-29 14:27           ` Martin J. Bligh
  2003-08-13 20:49         ` Bill Davidsen
  1 sibling, 2 replies; 25+ messages in thread
From: Erich Focht @ 2003-07-29 10:08 UTC (permalink / raw)
  To: habanero, Martin J. Bligh, linux-kernel, LSE; +Cc: Andi Kleen, torvalds

On Tuesday 29 July 2003 04:24, Andrew Theurer wrote:
> On Monday 28 July 2003 15:37, Martin J. Bligh wrote:
> > > But the Hammer is a NUMA architecture and a working NUMA scheduler
> > > should be flexible enough to deal with it. And: the corner case of 1
> > > CPU per node is possible also on any other NUMA platform, when in some
> > > of the nodes (or even just one) only one CPU is configured in. Solving
> > > that problem automatically gives the Hammer what it needs.
>
> I am going to ask a silly question, do we have any data showing this really
> is a problem on AMD?  I would think, even if we have an idle cpu, sometimes
> a little delay on task migration (on NUMA) may not be a bad thing.   If it
> is too long, can we just make the rebalance ticks arch specific?

The fact that global rebalances are done only in the timer interrupt
is simply bad! It complicates rebalance_tick() and wastes the
opportunity to get feedback from the failed local balance attempts.

If you want data supporting my assumptions: Ted Ts'o's talk at OLS
shows the necessity to rebalance ASAP (even in try_to_wake_up). There
are plenty of arguments towards this, starting with the steal delay
parameter scans from the early days of multi-queue schedulers (Davide
Libenzi), over my experiments with NUMA schedulers and the observation
of Andi Kleen that on Opteron you better run from the wrong CPU than
wait (if the task returns to the right cpu, all's fine anyway).

> I'd much rather have info related to the properties of the NUMA arch than
> something that makes decisions based on nr_cpus_node().  For example, we
> may want to inter-node balance as much or more often on ppc64 than even
> AMD, but it has 8 cpus per node.  On this patch it would have the lowest
> inter-node balance frequency, even though it probably has one of the lowest
> latencies between nodes and highest throughput interconnects.

We can still discuss the formula. Currently there's a bug in the
scheduler and the corner case of 1 cpu/node is just broken. The
function local_balance_retries(attempts, cpus_in_this_node) must
return 0 for cpus_in_this_node=1 and should return larger values for
larger cpus_in_this_node. To have an upper limit is fine, but maybe
not necessary.
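
Something like this is what I have in mind (just a sketch; the posted
patch uses nr_cpus_node() - 1 without a cap, the upper limit here only
illustrates the optional part):

	/* failed local attempts to allow before widening the balance mask */
	static inline int local_balance_retries(int cpus_in_this_node)
	{
		/* one CPU in the node: local balancing can never succeed */
		if (cpus_in_this_node <= 1)
			return 0;
		/* more CPUs: allow more local attempts, optionally capped */
		if (cpus_in_this_node - 1 > 8)
			return 8;	/* arbitrary upper limit */
		return cpus_in_this_node - 1;
	}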

Regarding the ppc64 interconnect, I'm glad that you said "probably"
and "one of the ...". No need to point you to better ones ;-)

> > Right, I realise that the 1 cpu per node case is broken. However, doesn't
> > this also affect the multi-cpu per node case in the manner I suggested
> > above? So even if we turn off NUMA scheduler for Hammer, this still
> > needs fixing, right?
>
> Maybe so, but if we start making idle rebalance more aggressive, I think we
> would need to make CAN_MIGRATE more restrictive, taking memory placement of
> the tasks in to account.  On AMD with interleaved memory allocation, tasks
> would move very easily, since their memory is spread out anyway.  On "home
> node" or node-local policy, we may not move a task (or maybe not on the
> first attempt), even if there is an idle cpu in another node.

Aehm, that's another story and I'm sure we will fix that too. There
are a few ideas around. But you shouldn't expect to solve all problems
at once, after all optimal NUMA scheduling can still be considered a
research area.

> Personally, I'd like to see all systems use NUMA sched, non NUMA systems
> being a single node (no policy difference from non-numa sched), allowing us
> to remove all NUMA ifdefs.  I think the code would be much more readable.

:-) Then you expect that everybody who reads the scheduler code knows
what NUMA stands for and what it means? Pretty optimistic, but yes,
I'd like that, too.

Erich




* Re: [Lse-tech] Re: [patch] scheduler fix for 1cpu/node case
  2003-07-29 10:08         ` Erich Focht
@ 2003-07-29 13:33           ` Andrew Theurer
  2003-07-30 15:23             ` Erich Focht
  2003-07-29 14:27           ` Martin J. Bligh
  1 sibling, 1 reply; 25+ messages in thread
From: Andrew Theurer @ 2003-07-29 13:33 UTC (permalink / raw)
  To: Erich Focht, Martin J. Bligh, linux-kernel, LSE; +Cc: Andi Kleen, torvalds

[-- Attachment #1: Type: text/plain, Size: 5667 bytes --]

On Tuesday 29 July 2003 05:08, Erich Focht wrote:
> On Tuesday 29 July 2003 04:24, Andrew Theurer wrote:
> > On Monday 28 July 2003 15:37, Martin J. Bligh wrote:
> > > > But the Hammer is a NUMA architecture and a working NUMA scheduler
> > > > should be flexible enough to deal with it. And: the corner case of 1
> > > > CPU per node is possible also on any other NUMA platform, when in
> > > > some of the nodes (or even just one) only one CPU is configured in.
> > > > Solving that problem automatically gives the Hammer what it needs.
> >
> > I am going to ask a silly question, do we have any data showing this
> > really is a problem on AMD?  I would think, even if we have an idle cpu,
> > sometimes a little delay on task migration (on NUMA) may not be a bad
> > thing.   If it is too long, can we just make the rebalance ticks arch
> > specific?
>
> The fact that global rebalances are done only in the timer interrupt
> is simply bad! 

Even with this patch it still seems that most balances are still timer based, 
because we still call load_balance in rebalance_tick.  Granted, we may 
inter-node balance more often, well, maybe less often since 
node_busy_rebalance_tick was busy_rebalance_tick*2.  I do see the advantage 
of doing this at idle, but idle only; that's why I'd be more inclined toward 
only a much more aggressive idle rebalance.

> It complicates rebalance_tick() and wastes the
> opportunity to get feedback from the failed local balance attempts.

What does "failed" really mean?  To me, when *busiest=null, that means we 
passed, the node itself is probably balanced, and there's nothing to do.  It 
gives no indication at all of the global load [im]balance.  Shouldn't the 
thing we are looking for be the imbalance among node_nr_running[]?  Would it 
make sense to go forward with a global balance based on that only?
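
Something like this is what I'm thinking of (untested sketch; it just
reuses the existing per-node running counters, and the 25% margin is
pulled out of thin air):

	/* go global only if some node runs noticeably more tasks than ours */
	static int node_imbalanced(int this_node)
	{
		int i, this_load = atomic_read(&node_nr_running[this_node]);

		for (i = 0; i < numnodes; i++)
			if (atomic_read(&node_nr_running[i]) * 4 > this_load * 5)
				return 1;
		return 0;
	}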

> If you want data supporting my assumptions: Ted Ts'o's talk at OLS
> shows the necessity to rebalance ASAP (even in try_to_wake_up). 

If this is the patch I am thinking of, it was the (attached) one I sent them, 
which did a light "push" rebalance at try_to_wake_up.  Calling load_balance 
at try_to_wake_up seems very heavy-weight.  This patch only looks for an idle 
cpu (within the same node) to wake up on before task activation, only if the 
task_rq(p)->nr_running is too long.  So, yes, I do believe this can be 
important, but I think it's only called for when we have an idle cpu.  

> There
> are plenty of arguments towards this, starting with the steal delay
> parameter scans from the early days of multi-queue schedulers (Davide
> Libenzi), over my experiments with NUMA schedulers and the observation
> of Andi Kleen that on Opteron you better run from the wrong CPU than
> wait (if the task returns to the right cpu, all's fine anyway).
>
> > I'd much rather have info related to the properties of the NUMA arch than
> > something that makes decisions based on nr_cpus_node().  For example, we
> > may want to inter-node balance as much or more often on ppc64 than even
> > AMD, but it has 8 cpus per node.  On this patch it would have the lowest
> > inter-node balance frequency, even though it probably has one of the
> > lowest latencies between nodes and highest throughput interconnects.
>
> We can still discuss the formula. Currently there's a bug in the
> scheduler and the corner case of 1 cpu/node is just broken. The
> function local_balance_retries(attempts, cpus_in_this_node) must
> return 0 for cpus_in_this_node=1 and should return larger values for
> larger cpus_in_this_node. To have an upper limit is fine, but maybe
> not necessary.
>
> Regarding the ppc64 interconnect, I'm glad that you said "probably"
> and "one of the ...". No need to point you to better ones ;-)

OK, we won't get into a pissing match :)  I just wanted to base the scheduler 
decisions on data related to the hardware NUMA properties, not the cpu count.  

> > > Right, I realise that the 1 cpu per node case is broken. However,
> > > doesn't this also affect the multi-cpu per node case in the manner I
> > > suggested above? So even if we turn off NUMA scheduler for Hammer, this
> > > still needs fixing, right?
> >
> > Maybe so, but if we start making idle rebalance more aggressive, I think
> > we would need to make CAN_MIGRATE more restrictive, taking memory
> > placement of the tasks in to account.  On AMD with interleaved memory
> > allocation, tasks would move very easily, since their memory is spread
> > out anyway.  On "home node" or node-local policy, we may not move a task
> > (or maybe not on the first attempt), even if there is an idle cpu in
> > another node.
>
> Aehm, that's another story and I'm sure we will fix that too. There
> are a few ideas around. But you shouldn't expect to solve all problems
> at once, after all optimal NUMA scheduling can still be considered a
> research area.
>
> > Personally, I'd like to see all systems use NUMA sched, non NUMA systems
> > being a single node (no policy difference from non-numa sched), allowing
> > us to remove all NUMA ifdefs.  I think the code would be much more
> > readable.
> >
> :-) Then you expect that everybody who reads the scheduler code knows
>
> what NUMA stands for and what it means? Pretty optimistic, but yes,
> I'd like that, too.

Yes, at some point we have to.  We cannot have two different schedulers.  Non 
numa should have the exact same scheduling policy as a numa system with one 
node.  I don't know if that's acceptable for 2.6, but I really want to go for 
that in 2.7.  

-Andrew Theurer

[-- Attachment #2: patch-wakey2-2567 --]
[-- Type: text/x-diff, Size: 2751 bytes --]

diff -Naur 2.5.67-BK-9-4-2003/Makefile 2.5.67-BK-9-4-2003-wakey2/Makefile
--- 2.5.67-BK-9-4-2003/Makefile	2003-04-15 13:42:53.000000000 -0700
+++ 2.5.67-BK-9-4-2003-wakey2/Makefile	2003-04-15 13:57:14.000000000 -0700
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 5
 SUBLEVEL = 67
-EXTRAVERSION = -BK-9-4-2003
+EXTRAVERSION = -BK-9-4-2003-wakey2
 
 # *DOCUMENTATION*
 # To see a list of typical targets execute "make help"
diff -Naur 2.5.67-BK-9-4-2003/kernel/sched.c 2.5.67-BK-9-4-2003-wakey2/kernel/sched.c
--- 2.5.67-BK-9-4-2003/kernel/sched.c	2003-04-13 15:04:34.000000000 -0700
+++ 2.5.67-BK-9-4-2003-wakey2/kernel/sched.c	2003-04-15 14:00:01.000000000 -0700
@@ -486,11 +486,32 @@
  */
 static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 {
-	int success = 0, requeue_waker = 0;
-	unsigned long flags;
+	int success = 0, target_cpu, i;
+	unsigned long flags, cpumask;
 	long old_state;
 	runqueue_t *rq;
 
+	target_cpu = smp_processor_id();
+
+	/* change task_cpu to an idle cpu if its
+	 * default rq is really busy and sync
+	 * wakeup is not requested */
+	if (!sync && ((nr_running()*4) <= (num_online_cpus()*5)) &&
+		(task_rq(p)->nr_running > 0)) {
+		cpumask = node_to_cpumask(cpu_to_node(task_cpu(p)));
+		for (i=0; i<NR_CPUS; i++) {
+			if (!(cpumask & (1UL << i)))
+				continue;
+			if (!(cpu_online(i)))
+				continue;
+			if (idle_cpu(i)) {
+				sync = 1;
+				target_cpu = i;
+				break;
+			}
+		}
+	}
+
 repeat_lock_task:
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
@@ -501,43 +522,25 @@
 			 * currently. Do not violate hard affinity.
 			 */
 			if (unlikely(sync && !task_running(rq, p) &&
-				(task_cpu(p) != smp_processor_id()) &&
-				(p->cpus_allowed & (1UL << smp_processor_id())))) {
+				(task_cpu(p) != target_cpu) &&
+				(p->cpus_allowed & (1UL << target_cpu)))) {
 
-				set_task_cpu(p, smp_processor_id());
+				set_task_cpu(p, target_cpu);
 				task_rq_unlock(rq, &flags);
 				goto repeat_lock_task;
 			}
 			if (old_state == TASK_UNINTERRUPTIBLE)
 				rq->nr_uninterruptible--;
-			if (sync)
-				__activate_task(p, rq);
-			else {
-				requeue_waker = activate_task(p, rq);
-				if (p->prio < rq->curr->prio)
+			activate_task(p, rq);
+
+			if (p->prio < rq->curr->prio)
 					resched_task(rq->curr);
-			}
 			success = 1;
 		}
 		p->state = TASK_RUNNING;
 	}
 	task_rq_unlock(rq, &flags);
 
-	/*
-	 * We have to do this outside the other spinlock, the two
-	 * runqueues might be different:
-	 */
-	if (requeue_waker) {
-		prio_array_t *array;
-
-		rq = task_rq_lock(current, &flags);
-		array = current->array;
-		dequeue_task(current, array);
-		current->prio = effective_prio(current);
-		enqueue_task(current, array);
-		task_rq_unlock(rq, &flags);
-	}
-
 	return success;
 }
 


* Re: [patch] scheduler fix for 1cpu/node case
  2003-07-29 10:08         ` Erich Focht
  2003-07-29 13:33           ` [Lse-tech] " Andrew Theurer
@ 2003-07-29 14:27           ` Martin J. Bligh
  1 sibling, 0 replies; 25+ messages in thread
From: Martin J. Bligh @ 2003-07-29 14:27 UTC (permalink / raw)
  To: Erich Focht, habanero, linux-kernel, LSE; +Cc: Andi Kleen, torvalds

>> I am going to ask a silly question, do we have any data showing this really
>> is a problem on AMD?  I would think, even if we have an idle cpu, sometimes
>> a little delay on task migration (on NUMA) may not be a bad thing.   If it
>> is too long, can we just make the rebalance ticks arch specific?
> 
> The fact that global rebalances are done only in the timer interrupt
> is simply bad! It complicates rebalance_tick() and wastes the
> opportunity to get feedback from the failed local balance attempts.

The whole point of the NUMA scheduler is to rebalance globally less
often. Now I'd agree that in the idle case we want to be more aggressive,
as your patch does, but we need to be careful not to end up in a cacheline
thrashing war. 

Aiming for perfect balance is not always a good idea - the expense of both 
the calculation and the migrations has to be taken into account. For some 
arches, it's less important than others ... one thing we're missing is
a per arch "node-balance-agressiveness" factor.

Having said that, the NUMA scheduler is clearly broken on Hammer at the
moment, from Andi's observations.

> If you want data supporting my assumptions: Ted Ts'o's talk at OLS
> shows the necessity to rebalance ASAP (even in try_to_wake_up). There
> are plenty of arguments towards this, starting with the steal delay
> parameter scans from the early days of multi-queue schedulers (Davide
> Libenzi), over my experiments with NUMA schedulers and the observation
> of Andi Kleen that on Opteron you better run from the wrong CPU than
> wait (if the tasks returns to the right cpu, all's fine anyway).

That's a drastic oversimplification. It may be better in some 
circumstances, on some benchmarks. For now, let's just get your patch
tested on Hammer, and see if it works better to have the NUMA scheduler
on than off after your patch ...

M.





* Re: [patch] scheduler fix for 1cpu/node case
  2003-07-28 20:18   ` Erich Focht
  2003-07-28 20:37     ` Martin J. Bligh
@ 2003-07-29 14:41     ` Andi Kleen
  1 sibling, 0 replies; 25+ messages in thread
From: Andi Kleen @ 2003-07-29 14:41 UTC (permalink / raw)
  To: Erich Focht; +Cc: Martin J. Bligh, linux-kernel, LSE, Andi Kleen, torvalds

On Mon, Jul 28, 2003 at 10:18:19PM +0200, Erich Focht wrote:
> > > So x86_64 platforms
> > > (but not only those!) suffer and wish to switch off the NUMA
> > > scheduler while keeping NUMA memory management on.
> >
> > Right - I have a patch to make it a config option (CONFIG_NUMA_SCHED)
> > ... I'll feed that upstream this week.
> 
> That's one way, but the proposed patch just solves the problem (in a
> more general way, also for other NUMA cases). If you deconfigure NUMA
> for a NUMA platform, we'll have problems switching it back on when
> adding smarter things like node affine or homenode extensions.

That's one important point IMHO. Currently Opteron does not really 
need the NUMA scheduler, but it will in the future, with such extensions.
This means it would be better if the current scheduler supported 
it already so that it can be easily extended.

Thanks for the patch.

-Andi


* Re: [Lse-tech] Re: [patch] scheduler fix for 1cpu/node case
  2003-07-29 13:33           ` [Lse-tech] " Andrew Theurer
@ 2003-07-30 15:23             ` Erich Focht
  2003-07-30 15:44               ` Andrew Theurer
  0 siblings, 1 reply; 25+ messages in thread
From: Erich Focht @ 2003-07-30 15:23 UTC (permalink / raw)
  To: habanero, Martin J. Bligh, linux-kernel, LSE; +Cc: Andi Kleen, torvalds

On Tuesday 29 July 2003 15:33, Andrew Theurer wrote:
> > The fact that global rebalances are done only in the timer interrupt
> > is simply bad!
>
> Even with this patch it still seems that most balances are still timer
> based, because we still call load_balance in rebalance_tick.  Granted, we
> may inter-node balance more often, well, maybe less often since
> node_busy_rebalance_tick was busy_rebalance_tick*2.  I do see the advantage
> of doing this at idle, but idle only; that's why I'd be more inclined toward
> only a much more aggressive idle rebalance.

Without this patch the probability of balancing globally when going idle
is 0. Now it is 1/global_balance_rate(cpus_in_this_node). This is
tunable and we can make this probability depend on the node load
imbalance. I'll try that in the next version; it sounds like a good
idea. Also, changing this probability by some factor could give us a
way to handle the differences between the platforms.

Balancing globally only when idle isn't a good idea as long as we
don't have multiple steals per balance attempt. Even then, tasks don't
all live for the same time, so you easily end up with one node overloaded
and others just busy and not able to steal from the busiest node.

> > It complicates rebalance_tick() and wastes the
> > opportunity to get feedback from the failed local balance attempts.
>
> What does "failed" really mean?  To me, when *busiest=null, that means we
> passed, the node itself is probably balanced, and there's nothing to do. 
> It gives no indication at all of the global load [im]balance.  Shouldn't
> the thing we are looking for is the imbalance among node_nr_running[]? 
> Would it make sense to go forward with a global balance based on that only?

I'll try to include the node imbalance in the global balance rate
calculation; let's see how it works. I just wanted to fix one issue with
my patch, and now it looks like it provides some simple ways to solve
other issues as well... Thanks for the idea!

Regards,
Erich






* Re: [Lse-tech] Re: [patch] scheduler fix for 1cpu/node case
  2003-07-30 15:23             ` Erich Focht
@ 2003-07-30 15:44               ` Andrew Theurer
  0 siblings, 0 replies; 25+ messages in thread
From: Andrew Theurer @ 2003-07-30 15:44 UTC (permalink / raw)
  To: Erich Focht, Martin J. Bligh, linux-kernel, LSE; +Cc: Andi Kleen, torvalds

On Wednesday 30 July 2003 10:23, Erich Focht wrote:
> On Tuesday 29 July 2003 15:33, Andrew Theurer wrote:
> > > The fact that global rebalances are done only in the timer interrupt
> > > is simply bad!
> >
> > Even with this patch it still seems that most balances are still timer
> > based, because we still call load_balance in rebalance_tick.  Granted, we
> > may inter-node balance more often, well, maybe less often since
> > node_busy_rebalance_tick was busy_rebalance_tick*2.  I do see the
> > advantage of doing this at idle, but idle only; that's why I'd be more
> > inclined toward only a much more aggressive idle rebalance.
>
> Without this patch the probability of balancing globally when going idle
> is 0. Now it is 1/global_balance_rate(cpus_in_this_node). This is
> tunable and we can make this probability depend on the node load
> imbalance. I'll try that in the next version; it sounds like a good
> idea. Also, changing this probability by some factor could give us a
> way to handle the differences between the platforms.
>
> Balancing globally only when idle isn't a good idea as long as we
> don't have multiple steals per balance attempt. Even then, tasks don't
> all live for the same time, so you easily end up with one node overloaded
> and others just busy and not able to steal from the busiest node.

I think I would agree with these statements.  Yes, your patch does fix the 
"going idle" AMD issue, and yes, we don't want to balance globally on idle by 
default.  However, I do still think there may be a better way to solve this.  

I see this as "range" problem.  When balancing, the NUMA scheduler's tactic is 
to limit the range of cpus to find a busiest queue.  First we start with 
within a node, then once in a while we go to the next "level", and look at 
cpus in all nodes in that "level".  Someday we may even go up another "level" 
and include even more cpus in our range.  We limit how often we go up a 
level, as to help memory locality of the running tasks.  This is obviously 
well known to you, since you wrote most of this :)   What I would like to 
propose is the following:

For load_balance, each architecture tells the scheduler what the range is, 
where the range starts, and where it ends.  For ex: Summit starts at level1, 
the cpus in a node, and ends at level2, cpus in all level1 nodes.  AMD starts 
at level2, all the cpus in all level1 nodes, and also ends at level2.  
8-way AMD would go to level3 :)  NEC (I assume) would start at level1, 
and end at level3, the "super" nodes.  This should solve the AMD problem very 
easily. 

As for idle balances, we may be able to go a step further: follow the range 
rules, but do a more aggressive/frequent search.  On busy rebalance, I'd 
prefer to still uphold the node_rebalance_ticks, but on idle, I would think, 
since we are idle, some more time spent on a thorough search would be OK:  We 
first start at the lower range and try to find something to steal (always 
favor stealing from a smaller range first), then as needed increase the range, 
looking for a cpu to steal from.  Do not use a node_idle_rebalance interval; 
just keep increasing the range until we are successful.
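
Roughly what I have in mind (pseudo-code sketch; the struct, the level
numbers, arch_range and level_to_cpumask() are all made up for
illustration):

	/* per-arch balancing range: lowest and highest "level" to scan */
	struct balance_range {
		int start_level;	/* Summit: 1, AMD: 2, NEC: 1 */
		int end_level;		/* Summit: 2, AMD: 2, NEC: 3 */
	};

	/* idle case: keep widening the range until something was stolen */
	for (lvl = arch_range.start_level; lvl <= arch_range.end_level; lvl++) {
		load_balance(this_rq, 1, level_to_cpumask(this_cpu, lvl));
		if (this_rq->nr_running)
			break;
	}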

I'll see if I can get something coded up here.  I don't think these concepts 
would be too difficult to add.

-Andrew Theurer


* Re: [patch] scheduler fix for 1cpu/node case
  2003-07-28 19:16 [patch] scheduler fix for 1cpu/node case Erich Focht
  2003-07-28 19:55 ` Martin J. Bligh
@ 2003-07-31 15:05 ` Martin J. Bligh
  2003-07-31 21:45   ` Erich Focht
  1 sibling, 1 reply; 25+ messages in thread
From: Martin J. Bligh @ 2003-07-31 15:05 UTC (permalink / raw)
  To: Erich Focht, linux-kernel, LSE; +Cc: Andi Kleen


you're using node_to_cpu_mask for ia64 ... others were using 
node_to_cpumask (1 less "_"), so this doesn't build ...

M.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] scheduler fix for 1cpu/node case
  2003-07-31 15:05 ` Martin J. Bligh
@ 2003-07-31 21:45   ` Erich Focht
  2003-08-01  0:26     ` Martin J. Bligh
  0 siblings, 1 reply; 25+ messages in thread
From: Erich Focht @ 2003-07-31 21:45 UTC (permalink / raw)
  To: Martin J. Bligh, linux-kernel, LSE; +Cc: Andi Kleen

[-- Attachment #1: Type: text/plain, Size: 248 bytes --]

On Thursday 31 July 2003 17:05, Martin J. Bligh wrote:
> you're using node_to_cpu_mask for ia64 ... others were using
> node_to_cpumask (1 less "_"), so this doesn't build ...

Ooops, you're right, of course. Sorry about this mistake :-(

Erich




[-- Attachment #2: 1cpufix-lb2-2.6.0t1.patch --]
[-- Type: text/x-diff, Size: 4253 bytes --]

diff -urNp 2.6.0t1/kernel/sched.c 2.6.0t1-1cpufix/kernel/sched.c
--- 2.6.0t1/kernel/sched.c	2003-07-14 05:37:14.000000000 +0200
+++ 2.6.0t1-1cpufix/kernel/sched.c	2003-07-31 20:46:30.000000000 +0200
@@ -164,6 +164,7 @@ struct runqueue {
 	prio_array_t *active, *expired, arrays[2];
 	int prev_cpu_load[NR_CPUS];
 #ifdef CONFIG_NUMA
+	unsigned int nr_lb_failed;
 	atomic_t *node_nr_running;
 	int prev_node_load[MAX_NUMNODES];
 #endif
@@ -856,6 +857,35 @@ static int find_busiest_node(int this_no
 	return node;
 }
 
+/*
+ * Decide whether the scheduler should balance locally (inside the same node)
+ * or globally depending on the number of failed local balance attempts.
+ * The number of failed local balance attempts depends on the number of cpus
+ * in the current node. In case it's just one, go immediately for global
+ * balancing. On a busy cpu the number of retries is smaller.
+ */
+static inline unsigned long cpus_to_balance(int this_cpu, runqueue_t *this_rq)
+{
+	int node, retries, this_node = cpu_to_node(this_cpu);
+
+	retries = nr_cpus_node(this_node) - 1;
+	if (this_rq->curr != this_rq->idle)
+		retries >>= 1;
+	if (this_rq->nr_lb_failed >= retries) {
+		node = find_busiest_node(this_node);
+		this_rq->nr_lb_failed = 0;
+		if (node >= 0)
+			return (node_to_cpumask(node) | (1UL << this_cpu));
+	}
+	return node_to_cpumask(this_node);
+}
+
+#else /* !CONFIG_NUMA */
+
+static inline unsigned long cpus_to_balance(int this_cpu, runqueue_t *this_rq)
+{
+	return cpu_online_map;
+}
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_SMP
@@ -960,6 +990,12 @@ static inline runqueue_t *find_busiest_q
 		busiest = NULL;
 	}
 out:
+#ifdef CONFIG_NUMA
+	if (!busiest)
+		this_rq->nr_lb_failed++;
+	else
+		this_rq->nr_lb_failed = 0;
+#endif
 	return busiest;
 }
 
@@ -995,7 +1031,7 @@ static inline void pull_task(runqueue_t 
  * We call this with the current runqueue locked,
  * irqs disabled.
  */
-static void load_balance(runqueue_t *this_rq, int idle, unsigned long cpumask)
+static void load_balance(runqueue_t *this_rq, int idle)
 {
 	int imbalance, idx, this_cpu = smp_processor_id();
 	runqueue_t *busiest;
@@ -1003,7 +1039,8 @@ static void load_balance(runqueue_t *thi
 	struct list_head *head, *curr;
 	task_t *tmp;
 
-	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
+	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance,
+				     cpus_to_balance(this_cpu, this_rq));
 	if (!busiest)
 		goto out;
 
@@ -1085,29 +1122,9 @@ out:
  */
 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
 #define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
-#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5)
-#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
-
-#ifdef CONFIG_NUMA
-static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
-{
-	int node = find_busiest_node(cpu_to_node(this_cpu));
-	unsigned long cpumask, this_cpumask = 1UL << this_cpu;
-
-	if (node >= 0) {
-		cpumask = node_to_cpumask(node) | this_cpumask;
-		spin_lock(&this_rq->lock);
-		load_balance(this_rq, idle, cpumask);
-		spin_unlock(&this_rq->lock);
-	}
-}
-#endif
 
 static void rebalance_tick(runqueue_t *this_rq, int idle)
 {
-#ifdef CONFIG_NUMA
-	int this_cpu = smp_processor_id();
-#endif
 	unsigned long j = jiffies;
 
 	/*
@@ -1119,24 +1136,16 @@ static void rebalance_tick(runqueue_t *t
 	 * are not balanced.)
 	 */
 	if (idle) {
-#ifdef CONFIG_NUMA
-		if (!(j % IDLE_NODE_REBALANCE_TICK))
-			balance_node(this_rq, idle, this_cpu);
-#endif
 		if (!(j % IDLE_REBALANCE_TICK)) {
 			spin_lock(&this_rq->lock);
-			load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
+			load_balance(this_rq, idle);
 			spin_unlock(&this_rq->lock);
 		}
 		return;
 	}
-#ifdef CONFIG_NUMA
-	if (!(j % BUSY_NODE_REBALANCE_TICK))
-		balance_node(this_rq, idle, this_cpu);
-#endif
 	if (!(j % BUSY_REBALANCE_TICK)) {
 		spin_lock(&this_rq->lock);
-		load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
+		load_balance(this_rq, idle);
 		spin_unlock(&this_rq->lock);
 	}
 }
@@ -1306,7 +1315,7 @@ need_resched:
 pick_next_task:
 	if (unlikely(!rq->nr_running)) {
 #ifdef CONFIG_SMP
-		load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
+		load_balance(rq, 1);
 		if (rq->nr_running)
 			goto pick_next_task;
 #endif


* Re: [patch] scheduler fix for 1cpu/node case
  2003-07-31 21:45   ` Erich Focht
@ 2003-08-01  0:26     ` Martin J. Bligh
  2003-08-01 16:30       ` [Lse-tech] " Erich Focht
  0 siblings, 1 reply; 25+ messages in thread
From: Martin J. Bligh @ 2003-08-01  0:26 UTC (permalink / raw)
  To: Erich Focht, linux-kernel, LSE; +Cc: Andi Kleen

> On Thursday 31 July 2003 17:05, Martin J. Bligh wrote:
>> you're using node_to_cpu_mask for ia64 ... others were using
>> node_to_cpumask (1 less "_"), so this doesn't build ...
> 
> Ooops, you're right, of course. Sorry about this mistake :-(

np - was easy to fix up ;-) I did run some benchmarks on it ... 
low end SDET seemed highly variable, but otherwise looked OK. 
If I have only 4 tasks running on a 16x (4x4), what's the rate
limit on the idle cpus trying to steal now?

M.



* Re: [Lse-tech] Re: [patch] scheduler fix for 1cpu/node case
  2003-08-01  0:26     ` Martin J. Bligh
@ 2003-08-01 16:30       ` Erich Focht
  0 siblings, 0 replies; 25+ messages in thread
From: Erich Focht @ 2003-08-01 16:30 UTC (permalink / raw)
  To: Martin J. Bligh, linux-kernel, LSE; +Cc: Andi Kleen

On Friday 01 August 2003 02:26, Martin J. Bligh wrote:
> np - was easy to fix up ;-) I did run some benchmarks on it ...
> low end SDET seemed highly variable, but otherwise looked OK.
> If I have only 4 tasks running on a 16x (4x4), what's the rate
> limit on the idle cpus trying to steal now?

The rate is 3 ms; that's a bit too fast, maybe. And the busy rate is
200 ms (instead of 400). I did some experiments with 2 CPUs per node,
and 1 ms seems to be definitely too fast. I'll tune the formula a
bit...
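
(The arithmetic behind those numbers: with 4 CPUs per node an idle CPU
gets retries = nr_cpus_node() - 1 = 3, and idle balancing runs every
IDLE_REBALANCE_TICK = 1 ms, so the mask is widened after roughly 3
failed local attempts, i.e. ~3 ms. A busy CPU halves that to 1 retry on
the 200 ms BUSY_REBALANCE_TICK scale. With 2 CPUs per node the idle
retries drop to 1, which is where the roughly 1 ms rate comes from.)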

Regards,
Erich




* Re: [patch] scheduler fix for 1cpu/node case
  2003-07-29  2:24       ` Andrew Theurer
  2003-07-29 10:08         ` Erich Focht
@ 2003-08-13 20:49         ` Bill Davidsen
  2003-08-22 15:46           ` [Lse-tech] " Andrew Theurer
  1 sibling, 1 reply; 25+ messages in thread
From: Bill Davidsen @ 2003-08-13 20:49 UTC (permalink / raw)
  To: Andrew Theurer
  Cc: Martin J. Bligh, Erich Focht, linux-kernel, LSE, Andi Kleen, torvalds

On Mon, 28 Jul 2003, Andrew Theurer wrote:

> Personally, I'd like to see all systems use NUMA sched, non NUMA systems being 
> a single node (no policy difference from non-numa sched), allowing us to 
> remove all NUMA ifdefs.  I think the code would be much more readable.

That sounds like a great idea, but I'm not sure it could be realized short
of a major rewrite. Look how hard Ingo and Con are working just to get a
single node doing a good job with interactive and throughput tradeoffs.

Once they get a good handle on identifying process behaviour, and I
believe they will, that information could be used in improving NUMA
performance, by sending not just "a job" but "the right job" if it exists.
I'm sure there are still a few graduate theses possible on the topic!

-- 
bill davidsen <davidsen@tmr.com>
  CTO, TMR Associates, Inc
Doing interesting things with little computers since 1979.



* Re: [Lse-tech] Re: [patch] scheduler fix for 1cpu/node case
  2003-08-13 20:49         ` Bill Davidsen
@ 2003-08-22 15:46           ` Andrew Theurer
  2003-08-22 22:56             ` Nick Piggin
  0 siblings, 1 reply; 25+ messages in thread
From: Andrew Theurer @ 2003-08-22 15:46 UTC (permalink / raw)
  To: Bill Davidsen
  Cc: Martin J. Bligh, Erich Focht, linux-kernel, LSE, Andi Kleen, torvalds

[-- Attachment #1: Type: text/plain, Size: 1689 bytes --]

On Wednesday 13 August 2003 15:49, Bill Davidsen wrote:
> On Mon, 28 Jul 2003, Andrew Theurer wrote:
> > Personally, I'd like to see all systems use NUMA sched, non NUMA systems
> > being a single node (no policy difference from non-numa sched), allowing
> > us to remove all NUMA ifdefs.  I think the code would be much more
> > readable.
>
> That sounds like a great idea, but I'm not sure it could be realized short
> of a major rewrite. Look how hard Ingo and Con are working just to get a
> single node doing a good job with interactive and throughput tradeoffs.

Actually it's not too bad.  Attached is a patch to do it.  It also does 
multi-level node support and makes all the load balance routines 
runqueue-centric instead of cpu-centric, so adding something like shared 
runqueues (for HT) should be really easy.  Hmm, other things: inter-node 
balance intervals are now arch specific (AMD is "1").  The default busy/idle 
balance timers of 200/1 are not arch specific, but I'm thinking they should 
be.  And for non-numa, the scheduling policy is the same as it was with 
vanilla O(1). 

> Once they get a good handle on identifying process behaviour, and I
> believe they will, that information could be used in improving NUMA
> performance, by sending not just 'a job" but "the right job" if it exists.
> I'm sure there are still a few graduate theses possible on the topic!

I do agree, but I think _what_ we pick is definitely 2.7 work.  All I'd like 
to see for 2.6 is a very nice, unified framework to build on.

As for the 1cpu/node case, I think this second patch (on top of the first) will 
care of it; however I have not had a chance to test on AMD yet.

-Andrew Theurer

[-- Attachment #2: patch-numasched2.260-test3-bk8 --]
[-- Type: text/x-diff, Size: 1396 bytes --]

diff -Naurp linux-2.6.0-test3-bk8-numasched/kernel/sched.c linux-2.6.0-test3-bk8-numasched2/kernel/sched.c
--- linux-2.6.0-test3-bk8-numasched/kernel/sched.c	2003-08-22 12:03:30.000000000 -0500
+++ linux-2.6.0-test3-bk8-numasched2/kernel/sched.c	2003-08-22 12:14:09.000000000 -0500
@@ -1261,7 +1261,7 @@ static void rebalance_tick(runqueue_t *t
 
 	if (idle) {
 		while (node) {
-			if (!(j % node->idle_rebalance_tick))
+			if (!(j % node->idle_rebalance_tick) && (node->nr_cpus > 1))
 				balance_node(this_rq, idle, last_node, node);
 			last_node = node;
 			node = node->parent_node;
@@ -1269,7 +1269,7 @@ static void rebalance_tick(runqueue_t *t
 		return;
 	}
 	while (node) {
-		if (!(j % node->busy_rebalance_tick))
+		if (!(j % node->busy_rebalance_tick) && (node->nr_cpus > 1))
 			balance_node(this_rq, idle, last_node, node);
 		last_node = node;
 		node = node->parent_node;
@@ -1405,6 +1405,7 @@ asmlinkage void schedule(void)
 	runqueue_t *rq;
 	prio_array_t *array;
 	struct list_head *queue;
+	struct node_t *node;
 	int idx;
 
 	/*
@@ -1449,8 +1450,10 @@ need_resched:
 pick_next_task:
 	if (unlikely(!rq->nr_running)) {
 #ifdef CONFIG_SMP
-		if (rq->node)
-			load_balance(rq, 1, rq->node);
+		node = rq->node;
+		while (node->nr_cpus < 2 && node != top_node)
+			node = node->parent_node;
+		load_balance(rq, 1, node);
 		if (rq->nr_running)
 			goto pick_next_task;
 #endif

[-- Attachment #3: patch-numasched.260-test3-bk8 --]
[-- Type: text/x-diff, Size: 22174 bytes --]

diff -Naurp linux-2.6.0-test3-bk8/arch/i386/kernel/smpboot.c linux-2.6.0-test3-bk8-numasched/arch/i386/kernel/smpboot.c
--- linux-2.6.0-test3-bk8/arch/i386/kernel/smpboot.c	2003-08-21 13:17:40.000000000 -0500
+++ linux-2.6.0-test3-bk8-numasched/arch/i386/kernel/smpboot.c	2003-08-21 18:34:23.000000000 -0500
@@ -504,12 +504,16 @@ cpumask_t node_2_cpu_mask[MAX_NR_NODES] 
 /* which node each logical CPU is on */
 int cpu_2_node[NR_CPUS] = { [0 ... NR_CPUS-1] = 0 };
 
+/* which node each node is on */
+int node_2_node[MAX_NUMNODES] = { [0 ... MAX_NUMNODES-1] = MAX_NUMNODES-1 };
+
 /* set up a mapping between cpu and node. */
 static inline void map_cpu_to_node(int cpu, int node)
 {
 	printk("Mapping cpu %d to node %d\n", cpu, node);
 	cpu_set(cpu, node_2_cpu_mask[node]);
 	cpu_2_node[cpu] = node;
+	node_2_node[node] = MAX_NUMNODES-1;
 }
 
 /* undo a mapping between cpu and node. */
diff -Naurp linux-2.6.0-test3-bk8/arch/x86_64/kernel/smpboot.c linux-2.6.0-test3-bk8-numasched/arch/x86_64/kernel/smpboot.c
--- linux-2.6.0-test3-bk8/arch/x86_64/kernel/smpboot.c	2003-08-21 10:42:38.000000000 -0500
+++ linux-2.6.0-test3-bk8-numasched/arch/x86_64/kernel/smpboot.c	2003-08-21 15:33:36.000000000 -0500
@@ -63,6 +63,9 @@ static cpumask_t smp_commenced_mask;
 /* Per CPU bogomips and other parameters */
 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
 
+/* which node each node is on */
+volatile int node_2_node[MAX_NUMNODES] = { [0 ... MAX_NUMNODES-1] = MAX_NUMNODES-1 };
+
 /* Set when the idlers are all forked */
 int smp_threads_ready;
 
diff -Naurp linux-2.6.0-test3-bk8/include/asm-generic/topology.h linux-2.6.0-test3-bk8-numasched/include/asm-generic/topology.h
--- linux-2.6.0-test3-bk8/include/asm-generic/topology.h	2003-08-21 10:39:24.000000000 -0500
+++ linux-2.6.0-test3-bk8-numasched/include/asm-generic/topology.h	2003-08-21 15:33:36.000000000 -0500
@@ -52,8 +52,11 @@
 #endif
 
 /* Cross-node load balancing interval. */
-#ifndef NODE_BALANCE_RATE
-#define NODE_BALANCE_RATE 10
+#ifndef IDLE_NODE_BALANCE_INTERVAL
+#define IDLE_NODE_BALANCE_INTERVAL 2
+#endif
+#ifndef BUSY_NODE_BALANCE_INTERVAL
+#define BUSY_NODE_BALANCE_INTERVAL 5
 #endif
 
 #endif /* _ASM_GENERIC_TOPOLOGY_H */
diff -Naurp linux-2.6.0-test3-bk8/include/asm-i386/topology.h linux-2.6.0-test3-bk8-numasched/include/asm-i386/topology.h
--- linux-2.6.0-test3-bk8/include/asm-i386/topology.h	2003-08-21 10:42:40.000000000 -0500
+++ linux-2.6.0-test3-bk8-numasched/include/asm-i386/topology.h	2003-08-21 15:38:09.000000000 -0500
@@ -36,6 +36,7 @@
 /* Mappings between logical cpu number and node number */
 extern cpumask_t node_2_cpu_mask[];
 extern int cpu_2_node[];
+extern int node_2_node[];
 
 /* Returns the number of the node containing CPU 'cpu' */
 static inline int cpu_to_node(int cpu)
@@ -46,9 +47,11 @@ static inline int cpu_to_node(int cpu)
 /* Returns the number of the node containing MemBlk 'memblk' */
 #define memblk_to_node(memblk) (memblk)
 
-/* Returns the number of the node containing Node 'node'.  This architecture is flat, 
-   so it is a pretty simple function! */
-#define parent_node(node) (node)
+/* Returns the number of the node containing Node 'node'. */   
+static inline int parent_node(int node)
+{
+	return node_2_node[node];
+}
 
 /* Returns a bitmask of CPUs on Node 'node'. */
 static inline cpumask_t node_to_cpumask(int node)
@@ -73,7 +76,8 @@ static inline cpumask_t pcibus_to_cpumas
 }
 
 /* Cross-node load balancing interval. */
-#define NODE_BALANCE_RATE 100
+#define IDLE_NODE_BALANCE_INTERVAL 2
+#define BUSY_NODE_BALANCE_INTERVAL 5
 
 #else /* !CONFIG_NUMA */
 /*
diff -Naurp linux-2.6.0-test3-bk8/include/asm-x86_64/topology.h linux-2.6.0-test3-bk8-numasched/include/asm-x86_64/topology.h
--- linux-2.6.0-test3-bk8/include/asm-x86_64/topology.h	2003-08-21 10:42:40.000000000 -0500
+++ linux-2.6.0-test3-bk8-numasched/include/asm-x86_64/topology.h	2003-08-21 15:33:36.000000000 -0500
@@ -27,7 +27,8 @@ static inline cpumask_t pcibus_to_cpumas
 	return ret;
 }
 
-#define NODE_BALANCE_RATE 30	/* CHECKME */ 
+#define IDLE_NODE_BALANCE_INTERVAL	1
+#define BUSY_NODE_BALANCE_INTERVAL	1
 
 #endif
 
diff -Naurp linux-2.6.0-test3-bk8/include/linux/sched.h linux-2.6.0-test3-bk8-numasched/include/linux/sched.h
--- linux-2.6.0-test3-bk8/include/linux/sched.h	2003-08-21 10:42:40.000000000 -0500
+++ linux-2.6.0-test3-bk8-numasched/include/linux/sched.h	2003-08-21 15:33:36.000000000 -0500
@@ -498,14 +498,10 @@ static inline int set_cpus_allowed(task_
 }
 #endif
 
-#ifdef CONFIG_NUMA
+extern void numa_sched_topology_report(void);
+extern void rq_to_node_init(void);
+extern void node_to_node_init(void);
 extern void sched_balance_exec(void);
-extern void node_nr_running_init(void);
-#else
-#define sched_balance_exec()   {}
-#define node_nr_running_init() {}
-#endif
-
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(task_t *p);
 extern int task_nice(task_t *p);
diff -Naurp linux-2.6.0-test3-bk8/init/main.c linux-2.6.0-test3-bk8-numasched/init/main.c
--- linux-2.6.0-test3-bk8/init/main.c	2003-08-21 10:39:58.000000000 -0500
+++ linux-2.6.0-test3-bk8-numasched/init/main.c	2003-08-21 15:33:36.000000000 -0500
@@ -537,11 +537,12 @@ static void do_pre_smp_initcalls(void)
 {
 	extern int spawn_ksoftirqd(void);
 #ifdef CONFIG_SMP
+	node_to_node_init();
+	rq_to_node_init();
+	numa_sched_topology_report();
 	extern int migration_init(void);
-
 	migration_init();
 #endif
-	node_nr_running_init();
 	spawn_ksoftirqd();
 }
 
diff -Naurp linux-2.6.0-test3-bk8/kernel/sched.c linux-2.6.0-test3-bk8-numasched/kernel/sched.c
--- linux-2.6.0-test3-bk8/kernel/sched.c	2003-08-21 10:42:40.000000000 -0500
+++ linux-2.6.0-test3-bk8-numasched/kernel/sched.c	2003-08-22 12:03:30.000000000 -0500
@@ -35,12 +35,6 @@
 #include <linux/cpu.h>
 #include <linux/percpu.h>
 
-#ifdef CONFIG_NUMA
-#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
-#else
-#define cpu_to_node_mask(cpu) (cpu_online_map)
-#endif
-
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -76,6 +70,8 @@
 #define MAX_SLEEP_AVG		(10*HZ)
 #define STARVATION_LIMIT	(10*HZ)
 #define NODE_THRESHOLD		125
+#define IDLE_REBALANCE_TICK 	(HZ/1000 ?: 1)
+#define BUSY_REBALANCE_TICK 	(HZ/5 ?: 1)
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -162,15 +158,13 @@ struct runqueue {
 	task_t *curr, *idle;
 	struct mm_struct *prev_mm;
 	prio_array_t *active, *expired, arrays[2];
-	int prev_cpu_load[NR_CPUS];
-#ifdef CONFIG_NUMA
-	atomic_t *node_nr_running;
+	int id, prev_cpu_load[NR_CPUS];
+	struct node_t *node;
 	int prev_node_load[MAX_NUMNODES];
-#endif
 	task_t *migration_thread;
 	struct list_head migration_queue;
-
 	atomic_t nr_iowait;
+	runqueue_t *next;
 };
 
 static DEFINE_PER_CPU(struct runqueue, runqueues);
@@ -189,51 +183,177 @@ static DEFINE_PER_CPU(struct runqueue, r
 # define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
 # define task_running(rq, p)		((rq)->curr == (p))
 #endif
+ 
+/*
+ * This is our NUMA node struct, used heavily for load balance.
+ * Nodes may contain lists of runqueues or child_nodes.  Nodes
+ * also point to their parent_node.  These pointers allow 
+ * an arbitrary level of nested nodes.  The scheduler code should
+ * use these structs, not simple NUMA topology when interpreting 
+ * topology.  Simple NUMA topology should only be used to populate
+ * these structs. 
+ */
+struct node_t {
+	int id, idle_rebalance_tick, busy_rebalance_tick, nr_cpus;
+	struct node_t *parent_node, *child_nodes;
+	struct runqueue *runqueues;
+	atomic_t nr_running;
+	struct node_t *next;
+} ____cacheline_aligned;
+
+static struct node_t nodes[MAX_NUMNODES] __cacheline_aligned = {
+	[0 ...MAX_NUMNODES-1].parent_node = NULL,
+	[0 ...MAX_NUMNODES-1].child_nodes = NULL,
+	[0 ...MAX_NUMNODES-1].runqueues = NULL,
+	[0 ...MAX_NUMNODES-1].nr_running = ATOMIC_INIT(0),
+	[0 ...MAX_NUMNODES-1].nr_cpus = 0,
+	[0 ...MAX_NUMNODES-1].busy_rebalance_tick = BUSY_REBALANCE_TICK,
+	[0 ...MAX_NUMNODES-1].idle_rebalance_tick = IDLE_REBALANCE_TICK
+};
 
-#ifdef CONFIG_NUMA
+#define nodeptr(node)		(nodes + (node))
 
-/*
- * Keep track of running tasks.
+/* 
+ * The last node is always the highest level node
  */
+#define top_node		(nodes + MAX_NUMNODES -1)
 
-static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp =
-	{[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};
 
-static inline void nr_running_init(struct runqueue *rq)
-{
-	rq->node_nr_running = &node_nr_running[0];
-}
+/*
+ * Keep track of running tasks. Update nr_running for 
+ * rq->node and all parent nodes
+ */
 
 static inline void nr_running_inc(runqueue_t *rq)
 {
-	atomic_inc(rq->node_nr_running);
+	struct node_t *node = rq->node;
+	while (node) {
+		atomic_inc(&node->nr_running);
+		node = node->parent_node;
+	}
 	rq->nr_running++;
 }
 
 static inline void nr_running_dec(runqueue_t *rq)
 {
-	atomic_dec(rq->node_nr_running);
+	struct node_t *node = rq->node;
+	while (node) {
+		atomic_dec(&node->nr_running);
+		node = node->parent_node;
+	}
 	rq->nr_running--;
 }
 
-__init void node_nr_running_init(void)
+void numa_sched_topology_report(void)
 {
-	int i;
 
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_possible(i))
-			cpu_rq(i)->node_nr_running =
-				&node_nr_running[cpu_to_node(i)];
+	struct node_t *m, *n, *o;
+	runqueue_t *rq;
+
+	printk("scheduler topology report:\n");
+	m = top_node;
+	while (m) {
+		n = m;
+		while (n) {
+			printk("node %i...\n", n->id);
+			printk("nr_running: %u\n", atomic_read(&n->nr_running));
+			printk("idle_rebalance_tick: %i\n", n->idle_rebalance_tick);
+			printk("busy_rebalance_tick: %i\n", n->busy_rebalance_tick);
+			printk("  has %i cpus\n", n->nr_cpus);
+			printk("  has these runqueues: ");
+			rq = n->runqueues;
+			while (rq) {
+				printk(" %i,", rq->id);
+				rq = rq->next;
+			}
+			printk("\n");
+
+			printk("  has these children:");
+			o = n->child_nodes;
+			while (o) {
+				printk(" %i,", o->id);
+				o = o->next;
+			}
+			printk("\n\n");
+			n = n->next;
+		}
+		m = m->child_nodes;
 	}
 }
 
-#else /* !CONFIG_NUMA */
+void map_rq_to_node(runqueue_t *rq, struct node_t *node)
+{
+	/* node to rq */
+	rq->next = node->runqueues;
+	node->runqueues = rq; 
+	node->nr_cpus++;
+	/* rq to node */
+	rq->node = node;
+}
 
-# define nr_running_init(rq)   do { } while (0)
-# define nr_running_inc(rq)    do { (rq)->nr_running++; } while (0)
-# define nr_running_dec(rq)    do { (rq)->nr_running--; } while (0)
+void map_node_to_node(struct node_t *child, struct node_t *parent)
+{
+		/* parent to child */
+		child->next = parent->child_nodes;
+		parent->child_nodes = child;
+		/* child to parent */
+		child->parent_node = parent;
+}
+
+/* 
+ * Make sure all valid runqueues point to a node.
+ */
+void rq_to_node_init(void)
+{
+	int i, node;
 
-#endif /* CONFIG_NUMA */
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
+		node = cpu_to_node(i);
+		if (node >= 0) {
+			if (!(cpu_rq(i)->node))
+				map_rq_to_node(cpu_rq(i), nodeptr(node));
+		}
+	}
+}
+
+/* 
+ * Setup the pointers between parent and child nodes 
+ * (use simple NUMA topology functions).
+ * Also setup parent node balance ticks
+ */
+void node_to_node_init(void)
+{
+	int i, node;
+	struct node_t *m, *n;
+
+	/* Setup the hierarchy of nodes */
+	m = top_node;
+	while (m) {
+		n = m;
+		while (n) {
+			for (i = 0; i < MAX_NUMNODES; i++) {
+				node = parent_node(i);
+				if ((n->id == node) && (i != node))
+					map_node_to_node(nodeptr(i), nodeptr(node));
+			}
+			n = n->next;
+		}
+		m = m->child_nodes;
+	}
+	/* Setup the balance intervals */
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		n = nodeptr(i)->child_nodes;
+		if (n) {
+			nodeptr(i)->idle_rebalance_tick = 
+				n->idle_rebalance_tick * IDLE_NODE_BALANCE_INTERVAL;
+			nodeptr(i)->busy_rebalance_tick = 
+				n->busy_rebalance_tick * BUSY_NODE_BALANCE_INTERVAL;
+		}
+	}
+	top_node->nr_cpus = 16;
+}
 
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
@@ -688,12 +808,7 @@ static inline task_t * context_switch(ru
  */
 unsigned long nr_running(void)
 {
-	unsigned long i, sum = 0;
-
-	for (i = 0; i < NR_CPUS; i++)
-		sum += cpu_rq(i)->nr_running;
-
-	return sum;
+	return atomic_read(&top_node->nr_running);
 }
 
 unsigned long nr_uninterruptible(void)
@@ -766,7 +881,6 @@ static inline void double_rq_unlock(runq
 		spin_unlock(&rq2->lock);
 }
 
-#ifdef CONFIG_NUMA
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
@@ -793,37 +907,46 @@ static void sched_migrate_task(task_t *p
  */
 static int sched_best_cpu(struct task_struct *p)
 {
-	int i, minload, load, best_cpu, node = 0;
-	cpumask_t cpumask;
+	int minload, load, best_cpu;
+	struct node_t *node, *best_node;
+	runqueue_t *rq;
 
 	best_cpu = task_cpu(p);
 	if (cpu_rq(best_cpu)->nr_running <= 2)
 		return best_cpu;
 
 	minload = 10000000;
-	for_each_node_with_cpus(i) {
-		/*
-		 * Node load is always divided by nr_cpus_node to normalise 
-		 * load values in case cpu count differs from node to node.
-		 * We first multiply node_nr_running by 10 to get a little
-		 * better resolution.   
-		 */
-		load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i);
-		if (load < minload) {
-			minload = load;
-			node = i;
+	node = top_node;
+	while (node->child_nodes) {
+		node = node->child_nodes;
+		best_node = node->child_nodes;
+		while (node) {
+			/*
+			 * Node load is always divided by nr_cpus_node to normalise 
+			 * load values in case cpu count differs from node to node.
+			 * We first multiply node_nr_running by 10 to get a little
+			 * better resolution.   
+			 */
+			if (node->nr_cpus) {
+				load = 10 * atomic_read(&node->nr_running)
+				/ node->nr_cpus;
+				if (load < minload) {
+					minload = load;
+					best_node = node;
+				}	
+			}
+			node = node->next;
 		}
+		node = best_node;
 	}
-
 	minload = 10000000;
-	cpumask = node_to_cpumask(node);
-	for (i = 0; i < NR_CPUS; ++i) {
-		if (!cpu_isset(i, cpumask))
-			continue;
-		if (cpu_rq(i)->nr_running < minload) {
-			best_cpu = i;
-			minload = cpu_rq(i)->nr_running;
+	rq = node->runqueues;
+	while (rq) {
+		if (rq->nr_running < minload) {
+			minload = rq->nr_running;
+			best_cpu = rq->id;
 		}
+		rq = rq->next;
 	}
 	return best_cpu;
 }
@@ -848,33 +971,38 @@ void sched_balance_exec(void)
  * of different cpu count but also [first] multiplied by 10 to 
  * provide better resolution.
  */
-static int find_busiest_node(int this_node)
+static struct node_t *find_busiest_node(struct node_t *this_node, struct node_t *parent_node)
 {
-	int i, node = -1, load, this_load, maxload;
+	int load, this_load, maxload = 0;
+	struct node_t *node, *busiest_node = NULL;
 
-	if (!nr_cpus_node(this_node))
-		return node;
-	this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1)
-		+ (10 * atomic_read(&node_nr_running[this_node])
-		/ nr_cpus_node(this_node));
-	this_rq()->prev_node_load[this_node] = this_load;
-	for_each_node_with_cpus(i) {
-		if (i == this_node)
+	if (!(this_node->nr_cpus))
+		return busiest_node;
+	this_load = maxload = (this_rq()->prev_node_load[this_node->id] >> 1)
+		+ (10 * atomic_read(&this_node->nr_running))
+		/ this_node->nr_cpus;
+	this_rq()->prev_node_load[this_node->id] = this_load;
+
+	/* cycle through the list of member nodes */
+	node = parent_node->child_nodes;
+	while (node) {
+		if (!(node->nr_cpus) || (node == this_node)) {
+			node = node->next;
 			continue;
-		load = (this_rq()->prev_node_load[i] >> 1)
-			+ (10 * atomic_read(&node_nr_running[i])
-			/ nr_cpus_node(i));
-		this_rq()->prev_node_load[i] = load;
+		}
+		load = (this_rq()->prev_node_load[node->id] >> 1)
+			+ (10 * atomic_read(&node->nr_running))
+			/ node->nr_cpus;
+		this_rq()->prev_node_load[node->id] = load;
 		if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) {
 			maxload = load;
-			node = i;
+			busiest_node = node;
 		}
+		node = node->next;
 	}
-	return node;
+	return busiest_node;
 }
 
-#endif /* CONFIG_NUMA */
-
 #ifdef CONFIG_SMP
 
 /*
@@ -905,10 +1033,11 @@ static inline unsigned int double_lock_b
 /*
  * find_busiest_queue - find the busiest runqueue among the cpus in cpumask.
  */
-static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance, cpumask_t cpumask)
+static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle , int *imbalance, struct node_t *node)
 {
-	int nr_running, load, max_load, i;
+	int nr_running, load, max_load, cpu_id;
 	runqueue_t *busiest, *rq_src;
+	struct node_t *m, *n;
 
 	/*
 	 * We search all runqueues to find the most busy one.
@@ -939,21 +1068,31 @@ static inline runqueue_t *find_busiest_q
 
 	busiest = NULL;
 	max_load = 1;
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_isset(i, cpumask))
-			continue;
-
-		rq_src = cpu_rq(i);
-		if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i]))
-			load = rq_src->nr_running;
-		else
-			load = this_rq->prev_cpu_load[i];
-		this_rq->prev_cpu_load[i] = rq_src->nr_running;
-
-		if ((load > max_load) && (rq_src != this_rq)) {
-			busiest = rq_src;
-			max_load = load;
+	if (nr_running > 1)
+		max_load = nr_running;
+	n = node;
+	while (n) {
+		m = n;
+		while (m) {
+			rq_src = m->runqueues;
+			while (rq_src) {
+				cpu_id = rq_src->id;
+				if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[cpu_id]))
+					load = rq_src->nr_running;
+				else
+					load = this_rq->prev_cpu_load[cpu_id];
+				this_rq->prev_cpu_load[cpu_id] = rq_src->nr_running;
+				load = rq_src->nr_running;
+
+				if ((load > max_load) && (rq_src != this_rq)) {
+					busiest = rq_src;
+					max_load = load;
+				}
+			rq_src = rq_src->next;
+			}
+			m = m->next;
 		}
+		n = n->child_nodes;
 	}
 
 	if (likely(!busiest))
@@ -1012,7 +1151,7 @@ static inline void pull_task(runqueue_t 
  * We call this with the current runqueue locked,
  * irqs disabled.
  */
-static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask)
+static void load_balance(runqueue_t *this_rq, int idle, struct node_t *node)
 {
 	int imbalance, idx, this_cpu = smp_processor_id();
 	runqueue_t *busiest;
@@ -1020,7 +1159,7 @@ static void load_balance(runqueue_t *thi
 	struct list_head *head, *curr;
 	task_t *tmp;
 
-	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
+	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, node);
 	if (!busiest)
 		goto out;
 
@@ -1094,67 +1233,46 @@ out:
  * get called every timer tick, on every CPU. Our balancing action
  * frequency and balancing agressivity depends on whether the CPU is
  * idle or not.
- *
- * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on
- * systems with HZ=100, every 10 msecs.)
- *
- * On NUMA, do a node-rebalance every 400 msecs.
  */
-#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
-#define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
-#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5)
-#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
 
-#ifdef CONFIG_NUMA
-static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
+static void balance_node(runqueue_t *this_rq, int idle, struct node_t *last_node, struct node_t *node)
 {
-	int node = find_busiest_node(cpu_to_node(this_cpu));
+	if (last_node)
+		node = find_busiest_node(last_node, node);
 
-	if (node >= 0) {
-		cpumask_t cpumask = node_to_cpumask(node);
-		cpu_set(this_cpu, cpumask);
+	if (node) {
 		spin_lock(&this_rq->lock);
-		load_balance(this_rq, idle, cpumask);
+		load_balance(this_rq, idle, node);
 		spin_unlock(&this_rq->lock);
 	}
 }
-#endif
 
 static void rebalance_tick(runqueue_t *this_rq, int idle)
 {
-#ifdef CONFIG_NUMA
-	int this_cpu = smp_processor_id();
-#endif
 	unsigned long j = jiffies;
+	struct node_t *node = this_rq->node, *last_node = NULL;
 
 	/*
-	 * First do inter-node rebalancing, then intra-node rebalancing,
-	 * if both events happen in the same tick. The inter-node
-	 * rebalancing does not necessarily have to create a perfect
-	 * balance within the node, since we load-balance the most loaded
-	 * node with the current CPU. (ie. other CPUs in the local node
-	 * are not balanced.)
+	 * We start with the lowest-level node (the whole system on non-NUMA),
+	 * then loop with node = node->parent_node and see if it's time for
+	 * a balance.  Each time around the loop, the scope of cpus to
+	 * balance gets bigger.
 	 */
+
 	if (idle) {
-#ifdef CONFIG_NUMA
-		if (!(j % IDLE_NODE_REBALANCE_TICK))
-			balance_node(this_rq, idle, this_cpu);
-#endif
-		if (!(j % IDLE_REBALANCE_TICK)) {
-			spin_lock(&this_rq->lock);
-			load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
-			spin_unlock(&this_rq->lock);
+		while (node) {
+			if (!(j % node->idle_rebalance_tick))
+				balance_node(this_rq, idle, last_node, node);
+			last_node = node;
+			node = node->parent_node;
 		}
 		return;
 	}
-#ifdef CONFIG_NUMA
-	if (!(j % BUSY_NODE_REBALANCE_TICK))
-		balance_node(this_rq, idle, this_cpu);
-#endif
-	if (!(j % BUSY_REBALANCE_TICK)) {
-		spin_lock(&this_rq->lock);
-		load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
-		spin_unlock(&this_rq->lock);
+	while (node) {
+		if (!(j % node->busy_rebalance_tick))
+			balance_node(this_rq, idle, last_node, node);
+		last_node = node;
+		node = node->parent_node;
 	}
 }
 #else
@@ -1331,7 +1449,8 @@ need_resched:
 pick_next_task:
 	if (unlikely(!rq->nr_running)) {
 #ifdef CONFIG_SMP
-		load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
+		if (rq->node)
+			load_balance(rq, 1, rq->node);
 		if (rq->nr_running)
 			goto pick_next_task;
 #endif
@@ -2516,16 +2635,18 @@ void __init sched_init(void)
 
 	/* Init the kstat counters */
 	init_kstat();
+	for (i = 0; i < MAX_NUMNODES; i++)
+		nodeptr(i)->id = i;
 	for (i = 0; i < NR_CPUS; i++) {
 		prio_array_t *array;
 
 		rq = cpu_rq(i);
+		rq->id = i;
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
 		spin_lock_init(&rq->lock);
 		INIT_LIST_HEAD(&rq->migration_queue);
 		atomic_set(&rq->nr_iowait, 0);
-		nr_running_init(rq);
 
 		for (j = 0; j < 2; j++) {
 			array = rq->arrays + j;


* Re: [Lse-tech] Re: [patch] scheduler fix for 1cpu/node case
  2003-08-22 15:46           ` [Lse-tech] " Andrew Theurer
@ 2003-08-22 22:56             ` Nick Piggin
  2003-08-23  0:12               ` Andrew Theurer
  0 siblings, 1 reply; 25+ messages in thread
From: Nick Piggin @ 2003-08-22 22:56 UTC (permalink / raw)
  To: Andrew Theurer
  Cc: Bill Davidsen, Martin J. Bligh, Erich Focht, linux-kernel, LSE,
	Andi Kleen, torvalds



Andrew Theurer wrote:

>On Wednesday 13 August 2003 15:49, Bill Davidsen wrote:
>
>>On Mon, 28 Jul 2003, Andrew Theurer wrote:
>>
>>>Personally, I'd like to see all systems use NUMA sched, non NUMA systems
>>>being a single node (no policy difference from non-numa sched), allowing
>>>us to remove all NUMA ifdefs.  I think the code would be much more
>>>readable.
>>>
>>That sounds like a great idea, but I'm not sure it could be realized short
>>of a major rewrite. Look how hard Ingo and Con are working just to get a
>>single node doing a good job with interactive and throughput tradeoffs.
>>
>
>Actually it's not too bad.  Attached is a patch to do it.  It also does 
>multi-level node support and makes all the load balance routines 
>runqueue-centric instead of cpu-centric, so adding something like shared 
>runqueues (for HT) should be really easy.  Hmm, other things: inter-node 
>balance intervals are now arch specific (AMD is "1").  The default busy/idle 
>balance timers of 200/1 are not arch specific, but I'm thinking they should 
>be.  And for non-numa, the scheduling policy is the same as it was with 
>vanilla O(1). 
>

I'm not saying you're wrong, but do you have some numbers where this
helps? i.e. two architectures that need very different balance numbers.
And what is the reason for making AMD's balance interval 1?

Also, things like nr_running_inc are supposed to be very fast. I am
a bit worried to see a loop and CPU shared atomics in there.

node_2_node is an odd sounding conversion too ;)

BTW. you should be CC'ing Ingo if you have any intention of scheduler
stuff getting into 2.6.




* Re: [Lse-tech] Re: [patch] scheduler fix for 1cpu/node case
  2003-08-22 22:56             ` Nick Piggin
@ 2003-08-23  0:12               ` Andrew Theurer
  2003-08-23  0:29                 ` Nick Piggin
  2003-08-23  1:31                 ` Martin J. Bligh
  0 siblings, 2 replies; 25+ messages in thread
From: Andrew Theurer @ 2003-08-23  0:12 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Bill Davidsen, Martin J. Bligh, Erich Focht, linux-kernel, LSE,
	Andi Kleen, torvalds, mingo

On Friday 22 August 2003 17:56, Nick Piggin wrote:
> Andrew Theurer wrote:
> >On Wednesday 13 August 2003 15:49, Bill Davidsen wrote:
> >>On Mon, 28 Jul 2003, Andrew Theurer wrote:
> >>>Personally, I'd like to see all systems use NUMA sched, non NUMA systems
> >>>being a single node (no policy difference from non-numa sched), allowing
> >>>us to remove all NUMA ifdefs.  I think the code would be much more
> >>>readable.
> >>
> >>That sounds like a great idea, but I'm not sure it could be realized
> >> short of a major rewrite. Look how hard Ingo and Con are working just to
> >> get a single node doing a good job with interactive and throughput
> >> tradeoffs.
> >
> >Actually it's not too bad.  Attached is a patch to do it.  It also does
> >multi-level node support and makes all the load balance routines
> >runqueue-centric instead of cpu-centric, so adding something like shared
> >runqueues (for HT) should be really easy.  Hmm, other things: inter-node
> >balance intervals are now arch specific (AMD is "1").  The default
> > busy/idle balance timers of 200/1 are not arch specific, but I'm thinking
> > they should be.  And for non-numa, the scheduling policy is the same as
> > it was with vanilla O(1).
>
> I'm not saying you're wrong, but do you have some numbers where this
> helps? ie. two architectures that need very different balance numbers.
> And what is the reason for making AMD's balance interval 1?

AMD is 1 because there's no need to balance within a node, so I want the 
inter-node balance frequency to be as often as it was with just O(1).  This 
interval would not work well with other NUMA boxes, so that's the main reason 
to have arch specific intervals.  And, as a general guideline, boxes with 
different local-remote latency ratios will probably benefit from different 
inter-node balance intervals.  I don't know what these ratios are, but I'd 
like the kernel to have the ability to change for one arch and not affect 
another.
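
Concretely the knob is just a pair of defines in each arch's topology.h, with
generic fallbacks when an arch says nothing (values as in the patch I posted
earlier in the thread):

/* include/asm-x86_64/topology.h: 1 cpu per node, so cross-node balancing
 * has to run as often as intra-node balancing normally would. */
#define IDLE_NODE_BALANCE_INTERVAL	1
#define BUSY_NODE_BALANCE_INTERVAL	1

/* include/asm-generic/topology.h: defaults for archs that don't care. */
#ifndef IDLE_NODE_BALANCE_INTERVAL
#define IDLE_NODE_BALANCE_INTERVAL	2
#endif
#ifndef BUSY_NODE_BALANCE_INTERVAL
#define BUSY_NODE_BALANCE_INTERVAL	5
#endif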

> Also, things like nr_running_inc are supposed to be very fast. I am
> a bit worried to see a loop and CPU shared atomics in there.

That has concerned me, too.  So far I haven't been able to see a measurable 
difference either way (within noise level), but it's possible.  The other 
alternative is to sum up node load at sched_best_cpu and find_busiest_node.
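
Roughly, that alternative would look like this (untested sketch;
node_nr_running_slow is a made-up name, and it just walks the node_t lists
from the numasched patch instead of keeping per-node atomics up to date):

/* Sum a node's load on demand instead of maintaining it with
 * atomic_inc/dec in the wakeup/schedule fast paths. */
static unsigned long node_nr_running_slow(struct node_t *node)
{
	struct node_t *child;
	runqueue_t *rq;
	unsigned long sum = 0;

	for (rq = node->runqueues; rq; rq = rq->next)
		sum += rq->nr_running;
	for (child = node->child_nodes; child; child = child->next)
		sum += node_nr_running_slow(child);
	return sum;
}

The trade-off is the obvious one: the fast path gets cheaper, but
find_busiest_node and sched_best_cpu then touch a bunch of remote runqueues
every time they are called.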

> node_2_node is an odd sounding conversion too ;)

I just went off the topology already there, so I left it.

> BTW. you should be CC'ing Ingo if you have any intention of scheduler
> stuff getting into 2.6.

OK, thanks!



* Re: [Lse-tech] Re: [patch] scheduler fix for 1cpu/node case
  2003-08-23  0:12               ` Andrew Theurer
@ 2003-08-23  0:29                 ` Nick Piggin
  2003-08-23  0:47                   ` William Lee Irwin III
  2003-08-23 14:32                   ` Andrew Theurer
  2003-08-23  1:31                 ` Martin J. Bligh
  1 sibling, 2 replies; 25+ messages in thread
From: Nick Piggin @ 2003-08-23  0:29 UTC (permalink / raw)
  To: habanero
  Cc: Bill Davidsen, Martin J. Bligh, Erich Focht, linux-kernel, LSE,
	Andi Kleen, torvalds, mingo



Andrew Theurer wrote:

>On Friday 22 August 2003 17:56, Nick Piggin wrote:
>
>>Andrew Theurer wrote:
>>
>>>On Wednesday 13 August 2003 15:49, Bill Davidsen wrote:
>>>
>>>>On Mon, 28 Jul 2003, Andrew Theurer wrote:
>>>>
>>>>>Personally, I'd like to see all systems use NUMA sched, non NUMA systems
>>>>>being a single node (no policy difference from non-numa sched), allowing
>>>>>us to remove all NUMA ifdefs.  I think the code would be much more
>>>>>readable.
>>>>>
>>>>That sounds like a great idea, but I'm not sure it could be realized
>>>>short of a major rewrite. Look how hard Ingo and Con are working just to
>>>>get a single node doing a good job with interactive and throughput
>>>>tradeoffs.
>>>>
>>>Actually it's not too bad.  Attached is a patch to do it.  It also does
>>>multi-level node support and makes all the load balance routines
>>>runqueue-centric instead of cpu-centric, so adding something like shared
>>>runqueues (for HT) should be really easy.  Hmm, other things: inter-node
>>>balance intervals are now arch specific (AMD is "1").  The default
>>>busy/idle balance timers of 200/1 are not arch specific, but I'm thinking
>>>they should be.  And for non-numa, the scheduling policy is the same as
>>>it was with vanilla O(1).
>>>
>>I'm not saying you're wrong, but do you have some numbers where this
>>helps? ie. two architectures that need very different balance numbers.
>>And what is the reason for making AMD's balance interval 1?
>>
>
>AMD is 1 because there's no need to balance within a node, so I want the 
>inter-node balance frequency to be as often as it was with just O(1).  This 
>interval would not work well with other NUMA boxes, so that's the main reason 
>to have arch specific intervals.
>

OK, I misread the patch. IIRC AMD has 1 CPU per node? If so, why doesn't
this simply prevent balancing within a node?

>  And, as a general guideline, boxes with 
>different local-remote latency ratios will probably benefit from different 
>inter-node balance intervals.  I don't know what these ratios are, but I'd 
>like the kernel to have the ability to change for one arch and not affect 
>another.
>

I fully appreciate there are huge differences... I am curious if
you can see much improvement in practice.

>
>>Also, things like nr_running_inc are supposed to be very fast. I am
>>a bit worried to see a loop and CPU shared atomics in there.
>>
>
>That has concerned me, too.  So far I haven't been able to see a measurable 
>difference either way (within noise level), but it's possible.  The other 
>alternative is to sum up node load at sched_best_cpu and find_busiest_node.
>

Hmm... get someone to try the scheduler benchmarks on a 32 way box ;)

>
>>node_2_node is an odd sounding conversion too ;)
>>
>
>I just went off the topology already there, so I left it.
>
>
>>BTW. you should be CC'ing Ingo if you have any intention of scheduler
>>stuff getting into 2.6.
>>
>
>OK, thanks!
>
>

Good luck with it. Definitely some good ideas.




* Re: [Lse-tech] Re: [patch] scheduler fix for 1cpu/node case
  2003-08-23  0:29                 ` Nick Piggin
@ 2003-08-23  0:47                   ` William Lee Irwin III
  2003-08-23  8:48                     ` Nick Piggin
  2003-08-23 14:32                   ` Andrew Theurer
  1 sibling, 1 reply; 25+ messages in thread
From: William Lee Irwin III @ 2003-08-23  0:47 UTC (permalink / raw)
  To: Nick Piggin
  Cc: habanero, Bill Davidsen, Martin J. Bligh, Erich Focht,
	linux-kernel, LSE, Andi Kleen, torvalds, mingo

On Sat, Aug 23, 2003 at 10:29:21AM +1000, Nick Piggin wrote:
> Hmm... get someone to try the scheduler benchmarks on a 32 way box ;)

What scheduler benchmark?


-- wli


* Re: [Lse-tech] Re: [patch] scheduler fix for 1cpu/node case
  2003-08-23  0:12               ` Andrew Theurer
  2003-08-23  0:29                 ` Nick Piggin
@ 2003-08-23  1:31                 ` Martin J. Bligh
  1 sibling, 0 replies; 25+ messages in thread
From: Martin J. Bligh @ 2003-08-23  1:31 UTC (permalink / raw)
  To: habanero, Nick Piggin, colpatch
  Cc: Erich Focht, linux-kernel, LSE, Andi Kleen

>> node_2_node is an odd sounding conversion too ;)
> 
> I just went off the topology already there, so I left it.

I thought we changed that to parent_node or something a while back?
Yes, node_2_node is rather nondescriptive ... 

Matt?

M.


* Re: [Lse-tech] Re: [patch] scheduler fix for 1cpu/node case
  2003-08-23  0:47                   ` William Lee Irwin III
@ 2003-08-23  8:48                     ` Nick Piggin
  0 siblings, 0 replies; 25+ messages in thread
From: Nick Piggin @ 2003-08-23  8:48 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: habanero, Bill Davidsen, Martin J. Bligh, Erich Focht,
	linux-kernel, LSE, Andi Kleen, torvalds, mingo



William Lee Irwin III wrote:

>On Sat, Aug 23, 2003 at 10:29:21AM +1000, Nick Piggin wrote:
>
>>Hmm... get someone to try the scheduler benchmarks on a 32 way box ;)
>>
>
>What scheduler benchmark?
>
>

I don't know! I don't care about high end scheduler performance.
Volanomark? Kernbench? SDET? Whatever you care about.




* Re: [Lse-tech] Re: [patch] scheduler fix for 1cpu/node case
  2003-08-23  0:29                 ` Nick Piggin
  2003-08-23  0:47                   ` William Lee Irwin III
@ 2003-08-23 14:32                   ` Andrew Theurer
  1 sibling, 0 replies; 25+ messages in thread
From: Andrew Theurer @ 2003-08-23 14:32 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Bill Davidsen, Martin J. Bligh, Erich Focht, linux-kernel, LSE,
	Andi Kleen, torvalds, mingo

> >AMD is 1 because there's no need to balance within a node, so I want the
> >inter-node balance frequency to be as often as it was with just O(1). 
> > This interval would not work well with other NUMA boxes, so that's the
> > main reason to have arch specific intervals.
>
> OK, I misread the patch. IIRC AMD has 1 CPU per node? If so, why doesn't
> this simply prevent balancing within a node?

Yes, one cpu/node.  Oh, it does prevent it, but with the current intervals we
end up not really balancing as often (since we now need an inter-node balance),
and when we call load_balance in schedule() when idle, we don't balance at all,
since it's only a node-local balance.
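
That is exactly what the second patch changes: when the runqueue is empty,
schedule() now climbs to the first node that spans more than one cpu before
calling load_balance.  The hunk, roughly:

	if (unlikely(!rq->nr_running)) {
		struct node_t *node = rq->node;

		/* a 1-cpu node has nobody to steal from locally, so widen
		 * the scope until the node covers at least two cpus */
		while (node->nr_cpus < 2 && node != top_node)
			node = node->parent_node;
		load_balance(rq, 1, node);
	}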

> >  And, as a general guideline, boxes with
> >different local-remote latency ratios will probably benefit from different
> >inter-node balance intervals.  I don't know what these ratios are, but I'd
> >like the kernel to have the ability to change for one arch and not affect
> >another.
>
> I fully appreciate there are huge differences... I am curious if
> you can see much improvement in practice.

I think AMD would be the first good test.  Maybe Andi has some results on 
numasched vs O(1), which would be a good indication.


end of thread, other threads:[~2003-08-23 16:57 UTC | newest]

Thread overview: 25+ messages
2003-07-28 19:16 [patch] scheduler fix for 1cpu/node case Erich Focht
2003-07-28 19:55 ` Martin J. Bligh
2003-07-28 20:18   ` Erich Focht
2003-07-28 20:37     ` Martin J. Bligh
2003-07-29  2:24       ` Andrew Theurer
2003-07-29 10:08         ` Erich Focht
2003-07-29 13:33           ` [Lse-tech] " Andrew Theurer
2003-07-30 15:23             ` Erich Focht
2003-07-30 15:44               ` Andrew Theurer
2003-07-29 14:27           ` Martin J. Bligh
2003-08-13 20:49         ` Bill Davidsen
2003-08-22 15:46           ` [Lse-tech] " Andrew Theurer
2003-08-22 22:56             ` Nick Piggin
2003-08-23  0:12               ` Andrew Theurer
2003-08-23  0:29                 ` Nick Piggin
2003-08-23  0:47                   ` William Lee Irwin III
2003-08-23  8:48                     ` Nick Piggin
2003-08-23 14:32                   ` Andrew Theurer
2003-08-23  1:31                 ` Martin J. Bligh
2003-07-29 10:08       ` Erich Focht
2003-07-29 14:41     ` Andi Kleen
2003-07-31 15:05 ` Martin J. Bligh
2003-07-31 21:45   ` Erich Focht
2003-08-01  0:26     ` Martin J. Bligh
2003-08-01 16:30       ` [Lse-tech] " Erich Focht
