linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch] scheduler fix for 1cpu/node case
@ 2003-07-28 19:16 Erich Focht
  2003-07-28 19:55 ` Martin J. Bligh
  2003-07-31 15:05 ` Martin J. Bligh
  0 siblings, 2 replies; 29+ messages in thread
From: Erich Focht @ 2003-07-28 19:16 UTC (permalink / raw)
  To: linux-kernel, LSE; +Cc: Martin J. Bligh, Andi Kleen, torvalds

[-- Attachment #1: Type: text/plain, Size: 1686 bytes --]

Hi,

after talking to several people at OLS about the current NUMA
scheduler the conclusion was:
(1) it sucks (for particular workloads),
(2) on x86_64 (embarrassingly simple NUMA) it's useless, goto (1).

Fact is that the current separation of local and global balancing,
where global balancing is done only in the timer interrupt at a fixed
rate is far too inflexible. A CPU going idle inside a well-balanced
node will stay idle for a while even if there's a lot of work to
do. Especially in the corner case of one CPU per node this is
condemning that CPU to idleness for at least 5 ms. So x86_64 platforms
(but not only those!) suffer and wish to switch off the NUMA
scheduler while keeping NUMA memory management on.

The attached patch is a simple solution which
- solves the 1 CPU / node problem,
- lets other systems behave (almost) as before,
- opens the way to other optimisations like multi-level node
  hierarchies (by tuning the retry rate)
- simplifies the NUMA scheduler and deletes more lines of code than it
  adds.

The timer-interrupt-based global rebalancing might appear to be a
simple and good idea, but it deprives the scheduler of a lot of
flexibility. In the patch the global rebalancing is done after a
certain number of failed attempts to locally balance. The number of
attempts is proportional to the number of CPUs in the current
node. For only 1 CPU in the current node the scheduler doesn't even
try to balance locally, it wouldn't make sense anyway. Of course one
could instead set IDLE_NODE_REBALANCE_TICK = IDLE_REBALANCE_TICK, but
this is more ugly (IMHO) and only helps when all nodes have 1 CPU /
node.

Please consider this for inclusion.

Thanks,
Erich




[-- Attachment #2: 1cpufix-lb-2.6.0t1.patch --]
[-- Type: text/x-diff, Size: 4255 bytes --]

diff -urNp 2.6.0t1/kernel/sched.c 2.6.0t1-1cpufix/kernel/sched.c
--- 2.6.0t1/kernel/sched.c	2003-07-14 05:37:14.000000000 +0200
+++ 2.6.0t1-1cpufix/kernel/sched.c	2003-07-28 05:32:28.000000000 +0200
@@ -164,6 +164,7 @@ struct runqueue {
 	prio_array_t *active, *expired, arrays[2];
 	int prev_cpu_load[NR_CPUS];
 #ifdef CONFIG_NUMA
+	unsigned int nr_lb_failed;
 	atomic_t *node_nr_running;
 	int prev_node_load[MAX_NUMNODES];
 #endif
@@ -856,6 +857,35 @@ static int find_busiest_node(int this_no
 	return node;
 }
 
+/*
+ * Decide whether the scheduler should balance locally (inside the same node)
+ * or globally depending on the number of failed local balance attempts.
+ * The number of failed local balance attempts depends on the number of cpus
+ * in the current node. In case it's just one, go immediately for global
+ * balancing. On a busy cpu the number of retries is smaller.
+ */
+static inline unsigned long cpus_to_balance(int this_cpu, runqueue_t *this_rq)
+{
+	int node, retries, this_node = cpu_to_node(this_cpu);
+
+	retries = nr_cpus_node(this_node) - 1;
+	if (this_rq->curr != this_rq->idle)
+		retries >>= 1;
+	if (this_rq->nr_lb_failed >= retries) {
+		node = find_busiest_node(this_node);
+		this_rq->nr_lb_failed = 0;
+		if (node >= 0)
+			return (node_to_cpu_mask(node) | (1UL << this_cpu));
+	}
+	return node_to_cpu_mask(this_node);
+}
+
+#else /* !CONFIG_NUMA */
+
+static inline unsigned long cpus_to_balance(int this_cpu, runqueue_t *this_rq)
+{
+	return cpu_online_map;
+}
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_SMP
@@ -960,6 +990,12 @@ static inline runqueue_t *find_busiest_q
 		busiest = NULL;
 	}
 out:
+#ifdef CONFIG_NUMA
+	if (!busiest)
+		this_rq->nr_lb_failed++;
+	else
+		this_rq->nr_lb_failed = 0;
+#endif
 	return busiest;
 }
 
@@ -995,7 +1031,7 @@ static inline void pull_task(runqueue_t 
  * We call this with the current runqueue locked,
  * irqs disabled.
  */
-static void load_balance(runqueue_t *this_rq, int idle, unsigned long cpumask)
+static void load_balance(runqueue_t *this_rq, int idle)
 {
 	int imbalance, idx, this_cpu = smp_processor_id();
 	runqueue_t *busiest;
@@ -1003,7 +1039,8 @@ static void load_balance(runqueue_t *thi
 	struct list_head *head, *curr;
 	task_t *tmp;
 
-	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
+	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance,
+				     cpus_to_balance(this_cpu, this_rq));
 	if (!busiest)
 		goto out;
 
@@ -1085,29 +1122,9 @@ out:
  */
 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
 #define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
-#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5)
-#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
-
-#ifdef CONFIG_NUMA
-static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
-{
-	int node = find_busiest_node(cpu_to_node(this_cpu));
-	unsigned long cpumask, this_cpumask = 1UL << this_cpu;
-
-	if (node >= 0) {
-		cpumask = node_to_cpumask(node) | this_cpumask;
-		spin_lock(&this_rq->lock);
-		load_balance(this_rq, idle, cpumask);
-		spin_unlock(&this_rq->lock);
-	}
-}
-#endif
 
 static void rebalance_tick(runqueue_t *this_rq, int idle)
 {
-#ifdef CONFIG_NUMA
-	int this_cpu = smp_processor_id();
-#endif
 	unsigned long j = jiffies;
 
 	/*
@@ -1119,24 +1136,16 @@ static void rebalance_tick(runqueue_t *t
 	 * are not balanced.)
 	 */
 	if (idle) {
-#ifdef CONFIG_NUMA
-		if (!(j % IDLE_NODE_REBALANCE_TICK))
-			balance_node(this_rq, idle, this_cpu);
-#endif
 		if (!(j % IDLE_REBALANCE_TICK)) {
 			spin_lock(&this_rq->lock);
-			load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
+			load_balance(this_rq, idle);
 			spin_unlock(&this_rq->lock);
 		}
 		return;
 	}
-#ifdef CONFIG_NUMA
-	if (!(j % BUSY_NODE_REBALANCE_TICK))
-		balance_node(this_rq, idle, this_cpu);
-#endif
 	if (!(j % BUSY_REBALANCE_TICK)) {
 		spin_lock(&this_rq->lock);
-		load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
+		load_balance(this_rq, idle);
 		spin_unlock(&this_rq->lock);
 	}
 }
@@ -1306,7 +1315,7 @@ need_resched:
 pick_next_task:
 	if (unlikely(!rq->nr_running)) {
 #ifdef CONFIG_SMP
-		load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
+		load_balance(rq, 1);
 		if (rq->nr_running)
 			goto pick_next_task;
 #endif

^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [Lse-tech] Re: [patch] scheduler fix for 1cpu/node case
@ 2003-07-29 14:06 Mala Anand
  2003-07-29 14:29 ` Martin J. Bligh
  0 siblings, 1 reply; 29+ messages in thread
From: Mala Anand @ 2003-07-29 14:06 UTC (permalink / raw)
  To: Erich Focht, Martin J. Bligh, linux-kernel, LSE; +Cc: Andi Kleen, torvalds






>> If you want data supporting my assumptions: Ted Ts'o's talk at OLS
>> shows the necessity to rebalance ASAP (even in try_to_wake_up).

>If this is the patch I am thinking of, it was the (attached) one I sent
them,
>which did a light "push" rebalance at try_to_wake_up.  Calling
load_balance
>at try_to_wake_up seems very heavy-weight.  This patch only looks for an
idle
>cpu (within the same node) to wake up on before task activation, only if
the
>task_rq(p)->nr_running is too long.  So, yes, I do believe this can be
>important, but I think it's only called for when we have an idle cpu.

The patch that you sent to Rajan didn't yield any improvement on
specjappserver so we did not include that in the ols paper. What
is described in the ols paper is "calling load-balance" from
try-to-wake-up. Both calling load-balance from try-to-wakeup and
the "light push" rebalance at try_to_wake_up are already done in
Andrea's O(1) scheduler patch.

Regards,
    Mala


   Mala Anand
   IBM Linux Technology Center - Kernel Performance
   E-mail:manand@us.ibm.com
   http://www-124.ibm.com/developerworks/opensource/linuxperf
   http://www-124.ibm.com/developerworks/projects/linuxperf
   Phone:838-8088; Tie-line:678-8088



^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [Lse-tech] Re: [patch] scheduler fix for 1cpu/node case
@ 2003-07-29 16:04 Mala Anand
  0 siblings, 0 replies; 29+ messages in thread
From: Mala Anand @ 2003-07-29 16:04 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Andi Kleen, Erich Focht, linux-kernel, LSE, lse-tech-admin,
	Mala Anand, torvalds





>Are the balances you're doing on wakeup global or node-local?
The test is not done on NUMA systems.

Regards,
    Mala


   Mala Anand
   IBM Linux Technology Center - Kernel Performance
   E-mail:manand@us.ibm.com
   http://www-124.ibm.com/developerworks/opensource/linuxperf
   http://www-124.ibm.com/developerworks/projects/linuxperf
   Phone:838-8088; Tie-line:678-8088




                                                                                                                                               
                      "Martin J. Bligh"                                                                                                        
                      <mbligh@aracnet.com>             To:       Mala Anand/Austin/IBM@IBMUS, Erich Focht <efocht@hpce.nec.com>, linux-kernel  
                      Sent by:                          <linux-kernel@vger.kernel.org>, LSE <lse-tech@lists.sourceforge.net>                   
                      lse-tech-admin@lists.sour        cc:       Andi Kleen <ak@muc.de>, torvalds@osdl.org                                     
                      ceforge.net                      Subject:  Re: [Lse-tech] Re: [patch] scheduler fix for 1cpu/node case                   
                                                                                                                                               
                                                                                                                                               
                      07/29/2003 09:29 AM                                                                                                      
                                                                                                                                               
                                                                                                                                               




>>> If you want data supporting my assumptions: Ted Ts'o's talk at OLS
>>> shows the necessity to rebalance ASAP (even in try_to_wake_up).
>
>> If this is the patch I am thinking of, it was the (attached) one I sent
> them,
>> which did a light "push" rebalance at try_to_wake_up.  Calling
> load_balance
>> at try_to_wake_up seems very heavy-weight.  This patch only looks for an
> idle
>> cpu (within the same node) to wake up on before task activation, only if
> the
>> task_rq(p)->nr_running is too long.  So, yes, I do believe this can be
>> important, but I think it's only called for when we have an idle cpu.
>
> The patch that you sent to Rajan didn't yield any improvement on
> specjappserver so we did not include that  in the ols paper. What
> is described in the ols paper is "calling load-balance" from
> try-to-wake-up. Both calling load-balance from try-to-wakeup and
> the "light push" rebalance at try_to_wake_up are already done in
> Andrea's O(1) scheduler patch.

Are the balances you're doing on wakeup global or node-local?

M.



-------------------------------------------------------
This SF.Net email sponsored by: Free pre-built ASP.NET sites including
Data Reports, E-commerce, Portals, and Forums are available now.
Download today and enter to win an XBOX or Visual Studio .NET.
http://aspnet.click-url.com/go/psa00100003ave/direct;at.aspnet_072303_01/01
_______________________________________________
Lse-tech mailing list
Lse-tech@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/lse-tech




^ permalink raw reply	[flat|nested] 29+ messages in thread
* RE: [Lse-tech] Re: [patch] scheduler fix for 1cpu/node case
@ 2003-07-30 16:34 Luck, Tony
  0 siblings, 0 replies; 29+ messages in thread
From: Luck, Tony @ 2003-07-30 16:34 UTC (permalink / raw)
  To: Andrew Theurer, Erich Focht, Martin J. Bligh, linux-kernel, LSE
  Cc: Andi Kleen, torvalds

> As for idle balances, we may be able to go a step further: 
> follow the range rules, but do a more aggressive/frequent search.

Be cautious about how much work you do here ... while you can burn
as much cpu time as you like on an idle cpu without affecting anything,
you need to be sure that your activities don't burn too much bus
bandwidth, or cause cache lines to ping-pong around the machine. The
classic case of this has been seen while one cpu is trying to boot,
and the other 31 idle cpus beat the bus to death looking to see
whether they can "help".

-Tony

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2003-08-23 16:57 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-07-28 19:16 [patch] scheduler fix for 1cpu/node case Erich Focht
2003-07-28 19:55 ` Martin J. Bligh
2003-07-28 20:18   ` Erich Focht
2003-07-28 20:37     ` Martin J. Bligh
2003-07-29  2:24       ` Andrew Theurer
2003-07-29 10:08         ` Erich Focht
2003-07-29 13:33           ` [Lse-tech] " Andrew Theurer
2003-07-30 15:23             ` Erich Focht
2003-07-30 15:44               ` Andrew Theurer
2003-07-29 14:27           ` Martin J. Bligh
2003-08-13 20:49         ` Bill Davidsen
2003-08-22 15:46           ` [Lse-tech] " Andrew Theurer
2003-08-22 22:56             ` Nick Piggin
2003-08-23  0:12               ` Andrew Theurer
2003-08-23  0:29                 ` Nick Piggin
2003-08-23  0:47                   ` William Lee Irwin III
2003-08-23  8:48                     ` Nick Piggin
2003-08-23 14:32                   ` Andrew Theurer
2003-08-23  1:31                 ` Martin J. Bligh
2003-07-29 10:08       ` Erich Focht
2003-07-29 14:41     ` Andi Kleen
2003-07-31 15:05 ` Martin J. Bligh
2003-07-31 21:45   ` Erich Focht
2003-08-01  0:26     ` Martin J. Bligh
2003-08-01 16:30       ` [Lse-tech] " Erich Focht
2003-07-29 14:06 Mala Anand
2003-07-29 14:29 ` Martin J. Bligh
2003-07-29 16:04 Mala Anand
2003-07-30 16:34 Luck, Tony

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).