* [patch] HT scheduler, sched-2.5.68-B2
@ 2003-04-23 16:23 Ingo Molnar
  2003-04-23 17:47 ` Martin J. Bligh
  0 siblings, 1 reply; 10+ messages in thread
From: Ingo Molnar @ 2003-04-23 16:23 UTC (permalink / raw)
  To: linux-kernel


This is the latest release of the HT scheduler, against 2.5.68.
Changes since -A9:

 - change migration sleep back to interruptible. (noticed by Rick 
   Lindsley)

 - turn off the more aggressive idle-steal variant. This could fix the
   low-load regression reported by Martin J. Bligh.

 - fixed an HT-balancing bug causing oopses.

the patch (and future versions) can also be found at:

    http://redhat.com/~mingo/O(1)-scheduler/

	Ingo

--- linux/include/linux/sched.h.orig	
+++ linux/include/linux/sched.h	
@@ -147,6 +147,24 @@ typedef struct task_struct task_t;
 extern void sched_init(void);
 extern void init_idle(task_t *idle, int cpu);
 
+/*
+ * Is there a way to do this via Kconfig?
+ */
+#if CONFIG_NR_SIBLINGS_2
+# define CONFIG_NR_SIBLINGS 2
+#elif CONFIG_NR_SIBLINGS_4
+# define CONFIG_NR_SIBLINGS 4
+#else
+# define CONFIG_NR_SIBLINGS 0
+#endif
+
+#if CONFIG_NR_SIBLINGS
+# define CONFIG_SHARE_RUNQUEUE 1
+#else
+# define CONFIG_SHARE_RUNQUEUE 0
+#endif
+extern void sched_map_runqueue(int cpu1, int cpu2);
+
 extern void show_state(void);
 extern void show_trace(unsigned long *stack);
 extern void show_stack(unsigned long *stack);
@@ -814,11 +832,6 @@ static inline unsigned int task_cpu(stru
 	return p->thread_info->cpu;
 }
 
-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-	p->thread_info->cpu = cpu;
-}
-
 #else
 
 static inline unsigned int task_cpu(struct task_struct *p)
@@ -826,10 +839,6 @@ static inline unsigned int task_cpu(stru
 	return 0;
 }
 
-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-}
-
 #endif /* CONFIG_SMP */
 
 #endif /* __KERNEL__ */
--- linux/include/asm-i386/apic.h.orig	
+++ linux/include/asm-i386/apic.h	
@@ -101,4 +101,6 @@ extern unsigned int nmi_watchdog;
 
 #endif /* CONFIG_X86_LOCAL_APIC */
 
+extern int phys_proc_id[NR_CPUS];
+
 #endif /* __ASM_APIC_H */
--- linux/arch/i386/kernel/cpu/proc.c.orig	
+++ linux/arch/i386/kernel/cpu/proc.c	
@@ -1,4 +1,5 @@
 #include <linux/smp.h>
+#include <linux/sched.h>
 #include <linux/timex.h>
 #include <linux/string.h>
 #include <asm/semaphore.h>
@@ -114,6 +115,13 @@ static int show_cpuinfo(struct seq_file 
 		     fpu_exception ? "yes" : "no",
 		     c->cpuid_level,
 		     c->wp_works_ok ? "yes" : "no");
+#if CONFIG_SHARE_RUNQUEUE
+{
+	extern long __rq_idx[NR_CPUS];
+
+	seq_printf(m, "\nrunqueue\t: %ld\n", __rq_idx[n]);
+}
+#endif
 
 	for ( i = 0 ; i < 32*NCAPINTS ; i++ )
 		if ( test_bit(i, c->x86_capability) &&
--- linux/arch/i386/kernel/smpboot.c.orig	
+++ linux/arch/i386/kernel/smpboot.c	
@@ -38,6 +38,7 @@
 #include <linux/kernel.h>
 
 #include <linux/mm.h>
+#include <linux/sched.h>
 #include <linux/kernel_stat.h>
 #include <linux/smp_lock.h>
 #include <linux/irq.h>
@@ -933,6 +934,16 @@ void *xquad_portio;
 
 int cpu_sibling_map[NR_CPUS] __cacheline_aligned;
 
+static int test_ht;
+
+static int __init ht_setup(char *str)
+{
+	test_ht = 1;
+	return 1;
+}
+
+__setup("test_ht", ht_setup);
+
 static void __init smp_boot_cpus(unsigned int max_cpus)
 {
 	int apicid, cpu, bit;
@@ -1074,7 +1085,20 @@ static void __init smp_boot_cpus(unsigne
 	Dprintk("Boot done.\n");
 
 	/*
-	 * If Hyper-Threading is avaialble, construct cpu_sibling_map[], so
+	 * Here we can be sure that there is an IO-APIC in the system. Let's
+	 * go and set it up:
+	 */
+	smpboot_setup_io_apic();
+
+	setup_boot_APIC_clock();
+
+	/*
+	 * Synchronize the TSC with the AP
+	 */
+	if (cpu_has_tsc && cpucount && cpu_khz)
+		synchronize_tsc_bp();
+	/*
+	 * If Hyper-Threading is available, construct cpu_sibling_map[], so
 	 * that we can tell the sibling CPU efficiently.
 	 */
 	if (cpu_has_ht && smp_num_siblings > 1) {
@@ -1083,7 +1107,8 @@ static void __init smp_boot_cpus(unsigne
 		
 		for (cpu = 0; cpu < NR_CPUS; cpu++) {
 			int 	i;
-			if (!test_bit(cpu, &cpu_callout_map)) continue;
+			if (!test_bit(cpu, &cpu_callout_map))
+				continue;
 
 			for (i = 0; i < NR_CPUS; i++) {
 				if (i == cpu || !test_bit(i, &cpu_callout_map))
@@ -1099,17 +1124,41 @@ static void __init smp_boot_cpus(unsigne
 				printk(KERN_WARNING "WARNING: No sibling found for CPU %d.\n", cpu);
 			}
 		}
+#if CONFIG_SHARE_RUNQUEUE
+		/*
+		 * At this point APs would have synchronised TSC and
+		 * waiting for smp_commenced, with their APIC timer
+		 * disabled. So BP can go ahead do some initialization
+		 * for Hyper-Threading (if needed).
+		 */
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			int i;
+			if (!test_bit(cpu, &cpu_callout_map))
+				continue;
+			for (i = 0; i < NR_CPUS; i++) {
+				if (i <= cpu)
+					continue;
+				if (!test_bit(i, &cpu_callout_map))
+					continue;
+  
+				if (phys_proc_id[cpu] != phys_proc_id[i])
+					continue;
+				/*
+				 * merge runqueues, resulting in one
+				 * runqueue per package:
+				 */
+				sched_map_runqueue(cpu, i);
+				break;
+			}
+		}
+#endif
 	}
-
-	smpboot_setup_io_apic();
-
-	setup_boot_APIC_clock();
-
-	/*
-	 * Synchronize the TSC with the AP
-	 */
-	if (cpu_has_tsc && cpucount && cpu_khz)
-		synchronize_tsc_bp();
+#if CONFIG_SHARE_RUNQUEUE
+	if (smp_num_siblings == 1 && test_ht) {
+		printk("Simulating shared runqueues - mapping CPU#1's runqueue to CPU#0's runqueue.\n");
+		sched_map_runqueue(0, 1);
+	}
+#endif
 }
 
 /* These are wrappers to interface to the new boot process.  Someone
--- linux/arch/i386/Kconfig.orig	
+++ linux/arch/i386/Kconfig	
@@ -413,6 +413,24 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
+choice
+
+	prompt "Hyperthreading Support"
+	depends on SMP
+	default NR_SIBLINGS_0
+
+config NR_SIBLINGS_0
+	bool "off"
+
+config NR_SIBLINGS_2
+	bool "2 siblings"
+
+config NR_SIBLINGS_4
+	bool "4 siblings"
+
+endchoice
+
+
 config PREEMPT
 	bool "Preemptible Kernel"
 	help
--- linux/kernel/sched.c.orig	
+++ linux/kernel/sched.c	
@@ -73,6 +73,7 @@
 #define INTERACTIVE_DELTA	2
 #define MAX_SLEEP_AVG		(10*HZ)
 #define STARVATION_LIMIT	(10*HZ)
+#define AGRESSIVE_IDLE_STEAL	0
 #define NODE_THRESHOLD		125
 
 /*
@@ -147,6 +148,48 @@ struct prio_array {
 };
 
 /*
+ * It's possible for two CPUs to share the same runqueue.
+ * This makes sense if they eg. share caches.
+ *
+ * We take the common 1:1 (SMP, UP) case and optimize it,
+ * the rest goes via remapping: rq_idx(cpu) gives the
+ * runqueue on which a particular cpu is on, cpu_idx(cpu)
+ * gives the rq-specific index of the cpu.
+ *
+ * (Note that the generic scheduler code does not impose any
+ *  restrictions on the mappings - there can be 4 CPUs per
+ *  runqueue or even assymetric mappings.)
+ */
+#if CONFIG_SHARE_RUNQUEUE
+# define MAX_NR_SIBLINGS CONFIG_NR_SIBLINGS
+  long __rq_idx[NR_CPUS] __cacheline_aligned;
+  static long __cpu_idx[NR_CPUS] __cacheline_aligned;
+# define rq_idx(cpu) (__rq_idx[(cpu)])
+# define cpu_idx(cpu) (__cpu_idx[(cpu)])
+# define for_each_sibling(idx, rq) \
+		for ((idx) = 0; (idx) < (rq)->nr_cpus; (idx)++)
+# define rq_nr_cpus(rq) ((rq)->nr_cpus)
+# define cpu_active_balance(c) (cpu_rq(c)->cpu[0].active_balance)
+#else
+# define MAX_NR_SIBLINGS 1
+# define rq_idx(cpu) (cpu)
+# define cpu_idx(cpu) 0
+# define for_each_sibling(idx, rq) while (0)
+# define cpu_active_balance(c) 0
+# define do_active_balance(rq, cpu) do { } while (0)
+# define rq_nr_cpus(rq) 1
+  static inline void active_load_balance(runqueue_t *rq, int this_cpu) { }
+#endif
+
+typedef struct cpu_s {
+	task_t *curr, *idle;
+	task_t *migration_thread;
+	struct list_head migration_queue;
+	int active_balance;
+	int cpu;
+} cpu_t;
+
+/*
  * This is the main, per-CPU runqueue data structure.
  *
  * Locking rule: those places that want to lock multiple runqueues
@@ -157,7 +200,6 @@ struct runqueue {
 	spinlock_t lock;
 	unsigned long nr_running, nr_switches, expired_timestamp,
 			nr_uninterruptible;
-	task_t *curr, *idle;
 	struct mm_struct *prev_mm;
 	prio_array_t *active, *expired, arrays[2];
 	int prev_cpu_load[NR_CPUS];
@@ -165,27 +207,39 @@ struct runqueue {
 	atomic_t *node_nr_running;
 	int prev_node_load[MAX_NUMNODES];
 #endif
-	task_t *migration_thread;
-	struct list_head migration_queue;
+	int nr_cpus;
+	cpu_t cpu[MAX_NR_SIBLINGS];
 
 	atomic_t nr_iowait;
 } ____cacheline_aligned;
 
 static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
 
-#define cpu_rq(cpu)		(runqueues + (cpu))
+#define cpu_rq(cpu)		(runqueues + (rq_idx(cpu)))
+#define cpu_int(c)		((cpu_rq(c))->cpu + cpu_idx(c))
+#define cpu_curr_ptr(cpu)	(cpu_int(cpu)->curr)
+#define cpu_idle_ptr(cpu)	(cpu_int(cpu)->idle)
+
 #define this_rq()		cpu_rq(smp_processor_id())
 #define task_rq(p)		cpu_rq(task_cpu(p))
-#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 #define rt_task(p)		((p)->prio < MAX_RT_PRIO)
 
+#define migration_thread(cpu)	(cpu_int(cpu)->migration_thread)
+#define migration_queue(cpu)	(&cpu_int(cpu)->migration_queue)
+
+#if NR_CPUS > 1
+# define task_allowed(p, cpu)	((p)->cpus_allowed & (1UL << (cpu)))
+#else
+# define task_allowed(p, cpu)	1
+#endif
+
 /*
  * Default context-switch locking:
  */
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(rq, next)	do { } while(0)
 # define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
-# define task_running(rq, p)		((rq)->curr == (p))
+# define task_running(p)		(cpu_curr_ptr(task_cpu(p)) == (p))
 #endif
 
 #ifdef CONFIG_NUMA
@@ -414,8 +468,18 @@ static inline void resched_task(task_t *
 #endif
 }
 
+static inline void resched_cpu(int cpu)
+{
+	resched_task(cpu_curr_ptr(cpu));
+}
+
 #ifdef CONFIG_SMP
 
+static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+	p->thread_info->cpu = cpu;
+}
+
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
@@ -433,7 +497,7 @@ void wait_task_inactive(task_t * p)
 repeat:
 	preempt_disable();
 	rq = task_rq(p);
-	if (unlikely(task_running(rq, p))) {
+	if (unlikely(task_running(p))) {
 		cpu_relax();
 		/*
 		 * enable/disable preemption just to make this
@@ -444,7 +508,7 @@ repeat:
 		goto repeat;
 	}
 	rq = task_rq_lock(p, &flags);
-	if (unlikely(task_running(rq, p))) {
+	if (unlikely(task_running(p))) {
 		task_rq_unlock(rq, &flags);
 		preempt_enable();
 		goto repeat;
@@ -452,6 +516,13 @@ repeat:
 	task_rq_unlock(rq, &flags);
 	preempt_enable();
 }
+
+#else
+
+static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+}
+
 #endif
 
 /*
@@ -466,10 +537,44 @@ repeat:
  */
 void kick_if_running(task_t * p)
 {
-	if ((task_running(task_rq(p), p)) && (task_cpu(p) != smp_processor_id()))
+	if ((task_running(p)) && (task_cpu(p) != smp_processor_id()))
 		resched_task(p);
 }
 
+static void wake_up_cpu(runqueue_t *rq, int cpu, task_t *p)
+{
+	cpu_t *curr_cpu;
+	task_t *curr;
+	int idx;
+
+	if (idle_cpu(cpu))
+		return resched_cpu(cpu);
+
+	for_each_sibling(idx, rq) {
+		curr_cpu = rq->cpu + idx;
+		if (!task_allowed(p, curr_cpu->cpu))
+			continue;
+		if (curr_cpu->idle == curr_cpu->curr)
+			return resched_cpu(curr_cpu->cpu);
+	}
+
+	if (p->prio < cpu_curr_ptr(cpu)->prio)
+		return resched_task(cpu_curr_ptr(cpu));
+	if (p->prio == cpu_curr_ptr(cpu)->prio &&
+			p->time_slice > cpu_curr_ptr(cpu)->time_slice)
+		return resched_task(cpu_curr_ptr(cpu));
+
+	for_each_sibling(idx, rq) {
+		curr_cpu = rq->cpu + idx;
+		if (!task_allowed(p, curr_cpu->cpu))
+			continue;
+		curr = curr_cpu->curr;
+		if (p->prio < curr->prio)
+			return resched_task(curr);
+		if (p->prio == curr->prio && p->time_slice > curr->time_slice)
+			return resched_task(curr);
+	}
+}
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -491,6 +596,7 @@ static int try_to_wake_up(task_t * p, un
 	long old_state;
 	runqueue_t *rq;
 
+	BUG_ON(!p);
 repeat_lock_task:
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
@@ -500,7 +606,7 @@ repeat_lock_task:
 			 * Fast-migrate the task if it's not running or runnable
 			 * currently. Do not violate hard affinity.
 			 */
-			if (unlikely(sync && !task_running(rq, p) &&
+			if (unlikely(sync && !task_running(p) &&
 				(task_cpu(p) != smp_processor_id()) &&
 				(p->cpus_allowed & (1UL << smp_processor_id())))) {
 
@@ -514,8 +620,7 @@ repeat_lock_task:
 				__activate_task(p, rq);
 			else {
 				requeue_waker = activate_task(p, rq);
-				if (p->prio < rq->curr->prio)
-					resched_task(rq->curr);
+				wake_up_cpu(rq, task_cpu(p), p);
 			}
 			success = 1;
 		}
@@ -692,8 +797,9 @@ unsigned long nr_running(void)
 	unsigned long i, sum = 0;
 
 	for (i = 0; i < NR_CPUS; i++)
-		sum += cpu_rq(i)->nr_running;
-
+		/* Shared runqueues are counted only once. */
+		if (!cpu_idx(i))
+			sum += cpu_rq(i)->nr_running;
 	return sum;
 }
 
@@ -704,7 +810,9 @@ unsigned long nr_uninterruptible(void)
 	for (i = 0; i < NR_CPUS; i++) {
 		if (!cpu_online(i))
 			continue;
-		sum += cpu_rq(i)->nr_uninterruptible;
+		/* Shared runqueues are counted only once. */
+		if (!cpu_idx(i))
+			sum += cpu_rq(i)->nr_uninterruptible;
 	}
 	return sum;
 }
@@ -863,7 +971,15 @@ static int find_busiest_node(int this_no
 
 #endif /* CONFIG_NUMA */
 
-#if CONFIG_SMP
+#if !CONFIG_SMP
+
+/*
+ * on UP we do not need to balance between CPUs:
+ */
+static inline void load_balance(runqueue_t *this_rq, int this_cpu, int idle, unsigned long cpumask) { }
+static inline void rebalance_tick(runqueue_t *this_rq, int this_cpu, int idle) { }
+
+#else
 
 /*
  * double_lock_balance - lock the busiest runqueue
@@ -979,17 +1095,7 @@ static inline void pull_task(runqueue_t 
 	set_task_cpu(p, this_cpu);
 	nr_running_inc(this_rq);
 	enqueue_task(p, this_rq->active);
-	/*
-	 * Note that idle threads have a prio of MAX_PRIO, for this test
-	 * to be always true for them.
-	 */
-	if (p->prio < this_rq->curr->prio)
-		set_need_resched();
-	else {
-		if (p->prio == this_rq->curr->prio &&
-				p->time_slice > this_rq->curr->time_slice)
-			set_need_resched();
-	}
+	wake_up_cpu(this_rq, this_cpu, p);
 }
 
 /*
@@ -1000,12 +1106,12 @@ static inline void pull_task(runqueue_t 
  * We call this with the current runqueue locked,
  * irqs disabled.
  */
-static void load_balance(runqueue_t *this_rq, int idle, unsigned long cpumask)
+static void load_balance(runqueue_t *this_rq, int this_cpu, int idle, unsigned long cpumask)
 {
-	int imbalance, idx, this_cpu = smp_processor_id();
+	struct list_head *head, *curr;
 	runqueue_t *busiest;
 	prio_array_t *array;
-	struct list_head *head, *curr;
+	int imbalance, idx;
 	task_t *tmp;
 
 	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
@@ -1039,6 +1145,7 @@ skip_bitmap:
 		goto out_unlock;
 	}
 
+
 	head = array->queue + idx;
 	curr = head->prev;
 skip_queue:
@@ -1049,12 +1156,15 @@ skip_queue:
 	 * 1) running (obviously), or
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) are cache-hot on their current CPU.
+	 *
+	 * (except if we are in idle mode which is a more agressive
+	 *  form of rebalancing.)
 	 */
 
-#define CAN_MIGRATE_TASK(p,rq,this_cpu)					\
-	((jiffies - (p)->last_run > cache_decay_ticks) &&	\
-		!task_running(rq, p) &&					\
-			((p)->cpus_allowed & (1UL << (this_cpu))))
+#define CAN_MIGRATE_TASK(p,rq,cpu)		\
+	(((idle && AGRESSIVE_IDLE_STEAL) ||	\
+		(jiffies - (p)->last_run > cache_decay_ticks)) && \
+			!task_running(p) && task_allowed(p, cpu))
 
 	curr = curr->prev;
 
@@ -1077,6 +1187,133 @@ out:
 	;
 }
 
+#if CONFIG_SHARE_RUNQUEUE
+static void active_load_balance(runqueue_t *this_rq, int this_cpu)
+{
+	runqueue_t *rq;
+	int i, idx;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_online(i))
+			continue;
+		rq = cpu_rq(i);
+		if (rq == this_rq)
+			continue;
+ 		/*
+		 * If there's only one CPU mapped to this runqueue
+		 * then there can be no SMT imbalance:
+		 */
+		if (rq->nr_cpus == 1)
+			continue;
+		/*
+		 * Any SMT-specific imbalance?
+		 */
+		for_each_sibling(idx, rq)
+			if (rq->cpu[idx].idle == rq->cpu[idx].curr)
+				goto next_cpu;
+
+		/*
+		 * At this point it's sure that we have a SMT
+		 * imbalance: this (physical) CPU is idle but
+		 * another CPU has two (or more) tasks running.
+		 *
+		 * We wake up one of the migration threads (it
+		 * doesnt matter which one) and let it fix things up:
+		 */
+		if (!cpu_active_balance(i)) {
+			cpu_active_balance(i) = 1;
+			spin_unlock(&this_rq->lock);
+			if (rq->cpu[0].migration_thread)
+				wake_up_process(rq->cpu[0].migration_thread);
+			spin_lock(&this_rq->lock);
+		}
+next_cpu:
+	}
+}
+
+static void do_active_balance(runqueue_t *this_rq, int this_cpu)
+{
+	runqueue_t *rq;
+	int i, idx;
+
+	spin_unlock(&this_rq->lock);
+
+	cpu_active_balance(this_cpu) = 0;
+
+	/*
+	 * Is the imbalance still present?
+	 */
+	for_each_sibling(idx, this_rq)
+		if (this_rq->cpu[idx].idle == this_rq->cpu[idx].curr)
+			goto out;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_online(i))
+			continue;
+		rq = cpu_rq(i);
+		if (rq == this_rq)
+			continue;
+
+		/* completely idle CPU? */
+		if (rq->nr_running)
+			continue;
+
+		/*
+		 * At this point it's reasonably sure that we have an
+		 * imbalance. Since we are the migration thread, try to
+	 	 * balance a thread over to the target queue.
+		 */
+		spin_lock(&this_rq->lock);
+		this_rq->nr_running--;
+		spin_unlock(&this_rq->lock);
+
+		spin_lock(&rq->lock);
+		load_balance(rq, i, 1, cpu_to_node_mask(i));
+		spin_unlock(&rq->lock);
+
+		spin_lock(&this_rq->lock);
+		this_rq->nr_running++;
+		spin_unlock(&this_rq->lock);
+		goto out;
+	}
+out:
+	spin_lock(&this_rq->lock);
+}
+
+/*
+ * This routine is called to map a CPU into another CPU's runqueue.
+ *
+ * This must be called during bootup with the merged runqueue having
+ * no tasks.
+ */
+void sched_map_runqueue(int cpu1, int cpu2)
+{
+	runqueue_t *rq1 = cpu_rq(cpu1);
+	runqueue_t *rq2 = cpu_rq(cpu2);
+	int cpu2_idx_orig = cpu_idx(cpu2), cpu2_idx;
+
+	BUG_ON(rq1 == rq2 || rq2->nr_running || rq_idx(cpu1) != cpu1);
+	/*
+	 * At this point, we dont have anything in the runqueue yet. So,
+	 * there is no need to move processes between the runqueues.
+	 * Only, the idle processes should be combined and accessed
+	 * properly.
+	 */
+	cpu2_idx = rq1->nr_cpus++;
+
+	rq_idx(cpu2) = cpu1;
+	cpu_idx(cpu2) = cpu2_idx;
+	rq1->cpu[cpu2_idx].cpu = cpu2;
+	rq1->cpu[cpu2_idx].idle = rq2->cpu[cpu2_idx_orig].idle;
+	rq1->cpu[cpu2_idx].curr = rq2->cpu[cpu2_idx_orig].curr;
+	INIT_LIST_HEAD(&rq1->cpu[cpu2_idx].migration_queue);
+
+	/* just to be safe: */
+	rq2->cpu[cpu2_idx_orig].idle = NULL;
+	rq2->cpu[cpu2_idx_orig].curr = NULL;
+}
+#endif
+
 /*
  * One of the idle_cpu_tick() and busy_cpu_tick() functions will
  * get called every timer tick, on every CPU. Our balancing action
@@ -1102,13 +1339,13 @@ static void balance_node(runqueue_t *thi
 	if (node >= 0) {
 		cpumask = node_to_cpumask(node) | this_cpumask;
 		spin_lock(&this_rq->lock);
-		load_balance(this_rq, idle, cpumask);
+		load_balance(this_rq, this_cpu, idle, cpumask);
 		spin_unlock(&this_rq->lock);
 	}
 }
 #endif
 
-static void rebalance_tick(runqueue_t *this_rq, int idle)
+static void rebalance_tick(runqueue_t *this_rq, int this_cpu, int idle)
 {
 #ifdef CONFIG_NUMA
 	int this_cpu = smp_processor_id();
@@ -1130,7 +1367,7 @@ static void rebalance_tick(runqueue_t *t
 #endif
 		if (!(j % IDLE_REBALANCE_TICK)) {
 			spin_lock(&this_rq->lock);
-			load_balance(this_rq, 0, cpu_to_node_mask(this_cpu));
+			load_balance(this_rq, this_cpu, 0, cpu_to_node_mask(this_cpu));
 			spin_unlock(&this_rq->lock);
 		}
 		return;
@@ -1141,19 +1378,13 @@ static void rebalance_tick(runqueue_t *t
 #endif
 	if (!(j % BUSY_REBALANCE_TICK)) {
 		spin_lock(&this_rq->lock);
-		load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
+		load_balance(this_rq, this_cpu, idle, cpu_to_node_mask(this_cpu));
 		spin_unlock(&this_rq->lock);
 	}
 }
-#else
-/*
- * on UP we do not need to balance between CPUs:
- */
-static inline void rebalance_tick(runqueue_t *this_rq, int idle)
-{
-}
 #endif
 
+
 DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } };
 
 /*
@@ -1181,12 +1412,13 @@ void scheduler_tick(int user_ticks, int 
 {
 	int cpu = smp_processor_id();
 	runqueue_t *rq = this_rq();
+	unsigned long j = jiffies;
 	task_t *p = current;
 
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_ticks);
 
-	if (p == rq->idle) {
+	if (p == cpu_idle_ptr(cpu)) {
 		/* note: this timer irq context must be accounted for as well */
 		if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET)
 			kstat_cpu(cpu).cpustat.system += sys_ticks;
@@ -1194,7 +1426,7 @@ void scheduler_tick(int user_ticks, int 
 			kstat_cpu(cpu).cpustat.iowait += sys_ticks;
 		else
 			kstat_cpu(cpu).cpustat.idle += sys_ticks;
-		rebalance_tick(rq, 1);
+		rebalance_tick(rq, cpu, 1);
 		return;
 	}
 	if (TASK_NICE(p) > 0)
@@ -1209,6 +1441,7 @@ void scheduler_tick(int user_ticks, int 
 		return;
 	}
 	spin_lock(&rq->lock);
+  	/* Task might have expired already, but not scheduled off yet */
 	/*
 	 * The task was running during this tick - update the
 	 * time slice counter and the sleep average. Note: we
@@ -1244,14 +1477,14 @@ void scheduler_tick(int user_ticks, int 
 
 		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
 			if (!rq->expired_timestamp)
-				rq->expired_timestamp = jiffies;
+				rq->expired_timestamp = j;
 			enqueue_task(p, rq->expired);
 		} else
 			enqueue_task(p, rq->active);
 	}
 out:
 	spin_unlock(&rq->lock);
-	rebalance_tick(rq, 0);
+	rebalance_tick(rq, cpu, 0);
 }
 
 void scheduling_functions_start_here(void) { }
@@ -1261,11 +1494,11 @@ void scheduling_functions_start_here(voi
  */
 asmlinkage void schedule(void)
 {
+	int idx, this_cpu, retry = 0;
+	struct list_head *queue;
 	task_t *prev, *next;
-	runqueue_t *rq;
 	prio_array_t *array;
-	struct list_head *queue;
-	int idx;
+	runqueue_t *rq;
 
 	/*
 	 * Test if we are atomic.  Since do_exit() needs to call into
@@ -1283,6 +1516,7 @@ asmlinkage void schedule(void)
 need_resched:
 	preempt_disable();
 	prev = current;
+	this_cpu = smp_processor_id();
 	rq = this_rq();
 
 	release_kernel_lock(prev);
@@ -1307,14 +1541,17 @@ need_resched:
 	case TASK_RUNNING:
 		;
 	}
+
 pick_next_task:
 	if (unlikely(!rq->nr_running)) {
-#if CONFIG_SMP
-		load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
+		load_balance(rq, this_cpu, 1, cpu_to_node_mask(this_cpu));
 		if (rq->nr_running)
 			goto pick_next_task;
-#endif
-		next = rq->idle;
+		active_load_balance(rq, this_cpu);
+		if (rq->nr_running)
+			goto pick_next_task;
+pick_idle:
+		next = cpu_idle_ptr(this_cpu);
 		rq->expired_timestamp = 0;
 		goto switch_tasks;
 	}
@@ -1330,18 +1567,49 @@ pick_next_task:
 		rq->expired_timestamp = 0;
 	}
 
+new_array:
 	idx = sched_find_first_bit(array->bitmap);
-	queue = array->queue + idx;
-	next = list_entry(queue->next, task_t, run_list);
+  	queue = array->queue + idx;
+  	next = list_entry(queue->next, task_t, run_list);
+	if ((next != prev) && (rq_nr_cpus(rq) > 1)) {
+		struct list_head *tmp = queue->next;
+
+		while ((task_running(next) && (next != prev)) || !task_allowed(next, this_cpu)) {
+			tmp = tmp->next;
+			if (tmp != queue) {
+				next = list_entry(tmp, task_t, run_list);
+				continue;
+			}
+			idx = find_next_bit(array->bitmap, MAX_PRIO, ++idx);
+			if (idx == MAX_PRIO) {
+				if (retry || !rq->expired->nr_active) {
+					goto pick_idle;
+				}
+				/*
+				 * To avoid infinite changing of arrays,
+				 * when we have only tasks runnable by
+				 * sibling.
+				 */
+				retry = 1;
+
+				array = rq->expired;
+				goto new_array;
+			}
+			queue = array->queue + idx;
+			tmp = queue->next;
+			next = list_entry(tmp, task_t, run_list);
+		}
+	}
 
 switch_tasks:
 	prefetch(next);
 	clear_tsk_need_resched(prev);
-	RCU_qsctr(prev->thread_info->cpu)++;
+	RCU_qsctr(task_cpu(prev))++;
 
 	if (likely(prev != next)) {
 		rq->nr_switches++;
-		rq->curr = next;
+		cpu_curr_ptr(this_cpu) = next;
+		set_task_cpu(next, this_cpu);
 
 		prepare_arch_switch(rq, next);
 		prev = context_switch(rq, prev, next);
@@ -1604,9 +1872,8 @@ void set_user_nice(task_t *p, long nice)
 		 * If the task is running and lowered its priority,
 		 * or increased its priority then reschedule its CPU:
 		 */
-		if ((NICE_TO_PRIO(nice) < p->static_prio) ||
-							task_running(rq, p))
-			resched_task(rq->curr);
+		if ((NICE_TO_PRIO(nice) < p->static_prio) || task_running(p))
+			resched_task(cpu_curr_ptr(task_cpu(p)));
 	}
 out_unlock:
 	task_rq_unlock(rq, &flags);
@@ -1684,7 +1951,7 @@ int task_nice(task_t *p)
  */
 int task_curr(task_t *p)
 {
-	return cpu_curr(task_cpu(p)) == p;
+	return cpu_curr_ptr(task_cpu(p)) == p;
 }
 
 /**
@@ -1693,7 +1960,7 @@ int task_curr(task_t *p)
  */
 int idle_cpu(int cpu)
 {
-	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+	return cpu_curr_ptr(cpu) == cpu_idle_ptr(cpu);
 }
 
 /**
@@ -2243,7 +2510,7 @@ void __init init_idle(task_t *idle, int 
 	local_irq_save(flags);
 	double_rq_lock(idle_rq, rq);
 
-	idle_rq->curr = idle_rq->idle = idle;
+	cpu_curr_ptr(cpu) = cpu_idle_ptr(cpu) = idle;
 	deactivate_task(idle, rq);
 	idle->array = NULL;
 	idle->prio = MAX_PRIO;
@@ -2298,6 +2565,7 @@ void set_cpus_allowed(task_t *p, unsigne
 	unsigned long flags;
 	migration_req_t req;
 	runqueue_t *rq;
+	int cpu;
 
 #if 0 /* FIXME: Grab cpu_lock, return error on this case. --RR */
 	new_mask &= cpu_online_map;
@@ -2319,32 +2587,32 @@ void set_cpus_allowed(task_t *p, unsigne
 	 * If the task is not on a runqueue (and not running), then
 	 * it is sufficient to simply update the task's cpu field.
 	 */
-	if (!p->array && !task_running(rq, p)) {
+	if (!p->array && !task_running(p)) {
 		set_task_cpu(p, __ffs(p->cpus_allowed));
 		task_rq_unlock(rq, &flags);
 		return;
 	}
 	init_completion(&req.done);
 	req.task = p;
-	list_add(&req.list, &rq->migration_queue);
+	cpu = task_cpu(p);
+	list_add(&req.list, migration_queue(cpu));
 	task_rq_unlock(rq, &flags);
-
-	wake_up_process(rq->migration_thread);
+	wake_up_process(migration_thread(cpu));
 
 	wait_for_completion(&req.done);
 }
 
 /*
- * migration_thread - this is a highprio system thread that performs
+ * migration_task - this is a highprio system thread that performs
  * thread migration by 'pulling' threads into the target runqueue.
  */
-static int migration_thread(void * data)
+static int migration_task(void * data)
 {
 	/* Marking "param" __user is ok, since we do a set_fs(KERNEL_DS); */
 	struct sched_param __user param = { .sched_priority = MAX_RT_PRIO-1 };
 	int cpu = (long) data;
 	runqueue_t *rq;
-	int ret;
+	int ret, idx;
 
 	daemonize("migration/%d", cpu);
 	set_fs(KERNEL_DS);
@@ -2358,7 +2626,8 @@ static int migration_thread(void * data)
 	ret = setscheduler(0, SCHED_FIFO, &param);
 
 	rq = this_rq();
-	rq->migration_thread = current;
+	migration_thread(cpu) = current;
+	idx = cpu_idx(cpu);
 
 	for (;;) {
 		runqueue_t *rq_src, *rq_dest;
@@ -2369,7 +2638,9 @@ static int migration_thread(void * data)
 		task_t *p;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		head = &rq->migration_queue;
+		if (cpu_active_balance(cpu))
+			do_active_balance(rq, cpu);
+		head = migration_queue(cpu);
 		current->state = TASK_INTERRUPTIBLE;
 		if (list_empty(head)) {
 			spin_unlock_irqrestore(&rq->lock, flags);
@@ -2399,8 +2670,7 @@ repeat:
 			if (p->array) {
 				deactivate_task(p, rq_src);
 				__activate_task(p, rq_dest);
-				if (p->prio < rq_dest->curr->prio)
-					resched_task(rq_dest->curr);
+				wake_up_cpu(rq_dest, cpu_dest, p);
 			}
 		}
 		double_rq_unlock(rq_src, rq_dest);
@@ -2418,12 +2688,13 @@ static int migration_call(struct notifie
 			  unsigned long action,
 			  void *hcpu)
 {
+	long cpu = (long) hcpu;
+
 	switch (action) {
 	case CPU_ONLINE:
-		printk("Starting migration thread for cpu %li\n",
-		       (long)hcpu);
-		kernel_thread(migration_thread, hcpu, CLONE_KERNEL);
-		while (!cpu_rq((long)hcpu)->migration_thread)
+		printk("Starting migration thread for cpu %li\n", cpu);
+		kernel_thread(migration_task, hcpu, CLONE_KERNEL);
+		while (!migration_thread(cpu))
 			yield();
 		break;
 	}
@@ -2488,6 +2759,7 @@ __init static void init_kstat(void) {
 	register_cpu_notifier(&kstat_nb);
 }
 
+
 void __init sched_init(void)
 {
 	runqueue_t *rq;
@@ -2498,11 +2770,20 @@ void __init sched_init(void)
 	for (i = 0; i < NR_CPUS; i++) {
 		prio_array_t *array;
 
+		/*
+		 * Start with a 1:1 mapping between CPUs and runqueues:
+		 */
+#if CONFIG_SHARE_RUNQUEUE
+		rq_idx(i) = i;
+		cpu_idx(i) = 0;
+#endif
 		rq = cpu_rq(i);
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
 		spin_lock_init(&rq->lock);
-		INIT_LIST_HEAD(&rq->migration_queue);
+		INIT_LIST_HEAD(migration_queue(i));
+		rq->nr_cpus = 1;
+		rq->cpu[cpu_idx(i)].cpu = i;
 		atomic_set(&rq->nr_iowait, 0);
 		nr_running_init(rq);
 
@@ -2520,9 +2801,9 @@ void __init sched_init(void)
 	 * We have to do a little magic to get the first
 	 * thread right in SMP mode.
 	 */
-	rq = this_rq();
-	rq->curr = current;
-	rq->idle = current;
+	cpu_curr_ptr(smp_processor_id()) = current;
+	cpu_idle_ptr(smp_processor_id()) = current;
+
 	set_task_cpu(current, smp_processor_id());
 	wake_up_forked_process(current);
 
--- linux/kernel/profile.c.orig	
+++ linux/kernel/profile.c	
@@ -9,7 +9,7 @@
 #include <linux/notifier.h>
 #include <linux/mm.h>
 
-extern char _stext, _etext;
+extern char _stext, _etext, _end;
 
 unsigned int * prof_buffer;
 unsigned long prof_len;
@@ -36,7 +36,7 @@ void __init profile_init(void)
 		return;
  
 	/* only text is profiled */
-	prof_len = (unsigned long) &_etext - (unsigned long) &_stext;
+	prof_len = (unsigned long) &_end - (unsigned long) &_stext;
 	prof_len >>= prof_shift;
 		
 	size = prof_len * sizeof(unsigned int) + PAGE_SIZE - 1;
--- linux/init/main.c.orig	
+++ linux/init/main.c	
@@ -367,7 +367,14 @@ static void __init smp_init(void)
 
 static void rest_init(void)
 {
+	/* 
+	 *	We count on the initial thread going ok 
+	 *	Like idlers init is an unlocked kernel thread, which will
+	 *	make syscalls (and thus be locked).
+	 */
+	init_idle(current, smp_processor_id());
 	kernel_thread(init, NULL, CLONE_KERNEL);
+
 	unlock_kernel();
  	cpu_idle();
 } 
@@ -453,13 +460,6 @@ asmlinkage void __init start_kernel(void
 	check_bugs();
 	printk("POSIX conformance testing by UNIFIX\n");
 
-	/* 
-	 *	We count on the initial thread going ok 
-	 *	Like idlers init is an unlocked kernel thread, which will
-	 *	make syscalls (and thus be locked).
-	 */
-	init_idle(current, smp_processor_id());
-
 	/* Do the rest non-__init'ed, we're now alive */
 	rest_init();
 }



* Re: [patch] HT scheduler, sched-2.5.68-B2
  2003-04-23 16:23 [patch] HT scheduler, sched-2.5.68-B2 Ingo Molnar
@ 2003-04-23 17:47 ` Martin J. Bligh
  2003-04-23 17:53   ` Andrew Theurer
  0 siblings, 1 reply; 10+ messages in thread
From: Martin J. Bligh @ 2003-04-23 17:47 UTC (permalink / raw)
  To: Ingo Molnar, linux-kernel

>  - turn off the more aggressive idle-steal variant. This could fix the
>    low-load regression reported by Martin J. Bligh.

Yup, that fixed it (I tested just your first version with just that
bit altered).

Thanks! 

M.


 DISCLAIMER: SPEC(tm) and the benchmark name SDET(tm) are registered
trademarks of the Standard Performance Evaluation Corporation. This 
benchmarking was performed for research purposes only, and the run results
are non-compliant and not-comparable with any published results.

Results are shown as percentages of the first set displayed

SDET 1  (see disclaimer)
                           Throughput    Std. Dev
                   2.5.68       100.0%         0.7%
                2.5.68-ht        72.9%         1.8%
           2.5.68-ht-pasv        98.4%         3.4%

SDET 2  (see disclaimer)
                           Throughput    Std. Dev
                   2.5.68       100.0%         2.8%
                2.5.68-ht        73.7%         2.1%
           2.5.68-ht-pasv       100.4%         1.0%

SDET 4  (see disclaimer)
                           Throughput    Std. Dev
                   2.5.68       100.0%         1.0%
                2.5.68-ht        62.5%        47.0%
           2.5.68-ht-pasv        99.7%         2.8%

SDET 8  (see disclaimer)
                           Throughput    Std. Dev
                   2.5.68       100.0%         0.6%
                2.5.68-ht        92.6%         1.0%
           2.5.68-ht-pasv       101.2%         1.7%

SDET 16  (see disclaimer)
                           Throughput    Std. Dev
                   2.5.68       100.0%         0.1%
                2.5.68-ht       100.0%         0.5%
           2.5.68-ht-pasv       101.1%         0.4%

SDET 32  (see disclaimer)
                           Throughput    Std. Dev
                   2.5.68       100.0%         0.4%
                2.5.68-ht        99.1%         0.6%
           2.5.68-ht-pasv        99.6%         0.6%

SDET 64  (see disclaimer)
                           Throughput    Std. Dev
                   2.5.68       100.0%         0.2%
                2.5.68-ht        99.0%         0.1%
           2.5.68-ht-pasv       100.3%         0.4%

SDET 128  (see disclaimer)
                           Throughput    Std. Dev
                   2.5.68       100.0%         0.1%
                2.5.68-ht        99.0%         0.1%
           2.5.68-ht-pasv        99.2%         1.3%




* Re: [patch] HT scheduler, sched-2.5.68-B2
  2003-04-23 17:47 ` Martin J. Bligh
@ 2003-04-23 17:53   ` Andrew Theurer
  2003-04-23 18:43     ` Martin J. Bligh
  0 siblings, 1 reply; 10+ messages in thread
From: Andrew Theurer @ 2003-04-23 17:53 UTC (permalink / raw)
  To: Martin J. Bligh, Ingo Molnar, linux-kernel

On Wednesday 23 April 2003 12:47, Martin J. Bligh wrote:
> >  - turn off the more aggressive idle-steal variant. This could fix the
> >    low-load regression reported by Martin J. Bligh.
>
> Yup, that fixed it (I tested just your first version with just that
> bit altered).

Can we make this an arch specific option?  I have a feeling the HT performance 
on low loads will actually drop with this disabled.  

-Andrew


* Re: [patch] HT scheduler, sched-2.5.68-B2
  2003-04-23 17:53   ` Andrew Theurer
@ 2003-04-23 18:43     ` Martin J. Bligh
  2003-04-23 20:14       ` Andrew Theurer
  2003-04-23 22:38       ` Martin J. Bligh
  0 siblings, 2 replies; 10+ messages in thread
From: Martin J. Bligh @ 2003-04-23 18:43 UTC (permalink / raw)
  To: Andrew Theurer, Ingo Molnar, linux-kernel

>> >  - turn off the more aggressive idle-steal variant. This could fix the
>> >    low-load regression reported by Martin J. Bligh.
>> 
>> Yup, that fixed it (I tested just your first version with just that
>> bit altered).
> 
> Can we make this an arch specific option?  I have a feeling the HT performance 
> on low loads will actually drop with this disabled.  

Is it really an arch thing? Or is it a load level thing? I get the feeling
it might be switchable dynamically, dependent on load ...

M.



* Re: [patch] HT scheduler, sched-2.5.68-B2
  2003-04-23 18:43     ` Martin J. Bligh
@ 2003-04-23 20:14       ` Andrew Theurer
  2003-04-24  8:43         ` Rick Lindsley
  2003-04-23 22:38       ` Martin J. Bligh
  1 sibling, 1 reply; 10+ messages in thread
From: Andrew Theurer @ 2003-04-23 20:14 UTC (permalink / raw)
  To: Martin J. Bligh, Ingo Molnar, linux-kernel

On Wednesday 23 April 2003 13:43, Martin J. Bligh wrote:
> >> >  - turn off the more aggressive idle-steal variant. This could fix the
> >> >    low-load regression reported by Martin J. Bligh.
> >>
> >> Yup, that fixed it (I tested just your first version with just that
> >> bit altered).
> >
> > Can we make this an arch specific option?  I have a feeling the HT
> > performance on low loads will actually drop with this disabled.
>
> Is it really an arch thing? Or is it a load level thing? I get the feeling
> it might be switchable dynamically, dependent on load ...

Well, on high load you shouldn't have an idle cpu anyway, so you would never
pass the requirements for the aggressive -idle- steal even if it was turned
on.  On low loads on HT, without this aggressive balancing of cpu-bound tasks,
you will always load up one core before using any of the others.  When you
fork/exec, the child will start on the same runqueue as the parent, the idle
sibling will start running it, and it will never get a chance to balance
properly while it's in a run state.  This is the same behavior I saw with the
NUMA-HT solution, because I didn't have this aggressive balancing (although it
could be added, I suppose), and as a result it consistently performed worse
than Ingo's solution (but still better than no patch at all).
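
A minimal user-space sketch of the test in question (modelled on the
CAN_MIGRATE_TASK() check in the patch above; the struct, the helper and
the numbers below are made up for illustration, not kernel code):

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative model of the idle-steal decision: with the aggressive
 * variant off, an idle CPU only pulls a task once it has been off-CPU
 * longer than cache_decay_ticks, so a freshly forked, cpu-bound child
 * tends to stay on the runqueue it was forked on.
 */
struct task {
    unsigned long last_run;      /* "jiffies" when it last ran */
    bool running;                /* currently on a CPU */
    unsigned long cpus_allowed;  /* affinity bitmask */
};

static bool can_migrate(const struct task *p, unsigned long now,
                        unsigned long cache_decay_ticks, int this_cpu,
                        bool idle, bool aggressive_idle_steal)
{
    bool cache_cold = (now - p->last_run) > cache_decay_ticks;

    return ((idle && aggressive_idle_steal) || cache_cold) &&
           !p->running && (p->cpus_allowed & (1UL << this_cpu));
}

int main(void)
{
    struct task child = { .last_run = 1000, .running = false,
                          .cpus_allowed = ~0UL };

    /* just forked, cache still warm: only the aggressive variant steals */
    printf("passive steal:    %d\n", can_migrate(&child, 1001, 10, 2, true, false));
    printf("aggressive steal: %d\n", can_migrate(&child, 1001, 10, 2, true, true));
    return 0;
}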

-Andrew Theurer


* Re: [patch] HT scheduler, sched-2.5.68-B2
  2003-04-23 18:43     ` Martin J. Bligh
  2003-04-23 20:14       ` Andrew Theurer
@ 2003-04-23 22:38       ` Martin J. Bligh
  2003-04-24 21:29         ` Bill Davidsen
  1 sibling, 1 reply; 10+ messages in thread
From: Martin J. Bligh @ 2003-04-23 22:38 UTC (permalink / raw)
  To: Andrew Theurer, Ingo Molnar, linux-kernel; +Cc: ricklind

>>> >  - turn off the more agressive idle-steal variant. This could fix the
>>> >    low-load regression reported by Martin J. Bligh.
>>> 
>>> Yup, that fixed it (I tested just your first version with just that
>>> bit altered).
>> 
>> Can we make this an arch specific option?  I have a feeling the HT performance 
>> on low loads will actually drop with this disabled.  

Actually, what must be happening here is that we're aggressively stealing
things on non-HT machines ... we should be able to easily prevent that.

Suppose we have 2 real cpus + HT ... A,B,C,D are the cpus, where A, B 
are HT twins, and C,D are HT twins. A&B share runqueue X, and C&D share
runqueue Y.

What I presume you're trying to do is this: when A and B are running 1 task
each, and C and D are not running anything, balance out so we have
one on A and one on C. If we define some "nr_active(rq)" concept to be
the number of tasks actively running on cpus, then we'd be
switching from nr_actives of 2/0 to 1/1.

However, we don't want to switch from 2/1 to 1/2 ... that's pointless.
Or 0/1 to 1/0 (which I think is what's happening). But in the case
where we had (theoretically) 4 HT siblings per real cpu, we would want
to migrate 1/3 to 2/2.

The key is that we want to aggressively steal when
nr_active(remote) - nr_active(idle) > 1 ... not > 0.
This will implicitly *never* happen on non-HT machines, so it seems
like a nice algorithm ... ?
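
Expressed as a minimal sketch (plain C, purely illustrative; the
rq_state struct and the nr_active bookkeeping are assumptions here,
not something the patch provides), the proposed condition would be
roughly:

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative model of the heuristic above: only actively steal when
 * the remote runqueue has at least two more running siblings than the
 * idle one.
 */
struct rq_state {
    int nr_siblings;  /* CPUs sharing this runqueue */
    int nr_active;    /* siblings currently running a real task */
};

static bool should_active_steal(const struct rq_state *idle_rq,
                                const struct rq_state *remote_rq)
{
    /*
     * 2/0 -> 1/1 is worth doing; 2/1 -> 1/2 and 0/1 -> 1/0 are not.
     * With one sibling per runqueue (non-HT), nr_active never exceeds
     * 1, so the difference can never be greater than 1 and the test
     * never fires there.
     */
    return remote_rq->nr_active - idle_rq->nr_active > 1;
}

int main(void)
{
    struct rq_state idle_pkg = { 2, 0 };   /* both siblings idle */
    struct rq_state busy_pkg = { 2, 2 };   /* both siblings busy */
    struct rq_state half_pkg = { 2, 1 };   /* one sibling busy */

    printf("2/0 case: steal=%d\n", should_active_steal(&idle_pkg, &busy_pkg));
    printf("1/0 case: steal=%d\n", should_active_steal(&idle_pkg, &half_pkg));
    return 0;
}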

M.



* Re: [patch] HT scheduler, sched-2.5.68-B2
  2003-04-23 20:14       ` Andrew Theurer
@ 2003-04-24  8:43         ` Rick Lindsley
  2003-04-24 14:23           ` Andrew Theurer
  0 siblings, 1 reply; 10+ messages in thread
From: Rick Lindsley @ 2003-04-24  8:43 UTC (permalink / raw)
  To: Andrew Theurer; +Cc: Martin J. Bligh, Ingo Molnar, linux-kernel

    Well, on high load you shouldn't have an idle cpu anyway, so you would never
    pass the requirements for the aggressive -idle- steal even if it was turned
    on.  On low loads on HT, without this aggressive balancing of cpu-bound tasks,
    you will always load up one core before using any of the others.

For a brief period of time.   But with active_load_balance() being called
on the idle processors, they will steal from a core/sibling pair to give
themselves something to do in, I should think, relatively short order.

Myself, I'm getting odd results on kernbench (kernel compiles).  On a
4-proc + 4-sibling HT machine, I'm seeing:

            2.5.68    HW HT on, regular scheduler
        2.5.68-ht2    HW HT on, A9 hyperthreading scheduler siblings=2

The HT scheduler seems to give us wins in every category but elapsed time.
This was *with* the aggressive steal, so I have a bit more testing to do
without it, and more information to collect to identify why elapsed time is
not dropping too.

Rick

make -j2
                       User     System    Elapsed   %CPU
            2.5.68    471.19     34.14     263.25   191%
        2.5.68-ht2    335.54     24.46     257.86   139%

make -j4
                       User     System    Elapsed   %CPU
            2.5.68    581.93     40.37     164.36   378%
        2.5.68-ht2    421.77     28.64     165.06   272%

make -j8
                       User     System    Elapsed   %CPU
            2.5.68    946.24     60.05     138.13   728%
        2.5.68-ht2    685.05     43.38     138.59   525%

make -j16
                       User     System    Elapsed   %CPU
            2.5.68    954.35     61.12     139.17   729%
        2.5.68-ht2    690.11     43.91     138.52   529%



* Re: [patch] HT scheduler, sched-2.5.68-B2
  2003-04-24  8:43         ` Rick Lindsley
@ 2003-04-24 14:23           ` Andrew Theurer
  0 siblings, 0 replies; 10+ messages in thread
From: Andrew Theurer @ 2003-04-24 14:23 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Martin J. Bligh, linux-kernel, Rick Lindsley

BTW, I am getting an oops on boot with the A9 version in wake_up_cpu:

Unable to handle kernel NULL pointer dereference at virtual address 00000004
c011910f
*pde = 00104001
Oops: 0000 [#1]
CPU:    0
EIP:    0060:[<c011910f>]    Not tainted
Using defaults from ksymoops -t elf32-i386 -a i386
EFLAGS: 00010046
eax: 00000000   ebx: f23f4d40   ecx: 00000000   edx: 424614f0
esi: c03f8f80   edi: c03f9938   ebp: f2d87f2c   esp: f2d87f0c
ds: 007b   es: 007b   ss: 0068
Stack: 00000000 00000000 00000000 00000001 00000000 f23f4d40 c03f8f80 c03f9938
       f2d87f78 c01195e1 c03f8f80 c03f9938 f23f5330 f2d87f58 f2d8d628 00000000
       c03f8f80 00000002 00000000 00000086 00000020 f23f5330 f2d8d330 c0396fc0
Call Trace: [<c01195e1>]  [<c011af50>]  [<c011b025>]  [<c011c5aa>]  
[<c010a9ba>]
  [<c011c300>]  [<c0108b25>]
code: 8b 41 04 ba 03 00 00 00 8b 58 08 c1 eb 10 83 e3 01 f0 0f ab

>>EIP; c011910f <wake_up_cpu+3f/310>   <=====
Trace; c01195e1 <try_to_wake_up+201/310>
Trace; c011af50 <__wake_up_common+40/60>
Trace; c011b025 <complete+25/40>
Trace; c011c5aa <migration_task+2aa/2b0>
Trace; c010a9ba <work_resched+5/16>
Trace; c011c300 <migration_task+0/2b0>
Trace; c0108b25 <kernel_thread_helper+5/10>
Code;  c011910f <wake_up_cpu+3f/310>
00000000 <_EIP>:
Code;  c011910f <wake_up_cpu+3f/310>   <=====
   0:   8b 41 04                  mov    0x4(%ecx),%eax   <=====
Code;  c0119112 <wake_up_cpu+42/310>
   3:   ba 03 00 00 00            mov    $0x3,%edx
Code;  c0119117 <wake_up_cpu+47/310>
   8:   8b 58 08                  mov    0x8(%eax),%ebx
Code;  c011911a <wake_up_cpu+4a/310>
   b:   c1 eb 10                  shr    $0x10,%ebx
Code;  c011911d <wake_up_cpu+4d/310>
   e:   83 e3 01                  and    $0x1,%ebx
Code;  c0119120 <wake_up_cpu+50/310>
  11:   f0 0f ab 00               lock bts %eax,(%eax)


One thing to note: I am using a patch to skip cpus on boot by specifying their
apicids.  I have used this patch without any problems for quite a while, but
for some reason with the HT patch I get the oops.  I am using this patch so I 
can boot only the processors in the first node on an x440:

boot param used: skip_apicids=32,33,34,35,48,49,50,51
(all of the apicids in the 2nd node)

patch:
diff -Naur linux-2.5.65/arch/i386/kernel/smpboot.c 
linux-2.5.65-skip-apic/arch/i386/kernel/smpboot.c
--- linux-2.5.65/arch/i386/kernel/smpboot.c	Mon Mar 17 13:44:05 2003
+++ linux-2.5.65-skip-apic/arch/i386/kernel/smpboot.c	Fri Mar 21 09:04:03 2003
@@ -82,6 +82,10 @@
 extern unsigned char trampoline_end  [];
 static unsigned char *trampoline_base;
 
+/* used to selectively not boot certain CPUs with apicids */
+extern int skip_apic_ids[NR_CPUS];
+
+
 /*
  * Currently trivial. Write the real->protected mode
  * bootstrap into the page concerned. The caller
@@ -939,6 +943,16 @@
 
 int cpu_sibling_map[NR_CPUS] __cacheline_aligned;
 
+static int skip_this_apicid(int apicid)
+{
+	int i;
+
+	for (i = 1; i < (skip_apic_ids[0] + 1); i++) 
+		if (apicid == skip_apic_ids[i]) 
+			return 1;
+	return 0;
+}
+
 static void __init smp_boot_cpus(unsigned int max_cpus)
 {
 	int apicid, cpu, bit;
@@ -1036,6 +1050,8 @@
 			continue;
 		if (max_cpus <= cpucount+1)
 			continue;
+		if (skip_this_apicid(apicid))
+			continue;
 
 		if (do_boot_cpu(apicid))
 			printk("CPU #%d not responding - cannot use it.\n",
diff -Naur linux-2.5.65/init/main.c linux-2.5.65-skip-apic/init/main.c
--- linux-2.5.65/init/main.c	Mon Mar 17 13:43:44 2003
+++ linux-2.5.65-skip-apic/init/main.c	Fri Mar 21 09:04:18 2003
@@ -113,6 +113,8 @@
 
 /* Setup configured maximum number of CPUs to activate */
 static unsigned int max_cpus = NR_CPUS;
+/* Don't boot apicids in this list */
+int skip_apic_ids[NR_CPUS];
 
 /*
  * Setup routine for controlling SMP activation
@@ -140,6 +142,18 @@
 
 __setup("maxcpus=", maxcpus);
 
+static int __init skip_apicids(char *str)
+{
+	int i;
+
+	get_options(str, NR_CPUS, skip_apic_ids);
+	for (i = 1; i < (skip_apic_ids[0]+1); i++)
+		printk( "CPU with apicid %i will not be booted\n", skip_apic_ids[i]);
+	return 1;
+}
+
+__setup("skip_apicids=", skip_apicids);
+
 static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
 char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
 



* Re: [patch] HT scheduler, sched-2.5.68-B2
  2003-04-23 22:38       ` Martin J. Bligh
@ 2003-04-24 21:29         ` Bill Davidsen
  2003-04-24 22:39           ` Martin J. Bligh
  0 siblings, 1 reply; 10+ messages in thread
From: Bill Davidsen @ 2003-04-24 21:29 UTC (permalink / raw)
  To: Andrew Theurer, Martin J. Bligh; +Cc: Ingo Molnar, linux-kernel, ricklind

On Wed, 23 Apr 2003, Andrew Theurer wrote:

> Well, on high load you shouldn't have an idle cpu anyway, so you would never
> pass the requirements for the aggressive -idle- steal even if it was turned
> on.  On low loads on HT, without this aggressive balancing of cpu-bound tasks,
> you will always load up one core before using any of the others.  When you
> fork/exec, the child will start on the same runqueue as the parent, the idle
> sibling will start running it, and it will never get a chance to balance
> properly while it's in a run state.  This is the same behavior I saw with the
> NUMA-HT solution, because I didn't have this aggressive balancing (although it
> could be added, I suppose), and as a result it consistently performed worse
> than Ingo's solution (but still better than no patch at all).

Sorry if I misunderstand, but if HT is present, I would think that you
would want to start the child of a fork on the same runqueue, because the
cache is loaded, and to run the child first because in many cases the
child will do an exec. At that point it is probable that the exec'd
process, run on another CPU, would leave the cache useful to the parent.

I fully admit that this is "seems to me" rather than measured, but
protecting the cache is certainly a good thing in general.


On Wed, 23 Apr 2003, Martin J. Bligh wrote:

> Actually, what must be happening here is that we're aggressively stealing
> things on non-HT machines ... we should be able to easily prevent that.
> 
> Suppose we have 2 real cpus + HT ... A,B,C,D are the cpus, where A, B 
> are HT twins, and C,D are HT twins. A&B share runqueue X, and C&D share
> runqueue Y.
> 
> What I presume you're trying to do is this: when A and B are running 1 task
> each, and C and D are not running anything, balance out so we have
> one on A and one on C. If we define some "nr_active(rq)" concept to be
> the number of tasks actively running on cpus, then we'd be
> switching from nr_actives of 2/0 to 1/1.
> 
> However, we don't want to switch from 2/1 to 1/2 ... that's pointless.
> Or 0/1 to 1/0 (which I think is what's happening). But in the case
> where we had (theoretically) 4 HT siblings per real cpu, we would want
> to migrate 1/3 to 2/2.
> 
> The key is that we want to aggressively steal when
> nr_active(remote) - nr_active(idle) > 1 ... not > 0.
> This will implicitly *never* happen on non-HT machines, so it seems
> like a nice algorithm ... ?

Is it really that simple? 

-- 
bill davidsen <davidsen@tmr.com>
  CTO, TMR Associates, Inc
Doing interesting things with little computers since 1979.



* Re: [patch] HT scheduler, sched-2.5.68-B2
  2003-04-24 21:29         ` Bill Davidsen
@ 2003-04-24 22:39           ` Martin J. Bligh
  0 siblings, 0 replies; 10+ messages in thread
From: Martin J. Bligh @ 2003-04-24 22:39 UTC (permalink / raw)
  To: Bill Davidsen, Andrew Theurer; +Cc: Ingo Molnar, linux-kernel, ricklind

> Sorry if I misunderstand, but if HT is present, I would think that you
> would want to start the child of a fork on the same runqueue, because the
> cache is loaded, and to run the child first because in many cases the
> child will do an exec. At that point it is probable that the exec'd
> process, run on another CPU, would leave the cache useful to the parent.
> 
> I fully admit that this is "seems to me" rather than measured, but
> protecting the cache is certainly a good thing in general.

We don't do balance on exec for SMP. I think we should ;-)
AFAIK, fork always stays on the same runqueue anyway.

>> The key is that we want to aggressively steal when
>> nr_active(remote) - nr_active(idle) > 1 ... not > 0.
>> This will implicitly *never* happen on non-HT machines, so it seems
>> like a nice algorithm ... ?
> 
> Is it really that simple? 

Well, *I* think so (obviously) ;-)
Feel free to poke holes in the argument ...

M.


