* [PATCH 1/2] node affine NUMA scheduler
@ 2002-09-21  9:59 Erich Focht
  2002-09-21 10:02 ` [PATCH 2/2] " Erich Focht
                   ` (2 more replies)
  0 siblings, 3 replies; 27+ messages in thread
From: Erich Focht @ 2002-09-21  9:59 UTC (permalink / raw)
  To: linux-kernel; +Cc: LSE, Ingo Molnar, Michael Hohnbaum

[-- Attachment #1: Type: text/plain, Size: 1831 bytes --]

Hi,

here is an update of the NUMA scheduler for the 2.5.37 kernel. It
contains some bugfixes and couples the scheduler to discontigmem
memory allocation (memory is allocated from the process's homenode).

The node affine NUMA scheduler is targeted at multi-node platforms
and built on top of the O(1) scheduler. Its main objective is to
keep the memory access latency for each task as low as possible by
scheduling it on or near the node on which its memory is allocated.
This should provide the benefits of hard CPU affinity automatically.

The patch comes in two parts. The first part is the core NUMA scheduler;
it is functional without the second part and provides the following features:
 - Node aware scheduler (implements CPU pools).
 - The scheduler behaves like the O(1) scheduler within a node.
 - Equal load among nodes is targeted; stealing tasks from remote nodes
   is delayed longer if the current node carries about average load, and
   less if it is underloaded.
 - Multi-level node hierarchies are supported, with stealing delays scaled
   by the relative "node-distance" (memory access latency ratio); see the
   short sketch after this list.
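
To make the distance-based delays concrete: the first patch computes them
in init_pool_delay() as HZ * node_distance(i,j) / 77 ticks. A minimal
stand-alone sketch of that formula, using the default __node_distance
values from the patch (HZ=1024 is assumed here, as on IA64; the program
itself is only illustrative):

#include <stdio.h>

#define HZ	1024	/* assumed; i386 typically uses 100 or 1000 */
#define NODES	4

/* default node_distance() values from kernel/sched.c in the patch */
static int node_distance[NODES][NODES] = {
	{ 10, 15, 15, 15 },
	{ 15, 10, 15, 15 },
	{ 15, 15, 10, 15 },
	{ 15, 15, 15, 10 },
};

int main(void)
{
	int i, j;

	/* same formula as init_pool_delay(): steal delay in ticks */
	for (i = 0; i < NODES; i++) {
		for (j = 0; j < NODES; j++)
			printf("%4d ", HZ * node_distance[i][j] / 77);
		printf("\n");
	}
	return 0;
}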
 
The second part of the patch extends the pooling NUMA scheduler to
have node affine tasks:
 - Each process has a homenode assigned to it at creation time
   (initial load balancing); memory will be allocated from this node.
   A small prctl()-based sketch for inspecting and changing the
   homenode follows this list.
 - Each process is preferentially scheduled on its homenode and
   attracted back to it if it has been scheduled away for some reason.
   For multi-level node hierarchies the task is attracted to its
   supernode, too.
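
As mentioned above, here is a minimal user-space sketch of how the
homenode can be queried and changed through the prctl() interface that
the second patch adds (the PR_* values are taken from that patch's
<linux/prctl.h> hunk; the program is illustrative only):

#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>

/* from include/linux/prctl.h in the second patch */
#define PR_GET_NODE	16
#define PR_SET_NODE	17

int main(void)
{
	int node = -1;
	int pid = getpid();

	/* arg2 = result pointer, arg3 = pid (see the sys_prctl hunk) */
	if (prctl(PR_GET_NODE, (unsigned long)&node, pid, 0, 0) == 0)
		printf("pid %d: homenode %d\n", pid, node);

	/* move the homenode to node 0 (owner or root only) */
	prctl(PR_SET_NODE, 0, pid, 0, 0);
	return 0;
}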

The patch was tested on IA64 platforms but should work on NUMAQ i386,
too. Similar code for 2.4.18 (cf. http://home.arcor.de/efocht/sched)
has been running in production environments for months.

Comments, tests, ports to other platforms/architectures are very welcome!

Regards,
Erich


[-- Attachment #2: 01-numa_sched_core-2.5.patch --]
[-- Type: text/x-diff, Size: 30280 bytes --]

diff -urNp a/arch/i386/config.in b/arch/i386/config.in
--- a/arch/i386/config.in	Fri Sep 20 17:20:22 2002
+++ b/arch/i386/config.in	Sat Sep 21 11:44:48 2002
@@ -177,6 +177,7 @@ else
      fi
      # Common NUMA Features
      if [ "$CONFIG_X86_NUMAQ" = "y" ]; then
+        bool 'Enable NUMA scheduler' CONFIG_NUMA_SCHED
         bool 'Numa Memory Allocation Support' CONFIG_NUMA
         if [ "$CONFIG_NUMA" = "y" ]; then
            define_bool CONFIG_DISCONTIGMEM y
diff -urNp a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
--- a/arch/i386/kernel/smpboot.c	Fri Sep 20 17:20:26 2002
+++ b/arch/i386/kernel/smpboot.c	Sat Sep 21 11:44:48 2002
@@ -765,6 +765,8 @@ static int __init wakeup_secondary_via_I
 
 extern unsigned long cpu_initialized;
 
+static int __initdata nr_lnodes = 0;
+
 static void __init do_boot_cpu (int apicid) 
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -773,11 +775,17 @@ static void __init do_boot_cpu (int apic
 {
 	struct task_struct *idle;
 	unsigned long boot_error = 0;
-	int timeout, cpu;
+	int timeout, cpu, cell;
 	unsigned long start_eip;
 	unsigned short nmi_high, nmi_low;
 
 	cpu = ++cpucount;
+#ifdef CONFIG_NUMA_SCHED
+	cell = SAPICID_TO_PNODE(apicid);
+	if (pnode_to_lnode[cell] < 0) {
+		pnode_to_lnode[cell] = nr_lnodes++;
+	}
+#endif
 	/*
 	 * We can't use kernel_thread since we must avoid to
 	 * reschedule the child.
@@ -996,6 +1004,10 @@ static void __init smp_boot_cpus(unsigne
 	set_bit(0, &cpu_callout_map);
 	boot_cpu_logical_apicid = logical_smp_processor_id();
 	map_cpu_to_boot_apicid(0, boot_cpu_apicid);
+#ifdef CONFIG_NUMA_SCHED
+	pnode_to_lnode[SAPICID_TO_PNODE(boot_cpu_apicid)] = nr_lnodes++;
+	printk("boot_cpu_apicid = %d, nr_lnodes = %d, lnode = %d\n", boot_cpu_apicid, nr_lnodes, pnode_to_lnode[0]);
+#endif
 
 	current_thread_info()->cpu = 0;
 	smp_tune_scheduling();
@@ -1194,6 +1206,11 @@ int __devinit __cpu_up(unsigned int cpu)
 void __init smp_cpus_done(unsigned int max_cpus)
 {
 	zap_low_mappings();
+#ifdef CONFIG_NUMA_SCHED
+	pooldata_lock();
+	bld_pools();
+	pooldata_unlock();
+#endif
 }
 
 void __init smp_intr_init()
diff -urNp a/arch/ia64/config.in b/arch/ia64/config.in
--- a/arch/ia64/config.in	Fri Sep 20 17:20:37 2002
+++ b/arch/ia64/config.in	Sat Sep 21 11:44:48 2002
@@ -67,11 +67,13 @@ fi
 if [ "$CONFIG_IA64_GENERIC" = "y" -o "$CONFIG_IA64_DIG" = "y" -o "$CONFIG_IA64_HP_ZX1" = "y" ];
 then
 	bool '  Enable IA-64 Machine Check Abort' CONFIG_IA64_MCA
+   	bool '  Enable NUMA scheduler' CONFIG_NUMA_SCHED
 	define_bool CONFIG_PM y
 fi
 
 if [ "$CONFIG_IA64_SGI_SN1" = "y" -o "$CONFIG_IA64_SGI_SN2" = "y" ]; then
 	define_bool CONFIG_IA64_SGI_SN y
+   	bool '  Enable NUMA scheduler' CONFIG_NUMA_SCHED
 	bool '  Enable extra debugging code' CONFIG_IA64_SGI_SN_DEBUG
 	bool '  Enable SGI Medusa Simulator Support' CONFIG_IA64_SGI_SN_SIM
 	bool '  Enable autotest (llsc). Option to run cache test instead of booting' \
diff -urNp a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
--- a/arch/ia64/kernel/smpboot.c	Fri Sep 20 17:20:20 2002
+++ b/arch/ia64/kernel/smpboot.c	Sat Sep 21 11:55:03 2002
@@ -16,6 +16,7 @@
 
 #include <linux/config.h>
 
+#include <linux/acpi.h>
 #include <linux/bootmem.h>
 #include <linux/delay.h>
 #include <linux/init.h>
@@ -364,11 +365,22 @@ fork_by_hand (void)
 	return do_fork(CLONE_VM|CLONE_IDLETASK, 0, 0, 0);
 }
 
+#ifdef CONFIG_NUMA_SCHED
+static int __initdata nr_lnodes=0;
+#endif
+
 static int __init
 do_boot_cpu (int sapicid, int cpu)
 {
 	struct task_struct *idle;
-	int timeout;
+	int timeout, cell;
+
+#ifdef CONFIG_NUMA_SCHED
+	cell = SAPICID_TO_PNODE(sapicid);
+	if (pnode_to_lnode[cell] < 0) {
+		pnode_to_lnode[cell] = nr_lnodes++;
+	}
+#endif
 
 	/*
 	 * We can't use kernel_thread since we must avoid to reschedule the child.
@@ -421,7 +433,7 @@ unsigned long cache_decay_ticks;	/* # of
 static void
 smp_tune_scheduling (void)
 {
-	cache_decay_ticks = 10;	/* XXX base this on PAL info and cache-bandwidth estimate */
+	cache_decay_ticks = 8;	/* XXX base this on PAL info and cache-bandwidth estimate */
 
 	printk("task migration cache decay timeout: %ld msecs.\n",
 	       (cache_decay_ticks + 1) * 1000 / HZ);
@@ -474,6 +486,9 @@ smp_prepare_cpus (unsigned int max_cpus)
 
 	local_cpu_data->loops_per_jiffy = loops_per_jiffy;
 	ia64_cpu_to_sapicid[0] = boot_cpu_id;
+#ifdef CONFIG_NUMA_SCHED
+	pnode_to_lnode[SAPICID_TO_PNODE(boot_cpu_id)] = nr_lnodes++;
+#endif
 
 	printk("Boot processor id 0x%x/0x%x\n", 0, boot_cpu_id);
 
@@ -506,6 +521,11 @@ smp_cpus_done (unsigned int dummy)
 
 	printk(KERN_INFO"Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
 	       num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100);
+#ifdef CONFIG_NUMA_SCHED
+	pooldata_lock();
+	bld_pools();
+	pooldata_unlock();
+#endif
 }
 
 int __devinit
diff -urNp a/include/asm-i386/atomic.h b/include/asm-i386/atomic.h
--- a/include/asm-i386/atomic.h	Fri Sep 20 17:20:21 2002
+++ b/include/asm-i386/atomic.h	Sat Sep 21 11:44:48 2002
@@ -111,6 +111,18 @@ static __inline__ void atomic_inc(atomic
 }
 
 /**
+ * atomic_inc_return - increment atomic variable and return new value
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1 and returns its new value.  Note that
+ * the guaranteed useful range of an atomic_t is only 24 bits.
+ */
+static inline int atomic_inc_return(atomic_t *v){
+	atomic_inc(v);
+	return v->counter;
+}
+
+/**
  * atomic_dec - decrement atomic variable
  * @v: pointer of type atomic_t
  * 
diff -urNp a/include/asm-i386/smp.h b/include/asm-i386/smp.h
--- a/include/asm-i386/smp.h	Fri Sep 20 17:20:15 2002
+++ b/include/asm-i386/smp.h	Sat Sep 21 11:44:48 2002
@@ -124,5 +124,11 @@ static inline int num_booting_cpus(void)
 
 #define NO_PROC_ID		0xFF		/* No processor magic marker */
 
+#ifdef CONFIG_NUMA_SCHED
+#define NR_NODES               8
+#define cpu_physical_id(cpuid) (cpu_to_physical_apicid(cpuid))
+#define SAPICID_TO_PNODE(hwid)  (hwid >> 4)
+#endif
+
 #endif
 #endif
diff -urNp a/include/asm-ia64/smp.h b/include/asm-ia64/smp.h
--- a/include/asm-ia64/smp.h	Fri Sep 20 17:20:26 2002
+++ b/include/asm-ia64/smp.h	Sat Sep 21 11:44:48 2002
@@ -13,6 +13,7 @@
 
 #ifdef CONFIG_SMP
 
+#include <linux/cache.h>
 #include <linux/init.h>
 #include <linux/threads.h>
 #include <linux/kernel.h>
@@ -130,6 +131,31 @@ extern void __cpu_die (unsigned int cpu)
 extern int __cpu_up (unsigned int cpu);
 extern void __init smp_build_cpu_map(void);
 
+#ifdef CONFIG_NUMA_SCHED
+#ifdef CONFIG_IA64_DIG
+/* sooner or later this should be a configurable parameter [EF] */
+#define NR_NODES 8
+/* 
+ * This is the node ID on the NEC AzusA, 
+ * on LION and BigSur it correctly initializes to node 0
+ */
+#define SAPICID_TO_PNODE(hwid) ((hwid >> 12) & 0xff)
+
+#elif defined(CONFIG_IA64_SGI_SN)
+
+/*
+ * SGI SN1 & SN2 specific macros
+ */
+#define NR_NODES 32
+#define SAPICID_TO_PNODE(hwid) cpuid_to_cnodeid(hwid)
+
+#endif
+
+#else   /* CONFIG_NUMA_SCHED */
+#define NR_NODES               1
+#define SAPICID_TO_PNODE(hwid)  0
+#endif  /* CONFIG_NUMA_SCHED */
+
 extern void __init init_smp_config (void);
 extern void smp_do_timer (struct pt_regs *regs);
 
diff -urNp a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h	Fri Sep 20 17:20:16 2002
+++ b/include/linux/sched.h	Sat Sep 21 11:44:48 2002
@@ -159,7 +159,7 @@ extern void update_one_process(struct ta
 			       unsigned long system, int cpu);
 extern void scheduler_tick(int user_tick, int system);
 extern unsigned long cache_decay_ticks;
-
+extern void sched_migrate_task(task_t *p, int cpu);
 
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
 extern signed long FASTCALL(schedule_timeout(signed long timeout));
@@ -449,6 +449,46 @@ extern void set_cpus_allowed(task_t *p, 
 # define set_cpus_allowed(p, new_mask) do { } while (0)
 #endif
 
+/* Avoid zeroes in integer divides for load calculations */
+#define BALANCE_FACTOR 100
+/*
+ * If the current node has average load it waits 100ms before trying to
+ * steal a task from a remote node.
+ */
+
+#ifdef CONFIG_NUMA_SCHED
+
+#define POOL_DELAY(this_node,node)      \
+                (_pool_delay[this_node * numpools + node])
+#define POOL_WEIGHT(this_node,node)     \
+                (_pool_weight[this_node * numpools + node])
+
+#define NODPOL_EXEC         0   /* choose node & cpu in do_exec */
+#define NODPOL_FORK         1   /* choose node & cpu in do_fork if !CLONE_VM */
+#define NODPOL_FORK_ALL     2   /* choose node & cpu in do_fork */
+
+extern int node_levels[NR_NODES];
+extern int nr_node_levels;
+/* extern void find_node_levels(int numpools); */
+
+extern int numpools;
+extern int pool_ptr[NR_NODES+1];
+extern int pool_cpus[NR_CPUS];
+extern int pool_nr_cpus[NR_NODES];
+extern unsigned long pool_mask[NR_NODES];
+extern int pnode_to_lnode[NR_NODES];
+extern atomic_t pool_lock;
+extern void *runqueues_address;
+extern char lnode_number[NR_CPUS] __cacheline_aligned;
+#define CPU_TO_NODE(cpu) lnode_number[cpu]
+
+extern void pooldata_lock(void);
+extern void pooldata_unlock(void);
+#else
+# define POOL_DELAY(x,y)  (0)
+# define CPU_TO_NODE(cpu) (0)
+#endif
+
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(task_t *p);
 extern int task_nice(task_t *p);
diff -urNp a/kernel/ksyms.c b/kernel/ksyms.c
--- a/kernel/ksyms.c	Fri Sep 20 17:20:14 2002
+++ b/kernel/ksyms.c	Sat Sep 21 11:44:48 2002
@@ -606,6 +606,18 @@ EXPORT_SYMBOL(init_task);
 EXPORT_SYMBOL(init_thread_union);
 
 EXPORT_SYMBOL(tasklist_lock);
+
+#ifdef CONFIG_NUMA_SCHED
+#include <linux/sched.h>
+EXPORT_SYMBOL(runqueues_address);
+EXPORT_SYMBOL(numpools);
+EXPORT_SYMBOL(pool_ptr);
+EXPORT_SYMBOL(pool_cpus);
+EXPORT_SYMBOL(pool_nr_cpus);
+EXPORT_SYMBOL(pool_mask);
+EXPORT_SYMBOL(sched_migrate_task);
+#endif
+
 #if defined(CONFIG_SMP) && defined(__GENERIC_PER_CPU)
 EXPORT_SYMBOL(__per_cpu_offset);
 #endif
diff -urNp a/kernel/sched.c b/kernel/sched.c
--- a/kernel/sched.c	Fri Sep 20 17:20:32 2002
+++ b/kernel/sched.c	Sat Sep 21 11:44:48 2002
@@ -154,6 +154,10 @@ struct runqueue {
 	task_t *migration_thread;
 	struct list_head migration_queue;
 
+	unsigned long wait_time;
+	int wait_node;
+	int load[2][NR_CPUS];
+
 } ____cacheline_aligned;
 
 static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
@@ -174,6 +178,39 @@ static struct runqueue runqueues[NR_CPUS
 #endif
 
 /*
+ * Variables for describing and accessing processor pools. Using a
+ * compressed row format like notation. Processor pools are treated
+ * like logical node numbers.
+ *
+ * numpools: number of CPU pools (nodes),
+ * pool_cpus[]: CPUs in pools sorted by their pool ID,
+ * pool_ptr[pool]: index of first element in pool_cpus[] belonging to pool.
+ * pnode_to_lnode[pnode]: pool number corresponding to a physical node ID.
+ * pool_mask[]: cpu mask of a pool.
+ * _pool_delay[]: delay when stealing a task from remote nodes for multilevel
+ *               topology. Needed by the macro POOL_DELAY().
+ *
+ * Example: loop over all CPUs in a pool p:
+ * for (i = pool_ptr[p]; i < pool_ptr[p+1]; i++) {
+ *      cpu = pool_cpus[i];
+ *      ...
+ * }
+ *                                                      <efocht@ess.nec.de>
+ */
+int numpools = 1;
+int pool_ptr[NR_NODES+1] = { 0, NR_CPUS, };
+int pool_cpus[NR_CPUS];
+int pool_nr_cpus[NR_NODES] = { NR_CPUS, };
+unsigned long pool_mask[NR_NODES] = { -1L, };
+int pnode_to_lnode[NR_NODES] = { [0 ... NR_NODES-1] = -1 };
+void *runqueues_address = (void *)runqueues; /* export this symbol to modules */
+char lnode_number[NR_CPUS] __cacheline_aligned;
+static int _pool_delay[NR_NODES*NR_NODES] __cacheline_aligned;
+static int _pool_weight[NR_NODES*NR_NODES] __cacheline_aligned;
+static atomic_t pool_lock = ATOMIC_INIT(0); /* set to 1 while modifying pool data */
+#define MAX_CACHE_WEIGHT 100
+
+/*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts.  Note the ordering: we can safely lookup the task_rq without
  * explicitly disabling preemption.
@@ -447,8 +484,10 @@ int wake_up_process(task_t * p)
  */
 void wake_up_forked_process(task_t * p)
 {
-	runqueue_t *rq = this_rq_lock();
+	runqueue_t *rq;
+	unsigned long flags;
 
+	rq = task_rq_lock(p, &flags);
 	p->state = TASK_RUNNING;
 	if (!rt_task(p)) {
 		/*
@@ -462,8 +501,9 @@ void wake_up_forked_process(task_t * p)
 	}
 	set_task_cpu(p, smp_processor_id());
 	activate_task(p, rq);
-
-	rq_unlock(rq);
+	if (p->prio < rq->curr->prio)
+		resched_task(rq->curr);
+	task_rq_unlock(rq, &flags);
 }
 
 /*
@@ -548,6 +588,11 @@ unsigned long nr_running(void)
 	return sum;
 }
 
+unsigned long qnr_running(int cpu)
+{
+	return cpu_rq(cpu)->nr_running;
+}
+
 unsigned long nr_uninterruptible(void)
 {
 	unsigned long i, sum = 0;
@@ -630,46 +675,21 @@ static inline unsigned int double_lock_b
 }
 
 /*
- * find_busiest_queue - find the busiest runqueue.
+ * Calculate load of a CPU pool, store results in data[][NR_CPUS].
+ * Return the index of the most loaded runqueue.
+ *
  */
-static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance)
+static int calc_pool_load(int data[][NR_CPUS], int this_cpu, int pool, int idle)
 {
-	int nr_running, load, max_load, i;
-	runqueue_t *busiest, *rq_src;
-
-	/*
-	 * We search all runqueues to find the most busy one.
-	 * We do this lockless to reduce cache-bouncing overhead,
-	 * we re-check the 'best' source CPU later on again, with
-	 * the lock held.
-	 *
-	 * We fend off statistical fluctuations in runqueue lengths by
-	 * saving the runqueue length during the previous load-balancing
-	 * operation and using the smaller one the current and saved lengths.
-	 * If a runqueue is long enough for a longer amount of time then
-	 * we recognize it and pull tasks from it.
-	 *
-	 * The 'current runqueue length' is a statistical maximum variable,
-	 * for that one we take the longer one - to avoid fluctuations in
-	 * the other direction. So for a load-balance to happen it needs
-	 * stable long runqueue on the target CPU and stable short runqueue
-	 * on the local runqueue.
-	 *
-	 * We make an exception if this CPU is about to become idle - in
-	 * that case we are less picky about moving a task across CPUs and
-	 * take what can be taken.
-	 */
-	if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
-		nr_running = this_rq->nr_running;
-	else
-		nr_running = this_rq->prev_nr_running[this_cpu];
+	runqueue_t *rq_src, *this_rq = cpu_rq(this_cpu);
+	int this_pool = CPU_TO_NODE(this_cpu);
+	int i, ii, idx=-1, refload, load;
 
-	busiest = NULL;
-	max_load = 1;
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_online(i))
-			continue;
+	data[1][pool] = 0;
+	refload = -1;
 
+	for (ii = pool_ptr[pool]; ii < pool_ptr[pool+1]; ii++) {
+		i = pool_cpus[ii];
 		rq_src = cpu_rq(i);
 		if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i]))
 			load = rq_src->nr_running;
@@ -677,74 +697,125 @@ static inline runqueue_t *find_busiest_q
 			load = this_rq->prev_nr_running[i];
 		this_rq->prev_nr_running[i] = rq_src->nr_running;
 
-		if ((load > max_load) && (rq_src != this_rq)) {
-			busiest = rq_src;
-			max_load = load;
+		data[0][i] = load;
+		data[1][pool] += load;
+
+		if (load > refload) {
+			idx = i;
+			refload = load;
 		}
 	}
+	data[1][pool] = data[1][pool] * BALANCE_FACTOR / pool_nr_cpus[pool];
+	return idx;
+}
 
-	if (likely(!busiest))
-		goto out;
+/*
+ * Find a runqueue from which to steal a task. We try to do this as locally as
+ * possible because we don't want to let tasks get far from their home node.
+ * This is done in two steps:
+ * 1. First try to find a runqueue within the own CPU pool (AKA node) with
+ * imbalance larger than 25% (relative to the current runqueue).
+ * 2. If the local node is well balanced, locate the most loaded node and its
+ * most loaded CPU. Remote runqueues running tasks having their homenode on the
+ * current node are preferred (those tasks count twice in the load calculation).
+ * If the current load is far below the average try to steal a task from the
+ * most loaded node/cpu. Otherwise wait 100ms and give less loaded nodes the
+ * chance to approach the average load.
+ *
+ * This concept can be extended easily to more than two levels (multi-level
+ * scheduler?), e.g.: CPU -> multi-core package -> node -> supernode...
+ *                                                         <efocht@ess.nec.de>
+ */
+static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu,
+					     int idle, int *nr_running)
+{
+	runqueue_t *busiest = NULL;
+	int imax, best_cpu, pool, max_pool_load, max_pool_idx;
+	int i, del_shift;
+	int avg_load=-1, this_pool = CPU_TO_NODE(this_cpu);
 
-	*imbalance = (max_load - nr_running) / 2;
+	/* Need at least ~25% imbalance to trigger balancing. */
+#define BALANCED(m,t) (((m) <= 1) || (((m) - (t))/2 < (((m) + (t))/2 + 3)/4))
+
+	if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
+		*nr_running = this_rq->nr_running;
+	else
+		*nr_running = this_rq->prev_nr_running[this_cpu];
 
-	/* It needs an at least ~25% imbalance to trigger balancing. */
-	if (!idle && (*imbalance < (max_load + 3)/4)) {
-		busiest = NULL;
+	best_cpu = calc_pool_load(this_rq->load, this_cpu, this_pool, idle);
+	if (best_cpu != this_cpu)
+		goto check_out;
+
+ scan_all:
+	best_cpu = -1;
+	max_pool_load = this_rq->load[1][this_pool];
+	max_pool_idx = this_pool;
+	avg_load = max_pool_load * pool_nr_cpus[this_pool];
+	for (i = 1; i < numpools; i++) {
+		pool = (i + this_pool) % numpools;
+		imax = calc_pool_load(this_rq->load, this_cpu, pool, idle);
+		avg_load += this_rq->load[1][pool]*pool_nr_cpus[pool];
+		if (this_rq->load[1][pool] > max_pool_load) {
+			max_pool_load = this_rq->load[1][pool];
+			max_pool_idx = pool;
+			best_cpu = imax;
+		}
+	}
+	/* Exit if not enough imbalance on any remote node. */
+	if ((best_cpu < 0) ||
+	    BALANCED(max_pool_load,this_rq->load[1][this_pool])) {
+		this_rq->wait_node = -1;
 		goto out;
 	}
+	avg_load /= num_online_cpus();
+	/* Wait longer before stealing if load is average. */
+	if (BALANCED(avg_load,this_rq->load[1][this_pool]))
+		del_shift = 0;
+	else
+		del_shift = 6;
 
-	nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
-	/*
-	 * Make sure nothing changed since we checked the
-	 * runqueue length.
-	 */
-	if (busiest->nr_running <= nr_running + 1) {
-		spin_unlock(&busiest->lock);
-		busiest = NULL;
-	}
-out:
+	if (this_rq->wait_node != max_pool_idx) {
+		this_rq->wait_node = max_pool_idx;
+		this_rq->wait_time = jiffies;
+		goto out;
+	} else
+		if (jiffies - this_rq->wait_time <
+		    (POOL_DELAY(this_pool,this_rq->wait_node) >> del_shift))
+			goto out;
+ check_out:
+	/* Enough imbalance in the remote cpu loads? */
+	if (!BALANCED(this_rq->load[0][best_cpu],*nr_running)) {
+		busiest = cpu_rq(best_cpu);
+		this_rq->wait_node = -1;
+	} else if (avg_load == -1)
+		/* only scanned local pool, so let's look at all of them */
+		goto scan_all;
+ out:
 	return busiest;
 }
 
 /*
- * pull_task - move a task from a remote runqueue to the local runqueue.
- * Both runqueues must be locked.
- */
-static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
-{
-	dequeue_task(p, src_array);
-	src_rq->nr_running--;
-	set_task_cpu(p, this_cpu);
-	this_rq->nr_running++;
-	enqueue_task(p, this_rq->active);
-	/*
-	 * Note that idle threads have a prio of MAX_PRIO, for this test
-	 * to be always true for them.
-	 */
-	if (p->prio < this_rq->curr->prio)
-		set_need_resched();
-}
-
-/*
- * Current runqueue is empty, or rebalance tick: if there is an
- * inbalance (current runqueue is too short) then pull from
- * busiest runqueue(s).
- *
- * We call this with the current runqueue locked,
- * irqs disabled.
+ * Find a task to steal from the busiest RQ. The busiest->lock must be held
+ * while calling this routine. 
  */
-static void load_balance(runqueue_t *this_rq, int idle)
+static inline task_t *task_to_steal(runqueue_t *busiest, int this_cpu)
 {
-	int imbalance, idx, this_cpu = smp_processor_id();
-	runqueue_t *busiest;
+	int idx;
+	task_t *next = NULL, *tmp;
 	prio_array_t *array;
 	struct list_head *head, *curr;
-	task_t *tmp;
+	int this_pool=CPU_TO_NODE(this_cpu), weight, maxweight=0;
 
-	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance);
-	if (!busiest)
-		goto out;
+	/*
+	 * We do not migrate tasks that are:
+	 * 1) running (obviously), or
+	 * 2) cannot be migrated to this CPU due to cpus_allowed.
+	 */
+
+#define CAN_MIGRATE_TASK(p,rq,this_cpu)	\
+		((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \
+		p != rq->curr && \
+		 ((p)->cpus_allowed & (1UL<<(this_cpu))))
 
 	/*
 	 * We first consider expired tasks. Those will likely not be
@@ -770,7 +841,7 @@ skip_bitmap:
 			array = busiest->active;
 			goto new_array;
 		}
-		goto out_unlock;
+		goto out;
 	}
 
 	head = array->queue + idx;
@@ -778,33 +849,72 @@ skip_bitmap:
 skip_queue:
 	tmp = list_entry(curr, task_t, run_list);
 
+	if (CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
+		weight = (jiffies - tmp->sleep_timestamp)/cache_decay_ticks;
+		if (weight > maxweight) {
+			maxweight = weight;
+			next = tmp;
+		}
+	}
+	curr = curr->next;
+	if (curr != head)
+		goto skip_queue;
+	idx++;
+	goto skip_bitmap;
+
+ out:
+	return next;
+}
+
+/*
+ * pull_task - move a task from a remote runqueue to the local runqueue.
+ * Both runqueues must be locked.
+ */
+static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
+{
+	dequeue_task(p, src_array);
+	src_rq->nr_running--;
+	set_task_cpu(p, this_cpu);
+	this_rq->nr_running++;
+	enqueue_task(p, this_rq->active);
 	/*
-	 * We do not migrate tasks that are:
-	 * 1) running (obviously), or
-	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
-	 * 3) are cache-hot on their current CPU.
+	 * Note that idle threads have a prio of MAX_PRIO, for this test
+	 * to be always true for them.
 	 */
+	if (p->prio < this_rq->curr->prio)
+		set_need_resched();
+}
 
-#define CAN_MIGRATE_TASK(p,rq,this_cpu)					\
-	((jiffies - (p)->sleep_timestamp > cache_decay_ticks) &&	\
-		!task_running(rq, p) &&					\
-			((p)->cpus_allowed & (1UL << (this_cpu))))
-
-	curr = curr->prev;
-
-	if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
-		if (curr != head)
-			goto skip_queue;
-		idx++;
-		goto skip_bitmap;
-	}
-	pull_task(busiest, array, tmp, this_rq, this_cpu);
-	if (!idle && --imbalance) {
-		if (curr != head)
-			goto skip_queue;
-		idx++;
-		goto skip_bitmap;
-	}
+/*
+ * Current runqueue is empty, or rebalance tick: if there is an
+ * inbalance (current runqueue is too short) then pull from
+ * busiest runqueue(s).
+ *
+ * We call this with the current runqueue locked,
+ * irqs disabled.
+ */
+static void load_balance(runqueue_t *this_rq, int idle)
+{
+	int nr_running, this_cpu = task_cpu(this_rq->curr);
+	task_t *tmp;
+	runqueue_t *busiest;
+
+	/* avoid deadlock by timer interrupt on own cpu */
+	if (atomic_read(&pool_lock)) return;
+	busiest = find_busiest_queue(this_rq, this_cpu, idle, &nr_running);
+
+	nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
+	/*
+	 * Make sure nothing changed since we checked the
+	 * runqueue length.
+	 */
+	if (busiest->nr_running <= nr_running + 1)
+		goto out_unlock;
+
+	tmp = task_to_steal(busiest, this_cpu);
+	if (!tmp)
+		goto out_unlock;
+	pull_task(busiest, tmp->array, tmp, this_rq, this_cpu);
 out_unlock:
 	spin_unlock(&busiest->lock);
 out:
@@ -817,10 +927,10 @@ out:
  * frequency and balancing agressivity depends on whether the CPU is
  * idle or not.
  *
- * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on
+ * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on
  * systems with HZ=100, every 10 msecs.)
  */
-#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
+#define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
 
 static inline void idle_tick(runqueue_t *rq)
@@ -1862,6 +1972,176 @@ void show_state(void)
 	read_unlock(&tasklist_lock);
 }
 
+#ifdef CONFIG_NUMA_SCHED
+void pools_info(void)
+{
+	int i, j;
+
+	printk("CPU pools : %d\n",numpools);
+	for (i=0;i<numpools;i++) {
+		printk("pool %d :",i);
+		for (j=pool_ptr[i];j<pool_ptr[i+1];j++)
+			printk("%d ",pool_cpus[j]);
+		printk("\n");
+	}
+}
+
+void pooldata_lock(void)
+{
+	int i;
+ retry:
+	while (atomic_read(&pool_lock));
+	if (atomic_inc_return(&pool_lock) > 1) {
+		atomic_dec(&pool_lock);
+		goto retry;
+	}
+	/* 
+	 * Wait a while, any loops using pool data should finish
+	 * in between. This is VERY ugly and should be replaced
+	 * by some real RCU stuff. [EF]
+	 */
+	for (i=0; i<100; i++)
+		udelay(1000);
+}
+
+void pooldata_unlock(void)
+{
+	atomic_dec(&pool_lock);
+}
+
+int node_levels[NR_NODES];
+int nr_node_levels;
+
+/*
+ * Default setting of node_distance() for up to 8 nodes.
+ * Each platform should initialize this to more appropriate values
+ * in the arch dependent part.
+ */
+#ifndef node_distance
+int __node_distance[ 8 * 8]    = { 10, 15, 15, 15, 20, 20, 20, 20,
+				   15, 10, 15, 15, 20, 20, 20, 20,
+				   15, 15, 10, 15, 20, 20, 20, 20,
+				   15, 15, 15, 10, 20, 20, 20, 20,
+				   20, 20, 20, 20, 10, 15, 15, 15,
+				   20, 20, 20, 20, 15, 10, 15, 15,
+				   20, 20, 20, 20, 15, 15, 10, 15,
+				   20, 20, 20, 20, 15, 15, 15, 10 };
+#define node_distance(i,j)  __node_distance[i*8+j]
+#endif
+
+/*
+ * Find all values of node distances in the SLIT table and sort them
+ * into the array node_levels[].
+ */
+static void find_node_levels(int numpools)
+{
+	int lev, tgtlev, nlarger, i;
+
+	nr_node_levels = 1;
+	node_levels[0] = node_distance(0, 0);
+	do {
+		nlarger = 0;
+		tgtlev = 100000;
+		for (i=1; i<numpools; i++) {
+			lev = node_distance(0, i);
+			if (lev > node_levels[nr_node_levels-1] &&
+			    lev < tgtlev) {
+				if (tgtlev < 100000) nlarger++;
+				tgtlev = lev;
+			}
+			if (lev > tgtlev) nlarger++;
+		}
+		if (tgtlev != 100000)
+			node_levels[nr_node_levels++] = tgtlev;
+	} while (nlarger);
+
+	for (i=0; i<nr_node_levels; i++)
+		printk("node level %d : %d\n",i,node_levels[i]);
+}
+
+/*
+ * Initialize the pool_weight[] matrix.
+ */
+static void
+init_pool_weight(void)
+{
+	int i, j, lev;
+
+	for (i=0; i<numpools; i++)
+		for (j=0; j<numpools; j++)
+			for (lev=0; lev<nr_node_levels; lev++)
+				if (node_distance(i,j) == node_levels[lev]) {
+					_pool_weight[i*numpools+j] =
+						(2*(nr_node_levels-lev)-1)
+						*MAX_CACHE_WEIGHT;
+					break;
+				}
+}
+
+/*
+ * Initialize delay for stealing tasks from remote nodes
+ */
+static void
+init_pool_delay(void)
+{
+	int i, j;
+
+	for(i=0; i<numpools; i++)
+		for(j=0; j<numpools; j++)
+			_pool_delay[i * numpools + j] = HZ * \
+				node_distance(i,j) / 77;
+
+	printk("pool_delay matrix:\n");
+	for(i=0; i<numpools; i++){
+		for(j=0; j<numpools; j++)
+			printk("%4d ",_pool_delay[i*numpools+j]);
+		printk("\n");
+	}
+}
+
+/*
+ * Call pooldata_lock() before calling this function and
+ * pooldata_unlock() after!
+ */
+void bld_pools(void)
+{
+	int i, j, ptr;
+	int flag[NR_CPUS] = { [ 0 ... NR_CPUS-1] = 0 };
+	unsigned long mask;
+
+	/* build translation table for CPU_TO_NODE macro */
+	for (i = 0; i < NR_CPUS; i++)
+		if (cpu_online(i))
+			lnode_number[i] = pnode_to_lnode[SAPICID_TO_PNODE(cpu_physical_id(i))];
+
+	numpools = 0;
+	ptr = 0;
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_online(i)) continue;
+		if (!flag[i]) {
+			pool_ptr[numpools]=ptr;
+			mask = 0UL;
+			for (j = 0; j < NR_CPUS; j++) {
+				if (!cpu_online(j)) continue;
+				if (i == j || CPU_TO_NODE(i) == CPU_TO_NODE(j)) {
+					pool_cpus[ptr++]=j;
+					flag[j]=1;
+					mask |= (1UL<<j);
+				}
+			}
+			pool_nr_cpus[numpools] = ptr - pool_ptr[numpools];
+			pool_mask[numpools] = mask;
+			numpools++;
+		}
+	}
+	pool_ptr[numpools]=ptr;
+	pools_info();
+	find_node_levels(numpools);
+	init_pool_weight();
+	init_pool_delay();
+}
+
+#endif /* CONFIG_NUMA_SCHED */
 void __init init_idle(task_t *idle, int cpu)
 {
 	runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
@@ -1909,6 +2189,8 @@ typedef struct {
 	struct list_head list;
 	task_t *task;
 	struct completion done;
+	int cpu_dest;
+	int sync;
 } migration_req_t;
 
 /*
@@ -1954,6 +2236,7 @@ void set_cpus_allowed(task_t *p, unsigne
 	}
 	init_completion(&req.done);
 	req.task = p;
+	req.sync = 1;
 	list_add(&req.list, &rq->migration_queue);
 	task_rq_unlock(rq, &flags);
 	wake_up_process(rq->migration_thread);
@@ -1963,6 +2246,17 @@ out:
 	preempt_enable();
 }
 
+void sched_migrate_task(task_t *p, int dest_cpu)
+{
+	unsigned long old_mask;
+
+	old_mask = p->cpus_allowed;
+	if (!(old_mask & (1UL << dest_cpu)))
+		return;
+	set_cpus_allowed(p, 1UL << dest_cpu);
+	set_cpus_allowed(p, old_mask);
+}
+
 /*
  * migration_thread - this is a highprio system thread that performs
  * thread migration by 'pulling' threads into the target runqueue.
@@ -1998,7 +2292,7 @@ static int migration_thread(void * data)
 	for (;;) {
 		runqueue_t *rq_src, *rq_dest;
 		struct list_head *head;
-		int cpu_src, cpu_dest;
+		int cpu_src, cpu_dest, sync;
 		migration_req_t *req;
 		unsigned long flags;
 		task_t *p;
@@ -2013,10 +2307,17 @@ static int migration_thread(void * data)
 		}
 		req = list_entry(head->next, migration_req_t, list);
 		list_del_init(head->next);
-		spin_unlock_irqrestore(&rq->lock, flags);
 
 		p = req->task;
-		cpu_dest = __ffs(p->cpus_allowed);
+		sync = req->sync;
+		if (sync)
+			cpu_dest = __ffs(p->cpus_allowed & cpu_online_map);
+		else {
+			cpu_dest = req->cpu_dest;
+			req->task = NULL;
+		}
+		spin_unlock_irqrestore(&rq->lock, flags);
+
 		rq_dest = cpu_rq(cpu_dest);
 repeat:
 		cpu_src = task_cpu(p);
@@ -2039,7 +2340,8 @@ repeat:
 		double_rq_unlock(rq_src, rq_dest);
 		local_irq_restore(flags);
 
-		complete(&req->done);
+		if (sync)
+			complete(&req->done);
 	}
 }
 
@@ -2119,6 +2421,15 @@ void __init sched_init(void)
 			__set_bit(MAX_PRIO, array->bitmap);
 		}
 	}
+#ifdef CONFIG_NUMA_SCHED
+	pool_ptr[0] = 0;
+	pool_ptr[1] = NR_CPUS;
+
+	numpools = 1;
+	pool_mask[0] = -1L;
+	pool_nr_cpus[0] = NR_CPUS;
+#endif
+
 	/*
 	 * We have to do a little magic to get the first
 	 * thread right in SMP mode.


* Re: [PATCH 2/2] node affine NUMA scheduler
  2002-09-21  9:59 [PATCH 1/2] node affine NUMA scheduler Erich Focht
@ 2002-09-21 10:02 ` Erich Focht
  2002-09-21 15:55 ` [Lse-tech] [PATCH 1/2] " Martin J. Bligh
  2002-09-22 15:52 ` Martin J. Bligh
  2 siblings, 0 replies; 27+ messages in thread
From: Erich Focht @ 2002-09-21 10:02 UTC (permalink / raw)
  To: linux-kernel; +Cc: LSE, Ingo Molnar, Michael Hohnbaum

[-- Attachment #1: Type: text/plain, Size: 1119 bytes --]

Here comes the second part of the node affine NUMA scheduler.

> The patch comes in two parts. The first part is the core NUMA scheduler,
> it is functional without the second part and provides following features:
>  - Node aware scheduler (implemented CPU pools).
>  - Scheduler behaves like O(1) scheduler within a node.
>  - Equal load among nodes is targeted, stealing tasks from remote nodes
>    is delayed more if the current node is averagely loaded, less if it's
>    unloaded.
>  - Multi-level node hierarchies are supported by stealing delays adjusted
>    by the relative "node-distance" (memory access latency ratio).
>
> The second part of the patch extends the pooling NUMA scheduler to
> have node affine tasks:
>  - Each process has a homenode assigned to it at creation time
>    (initial load balancing). Memory will be allocated from this node.
>  - Each process is preferentially scheduled on its homenode and
>    attracted back to it if scheduled away for some reason. For
>    multi-level node hierarchies the task is attracted to its
>    supernode, too.

Regards,
Erich

[-- Attachment #2: 02-numa_sched_affine-2.5.patch --]
[-- Type: text/x-diff, Size: 15211 bytes --]

diff -urNp a/arch/i386/config.in b/arch/i386/config.in
--- a/arch/i386/config.in	Sat Sep 21 11:44:48 2002
+++ b/arch/i386/config.in	Sat Sep 21 11:56:23 2002
@@ -177,7 +177,7 @@ else
      fi
      # Common NUMA Features
      if [ "$CONFIG_X86_NUMAQ" = "y" ]; then
-        bool 'Enable NUMA scheduler' CONFIG_NUMA_SCHED
+        bool 'Enable node affine NUMA scheduler' CONFIG_NUMA_SCHED
         bool 'Numa Memory Allocation Support' CONFIG_NUMA
         if [ "$CONFIG_NUMA" = "y" ]; then
            define_bool CONFIG_DISCONTIGMEM y
diff -urNp a/arch/ia64/config.in b/arch/ia64/config.in
--- a/arch/ia64/config.in	Sat Sep 21 11:44:48 2002
+++ b/arch/ia64/config.in	Sat Sep 21 11:56:23 2002
@@ -67,13 +67,13 @@ fi
 if [ "$CONFIG_IA64_GENERIC" = "y" -o "$CONFIG_IA64_DIG" = "y" -o "$CONFIG_IA64_HP_ZX1" = "y" ];
 then
 	bool '  Enable IA-64 Machine Check Abort' CONFIG_IA64_MCA
-   	bool '  Enable NUMA scheduler' CONFIG_NUMA_SCHED
+   	bool '  Enable node affine NUMA scheduler' CONFIG_NUMA_SCHED
 	define_bool CONFIG_PM y
 fi
 
 if [ "$CONFIG_IA64_SGI_SN1" = "y" -o "$CONFIG_IA64_SGI_SN2" = "y" ]; then
 	define_bool CONFIG_IA64_SGI_SN y
-   	bool '  Enable NUMA scheduler' CONFIG_NUMA_SCHED
+   	bool '  Enable node affine NUMA scheduler' CONFIG_NUMA_SCHED
 	bool '  Enable extra debugging code' CONFIG_IA64_SGI_SN_DEBUG
 	bool '  Enable SGI Medusa Simulator Support' CONFIG_IA64_SGI_SN_SIM
 	bool '  Enable autotest (llsc). Option to run cache test instead of booting' \
diff -urNp a/fs/exec.c b/fs/exec.c
--- a/fs/exec.c	Fri Sep 20 17:20:23 2002
+++ b/fs/exec.c	Sat Sep 21 11:56:23 2002
@@ -995,6 +995,9 @@ int do_execve(char * filename, char ** a
 	int retval;
 	int i;
 
+	if (current->node_policy == NODPOL_EXEC)
+		sched_balance_exec();
+
 	file = open_exec(filename);
 
 	retval = PTR_ERR(file);
diff -urNp a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
--- a/include/asm-i386/mmzone.h	Fri Sep 20 17:20:26 2002
+++ b/include/asm-i386/mmzone.h	Sat Sep 21 11:56:23 2002
@@ -20,7 +20,11 @@
 #endif /* CONFIG_X86_NUMAQ */
 
 #ifdef CONFIG_NUMA
+#ifdef CONFIG_NUMA_SCHED
+#define numa_node_id() (current->node)
+#else
 #define numa_node_id() _cpu_to_node(smp_processor_id())
+#endif
 #endif /* CONFIG_NUMA */
 
 extern struct pglist_data *node_data[];
diff -urNp a/include/linux/prctl.h b/include/linux/prctl.h
--- a/include/linux/prctl.h	Fri Sep 20 17:20:30 2002
+++ b/include/linux/prctl.h	Sat Sep 21 11:56:23 2002
@@ -34,4 +34,10 @@
 # define PR_FP_EXC_ASYNC	2	/* async recoverable exception mode */
 # define PR_FP_EXC_PRECISE	3	/* precise exception mode */
 
+/* Get/set node for node-affine scheduling */
+#define PR_GET_NODE       16
+#define PR_SET_NODE       17
+#define PR_GET_NODPOL     18
+#define PR_SET_NODPOL     19
+
 #endif /* _LINUX_PRCTL_H */
diff -urNp a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h	Sat Sep 21 11:44:48 2002
+++ b/include/linux/sched.h	Sat Sep 21 11:56:23 2002
@@ -159,6 +159,15 @@ extern void update_one_process(struct ta
 			       unsigned long system, int cpu);
 extern void scheduler_tick(int user_tick, int system);
 extern unsigned long cache_decay_ticks;
+#ifdef CONFIG_NUMA_SCHED
+extern void sched_balance_exec(void);
+extern void sched_balance_fork(task_t *p);
+extern void set_task_node(task_t *p, int node);
+#else
+#define sched_balance_exec() {}
+#define sched_balance_fork(p) {}
+#define set_task_node(p,n) {}
+#endif
 extern void sched_migrate_task(task_t *p, int cpu);
 
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
@@ -293,6 +302,8 @@ struct task_struct {
 	unsigned long policy;
 	unsigned long cpus_allowed;
 	unsigned int time_slice, first_time_slice;
+	int node;
+	int node_policy;
 
 	struct list_head tasks;
 	struct list_head ptrace_children;
@@ -484,9 +495,13 @@ extern char lnode_number[NR_CPUS] __cach
 
 extern void pooldata_lock(void);
 extern void pooldata_unlock(void);
+# define HOMENODE_INC(rq,node) (rq)->nr_homenode[node]++
+# define HOMENODE_DEC(rq,node) (rq)->nr_homenode[node]--
 #else
 # define POOL_DELAY(x,y)  (0)
 # define CPU_TO_NODE(cpu) (0)
+# define HOMENODE_INC(rq,node) {}
+# define HOMENODE_DEC(rq,node) {}
 #endif
 
 extern void set_user_nice(task_t *p, long nice);
diff -urNp a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c	Fri Sep 20 17:20:16 2002
+++ b/kernel/fork.c	Sat Sep 21 11:56:23 2002
@@ -701,6 +701,9 @@ static struct task_struct *copy_process(
 #ifdef CONFIG_SMP
 	{
 		int i;
+		if (p->node_policy == NODPOL_FORK_ALL ||
+		    (p->node_policy == NODPOL_FORK && !(clone_flags & CLONE_VM)))
+				sched_balance_fork(p);
 
 		/* ?? should we just memset this ?? */
 		for(i = 0; i < NR_CPUS; i++)
diff -urNp a/kernel/sched.c b/kernel/sched.c
--- a/kernel/sched.c	Sat Sep 21 11:44:48 2002
+++ b/kernel/sched.c	Sat Sep 21 11:56:23 2002
@@ -156,6 +156,7 @@ struct runqueue {
 
 	unsigned long wait_time;
 	int wait_node;
+	short nr_homenode[NR_NODES];
 	int load[2][NR_CPUS];
 
 } ____cacheline_aligned;
@@ -328,6 +329,7 @@ static inline void activate_task(task_t 
 	}
 	enqueue_task(p, array);
 	rq->nr_running++;
+	HOMENODE_INC(rq,p->node);
 }
 
 /*
@@ -335,6 +337,7 @@ static inline void activate_task(task_t 
  */
 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
+	HOMENODE_DEC(rq,p->node);
 	rq->nr_running--;
 	if (p->state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible++;
@@ -696,6 +699,9 @@ static int calc_pool_load(int data[][NR_
 		else
 			load = this_rq->prev_nr_running[i];
 		this_rq->prev_nr_running[i] = rq_src->nr_running;
+		/* prefer cpus running tasks from this node */
+		if (pool != this_pool)
+			load += rq_src->nr_homenode[this_pool];
 
 		data[0][i] = load;
 		data[1][pool] += load;
@@ -851,6 +857,13 @@ skip_queue:
 
 	if (CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
 		weight = (jiffies - tmp->sleep_timestamp)/cache_decay_ticks;
+		/* limit weight influence of sleep_time and cache coolness */
+		if (weight >= MAX_CACHE_WEIGHT) weight=MAX_CACHE_WEIGHT-1;
+		/* weight depending on homenode of task */
+		weight += POOL_WEIGHT(this_pool,tmp->node);
+		/* task gets bonus if running on its homenode */
+		if (tmp->node == CPU_TO_NODE(task_cpu(busiest->curr)))
+			weight -= MAX_CACHE_WEIGHT;
 		if (weight > maxweight) {
 			maxweight = weight;
 			next = tmp;
@@ -885,6 +898,36 @@ static inline void pull_task(runqueue_t 
 		set_need_resched();
 }
 
+static inline void
+try_push_home(runqueue_t *this_rq, int this_cpu, int nr_running)
+{
+	task_t *p;
+	int tgt_pool, tgt_cpu, i, ii;
+	runqueue_t *rq;
+	static int sched_push_task(task_t *p, int cpu_dest);
+
+	if (nr_running != 1)
+		return;
+	p = this_rq->curr;
+	tgt_pool = p->node;
+	if (tgt_pool != CPU_TO_NODE(this_cpu)) {
+		/* compute how many own tasks run on the tgt node */
+		int load = 0;
+		for (ii=pool_ptr[tgt_pool]; ii<pool_ptr[tgt_pool+1]; ii++) {
+			i = pool_cpus[ii];
+			rq = cpu_rq(i);
+			load += rq->nr_homenode[tgt_pool];
+		}
+		load = BALANCE_FACTOR * load / pool_nr_cpus[tgt_pool];
+		if (load < BALANCE_FACTOR/4) {
+			tgt_cpu = __ffs(p->cpus_allowed & pool_mask[tgt_pool]
+					& cpu_online_map);
+			if (tgt_cpu)
+				sched_push_task(p, tgt_cpu);
+		}
+	}
+}
+
 /*
  * Current runqueue is empty, or rebalance tick: if there is an
  * inbalance (current runqueue is too short) then pull from
@@ -902,6 +945,10 @@ static void load_balance(runqueue_t *thi
 	/* avoid deadlock by timer interrupt on own cpu */
 	if (atomic_read(&pool_lock)) return;
 	busiest = find_busiest_queue(this_rq, this_cpu, idle, &nr_running);
+	if (!busiest) {
+		try_push_home(this_rq, this_cpu, nr_running);
+		goto out;
+	}
 
 	nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
 	/*
@@ -1972,6 +2019,102 @@ void show_state(void)
 	read_unlock(&tasklist_lock);
 }
 
+/* used as counter for round-robin node-scheduling */
+static atomic_t sched_node=ATOMIC_INIT(0);
+
+/*
+ * Find the least loaded CPU on the current node of the task.
+ */
+static int sched_best_cpu(struct task_struct *p)
+{
+	int n, cpu, load, best_cpu = task_cpu(p);
+	
+	load = 1000000;
+	for (n = pool_ptr[p->node]; n < pool_ptr[p->node+1]; n++) {
+		cpu = pool_cpus[n];
+		if (!(p->cpus_allowed & (1UL << cpu) & cpu_online_map))
+			continue;
+		if (cpu_rq(cpu)->nr_running < load) {
+			best_cpu = cpu;
+			load = cpu_rq(cpu)->nr_running;
+		}
+	}
+	return best_cpu;
+}
+
+/*
+ * Find the node with fewest tasks assigned to it. Don't bother about the
+ * current load of the nodes, the load balancer should take care.
+ * The argument  flag  gives some options for initial load balancing:
+ *    flag = 0:  don't count own task (when balancing at do_exec())
+ *    flag = 1:  count own task (when balancing at do_fork())
+ */
+static int sched_best_node(struct task_struct *p, int flag)
+{
+	int n, best_node=0, min_load, pool_load, min_pool=p->node;
+	int pool, load[NR_NODES];
+	unsigned long mask = p->cpus_allowed & cpu_online_map;
+
+	do {
+		best_node = atomic_inc_return(&sched_node) % numpools;
+	} while (!(pool_mask[best_node] & mask));
+
+	for (pool = 0; pool < numpools; pool++)
+		load[pool] = 0;
+
+	for (n = 0; n < NR_CPUS; n++) {
+		if (!cpu_online(n)) continue;
+#if 1
+		for (pool = 0; pool < numpools; pool++)
+			load[pool] += cpu_rq(n)->nr_homenode[pool];
+#else
+		load[CPU_TO_NODE(n)] += cpu_rq(n)->nr_running;
+#endif
+	}
+
+	/* don't count own process, this one will be moved */
+	if (!flag)
+		--load[p->node];
+
+	min_load = 100000000;
+	for (n = 0; n < numpools; n++) {
+		pool = (best_node + n) % numpools;
+		pool_load = (100*load[pool])/pool_nr_cpus[pool];
+		if ((pool_load < min_load) && (pool_mask[pool] & mask)) {
+			min_load = pool_load;
+			min_pool = pool;
+		}
+	}
+	atomic_set(&sched_node, min_pool);
+	return min_pool;
+}
+
+void sched_balance_exec(void)
+{
+	int new_cpu, new_node;
+
+	while (atomic_read(&pool_lock))
+		cpu_relax();
+	if (numpools > 1) {
+		new_node = sched_best_node(current, 0);
+		if (new_node != current->node) {
+			set_task_node(current, new_node);
+		}
+	}
+	new_cpu = sched_best_cpu(current);
+	if (new_cpu != smp_processor_id())
+		sched_migrate_task(current, new_cpu);
+}
+
+void sched_balance_fork(task_t *p)
+{
+	while (atomic_read(&pool_lock))
+		cpu_relax();
+	if (numpools > 1)
+		p->node = sched_best_node(p, 1);
+	set_task_cpu(p, sched_best_cpu(p));
+}
+
 #ifdef CONFIG_NUMA_SCHED
 void pools_info(void)
 {
@@ -2141,6 +2284,20 @@ void bld_pools(void)
 	init_pool_delay();
 }
 
+void set_task_node(task_t *p, int node)
+{
+	runqueue_t *rq;
+	unsigned long flags;
+
+	if (node < 0 || node >= numpools) return;
+	rq = task_rq_lock(p, &flags);
+	if (p->array) {
+		HOMENODE_DEC(rq, p->node);
+		HOMENODE_INC(rq, node);
+	}
+	p->node = node;
+	task_rq_unlock(rq, &flags);
+}
 #endif /* CONFIG_NUMA_SCHED */
 void __init init_idle(task_t *idle, int cpu)
 {
@@ -2156,6 +2313,7 @@ void __init init_idle(task_t *idle, int 
 	idle->prio = MAX_PRIO;
 	idle->state = TASK_RUNNING;
 	set_task_cpu(idle, cpu);
+	idle->node = 0; /* was: SAPICID_TO_PNODE(cpu_physical_id(cpu)); */
 	double_rq_unlock(idle_rq, rq);
 	set_tsk_need_resched(idle);
 	local_irq_restore(flags);
@@ -2246,6 +2404,42 @@ out:
 	preempt_enable();
 }
 
+/*
+ * Static per cpu migration request structures for pushing the current
+ * process to another CPU from within load_balance().
+ */
+static migration_req_t migr_req[NR_CPUS];
+
+/*
+ * Push the current task to another cpu asynchronously. To be used from within
+ * load_balance() to push tasks running alone on a remote node back to their
+ * homenode. The RQ lock must be held when calling this function, it protects
+ * migr_req[cpu]. Function should not be preempted!
+ */
+static int sched_push_task(task_t *p, int cpu_dest)
+{
+	int cpu = smp_processor_id();
+	runqueue_t *rq = this_rq();
+	unsigned long flags;
+
+	if (migr_req[cpu].task)
+		return -1;
+	else {
+		migr_req[cpu].task = p;
+		migr_req[cpu].cpu_dest = cpu_dest;
+		migr_req[cpu].sync = 0;
+		list_add(&migr_req[cpu].list, &rq->migration_queue);
+
+		if (!rq->migration_thread->array) {
+			activate_task(rq->migration_thread, rq);
+			if (rq->migration_thread->prio < rq->curr->prio)
+				resched_task(rq->curr);
+		}
+		rq->migration_thread->state = TASK_RUNNING;
+		return 0;
+	}
+}
+
 void sched_migrate_task(task_t *p, int dest_cpu)
 {
 	unsigned long old_mask;
@@ -2273,6 +2467,7 @@ static int migration_thread(void * data)
 	set_fs(KERNEL_DS);
 
 	set_cpus_allowed(current, 1UL << cpu);
+	set_task_node(current, CPU_TO_NODE(cpu));
 
 	/*
 	 * Migration can happen without a migration thread on the
@@ -2420,6 +2615,9 @@ void __init sched_init(void)
 			// delimiter for bitsearch
 			__set_bit(MAX_PRIO, array->bitmap);
 		}
+		for (j = 0; j < NR_NODES; j++)
+			rq->nr_homenode[j]=0;
+		pool_cpus[i] = i;
 	}
 #ifdef CONFIG_NUMA_SCHED
 	pool_ptr[0] = 0;
diff -urNp a/kernel/softirq.c b/kernel/softirq.c
--- a/kernel/softirq.c	Fri Sep 20 17:20:20 2002
+++ b/kernel/softirq.c	Sat Sep 21 11:56:23 2002
@@ -365,6 +365,7 @@ static int ksoftirqd(void * __bind_cpu)
 	set_cpus_allowed(current, 1UL << cpu);
 	if (smp_processor_id() != cpu)
 		BUG();
+	set_task_node(current, CPU_TO_NODE(cpu));
 
 	sprintf(current->comm, "ksoftirqd_CPU%d", cpu);
 
diff -urNp a/kernel/sys.c b/kernel/sys.c
--- a/kernel/sys.c	Fri Sep 20 17:20:16 2002
+++ b/kernel/sys.c	Sat Sep 21 11:56:23 2002
@@ -1309,6 +1309,8 @@ asmlinkage long sys_prctl(int option, un
 {
 	int error = 0;
 	int sig;
+	int pid;
+	struct task_struct *child;
 
 	error = security_ops->task_prctl(option, arg2, arg3, arg4, arg5);
 	if (error)
@@ -1369,6 +1371,63 @@ asmlinkage long sys_prctl(int option, un
 			}
 			current->keep_capabilities = arg2;
 			break;
+		case PR_GET_NODE:
+			pid = (int) arg3;
+			read_lock(&tasklist_lock);
+			child = find_task_by_pid(pid);
+			if (child) {
+				error = put_user(child->node,(int *)arg2);
+			} else {
+				printk("prctl: could not find process %d\n",pid);
+				error = -EINVAL;
+			}
+			read_unlock(&tasklist_lock);
+			break;
+		case PR_SET_NODE:
+			pid = (int) arg3;
+			read_lock(&tasklist_lock);
+			child = find_task_by_pid(pid);
+			if (child) {
+				if (child->uid == current->uid || \
+				    current->uid == 0) {
+					printk("setting node of pid %d to %d\n",pid,(int)arg2);
+					set_task_node(child,(int)arg2);
+				}
+			} else {
+				printk("prctl: could not find pid %d\n",pid);
+				error = -EINVAL;
+			}
+			read_unlock(&tasklist_lock);
+			break;
+
+		case PR_GET_NODPOL:
+			pid = (int) arg3;
+			read_lock(&tasklist_lock);
+			child = find_task_by_pid(pid);
+			if (child) {
+				error = put_user(child->node_policy,(int *)arg2);
+			} else {
+				printk("prctl: could not find pid %d\n",pid);
+				error = -EINVAL;
+			}
+			read_unlock(&tasklist_lock);
+			break;
+		case PR_SET_NODPOL:
+			pid = (int) arg3;
+			read_lock(&tasklist_lock);
+			child = find_task_by_pid(pid);
+			if (child) {
+				if (child->uid == current->uid || \
+				    current->uid == 0) {
+					printk("setting node policy of process %d to %d\n",pid,(int)arg2);
+					child->node_policy = (int) arg2;
+				}
+			} else {
+				printk("prctl: could not find pid %d\n",pid);
+				error = -EINVAL;
+			}
+			read_unlock(&tasklist_lock);
+			break;
 		default:
 			error = -EINVAL;
 			break;


* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-21  9:59 [PATCH 1/2] node affine NUMA scheduler Erich Focht
  2002-09-21 10:02 ` [PATCH 2/2] " Erich Focht
@ 2002-09-21 15:55 ` Martin J. Bligh
  2002-09-21 16:32   ` Martin J. Bligh
  2002-09-22 10:45   ` Erich Focht
  2002-09-22 15:52 ` Martin J. Bligh
  2 siblings, 2 replies; 27+ messages in thread
From: Martin J. Bligh @ 2002-09-21 15:55 UTC (permalink / raw)
  To: Erich Focht, linux-kernel; +Cc: LSE, Ingo Molnar, Michael Hohnbaum

 
> The second part of the patch extends the pooling NUMA scheduler to
> have node affine tasks:
>  - Each process has a homenode assigned to it at creation time
>    (initial load balancing). Memory will be allocated from this node.

Hmmm ... I was wondering how you achieved that without modifying
alloc_pages ... until I saw this bit.

 #ifdef CONFIG_NUMA
+#ifdef CONFIG_NUMA_SCHED
+#define numa_node_id() (current->node)
+#else
 #define numa_node_id() _cpu_to_node(smp_processor_id())
+#endif
 #endif /* CONFIG_NUMA */

I'm not convinced it's a good idea to modify this generic function,
which was meant to tell you what node you're running on. I can't
see it being used anywhere else right now, but wouldn't it be better
to just modify alloc_pages instead to use current->node, and leave
this macro as intended? Or make a process_node_id or something?
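
(To illustrate the suggestion, purely as a sketch and not code from
either patch: numa_node_id() would keep reporting the node the CPU is
on, and the allocator would consult the task's homenode through a
separate macro, e.g.

#ifdef CONFIG_NUMA_SCHED
/* node the task's memory should come from */
#define process_node_id()	(current->node)
#else
/* fall back to the node we are executing on */
#define process_node_id()	numa_node_id()
#endif

with alloc_pages() using process_node_id() for its node choice.)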

Anyway, I'm giving your code a quick spin ... will give you some
results later ;-)

M.




* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-21 15:55 ` [Lse-tech] [PATCH 1/2] " Martin J. Bligh
@ 2002-09-21 16:32   ` Martin J. Bligh
  2002-09-21 16:46     ` Martin J. Bligh
  2002-09-22 10:45   ` Erich Focht
  1 sibling, 1 reply; 27+ messages in thread
From: Martin J. Bligh @ 2002-09-21 16:32 UTC (permalink / raw)
  To: Erich Focht, linux-kernel; +Cc: LSE, Ingo Molnar, Michael Hohnbaum

> Anyway, I'm giving your code a quick spin ... will give you some
> results later ;-)

Hmmm .... well I ran the One True Benchmark (tm). The patch 
*increased* my kernel compile time from about 20s to about 28s. 
Not sure I like that idea ;-) Anything you'd like tweaked, or 
more info? Both user and system time were up ... I'll grab a 
profile of kernel stuff.

M.



* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-21 16:32   ` Martin J. Bligh
@ 2002-09-21 16:46     ` Martin J. Bligh
  2002-09-21 17:11       ` Martin J. Bligh
                         ` (2 more replies)
  0 siblings, 3 replies; 27+ messages in thread
From: Martin J. Bligh @ 2002-09-21 16:46 UTC (permalink / raw)
  To: Erich Focht, linux-kernel; +Cc: LSE, Ingo Molnar, Michael Hohnbaum

>> Anyway, I'm giving your code a quick spin ... will give you some
>> results later ;-)
> 
> Hmmm .... well I ran the One True Benchmark (tm). The patch 
> *increased* my kernel compile time from about 20s to about 28s. 
> Not sure I like that idea ;-) Anything you'd like tweaked, or 
> more info? Both user and system time were up ... I'll grab a 
> profile of kernel stuff.

From the below, I'd suggest you're getting pages off the wrong
nodes: do_anonymous_page is page zeroing, and rmqueue is the buddy
allocator. Are you sure the current->node thing is getting set
correctly? I'll try backing out your alloc_pages tweaking, and
see what happens.

An old compile off 2.5.31-mm1 + extras (I don't have 37, but similar)

133639 total                                      0.1390
 74447 default_idle                            
  6887 do_anonymous_page                       
  4456 page_remove_rmap                        
  4082 handle_mm_fault                         
  3733 .text.lock.namei                        
  2512 page_add_rmap                           
  2187 __generic_copy_from_user                
  1989 rmqueue                                 
  1964 .text.lock.dec_and_lock                 
  1761 vm_enough_memory                        
  1631 file_read_actor                         
  1599 zap_pte_range                           
  1507 __free_pages_ok                         
  1323 find_get_page                           
  1117 do_no_page                              
  1097 get_empty_filp                          
  1023 link_path_walk                          

2.5.37-mm1

256745 total                                      0.2584
 82934 default_idle                            
 38978 do_anonymous_page                       
 36533 rmqueue                                 
 35099 __free_pages_ok                         
  5551 page_remove_rmap                        
  4694 handle_mm_fault                         
  3166 page_add_rmap                           
  2904 do_no_page                              
  2674 .text.lock.namei                        
  2566 __alloc_pages                           
  2526 zap_pte_range                           
  2306 __generic_copy_from_user                
  2218 file_read_actor                         
  1803 vm_enough_memory                        
  1789 do_wp_page                              
  1557 .text.lock.dec_and_lock                 
  1414 find_get_page                           
  1251 do_softirq                              
  1123 release_pages                           
  1086 link_path_walk                          
  1072 get_empty_filp                          
  1038 schedule                                


* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-21 16:46     ` Martin J. Bligh
@ 2002-09-21 17:11       ` Martin J. Bligh
  2002-09-21 17:32         ` Erich Focht
  2002-09-21 23:18       ` William Lee Irwin III
  2002-09-22 10:35       ` [Lse-tech] [PATCH 1/2] node affine NUMA scheduler Erich Focht
  2 siblings, 1 reply; 27+ messages in thread
From: Martin J. Bligh @ 2002-09-21 17:11 UTC (permalink / raw)
  To: Erich Focht, linux-kernel; +Cc: LSE, Ingo Molnar, Michael Hohnbaum

> From the below, I'd suggest you're getting pages off the wrong 
> nodes: do_anonymous_page is page zeroing, and rmqueue the buddy
> allocator. Are you sure the current->node thing is getting set
> correctly? I'll try backing out your alloc_pages tweaking, and
> see what happens.

OK, well removing that part of the patch gets us back from 28s to 
about 21s (compared to 20s virgin); total system time compared to 
virgin is up from 59s to 62s, user from 191s to 195s. So it's still 
a net loss, but not by nearly as much. Are you determining the target 
node on fork or exec? I forget ...

Profile is more comparable. Nothing sticks out any more, but maybe
it just needs some tuning for balance intervals or something.

153385 total                                      0.1544
 91219 default_idle                            
  7475 do_anonymous_page                       
  4564 page_remove_rmap                        
  4167 handle_mm_fault                         
  3467 .text.lock.namei                        
  2520 page_add_rmap                           
  2112 rmqueue                                 
  1905 .text.lock.dec_and_lock                 
  1849 zap_pte_range                           
  1668 vm_enough_memory                        
  1612 __free_pages_ok                         
  1504 file_read_actor                         
  1484 find_get_page                           
  1381 __generic_copy_from_user                
  1207 do_no_page                              
  1066 schedule                                
  1050 get_empty_filp                          
  1034 link_path_walk                          


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-21 17:11       ` Martin J. Bligh
@ 2002-09-21 17:32         ` Erich Focht
  2002-09-21 17:38           ` William Lee Irwin III
  0 siblings, 1 reply; 27+ messages in thread
From: Erich Focht @ 2002-09-21 17:32 UTC (permalink / raw)
  To: Martin J. Bligh, linux-kernel; +Cc: LSE, Ingo Molnar, Michael Hohnbaum

Hi Martin,

thanks for the comments and the testing!

On Saturday 21 September 2002 19:11, Martin J. Bligh wrote:
> > From the below, I'd suggest you're getting pages off the wrong
> > nodes: do_anonymous_page is page zeroing, and rmqueue the buddy
> > allocator. Are you sure the current->node thing is getting set
> > correctly? I'll try backing out your alloc_pages tweaking, and
> > see what happens.

The current->node is most probably wrong for most of the kernel threads,
except for migration_thread and ksoftirqd. But it should be fine for
user processes.

Might also be that the __node_distance matrix which you might use
by default is not optimal for NUMAQ. It is fine for our remote/local
latency ratio of 1.6. Yours is maybe an order of magnitude larger?
Try replacing: 15 -> 50, guess you don't go beyond 4 nodes now...


> OK, well removing that part of the patch gets us back from 28s to
> about 21s (compared to 20s virgin); system time compared to
> virgin is up from 59s to 62s, user from 191s to 195s. So it's still
> a net loss, but not by nearly as much. Are you determining the target
> node on fork or exec? I forget ...

The default is exec(). You can use
http://home.arcor.de/efocht/sched/nodpol.c
to set the node_policy to do initial load balancing in fork().
Just do "nodpol -P 2" in the shell before starting the job/task.
This is VERY recommended if you are creating many tasks/threads.
The default behavior is fine for MPI jobs or users starting serial
processes.

> Profile is more comparable. Nothing sticks out any more, but maybe
> it just needs some tuning for balance intervals or something.

Hmmm... There are two changes which might lead to lower performance:
1. load_balance() is not inlined any more.
2. pull_task steals only one task per load_balance() call. It used to be
at most imbalance/2 (if I remember correctly).

And of course, there is some real additional overhead due to the
initial load balancing, which is noticeable for short-lived tasks... So
please try "nodpol -P 2" (and reset to the default with "nodpol -P 0").

Did you try the first patch alone? I mean the pooling-only scheduler?

Thanks,
Erich

> 153385 total                                      0.1544
>  91219 default_idle
>   7475 do_anonymous_page
>   4564 page_remove_rmap
>   4167 handle_mm_fault
>   3467 .text.lock.namei
>   2520 page_add_rmap
>   2112 rmqueue
>   1905 .text.lock.dec_and_lock
>   1849 zap_pte_range
>   1668 vm_enough_memory
>   1612 __free_pages_ok
>   1504 file_read_actor
>   1484 find_get_page
>   1381 __generic_copy_from_user
>   1207 do_no_page
>   1066 schedule
>   1050 get_empty_filp
>   1034 link_path_walk

-- 
Dr. Erich Focht                                <efocht@ess.nec.de>
NEC European Supercomputer Systems, European HPC Technology Center
Hessbruehlstr. 21B, 70565 Stuttgart, Germany
phone: +49-711-78055-15                    fax  : +49-711-78055-25


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-21 17:32         ` Erich Focht
@ 2002-09-21 17:38           ` William Lee Irwin III
  0 siblings, 0 replies; 27+ messages in thread
From: William Lee Irwin III @ 2002-09-21 17:38 UTC (permalink / raw)
  To: Erich Focht
  Cc: Martin J. Bligh, linux-kernel, LSE, Ingo Molnar, Michael Hohnbaum

On Sat, Sep 21, 2002 at 07:32:59PM +0200, Erich Focht wrote:
> Might also be that the __node_distance matrix which you might use
> by default is not optimal for NUMAQ. It is fine for our remote/local
> latency ratio of 1.6. Yours is maybe an order of magnitude larger?
> Try replacing: 15 -> 50, guess you don't go beyond 4 nodes now...

I'm running with 8 over the weekend, and by and large we go to 16,
though we rarely put all our eggs in one basket.

I'll take it for a spin.


Cheers,
Bill

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-21 16:46     ` Martin J. Bligh
  2002-09-21 17:11       ` Martin J. Bligh
@ 2002-09-21 23:18       ` William Lee Irwin III
  2002-09-22  8:09         ` William Lee Irwin III
  2002-09-22 10:35       ` [Lse-tech] [PATCH 1/2] node affine NUMA scheduler Erich Focht
  2 siblings, 1 reply; 27+ messages in thread
From: William Lee Irwin III @ 2002-09-21 23:18 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Erich Focht, linux-kernel, LSE, Ingo Molnar, Michael Hohnbaum

On Sat, Sep 21, 2002 at 09:46:05AM -0700, Martin J. Bligh wrote:
> An old compile off 2.5.31-mm1 + extras (I don't have 37, but similar)

Some 8-quad numbers for 2.5.37 (virgin) follow.

I'll get dcache_rcu and NUMA sched stuff in on the act for round 2.
This is more fs transaction-based (and VM process-spawning overhead)
and not pure I/O throughput so there won't be things like "but
everybody's running at peak I/O bandwidth" obscuring the issues.

real    0m30.854s

c01053ec 16963617 89.009      poll_idle
c0114a48 452526   2.37443     load_balance
c013962c 253666   1.331       get_page_state
c01466de 177354   0.930586    .text.lock.file_table
c0114ec0 150583   0.790117    scheduler_tick
c01547b3 116144   0.609414    .text.lock.namei
c01422ac 94042    0.493444    page_remove_rmap
c0138c24 83293    0.437043    rmqueue
c0141e48 51963    0.272653    page_add_rmap
c012d5ec 37086    0.194592    do_anonymous_page
c01391d0 34454    0.180782    __alloc_pages
c0130e08 32111    0.168488    find_get_page
c0139534 29222    0.153329    nr_free_pages
c012df4c 26734    0.140275    handle_mm_fault
c0146070 25881    0.135799    get_empty_filp
c01a0d2c 25250    0.132488    __generic_copy_from_user
c0111728 22018    0.11553     smp_apic_timer_interrupt
c0112b90 20777    0.109018    pfn_to_nid
c0136238 18410    0.0965983   kmem_cache_free
c0113270 17494    0.091792    do_page_fault
c0138900 17282    0.0906796   __free_pages_ok
c01a0f90 17160    0.0900395   atomic_dec_and_lock
c01a100b 16937    0.0888694   .text.lock.dec_and_lock

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-21 23:18       ` William Lee Irwin III
@ 2002-09-22  8:09         ` William Lee Irwin III
  2002-09-22  8:30           ` Erich Focht
  0 siblings, 1 reply; 27+ messages in thread
From: William Lee Irwin III @ 2002-09-22  8:09 UTC (permalink / raw)
  To: Martin J. Bligh, Erich Focht, linux-kernel, LSE, Ingo Molnar,
	Michael Hohnbaum

On Sat, Sep 21, 2002 at 09:46:05AM -0700, Martin J. Bligh wrote:
>> An old compile off 2.5.31-mm1 + extras (I don't have 37, but similar)

On Sat, Sep 21, 2002 at 04:18:10PM -0700, William Lee Irwin III wrote:
> Some 8-quad numbers for 2.5.37 (virgin) follow.

Okay, 2.5.37 virgin with overcommit_memory set to 1 this time.
(compiles with -j256 seem to do better than -j32 or -j48, here is -j256):

... will follow up with 2.5.38-mm1 with and without NUMA sched, at
least if the arrival rate of releases doesn't exceed the benchtime.

c01053ec 1605553  95.6785     poll_idle
c0114a48 7611     0.453556    load_balance
c0114ec0 5303     0.316018    scheduler_tick
c01422ac 5017     0.298974    page_remove_rmap
c01466de 4026     0.239918    .text.lock.file_table
c012d5ec 3290     0.196058    do_anonymous_page
c0141e48 3211     0.191351    page_add_rmap
c012df4c 2920     0.174009    handle_mm_fault
c010d6e8 2844     0.16948     timer_interrupt
c01547b3 2080     0.123952    .text.lock.namei
c0146070 1934     0.115251    get_empty_filp
c01a0d2c 1591     0.0948112   __generic_copy_from_user
c0111728 1477     0.0880177   smp_apic_timer_interrupt
c014633c 1437     0.085634    __fput
c01a0ce0 1346     0.0802111   __generic_copy_to_user
c0138c24 1249     0.0744307   rmqueue
c012e450 1169     0.0696633   vm_enough_memory
c012b694 1056     0.0629294   zap_pte_range
c0130e08 997      0.0594135   find_get_page
c014427c 950      0.0566126   dentry_open
c01391d0 949      0.056553    __alloc_pages
c0151424 892      0.0531563   link_path_walk
c01a0f90 834      0.0496999   atomic_dec_and_lock

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-22  8:09         ` William Lee Irwin III
@ 2002-09-22  8:30           ` Erich Focht
  2002-09-22 17:11             ` Martin J. Bligh
  0 siblings, 1 reply; 27+ messages in thread
From: Erich Focht @ 2002-09-22  8:30 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Martin J. Bligh, linux-kernel, LSE, Ingo Molnar, Michael Hohnbaum

Bill,

would you please check the boot messages for the NUMA scheduler before
doing the run. Martin sent me an example where he has:

CPU pools : 1
pool 0 :0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 
node level 0 : 10
pool_delay matrix:
 129 

which is clearly wrong. In that case we need to fix the cpu-pools setup
first.

Regards,
Erich

On Sunday 22 September 2002 10:09, William Lee Irwin III wrote:
> On Sat, Sep 21, 2002 at 09:46:05AM -0700, Martin J. Bligh wrote:
> >> An old compile off 2.5.31-mm1 + extras (I don't have 37, but similar)
>
> On Sat, Sep 21, 2002 at 04:18:10PM -0700, William Lee Irwin III wrote:
> > Some 8-quad numbers for 2.5.37 (virgin) follow.
>
> Okay, 2.5.37 virgin with overcommit_memory set to 1 this time.
> (compiles with -j256 seem to do better than -j32 or -j48, here is -j256):
>
> ... will follow up with 2.5.38-mm1 with and without NUMA sched, at
> least if the arrival rate of releases doesn't exceed the benchtime.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-21 16:46     ` Martin J. Bligh
  2002-09-21 17:11       ` Martin J. Bligh
  2002-09-21 23:18       ` William Lee Irwin III
@ 2002-09-22 10:35       ` Erich Focht
  2 siblings, 0 replies; 27+ messages in thread
From: Erich Focht @ 2002-09-22 10:35 UTC (permalink / raw)
  To: Martin J. Bligh, linux-kernel; +Cc: LSE, Ingo Molnar, Michael Hohnbaum

[-- Attachment #1: Type: text/plain, Size: 1608 bytes --]

On Saturday 21 September 2002 18:46, Martin J. Bligh wrote:
> > Hmmm .... well I ran the One True Benchmark (tm). The patch
> > *increased* my kernel compile time from about 20s to about 28s.
> > Not sure I like that idea ;-) Anything you'd like tweaked, or
> > more info? Both user and system time were up ... I'll grab a
> > profile of kernel stuff.
>
> From the below, I'd suggest you're getting pages off the wrong
> nodes: do_anonymous_page is page zeroing, and rmqueue the buddy
> allocator. Are you sure the current->node thing is getting set
> correctly? I'll try backing out your alloc_pages tweaking, and
> see what happens.

Could you please check in dmesg whether the CPU pools are initialised
correctly? Maybe something goes wrong for your platform.

The node_distance matrix is most probably non-optimal for NUMAQ and might
need some tuning. The default is set up for a maximum of 8 nodes, with
nodes 1-4 and 5-8 in separate supernodes and latency ratios of 1:1.5:2.
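
For illustration, if the node_distance table is a plain 8x8 array (a sketch
with assumed name and layout, not copied from the patch), the default
described above would look roughly like this, with 10 on the diagonal,
15 within a supernode and 20 across supernodes (nodes 0-3 and 4-7 being
the two supernodes):

static int __node_distance[8][8] = {
	/* supernode A: nodes 0-3, supernode B: nodes 4-7 */
	{ 10, 15, 15, 15, 20, 20, 20, 20 },
	{ 15, 10, 15, 15, 20, 20, 20, 20 },
	{ 15, 15, 10, 15, 20, 20, 20, 20 },
	{ 15, 15, 15, 10, 20, 20, 20, 20 },
	{ 20, 20, 20, 20, 10, 15, 15, 15 },
	{ 20, 20, 20, 20, 15, 10, 15, 15 },
	{ 20, 20, 20, 20, 15, 15, 10, 15 },
	{ 20, 20, 20, 20, 15, 15, 15, 10 },
};

The delays used when stealing from remote nodes scale with these values,
so stealing from a more distant node is delayed longer.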

You could use the attached patch for getting an idea about the load
distribution. It's a quick&dirty hack which creates files called
/proc/sched/load/rqNN  :load of RQs, including info on tasks not running
                        on their homenode
/proc/sched/history/ilbNN : history of last 25 initial load balancing
                            decisions for runqueue NN
/proc/sched/history/lbNN  : last 25 load balancing decisions on rq NN.

It should be possible to find the reason for the poor performance by
looking at the nr_homenode entries in /proc/sched/load/rqNN.
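
For reference, a /proc/sched/load/rqNN file looks roughly like this (the
numbers are invented; the format follows rq_load_read_proc in the attached
patch):

cpu 2 : curr: 1083 cc1
running uninter nr_homenode
      3       0    2    1    0    0

The nr_homenode columns (one per pool) count the runnable tasks on that
runqueue by homenode, so entries outside the CPU's own node indicate tasks
running away from home.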

Thanks,
best regards,
Erich

[-- Attachment #2: proc_sched_hist_2.5.37.patch --]
[-- Type: text/x-diff, Size: 6470 bytes --]

diff -urNp 2.5.37-node-affine/kernel/sched.c 2.5.37-node-affine-mon/kernel/sched.c
--- 2.5.37-node-affine/kernel/sched.c	Sun Sep 22 11:13:59 2002
+++ 2.5.37-node-affine-mon/kernel/sched.c	Sun Sep 22 11:29:26 2002
@@ -677,6 +677,175 @@ static inline unsigned int double_lock_b
 	return nr_running;
 }
 
+#define HISTORY_RING_SIZE 25
+/* load balancing history entry */
+struct lb_hist_entry {
+	unsigned long time;	/* jiffy */
+	int pid;		/* stolen task (0 if none) */
+	int busiest_cpu;	/* busiest RQ */
+};
+/* load balancing history ring */
+struct lb_hist_ring {
+	int curr;	/* current pointer */
+	struct lb_hist_entry data[HISTORY_RING_SIZE];
+} ____cacheline_aligned;
+/* per CPU history ring array */
+struct lb_hist_ring lb_ring[NR_CPUS];
+
+/* initial load balancing decision entry */
+struct ilb_hist_entry {
+	unsigned long time;	/* jiffy */
+	int pid;
+	int node;		/* selected homenode */
+	int load[NR_NODES];	/* node loads at decision time */
+};
+/* initial load balancing history ring */
+struct ilb_hist_ring {
+	int curr;	/* current pointer */
+	struct ilb_hist_entry data[HISTORY_RING_SIZE];
+} ____cacheline_aligned;
+/* per CPU history ring array */
+struct ilb_hist_ring ilb_ring[NR_CPUS];
+
+/* add entry to lb_ring */
+void lb_ring_add(int cpu, int pid, int busiest_cpu)
+{
+	int next=(lb_ring[cpu].curr + 1 ) % HISTORY_RING_SIZE;
+
+	lb_ring[cpu].data[next].time = jiffies;
+	lb_ring[cpu].data[next].pid = pid;
+	lb_ring[cpu].data[next].busiest_cpu = busiest_cpu;
+	lb_ring[cpu].curr = next;
+}
+
+/* add entry to ilb_ring */
+void ilb_ring_add(int cpu, int pid, int node, int *load)
+{
+	int i, next=(ilb_ring[cpu].curr + 1 ) % HISTORY_RING_SIZE;
+
+	ilb_ring[cpu].data[next].time = jiffies;
+	ilb_ring[cpu].data[next].pid  = pid;
+	ilb_ring[cpu].data[next].node = node;
+	for (i=0; i<numpools; i++)
+		ilb_ring[cpu].data[next].load[i] = load[i];
+	ilb_ring[cpu].curr = next;
+}
+
+/* print lb history ring buffer */
+int lb_ring_read_proc(char *page, char **start, off_t off, 
+			int count, int *eof, void *data)
+{
+	int i, len, entry;
+	char *buff=page;
+	int cpu=(int)data;
+
+	buff += sprintf(buff,"     tick      pid  from_cpu\n");
+	entry = lb_ring[cpu].curr;
+	for (i=0; i<HISTORY_RING_SIZE; i++) {
+		entry = (entry + 1) % HISTORY_RING_SIZE;
+		buff += sprintf(buff,"%12ld %6d %2d\n",
+				lb_ring[cpu].data[entry].time,
+				lb_ring[cpu].data[entry].pid,
+				lb_ring[cpu].data[entry].busiest_cpu);
+	}
+	len = buff-page;
+	if (len <= off+count) *eof = 1;
+	len -= off;
+	if (len>count) len = count;
+	if (len<0) len = 0;
+	return len;
+}
+
+/* print initial lb history ring buffer */
+int ilb_ring_read_proc(char *page, char **start, off_t off, 
+			int count, int *eof, void *data)
+{
+	int i, j, len, entry;
+	char *buff=page;
+	int cpu=(int)data;
+
+	buff += sprintf(buff,"     tick      pid node node_loads\n");
+	entry = ilb_ring[cpu].curr;
+	for (i=0; i<HISTORY_RING_SIZE; i++) {
+		entry = (entry + 1) % HISTORY_RING_SIZE;
+		buff += sprintf(buff,"%12ld %6d %2d",
+				ilb_ring[cpu].data[entry].time,
+				ilb_ring[cpu].data[entry].pid,
+				ilb_ring[cpu].data[entry].node);
+		for (j=0; j<numpools; j++)
+			buff += sprintf(buff," %3d",
+					ilb_ring[cpu].data[entry].load[j]);
+		buff += sprintf(buff,"\n");
+	}
+	len = buff-page;
+	if (len <= off+count) *eof = 1;
+	len -= off;
+	if (len>count) len = count;
+	if (len<0) len = 0;
+	return len;
+}
+
+/* print runqueue load */
+int rq_load_read_proc(char *page, char **start, off_t off, 
+			int count, int *eof, void *data)
+{
+	int i, len;
+	runqueue_t *rq;
+	char *buff=page;
+	int cpu=(int)data;
+
+	rq=cpu_rq(cpu);
+	buff += sprintf(buff,"cpu %d : ",cpu);
+	buff += sprintf(buff,"curr: %d %s\n",rq->curr->pid,rq->curr->comm);
+	buff += sprintf(buff,"running uninter nr_homenode\n");
+	buff += sprintf(buff,"%7d %7d",rq->nr_running,rq->nr_uninterruptible);
+	for (i=0; i<numpools; i++)
+		buff += sprintf(buff," %4d",rq->nr_homenode[i]);
+	buff += sprintf(buff,"\n");
+
+	len = buff-page;
+	if (len <= off+count) *eof = 1;
+	len -= off;
+	if (len>count) len = count;
+	if (len<0) len = 0;
+	return len;
+}
+
+#include <linux/proc_fs.h>
+/* initialize /proc entries */
+void init_sched_proc(void)
+{
+	int i;
+	char name[12];
+	struct proc_dir_entry *p, *hist, *sched, *load;
+
+	sched = proc_mkdir("sched",&proc_root);
+	hist = proc_mkdir("history",sched);
+	for (i=0; i<smp_num_cpus; i++) {
+		sprintf(name,"lb%02d",i);
+		p = create_proc_entry(name,S_IRUGO,hist);
+		if (p) {
+			p->read_proc = lb_ring_read_proc;
+			p->data = (long)i;
+		}
+		sprintf(name,"ilb%02d",i);
+		p = create_proc_entry(name,S_IRUGO,hist);
+		if (p) {
+			p->read_proc = ilb_ring_read_proc;
+			p->data = (long)i;
+		}
+	}
+	load = proc_mkdir("load",sched);
+	for (i=0; i<smp_num_cpus; i++) {
+		sprintf(name,"rq%02d",i);
+		p = create_proc_entry(name,S_IRUGO,load);
+		if (p) {
+			p->read_proc = rq_load_read_proc;
+			p->data = (long)i;
+		}
+	}
+}
+
 /*
  * Calculate load of a CPU pool, store results in data[][NR_CPUS].
  * Return the index of the most loaded runqueue.
@@ -961,6 +1130,7 @@ static void load_balance(runqueue_t *thi
 	tmp = task_to_steal(busiest, this_cpu);
 	if (!tmp)
 		goto out_unlock;
+	lb_ring_add(smp_processor_id(), tmp->pid, tmp->thread_info->cpu);
 	pull_task(busiest, tmp->array, tmp, this_rq, this_cpu);
 out_unlock:
 	spin_unlock(&busiest->lock);
@@ -2051,7 +2221,7 @@ static int sched_best_cpu(struct task_st
  */
 static int sched_best_node(struct task_struct *p, int flag)
 {
-	int n, best_node=0, min_load, pool_load, min_pool=p->node;
+	int n, best_node=0, min_load, min_pool=p->node;
 	int pool, load[NR_NODES];
 	unsigned long mask = p->cpus_allowed & cpu_online_map;
 
@@ -2079,13 +2249,14 @@ static int sched_best_node(struct task_s
 	min_load = 100000000;
 	for (n = 0; n < numpools; n++) {
 		pool = (best_node + n) % numpools;
-		pool_load = (100*load[pool])/pool_nr_cpus[pool];
-		if ((pool_load < min_load) && (pool_mask[pool] & mask)) {
-			min_load = pool_load;
+		load[pool] = (100*load[pool])/pool_nr_cpus[pool];
+		if ((load[pool] < min_load) && (pool_mask[pool] & mask)) {
+			min_load = load[pool];
 			min_pool = pool;
 		}
 	}
 	atomic_set(&sched_node, min_pool);
+	ilb_ring_add(smp_processor_id(), p->pid, min_pool, load);
 	return min_pool;
 }
 
@@ -2282,6 +2453,7 @@ void bld_pools(void)
 	find_node_levels(numpools);
 	init_pool_weight();
 	init_pool_delay();
+	init_sched_proc();
 }
 
 void set_task_node(task_t *p, int node)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-21 15:55 ` [Lse-tech] [PATCH 1/2] " Martin J. Bligh
  2002-09-21 16:32   ` Martin J. Bligh
@ 2002-09-22 10:45   ` Erich Focht
  2002-09-22 14:57     ` Martin J. Bligh
  1 sibling, 1 reply; 27+ messages in thread
From: Erich Focht @ 2002-09-22 10:45 UTC (permalink / raw)
  To: Martin J. Bligh, linux-kernel; +Cc: LSE, Ingo Molnar, Michael Hohnbaum

On Saturday 21 September 2002 17:55, Martin J. Bligh wrote:
> >  - Each process has a homenode assigned to it at creation time
> >    (initial load balancing). Memory will be allocated from this node.
...
>  #ifdef CONFIG_NUMA
> +#ifdef CONFIG_NUMA_SCHED
> +#define numa_node_id() (current->node)
> +#else
>  #define numa_node_id() _cpu_to_node(smp_processor_id())
> +#endif
>  #endif /* CONFIG_NUMA */
>
> I'm not convinced it's a good idea to modify this generic function,
> which was meant to tell you what node you're running on. I can't
> see it being used anywhere else right now, but wouldn't it be better
> to just modify alloc_pages instead to use current->node, and leave
> this macro as intended? Or make a process_node_id or something?

OK, I see your point and I agree that numa_node_id() should be similar to
smp_processor_id(). I'll change alloc_pages instead.

Do you think it makes sense to get memory from the homenode only for
user processes? Many kernel threads currently have the wrong homenode,
and for some of them it's unclear which homenode they should have...

There is an alternative idea (we discussed this at OLS with Andrea, maybe
you remember): allocate memory from the current node and keep statistics
on where it is allocated. Determine the homenode from this (from time to
time) and schedule accordingly. This eliminates the initial load balancing
and leaves it all to the scheduler, but has the drawback that memory can
be somewhat scattered across the nodes. Any comments?

Regards,
Erich


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-22 10:45   ` Erich Focht
@ 2002-09-22 14:57     ` Martin J. Bligh
  2002-09-23 18:38       ` Erich Focht
  0 siblings, 1 reply; 27+ messages in thread
From: Martin J. Bligh @ 2002-09-22 14:57 UTC (permalink / raw)
  To: Erich Focht, linux-kernel; +Cc: LSE, Ingo Molnar, Michael Hohnbaum

> OK, I see your point and I agree that numa_node_id() should be similar to
> smp_processor_id(). I'll change alloc_pages instead.
> 
> Do you think it makes sense to get memory from the homenode only for
> user processes? Many kernel threads currently have the wrong homenode,
> and for some of them it's unclear which homenode they should have...

Well yes ... if you can keep things pretty much on their home nodes.
That means some sort of algorithm for updating it, which may be fairly
complex (and doesn't currently seem to work, but maybe that's just 
because I only have 1 pool)
 
> There is an alternative idea (we discussed this at OLS with Andrea, maybe
> you remember): allocate memory from the current node and keep statistics
> on where it is allocated. Determine the homenode from this (from time to
> time) and schedule accordingly. This eliminates the initial load balancing
> and leaves it all to the scheduler, but has the drawback that memory can
> be somewhat scattered across the nodes. Any comments?

Well, that's a lot simpler. Things should end up running on their home
node, and thus will allocate pages from their home node, so it should
be self-reinforcing. The algorithm for the home node is then implicitly
worked out from the scheduler itself, and its actions, so it's one less
set of stuff to write. Would suggest we do this at first, to keep things
as simple as possible so you have something mergeable.

M.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-21  9:59 [PATCH 1/2] node affine NUMA scheduler Erich Focht
  2002-09-21 10:02 ` [PATCH 2/2] " Erich Focht
  2002-09-21 15:55 ` [Lse-tech] [PATCH 1/2] " Martin J. Bligh
@ 2002-09-22 15:52 ` Martin J. Bligh
  2002-09-22 19:24   ` Martin J. Bligh
  2002-09-24 23:59   ` Matthew Dobson
  2 siblings, 2 replies; 27+ messages in thread
From: Martin J. Bligh @ 2002-09-22 15:52 UTC (permalink / raw)
  To: Erich Focht, linux-kernel; +Cc: LSE, Ingo Molnar, Michael Hohnbaum

A few comments ... mainly just i386 arch things (which aren't
really your fault, it was just a lack of the mechanisms being in
mainline), and a request to push a couple of things down into
the arch trees to keep them from rearing their ugly heads in generic code ;-)

M.

> +static int __initdata nr_lnodes = 0;
> +

Use numnodes.

>  	cpu = ++cpucount;
> +#ifdef CONFIG_NUMA_SCHED
> +	cell = SAPICID_TO_PNODE(apicid);
> +	if (pnode_to_lnode[cell] < 0) {
> +		pnode_to_lnode[cell] = nr_lnodes++;
> +	}
> +#endif

pnodes and lnodes are all 1-1, so they're just called nodes for
i386, and there's no such thing as a SAPICID on this platform.

>  	/*
>  	 * We can't use kernel_thread since we must avoid to
>  	 * reschedule the child.
> @@ -996,6 +1004,10 @@ static void __init smp_boot_cpus(unsigne
>  	set_bit(0, &cpu_callout_map);
>  	boot_cpu_logical_apicid = logical_smp_processor_id();
>  	map_cpu_to_boot_apicid(0, boot_cpu_apicid);
> +#ifdef CONFIG_NUMA_SCHED
> +	pnode_to_lnode[SAPICID_TO_PNODE(boot_cpu_apicid)] = nr_lnodes++;
> +	printk("boot_cpu_apicid = %d, nr_lnodes = %d, lnode = %d\n", boot_cpu_apicid, nr_lnodes, pnode_to_lnode[0]);
> +#endif

Ditto. All these mappings exist already. No need to reinvent them.
The -mm tree has a generic cpu_to_node macro, which keys off the
logical_apic_id.

http://www.zipworld.com.au/~akpm/linux/patches/2.5/2.5.37/2.5.37-mm1/broken-out/topology-api.patch

> +#ifdef CONFIG_NUMA_SCHED
> +#define NR_NODES               8

MAX_NUMNODES exists for every arch (except maybe ia64 ;-))

> +#define cpu_physical_id(cpuid) (cpu_to_physical_apicid(cpuid))

Not needed, should be buried within a wrapper, not exposed to
generic code.

> +#define SAPICID_TO_PNODE(hwid)  (hwid >> 4)

Grrr. No such thing.

> diff -urNp a/include/linux/sched.h b/include/linux/sched.h

> +extern int pnode_to_lnode[NR_NODES];
> +extern char lnode_number[NR_CPUS] __cacheline_aligned;

Can't you push all this down into the arch ....

> +#define CPU_TO_NODE(cpu) lnode_number[cpu]

... by letting them define cpu_to_node() themselves?
(most people don't have lnodes and pnodes, etc).

> +EXPORT_SYMBOL(runqueues_address);
> +EXPORT_SYMBOL(numpools);
> +EXPORT_SYMBOL(pool_ptr);
> +EXPORT_SYMBOL(pool_cpus);
> +EXPORT_SYMBOL(pool_nr_cpus);
> +EXPORT_SYMBOL(pool_mask);
> +EXPORT_SYMBOL(sched_migrate_task);

Aren't these internal scheduler things?

> diff -urNp a/kernel/sched.c b/kernel/sched.c

> +int pnode_to_lnode[NR_NODES] = { [0 ... NR_NODES-1] = -1 };
> +char lnode_number[NR_CPUS] __cacheline_aligned;

Ditto.

> +	int this_pool = CPU_TO_NODE(this_cpu);
> +	int this_pool=CPU_TO_NODE(this_cpu), weight, maxweight=0;

How come you can use the CPU_TO_NODE abstraction here ...

> +	/* build translation table for CPU_TO_NODE macro */
> +	for (i = 0; i < NR_CPUS; i++)
> +		if (cpu_online(i))
> +			lnode_number[i] = pnode_to_lnode[SAPICID_TO_PNODE(cpu_physical_id(i))];

But not here?



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-22  8:30           ` Erich Focht
@ 2002-09-22 17:11             ` Martin J. Bligh
  2002-09-22 19:20               ` Martin J. Bligh
  2002-09-23 18:19               ` node affine NUMA scheduler: simple benchmark Erich Focht
  0 siblings, 2 replies; 27+ messages in thread
From: Martin J. Bligh @ 2002-09-22 17:11 UTC (permalink / raw)
  To: Erich Focht, William Lee Irwin III
  Cc: linux-kernel, LSE, Ingo Molnar, Michael Hohnbaum

> would you please check the boot messages for the NUMA scheduler before
> doing the run. Martin sent me an example where he has:
> 
> CPU pools : 1
> pool 0 :0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 
> node level 0 : 10
> pool_delay matrix:
>  129 
> 
> which is clearly wrong. In that case we need to fix the cpu-pools setup
> first.

OK, well I hacked this for now:

sched.c somewhere:
- lnode_number[i] = pnode_to_lnode[SAPICID_TO_PNODE(cpu_physical_id(i))];
+ lnode_number[i] = i/4;

Which makes the pools work properly. I think you should just use
the cpu_to_node macro here, but the hack will allow us to do some
testing.

Results, averaged over 5 kernel compiles:

Before:
Elapsed: 20.82s User: 191.262s System: 59.782s CPU: 1206.4%
After:
Elapsed: 21.918s User: 190.224s System: 59.166s CPU: 1137.4%

So you actually take a little less horsepower to do the work, but 
don't utilize the CPUs quite as well, so elapsed time is higher.
I seem to recall getting better results from Mike's quick hack
though ... that was a long time back. What were the balancing 
issues you mentioned?

M.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-22 17:11             ` Martin J. Bligh
@ 2002-09-22 19:20               ` Martin J. Bligh
  2002-09-22 21:59                 ` Erich Focht
  2002-09-23 18:19               ` node affine NUMA scheduler: simple benchmark Erich Focht
  1 sibling, 1 reply; 27+ messages in thread
From: Martin J. Bligh @ 2002-09-22 19:20 UTC (permalink / raw)
  To: Erich Focht, William Lee Irwin III
  Cc: linux-kernel, LSE, Ingo Molnar, Michael Hohnbaum

I tried putting back the current->node logic now that we have
the correct node IDs, but it made things worse (not as bad as
before, but ...). Looks like we're still allocing off the wrong
node.

This run is the last one in the list.

Virgin:
Elapsed: 20.82s User: 191.262s System: 59.782s CPU: 1206.4%
  7059 do_anonymous_page                       
  4459 page_remove_rmap                        
  3863 handle_mm_fault                         
  3695 .text.lock.namei                        
  2912 page_add_rmap                           
  2458 rmqueue                                 
  2119 vm_enough_memory                        

Both numasched patches, just compile fixes:
Elapsed: 28.744s User: 204.62s System: 173.708s CPU: 1315.8%
 38978 do_anonymous_page                       
 36533 rmqueue                                 
 35099 __free_pages_ok                         
  5551 page_remove_rmap                        
  4694 handle_mm_fault                         
  3166 page_add_rmap                           

Both numasched patches, alloc from local node
Elapsed: 21.094s User: 195.808s System: 62.41s CPU: 1224.4%
  7475 do_anonymous_page                       
  4564 page_remove_rmap                        
  4167 handle_mm_fault                         
  3467 .text.lock.namei                        
  2520 page_add_rmap                           
  2112 rmqueue                                 
  1905 .text.lock.dec_and_lock                 
  1849 zap_pte_range                           
  1668 vm_enough_memory                        

Both numasched patches, hack node IDs, alloc from local node
Elapsed: 21.918s User: 190.224s System: 59.166s CPU: 1137.4%
  5793 do_anonymous_page                       
  4475 page_remove_rmap                        
  4281 handle_mm_fault                         
  3820 .text.lock.namei                        
  2625 page_add_rmap                           
  2028 .text.lock.dec_and_lock                 
  1748 vm_enough_memory                        
  1713 file_read_actor                         
  1672 rmqueue                                 

Both numasched patches, hack node IDs, alloc from current->node
Elapsed: 24.414s User: 194.86s System: 98.606s CPU: 1201.6%
 30317 do_anonymous_page                       
  6962 rmqueue                                 
  5190 page_remove_rmap                        
  4773 handle_mm_fault                         
  3522 .text.lock.namei                        
  3161 page_add_rmap                           


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-22 15:52 ` Martin J. Bligh
@ 2002-09-22 19:24   ` Martin J. Bligh
  2002-09-24 23:59   ` Matthew Dobson
  1 sibling, 0 replies; 27+ messages in thread
From: Martin J. Bligh @ 2002-09-22 19:24 UTC (permalink / raw)
  To: Erich Focht, linux-kernel; +Cc: LSE, Ingo Molnar, Michael Hohnbaum

>> +	int this_pool = CPU_TO_NODE(this_cpu);
>> +	int this_pool=CPU_TO_NODE(this_cpu), weight, maxweight=0;
> 
> How come you can use the CPU_TO_NODE abstraction here ...
> 
>> +	/* build translation table for CPU_TO_NODE macro */
>> +	for (i = 0; i < NR_CPUS; i++)
>> +		if (cpu_online(i))
>> +			lnode_number[i] = pnode_to_lnode[SAPICID_TO_PNODE(cpu_physical_id(i))];
> 
> But not here?

Doh! Because you're building the list to use for CPU_TO_NODE,
obviously ;-) Sorry.

Should still get buried back down into the arch code though. 

M.




^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-22 19:20               ` Martin J. Bligh
@ 2002-09-22 21:59                 ` Erich Focht
  2002-09-22 22:36                   ` William Lee Irwin III
  0 siblings, 1 reply; 27+ messages in thread
From: Erich Focht @ 2002-09-22 21:59 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: William Lee Irwin III, linux-kernel, LSE, Ingo Molnar, Michael Hohnbaum

[-- Attachment #1: Type: text/plain, Size: 2837 bytes --]

On Sunday 22 September 2002 21:20, Martin J. Bligh wrote:
> I tried putting back the current->node logic now that we have
> the correct node IDs, but it made things worse (not as bad as
> before, but ...). Looks like we're still allocing off the wrong
> node.

Thanks a lot for the testing! It looks like something is still
wrong. NUMAQ suffers a lot more from hops across the nodes than
the Azusa, so I expect it is more sensitive to initial
load balancing errors.

The easiest thing to try is "nodpol -P 2" in the shell before
running the test. This changes the initial load balancing policy
from exec() to fork() ("nodpol -P 0" gets you back to the default).

A bit more difficult is tuning the scheduler parameters which can
be done pretty simply by changing the __node_distance matrix. A first
attempt could be: 10 on the diagonal, 100 off-diagonal. This leads to
larger delays when stealing from a remote node.

Anyhow, it would be good to understand what is going on and maybe
simpler tests than a kernel compile can reveal something. Or looking
at the /proc/sched/load/rqNN files (you need the patch I posted
a few mails ago).

I'll modify alloc_pages not to take the kernel threads into account
in the meantime.

Regards,
Erich

> This run is the last one in the list.
>
> Virgin:
> Elapsed: 20.82s User: 191.262s System: 59.782s CPU: 1206.4%
>   7059 do_anonymous_page
>   4459 page_remove_rmap
>   3863 handle_mm_fault
>   3695 .text.lock.namei
>   2912 page_add_rmap
>   2458 rmqueue
>   2119 vm_enough_memory
>
> Both numasched patches, just compile fixes:
> Elapsed: 28.744s User: 204.62s System: 173.708s CPU: 1315.8%
>  38978 do_anonymous_page
>  36533 rmqueue
>  35099 __free_pages_ok
>   5551 page_remove_rmap
>   4694 handle_mm_fault
>   3166 page_add_rmap
>
> Both numasched patches, alloc from local node
> Elapsed: 21.094s User: 195.808s System: 62.41s CPU: 1224.4%
>   7475 do_anonymous_page
>   4564 page_remove_rmap
>   4167 handle_mm_fault
>   3467 .text.lock.namei
>   2520 page_add_rmap
>   2112 rmqueue
>   1905 .text.lock.dec_and_lock
>   1849 zap_pte_range
>   1668 vm_enough_memory
>
> Both numasched patches, hack node IDs, alloc from local node
> Elapsed: 21.918s User: 190.224s System: 59.166s CPU: 1137.4%
>   5793 do_anonymous_page
>   4475 page_remove_rmap
>   4281 handle_mm_fault
>   3820 .text.lock.namei
>   2625 page_add_rmap
>   2028 .text.lock.dec_and_lock
>   1748 vm_enough_memory
>   1713 file_read_actor
>   1672 rmqueue
>
> Both numasched patches, hack node IDs, alloc from current->node
> Elapsed: 24.414s User: 194.86s System: 98.606s CPU: 1201.6%
>  30317 do_anonymous_page
>   6962 rmqueue
>   5190 page_remove_rmap
>   4773 handle_mm_fault
>   3522 .text.lock.namei
>   3161 page_add_rmap

[-- Attachment #2: nodpol.c --]
[-- Type: text/x-csrc, Size: 2581 bytes --]

/*-------------------------------------------------------------------------
  Read and set the homenode and the node_policy used for node affine
  scheduling.

  (c) Erich Focht <efocht@ess.nec.de>

  $Id:$
  $Revision:$
  $Log:$

  -------------------------------------------------------------------------*/

#include <linux/config.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <errno.h>
#include <pwd.h>
#include <sys/prctl.h>
#include <string.h>


static char VersionString[] = "$Revision:$";

void usage(char *exe)
{
	printf("Usage: %s [-n] [-N node] [-p] [-P policy] [pid]\n",
	       exe);
	printf("  -n         : get preferred node for process pid\n");
	printf("  -N         : set preferred node for process pid\n");
	printf("  -p         : get scheduler node_policy for process pid\n");
	printf("  -P         : set scheduler node_policy for process pid\n");
	printf("  pid        : process ID of targetted process (default: parent)\n");
	printf("\n%s\n\n",VersionString);
}

#define MAXPIDS 256

int main(int argc,char *argv[])
{
	int ierr=0, pid[MAXPIDS], policy, numpids=0, i;
	int result;
        extern char *optarg;
        extern int optind;
        int c;
	int getpol=0, getnod=0, setnod=0, setpol=0, help=0, node;

	// set PID to parent process ID of current process, this is what we
	// usually mean to change
	pid[numpids++]=getppid();

        // parse options
        while (( c = getopt(argc, argv, "nN:pP:h")) != EOF ) {
		switch( c ) {
		case 'n':    /* get task->node value */
			getnod=1;
			break;
		case 'N':    /* set task->node value */
			setnod=1;
			sscanf(optarg,"%d",&node);
			break;
		case 'p':    /* get task->node_policy value */
			getpol=1;
			break;
		case 'P':    /* set task->node_policy value */
			setpol=1;
			sscanf(optarg,"%d",&policy);
			break;
		case 'h':
			help=1;
			break;
		default:
			printf("Unknown option %s\n",optarg);
		}
        }
	if (optind<argc) numpids=0;

	while (optind<argc && numpids<MAXPIDS-1)
		sscanf(argv[optind++],"%d",&pid[numpids++]);
		
	if (help) {
		usage(argv[0]);
		exit(0);
	}
	
	if (getnod)
		printf("pid \t node\n");
	if (getpol)
		printf("pid \t policy\n");
	for (i=0;i<numpids;i++) {
		if (getnod)
			if((ierr = prctl(PR_GET_NODE,&result,pid[i])) == 0)
				printf("%d\t%d\n",pid[i],result);
		if (setnod)
			ierr = prctl(PR_SET_NODE,(int)node,pid[i]);
		if (getpol)
			if((ierr = prctl(PR_GET_NODPOL,&result,pid[i])) == 0)
				printf("%d\t%d\n",pid[i],result);
		if (setpol)
			ierr = prctl(PR_SET_NODPOL,(int)policy,pid[i]);
	}
	exit(ierr);
}				

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-22 21:59                 ` Erich Focht
@ 2002-09-22 22:36                   ` William Lee Irwin III
  2002-09-22 22:51                     ` Martin J. Bligh
  0 siblings, 1 reply; 27+ messages in thread
From: William Lee Irwin III @ 2002-09-22 22:36 UTC (permalink / raw)
  To: Erich Focht
  Cc: Martin J. Bligh, linux-kernel, LSE, Ingo Molnar, Michael Hohnbaum

On Sun, Sep 22, 2002 at 11:59:16PM +0200, Erich Focht wrote:
> A bit more difficult is tuning the scheduler parameters which can
> be done pretty simply by changing the __node_distance matrix. A first
> attempt could be: 10 on the diagonal, 100 off-diagonal. This leads to
> larger delays when stealing from a remote node.

This is not entirely reflective of our architecture. Node-to-node
latencies vary as well. Some notion of whether communication must cross
a lash at the very least should be present.


Cheers,
Bill

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-22 22:36                   ` William Lee Irwin III
@ 2002-09-22 22:51                     ` Martin J. Bligh
  0 siblings, 0 replies; 27+ messages in thread
From: Martin J. Bligh @ 2002-09-22 22:51 UTC (permalink / raw)
  To: William Lee Irwin III, Erich Focht
  Cc: linux-kernel, LSE, Ingo Molnar, Michael Hohnbaum

> On Sun, Sep 22, 2002 at 11:59:16PM +0200, Erich Focht wrote:
>> A bit more difficult is tuning the scheduler parameters which can
>> be done pretty simply by changing the __node_distance matrix. A first
>> attempt could be: 10 on the diagonal, 100 off-diagonal. This leads to
>> larger delays when stealing from a remote node.
> 
> This is not entirely reflective of our architecture. Node-to-node
> latencies vary as well. Some notion of whether communication must cross
> a lash at the very least should be present.

Ummm ... I think it's just flat on or off node, presumably Erich
has "on the diagonal" meaning they're on the same node, and 
"off-diagonal" meaning they're not. In which case, what he suggested
seems fine ... it's really about 20:1 ratio so I might use 10
and 200, but other than that, it seems correct to me.

M.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* node affine NUMA scheduler: simple benchmark
  2002-09-22 17:11             ` Martin J. Bligh
  2002-09-22 19:20               ` Martin J. Bligh
@ 2002-09-23 18:19               ` Erich Focht
  1 sibling, 0 replies; 27+ messages in thread
From: Erich Focht @ 2002-09-23 18:19 UTC (permalink / raw)
  To: Martin J. Bligh, William Lee Irwin III
  Cc: linux-kernel, LSE, Ingo Molnar, Michael Hohnbaum

[-- Attachment #1: Type: text/plain, Size: 596 bytes --]

Here is a simple benchmark which is NUMA sensitive and simulates a simple
but normal situation in an environment running number crunching jobs. It
starts N independent tasks which access a large array in a random manner.
This is both bandwidth and latency sensitive. The output shows on which
node(s) the tasks have spent their lives. Additionally it shows (on a
NUMA scheduler kernel) the homenode (iSched).
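
For readers without the attachment, the core of it is roughly the following
(only an illustration of the access pattern, not the attached rand_updt
script itself; sizes and names are made up):

#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>

#define ARRAY_INTS (16 * 1024 * 1024)	/* ~64 MB per task */
#define UPDATES    (8 * ARRAY_INTS)

static void worker(unsigned int seed)
{
	int *a = calloc(ARRAY_INTS, sizeof(int));
	long i;

	if (!a)
		_exit(1);
	srandom(seed);
	/* random updates: both latency and bandwidth sensitive */
	for (i = 0; i < UPDATES; i++)
		a[random() % ARRAY_INTS]++;
	_exit(0);
}

int main(int argc, char **argv)
{
	int i, ntasks = (argc > 1) ? atoi(argv[1]) : 4;

	for (i = 0; i < ntasks; i++)
		if (fork() == 0)
			worker(i + 1);
	while (wait(NULL) > 0)
		;	/* the real script additionally reports per-task node usage */
	return 0;
}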

Could you please run it on the virgin kernel and on the
"Both numasched patches, hack node IDs, alloc from current->node" one?

Maybe we see what's wrong...

Thanks,
Erich

[-- Attachment #2: rand_updt --]
[-- Type: application/x-shellscript, Size: 5673 bytes --]

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-22 14:57     ` Martin J. Bligh
@ 2002-09-23 18:38       ` Erich Focht
  2002-09-23 18:47         ` Martin J. Bligh
  0 siblings, 1 reply; 27+ messages in thread
From: Erich Focht @ 2002-09-23 18:38 UTC (permalink / raw)
  To: Martin J. Bligh, linux-kernel; +Cc: LSE, Ingo Molnar, Michael Hohnbaum

[-- Attachment #1: Type: text/plain, Size: 1877 bytes --]

On Sunday 22 September 2002 16:57, Martin J. Bligh wrote:
> > There is an alternative idea (we discussed this at OLS with Andrea, maybe
> > you remember): allocate memory from the current node and keep statistics
> > on where it is allocated. Determine the homenode from this (from time to
> > time) and schedule accordingly. This eliminates the initial load
> > balancing and leaves it all to the scheduler, but has the drawback that
> > memory can be somewhat scattered across the nodes. Any comments?
>
> Well, that's a lot simpler. Things should end up running on their home
> node, and thus will allocate pages from their home node, so it should
> be self-reinforcing. The algorithm for the home node is then implicitly
> worked out from the scheduler itself, and its actions, so it's one less
> set of stuff to write. Would suggest we do this at first, to keep things
> as simple as possible so you have something mergeable.

OK, sounds encouraging. So here is my first attempt (attached). You'll
have to apply it on top of the two NUMA scheduler patches and hack
SAPICID_TO_PNODE again.

The patch adds a node_mem[NR_NODES] array to each task. When allocating
memory (in rmqueue) and freeing it (in free_pages_ok) the number of
pages is added/subtracted from that array and the homenode is set to
the node having the largest entry. Is there a better place to put
this (other than rmqueue/free_pages_ok)?

I have two problems with this approach:
1: Freeing memory is quite expensive, as it currently involves finding the
maximum of the array node_mem[].
2: I have no idea how tasks sharing the mm structure will behave. I'd
like them to run on different nodes (that's why node_mem is not in mm),
but they could (legally) free pages which they did not allocate and
have wrong values in node_mem[].

Comments?

Regards,
Erich

[-- Attachment #2: node_affine_dyn-2.5.37.patch --]
[-- Type: text/x-diff, Size: 8959 bytes --]

diff -urNp 2.5.37-node-affine/fs/exec.c 2.5.37-node-affine-v2/fs/exec.c
--- 2.5.37-node-affine/fs/exec.c	Sun Sep 22 11:13:59 2002
+++ 2.5.37-node-affine-v2/fs/exec.c	Mon Sep 23 18:56:43 2002
@@ -995,9 +995,6 @@ int do_execve(char * filename, char ** a
 	int retval;
 	int i;
 
-	if (current->node_policy == NODPOL_EXEC)
-		sched_balance_exec();
-
 	file = open_exec(filename);
 
 	retval = PTR_ERR(file);
diff -urNp 2.5.37-node-affine/include/asm-i386/mmzone.h 2.5.37-node-affine-v2/include/asm-i386/mmzone.h
--- 2.5.37-node-affine/include/asm-i386/mmzone.h	Sun Sep 22 11:13:59 2002
+++ 2.5.37-node-affine-v2/include/asm-i386/mmzone.h	Mon Sep 23 19:27:09 2002
@@ -20,11 +20,7 @@
 #endif /* CONFIG_X86_NUMAQ */
 
 #ifdef CONFIG_NUMA
-#ifdef CONFIG_NUMA_SCHED
-#define numa_node_id() (current->node)
-#else
 #define numa_node_id() _cpu_to_node(smp_processor_id())
-#endif
 #endif /* CONFIG_NUMA */
 
 extern struct pglist_data *node_data[];
diff -urNp 2.5.37-node-affine/include/linux/prctl.h 2.5.37-node-affine-v2/include/linux/prctl.h
--- 2.5.37-node-affine/include/linux/prctl.h	Sun Sep 22 11:13:59 2002
+++ 2.5.37-node-affine-v2/include/linux/prctl.h	Mon Sep 23 18:57:29 2002
@@ -37,7 +37,5 @@
 /* Get/set node for node-affine scheduling */
 #define PR_GET_NODE       16
 #define PR_SET_NODE       17
-#define PR_GET_NODPOL     18
-#define PR_SET_NODPOL     19
 
 #endif /* _LINUX_PRCTL_H */
diff -urNp 2.5.37-node-affine/include/linux/sched.h 2.5.37-node-affine-v2/include/linux/sched.h
--- 2.5.37-node-affine/include/linux/sched.h	Sun Sep 22 11:13:59 2002
+++ 2.5.37-node-affine-v2/include/linux/sched.h	Mon Sep 23 19:04:55 2002
@@ -160,13 +160,13 @@ extern void update_one_process(struct ta
 extern void scheduler_tick(int user_tick, int system);
 extern unsigned long cache_decay_ticks;
 #ifdef CONFIG_NUMA_SCHED
-extern void sched_balance_exec(void);
-extern void sched_balance_fork(task_t *p);
 extern void set_task_node(task_t *p, int node);
+extern void inc_node_mem(int node, int blocks);
+extern void dec_node_mem(int node, int blocks);
 #else
-#define sched_balance_exec() {}
-#define sched_balance_fork(p) {}
 #define set_task_node(p,n) {}
+#define inc_node_mem(node,blocks) {}
+#define dec_node_mem(node,blocks) {}
 #endif
 extern void sched_migrate_task(task_t *p, int cpu);
 
@@ -303,7 +303,7 @@ struct task_struct {
 	unsigned long cpus_allowed;
 	unsigned int time_slice, first_time_slice;
 	int node;
-	int node_policy;
+	int node_mem[NR_NODES];
 
 	struct list_head tasks;
 	struct list_head ptrace_children;
diff -urNp 2.5.37-node-affine/kernel/fork.c 2.5.37-node-affine-v2/kernel/fork.c
--- 2.5.37-node-affine/kernel/fork.c	Sun Sep 22 11:13:59 2002
+++ 2.5.37-node-affine-v2/kernel/fork.c	Mon Sep 23 18:56:19 2002
@@ -701,9 +701,6 @@ static struct task_struct *copy_process(
 #ifdef CONFIG_SMP
 	{
 		int i;
-		if (p->node_policy == NODPOL_FORK_ALL ||
-		    (p->node_policy == NODPOL_FORK && !(clone_flags & CLONE_VM)))
-				sched_balance_fork(p);
 
 		/* ?? should we just memset this ?? */
 		for(i = 0; i < NR_CPUS; i++)
diff -urNp 2.5.37-node-affine/kernel/sched.c 2.5.37-node-affine-v2/kernel/sched.c
--- 2.5.37-node-affine/kernel/sched.c	Sun Sep 22 11:13:59 2002
+++ 2.5.37-node-affine-v2/kernel/sched.c	Mon Sep 23 19:08:41 2002
@@ -2019,103 +2019,59 @@ void show_state(void)
 	read_unlock(&tasklist_lock);
 }
 
-/* used as counter for round-robin node-scheduling */
-static atomic_t sched_node=ATOMIC_INIT(0);
+#ifdef CONFIG_NUMA_SCHED
 
-/*
- * Find the least loaded CPU on the current node of the task.
- */
-static int sched_best_cpu(struct task_struct *p)
+void inc_node_mem(int node, int blocks)
 {
-	int n, cpu, load, best_cpu = task_cpu(p);
-	
-	load = 1000000;
-	for (n = pool_ptr[p->node]; n < pool_ptr[p->node+1]; n++) {
-		cpu = pool_cpus[n];
-		if (!(p->cpus_allowed & (1UL << cpu) & cpu_online_map))
-			continue;
-		if (cpu_rq(cpu)->nr_running < load) {
-			best_cpu = cpu;
-			load = cpu_rq(cpu)->nr_running;
+	int homenode;
+	unsigned long flags;
+	runqueue_t *rq;
+
+	/* ignore kernel threads */
+	if (current->active_mm == &init_mm)
+		return;
+	homenode = current->node;
+	current->node_mem[node] += blocks;
+	if (unlikely(node != homenode))
+		if(current->node_mem[node] >= current->node_mem[homenode]) {
+			rq = task_rq_lock(current, &flags);
+			HOMENODE_DEC(rq, homenode);
+			HOMENODE_INC(rq, node);
+			task_rq_unlock(rq, &flags);
 		}
-	}
-	return best_cpu;
 }
 
-/*
- * Find the node with fewest tasks assigned to it. Don't bother about the
- * current load of the nodes, the load balancer should take care.
- * The argument  flag  gives some options for initial load balancing:
- *    flag = 0:  don't count own task (when balancing at do_exec())
- *    flag = 1:  count own task (when balancing at do_fork())
- */
-static int sched_best_node(struct task_struct *p, int flag)
+void dec_node_mem(int node, int blocks)
 {
-	int n, best_node=0, min_load, pool_load, min_pool=p->node;
-	int pool, load[NR_NODES];
-	unsigned long mask = p->cpus_allowed & cpu_online_map;
-
-	do {
-		best_node = atomic_inc_return(&sched_node) % numpools;
-	} while (!(pool_mask[best_node] & mask));
-
-	for (pool = 0; pool < numpools; pool++)
-		load[pool] = 0;
-
-	for (n = 0; n < NR_CPUS; n++) {
-		if (!cpu_online(n)) continue;
-#if 1
-		for (pool = 0; pool < numpools; pool++)
-			load[pool] += cpu_rq(n)->nr_homenode[pool];
-#else
-		load[CPU_TO_NODE(n)] += cpu_rq(n)->nr_running;
-#endif
-	}
-
-	/* don't count own process, this one will be moved */
-	if (!flag)
-		--load[p->node];
-
-	min_load = 100000000;
-	for (n = 0; n < numpools; n++) {
-		pool = (best_node + n) % numpools;
-		pool_load = (100*load[pool])/pool_nr_cpus[pool];
-		if ((pool_load < min_load) && (pool_mask[pool] & mask)) {
-			min_load = pool_load;
-			min_pool = pool;
+	int homenode, n;
+	unsigned long flags, maxblk;
+	runqueue_t *rq;
+
+	/* ignore kernel threads */
+	if (current->active_mm == &init_mm)
+		return;
+	homenode = current->node;
+	current->node_mem[node] -= blocks;
+	if (current->node_mem[node] < 0)
+		current->node_mem[node] = 0;
+	if (node == homenode) {
+		maxblk=current->node_mem[node];
+		for (n=0; n<numpools; n++) {
+			if (current->node_mem[n] > maxblk) {
+				maxblk = current->node_mem[n];
+				homenode = n;
+			}
 		}
-	}
-	atomic_set(&sched_node, min_pool);
-	return min_pool;
-}
-
-void sched_balance_exec(void)
-{
-	int new_cpu, new_node;
-
-	while (atomic_read(&pool_lock))
-		cpu_relax();
-	if (numpools > 1) {
-		new_node = sched_best_node(current, 0);
-		if (new_node != current->node) {
-			set_task_node(current, new_node);
+		if(node != homenode) {
+			current->node = homenode;
+			rq = task_rq_lock(current, &flags);
+			HOMENODE_DEC(rq, node);
+			HOMENODE_INC(rq, homenode);
+			task_rq_unlock(rq, &flags);
 		}
 	}
-	new_cpu = sched_best_cpu(current);
-	if (new_cpu != smp_processor_id())
-		sched_migrate_task(current, new_cpu);
-}
-
-void sched_balance_fork(task_t *p)
-{
-	while (atomic_read(&pool_lock))
-		cpu_relax();
-	if (numpools > 1)
-		p->node = sched_best_node(p, 1);
-	set_task_cpu(p, sched_best_cpu(p));
 }
 
-#ifdef CONFIG_NUMA_SCHED
 void pools_info(void)
 {
 	int i, j;
diff -urNp 2.5.37-node-affine/kernel/sys.c 2.5.37-node-affine-v2/kernel/sys.c
--- 2.5.37-node-affine/kernel/sys.c	Sun Sep 22 11:13:59 2002
+++ 2.5.37-node-affine-v2/kernel/sys.c	Mon Sep 23 18:57:13 2002
@@ -1400,34 +1400,6 @@ asmlinkage long sys_prctl(int option, un
 			read_unlock(&tasklist_lock);
 			break;
 
-		case PR_GET_NODPOL:
-			pid = (int) arg3;
-			read_lock(&tasklist_lock);
-			child = find_task_by_pid(pid);
-			if (child) {
-				error = put_user(child->node_policy,(int *)arg2);
-			} else {
-				printk("prctl: could not find pid %d\n",pid);
-				error = -EINVAL;
-			}
-			read_unlock(&tasklist_lock);
-			break;
-		case PR_SET_NODPOL:
-			pid = (int) arg3;
-			read_lock(&tasklist_lock);
-			child = find_task_by_pid(pid);
-			if (child) {
-				if (child->uid == current->uid || \
-				    current->uid == 0) {
-					printk("setting node policy of process %d to %d\n",pid,(int)arg2);
-					child->node_policy = (int) arg2;
-				}
-			} else {
-				printk("prctl: could not find pid %d\n",pid);
-				error = -EINVAL;
-			}
-			read_unlock(&tasklist_lock);
-			break;
 		default:
 			error = -EINVAL;
 			break;
diff -urNp 2.5.37-node-affine/mm/page_alloc.c 2.5.37-node-affine-v2/mm/page_alloc.c
--- 2.5.37-node-affine/mm/page_alloc.c	Fri Sep 20 17:20:15 2002
+++ 2.5.37-node-affine-v2/mm/page_alloc.c	Mon Sep 23 18:02:22 2002
@@ -146,6 +146,7 @@ void __free_pages_ok (struct page *page,
 	}
 	list_add(&(base + page_idx)->list, &area->free_list);
 	spin_unlock_irqrestore(&zone->lock, flags);
+	dec_node_mem(zone->zone_pgdat->node_id, 1<<order);
 out:
 	return;
 }
@@ -222,6 +223,7 @@ static struct page *rmqueue(struct zone 
 			if (bad_range(zone, page))
 				BUG();
 			prep_new_page(page);
+			inc_node_mem(zone->zone_pgdat->node_id, 1<<order);
 			return page;	
 		}
 		curr_order++;

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-23 18:38       ` Erich Focht
@ 2002-09-23 18:47         ` Martin J. Bligh
  2002-09-24 21:04           ` Erich Focht
  0 siblings, 1 reply; 27+ messages in thread
From: Martin J. Bligh @ 2002-09-23 18:47 UTC (permalink / raw)
  To: Erich Focht, linux-kernel; +Cc: LSE, Ingo Molnar, Michael Hohnbaum

> OK, sounds encouraging. So here is my first attempt (attached). You'll
> have to apply it on top of the two NUMA scheduler patches and hack
> SAPICID_TO_PNODE again.
> 
> The patch adds a node_mem[NR_NODES] array to each task. When allocating
> memory (in rmqueue) and freeing it (in free_pages_ok) the number of
> pages is added/subtracted from that array and the homenode is set to
> the node having the largest entry. Is there a better place to put
> this (other than rmqueue/free_pages_ok)?
> 
> I have two problems with this approach:
> 1: Freeing memory is quite expensive, as it currently involves finding the
> maximum of the array node_mem[].

Bleh ... why? This needs to be calculated much more lazily than this,
or you're going to kick the hell out of any cache affinity. Can you 
recalc this in the rebalance code or something instead?

> 2: I have no idea how tasks sharing the mm structure will behave. I'd
> like them to run on different nodes (that's why node_mem is not in mm),
> but they could (legally) free pages which they did not allocate and
> have wrong values in node_mem[].

Yes, that really ought to be per-process, not per task. Which means
locking or atomics ... and overhead. Ick.

For the first cut of the NUMA sched, maybe you could just leave page
allocation alone, and do that separately? Or is that what the second
patch was meant to be?

M.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-23 18:47         ` Martin J. Bligh
@ 2002-09-24 21:04           ` Erich Focht
  2002-09-24 21:17             ` Martin J. Bligh
  0 siblings, 1 reply; 27+ messages in thread
From: Erich Focht @ 2002-09-24 21:04 UTC (permalink / raw)
  To: Martin J. Bligh, linux-kernel; +Cc: LSE, Ingo Molnar, Michael Hohnbaum

On Monday 23 September 2002 20:47, Martin J. Bligh wrote:
> > I have two problems with this approach:
> > 1: Freeing memory is quite expensive, as it currently involves finding
> > the maximum of the array node_mem[].
>
> Bleh ... why? This needs to be calculated much more lazily than this,
> or you're going to kick the hell out of any cache affinity. Can you
> recalc this in the rebalance code or something instead?

You're right, that would be too slow. I am thinking of marking the tasks
that need recalculation and updating their homenode when their runqueue
is scanned for a task to be stolen.
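
Roughly like this (just a sketch, the flag and function names are invented
and not part of the posted patches):

/* set from dec_node_mem() instead of doing the max-search there */
#define PF_RECALC_NODE	0x80000000	/* invented flag bit */

static inline void mark_homenode_stale(task_t *p)
{
	p->flags |= PF_RECALC_NODE;
}

/* called while load_balance() scans the remote runqueue anyway */
static void update_homenode_lazy(task_t *p)
{
	int n, best;

	if (!(p->flags & PF_RECALC_NODE))
		return;
	best = p->node;
	for (n = 0; n < numpools; n++)
		if (p->node_mem[n] > p->node_mem[best])
			best = n;
	if (best != p->node) {
		/* the real thing would also fix up rq->nr_homenode via
		   HOMENODE_DEC/HOMENODE_INC under the runqueue lock */
		p->node = best;
	}
	p->flags &= ~PF_RECALC_NODE;
}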

> > 2: I have no idea how tasks sharing the mm structure will behave. I'd
> > like them to run on different nodes (that's why node_mem is not in mm),
> > but they could (legally) free pages which they did not allocate and
> > have wrong values in node_mem[].
>
> Yes, that really ought to be per-process, not per task. Which means
> locking or atomics ... and overhead. Ick.

Hmm, I think it is sometimes ok to have it per task. For example OpenMP
parallel jobs working on huge arrays. The "first-touch" of these arrays
leads to pagefaults generated by the different tasks and thus different
node_mem[] arrays for each task. As long as they just allocate memory,
all is well. If they only release it at the end of the job, too. This
probably goes wrong if we have a long running task that spawns short
living clones. They inherit the node_mem from the parent but pages
added by them to the common mm are not reflected in the parent's node_mem
after their death.

> For the first cut of the NUMA sched, maybe you could just leave page
> allocation alone, and do that separately? Or is that what the second
> patch was meant to be?

The first patch needs a correction: in load_balance(), add
	if (!busiest) goto out;
after the call to find_busiest_queue. With that, the first patch works on
its own. On top of this pooling NUMA scheduler we can put whichever node
affinity approach fits best, with or without memory allocation. I'll
update the patches and their setup code (thanks for the comments!) and
resend them.
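
In context, the fix is just this hunk (sketch only; line numbers and the
exact argument list of find_busiest_queue are omitted):

--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ load_balance() @@
 	busiest = find_busiest_queue(...);
+	if (!busiest)
+		goto out;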

Regards,
Erich



* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-24 21:04           ` Erich Focht
@ 2002-09-24 21:17             ` Martin J. Bligh
  0 siblings, 0 replies; 27+ messages in thread
From: Martin J. Bligh @ 2002-09-24 21:17 UTC (permalink / raw)
  To: Erich Focht, linux-kernel; +Cc: LSE, Ingo Molnar, Michael Hohnbaum

>> > 2: I have no idea how tasks sharing the mm structure will behave. I'd
>> > like them to run on different nodes (that's why node_mem is not in mm),
>> > but they could (legally) free pages which they did not allocate and
>> > have wrong values in node_mem[].
>> 
>> Yes, that really ought to be per-process, not per task. Which means
>> locking or atomics ... and overhead. Ick.
> 
> Hmm, I think it is sometimes ok to have it per task. For example, OpenMP
> parallel jobs working on huge arrays: the "first touch" of these arrays
> causes page faults in the different tasks and thus different node_mem[]
> arrays for each task. As long as they just allocate memory, all is well,
> and the same holds if they only release it at the end of the job. It
> probably goes wrong if we have a long-running task that spawns short-lived
> clones: they inherit node_mem from the parent, but pages they add to the
> common mm are not reflected in the parent's node_mem after their death.

But you're left with a choice of whether to base decisions on the
per-task or the per-process information.

1. per-process requires cross-node collation for a data read. Bad.

2. per-task leads to obviously bad decisions when there are significant
amounts of shared data between the tasks of a process (which was often
the point of making them threads in the first place).

Yes, I can imagine a situation for which it would work, as you describe
above ... but that's not the point - this is a general policy, and I
don't think it works in general ... as you say yourself "it is 
*sometimes* ok" ;-)
 
> The first patch needs a correction: in load_balance(), add
> 	if (!busiest) goto out;
> after the call to find_busiest_queue. With that, the first patch works on
> its own. On top of this pooling NUMA scheduler we can put whichever node
> affinity approach fits best, with or without memory allocation. I'll
> update the patches and their setup code (thanks for the comments!) and
> resend them.

Excellent! Will try out the correction above and get back to you.
It might be a day or so; I'm embroiled in some other code at the moment.

M.



* Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
  2002-09-22 15:52 ` Martin J. Bligh
  2002-09-22 19:24   ` Martin J. Bligh
@ 2002-09-24 23:59   ` Matthew Dobson
  1 sibling, 0 replies; 27+ messages in thread
From: Matthew Dobson @ 2002-09-24 23:59 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Erich Focht, linux-kernel, LSE, Ingo Molnar, Michael Hohnbaum

Martin J. Bligh wrote:
> A few comments ... mainly just i386 arch things (which aren't
> really your fault, it was just a lack of the mechanisms being in
> mainline), and a request to push a couple of things down into
> the arch trees instead of letting them rear their ugly heads in
> generic code ;-)


<SNIP>

>>+extern int pnode_to_lnode[NR_NODES];
>>+extern char lnode_number[NR_CPUS] __cacheline_aligned;
> 
> Can't you push all this down into the arch ....
> 
>>+#define CPU_TO_NODE(cpu) lnode_number[cpu]
> 
> ... by letting them define cpu_to_node() themselves?
> (most people don't have lnodes and pnodes, etc).
Yep... Sorry to get into the thread late, but Erich, you should really 
look at the latest in-kernel topology API stuff in the mm tree.  I'd be 
happy to discuss it with you over email...  Plus, I'd get another user 
of the topology (to help push it into mainline) and ia64 support, and 
you'd get a generic topology mechanism that works (err, will work?) for 
all architectures.  Email me for more info, or just check out the patch 
that Martin pointed to! :)
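
The gist, very roughly (a sketch of the idea, not the actual patch), is
that generic code only ever asks cpu_to_node(cpu) and each arch supplies
the mapping in its own topology header:

/* include/asm-i386/topology.h -- NUMA-Q flavour, illustrative only */
#define cpu_to_node(cpu)	(lnode_number[cpu])

/* while a flat, non-NUMA arch's topology header can simply say */
#define cpu_to_node(cpu)	(0)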

Cheers!

-Matt



Thread overview: 27+ messages
2002-09-21  9:59 [PATCH 1/2] node affine NUMA scheduler Erich Focht
2002-09-21 10:02 ` [PATCH 2/2] " Erich Focht
2002-09-21 15:55 ` [Lse-tech] [PATCH 1/2] " Martin J. Bligh
2002-09-21 16:32   ` Martin J. Bligh
2002-09-21 16:46     ` Martin J. Bligh
2002-09-21 17:11       ` Martin J. Bligh
2002-09-21 17:32         ` Erich Focht
2002-09-21 17:38           ` William Lee Irwin III
2002-09-21 23:18       ` William Lee Irwin III
2002-09-22  8:09         ` William Lee Irwin III
2002-09-22  8:30           ` Erich Focht
2002-09-22 17:11             ` Martin J. Bligh
2002-09-22 19:20               ` Martin J. Bligh
2002-09-22 21:59                 ` Erich Focht
2002-09-22 22:36                   ` William Lee Irwin III
2002-09-22 22:51                     ` Martin J. Bligh
2002-09-23 18:19               ` node affine NUMA scheduler: simple benchmark Erich Focht
2002-09-22 10:35       ` [Lse-tech] [PATCH 1/2] node affine NUMA scheduler Erich Focht
2002-09-22 10:45   ` Erich Focht
2002-09-22 14:57     ` Martin J. Bligh
2002-09-23 18:38       ` Erich Focht
2002-09-23 18:47         ` Martin J. Bligh
2002-09-24 21:04           ` Erich Focht
2002-09-24 21:17             ` Martin J. Bligh
2002-09-22 15:52 ` Martin J. Bligh
2002-09-22 19:24   ` Martin J. Bligh
2002-09-24 23:59   ` Matthew Dobson
