diff -urNp 2.5.37-node-affine/fs/exec.c 2.5.37-node-affine-v2/fs/exec.c
--- 2.5.37-node-affine/fs/exec.c	Sun Sep 22 11:13:59 2002
+++ 2.5.37-node-affine-v2/fs/exec.c	Mon Sep 23 18:56:43 2002
@@ -995,9 +995,6 @@ int do_execve(char * filename, char ** a
 	int retval;
 	int i;
 
-	if (current->node_policy == NODPOL_EXEC)
-		sched_balance_exec();
-
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
 
diff -urNp 2.5.37-node-affine/include/asm-i386/mmzone.h 2.5.37-node-affine-v2/include/asm-i386/mmzone.h
--- 2.5.37-node-affine/include/asm-i386/mmzone.h	Sun Sep 22 11:13:59 2002
+++ 2.5.37-node-affine-v2/include/asm-i386/mmzone.h	Mon Sep 23 19:27:09 2002
@@ -20,11 +20,7 @@
 #endif /* CONFIG_X86_NUMAQ */
 
 #ifdef CONFIG_NUMA
-#ifdef CONFIG_NUMA_SCHED
-#define numa_node_id() (current->node)
-#else
 #define numa_node_id() _cpu_to_node(smp_processor_id())
-#endif
 #endif /* CONFIG_NUMA */
 
 extern struct pglist_data *node_data[];
diff -urNp 2.5.37-node-affine/include/linux/prctl.h 2.5.37-node-affine-v2/include/linux/prctl.h
--- 2.5.37-node-affine/include/linux/prctl.h	Sun Sep 22 11:13:59 2002
+++ 2.5.37-node-affine-v2/include/linux/prctl.h	Mon Sep 23 18:57:29 2002
@@ -37,7 +37,5 @@
 /* Get/set node for node-affine scheduling */
 #define PR_GET_NODE	16
 #define PR_SET_NODE	17
-#define PR_GET_NODPOL	18
-#define PR_SET_NODPOL	19
 
 #endif /* _LINUX_PRCTL_H */
diff -urNp 2.5.37-node-affine/include/linux/sched.h 2.5.37-node-affine-v2/include/linux/sched.h
--- 2.5.37-node-affine/include/linux/sched.h	Sun Sep 22 11:13:59 2002
+++ 2.5.37-node-affine-v2/include/linux/sched.h	Mon Sep 23 19:04:55 2002
@@ -160,13 +160,13 @@ extern void update_one_process(struct ta
 extern void scheduler_tick(int user_tick, int system);
 extern unsigned long cache_decay_ticks;
 #ifdef CONFIG_NUMA_SCHED
-extern void sched_balance_exec(void);
-extern void sched_balance_fork(task_t *p);
 extern void set_task_node(task_t *p, int node);
+extern void inc_node_mem(int node, int blocks);
+extern void dec_node_mem(int node, int blocks);
 #else
-#define sched_balance_exec() {}
-#define sched_balance_fork(p) {}
 #define set_task_node(p,n) {}
+#define inc_node_mem(node,blocks) {}
+#define dec_node_mem(node,blocks) {}
 #endif
 
 extern void sched_migrate_task(task_t *p, int cpu);
@@ -303,7 +303,7 @@ struct task_struct {
 	unsigned long cpus_allowed;
 	unsigned int time_slice, first_time_slice;
 	int node;
-	int node_policy;
+	int node_mem[NR_NODES];
 
 	struct list_head tasks;
 	struct list_head ptrace_children;
diff -urNp 2.5.37-node-affine/kernel/fork.c 2.5.37-node-affine-v2/kernel/fork.c
--- 2.5.37-node-affine/kernel/fork.c	Sun Sep 22 11:13:59 2002
+++ 2.5.37-node-affine-v2/kernel/fork.c	Mon Sep 23 18:56:19 2002
@@ -701,9 +701,6 @@ static struct task_struct *copy_process(
 #ifdef CONFIG_SMP
 	{
 		int i;
-		if (p->node_policy == NODPOL_FORK_ALL ||
-		    (p->node_policy == NODPOL_FORK && !(clone_flags & CLONE_VM)))
-			sched_balance_fork(p);
 
 		/* ?? should we just memset this ?? */
 		for(i = 0; i < NR_CPUS; i++)
diff -urNp 2.5.37-node-affine/kernel/sched.c 2.5.37-node-affine-v2/kernel/sched.c
--- 2.5.37-node-affine/kernel/sched.c	Sun Sep 22 11:13:59 2002
+++ 2.5.37-node-affine-v2/kernel/sched.c	Mon Sep 23 19:08:41 2002
@@ -2019,103 +2019,59 @@ void show_state(void)
 	read_unlock(&tasklist_lock);
 }
 
-/* used as counter for round-robin node-scheduling */
-static atomic_t sched_node=ATOMIC_INIT(0);
+#ifdef CONFIG_NUMA_SCHED
 
-/*
- * Find the least loaded CPU on the current node of the task.
- */
-static int sched_best_cpu(struct task_struct *p)
-{
-	int n, cpu, load, best_cpu = task_cpu(p);
-
-	load = 1000000;
-	for (n = pool_ptr[p->node]; n < pool_ptr[p->node+1]; n++) {
-		cpu = pool_cpus[n];
-		if (!(p->cpus_allowed & (1UL << cpu) & cpu_online_map))
-			continue;
-		if (cpu_rq(cpu)->nr_running < load) {
-			best_cpu = cpu;
-			load = cpu_rq(cpu)->nr_running;
-		}
-	}
-	return best_cpu;
-}
+void inc_node_mem(int node, int blocks)
+{
+	int homenode;
+	unsigned long flags;
+	runqueue_t *rq;
+
+	/* ignore kernel threads */
+	if (current->active_mm == &init_mm)
+		return;
+	homenode = current->node;
+	current->node_mem[node] += blocks;
+	if (unlikely(node != homenode))
+		if (current->node_mem[node] >= current->node_mem[homenode]) {
+			current->node = node;
+			rq = task_rq_lock(current, &flags);
+			HOMENODE_DEC(rq, homenode);
+			HOMENODE_INC(rq, node);
+			task_rq_unlock(rq, &flags);
+		}
+}
 
-/*
- * Find the node with fewest tasks assigned to it. Don't bother about the
- * current load of the nodes, the load balancer should take care.
- * The argument flag gives some options for initial load balancing:
- *    flag = 0: don't count own task (when balancing at do_exec())
- *    flag = 1: count own task (when balancing at do_fork())
- */
-static int sched_best_node(struct task_struct *p, int flag)
-{
-	int n, best_node=0, min_load, pool_load, min_pool=p->node;
-	int pool, load[NR_NODES];
-	unsigned long mask = p->cpus_allowed & cpu_online_map;
-
-	do {
-		best_node = atomic_inc_return(&sched_node) % numpools;
-	} while (!(pool_mask[best_node] & mask));
-
-	for (pool = 0; pool < numpools; pool++)
-		load[pool] = 0;
-
-	for (n = 0; n < NR_CPUS; n++) {
-		if (!cpu_online(n)) continue;
-#if 1
-		for (pool = 0; pool < numpools; pool++)
-			load[pool] += cpu_rq(n)->nr_homenode[pool];
-#else
-		load[CPU_TO_NODE(n)] += cpu_rq(n)->nr_running;
-#endif
-	}
-
-	/* don't count own process, this one will be moved */
-	if (!flag)
-		--load[p->node];
-
-	min_load = 100000000;
-	for (n = 0; n < numpools; n++) {
-		pool = (best_node + n) % numpools;
-		pool_load = (100*load[pool])/pool_nr_cpus[pool];
-		if ((pool_load < min_load) && (pool_mask[pool] & mask)) {
-			min_load = pool_load;
-			min_pool = pool;
-		}
-	}
-	atomic_set(&sched_node, min_pool);
-	return min_pool;
-}
+void dec_node_mem(int node, int blocks)
+{
+	int homenode, n;
+	unsigned long flags, maxblk;
+	runqueue_t *rq;
+
+	/* ignore kernel threads */
+	if (current->active_mm == &init_mm)
+		return;
+	homenode = current->node;
+	current->node_mem[node] -= blocks;
+	if (current->node_mem[node] < 0)
+		current->node_mem[node] = 0;
+	if (node == homenode) {
+		maxblk = current->node_mem[node];
+		for (n = 0; n < NR_NODES; n++)
+			if (current->node_mem[n] > maxblk) {
+				maxblk = current->node_mem[n];
+				homenode = n;
+			}
+		if (node != homenode) {
+			current->node = homenode;
+			rq = task_rq_lock(current, &flags);
+			HOMENODE_DEC(rq, node);
+			HOMENODE_INC(rq, homenode);
+			task_rq_unlock(rq, &flags);
+		}
+	}
+}
 
-void sched_balance_exec(void)
-{
-	int new_cpu, new_node;
-
-	while (atomic_read(&pool_lock))
-		cpu_relax();
-	if (numpools > 1) {
-		new_node = sched_best_node(current, 0);
-		if (new_node != current->node) {
-			set_task_node(current, new_node);
-		}
-	}
-	new_cpu = sched_best_cpu(current);
-	if (new_cpu != smp_processor_id())
-		sched_migrate_task(current, new_cpu);
-}
-
-void sched_balance_fork(task_t *p)
-{
-	while (atomic_read(&pool_lock))
-		cpu_relax();
-	if (numpools > 1)
-		p->node = sched_best_node(p, 1);
-	set_task_cpu(p, sched_best_cpu(p));
-}
-
-#ifdef CONFIG_NUMA_SCHED
 void pools_info(void)
 {
 	int i, j;
diff -urNp 2.5.37-node-affine/kernel/sys.c 2.5.37-node-affine-v2/kernel/sys.c
--- 2.5.37-node-affine/kernel/sys.c	Sun Sep 22 11:13:59 2002
+++ 2.5.37-node-affine-v2/kernel/sys.c	Mon Sep 23 18:57:13 2002
@@ -1400,34 +1400,6 @@ asmlinkage long sys_prctl(int option, un
 		}
 		read_unlock(&tasklist_lock);
 		break;
-	case PR_GET_NODPOL:
-		pid = (int) arg3;
-		read_lock(&tasklist_lock);
-		child = find_task_by_pid(pid);
-		if (child) {
-			error = put_user(child->node_policy,(int *)arg2);
-		} else {
-			printk("prctl: could not find pid %d\n",pid);
-			error = -EINVAL;
-		}
-		read_unlock(&tasklist_lock);
-		break;
-	case PR_SET_NODPOL:
-		pid = (int) arg3;
-		read_lock(&tasklist_lock);
-		child = find_task_by_pid(pid);
-		if (child) {
-			if (child->uid == current->uid || \
-			    current->uid == 0) {
-				printk("setting node policy of process %d to %d\n",pid,(int)arg2);
-				child->node_policy = (int) arg2;
-			}
-		} else {
-			printk("prctl: could not find pid %d\n",pid);
-			error = -EINVAL;
-		}
-		read_unlock(&tasklist_lock);
-		break;
 	default:
 		error = -EINVAL;
 		break;
diff -urNp 2.5.37-node-affine/mm/page_alloc.c 2.5.37-node-affine-v2/mm/page_alloc.c
--- 2.5.37-node-affine/mm/page_alloc.c	Fri Sep 20 17:20:15 2002
+++ 2.5.37-node-affine-v2/mm/page_alloc.c	Mon Sep 23 18:02:22 2002
@@ -146,6 +146,7 @@ void __free_pages_ok (struct page *page,
 	}
 	list_add(&(base + page_idx)->list, &area->free_list);
 	spin_unlock_irqrestore(&zone->lock, flags);
+	dec_node_mem(zone->zone_pgdat->node_id, 1<<order);
[...]
+	inc_node_mem(zone->zone_pgdat->node_id, 1<<order);
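
A note for readers following the logic: the two allocator hooks above drive all homenode changes, so the rule is easiest to see in isolation. Below is a stand-alone C model of the inc_node_mem()/dec_node_mem() behaviour from the kernel/sched.c hunk. It is only a sketch under stated assumptions: NR_NODES is fixed at 4, a small struct task stands in for task_struct, and the locked runqueue bookkeeping (task_rq_lock(), HOMENODE_DEC/HOMENODE_INC) is replaced by a printf, since only the homenode decision is being illustrated.

#include <stdio.h>

#define NR_NODES 4	/* assumed node count, for the demo only */

struct task {
	int node;			/* homenode */
	int node_mem[NR_NODES];		/* memory blocks owned per node */
};

/* Same rule as the kernel's inc_node_mem(): after crediting the blocks,
 * the homenode moves to `node' as soon as that node holds at least as
 * much of the task's memory as the current homenode does. */
static void inc_node_mem(struct task *p, int node, int blocks)
{
	int homenode = p->node;

	p->node_mem[node] += blocks;
	if (node != homenode &&
	    p->node_mem[node] >= p->node_mem[homenode]) {
		p->node = node;
		printf("inc: homenode %d -> %d\n", homenode, node);
	}
}

/* Same rule as the kernel's dec_node_mem(): blocks are debited (never
 * below zero), and if the homenode itself shrank, all nodes are
 * rescanned for the new maximum. */
static void dec_node_mem(struct task *p, int node, int blocks)
{
	int homenode = p->node;
	int maxblk, n;

	p->node_mem[node] -= blocks;
	if (p->node_mem[node] < 0)
		p->node_mem[node] = 0;
	if (node == homenode) {
		maxblk = p->node_mem[node];
		for (n = 0; n < NR_NODES; n++)
			if (p->node_mem[n] > maxblk) {
				maxblk = p->node_mem[n];
				homenode = n;
			}
		if (node != homenode) {
			p->node = homenode;
			printf("dec: homenode %d -> %d\n", node, homenode);
		}
	}
}

int main(void)
{
	struct task t = { .node = 0 };

	inc_node_mem(&t, 0, 4);	/* all memory on node 0: no change */
	inc_node_mem(&t, 2, 4);	/* node 2 draws level: homenode 0 -> 2 */
	dec_node_mem(&t, 2, 3);	/* node 2 drops below node 0: 2 -> 0 */
	return 0;
}

The asymmetry is carried over from the patch itself: growth can move the homenode with a single comparison against the current homenode, while shrinking has to rescan all nodes because any of them may now hold the maximum.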
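
For completeness, here is a user-space sketch of the prctl interface that survives this patch. PR_GET_NODE and PR_SET_NODE (values 16 and 17, from the include/linux/prctl.h hunk) are extensions private to this patch set, not stock kernel prctls, and the calling convention below (value or result pointer in arg2, target pid in arg3) is inferred from the removed NODPOL cases in sys_prctl(), so treat it as an untested assumption rather than a documented ABI.

#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>

/* Patch-local prctl extensions; the values come from the
 * include/linux/prctl.h hunk above and are not in stock headers. */
#ifndef PR_GET_NODE
#define PR_GET_NODE	16
#define PR_SET_NODE	17
#endif

int main(void)
{
	int node = -1;

	/* read this process' current homenode (result via arg2,
	 * pid via arg3, mirroring the removed NODPOL convention) */
	if (prctl(PR_GET_NODE, (unsigned long)&node, (unsigned long)getpid()) == 0)
		printf("homenode: %d\n", node);

	/* request node 1 as the new homenode */
	if (prctl(PR_SET_NODE, 1UL, (unsigned long)getpid()) != 0)
		perror("PR_SET_NODE");

	return 0;
}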