diff -urNp a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h	Fri Oct 11 17:44:26 2002
+++ b/include/linux/sched.h	Fri Oct 11 19:32:47 2002
@@ -301,6 +301,7 @@ struct task_struct {
 	unsigned long policy;
 	unsigned long cpus_allowed;
 	unsigned int time_slice, first_time_slice;
+	int node;
 
 	struct list_head tasks;
 	struct list_head ptrace_children;
@@ -462,8 +463,14 @@ extern void build_pools(void);
 extern void pooldata_lock(void);
 extern void pooldata_unlock(void);
 extern void sched_balance_exec(void);
+extern void set_task_node(task_t *p, int node);
+# define homenode_inc(rq,node) (rq)->nr_homenode[node]++
+# define homenode_dec(rq,node) (rq)->nr_homenode[node]--
 #else
 #define sched_balance_exec() {}
+#define set_task_node(p,n) {}
+# define homenode_inc(rq,node) {}
+# define homenode_dec(rq,node) {}
 #endif
 extern void sched_migrate_task(task_t *p, int cpu);
 
diff -urNp a/kernel/sched.c b/kernel/sched.c
--- a/kernel/sched.c	Fri Oct 11 17:44:26 2002
+++ b/kernel/sched.c	Fri Oct 11 19:35:58 2002
@@ -156,6 +156,7 @@ struct runqueue {
 	unsigned long wait_time;
 	int wait_node;
+	short nr_homenode[MAX_NUMNODES];
 
 } ____cacheline_aligned;
 
@@ -201,11 +202,13 @@ static spinlock_t pool_lock = SPIN_LOCK_
 /* Avoid zeroes in integer divides for load calculations */
 #define BALANCE_FACTOR 100
 
+#define MAX_CACHE_WEIGHT 100
 #ifdef CONFIG_NUMA
 int pool_cpus[NR_CPUS];
 
 #define POOL_DELAY_IDLE (1*HZ/1000)
 #define POOL_DELAY_BUSY (20*HZ/1000)
+#define pool_weight(a,b) ((a)==(b)?MAX_CACHE_WEIGHT:0)
 #define loop_over_node(i,cpu,n) \
 	for(i=pool_ptr[n], cpu=pool_cpus[i]; i<pool_ptr[n+1]; i++, cpu=pool_cpus[i])
 
@@ -251,8 +254,10 @@ void build_pools(void)
 	for (n = 0; n < numnodes; n++) {
 		pool_ptr[n] = ptr;
 		for (cpu = 0; cpu < NR_CPUS; cpu++)
-			if (cpu_online(cpu) && (__cpu_to_node(cpu) == n))
+			if (cpu_online(cpu) && (__cpu_to_node(cpu) == n)) {
 				pool_cpus[ptr++] = cpu;
+				cpu_rq(cpu)->idle->node = n;
+			}
 		pool_nr_cpus[n] = ptr - pool_ptr[n];
 	}
 	numpools=numnodes;
@@ -265,6 +270,7 @@ void build_pools(void)
 #else
 #define POOL_DELAY_IDLE 0
 #define POOL_DELAY_BUSY 0
+#define pool_weight(a,b) (0)
 #define loop_over_node(i,cpu,n) for(cpu=0; cpu<NR_CPUS; cpu++)
 #endif
 
@@ -387,6 +393,7 @@ static inline void activate_task(task_t
 	}
 	enqueue_task(p, array);
 	rq->nr_running++;
+	homenode_inc(rq,p->node);
 }
 
 /*
@@ -394,6 +401,7 @@ static inline void activate_task(task_t
  */
 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
+	homenode_dec(rq,p->node);
 	rq->nr_running--;
 	if (p->state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible++;
@@ -736,6 +744,7 @@ static int calc_pool_load(int *pool_load
 {
 	runqueue_t *rq_src, *this_rq = cpu_rq(this_cpu);
 	int i, cpu, idx=-1, refload, load;
+	int this_pool = cpu_to_node(this_cpu);
 
 	*pool_load = 0;
 	refload = -1;
@@ -747,6 +756,9 @@ static int calc_pool_load(int *pool_load
 		else
 			load = this_rq->prev_nr_running[cpu];
 		this_rq->prev_nr_running[cpu] = rq_src->nr_running;
+		/* prefer cpus running tasks from this node */
+		if (pool != this_pool)
+			load += rq_src->nr_homenode[this_pool];
 		*pool_load += load;
 		if (load > refload) {
 			idx = cpu;
@@ -880,6 +892,7 @@ static inline runqueue_t *find_busiest_q
 static inline task_t *task_to_steal(runqueue_t *busiest, int this_cpu)
 {
 	int idx;
+	int this_pool = cpu_to_node(this_cpu);
 	task_t *next = NULL, *tmp;
 	prio_array_t *array;
 	struct list_head *head, *curr;
@@ -930,6 +943,13 @@ skip_queue:
 	tmp = list_entry(curr, task_t, run_list);
 	if (CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
 		weight = (jiffies - tmp->sleep_timestamp)/cache_decay_ticks;
+		/* limit weight influence of sleep_time and cache coolness */
+		if (weight >= MAX_CACHE_WEIGHT) weight=MAX_CACHE_WEIGHT-1;
+		/* weight depending on homenode of task */
+		weight += pool_weight(this_pool,tmp->node);
+		/* task gets bonus if running on its homenode */
+		if (tmp->node == cpu_to_node(task_cpu(busiest->curr)))
+			weight -= MAX_CACHE_WEIGHT;
 		if (weight > maxweight) {
 			maxweight = weight;
 			next = tmp;
@@ -964,6 +984,35 @@ static inline void pull_task(runqueue_t
 		set_need_resched();
 }
 
+static int sched_push_task(task_t *p, int cpu_dest);
+static inline void
+try_push_home(runqueue_t *this_rq, int this_cpu, int nr_running)
+{
+	task_t *p;
+	int tgt_pool, i, cpu;
+	runqueue_t *rq;
+
+	if ((nr_running != 1) || (current->active_mm == &init_mm))
+		return;
+	p = this_rq->curr;
+	tgt_pool = p->node;
+	if (tgt_pool != cpu_to_node(this_cpu)) {
+		/* compute how many own tasks run on the tgt node */
+		unsigned long load = 0, tot_load = 0, mask;
+		mask = p->cpus_allowed & pool_mask[tgt_pool] & cpu_online_map;
+		loop_over_node(i,cpu,tgt_pool) {
+			rq = cpu_rq(cpu);
+			if (rq->nr_homenode[tgt_pool]) {
+				load += rq->nr_homenode[tgt_pool];
+				mask &= ~(1UL<<cpu);
+			}
+			tot_load += rq->nr_running;
+		}
+		if (mask && tot_load > load)
+			sched_push_task(p, __ffs(mask));
+	}
+}
+
 /*
  * Current runqueue is empty, or rebalance tick: if there is an
  * imbalance (current runqueue is too short) then pull from
@@ -981,8 +1030,10 @@ static void load_balance(runqueue_t *thi
 	/* avoid deadlock by timer interrupt on own cpu */
 	if (pooldata_is_locked()) return;
 	busiest = find_busiest_queue(this_cpu, idle, &nr_running);
-	if (!busiest)
+	if (!busiest) {
+		try_push_home(this_rq, this_cpu, nr_running);
 		goto out;
+	}
 	nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
 
 	/*
@@ -2167,6 +2218,20 @@ out:
 }
 
 #ifdef CONFIG_NUMA
+void set_task_node(task_t *p, int node)
+{
+	runqueue_t *rq;
+	unsigned long flags;
+
+	if (node < 0 || node >= numpools) return;
+	rq = task_rq_lock(p, &flags);
+	if (p->array) {
+		homenode_dec(rq, p->node);
+		homenode_inc(rq, node);
+	}
+	p->node = node;
+	task_rq_unlock(rq, &flags);
+}
 /* used as counter for round-robin node-scheduling */
 static atomic_t sched_node=ATOMIC_INIT(0);
 
@@ -2231,6 +2296,8 @@ void sched_balance_exec(void)
 		cpu_relax();
 	if (numpools > 1) {
 		new_node = sched_best_node(current);
+		if (new_node != current->node)
+			set_task_node(current, new_node);
 	}
 	new_cpu = sched_best_cpu(current, new_node);
 	if (new_cpu != smp_processor_id())
@@ -2238,6 +2305,41 @@ void sched_balance_exec(void)
 }
 #endif
 
+/*
+ * Static per cpu migration request structures for pushing the current
+ * process to another CPU from within load_balance().
+ */
+static migration_req_t migr_req[NR_CPUS];
+
+/*
+ * Push the current task to another cpu asynchronously. To be used from within
+ * load_balance() to push tasks running alone on a remote node back to their
+ * homenode. The RQ lock must be held when calling this function; it protects
+ * migr_req[cpu]. The function must not be preempted!
+ */
+static int sched_push_task(task_t *p, int cpu_dest)
+{
+	int cpu = smp_processor_id();
+	runqueue_t *rq = this_rq();
+
+	if (migr_req[cpu].task)
+		return -1;
+	else {
+		migr_req[cpu].task = p;
+		migr_req[cpu].cpu_dest = cpu_dest;
+		migr_req[cpu].sync = 0;
+		list_add(&migr_req[cpu].list, &rq->migration_queue);
+
+		if (!rq->migration_thread->array) {
+			activate_task(rq->migration_thread, rq);
+			if (rq->migration_thread->prio < rq->curr->prio)
+				resched_task(rq->curr);
+		}
+		rq->migration_thread->state = TASK_RUNNING;
+		return 0;
+	}
+}
+
 void sched_migrate_task(task_t *p, int dest_cpu)
 {
 	unsigned long old_mask;
@@ -2265,6 +2367,7 @@ static int migration_thread(void * data)
 	set_fs(KERNEL_DS);
 
 	set_cpus_allowed(current, 1UL << cpu);
+	set_task_node(current, cpu_to_node(cpu));
 
 	/*
 	 * Migration can happen without a migration thread on the
diff -urNp a/kernel/softirq.c b/kernel/softirq.c
--- a/kernel/softirq.c	Tue Oct 8 15:03:55 2002
+++ b/kernel/softirq.c	Fri Oct 11 17:44:53 2002
@@ -368,6 +368,7 @@ static int ksoftirqd(void * __bind_cpu)
 	set_cpus_allowed(current, 1UL << cpu);
 	if (smp_processor_id() != cpu)
 		BUG();
+	set_task_node(current, __cpu_to_node(cpu));
 
 	sprintf(current->comm, "ksoftirqd_CPU%d", cpu);
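
A note on the nr_homenode[] bias in calc_pool_load(): when the balancing cpu
scans a remote pool, each remote runqueue's load is inflated by the number of
tasks on it whose homenode is the balancing cpu's node, so among equally busy
remote cpus the one holding "our" tasks is preferred as steal source. A
minimal standalone sketch of that arithmetic (biased_load() is a hypothetical
helper name, not a symbol from the patch):

	/* hypothetical helper mirroring the calc_pool_load() change */
	static inline int biased_load(int nr_running, int nr_home_tasks,
				      int pool, int this_pool)
	{
		/* remote cpus are charged extra for holding this node's tasks */
		return nr_running + (pool != this_pool ? nr_home_tasks : 0);
	}

With this_pool == 0 and pool == 1, a remote cpu running 3 tasks of which 2
have homenode 0 is seen as load 3 + 2 = 5 and wins the refload comparison
against an equally busy cpu running only foreign tasks (load 3), so the task
stolen from it is likely one that wants to come home anyway.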
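The steal weight in task_to_steal() now combines three terms: cache coolness,
capped just below MAX_CACHE_WEIGHT; a homenode bonus of MAX_CACHE_WEIGHT when
the candidate's homenode is the stealing node; and a penalty of
MAX_CACHE_WEIGHT when the candidate already runs on its homenode. The
self-contained sketch below reproduces the arithmetic with made-up numbers;
steal_weight() and its parameters are hypothetical names, and sleep_ticks
stands for (jiffies - tmp->sleep_timestamp)/cache_decay_ticks:

	#include <stdio.h>

	#define MAX_CACHE_WEIGHT 100
	#define pool_weight(a,b) ((a)==(b) ? MAX_CACHE_WEIGHT : 0)

	static int steal_weight(int sleep_ticks, int task_node,
				int this_pool, int task_cpu_node)
	{
		int weight = sleep_ticks;

		/* cap the cache coolness contribution */
		if (weight >= MAX_CACHE_WEIGHT)
			weight = MAX_CACHE_WEIGHT - 1;
		/* bonus for tasks whose homenode is the stealing node */
		weight += pool_weight(this_pool, task_node);
		/* penalty for tasks already running on their homenode */
		if (task_node == task_cpu_node)
			weight -= MAX_CACHE_WEIGHT;
		return weight;
	}

	int main(void)
	{
		/* cache-cold task that belongs on the stealing node: 99 + 100 = 199 */
		printf("%d\n", steal_weight(500, 1, 1, 0));
		/* equally cold task already on its homenode: 99 + 0 - 100 = -1 */
		printf("%d\n", steal_weight(500, 0, 1, 0));
		return 0;
	}

So a long-sleeping task stranded on a foreign node outweighs any purely
cache-based candidate, while a task sitting on its homenode is almost never
stolen.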
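The push-home decision in try_push_home() reads as: push the lone remote task
back only if (a) some allowed, online cpu on its homenode currently runs no
homenode tasks (mask != 0) and (b) the homenode carries more total load than
homenode load, i.e. it is dominated by foreign tasks. A runnable toy model of
that test, with should_push() and the plain arrays standing in for the
per-runqueue counters (all names here are hypothetical, not from the patch):

	#include <stdio.h>

	static int should_push(unsigned long allowed_mask,
			       const int nr_homenode[], const int nr_running[],
			       int ncpus)
	{
		unsigned long mask = allowed_mask;
		int cpu, load = 0, tot_load = 0;

		for (cpu = 0; cpu < ncpus; cpu++) {
			if (nr_homenode[cpu]) {
				load += nr_homenode[cpu];
				/* don't push onto cpus already running homenode tasks */
				mask &= ~(1UL << cpu);
			}
			tot_load += nr_running[cpu];
		}
		/* push only if an eligible cpu remains and foreign load dominates */
		return mask && tot_load > load;
	}

	int main(void)
	{
		int home[2] = { 0, 2 }, run[2] = { 3, 2 };
		/* cpu0 runs 3 foreign tasks, cpu1 runs 2 homenode tasks:
		 * mask keeps cpu0, tot_load 5 > load 2 -> push */
		printf("%d\n", should_push(3UL, home, run, 2));
		return 0;
	}

The actual push then happens asynchronously via sched_push_task(), which
queues a migration request and activates the migration thread directly rather
than sleeping, since load_balance() runs with the runqueue lock held.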