diff -urNp a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
--- a/arch/i386/kernel/smpboot.c Fri Sep 27 23:49:54 2002
+++ b/arch/i386/kernel/smpboot.c Fri Oct 11 14:47:53 2002
@@ -1194,6 +1194,9 @@ int __devinit __cpu_up(unsigned int cpu)
 void __init smp_cpus_done(unsigned int max_cpus)
 {
         zap_low_mappings();
+#ifdef CONFIG_NUMA
+        build_pools();
+#endif
 }
 
 void __init smp_intr_init()
diff -urNp a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
--- a/arch/ia64/kernel/smpboot.c Tue Oct 8 15:04:54 2002
+++ b/arch/ia64/kernel/smpboot.c Fri Oct 11 14:47:25 2002
@@ -397,7 +397,7 @@ unsigned long cache_decay_ticks;	/* # of
 static void
 smp_tune_scheduling (void)
 {
-        cache_decay_ticks = 10;	/* XXX base this on PAL info and cache-bandwidth estimate */
+        cache_decay_ticks = 8;	/* XXX base this on PAL info and cache-bandwidth estimate */
 
         printk("task migration cache decay timeout: %ld msecs.\n",
                (cache_decay_ticks + 1) * 1000 / HZ);
@@ -508,6 +508,9 @@ smp_cpus_done (unsigned int dummy)
         printk(KERN_INFO"Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
                num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100);
+#ifdef CONFIG_NUMA
+        build_pools();
+#endif
 }
 
 int __devinit
diff -urNp a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h Tue Oct 8 15:03:54 2002
+++ b/include/linux/sched.h Thu Oct 10 13:45:18 2002
@@ -22,6 +22,7 @@ extern unsigned long event;
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -167,7 +168,6 @@ extern void update_one_process(struct ta
 extern void scheduler_tick(int user_tick, int system);
 extern unsigned long cache_decay_ticks;
 
-
 #define MAX_SCHEDULE_TIMEOUT LONG_MAX
 extern signed long FASTCALL(schedule_timeout(signed long timeout));
 asmlinkage void schedule(void);
@@ -457,6 +457,13 @@ extern void set_cpus_allowed(task_t *p,
 # define set_cpus_allowed(p, new_mask) do { } while (0)
 #endif
 
+#ifdef CONFIG_NUMA
+extern void build_pools(void);
+extern void pooldata_lock(void);
+extern void pooldata_unlock(void);
+#endif
+extern void sched_migrate_task(task_t *p, int cpu);
+
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(task_t *p);
 extern int task_nice(task_t *p);
diff -urNp a/kernel/sched.c b/kernel/sched.c
--- a/kernel/sched.c Fri Sep 27 23:50:27 2002
+++ b/kernel/sched.c Fri Oct 11 19:08:32 2002
@@ -154,6 +154,9 @@ struct runqueue {
         task_t *migration_thread;
         struct list_head migration_queue;
+        unsigned long wait_time;
+        int wait_node;
+
 } ____cacheline_aligned;
 
 static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
@@ -174,6 +177,103 @@ static struct runqueue runqueues[NR_CPUS
 #endif
 
 /*
+ * Variables for describing and accessing processor pools. Using a
+ * compressed-row-format-like notation. Processor pools are treated
+ * like logical node numbers.
+ *
+ * numpools:       number of CPU pools (nodes),
+ * pool_cpus[]:    CPUs in pools sorted by their pool ID,
+ * pool_ptr[pool]: index of first element in pool_cpus[] belonging to pool.
+ * pool_mask[]:    cpu mask of a pool.
+ *
+ * Example: loop over all CPUs in a node:
+ *        loop_over_node(i,cpu,node) {
+ *                // "cpu" is set to each CPU in the node
+ *                // "i" is needed as a loop index, should be of int type
+ *                ...
+ *        }
+ *
+ */
+int numpools, pool_ptr[MAX_NUMNODES+1], pool_nr_cpus[MAX_NUMNODES];
+unsigned long pool_mask[MAX_NUMNODES];
+static spinlock_t pool_lock = SPIN_LOCK_UNLOCKED;
+#define cpu_to_node(cpu) __cpu_to_node(cpu)
+
+/* Avoid zeroes in integer divides for load calculations */
+#define BALANCE_FACTOR 100
+
+#ifdef CONFIG_NUMA
+int pool_cpus[NR_CPUS];
+#define POOL_DELAY_IDLE (1*HZ/1000)
+#define POOL_DELAY_BUSY (20*HZ/1000)
+#define loop_over_node(i,cpu,n) \
+        for(i=pool_ptr[n], cpu=pool_cpus[i]; i<pool_ptr[n+1]; i++, cpu=pool_cpus[i])
-        if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
-                nr_running = this_rq->nr_running;
-        else
-                nr_running = this_rq->prev_nr_running[this_cpu];
-
-        busiest = NULL;
-        max_load = 1;
-        for (i = 0; i < NR_CPUS; i++) {
-                if (!cpu_online(i))
-                        continue;
-
-                rq_src = cpu_rq(i);
-                if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i]))
+ * Calculate load of a CPU pool, return it in pool_load, return load
+ * of most loaded CPU in cpu_load.
+ * Return the index of the most loaded runqueue.
+ */
+static int calc_pool_load(int *pool_load, int *cpu_load, int this_cpu, int pool, int idle)
+{
+        runqueue_t *rq_src, *this_rq = cpu_rq(this_cpu);
+        int i, cpu, idx=-1, refload, load;
+
+        *pool_load = 0;
+        refload = -1;
+        loop_over_node(i, cpu, pool) {
+                if (!cpu_online(i)) continue;
+                rq_src = cpu_rq(cpu);
+                if (idle || (rq_src->nr_running < this_rq->prev_nr_running[cpu]))
                         load = rq_src->nr_running;
                 else
-                        load = this_rq->prev_nr_running[i];
-                this_rq->prev_nr_running[i] = rq_src->nr_running;
+                        load = this_rq->prev_nr_running[cpu];
+                this_rq->prev_nr_running[cpu] = rq_src->nr_running;
+                *pool_load += load;
+                if (load > refload) {
+                        idx = cpu;
+                        refload = load;
+                }
+        }
+        *pool_load = *pool_load * BALANCE_FACTOR / pool_nr_cpus[pool];
+        return idx;
+}
 
-                if ((load > max_load) && (rq_src != this_rq)) {
-                        busiest = rq_src;
-                        max_load = load;
+#ifdef CONFIG_NUMA
+/*
+ * Scan the load of all pools/nodes, find the most loaded one and the
+ * most loaded CPU on it. Returns:
+ *   -1         : if no steal should occur,
+ *   cpu number : steal should be attempted from the given cpu.
+ *
+ * This routine implements node balancing by delaying steals from remote
+ * nodes more if the own node is (within margins) averagely loaded. The
+ * most loaded node is remembered as well as the time (jiffies). In the
+ * following calls to the load_balancer the time is compared with
+ * POOL_DELAY_BUSY (if load is around the average) or POOL_DELAY_IDLE (if own
+ * node is unloaded) if the most loaded node didn't change. This gives less
+ * loaded nodes the chance to approach the average load but doesn't exclude
+ * busy nodes from stealing (just in case the cpus_allowed mask isn't good
+ * for the idle nodes).
+ * This concept can be extended easily to more than two levels (multi-level
+ * scheduler), e.g.: CPU -> node -> supernode... by implementing node-distance
+ * dependent steal delays.
+ *
+ */
+static inline
+int scan_other_pools(int *max_cpu_load, int this_pool_load,
+                     int this_pool, int this_cpu, int idle)
+{
+        int i, imax, max_pool_load, max_pool_idx, best_cpu=-1;
+        int pool, pool_load, cpu_load, steal_delay, avg_load;
+        runqueue_t *this_rq = cpu_rq(this_cpu);
+
+#define POOLS_BALANCED(comp,this) (((comp) -(this)) < 50)
+        best_cpu = -1;
+        max_pool_load = this_pool_load;
+        max_pool_idx = this_pool;
+        avg_load = max_pool_load * pool_nr_cpus[this_pool];
+        for (i = 1; i < numpools; i++) {
+                pool = (i + this_pool) % numpools;
+                imax = calc_pool_load(&pool_load, &cpu_load, this_cpu, pool, idle);
+                avg_load += pool_load*pool_nr_cpus[pool];
+                if (pool_load > max_pool_load) {
+                        max_pool_load = pool_load;
+                        *max_cpu_load = cpu_load;
+                        max_pool_idx = pool;
+                        best_cpu = imax;
                 }
         }
+        /* Exit if not enough imbalance on any remote node. */
+        if ((best_cpu < 0) || (max_pool_load <= 100) ||
+            POOLS_BALANCED(max_pool_load,this_pool_load)) {
+                this_rq->wait_node = -1;
+                return -1;
+        }
+        avg_load /= num_online_cpus();
+        /* Wait longer before stealing if own pool's load is average. */
+        if (POOLS_BALANCED(avg_load,this_pool_load))
+                steal_delay = POOL_DELAY_BUSY;
+        else
+                steal_delay = POOL_DELAY_IDLE;
+        /* if we have a new most loaded node: just mark it */
+        if (this_rq->wait_node != max_pool_idx) {
+                this_rq->wait_node = max_pool_idx;
+                this_rq->wait_time = jiffies;
+                return -1;
+        } else
+                /* old most loaded node: check if waited enough */
+                if (jiffies - this_rq->wait_time < steal_delay)
+                        return -1;
+        return best_cpu;
+}
+#endif
 
-        if (likely(!busiest))
-                goto out;
+/*
+ * Find a runqueue from which to steal a task. We try to do this as locally as
+ * possible because we don't want to let tasks get far from their node.
+ * This is done in two steps:
+ * 1. First try to find a runqueue within the own CPU pool (AKA node) with
+ *    imbalance larger than 25% (relative to the current runqueue).
+ * 2. If the local node is well balanced, locate the most loaded node and its
+ *    most loaded CPU. This is handled in scan_other_pools().
+ *
+ */
+static inline runqueue_t *find_busiest_queue(int this_cpu, int idle, int *nr_running)
+{
+        runqueue_t *busiest = NULL, *this_rq = cpu_rq(this_cpu);
+        int best_cpu, max_cpu_load;
+        int this_pool, this_pool_load;
 
-        *imbalance = (max_load - nr_running) / 2;
+        /* Need at least ~25% imbalance to trigger balancing. */
+#define CPUS_BALANCED(m,t) (((m) <= 1) || (((m) - (t))/2 < (((m) + (t))/2 + 3)/4))
 
-        /* It needs an at least ~25% imbalance to trigger balancing. */
-        if (!idle && (*imbalance < (max_load + 3)/4)) {
-                busiest = NULL;
-                goto out;
-        }
+        if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
+                *nr_running = this_rq->nr_running;
+        else
+                *nr_running = this_rq->prev_nr_running[this_cpu];
 
-        nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
-        /*
-         * Make sure nothing changed since we checked the
-         * runqueue length.
-         */
-        if (busiest->nr_running <= nr_running + 1) {
-                spin_unlock(&busiest->lock);
-                busiest = NULL;
+        this_pool = (numpools == 1 ? 0 : cpu_to_node(this_cpu));
+        best_cpu = calc_pool_load(&this_pool_load, &max_cpu_load, this_cpu, this_pool, idle);
+        if (best_cpu != this_cpu)
+                if (!CPUS_BALANCED(max_cpu_load,*nr_running)) {
+                        busiest = cpu_rq(best_cpu);
+                        this_rq->wait_node = -1;
+                        goto out;
+                }
+#ifdef CONFIG_NUMA
+        if (numpools > 1) {
+                best_cpu = scan_other_pools(&max_cpu_load, this_pool_load, this_pool, this_cpu, idle);
+                if ((best_cpu >= 0) &&
+                    (!CPUS_BALANCED(max_cpu_load,*nr_running))) {
+                        busiest = cpu_rq(best_cpu);
+                        this_rq->wait_node = -1;
+                }
         }
-out:
+#endif
+ out:
         return busiest;
 }
 
 /*
- * pull_task - move a task from a remote runqueue to the local runqueue.
- * Both runqueues must be locked.
+ * Find a task to steal from the busiest RQ. The busiest->lock must be held
+ * while calling this routine.
  */
-static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
+static inline task_t *task_to_steal(runqueue_t *busiest, int this_cpu)
 {
-        dequeue_task(p, src_array);
-        src_rq->nr_running--;
-        set_task_cpu(p, this_cpu);
-        this_rq->nr_running++;
-        enqueue_task(p, this_rq->active);
-        /*
-         * Note that idle threads have a prio of MAX_PRIO, for this test
-         * to be always true for them.
-         */
-        if (p->prio < this_rq->curr->prio)
-                set_need_resched();
-}
-
-/*
- * Current runqueue is empty, or rebalance tick: if there is an
- * inbalance (current runqueue is too short) then pull from
- * busiest runqueue(s).
- *
- * We call this with the current runqueue locked,
- * irqs disabled.
- */
-static void load_balance(runqueue_t *this_rq, int idle)
-{
-        int imbalance, idx, this_cpu = smp_processor_id();
-        runqueue_t *busiest;
+        int idx;
+        task_t *next = NULL, *tmp;
         prio_array_t *array;
         struct list_head *head, *curr;
-        task_t *tmp;
+        int weight, maxweight=0;
 
-        busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance);
-        if (!busiest)
-                goto out;
+        /*
+         * We do not migrate tasks that are:
+         * 1) running (obviously), or
+         * 2) cannot be migrated to this CPU due to cpus_allowed.
+         */
+
+#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
+        ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \
+                p != rq->curr && \
+                        ((p)->cpus_allowed & (1UL<<(this_cpu))))
 
         /*
          * We first consider expired tasks. Those will likely not be
@@ -772,7 +924,7 @@ skip_bitmap:
                         array = busiest->active;
                         goto new_array;
                 }
-                goto out_unlock;
+                goto out;
         }
 
         head = array->queue + idx;
@@ -780,33 +932,74 @@ skip_queue:
         tmp = list_entry(curr, task_t, run_list);
 
+        if (CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
+                weight = (jiffies - tmp->sleep_timestamp)/cache_decay_ticks;
+                if (weight > maxweight) {
+                        maxweight = weight;
+                        next = tmp;
+                }
+        }
+        curr = curr->next;
+        if (curr != head)
+                goto skip_queue;
+        idx++;
+        goto skip_bitmap;
+
+ out:
+        return next;
+}
+
+/*
+ * pull_task - move a task from a remote runqueue to the local runqueue.
+ * Both runqueues must be locked.
+ */
+static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
+{
+        dequeue_task(p, src_array);
+        src_rq->nr_running--;
+        set_task_cpu(p, this_cpu);
+        this_rq->nr_running++;
+        enqueue_task(p, this_rq->active);
         /*
-         * We do not migrate tasks that are:
-         * 1) running (obviously), or
-         * 2) cannot be migrated to this CPU due to cpus_allowed, or
-         * 3) are cache-hot on their current CPU.
+         * Note that idle threads have a prio of MAX_PRIO, for this test
+         * to be always true for them.
          */
+        if (p->prio < this_rq->curr->prio)
+                set_need_resched();
+}
 
-#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
-        ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \
-                !task_running(rq, p) && \
-                        ((p)->cpus_allowed & (1UL << (this_cpu))))
-
-        curr = curr->prev;
-
-        if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
-                if (curr != head)
-                        goto skip_queue;
-                idx++;
-                goto skip_bitmap;
-        }
-        pull_task(busiest, array, tmp, this_rq, this_cpu);
-        if (!idle && --imbalance) {
-                if (curr != head)
-                        goto skip_queue;
-                idx++;
-                goto skip_bitmap;
-        }
+/*
+ * Current runqueue is empty, or rebalance tick: if there is an
+ * imbalance (current runqueue is too short) then pull from
+ * busiest runqueue(s).
+ *
+ * We call this with the current runqueue locked,
+ * irqs disabled.
+ */
+static void load_balance(runqueue_t *this_rq, int idle)
+{
+        int nr_running, this_cpu = task_cpu(this_rq->curr);
+        task_t *tmp;
+        runqueue_t *busiest;
+
+        /* avoid deadlock by timer interrupt on own cpu */
+        if (pooldata_is_locked()) return;
+        busiest = find_busiest_queue(this_cpu, idle, &nr_running);
+        if (!busiest)
+                goto out;
+
+        nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
+        /*
+         * Make sure nothing changed since we checked the
+         * runqueue length.
+         */
+        if (busiest->nr_running <= nr_running + 1)
+                goto out_unlock;
+
+        tmp = task_to_steal(busiest, this_cpu);
+        if (!tmp)
+                goto out_unlock;
+        pull_task(busiest, tmp->array, tmp, this_rq, this_cpu);
 out_unlock:
         spin_unlock(&busiest->lock);
 out:
@@ -819,10 +1012,10 @@ out:
  * frequency and balancing agressivity depends on whether the CPU is
  * idle or not.
  *
- * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on
+ * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on
  * systems with HZ=100, every 10 msecs.)
  */
-#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
+#define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
 
 static inline void idle_tick(runqueue_t *rq)
@@ -1920,6 +2113,8 @@ typedef struct {
         struct list_head list;
         task_t *task;
         struct completion done;
+        int cpu_dest;
+        int sync;
 } migration_req_t;
 
 /*
@@ -1965,6 +2160,7 @@ void set_cpus_allowed(task_t *p, unsigne
         }
         init_completion(&req.done);
         req.task = p;
+        req.sync = 1;
         list_add(&req.list, &rq->migration_queue);
         task_rq_unlock(rq, &flags);
         wake_up_process(rq->migration_thread);
@@ -1974,6 +2170,17 @@ out:
         preempt_enable();
 }
 
+void sched_migrate_task(task_t *p, int dest_cpu)
+{
+        unsigned long old_mask;
+
+        old_mask = p->cpus_allowed;
+        if (!(old_mask & (1UL << dest_cpu)))
+                return;
+        set_cpus_allowed(p, 1UL << dest_cpu);
+        set_cpus_allowed(p, old_mask);
+}
+
 /*
  * migration_thread - this is a highprio system thread that performs
  * thread migration by 'pulling' threads into the target runqueue.
@@ -2009,7 +2216,7 @@ static int migration_thread(void * data)
         for (;;) {
                 runqueue_t *rq_src, *rq_dest;
                 struct list_head *head;
-                int cpu_src, cpu_dest;
+                int cpu_src, cpu_dest, sync;
                 migration_req_t *req;
                 unsigned long flags;
                 task_t *p;
@@ -2024,10 +2231,17 @@ static int migration_thread(void * data)
                 }
                 req = list_entry(head->next, migration_req_t, list);
                 list_del_init(head->next);
-                spin_unlock_irqrestore(&rq->lock, flags);
 
                 p = req->task;
-                cpu_dest = __ffs(p->cpus_allowed);
+                sync = req->sync;
+                if (sync)
+                        cpu_dest = __ffs(p->cpus_allowed & cpu_online_map);
+                else {
+                        cpu_dest = req->cpu_dest;
+                        req->task = NULL;
+                }
+                spin_unlock_irqrestore(&rq->lock, flags);
+
                 rq_dest = cpu_rq(cpu_dest);
 repeat:
                 cpu_src = task_cpu(p);
@@ -2050,7 +2264,8 @@ repeat:
                 double_rq_unlock(rq_src, rq_dest);
                 local_irq_restore(flags);
 
-                complete(&req->done);
+                if (sync)
+                        complete(&req->done);
         }
 }
 
@@ -2129,7 +2344,17 @@ void __init sched_init(void)
                         // delimiter for bitsearch
                         __set_bit(MAX_PRIO, array->bitmap);
                 }
+#ifdef CONFIG_NUMA
+                pool_cpus[i] = i;
+#endif
         }
+        numpools = 1;
+        pool_ptr[0] = 0;
+        pool_ptr[1] = NR_CPUS;
+        pool_mask[0] = -1L;
+        pool_nr_cpus[0] = NR_CPUS;
+        if (cache_decay_ticks)
+                cache_decay_ticks=1;
         /*
          * We have to do a little magic to get the first
         * thread right in SMP mode.
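
The compressed-row pool layout described in the comment added to kernel/sched.c can be illustrated outside the kernel. The sketch below is not part of the patch: it reuses the names numpools, pool_ptr[], pool_cpus[] and pool_nr_cpus[] from the patch, but the 2-node/4-CPU topology and the build_example_pools() helper are invented for illustration (in the patch the arrays are filled by build_pools(), called from smp_cpus_done()).

/*
 * Standalone illustration (not part of the patch) of the compressed-row
 * pool layout: pool_ptr[n] indexes the first entry of node n in pool_cpus[],
 * so node n owns pool_cpus[pool_ptr[n] .. pool_ptr[n+1]-1].
 */
#include <stdio.h>

#define MAX_NUMNODES 2
#define NR_CPUS      4

static int numpools;
static int pool_ptr[MAX_NUMNODES + 1];
static int pool_cpus[NR_CPUS + 1];	/* one spare slot keeps the loop's trailing read in bounds */
static int pool_nr_cpus[MAX_NUMNODES];

/* made-up stand-in for the patch's build_pools(): node 0 = {0,1}, node 1 = {2,3} */
static void build_example_pools(void)
{
	int cpu, node, idx = 0;

	numpools = 2;
	for (node = 0; node < numpools; node++) {
		pool_ptr[node] = idx;
		for (cpu = 0; cpu < NR_CPUS; cpu++)
			if (cpu / 2 == node)	/* invented CPU -> node mapping */
				pool_cpus[idx++] = cpu;
		pool_nr_cpus[node] = idx - pool_ptr[node];
	}
	pool_ptr[numpools] = idx;
}

int main(void)
{
	int i, cpu, node;

	build_example_pools();
	for (node = 0; node < numpools; node++) {
		printf("node %d (%d cpus):", node, pool_nr_cpus[node]);
		/* same iteration pattern as the patch's loop_over_node(i,cpu,n) macro */
		for (i = pool_ptr[node], cpu = pool_cpus[i];
		     i < pool_ptr[node + 1]; i++, cpu = pool_cpus[i])
			printf(" cpu%d", cpu);
		printf("\n");
	}
	return 0;
}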
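
The two balance thresholds used by the new find_busiest_queue()/scan_other_pools() path can be checked in isolation. This standalone sketch copies the CPUS_BALANCED, POOLS_BALANCED and BALANCE_FACTOR definitions from the patch; the queue lengths fed to them are invented examples.

/*
 * Standalone check (not part of the patch) of the two balance thresholds.
 * CPUS_BALANCED compares two raw runqueue lengths within one node;
 * POOLS_BALANCED compares per-node loads already scaled by BALANCE_FACTOR
 * and divided by the node's CPU count, as calc_pool_load() does.
 */
#include <stdio.h>

#define BALANCE_FACTOR 100

/* roughly a 25% imbalance is required before an intra-node steal */
#define CPUS_BALANCED(m,t) (((m) <= 1) || (((m) - (t))/2 < (((m) + (t))/2 + 3)/4))

/* a remote pool counts as busier only if its scaled load exceeds ours by 50 or more */
#define POOLS_BALANCED(comp,this) (((comp) - (this)) < 50)

int main(void)
{
	/* 5 tasks vs 4 tasks: within the margin, no intra-node steal */
	printf("CPUS_BALANCED(5, 4) = %d\n", CPUS_BALANCED(5, 4));
	/* 8 tasks vs 4 tasks: imbalanced enough to trigger a steal */
	printf("CPUS_BALANCED(8, 4) = %d\n", CPUS_BALANCED(8, 4));

	/* remote node: 2 CPUs with 3 tasks each; local node: 2 CPUs with 2 tasks each */
	int remote = (3 + 3) * BALANCE_FACTOR / 2;	/* 300 */
	int local  = (2 + 2) * BALANCE_FACTOR / 2;	/* 200 */
	printf("POOLS_BALANCED(%d, %d) = %d\n", remote, local, POOLS_BALANCED(remote, local));
	/* a smaller gap (240 vs 200) stays below the 50-point margin */
	printf("POOLS_BALANCED(240, 200) = %d\n", POOLS_BALANCED(240, 200));
	return 0;
}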
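
The delayed inter-node steal described in the scan_other_pools() comment can also be seen on its own. The sketch below mimics the wait_node/wait_time bookkeeping with a fake jiffies counter and an assumed HZ of 1000; should_steal() is a made-up name, not a function from the patch.

/*
 * Standalone illustration (not part of the patch) of the delayed node-steal
 * logic: a newly observed "most loaded" node is only remembered at first, and
 * a steal is attempted once it has stayed the most loaded node for
 * POOL_DELAY_IDLE or POOL_DELAY_BUSY ticks.
 */
#include <stdio.h>

#define HZ 1000
#define POOL_DELAY_IDLE (1*HZ/1000)	/* short wait: own node is clearly underloaded */
#define POOL_DELAY_BUSY (20*HZ/1000)	/* long wait: own node is around the average load */

static unsigned long jiffies;		/* fake clock for the example */
static int wait_node = -1;
static unsigned long wait_time;

/* returns 1 if a steal from 'max_node' should be attempted now */
static int should_steal(int max_node, int own_load_is_average)
{
	int delay = own_load_is_average ? POOL_DELAY_BUSY : POOL_DELAY_IDLE;

	if (wait_node != max_node) {	/* new busiest node: just remember it */
		wait_node = max_node;
		wait_time = jiffies;
		return 0;
	}
	return jiffies - wait_time >= delay;
}

int main(void)
{
	for (jiffies = 0; jiffies < 25; jiffies++)
		if (should_steal(1, 1))	/* node 1 stays busiest, own load is average */
			printf("tick %lu: steal from node 1\n", jiffies);
	return 0;
}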