From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: 
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1756239AbXFWNJt (ORCPT );
	Sat, 23 Jun 2007 09:09:49 -0400
Received: (majordomo@vger.kernel.org) by vger.kernel.org
	id S1751411AbXFWNJl (ORCPT );
	Sat, 23 Jun 2007 09:09:41 -0400
Received: from e2.ny.us.ibm.com ([32.97.182.142]:54370 "EHLO e2.ny.us.ibm.com"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S1752657AbXFWNJj (ORCPT );
	Sat, 23 Jun 2007 09:09:39 -0400
Date: Sat, 23 Jun 2007 18:48:07 +0530
From: Srivatsa Vaddagiri
To: Ingo Molnar
Cc: Nick Piggin, efault@gmx.de, kernel@kolivas.org, containers@lists.osdl.org,
	ckrm-tech@lists.sourceforge.net, torvalds@linux-foundation.org,
	akpm@linux-foundation.org, pwil3058@bigpond.net.au, tong.n.li@intel.com,
	wli@holomorphy.com, linux-kernel@vger.kernel.org,
	dmitry.adamushko@gmail.com, balbir@in.ibm.com, dev@sw.ru
Subject: [PATCH 1/2] Introduce notion of scheduler hierarchy
Message-ID: <20070623131807.GB5297@linux.vnet.ibm.com>
Reply-To: vatsa@linux.vnet.ibm.com
References: <20070623131545.GA5297@linux.vnet.ibm.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20070623131545.GA5297@linux.vnet.ibm.com>
User-Agent: Mutt/1.5.11
Sender: linux-kernel-owner@vger.kernel.org
X-Mailing-List: linux-kernel@vger.kernel.org

This patch introduces the core changes in CFS required to accomplish
group fairness at higher levels. It also modifies the load-balance
interface between scheduling classes a bit, so that move_tasks (which
is central to load balancing) can be reused to balance between
runqueues of various types (struct rq in the case of SCHED_RT tasks,
struct lrq in the case of SCHED_NORMAL/BATCH tasks).

Signed-off-by: Srivatsa Vaddagiri

---
 include/linux/sched.h   |   17 +
 kernel/sched.c          |  174 ++++++++++++++------
 kernel/sched_debug.c    |   31 ++--
 kernel/sched_fair.c     |  295 +++++++++++++++++++++++++++++++++++++++++----
 kernel/sched_idletask.c |   11 +
 kernel/sched_rt.c       |   47 ++++++-
 6 files changed, 464 insertions(+), 111 deletions(-)

Index: current/include/linux/sched.h
===================================================================
--- current.orig/include/linux/sched.h
+++ current/include/linux/sched.h
@@ -134,8 +134,11 @@ extern unsigned long nr_iowait(void);
 extern unsigned long weighted_cpuload(const int cpu);
 
 struct seq_file;
+struct cfs_rq;
 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
 extern void proc_sched_set_task(struct task_struct *p);
+extern void
+print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now);
 
 /*
  * Task state bitmask. NOTE! These bits are also
@@ -865,8 +868,13 @@ struct sched_class {
 	struct task_struct * (*pick_next_task) (struct rq *rq, u64 now);
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now);
 
-	struct task_struct * (*load_balance_start) (struct rq *rq);
-	struct task_struct * (*load_balance_next) (struct rq *rq);
+	int (*load_balance) (struct rq *this_rq, int this_cpu,
+			struct rq *busiest,
+			unsigned long max_nr_move, unsigned long max_load_move,
+			struct sched_domain *sd, enum cpu_idle_type idle,
+			int *all_pinned, unsigned long *total_load_moved);
+
+	void (*set_curr_task) (struct rq *rq);
 	void (*task_tick) (struct rq *rq, struct task_struct *p);
 	void (*task_new) (struct rq *rq, struct task_struct *p);
 };
@@ -899,6 +907,11 @@ struct sched_entity {
 	s64 fair_key;
 	s64 sum_wait_runtime, sum_sleep_runtime;
 	unsigned long wait_runtime_overruns, wait_runtime_underruns;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct sched_entity *parent;
+	struct cfs_rq *cfs_rq,	/* rq on which this entity is (to be) queued */
+		      *my_q;	/* rq "owned" by this entity/group */
+#endif
 };
 
 struct task_struct {
Index: current/kernel/sched.c
===================================================================
--- current.orig/kernel/sched.c
+++ current/kernel/sched.c
@@ -133,6 +133,22 @@ struct cfs_rq {
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
 	struct rb_node *rb_load_balance_curr;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/* 'curr' points to currently running entity on this cfs_rq.
+	 * It is set to NULL otherwise (i.e when none are currently running).
+	 */
+	struct sched_entity *curr;
+	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
+
+	/* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+	 * (like users, containers etc.)
+	 *
+	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+	 * list is used during load balance.
+	 */
+	struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
+#endif
 };
 
 /* Real-Time classes' related field in a runqueue: */
@@ -168,6 +184,9 @@ struct rq {
 	u64 nr_switches;
 
 	struct cfs_rq cfs;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
+#endif
 	struct rt_rq rt;
 
 	/*
@@ -342,6 +361,16 @@ static inline unsigned long long rq_cloc
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/* Change a task's ->cfs_rq if it moves across CPUs */
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+	p->se.cfs_rq = &task_rq(p)->cfs;
+}
+#else
+static inline void set_task_cfs_rq(struct task_struct *p) { }
+#endif
+
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
@@ -738,6 +767,21 @@ static inline void dec_nr_running(struct
 }
 
 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
+#ifdef CONFIG_SMP
+
+struct rq_iterator {
+	void *arg;
+	struct task_struct *(*start)(void *);
+	struct task_struct *(*next)(void *);
+};
+
+static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_nr_move, unsigned long max_load_move,
+		struct sched_domain *sd, enum cpu_idle_type idle,
+		int *all_pinned, unsigned long *load_moved,
+		int this_best_prio, int best_prio, int best_prio_seen,
+		struct rq_iterator *iterator);
+#endif
 
 #include "sched_stats.h"
 #include "sched_rt.c"
@@ -894,6 +938,7 @@ unsigned long weighted_cpuload(const int
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
 	task_thread_info(p)->cpu = cpu;
+	set_task_cfs_rq(p);
 }
 
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -919,6 +964,7 @@ void set_task_cpu(struct task_struct *p,
 
 	task_thread_info(p)->cpu = new_cpu;
 
+	set_task_cfs_rq(p);
 }
 
 struct migration_req {
@@ -2003,89 +2049,26 @@ int can_migrate_task(struct task_struct
 	return 1;
 }
 
-/*
- * Load-balancing iterator: iterate through the hieararchy of scheduling
- * classes, starting with the highest-prio one:
- */
-
-struct task_struct * load_balance_start(struct rq *rq)
-{
-	struct sched_class *class = sched_class_highest;
-	struct task_struct *p;
-
-	do {
-		p = class->load_balance_start(rq);
-		if (p) {
-			rq->load_balance_class = class;
-			return p;
-		}
-		class = class->next;
-	} while (class);
-
-	return NULL;
-}
-
-struct task_struct * load_balance_next(struct rq *rq)
-{
-	struct sched_class *class = rq->load_balance_class;
-	struct task_struct *p;
-
-	p = class->load_balance_next(rq);
-	if (p)
-		return p;
-	/*
-	 * Pick up the next class (if any) and attempt to start
-	 * the iterator there:
-	 */
-	while ((class = class->next)) {
-		p = class->load_balance_start(rq);
-		if (p) {
-			rq->load_balance_class = class;
-			return p;
-		}
-	}
-	return NULL;
-}
-
-#define rq_best_prio(rq) (rq)->curr->prio
-
-/*
- * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
- * load from busiest to this_rq, as part of a balancing operation within
- * "domain". Returns the number of tasks moved.
- *
- * Called with both runqueues locked.
- */
-static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		      unsigned long max_nr_move, unsigned long max_load_move,
 		      struct sched_domain *sd, enum cpu_idle_type idle,
-		      int *all_pinned)
+		      int *all_pinned, unsigned long *load_moved,
+		      int this_best_prio, int best_prio, int best_prio_seen,
+		      struct rq_iterator *iterator)
 {
-	int pulled = 0, pinned = 0, this_best_prio, best_prio,
-	    best_prio_seen, skip_for_load;
+	int pulled = 0, pinned = 0, skip_for_load;
 	struct task_struct *p;
-	long rem_load_move;
+	long rem_load_move = max_load_move;
 
 	if (max_nr_move == 0 || max_load_move == 0)
 		goto out;
 
-	rem_load_move = max_load_move;
 	pinned = 1;
 
-	this_best_prio = rq_best_prio(this_rq);
-	best_prio = rq_best_prio(busiest);
-	/*
-	 * Enable handling of the case where there is more than one task
-	 * with the best priority. If the current running task is one
-	 * of those with prio==best_prio we know it won't be moved
-	 * and therefore it's safe to override the skip (based on load) of
-	 * any task we find with that prio.
-	 */
-	best_prio_seen = best_prio == busiest->curr->prio;
-
 	/*
 	 * Start the load-balancing iterator:
 	 */
-	p = load_balance_start(busiest);
+	p = iterator->start(iterator->arg);
 next:
 	if (!p)
 		goto out;
@@ -2102,7 +2085,7 @@ next:
 	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
 
 		best_prio_seen |= p->prio == best_prio;
-		p = load_balance_next(busiest);
+		p = iterator->next(iterator->arg);
 		goto next;
 	}
 
@@ -2117,7 +2100,7 @@ next:
 	if (pulled < max_nr_move && rem_load_move > 0) {
 		if (p->prio < this_best_prio)
 			this_best_prio = p->prio;
-		p = load_balance_next(busiest);
+		p = iterator->next(iterator->arg);
 		goto next;
 	}
 out:
@@ -2130,10 +2113,40 @@ out:
 	if (all_pinned)
 		*all_pinned = pinned;
+	*load_moved = max_load_move - rem_load_move;
 
 	return pulled;
 }
 
 /*
+ * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
+ * load from busiest to this_rq, as part of a balancing operation within
+ * "domain". Returns the number of tasks moved.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		      unsigned long max_nr_move, unsigned long max_load_move,
+		      struct sched_domain *sd, enum cpu_idle_type idle,
+		      int *all_pinned)
+{
+	struct sched_class *class = sched_class_highest;
+	unsigned long load_moved, total_nr_moved = 0, nr_moved;
+	long rem_load_move = max_load_move;
+
+	do {
+		nr_moved = class->load_balance(this_rq, this_cpu, busiest,
+				max_nr_move, (unsigned long)rem_load_move,
+				sd, idle, all_pinned, &load_moved);
+		total_nr_moved += nr_moved;
+		max_nr_move -= nr_moved;
+		rem_load_move -= load_moved;
+		class = class->next;
+	} while (class && max_nr_move && rem_load_move > 0);
+
+	return total_nr_moved;
+}
+
+/*
  * find_busiest_group finds and returns the busiest CPU group within the
  * domain. It calculates and returns the amount of weighted load which
  * should be moved to restore balance via the imbalance parameter.
@@ -6184,6 +6197,15 @@ int in_sched_functions(unsigned long add
 		&& addr < (unsigned long)__sched_text_end);
 }
 
+static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
+{
+	cfs_rq->tasks_timeline = RB_ROOT;
+	cfs_rq->fair_clock = 1;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	cfs_rq->rq = rq;
+#endif
+}
+
 void __init sched_init(void)
 {
 	int highest_cpu = 0;
@@ -6204,10 +6226,14 @@ void __init sched_init(void)
 		spin_lock_init(&rq->lock);
 		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
 		rq->nr_running = 0;
-		rq->cfs.tasks_timeline = RB_ROOT;
-		rq->clock = rq->cfs.fair_clock = 1;
+		rq->clock = 1;
 		rq->ls.load_update_last = sched_clock();
 		rq->ls.load_update_start = sched_clock();
+		init_cfs_rq(&rq->cfs, rq);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+		list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+#endif
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
Index: current/kernel/sched_debug.c
===================================================================
--- current.orig/kernel/sched_debug.c
+++ current/kernel/sched_debug.c
@@ -80,15 +80,17 @@ static void print_rq(struct seq_file *m,
 	read_unlock_irq(&tasklist_lock);
 }
 
-static void print_rq_runtime_sum(struct seq_file *m, struct rq *rq)
+static void
+print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
 	s64 wait_runtime_rq_sum = 0;
 	struct task_struct *p;
 	struct rb_node *curr;
 	unsigned long flags;
+	struct rq *rq = &per_cpu(runqueues, cpu);
 
 	spin_lock_irqsave(&rq->lock, flags);
-	curr = first_fair(&rq->cfs);
+	curr = first_fair(cfs_rq);
 	while (curr) {
 		p = rb_entry(curr, struct task_struct, se.run_node);
 		wait_runtime_rq_sum += p->se.wait_runtime;
@@ -101,6 +103,23 @@ static void print_rq_runtime_sum(struct
 			(long long)wait_runtime_rq_sum);
 }
 
+void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now)
+{
+	SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq);
+
+#define P(x) \
+	SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x))
+
+	P(fair_clock);
+	P(exec_clock);
+	P(wait_runtime);
+	P(wait_runtime_overruns);
+	P(wait_runtime_underruns);
+#undef P
+
+	print_cfs_rq_runtime_sum(m, cpu, cfs_rq);
+}
+
 static void print_cpu(struct seq_file *m, int cpu, u64 now)
 {
 	struct rq *rq = &per_cpu(runqueues, cpu);
@@ -136,18 +155,14 @@ static void print_cpu(struct seq_file *m
 	P(clock_overflows);
 	P(clock_unstable_events);
 	P(clock_max_delta);
-	P(cfs.fair_clock);
-	P(cfs.exec_clock);
-	P(cfs.wait_runtime);
-	P(cfs.wait_runtime_overruns);
-	P(cfs.wait_runtime_underruns);
 	P(cpu_load[0]);
 	P(cpu_load[1]);
 	P(cpu_load[2]);
 	P(cpu_load[3]);
 	P(cpu_load[4]);
 #undef P
-	print_rq_runtime_sum(m, rq);
+
+	print_cfs_stats(m, cpu, now);
 
 	print_rq(m, rq, cpu, now);
 }
Index: current/kernel/sched_fair.c
===================================================================
--- current.orig/kernel/sched_fair.c
+++ current/kernel/sched_fair.c
@@ -70,6 +70,31 @@ extern struct sched_class fair_sched_cla
  * CFS operations on generic schedulable entities:
  */
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* cpu runqueue to which this cfs_rq is attached */
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->rq;
+}
+
+/* currently running entity (if any) on this cfs_rq */
+static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->curr;
+}
+
+/* An entity is a task if it doesn't "own" a runqueue */
+#define entity_is_task(se)	(!se->my_q)
+
+static inline void
+set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	cfs_rq->curr = se;
+}
+
+#else	/* CONFIG_FAIR_GROUP_SCHED */
+
 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 {
 	return container_of(cfs_rq, struct rq, cfs);
@@ -87,6 +112,11 @@ static inline struct sched_entity *cfs_r
 
 #define entity_is_task(se)	1
 
+static inline void
+set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+
+#endif	/* CONFIG_FAIR_GROUP_SCHED */
+
 static inline struct task_struct *task_of(struct sched_entity *se)
 {
 	return container_of(se, struct task_struct, se);
@@ -588,10 +618,9 @@ __check_preempt_curr_fair(struct cfs_rq
 		resched_task(rq_of(cfs_rq)->curr);
 }
 
-static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, u64 now)
+static inline void
+set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 {
-	struct sched_entity *se = __pick_next_entity(cfs_rq);
-
 	/*
 	 * Any task has to be enqueued before it get to execute on
 	 * a CPU. So account for the time it spent waiting on the
@@ -601,6 +630,14 @@ static struct sched_entity * pick_next_e
 	 */
 	update_stats_wait_end(cfs_rq, se, now);
 	update_stats_curr_start(cfs_rq, se, now);
+	set_cfs_rq_curr(cfs_rq, se);
+}
+
+static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, u64 now)
+{
+	struct sched_entity *se = __pick_next_entity(cfs_rq);
+
+	set_next_entity(cfs_rq, se, now);
 
 	return se;
 }
@@ -638,6 +675,7 @@ put_prev_entity(struct cfs_rq *cfs_rq, s
 	if (prev->on_rq)
 		update_stats_wait_start(cfs_rq, prev, now);
+	set_cfs_rq_curr(cfs_rq, NULL);
 }
 
 static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
@@ -677,11 +715,90 @@ static void entity_tick(struct cfs_rq *c
  * CFS operations on tasks:
  */
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* Walk up scheduling entities hierarchy */
+#define for_each_sched_entity(se) \
+		for (; se; se = se->parent)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+	return p->se.cfs_rq;
+}
+
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+	return se->cfs_rq;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+	return grp->my_q;
+}
+
+/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
+ * another cpu ('this_cpu')
+ */
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+	/* A later patch will take group into account */
+	return &cpu_rq(this_cpu)->cfs;
+}
+
+/* Iterate thr' all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+	list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+
+/* Do the two (enqueued) tasks belong to the same group ? */
+static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
+{
+	if (curr->se.cfs_rq == p->se.cfs_rq)
+		return 1;
+
+	return 0;
+}
+
+#else	/* CONFIG_FAIR_GROUP_SCHED */
+
+#define for_each_sched_entity(se) \
+		for (; se; se = NULL)
+
 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 {
 	return &task_rq(p)->cfs;
 }
 
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+	struct task_struct *p = task_of(se);
+	struct rq *rq = task_rq(p);
+
+	return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+	return NULL;
+}
+
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+	return &cpu_rq(this_cpu)->cfs;
+}
+
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
+static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
+{
+	return 1;
+}
+
+#endif	/* CONFIG_FAIR_GROUP_SCHED */
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -690,10 +807,15 @@ static inline struct cfs_rq *task_cfs_rq
 static void
 enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
 
-	enqueue_entity(cfs_rq, se, wakeup, now);
+	for_each_sched_entity(se) {
+		if (se->on_rq)
+			break;
+		cfs_rq = cfs_rq_of(se);
+		enqueue_entity(cfs_rq, se, wakeup, now);
+	}
 }
 
 /*
@@ -704,10 +826,16 @@ enqueue_task_fair(struct rq *rq, struct
 static void
 dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
 
-	dequeue_entity(cfs_rq, se, sleep, now);
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		dequeue_entity(cfs_rq, se, sleep, now);
+		/* Don't dequeue parent if it has other entities besides us */
+		if (cfs_rq->load.weight)
+			break;
+	}
 }
 
 /*
@@ -755,7 +883,8 @@ static void check_preempt_curr_fair(stru
 	if (unlikely(p->policy == SCHED_BATCH))
 		gran = sysctl_sched_batch_wakeup_granularity;
 
-	__check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran);
+	if (is_same_group(curr, p))
+		__check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran);
 }
 
 static struct task_struct * pick_next_task_fair(struct rq *rq, u64 now)
@@ -766,7 +895,10 @@ static struct task_struct * pick_next_ta
 	if (unlikely(!cfs_rq->nr_running))
 		return NULL;
 
-	se = pick_next_entity(cfs_rq, now);
+	do {
+		se = pick_next_entity(cfs_rq, now);
+		cfs_rq = group_cfs_rq(se);
+	} while (cfs_rq);
 
 	return task_of(se);
 }
@@ -776,7 +908,13 @@ static struct task_struct * pick_next_ta
  */
 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now)
 {
-	put_prev_entity(task_cfs_rq(prev), &prev->se, now);
+	struct cfs_rq *cfs_rq;
+	struct sched_entity *se = &prev->se;
+
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		put_prev_entity(cfs_rq, se, now);
+	}
 }
 
 /**************************************************
@@ -791,7 +929,7 @@ static void put_prev_task_fair(struct rq
  * the current task:
 */
 static inline struct task_struct *
-__load_balance_iterator(struct rq *rq, struct rb_node *curr)
+__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
 {
 	struct task_struct *p;
 
@@ -799,19 +937,104 @@ __load_balance_iterator(struct rq *rq, s
 		return NULL;
 
 	p = rb_entry(curr, struct task_struct, se.run_node);
-	rq->cfs.rb_load_balance_curr = rb_next(curr);
+	cfs_rq->rb_load_balance_curr = rb_next(curr);
 
 	return p;
 }
 
-static struct task_struct * load_balance_start_fair(struct rq *rq)
+static struct task_struct * load_balance_start_fair(void *arg)
 {
-	return __load_balance_iterator(rq, first_fair(&rq->cfs));
+	struct cfs_rq *cfs_rq = arg;
+
+	return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
 }
 
-static struct task_struct * load_balance_next_fair(struct rq *rq)
+static struct task_struct * load_balance_next_fair(void *arg)
 {
-	return __load_balance_iterator(rq, rq->cfs.rb_load_balance_curr);
+	struct cfs_rq *cfs_rq = arg;
+
+	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
+}
+
+static inline int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+{
+	struct sched_entity *curr;
+	struct task_struct *p;
+
+	if (!cfs_rq->nr_running)
+		return MAX_PRIO;
+
+	curr = __pick_next_entity(cfs_rq);
+	p = task_of(curr);
+
+	return p->prio;
+}
+
+static int
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+			unsigned long max_nr_move, unsigned long max_load_move,
+			struct sched_domain *sd, enum cpu_idle_type idle,
+			int *all_pinned, unsigned long *total_load_moved)
+{
+	struct cfs_rq *busy_cfs_rq;
+	unsigned long load_moved, total_nr_moved = 0, nr_moved;
+	long rem_load_move = max_load_move;
+	struct rq_iterator cfs_rq_iterator;
+
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
+
+	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+		struct cfs_rq *this_cfs_rq;
+		long imbalance;
+		unsigned long maxload;
+		int this_best_prio, best_prio, best_prio_seen = 0;
+
+		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+
+		imbalance = busy_cfs_rq->load.weight -
+						this_cfs_rq->load.weight;
+		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+		if (imbalance <= 0)
+			continue;
+
+		/* Don't pull more than imbalance/2 */
+		imbalance /= 2;
+		maxload = min(rem_load_move, imbalance);
+
+		this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+		best_prio = cfs_rq_best_prio(busy_cfs_rq);
+
+		/*
+		 * Enable handling of the case where there is more than one task
+		 * with the best priority. If the current running task is one
+		 * of those with prio==best_prio we know it won't be moved
+		 * and therefore it's safe to override the skip (based on load)
+		 * of any task we find with that prio.
+		 */
+		if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se)
+			best_prio_seen = 1;
+
+		/* pass busy_cfs_rq argument into
+		 * load_balance_[start|next]_fair iterators
+		 */
+		cfs_rq_iterator.arg = busy_cfs_rq;
+		nr_moved = balance_tasks(this_rq, this_cpu, busiest,
+				max_nr_move, maxload, sd, idle, all_pinned,
+				&load_moved, this_best_prio, best_prio,
+				best_prio_seen, &cfs_rq_iterator);
+
+		total_nr_moved += nr_moved;
+		max_nr_move -= nr_moved;
+		rem_load_move -= load_moved;
+
+		if (max_nr_move <= 0 || rem_load_move <= 0)
+			break;
+	}
+
+	*total_load_moved = max_load_move - rem_load_move;
+
+	return total_nr_moved;
 }
 
 /*
@@ -819,7 +1042,13 @@ static struct task_struct * load_balance
  */
 static void task_tick_fair(struct rq *rq, struct task_struct *curr)
 {
-	entity_tick(task_cfs_rq(curr), &curr->se);
+	struct cfs_rq *cfs_rq;
+	struct sched_entity *se = &curr->se;
+
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		entity_tick(cfs_rq, se);
+	}
 }
 
 /*
@@ -863,6 +1092,24 @@ static void task_new_fair(struct rq *rq,
 	inc_nr_running(p, rq, now);
 }
 
+/* Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_curr_task_fair(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+	struct sched_entity *se = &curr->se;
+	struct cfs_rq *cfs_rq;
+	u64 now = rq_clock(rq);
+
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		set_next_entity(cfs_rq, se, now);
+	}
+}
+
 /*
  * All the scheduling class methods:
  */
@@ -876,8 +1123,18 @@ struct sched_class fair_sched_class __re
 	.pick_next_task		= pick_next_task_fair,
 	.put_prev_task		= put_prev_task_fair,
 
-	.load_balance_start	= load_balance_start_fair,
-	.load_balance_next	= load_balance_next_fair,
+	.load_balance		= load_balance_fair,
+
+	.set_curr_task		= set_curr_task_fair,
 	.task_tick		= task_tick_fair,
 	.task_new		= task_new_fair,
 };
+
+void print_cfs_stats(struct seq_file *m, int cpu, u64 now)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct cfs_rq *cfs_rq;
+
+	for_each_leaf_cfs_rq(rq, cfs_rq)
+		print_cfs_rq(m, cpu, cfs_rq, now);
+}
Index: current/kernel/sched_idletask.c
===================================================================
--- current.orig/kernel/sched_idletask.c
+++ current/kernel/sched_idletask.c
@@ -37,9 +37,13 @@ static void put_prev_task_idle(struct rq
 {
 }
 
-static struct task_struct *load_balance_start_idle(struct rq *rq)
+static int
+load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
+			unsigned long max_nr_move, unsigned long max_load_move,
+			struct sched_domain *sd, enum cpu_idle_type idle,
+			int *all_pinned, unsigned long *total_load_moved)
 {
-	return NULL;
+	return 0;
 }
 
 static void task_tick_idle(struct rq *rq, struct task_struct *curr)
@@ -60,8 +64,7 @@ struct sched_class idle_sched_class __re
 	.pick_next_task		= pick_next_task_idle,
 	.put_prev_task		= put_prev_task_idle,
 
-	.load_balance_start	= load_balance_start_idle,
-	/* no .load_balance_next for idle tasks */
+	.load_balance		= load_balance_idle,
 
 	.task_tick		= task_tick_idle,
 	/* no .task_new for idle tasks */
Index: current/kernel/sched_rt.c
===================================================================
--- current.orig/kernel/sched_rt.c
+++ current/kernel/sched_rt.c
@@ -107,8 +107,9 @@ static void put_prev_task_rt(struct rq *
  * achieve that by always pre-iterating before returning
  * the current task:
  */
-static struct task_struct * load_balance_start_rt(struct rq *rq)
+static struct task_struct * load_balance_start_rt(void *arg)
 {
+	struct rq *rq = arg;
 	struct prio_array *array = &rq->rt.active;
 	struct list_head *head, *curr;
 	struct task_struct *p;
@@ -132,8 +133,9 @@ static struct task_struct * load_balance
 	return p;
 }
 
-static struct task_struct * load_balance_next_rt(struct rq *rq)
+static struct task_struct * load_balance_next_rt(void *arg)
 {
+	struct rq *rq = arg;
 	struct prio_array *array = &rq->rt.active;
 	struct list_head *head, *curr;
 	struct task_struct *p;
@@ -170,6 +172,44 @@ static struct task_struct * load_balance
 	return p;
 }
 
+static int
+load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
+			unsigned long max_nr_move, unsigned long max_load_move,
+			struct sched_domain *sd, enum cpu_idle_type idle,
+			int *all_pinned, unsigned long *load_moved)
+{
+	int this_best_prio, best_prio, best_prio_seen = 0;
+	int nr_moved;
+	struct rq_iterator rt_rq_iterator;
+
+	best_prio = sched_find_first_bit(busiest->rt.active.bitmap);
+	this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap);
+
+	/*
+	 * Enable handling of the case where there is more than one task
+	 * with the best priority. If the current running task is one
+	 * of those with prio==best_prio we know it won't be moved
+	 * and therefore it's safe to override the skip (based on load)
+	 * of any task we find with that prio.
+	 */
+	if (busiest->curr->prio == best_prio)
+		best_prio_seen = 1;
+
+	rt_rq_iterator.start = load_balance_start_rt;
+	rt_rq_iterator.next = load_balance_next_rt;
+	/* pass 'busiest' rq argument into
+	 * load_balance_[start|next]_rt iterators
+	 */
+	rt_rq_iterator.arg = busiest;
+
+	nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
+			max_load_move, sd, idle, all_pinned, load_moved,
+			this_best_prio, best_prio, best_prio_seen,
+			&rt_rq_iterator);
+
+	return nr_moved;
+}
+
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
 {
 	/*
@@ -207,8 +247,7 @@ static struct sched_class rt_sched_class
 	.pick_next_task		= pick_next_task_rt,
 	.put_prev_task		= put_prev_task_rt,
 
-	.load_balance_start	= load_balance_start_rt,
-	.load_balance_next	= load_balance_next_rt,
+	.load_balance		= load_balance_rt,
 
 	.task_tick		= task_tick_rt,
 	.task_new		= task_new_rt,

-- 
Regards,
vatsa