From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755159Ab2DFHPp (ORCPT ); Fri, 6 Apr 2012 03:15:45 -0400 Received: from mail-wg0-f44.google.com ([74.125.82.44]:64275 "EHLO mail-wg0-f44.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754837Ab2DFHPl (ORCPT ); Fri, 6 Apr 2012 03:15:41 -0400 From: Juri Lelli To: peterz@infradead.org, tglx@linutronix.de Cc: mingo@redhat.com, rostedt@goodmis.org, cfriesen@nortel.com, oleg@redhat.com, fweisbec@gmail.com, darren@dvhart.com, johan.eker@ericsson.com, p.faure@akatech.ch, linux-kernel@vger.kernel.org, claudio@evidence.eu.com, michael@amarulasolutions.com, fchecconi@gmail.com, tommaso.cucinotta@sssup.it, juri.lelli@gmail.com, nicola.manica@disi.unitn.it, luca.abeni@unitn.it, dhaval.giani@gmail.com, hgu1972@gmail.com, paulmck@linux.vnet.ibm.com, raistlin@linux.it, insop.song@ericsson.com, liming.wang@windriver.com Subject: [PATCH 15/16] sched: speed up -dl pushes with a push-heap. Date: Fri, 6 Apr 2012 09:14:40 +0200 Message-Id: <1333696481-3433-16-git-send-email-juri.lelli@gmail.com> X-Mailer: git-send-email 1.7.5.4 In-Reply-To: <1333696481-3433-1-git-send-email-juri.lelli@gmail.com> References: <1333696481-3433-1-git-send-email-juri.lelli@gmail.com> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Data from tests confirmed that the original active load balancing logic didn't scale neither in the number of CPU nor in the number of tasks (as sched_rt does). Here we provide a global data structure to keep track of deadlines of the running tasks in the system. The structure is composed by a bitmask showing the free CPUs and a max-heap, needed when the system is heavily loaded. The implementation and concurrent access scheme are kept simple by design. However, our measurements show that we can compete with sched_rt on large multi-CPUs machines [1]. Only the push path is addressed, the extension to use this structure also for pull decisions is straightforward. However, we are currently evaluating different (in order to decrease/avoid contention) data structures to solve possibly both problems. We are also going to re-run tests considering recent changes inside cpupri [2]. [1] http://retis.sssup.it/~jlelli/papers/Ospert11Lelli.pdf [2] http://www.spinics.net/lists/linux-rt-users/msg06778.html Signed-off-by: Juri Lelli --- kernel/Makefile | 1 + kernel/sched.c | 5 + kernel/sched_cpudl.c | 208 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched_cpudl.h | 34 ++++++++ kernel/sched_dl.c | 52 +++--------- 5 files changed, 261 insertions(+), 39 deletions(-) create mode 100644 kernel/sched_cpudl.c create mode 100644 kernel/sched_cpudl.h diff --git a/kernel/Makefile b/kernel/Makefile index c961d3a..0aca455 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -105,6 +105,7 @@ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o +obj-$(CONFIG_SMP) += sched_cpudl.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o diff --git a/kernel/sched.c b/kernel/sched.c index d5403c8..f8a857a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -81,6 +81,7 @@ #endif #include "sched_cpupri.h" +#include "sched_cpudl.h" #include "workqueue_sched.h" #include "sched_autogroup.h" @@ -707,6 +708,7 @@ struct root_domain { cpumask_var_t dlo_mask; atomic_t dlo_count; struct dl_bw dl_bw; + struct cpudl cpudl; /* * The "RT overload" flag: it gets set if a CPU has more than @@ -7683,6 +7685,7 @@ static void free_rootdomain(struct rcu_head *rcu) struct root_domain *rd = container_of(rcu, struct root_domain, rcu); cpupri_cleanup(&rd->cpupri); + cpudl_cleanup(&rd->cpudl); free_cpumask_var(rd->dlo_mask); free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); @@ -7741,6 +7744,8 @@ static int init_rootdomain(struct root_domain *rd) goto free_dlo_mask; init_dl_bw(&rd->dl_bw); + if (cpudl_init(&rd->cpudl) != 0) + goto free_dlo_mask; if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; diff --git a/kernel/sched_cpudl.c b/kernel/sched_cpudl.c new file mode 100644 index 0000000..b0908ac --- /dev/null +++ b/kernel/sched_cpudl.c @@ -0,0 +1,208 @@ +/* + * kernel/sched_cpudl.c + * + * Global CPU deadline management + * + * Author: Juri Lelli + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include +#include +#include "sched_cpudl.h" + +static inline int parent(int i) +{ + return (i - 1) >> 1; +} + +static inline int left_child(int i) +{ + return (i << 1) + 1; +} + +static inline int right_child(int i) +{ + return (i << 1) + 2; +} + +static inline int dl_time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +void cpudl_exchange(struct cpudl *cp, int a, int b) +{ + int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; + + swap(cp->elements[a], cp->elements[b]); + swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); +} + +void cpudl_heapify(struct cpudl *cp, int idx) +{ + int l, r, largest; + + /* adapted from lib/prio_heap.c */ + while(1) { + l = left_child(idx); + r = right_child(idx); + largest = idx; + + if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, + cp->elements[l].dl)) + largest = l; + if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, + cp->elements[r].dl)) + largest = r; + if (largest == idx) + break; + + /* Push idx down the heap one level and bump one up */ + cpudl_exchange(cp, largest, idx); + idx = largest; + } +} + +void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) +{ + WARN_ON(idx > num_present_cpus() && idx != -1); + + if (dl_time_before(new_dl, cp->elements[idx].dl)) { + cp->elements[idx].dl = new_dl; + cpudl_heapify(cp, idx); + } else { + cp->elements[idx].dl = new_dl; + while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, + cp->elements[idx].dl)) { + cpudl_exchange(cp, idx, parent(idx)); + idx = parent(idx); + } + } +} + +static inline int cpudl_maximum(struct cpudl *cp) +{ + return cp->elements[0].cpu; +} + +/* + * cpudl_find - find the best (later-dl) CPU in the system + * @cp: the cpudl max-heap context + * @p: the task + * @later_mask: a mask to fill in with the selected CPUs (or NULL) + * + * Returns: int - best CPU (heap maximum if suitable) + */ +int cpudl_find(struct cpudl *cp, struct cpumask *dlo_mask, + struct task_struct *p, struct cpumask *later_mask) +{ + int best_cpu = -1; + const struct sched_dl_entity *dl_se = &p->dl; + + if (later_mask && cpumask_and(later_mask, cp->free_cpus, + &p->cpus_allowed) && cpumask_and(later_mask, + later_mask, cpu_active_mask)) { + best_cpu = cpumask_any(later_mask); + goto out; + } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && + dl_time_before(dl_se->deadline, cp->elements[0].dl)) { + best_cpu = cpudl_maximum(cp); + if (later_mask) + cpumask_set_cpu(best_cpu, later_mask); + } + +out: + WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1); + + return best_cpu; +} + +/* + * cpudl_set - update the cpudl max-heap + * @cp: the cpudl max-heap context + * @cpu: the target cpu + * @dl: the new earliest deadline for this cpu + * + * Notes: assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) +{ + int old_idx, new_cpu; + unsigned long flags; + + WARN_ON(cpu > num_present_cpus()); + + raw_spin_lock_irqsave(&cp->lock, flags); + old_idx = cp->cpu_to_idx[cpu]; + if (!is_valid) { + /* remove item */ + new_cpu = cp->elements[cp->size - 1].cpu; + cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; + cp->elements[old_idx].cpu = new_cpu; + cp->size--; + cp->cpu_to_idx[new_cpu] = old_idx; + cp->cpu_to_idx[cpu] = IDX_INVALID; + while (old_idx > 0 && dl_time_before( + cp->elements[parent(old_idx)].dl, + cp->elements[old_idx].dl)) { + cpudl_exchange(cp, old_idx, parent(old_idx)); + old_idx = parent(old_idx); + } + cpumask_set_cpu(cpu, cp->free_cpus); + cpudl_heapify(cp, old_idx); + + goto out; + } + + if (old_idx == IDX_INVALID) { + cp->size++; + cp->elements[cp->size - 1].dl = 0; + cp->elements[cp->size - 1].cpu = cpu; + cp->cpu_to_idx[cpu] = cp->size - 1; + cpudl_change_key(cp, cp->size - 1, dl); + cpumask_clear_cpu(cpu, cp->free_cpus); + } else { + cpudl_change_key(cp, old_idx, dl); + } + +out: + raw_spin_unlock_irqrestore(&cp->lock, flags); +} + +/* + * cpudl_init - initialize the cpudl structure + * @cp: the cpudl max-heap context + */ +int cpudl_init(struct cpudl *cp) +{ + int i; + + memset(cp, 0, sizeof(*cp)); + raw_spin_lock_init(&cp->lock); + cp->size = 0; + for (i = 0; i < NR_CPUS; i++) + cp->cpu_to_idx[i] = IDX_INVALID; + if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) + return -ENOMEM; + cpumask_setall(cp->free_cpus); + + return 0; +} + +/* + * cpudl_cleanup - clean up the cpudl structure + * @cp: the cpudl max-heap context + */ +void cpudl_cleanup(struct cpudl *cp) +{ + /* + * nothing to do for the moment + */ +} diff --git a/kernel/sched_cpudl.h b/kernel/sched_cpudl.h new file mode 100644 index 0000000..9285dc4 --- /dev/null +++ b/kernel/sched_cpudl.h @@ -0,0 +1,34 @@ +#ifndef _LINUX_CPUDL_H +#define _LINUX_CPUDL_H + +#include + +#define IDX_INVALID -1 + +struct array_item { + u64 dl; + int cpu; +}; + +struct cpudl { + raw_spinlock_t lock; + int size; + int cpu_to_idx[NR_CPUS]; + struct array_item elements[NR_CPUS]; + cpumask_var_t free_cpus; +}; + + +#ifdef CONFIG_SMP +int cpudl_find(struct cpudl *cp, struct cpumask *dlo_mask, + struct task_struct *p, struct cpumask *later_mask); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); +int cpudl_init(struct cpudl *cp); +void cpudl_cleanup(struct cpudl *cp); +#else +#define cpudl_set(cp, cpu, dl) do { } while (0) +#define cpudl_init() do { } while (0) +#endif /* CONFIG_SMP */ + +#endif /* _LINUX_CPUDL_H */ + diff --git a/kernel/sched_dl.c b/kernel/sched_dl.c index fee9434..2f280b3 100644 --- a/kernel/sched_dl.c +++ b/kernel/sched_dl.c @@ -595,6 +595,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) */ dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; dl_rq->earliest_dl.curr = deadline; + cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); } else if (dl_rq->earliest_dl.next == 0 || dl_time_before(deadline, dl_rq->earliest_dl.next)) { /* @@ -618,6 +619,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (!dl_rq->dl_nr_running) { dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; + cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); } else { struct rb_node *leftmost = dl_rq->rb_leftmost; struct sched_dl_entity *entry; @@ -625,6 +627,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; dl_rq->earliest_dl.next = next_deadline(rq); + cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); } } @@ -802,9 +805,6 @@ static void yield_task_dl(struct rq *rq) #ifdef CONFIG_SMP static int find_later_rq(struct task_struct *task); -static int latest_cpu_find(struct cpumask *span, - struct task_struct *task, - struct cpumask *later_mask); static int select_task_rq_dl(struct task_struct *p, int sd_flag, int flags) @@ -852,7 +852,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->dl.nr_cpus_allowed == 1 || - latest_cpu_find(rq->rd->span, rq->curr, NULL) == -1) + cpudl_find(&rq->rd->cpudl, rq->rd->dlo_mask, rq->curr, NULL) == -1) return; /* @@ -860,7 +860,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * see if it is pushed or pulled somewhere else. */ if (p->dl.nr_cpus_allowed != 1 && - latest_cpu_find(rq->rd->span, p, NULL) != -1) + cpudl_find(&rq->rd->cpudl, rq->rd->dlo_mask, p, NULL) != -1) return; resched_task(rq->curr); @@ -1054,39 +1054,6 @@ next_node: return NULL; } -static int latest_cpu_find(struct cpumask *span, - struct task_struct *task, - struct cpumask *later_mask) -{ - const struct sched_dl_entity *dl_se = &task->dl; - int cpu, found = -1, best = 0; - u64 max_dl = 0; - - for_each_cpu(cpu, span) { - struct rq *rq = cpu_rq(cpu); - struct dl_rq *dl_rq = &rq->dl; - - if (cpumask_test_cpu(cpu, &task->cpus_allowed) && - (!dl_rq->dl_nr_running || dl_time_before(dl_se->deadline, - dl_rq->earliest_dl.curr))) { - if (later_mask) - cpumask_set_cpu(cpu, later_mask); - if (!best && !dl_rq->dl_nr_running) { - best = 1; - found = cpu; - } else if (!best && - dl_time_before(max_dl, - dl_rq->earliest_dl.curr)) { - max_dl = dl_rq->earliest_dl.curr; - found = cpu; - } - } else if (later_mask) - cpumask_clear_cpu(cpu, later_mask); - } - - return found; -} - static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); static int find_later_rq(struct task_struct *task) @@ -1099,7 +1066,9 @@ static int find_later_rq(struct task_struct *task) if (task->dl.nr_cpus_allowed == 1) return -1; - best_cpu = latest_cpu_find(task_rq(task)->rd->span, task, later_mask); + best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, + task_rq(task)->rd->dlo_mask, + task, later_mask); if (best_cpu == -1) return -1; @@ -1464,6 +1433,9 @@ static void rq_online_dl(struct rq *rq) { if (rq->dl.overloaded) dl_set_overload(rq); + + if (rq->dl.dl_nr_running > 0) + cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); } /* Assumes rq->lock is held */ @@ -1471,6 +1443,8 @@ static void rq_offline_dl(struct rq *rq) { if (rq->dl.overloaded) dl_clear_overload(rq); + + cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); } static inline void init_sched_dl_class(void) -- 1.7.5.4