The bandwidth enforcement mechanism implemented inside the SCHED_DEADLINE
policy ensures that overrunning tasks are slowed down without interfering
with well-behaved ones. This, however, comes at the price of limiting the
capability of a task to exploit more bandwidth than it is assigned.

The current implementation always stops a task that is trying to use
more than its runtime (every deadline). An alternative is to let it
continue running, but with a "decreased priority". This way, we can
exploit the full CPU bandwidth and still avoid interference.

In order to "decrease the priority" of a -deadline task, we can:
 - let it stay SCHED_DEADLINE and postpone its deadline. This way it
   will always be scheduled before -rt and -other tasks, but it won't
   affect other -deadline tasks;
 - put it in SCHED_FIFO with some priority. This way it will always be
   scheduled before -other tasks, but it won't affect -deadline tasks,
   nor -rt tasks with higher priority;
 - put it in SCHED_OTHER.

Notice also that this can be done on a per-task basis, e.g., each task
can specify what kind of reclaiming mechanism it wants to use by means
of the sched_flags field of sched_param_ex.

Therefore, this patch:
 - adds the flags for specifying DEADLINE, RT or OTHER reclaiming
   behaviour;
 - adds the logic that changes the scheduling class of a task when it
   overruns, according to the requested policy.

Signed-off-by: Dario Faggioli
---
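Not part of the patch, just to illustrate how a task is expected to pick
its reclaiming behaviour from userspace. The sched_setscheduler_ex()
wrapper, the SCHED_DEADLINE policy value and the full sched_param_ex
layout below are assumptions sketched around this series; only the
SF_BWRECL_* flags and the sched_priority/sched_deadline/sched_flags
fields come from the patch itself.

/*
 * Illustrative userspace sketch, not part of the patch.
 * ASSUMPTIONS: the sched_setscheduler_ex() wrapper, the SCHED_DEADLINE
 * value and the exact sched_param_ex layout are placeholders; check the
 * headers shipped with the series before relying on any of this.
 */
#include <string.h>
#include <time.h>
#include <sys/types.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6		/* assumed policy value */
#endif

#define SF_BWRECL_RT	16		/* from this patch: reclaim as -rt */

struct sched_param_ex {			/* assumed layout */
	int		sched_priority;
	struct timespec	sched_runtime;
	struct timespec	sched_deadline;
	struct timespec	sched_period;
	unsigned int	sched_flags;
};

/* assumed wrapper around the sched_setscheduler_ex() syscall */
extern int sched_setscheduler_ex(pid_t pid, int policy, unsigned int len,
				 struct sched_param_ex *param);

static int make_me_reclaiming(void)
{
	struct sched_param_ex p;

	memset(&p, 0, sizeof(p));
	p.sched_runtime.tv_nsec  =  5000000;	/* 5 ms of runtime...   */
	p.sched_deadline.tv_nsec = 20000000;	/* ...every 20 ms       */
	p.sched_period = p.sched_deadline;

	/* On runtime overrun, keep running as SCHED_FIFO at priority 10. */
	p.sched_flags    = SF_BWRECL_RT;
	p.sched_priority = 10;

	return sched_setscheduler_ex(0, SCHED_DEADLINE, sizeof(p), &p);
}

With SF_BWRECL_DL or SF_BWRECL_NR instead, the task would keep the CPU by
postponing its deadline or by falling back to SCHED_OTHER, respectively,
as described above.
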
 include/linux/sched.h |   25 ++++++++++++++
 kernel/hrtimer.c      |    2 +-
 kernel/sched.c        |   86 ++++++++++++++++++++++++++++++++-----------------
 kernel/sched_debug.c  |    2 +-
 kernel/sched_dl.c     |   44 +++++++++++++++++++++++--
 5 files changed, 123 insertions(+), 36 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b729f83..8806c1f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -172,10 +172,26 @@ struct sched_param_ex {
  *               a runtime overrun occurs;
  * @SF_SIG_DMISS tells us the task wants to be notified whenever
  *               a scheduling deadline is missed.
+ * @SF_BWRECL_DL tells us that the task doesn't stop when exhausting
+ *               its runtime, and it remains a -deadline task, even
+ *               though its deadline is postponed. This means it
+ *               won't affect the scheduling of the other -deadline
+ *               tasks, but if it is a CPU-hog, lower scheduling
+ *               classes will starve!
+ * @SF_BWRECL_RT tells us that the task doesn't stop when exhausting
+ *               its runtime, and it becomes a -rt task, with the
+ *               priority specified in the sched_priority field of
+ *               struct sched_param_ex.
+ * @SF_BWRECL_NR tells us that the task doesn't stop when exhausting
+ *               its runtime, and it becomes a normal task, with
+ *               default priority.
 */
 #define SF_HEAD		1
 #define SF_SIG_RORUN	2
 #define SF_SIG_DMISS	4
+#define SF_BWRECL_DL	8
+#define SF_BWRECL_RT	16
+#define SF_BWRECL_NR	32

 struct exec_domain;
 struct futex_pi_state;
@@ -1694,6 +1710,15 @@ static inline int dl_task(struct task_struct *p)
 	return dl_prio(p->prio);
 }

+/*
+ * We might have temporarily dropped -deadline policy,
+ * but still be a -deadline task!
+ */
+static inline int __dl_task(struct task_struct *p)
+{
+	return dl_task(p) || p->policy == SCHED_DEADLINE;
+}
+
 static inline int rt_prio(int prio)
 {
 	if (unlikely(prio >= MAX_DL_PRIO && prio < MAX_RT_PRIO))
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9cd8564..54277be 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1574,7 +1574,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 		unsigned long slack;

 		slack = current->timer_slack_ns;
-		if (dl_task(current) || rt_task(current))
+		if (__dl_task(current) || rt_task(current))
 			slack = 0;

 		hrtimer_init_on_stack(&t.timer, clockid, mode);
diff --git a/kernel/sched.c b/kernel/sched.c
index 79cac6e..4d291e3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2235,7 +2235,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 #endif

 	trace_sched_migrate_task(p, new_cpu);
-	if (unlikely(dl_task(p)))
+	if (unlikely(__dl_task(p)))
 		trace_sched_migrate_task_dl(p, task_rq(p)->clock,
 					    new_cpu, cpu_rq(new_cpu)->clock);

@@ -2983,6 +2983,16 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 		prev->sched_class->task_dead(prev);

 	/*
+	 * If we are a -deadline task, dying while
+	 * hanging out in a different scheduling class,
+	 * we need to manually call our own cleanup function,
+	 * at least to stop the bandwidth timer.
+	 */
+	if (unlikely(task_has_dl_policy(prev) &&
+		     prev->sched_class != &dl_sched_class))
+		dl_sched_class.task_dead(prev);
+
+	/*
 	 * Remove function-return probe instances associated with this
 	 * task and put them back on the free list.
 	 */
@@ -3064,7 +3074,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	prepare_task_switch(rq, prev, next);
 	trace_sched_switch(prev, next);
-	if (unlikely(dl_task(prev) || dl_task(next)))
+	if (unlikely(__dl_task(prev) || __dl_task(next)))
 		trace_sched_switch_dl(rq->clock, prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
@@ -4554,34 +4564,13 @@ long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
 }
 EXPORT_SYMBOL(sleep_on_timeout);

-#ifdef CONFIG_RT_MUTEXES
-
-/*
- * rt_mutex_setprio - set the current priority of a task
- * @p: task
- * @prio: prio value (kernel-internal form)
- *
- * This function changes the 'effective' priority of a task. It does
- * not touch ->normal_prio like __setscheduler().
- *
- * Used by the rt_mutex code to implement priority inheritance logic.
- */
-void rt_mutex_setprio(struct task_struct *p, int prio)
+static void __setprio(struct rq *rq, struct task_struct *p, int prio)
 {
-	unsigned long flags;
-	int oldprio, on_rq, running;
-	struct rq *rq;
-	const struct sched_class *prev_class;
-
-	BUG_ON(prio < 0 || prio > MAX_PRIO);
+	int oldprio = p->prio;
+	const struct sched_class *prev_class = p->sched_class;
+	int running = task_current(rq, p);
+	int on_rq = p->se.on_rq;

-	rq = task_rq_lock(p, &flags);
-
-	trace_sched_pi_setprio(p, prio);
-	oldprio = p->prio;
-	prev_class = p->sched_class;
-	on_rq = p->se.on_rq;
-	running = task_current(rq, p);
 	if (on_rq)
 		dequeue_task(rq, p, 0);
 	if (running)
@@ -4603,6 +4592,30 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		check_class_changed(rq, p, prev_class, oldprio, running);
 	}
+}
+
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(struct task_struct *p, int prio)
+{
+	unsigned long flags;
+	struct rq *rq;
+
+	BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+	rq = task_rq_lock(p, &flags);
+	trace_sched_pi_setprio(p, prio);
+	__setprio(rq, p, prio);
 	task_rq_unlock(rq, &flags);
 }
@@ -4909,19 +4922,32 @@ recheck:
 	 */
 	if (user && !capable(CAP_SYS_NICE)) {
 		if (dl_policy(policy)) {
-			u64 rlim_dline, rlim_rtime;
+			u64 rlim_dline, rlim_rtime, rlim_rtprio;
 			u64 dline, rtime;

 			if (!lock_task_sighand(p, &flags))
 				return -ESRCH;
 			rlim_dline = p->signal->rlim[RLIMIT_DLDLINE].rlim_cur;
 			rlim_rtime = p->signal->rlim[RLIMIT_DLRTIME].rlim_cur;
+			rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
 			unlock_task_sighand(p, &flags);

 			/* can't set/change -deadline policy */
 			if (policy != p->policy && !rlim_rtime)
 				return -EPERM;

+			/* can't set/change reclaiming policy to -deadline */
+			if ((param_ex->sched_flags & SF_BWRECL_DL) !=
+			    (p->dl.flags & SF_BWRECL_DL))
+				return -EPERM;
+
+			/* can't set/increase -rt reclaiming priority */
+			if (param_ex->sched_flags & SF_BWRECL_RT &&
+			    (param_ex->sched_priority <= 0 ||
+			     (param_ex->sched_priority > p->rt_priority &&
+			      param_ex->sched_priority > rlim_rtprio)))
+				return -EPERM;
+
 			/* can't decrease the deadline */
 			rlim_dline *= NSEC_PER_USEC;
 			dline = timespec_to_ns(&param_ex->sched_deadline);
@@ -8596,7 +8622,7 @@ void normalize_rt_tasks(void)
 		p->se.statistics.block_start	= 0;
 #endif

-		if (!dl_task(p) && !rt_task(p)) {
+		if (!__dl_task(p) && !rt_task(p)) {
 			/*
 			 * Renice negative nice level userspace
 			 * tasks back to 0:
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4949a21..2bf4e72 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -467,7 +467,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	P(se.statistics.nr_wakeups_affine_attempts);
 	P(se.statistics.nr_wakeups_passive);
 	P(se.statistics.nr_wakeups_idle);
-	if (dl_task(p)) {
+	if (__dl_task(p)) {
 		P(dl.stats.dmiss);
 		PN(dl.stats.last_dmiss);
 		PN(dl.stats.dmiss_max);
diff --git a/kernel/sched_dl.c b/kernel/sched_dl.c
index eff183a..4d24109 100644
--- a/kernel/sched_dl.c
+++ b/kernel/sched_dl.c
@@ -15,6 +15,8 @@
  *                    Fabio Checconi
  */
+static const struct sched_class dl_sched_class;
+
 static inline int dl_time_before(u64 a, u64 b)
 {
 	return (s64)(a - b) < 0;
 }
@@ -382,6 +384,17 @@ static int start_dl_timer(struct sched_dl_entity *dl_se)
 	s64 delta;

 	/*
+	 * If the task wants to stay -deadline even if it exhausted
+	 * its runtime we allow that by not starting the timer.
+	 * update_curr_dl() will thus queue it back after replenishment
+	 * and deadline postponing.
+	 * This won't affect the other -deadline tasks, but if we are
+	 * a CPU-hog, lower scheduling classes will starve!
+	 */
+	if (dl_se->flags & SF_BWRECL_DL)
+		return 0;
+
+	/*
 	 * We want the timer to fire at the deadline, but considering
 	 * that it is actually coming from rq->clock and not from
 	 * hrtimer's time base reading.
@@ -414,6 +427,8 @@ static int start_dl_timer(struct sched_dl_entity *dl_se)
 	return hrtimer_active(&dl_se->dl_timer);
 }

+static void __setprio(struct rq *rq, struct task_struct *p, int prio);
+
 /*
  * This is the bandwidth enforcement timer callback. If here, we know
  * a task is not on its dl_rq, since the fact that the timer was running
@@ -440,12 +455,18 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	 * We need to take care of a possible races here. In fact, the
 	 * task might have changed its scheduling policy to something
 	 * different from SCHED_DEADLINE (through sched_setscheduler()).
+	 * However, if we changed scheduling class for reclaiming, it
+	 * is correct to handle this replenishment, since this is what
+	 * will put us back into the -deadline scheduling class.
 	 */
-	if (!dl_task(p))
+	if (!__dl_task(p))
 		goto unlock;

 	trace_sched_timer_dl(p, rq->clock, p->se.on_rq, task_current(rq, p));

+	if (unlikely(p->sched_class != &dl_sched_class))
+		__setprio(rq, p, MAX_DL_PRIO-1);
+
 	dl_se->dl_throttled = 0;
 	if (p->se.on_rq) {
 		enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
@@ -530,6 +551,16 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
 	return 1;
 }

+static inline void throttle_curr_dl(struct rq *rq, struct task_struct *curr)
+{
+	curr->dl.dl_throttled = 1;
+
+	if (curr->dl.flags & SF_BWRECL_RT)
+		__setprio(rq, curr, MAX_RT_PRIO-1 - curr->rt_priority);
+	else if (curr->dl.flags & SF_BWRECL_NR)
+		__setprio(rq, curr, DEFAULT_PRIO);
+}
+
 /*
  * Update the current task's runtime statistics (provided it is still
  * a -deadline task and has not been removed from the dl_rq).
@@ -565,7 +596,7 @@ static void update_curr_dl(struct rq *rq)
 	if (dl_runtime_exceeded(rq, dl_se)) {
 		__dequeue_task_dl(rq, curr, 0);
 		if (likely(start_dl_timer(dl_se)))
-			dl_se->dl_throttled = 1;
+			throttle_curr_dl(rq, curr);
 		else
 			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);

@@ -765,8 +796,10 @@ static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)

 static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
-	update_curr_dl(rq);
-	__dequeue_task_dl(rq, p, flags);
+	if (likely(!p->dl.dl_throttled)) {
+		update_curr_dl(rq);
+		__dequeue_task_dl(rq, p, flags);
+	}
 }

 /*
@@ -1000,6 +1033,9 @@ struct task_struct *pick_next_task_dl(struct rq *rq)

 static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
 {
+	if (unlikely(p->dl.dl_throttled))
+		return;
+
 	update_curr_dl(rq);
 	p->se.exec_start = 0;
-- 
1.7.2.3

-- 
<> (Raistlin Majere)
----------------------------------------------------------------------
Dario Faggioli, ReTiS Lab, Scuola Superiore Sant'Anna, Pisa (Italy)
http://blog.linux.it/raistlin / raistlin@ekiga.net /
dario.faggioli@jabber.org