From: Peter Zijlstra <peterz@infradead.org>
To: mingo@kernel.org, tglx@linutronix.de, juri.lelli@arm.com,
rostedt@goodmis.org, xlpang@redhat.com
Cc: linux-kernel@vger.kernel.org, mathieu.desnoyers@efficios.com,
jdesfossez@efficios.com, bristot@redhat.com,
peterz@infradead.org
Subject: [RFC][PATCH 6/8] sched/rtmutex: Refactor rt_mutex_setprio()
Date: Tue, 07 Jun 2016 21:56:41 +0200 [thread overview]
Message-ID: <20160607200215.990237037@infradead.org> (raw)
In-Reply-To: 20160607195635.710022345@infradead.org
[-- Attachment #1: peterz-cleanup-rt-mutex-3.patch --]
[-- Type: text/plain, Size: 12018 bytes --]
With the introduction of SCHED_DEADLINE the whole notion that priority
is a single number is gone, therefore the @prio argument to
rt_mutex_setprio() doesn't make sense anymore.
So rework the code to pass a pi_task instead.
Note this also fixes a problem with pi_top_task caching; previously we
would not set the pointer (call rt_mutex_update_top_task) if the
priority didn't change, this could lead to a stale pointer.
As for the XXX, I think its fine to use pi_task->prio, because if it
differs from waiter->prio, a PI chain update is immenent.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/sched/rt.h | 21 +-------
kernel/locking/rtmutex.c | 105 +++++++++++-----------------------------
kernel/locking/rtmutex_common.h | 1
kernel/sched/core.c | 66 ++++++++++++++++++++-----
4 files changed, 88 insertions(+), 105 deletions(-)
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -16,28 +16,17 @@ static inline int rt_task(struct task_st
}
#ifdef CONFIG_RT_MUTEXES
-extern int rt_mutex_getprio(struct task_struct *p);
-extern void rt_mutex_setprio(struct task_struct *p, int prio);
-extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
-extern void rt_mutex_update_top_task(struct task_struct *p);
-extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
+static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p)
+{
+ return p->pi_top_task;
+}
+extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
extern void rt_mutex_adjust_pi(struct task_struct *p);
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
{
return tsk->pi_blocked_on != NULL;
}
#else
-static inline int rt_mutex_getprio(struct task_struct *p)
-{
- return p->normal_prio;
-}
-
-static inline int rt_mutex_get_effective_prio(struct task_struct *task,
- int newprio)
-{
- return newprio;
-}
-
static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
{
return NULL;
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -256,61 +256,16 @@ rt_mutex_dequeue_pi(struct task_struct *
RB_CLEAR_NODE(&waiter->pi_tree_entry);
}
-void rt_mutex_update_top_task(struct task_struct *p)
+static void rt_mutex_adjust_prio(struct task_struct *p)
{
- if (!task_has_pi_waiters(p)) {
- p->pi_top_task = NULL;
- return;
- }
+ struct task_struct *pi_task = NULL;
- p->pi_top_task = task_top_pi_waiter(p)->task;
-}
-
-/*
- * Calculate task priority from the waiter tree priority
- *
- * Return task->normal_prio when the waiter tree is empty or when
- * the waiter is not allowed to do priority boosting
- */
-int rt_mutex_getprio(struct task_struct *task)
-{
- if (likely(!task_has_pi_waiters(task)))
- return task->normal_prio;
+ lockdep_assert_held(&p->pi_lock);
- return min(task_top_pi_waiter(task)->prio,
- task->normal_prio);
-}
+ if (!task_has_pi_waiters(p))
+ pi_task = task_top_pi_waiter(p)->task;
-struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
-{
- return task->pi_top_task;
-}
-
-/*
- * Called by sched_setscheduler() to get the priority which will be
- * effective after the change.
- */
-int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
-{
- struct task_struct *top_task = rt_mutex_get_top_task(task);
-
- if (!top_task)
- return newprio;
-
- return min(top_task->prio, newprio);
-}
-
-/*
- * Adjust the priority of a task, after its pi_waiters got modified.
- *
- * This can be both boosting and unboosting. task->pi_lock must be held.
- */
-static void __rt_mutex_adjust_prio(struct task_struct *task)
-{
- int prio = rt_mutex_getprio(task);
-
- if (task->prio != prio || dl_prio(prio))
- rt_mutex_setprio(task, prio);
+ rt_mutex_setprio(p, pi_task);
}
/*
@@ -670,7 +625,7 @@ static int rt_mutex_adjust_prio_chain(st
*/
rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
rt_mutex_enqueue_pi(task, waiter);
- __rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(task);
} else if (prerequeue_top_waiter == waiter) {
/*
@@ -686,7 +641,7 @@ static int rt_mutex_adjust_prio_chain(st
rt_mutex_dequeue_pi(task, waiter);
waiter = rt_mutex_top_waiter(lock);
rt_mutex_enqueue_pi(task, waiter);
- __rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(task);
} else {
/*
* Nothing changed. No need to do any priority
@@ -896,7 +851,7 @@ static int task_blocks_on_rt_mutex(struc
return -EDEADLK;
raw_spin_lock(&task->pi_lock);
- __rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
waiter->prio = task->prio;
@@ -918,7 +873,7 @@ static int task_blocks_on_rt_mutex(struc
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
- __rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(owner);
if (owner->pi_blocked_on)
chain_walk = 1;
} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -970,13 +925,14 @@ static void mark_wakeup_next_waiter(stru
waiter = rt_mutex_top_waiter(lock);
/*
- * Remove it from current->pi_waiters. We do not adjust a
- * possible priority boost right now. We execute wakeup in the
- * boosted mode and go back to normal after releasing
- * lock->wait_lock.
+ * Remove it from current->pi_waiters and deboost.
+ *
+ * We must in fact deboost here in order to ensure we call
+ * rt_mutex_setprio() to update p->pi_top_task before the
+ * task unblocks.
*/
rt_mutex_dequeue_pi(current, waiter);
- __rt_mutex_adjust_prio(current);
+ rt_mutex_adjust_prio(current);
/*
* As we are waking up the top waiter, and the waiter stays
@@ -988,9 +944,19 @@ static void mark_wakeup_next_waiter(stru
*/
lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
- raw_spin_unlock(¤t->pi_lock);
-
+ /*
+ * We deboosted before waking the top waiter task such that we don't
+ * run two tasks with the 'same' priority (and ensure the
+ * p->pi_top_task pointer points to a blocked task). This however can
+ * lead to priority inversion if we would get preempted after the
+ * deboost but before waking our donor task, hence the preempt_disable()
+ * before unlock.
+ *
+ * Pairs with preempt_enable() in rt_mutex_postunlock();
+ */
+ preempt_disable();
wake_q_add(wake_q, waiter->task);
+ raw_spin_unlock(¤t->pi_lock);
}
/*
@@ -1025,7 +991,7 @@ static void remove_waiter(struct rt_mute
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
- __rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(owner);
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
@@ -1064,8 +1030,7 @@ void rt_mutex_adjust_pi(struct task_stru
raw_spin_lock_irqsave(&task->pi_lock, flags);
waiter = task->pi_blocked_on;
- if (!waiter || (waiter->prio == task->prio &&
- !dl_prio(task->prio))) {
+ if (!waiter || (waiter->prio == task->prio && !dl_prio(task->prio))) {
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
return;
}
@@ -1316,16 +1281,6 @@ static bool __sched rt_mutex_slowunlock(
*/
mark_wakeup_next_waiter(wake_q, lock);
- /*
- * We should deboost before waking the top waiter task such that
- * we don't run two tasks with the 'same' priority. This however
- * can lead to prio-inversion if we would get preempted after
- * the deboost but before waking our high-prio task, hence the
- * preempt_disable before unlock. Pairs with preempt_enable() in
- * rt_mutex_postunlock();
- */
- preempt_disable();
-
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/* call rt_mutex_postunlock() */
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -112,7 +112,6 @@ extern int rt_mutex_timed_futex_lock(str
extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
struct wake_q_head *wqh);
extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
-extern void rt_mutex_adjust_prio(struct task_struct *task);
#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3532,10 +3532,25 @@ EXPORT_SYMBOL(default_wake_function);
#ifdef CONFIG_RT_MUTEXES
+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
+{
+ if (pi_task)
+ prio = min(prio, pi_task->prio);
+
+ return prio;
+}
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+ return __rt_effective_prio(pi_task, prio);
+}
+
/*
* rt_mutex_setprio - set the current priority of a task
- * @p: task
- * @prio: prio value (kernel-internal form)
+ * @p: task to boost
+ * @pi_task: donor task
*
* This function changes the 'effective' priority of a task. It does
* not touch ->normal_prio like __setscheduler().
@@ -3543,16 +3558,40 @@ EXPORT_SYMBOL(default_wake_function);
* Used by the rt_mutex code to implement priority inheritance
* logic. Call site only calls if the priority of the task changed.
*/
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
- int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
+ int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
const struct sched_class *prev_class;
struct rq_flags rf;
struct rq *rq;
- BUG_ON(prio > MAX_PRIO);
+ /* XXX used to be waiter->prio, not waiter->task->prio */
+ prio = __rt_effective_prio(pi_task, p->normal_prio);
+
+ /*
+ * If nothing changed; bail early.
+ */
+ if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
+ return;
rq = __task_rq_lock(p, &rf);
+ /*
+ * Set under pi_lock && rq->lock, such that the value can be used under
+ * either lock.
+ *
+ * Note that there is loads of tricky to make this pointer cache work
+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
+ * ensure a task is de-boosted (pi_task is set to NULL) before the
+ * task is allowed to run again (and can exit). This ensures the pointer
+ * points to a blocked task -- which guaratees the task is present.
+ */
+ p->pi_top_task = pi_task;
+
+ /*
+ * For FIFO/RR we only need to set prio, if that matches we're done.
+ */
+ if (prio == p->prio && !dl_prio(prio))
+ goto out_unlock;
/*
* Idle task boosting is a nono in general. There is one
@@ -3572,9 +3611,7 @@ void rt_mutex_setprio(struct task_struct
goto out_unlock;
}
- rt_mutex_update_top_task(p);
-
- trace_sched_pi_setprio(p, prio);
+ trace_sched_pi_setprio(p, prio); /* broken */
oldprio = p->prio;
if (oldprio == prio)
@@ -3598,7 +3635,6 @@ void rt_mutex_setprio(struct task_struct
* running task
*/
if (dl_prio(prio)) {
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
@@ -3635,6 +3671,11 @@ void rt_mutex_setprio(struct task_struct
balance_callback(rq);
preempt_enable();
}
+#else
+static inline int rt_effective_prio(struct task_struct p, int prio)
+{
+ return prio;
+}
#endif
void set_user_nice(struct task_struct *p, long nice)
@@ -3873,10 +3914,9 @@ static void __setscheduler(struct rq *rq
* Keep a potential priority boosting if called from
* sched_setscheduler().
*/
+ p->prio = normal_prio(p);
if (keep_boost)
- p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
- else
- p->prio = normal_prio(p);
+ p->prio = rt_effective_prio(p, p->prio);
if (dl_prio(p->prio))
p->sched_class = &dl_sched_class;
@@ -4163,7 +4203,7 @@ static int __sched_setscheduler(struct t
* the runqueue. This will be done when the task deboost
* itself.
*/
- new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+ new_effective_prio = rt_effective_prio(p, newprio);
if (new_effective_prio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
next prev parent reply other threads:[~2016-06-07 20:19 UTC|newest]
Thread overview: 40+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-06-07 19:56 [RFC][PATCH 0/8] PI and assorted failings Peter Zijlstra
2016-06-07 19:56 ` [RFC][PATCH 1/8] rtmutex: Deboost before waking up the top waiter Peter Zijlstra
2016-06-14 9:09 ` Juri Lelli
2016-06-14 12:54 ` Peter Zijlstra
2016-06-14 13:20 ` Juri Lelli
2016-06-14 13:59 ` Peter Zijlstra
2016-06-14 16:36 ` Davidlohr Bueso
2016-06-14 17:01 ` Juri Lelli
2016-06-14 18:22 ` Steven Rostedt
2016-06-07 19:56 ` [RFC][PATCH 2/8] sched/rtmutex/deadline: Fix a PI crash for deadline tasks Peter Zijlstra
2016-06-14 10:21 ` Juri Lelli
2016-06-14 12:30 ` Peter Zijlstra
2016-06-14 12:53 ` Xunlei Pang
2016-06-14 13:07 ` Juri Lelli
2016-06-14 16:39 ` Juri Lelli
2016-06-14 18:42 ` Steven Rostedt
2016-06-14 20:28 ` Peter Zijlstra
2016-06-15 16:14 ` Steven Rostedt
2016-06-07 19:56 ` [RFC][PATCH 3/8] sched/deadline/rtmutex: Dont miss the dl_runtime/dl_period update Peter Zijlstra
2016-06-14 10:43 ` Juri Lelli
2016-06-14 12:09 ` Peter Zijlstra
2016-06-15 16:30 ` Steven Rostedt
2016-06-15 17:55 ` Peter Zijlstra
2016-06-07 19:56 ` [RFC][PATCH 4/8] rtmutex: Remove rt_mutex_fastunlock() Peter Zijlstra
2016-06-15 16:43 ` Steven Rostedt
2016-06-07 19:56 ` [RFC][PATCH 5/8] rtmutex: Clean up Peter Zijlstra
2016-06-14 12:08 ` Juri Lelli
2016-06-14 12:32 ` Peter Zijlstra
2016-06-14 12:41 ` Juri Lelli
2016-06-07 19:56 ` Peter Zijlstra [this message]
2016-06-14 13:14 ` [RFC][PATCH 6/8] sched/rtmutex: Refactor rt_mutex_setprio() Juri Lelli
2016-06-14 14:08 ` Peter Zijlstra
2016-06-07 19:56 ` [RFC][PATCH 7/8] sched,tracing: Update trace_sched_pi_setprio() Peter Zijlstra
2016-06-07 19:56 ` [RFC][PATCH 8/8] rtmutex: Fix PI chain order integrity Peter Zijlstra
2016-06-14 17:39 ` Juri Lelli
2016-06-14 19:44 ` Peter Zijlstra
2016-06-15 7:25 ` Juri Lelli
2016-06-27 12:23 ` Peter Zijlstra
2016-06-27 12:40 ` Thomas Gleixner
2016-06-28 9:05 ` Juri Lelli
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20160607200215.990237037@infradead.org \
--to=peterz@infradead.org \
--cc=bristot@redhat.com \
--cc=jdesfossez@efficios.com \
--cc=juri.lelli@arm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mathieu.desnoyers@efficios.com \
--cc=mingo@kernel.org \
--cc=rostedt@goodmis.org \
--cc=tglx@linutronix.de \
--cc=xlpang@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).