All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: mingo@kernel.org, tglx@linutronix.de, juri.lelli@arm.com,
	rostedt@goodmis.org, xlpang@redhat.com
Cc: linux-kernel@vger.kernel.org, mathieu.desnoyers@efficios.com,
	jdesfossez@efficios.com, bristot@redhat.com,
	peterz@infradead.org
Subject: [RFC][PATCH 6/8] sched/rtmutex: Refactor rt_mutex_setprio()
Date: Tue, 07 Jun 2016 21:56:41 +0200	[thread overview]
Message-ID: <20160607200215.990237037@infradead.org> (raw)
In-Reply-To: 20160607195635.710022345@infradead.org

[-- Attachment #1: peterz-cleanup-rt-mutex-3.patch --]
[-- Type: text/plain, Size: 12018 bytes --]

With the introduction of SCHED_DEADLINE the whole notion that priority
is a single number is gone, therefore the @prio argument to
rt_mutex_setprio() doesn't make sense anymore.

So rework the code to pass a pi_task instead.

Note this also fixes a problem with pi_top_task caching; previously we
would not set the pointer (call rt_mutex_update_top_task) if the
priority didn't change, this could lead to a stale pointer.

As for the XXX, I think its fine to use pi_task->prio, because if it
differs from waiter->prio, a PI chain update is immenent.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched/rt.h        |   21 +-------
 kernel/locking/rtmutex.c        |  105 +++++++++++-----------------------------
 kernel/locking/rtmutex_common.h |    1 
 kernel/sched/core.c             |   66 ++++++++++++++++++++-----
 4 files changed, 88 insertions(+), 105 deletions(-)

--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -16,28 +16,17 @@ static inline int rt_task(struct task_st
 }
 
 #ifdef CONFIG_RT_MUTEXES
-extern int rt_mutex_getprio(struct task_struct *p);
-extern void rt_mutex_setprio(struct task_struct *p, int prio);
-extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
-extern void rt_mutex_update_top_task(struct task_struct *p);
-extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
+static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p)
+{
+	return p->pi_top_task;
+}
+extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
 extern void rt_mutex_adjust_pi(struct task_struct *p);
 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
 {
 	return tsk->pi_blocked_on != NULL;
 }
 #else
-static inline int rt_mutex_getprio(struct task_struct *p)
-{
-	return p->normal_prio;
-}
-
-static inline int rt_mutex_get_effective_prio(struct task_struct *task,
-					      int newprio)
-{
-	return newprio;
-}
-
 static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
 {
 	return NULL;
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -256,61 +256,16 @@ rt_mutex_dequeue_pi(struct task_struct *
 	RB_CLEAR_NODE(&waiter->pi_tree_entry);
 }
 
-void rt_mutex_update_top_task(struct task_struct *p)
+static void rt_mutex_adjust_prio(struct task_struct *p)
 {
-	if (!task_has_pi_waiters(p)) {
-		p->pi_top_task = NULL;
-		return;
-	}
+	struct task_struct *pi_task = NULL;
 
-	p->pi_top_task = task_top_pi_waiter(p)->task;
-}
-
-/*
- * Calculate task priority from the waiter tree priority
- *
- * Return task->normal_prio when the waiter tree is empty or when
- * the waiter is not allowed to do priority boosting
- */
-int rt_mutex_getprio(struct task_struct *task)
-{
-	if (likely(!task_has_pi_waiters(task)))
-		return task->normal_prio;
+	lockdep_assert_held(&p->pi_lock);
 
-	return min(task_top_pi_waiter(task)->prio,
-		   task->normal_prio);
-}
+	if (!task_has_pi_waiters(p))
+		pi_task = task_top_pi_waiter(p)->task;
 
-struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
-{
-	return task->pi_top_task;
-}
-
-/*
- * Called by sched_setscheduler() to get the priority which will be
- * effective after the change.
- */
-int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
-{
-	struct task_struct *top_task = rt_mutex_get_top_task(task);
-
-	if (!top_task)
-		return newprio;
-
-	return min(top_task->prio, newprio);
-}
-
-/*
- * Adjust the priority of a task, after its pi_waiters got modified.
- *
- * This can be both boosting and unboosting. task->pi_lock must be held.
- */
-static void __rt_mutex_adjust_prio(struct task_struct *task)
-{
-	int prio = rt_mutex_getprio(task);
-
-	if (task->prio != prio || dl_prio(prio))
-		rt_mutex_setprio(task, prio);
+	rt_mutex_setprio(p, pi_task);
 }
 
 /*
@@ -670,7 +625,7 @@ static int rt_mutex_adjust_prio_chain(st
 		 */
 		rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
 		rt_mutex_enqueue_pi(task, waiter);
-		__rt_mutex_adjust_prio(task);
+		rt_mutex_adjust_prio(task);
 
 	} else if (prerequeue_top_waiter == waiter) {
 		/*
@@ -686,7 +641,7 @@ static int rt_mutex_adjust_prio_chain(st
 		rt_mutex_dequeue_pi(task, waiter);
 		waiter = rt_mutex_top_waiter(lock);
 		rt_mutex_enqueue_pi(task, waiter);
-		__rt_mutex_adjust_prio(task);
+		rt_mutex_adjust_prio(task);
 	} else {
 		/*
 		 * Nothing changed. No need to do any priority
@@ -896,7 +851,7 @@ static int task_blocks_on_rt_mutex(struc
 		return -EDEADLK;
 
 	raw_spin_lock(&task->pi_lock);
-	__rt_mutex_adjust_prio(task);
+	rt_mutex_adjust_prio(task);
 	waiter->task = task;
 	waiter->lock = lock;
 	waiter->prio = task->prio;
@@ -918,7 +873,7 @@ static int task_blocks_on_rt_mutex(struc
 		rt_mutex_dequeue_pi(owner, top_waiter);
 		rt_mutex_enqueue_pi(owner, waiter);
 
-		__rt_mutex_adjust_prio(owner);
+		rt_mutex_adjust_prio(owner);
 		if (owner->pi_blocked_on)
 			chain_walk = 1;
 	} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -970,13 +925,14 @@ static void mark_wakeup_next_waiter(stru
 	waiter = rt_mutex_top_waiter(lock);
 
 	/*
-	 * Remove it from current->pi_waiters. We do not adjust a
-	 * possible priority boost right now. We execute wakeup in the
-	 * boosted mode and go back to normal after releasing
-	 * lock->wait_lock.
+	 * Remove it from current->pi_waiters and deboost.
+	 *
+	 * We must in fact deboost here in order to ensure we call
+	 * rt_mutex_setprio() to update p->pi_top_task before the
+	 * task unblocks.
 	 */
 	rt_mutex_dequeue_pi(current, waiter);
-	__rt_mutex_adjust_prio(current);
+	rt_mutex_adjust_prio(current);
 
 	/*
 	 * As we are waking up the top waiter, and the waiter stays
@@ -988,9 +944,19 @@ static void mark_wakeup_next_waiter(stru
 	 */
 	lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
 
-	raw_spin_unlock(&current->pi_lock);
-
+	/*
+	 * We deboosted before waking the top waiter task such that we don't
+	 * run two tasks with the 'same' priority (and ensure the
+	 * p->pi_top_task pointer points to a blocked task). This however can
+	 * lead to priority inversion if we would get preempted after the
+	 * deboost but before waking our donor task, hence the preempt_disable()
+	 * before unlock.
+	 *
+	 * Pairs with preempt_enable() in rt_mutex_postunlock();
+	 */
+	preempt_disable();
 	wake_q_add(wake_q, waiter->task);
+	raw_spin_unlock(&current->pi_lock);
 }
 
 /*
@@ -1025,7 +991,7 @@ static void remove_waiter(struct rt_mute
 	if (rt_mutex_has_waiters(lock))
 		rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
 
-	__rt_mutex_adjust_prio(owner);
+	rt_mutex_adjust_prio(owner);
 
 	/* Store the lock on which owner is blocked or NULL */
 	next_lock = task_blocked_on_lock(owner);
@@ -1064,8 +1030,7 @@ void rt_mutex_adjust_pi(struct task_stru
 	raw_spin_lock_irqsave(&task->pi_lock, flags);
 
 	waiter = task->pi_blocked_on;
-	if (!waiter || (waiter->prio == task->prio &&
-			!dl_prio(task->prio))) {
+	if (!waiter || (waiter->prio == task->prio && !dl_prio(task->prio))) {
 		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
 		return;
 	}
@@ -1316,16 +1281,6 @@ static bool __sched rt_mutex_slowunlock(
 	 */
 	mark_wakeup_next_waiter(wake_q, lock);
 
-	/*
-	 * We should deboost before waking the top waiter task such that
-	 * we don't run two tasks with the 'same' priority. This however
-	 * can lead to prio-inversion if we would get preempted after
-	 * the deboost but before waking our high-prio task, hence the
-	 * preempt_disable before unlock. Pairs with preempt_enable() in
-	 * rt_mutex_postunlock();
-	 */
-	preempt_disable();
-
 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
 	/* call rt_mutex_postunlock() */
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -112,7 +112,6 @@ extern int rt_mutex_timed_futex_lock(str
 extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
 				  struct wake_q_head *wqh);
 extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
-extern void rt_mutex_adjust_prio(struct task_struct *task);
 
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 # include "rtmutex-debug.h"
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3532,10 +3532,25 @@ EXPORT_SYMBOL(default_wake_function);
 
 #ifdef CONFIG_RT_MUTEXES
 
+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
+{
+	if (pi_task)
+		prio = min(prio, pi_task->prio);
+
+	return prio;
+}
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+	struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+	return __rt_effective_prio(pi_task, prio);
+}
+
 /*
  * rt_mutex_setprio - set the current priority of a task
- * @p: task
- * @prio: prio value (kernel-internal form)
+ * @p: task to boost
+ * @pi_task: donor task
  *
  * This function changes the 'effective' priority of a task. It does
  * not touch ->normal_prio like __setscheduler().
@@ -3543,16 +3558,40 @@ EXPORT_SYMBOL(default_wake_function);
  * Used by the rt_mutex code to implement priority inheritance
  * logic. Call site only calls if the priority of the task changed.
  */
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 {
-	int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
+	int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
 	const struct sched_class *prev_class;
 	struct rq_flags rf;
 	struct rq *rq;
 
-	BUG_ON(prio > MAX_PRIO);
+	/* XXX used to be waiter->prio, not waiter->task->prio */
+	prio = __rt_effective_prio(pi_task, p->normal_prio);
+
+	/*
+	 * If nothing changed; bail early.
+	 */
+	if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
+		return;
 
 	rq = __task_rq_lock(p, &rf);
+	/*
+	 * Set under pi_lock && rq->lock, such that the value can be used under
+	 * either lock.
+	 *
+	 * Note that there is loads of tricky to make this pointer cache work
+	 * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
+	 * ensure a task is de-boosted (pi_task is set to NULL) before the
+	 * task is allowed to run again (and can exit). This ensures the pointer
+	 * points to a blocked task -- which guaratees the task is present.
+	 */
+	p->pi_top_task = pi_task;
+
+	/*
+	 * For FIFO/RR we only need to set prio, if that matches we're done.
+	 */
+	if (prio == p->prio && !dl_prio(prio))
+		goto out_unlock;
 
 	/*
 	 * Idle task boosting is a nono in general. There is one
@@ -3572,9 +3611,7 @@ void rt_mutex_setprio(struct task_struct
 		goto out_unlock;
 	}
 
-	rt_mutex_update_top_task(p);
-
-	trace_sched_pi_setprio(p, prio);
+	trace_sched_pi_setprio(p, prio); /* broken */
 	oldprio = p->prio;
 
 	if (oldprio == prio)
@@ -3598,7 +3635,6 @@ void rt_mutex_setprio(struct task_struct
 	 *          running task
 	 */
 	if (dl_prio(prio)) {
-		struct task_struct *pi_task = rt_mutex_get_top_task(p);
 		if (!dl_prio(p->normal_prio) ||
 		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
@@ -3635,6 +3671,11 @@ void rt_mutex_setprio(struct task_struct
 	balance_callback(rq);
 	preempt_enable();
 }
+#else
+static inline int rt_effective_prio(struct task_struct p, int prio)
+{
+	return prio;
+}
 #endif
 
 void set_user_nice(struct task_struct *p, long nice)
@@ -3873,10 +3914,9 @@ static void __setscheduler(struct rq *rq
 	 * Keep a potential priority boosting if called from
 	 * sched_setscheduler().
 	 */
+	p->prio = normal_prio(p);
 	if (keep_boost)
-		p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
-	else
-		p->prio = normal_prio(p);
+		p->prio = rt_effective_prio(p, p->prio);
 
 	if (dl_prio(p->prio))
 		p->sched_class = &dl_sched_class;
@@ -4163,7 +4203,7 @@ static int __sched_setscheduler(struct t
 		 * the runqueue. This will be done when the task deboost
 		 * itself.
 		 */
-		new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+		new_effective_prio = rt_effective_prio(p, newprio);
 		if (new_effective_prio == oldprio)
 			queue_flags &= ~DEQUEUE_MOVE;
 	}

  parent reply	other threads:[~2016-06-07 20:19 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-06-07 19:56 [RFC][PATCH 0/8] PI and assorted failings Peter Zijlstra
2016-06-07 19:56 ` [RFC][PATCH 1/8] rtmutex: Deboost before waking up the top waiter Peter Zijlstra
2016-06-14  9:09   ` Juri Lelli
2016-06-14 12:54     ` Peter Zijlstra
2016-06-14 13:20       ` Juri Lelli
2016-06-14 13:59         ` Peter Zijlstra
2016-06-14 16:36     ` Davidlohr Bueso
2016-06-14 17:01       ` Juri Lelli
2016-06-14 18:22   ` Steven Rostedt
2016-06-07 19:56 ` [RFC][PATCH 2/8] sched/rtmutex/deadline: Fix a PI crash for deadline tasks Peter Zijlstra
2016-06-14 10:21   ` Juri Lelli
2016-06-14 12:30     ` Peter Zijlstra
2016-06-14 12:53     ` Xunlei Pang
2016-06-14 13:07       ` Juri Lelli
2016-06-14 16:39         ` Juri Lelli
2016-06-14 18:42   ` Steven Rostedt
2016-06-14 20:28     ` Peter Zijlstra
2016-06-15 16:14       ` Steven Rostedt
2016-06-07 19:56 ` [RFC][PATCH 3/8] sched/deadline/rtmutex: Dont miss the dl_runtime/dl_period update Peter Zijlstra
2016-06-14 10:43   ` Juri Lelli
2016-06-14 12:09     ` Peter Zijlstra
2016-06-15 16:30   ` Steven Rostedt
2016-06-15 17:55     ` Peter Zijlstra
2016-06-07 19:56 ` [RFC][PATCH 4/8] rtmutex: Remove rt_mutex_fastunlock() Peter Zijlstra
2016-06-15 16:43   ` Steven Rostedt
2016-06-07 19:56 ` [RFC][PATCH 5/8] rtmutex: Clean up Peter Zijlstra
2016-06-14 12:08   ` Juri Lelli
2016-06-14 12:32     ` Peter Zijlstra
2016-06-14 12:41       ` Juri Lelli
2016-06-07 19:56 ` Peter Zijlstra [this message]
2016-06-14 13:14   ` [RFC][PATCH 6/8] sched/rtmutex: Refactor rt_mutex_setprio() Juri Lelli
2016-06-14 14:08     ` Peter Zijlstra
2016-06-07 19:56 ` [RFC][PATCH 7/8] sched,tracing: Update trace_sched_pi_setprio() Peter Zijlstra
2016-06-07 19:56 ` [RFC][PATCH 8/8] rtmutex: Fix PI chain order integrity Peter Zijlstra
2016-06-14 17:39   ` Juri Lelli
2016-06-14 19:44     ` Peter Zijlstra
2016-06-15  7:25       ` Juri Lelli
2016-06-27 12:23         ` Peter Zijlstra
2016-06-27 12:40           ` Thomas Gleixner
2016-06-28  9:05           ` Juri Lelli

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20160607200215.990237037@infradead.org \
    --to=peterz@infradead.org \
    --cc=bristot@redhat.com \
    --cc=jdesfossez@efficios.com \
    --cc=juri.lelli@arm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mathieu.desnoyers@efficios.com \
    --cc=mingo@kernel.org \
    --cc=rostedt@goodmis.org \
    --cc=tglx@linutronix.de \
    --cc=xlpang@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.