All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: mingo@kernel.org, tglx@linutronix.de, juri.lelli@arm.com,
	rostedt@goodmis.org, xlpang@redhat.com, bigeasy@linutronix.de
Cc: linux-kernel@vger.kernel.org, mathieu.desnoyers@efficios.com,
	jdesfossez@efficios.com, bristot@redhat.com,
	peterz@infradead.org
Subject: [PATCH -v3 5/8] sched/rtmutex: Refactor rt_mutex_setprio()
Date: Thu, 23 Mar 2017 15:56:11 +0100	[thread overview]
Message-ID: <20170323150216.303827095@infradead.org> (raw)
In-Reply-To: 20170323145606.480214279@infradead.org

[-- Attachment #1: peterz-cleanup-rt-mutex-3.patch --]
[-- Type: text/plain, Size: 11839 bytes --]

With the introduction of SCHED_DEADLINE the whole notion that priority
is a single number is gone, therefore the @prio argument to
rt_mutex_setprio() doesn't make sense anymore.

So rework the code to pass a pi_task instead.

Note this also fixes a problem with pi_top_task caching; previously we
would not set the pointer (call rt_mutex_update_top_task) if the
priority didn't change, this could lead to a stale pointer.

As for the XXX, I think its fine to use pi_task->prio, because if it
differs from waiter->prio, a PI chain update is immenent.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched/rt.h |   24 +++-------
 kernel/locking/rtmutex.c |  112 ++++++++++++-----------------------------------
 kernel/sched/core.c      |   66 ++++++++++++++++++++++-----
 3 files changed, 91 insertions(+), 111 deletions(-)

--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -18,28 +18,20 @@ static inline int rt_task(struct task_st
 }
 
 #ifdef CONFIG_RT_MUTEXES
-extern int rt_mutex_getprio(struct task_struct *p);
-extern void rt_mutex_setprio(struct task_struct *p, int prio);
-extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
-extern void rt_mutex_update_top_task(struct task_struct *p);
-extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
+/*
+ * Must hold either p->pi_lock or task_rq(p)->lock.
+ */
+static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p)
+{
+	return p->pi_top_task;
+}
+extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
 extern void rt_mutex_adjust_pi(struct task_struct *p);
 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
 {
 	return tsk->pi_blocked_on != NULL;
 }
 #else
-static inline int rt_mutex_getprio(struct task_struct *p)
-{
-	return p->normal_prio;
-}
-
-static inline int rt_mutex_get_effective_prio(struct task_struct *task,
-					      int newprio)
-{
-	return newprio;
-}
-
 static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
 {
 	return NULL;
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -322,67 +322,16 @@ rt_mutex_dequeue_pi(struct task_struct *
 	RB_CLEAR_NODE(&waiter->pi_tree_entry);
 }
 
-/*
- * Must hold both p->pi_lock and task_rq(p)->lock.
- */
-void rt_mutex_update_top_task(struct task_struct *p)
-{
-	if (!task_has_pi_waiters(p)) {
-		p->pi_top_task = NULL;
-		return;
-	}
-
-	p->pi_top_task = task_top_pi_waiter(p)->task;
-}
-
-/*
- * Calculate task priority from the waiter tree priority
- *
- * Return task->normal_prio when the waiter tree is empty or when
- * the waiter is not allowed to do priority boosting
- */
-int rt_mutex_getprio(struct task_struct *task)
-{
-	if (likely(!task_has_pi_waiters(task)))
-		return task->normal_prio;
-
-	return min(task_top_pi_waiter(task)->prio,
-		   task->normal_prio);
-}
-
-/*
- * Must hold either p->pi_lock or task_rq(p)->lock.
- */
-struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
-{
-	return task->pi_top_task;
-}
-
-/*
- * Called by sched_setscheduler() to get the priority which will be
- * effective after the change.
- */
-int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
+static void rt_mutex_adjust_prio(struct task_struct *p)
 {
-	struct task_struct *top_task = rt_mutex_get_top_task(task);
+	struct task_struct *pi_task = NULL;
 
-	if (!top_task)
-		return newprio;
+	lockdep_assert_held(&p->pi_lock);
 
-	return min(top_task->prio, newprio);
-}
+	if (task_has_pi_waiters(p))
+		pi_task = task_top_pi_waiter(p)->task;
 
-/*
- * Adjust the priority of a task, after its pi_waiters got modified.
- *
- * This can be both boosting and unboosting. task->pi_lock must be held.
- */
-static void __rt_mutex_adjust_prio(struct task_struct *task)
-{
-	int prio = rt_mutex_getprio(task);
-
-	if (task->prio != prio || dl_prio(prio))
-		rt_mutex_setprio(task, prio);
+	rt_mutex_setprio(p, pi_task);
 }
 
 /*
@@ -742,7 +691,7 @@ static int rt_mutex_adjust_prio_chain(st
 		 */
 		rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
 		rt_mutex_enqueue_pi(task, waiter);
-		__rt_mutex_adjust_prio(task);
+		rt_mutex_adjust_prio(task);
 
 	} else if (prerequeue_top_waiter == waiter) {
 		/*
@@ -758,7 +707,7 @@ static int rt_mutex_adjust_prio_chain(st
 		rt_mutex_dequeue_pi(task, waiter);
 		waiter = rt_mutex_top_waiter(lock);
 		rt_mutex_enqueue_pi(task, waiter);
-		__rt_mutex_adjust_prio(task);
+		rt_mutex_adjust_prio(task);
 	} else {
 		/*
 		 * Nothing changed. No need to do any priority
@@ -966,7 +915,7 @@ static int task_blocks_on_rt_mutex(struc
 		return -EDEADLK;
 
 	raw_spin_lock(&task->pi_lock);
-	__rt_mutex_adjust_prio(task);
+	rt_mutex_adjust_prio(task);
 	waiter->task = task;
 	waiter->lock = lock;
 	waiter->prio = task->prio;
@@ -988,7 +937,7 @@ static int task_blocks_on_rt_mutex(struc
 		rt_mutex_dequeue_pi(owner, top_waiter);
 		rt_mutex_enqueue_pi(owner, waiter);
 
-		__rt_mutex_adjust_prio(owner);
+		rt_mutex_adjust_prio(owner);
 		if (owner->pi_blocked_on)
 			chain_walk = 1;
 	} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -1040,13 +989,14 @@ static void mark_wakeup_next_waiter(stru
 	waiter = rt_mutex_top_waiter(lock);
 
 	/*
-	 * Remove it from current->pi_waiters. We do not adjust a
-	 * possible priority boost right now. We execute wakeup in the
-	 * boosted mode and go back to normal after releasing
-	 * lock->wait_lock.
+	 * Remove it from current->pi_waiters and deboost.
+	 *
+	 * We must in fact deboost here in order to ensure we call
+	 * rt_mutex_setprio() to update p->pi_top_task before the
+	 * task unblocks.
 	 */
 	rt_mutex_dequeue_pi(current, waiter);
-	__rt_mutex_adjust_prio(current);
+	rt_mutex_adjust_prio(current);
 
 	/*
 	 * As we are waking up the top waiter, and the waiter stays
@@ -1058,9 +1008,19 @@ static void mark_wakeup_next_waiter(stru
 	 */
 	lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
 
-	raw_spin_unlock(&current->pi_lock);
-
+	/*
+	 * We deboosted before waking the top waiter task such that we don't
+	 * run two tasks with the 'same' priority (and ensure the
+	 * p->pi_top_task pointer points to a blocked task). This however can
+	 * lead to priority inversion if we would get preempted after the
+	 * deboost but before waking our donor task, hence the preempt_disable()
+	 * before unlock.
+	 *
+	 * Pairs with preempt_enable() in rt_mutex_postunlock();
+	 */
+	preempt_disable();
 	wake_q_add(wake_q, waiter->task);
+	raw_spin_unlock(&current->pi_lock);
 }
 
 /*
@@ -1095,7 +1055,7 @@ static void remove_waiter(struct rt_mute
 	if (rt_mutex_has_waiters(lock))
 		rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
 
-	__rt_mutex_adjust_prio(owner);
+	rt_mutex_adjust_prio(owner);
 
 	/* Store the lock on which owner is blocked or NULL */
 	next_lock = task_blocked_on_lock(owner);
@@ -1134,8 +1094,7 @@ void rt_mutex_adjust_pi(struct task_stru
 	raw_spin_lock_irqsave(&task->pi_lock, flags);
 
 	waiter = task->pi_blocked_on;
-	if (!waiter || (waiter->prio == task->prio &&
-			!dl_prio(task->prio))) {
+	if (!waiter || (waiter->prio == task->prio && !dl_prio(task->prio))) {
 		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
 		return;
 	}
@@ -1389,17 +1348,6 @@ static bool __sched rt_mutex_slowunlock(
 	 * Queue the next waiter for wakeup once we release the wait_lock.
 	 */
 	mark_wakeup_next_waiter(wake_q, lock);
-
-	/*
-	 * We should deboost before waking the top waiter task such that
-	 * we don't run two tasks with the 'same' priority. This however
-	 * can lead to prio-inversion if we would get preempted after
-	 * the deboost but before waking our high-prio task, hence the
-	 * preempt_disable before unlock. Pairs with preempt_enable() in
-	 * rt_mutex_postunlock();
-	 */
-	preempt_disable();
-
 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
 	return true; /* call rt_mutex_postunlock() */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3674,10 +3674,25 @@ EXPORT_SYMBOL(default_wake_function);
 
 #ifdef CONFIG_RT_MUTEXES
 
+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
+{
+	if (pi_task)
+		prio = min(prio, pi_task->prio);
+
+	return prio;
+}
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+	struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+	return __rt_effective_prio(pi_task, prio);
+}
+
 /*
  * rt_mutex_setprio - set the current priority of a task
- * @p: task
- * @prio: prio value (kernel-internal form)
+ * @p: task to boost
+ * @pi_task: donor task
  *
  * This function changes the 'effective' priority of a task. It does
  * not touch ->normal_prio like __setscheduler().
@@ -3685,18 +3700,42 @@ EXPORT_SYMBOL(default_wake_function);
  * Used by the rt_mutex code to implement priority inheritance
  * logic. Call site only calls if the priority of the task changed.
  */
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 {
-	int oldprio, queued, running, queue_flag =
+	int prio, oldprio, queued, running, queue_flag =
 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 	const struct sched_class *prev_class;
 	struct rq_flags rf;
 	struct rq *rq;
 
-	BUG_ON(prio > MAX_PRIO);
+	/* XXX used to be waiter->prio, not waiter->task->prio */
+	prio = __rt_effective_prio(pi_task, p->normal_prio);
+
+	/*
+	 * If nothing changed; bail early.
+	 */
+	if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
+		return;
 
 	rq = __task_rq_lock(p, &rf);
 	update_rq_clock(rq);
+	/*
+	 * Set under pi_lock && rq->lock, such that the value can be used under
+	 * either lock.
+	 *
+	 * Note that there is loads of tricky to make this pointer cache work
+	 * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
+	 * ensure a task is de-boosted (pi_task is set to NULL) before the
+	 * task is allowed to run again (and can exit). This ensures the pointer
+	 * points to a blocked task -- which guaratees the task is present.
+	 */
+	p->pi_top_task = pi_task;
+
+	/*
+	 * For FIFO/RR we only need to set prio, if that matches we're done.
+	 */
+	if (prio == p->prio && !dl_prio(prio))
+		goto out_unlock;
 
 	/*
 	 * Idle task boosting is a nono in general. There is one
@@ -3716,9 +3755,7 @@ void rt_mutex_setprio(struct task_struct
 		goto out_unlock;
 	}
 
-	rt_mutex_update_top_task(p);
-
-	trace_sched_pi_setprio(p, prio);
+	trace_sched_pi_setprio(p, prio); /* broken */
 	oldprio = p->prio;
 
 	if (oldprio == prio)
@@ -3742,7 +3779,6 @@ void rt_mutex_setprio(struct task_struct
 	 *          running task
 	 */
 	if (dl_prio(prio)) {
-		struct task_struct *pi_task = rt_mutex_get_top_task(p);
 		if (!dl_prio(p->normal_prio) ||
 		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
@@ -3780,6 +3816,11 @@ void rt_mutex_setprio(struct task_struct
 	balance_callback(rq);
 	preempt_enable();
 }
+#else
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+	return prio;
+}
 #endif
 
 void set_user_nice(struct task_struct *p, long nice)
@@ -4026,10 +4067,9 @@ static void __setscheduler(struct rq *rq
 	 * Keep a potential priority boosting if called from
 	 * sched_setscheduler().
 	 */
+	p->prio = normal_prio(p);
 	if (keep_boost)
-		p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
-	else
-		p->prio = normal_prio(p);
+		p->prio = rt_effective_prio(p, p->prio);
 
 	if (dl_prio(p->prio))
 		p->sched_class = &dl_sched_class;
@@ -4316,7 +4356,7 @@ static int __sched_setscheduler(struct t
 		 * the runqueue. This will be done when the task deboost
 		 * itself.
 		 */
-		new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+		new_effective_prio = rt_effective_prio(p, newprio);
 		if (new_effective_prio == oldprio)
 			queue_flags &= ~DEQUEUE_MOVE;
 	}

  parent reply	other threads:[~2017-03-23 15:06 UTC|newest]

Thread overview: 46+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-03-23 14:56 [PATCH -v3 0/8] PI vs SCHED_DEADLINE fixes Peter Zijlstra
2017-03-23 14:56 ` [PATCH -v3 1/8] rtmutex: Deboost before waking up the top waiter Peter Zijlstra
2017-04-04  9:48   ` [tip:locking/core] " tip-bot for Xunlei Pang
2017-04-05  8:08     ` Mike Galbraith
2017-04-05 14:55       ` [tip:locking/core] Retiplockingcore_rtmutex_Deboost_before_waking_up_the_top_waiter tip-bot for Mike Galbraith
2017-04-05 15:03       ` [tip:locking/core] rtmutex: Plug preempt count leak in rt_mutex_futex_unlock() tip-bot for Mike Galbraith
2017-04-06  6:16       ` [tip:locking/core] rtmutex: Deboost before waking up the top waiter Xunlei Pang
2017-03-23 14:56 ` [PATCH -v3 2/8] sched/rtmutex/deadline: Fix a PI crash for deadline tasks Peter Zijlstra
2017-04-04  9:49   ` [tip:locking/core] " tip-bot for Xunlei Pang
2017-03-23 14:56 ` [PATCH -v3 3/8] sched/deadline/rtmutex: Dont miss the dl_runtime/dl_period update Peter Zijlstra
2017-04-04  9:50   ` [tip:locking/core] " tip-bot for Xunlei Pang
2017-03-23 14:56 ` [PATCH -v3 4/8] rtmutex: Clean up Peter Zijlstra
2017-03-23 18:21   ` Steven Rostedt
2017-04-04  9:50   ` [tip:locking/core] " tip-bot for Peter Zijlstra
2017-03-23 14:56 ` Peter Zijlstra [this message]
2017-04-04  9:51   ` [tip:locking/core] sched/rtmutex: Refactor rt_mutex_setprio() tip-bot for Peter Zijlstra
2017-03-23 14:56 ` [PATCH -v3 6/8] sched,tracing: Update trace_sched_pi_setprio() Peter Zijlstra
2017-04-04  9:51   ` [tip:locking/core] " tip-bot for Peter Zijlstra
2017-03-23 14:56 ` [PATCH -v3 7/8] rtmutex: Fix PI chain order integrity Peter Zijlstra
2017-04-04  9:52   ` [tip:locking/core] " tip-bot for Peter Zijlstra
2017-03-23 14:56 ` [PATCH -v3 8/8] rtmutex: Fix more prio comparisons Peter Zijlstra
2017-04-04  9:52   ` [tip:locking/core] " tip-bot for Peter Zijlstra
  -- strict thread matches above, loose matches on Subject: below --
2016-04-14 11:37 [PATCH v3 0/6] sched/deadline/rtmutex: Fix two deadline PI issues Xunlei Pang
2016-04-14 11:37 ` [PATCH v3 1/6] rtmutex: Deboost before waking up the top waiter Xunlei Pang
2016-04-18  8:23   ` Thomas Gleixner
2016-04-18  8:44     ` Xunlei Pang
2016-04-18  9:02       ` Thomas Gleixner
2016-04-18  9:41         ` Xunlei Pang
2016-04-20 12:20         ` Peter Zijlstra
2016-04-20 12:43           ` Thomas Gleixner
2016-04-20 13:10             ` Peter Zijlstra
2016-04-14 11:37 ` [PATCH v3 2/6] sched/rtmutex/deadline: Fix a PI crash for deadline tasks Xunlei Pang
2016-04-20 13:19   ` Peter Zijlstra
2016-04-20 13:49     ` Xunlei Pang
2016-04-22  3:26       ` Xunlei Pang
2016-04-14 11:37 ` [PATCH v3 3/6] rtmutex: Move "rt_mutex_waiter" definition to "include/linux/rtmutex.h" Xunlei Pang
2016-04-14 11:37 ` [PATCH v3 4/6] sched: Move dl_policy() to "include/linux/sched.h" Xunlei Pang
2016-04-14 11:37 ` [PATCH v3 5/6] sched/deadline/rtmutex: Fix unprotected PI access in enqueue_task_dl() Xunlei Pang
2016-04-14 15:31   ` Peter Zijlstra
2016-04-15  1:58     ` Xunlei Pang
2016-04-15  2:19       ` Xunlei Pang
2016-04-20 12:25         ` Peter Zijlstra
2016-04-20 13:00           ` Xunlei Pang
2016-04-20 13:17             ` Peter Zijlstra
2016-04-20 13:45               ` Xunlei Pang
2016-04-14 11:37 ` [PATCH v3 6/6] sched/deadline/rtmutex: Don't miss the dl_runtime/dl_period update Xunlei Pang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170323150216.303827095@infradead.org \
    --to=peterz@infradead.org \
    --cc=bigeasy@linutronix.de \
    --cc=bristot@redhat.com \
    --cc=jdesfossez@efficios.com \
    --cc=juri.lelli@arm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mathieu.desnoyers@efficios.com \
    --cc=mingo@kernel.org \
    --cc=rostedt@goodmis.org \
    --cc=tglx@linutronix.de \
    --cc=xlpang@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.