* [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
@ 2016-04-01 11:00 Xunlei Pang
  2016-04-01 11:38 ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: Xunlei Pang @ 2016-04-01 11:00 UTC (permalink / raw)
  To: linux-kernel
  Cc: Peter Zijlstra, Juri Lelli, Ingo Molnar, Steven Rostedt, Xunlei Pang

I found a kernel crash while playing with deadline PI rtmutex.

    BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
    IP: [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
    PGD 232a75067 PUD 230947067 PMD 0
    Oops: 0000 [#1] SMP
    CPU: 1 PID: 10994 Comm: a.out Not tainted

    Call Trace:
    [<ffffffff810cf8aa>] ? enqueue_task_dl+0x2a/0x320
    [<ffffffff810b658c>] enqueue_task+0x2c/0x80
    [<ffffffff810ba763>] activate_task+0x23/0x30
    [<ffffffff810d0ab5>] pull_dl_task+0x1d5/0x260
    [<ffffffff810d0be6>] pre_schedule_dl+0x16/0x20
    [<ffffffff8164e783>] __schedule+0xd3/0x900
    [<ffffffff8164efd9>] schedule+0x29/0x70
    [<ffffffff8165035b>] __rt_mutex_slowlock+0x4b/0xc0
    [<ffffffff81650501>] rt_mutex_slowlock+0xd1/0x190
    [<ffffffff810eeb33>] rt_mutex_timed_lock+0x53/0x60
    [<ffffffff810ecbfc>] futex_lock_pi.isra.18+0x28c/0x390
    [<ffffffff810cfa15>] ? enqueue_task_dl+0x195/0x320
    [<ffffffff810d0bac>] ? prio_changed_dl+0x6c/0x90
    [<ffffffff810ed8b0>] do_futex+0x190/0x5b0
    [<ffffffff810edd50>] SyS_futex+0x80/0x180
    [<ffffffff8165a089>] system_call_fastpath+0x16/0x1b
    RIP  [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30

This is because rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi()
are only protected by pi_lock when operating pi waiters, while
rt_mutex_get_top_task() will access them with rq lock held but
not holding pi_lock.
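
Roughly, the interleaving looks like this (an illustrative sketch pieced
together from the backtrace and the functions involved, not verbatim
kernel code):

    CPU0 (a waiter, holds p->pi_lock,        CPU1 (pull_dl_task(), holds
          but not p's rq->lock)                    rq->lock, not p->pi_lock)

    rt_mutex_dequeue_pi(p, waiter)
      p->pi_waiters_leftmost = rb_next(...);
      rb_erase(..., &p->pi_waiters);         enqueue_task_dl(rq, p, flags)
                                               rt_mutex_get_top_task(p)
                                                 task_top_pi_waiter(p)->task
                                                 reads the waiter tree while
                                                 CPU0 rewrites it -> NULL deref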

It's hard for rt_mutex_get_top_task() to hold pi_lock, so the
patch ensures rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi()
lock rq when operating "pi_waiters" and "pi_waiters_leftmost".
We need this iff lock owner has the deadline priority.

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
---
 include/linux/sched/deadline.h |  3 +++
 kernel/locking/rtmutex.c       | 18 ++++++++++++++++++
 kernel/sched/deadline.c        | 17 +++++++++++++++++
 3 files changed, 38 insertions(+)

diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 9089a2a..3083f6b 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -26,4 +26,7 @@ static inline bool dl_time_before(u64 a, u64 b)
 	return (s64)(a - b) < 0;
 }
 
+extern void *dl_pi_waiters_lock(struct task_struct *p);
+extern void dl_pi_waiters_unlock(void *lockdata);
+
 #endif /* _SCHED_DEADLINE_H */
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 3e74660..0fb247a 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -224,6 +224,7 @@ rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
 	struct rb_node *parent = NULL;
 	struct rt_mutex_waiter *entry;
 	int leftmost = 1;
+	void *dl_lockdata = NULL;
 
 	while (*link) {
 		parent = *link;
@@ -236,24 +237,38 @@ rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
 		}
 	}
 
+	if (dl_task(task))
+		dl_lockdata = dl_pi_waiters_lock(task);
+
 	if (leftmost)
 		task->pi_waiters_leftmost = &waiter->pi_tree_entry;
 
 	rb_link_node(&waiter->pi_tree_entry, parent, link);
 	rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
+
+	if (dl_lockdata)
+		dl_pi_waiters_unlock(dl_lockdata);
 }
 
 static void
 rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
 {
+	void *dl_lockdata = NULL;
+
 	if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
 		return;
 
+	if (dl_task(task))
+		dl_lockdata = dl_pi_waiters_lock(task);
+
 	if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
 		task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
 
 	rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
 	RB_CLEAR_NODE(&waiter->pi_tree_entry);
+
+	if (dl_lockdata)
+		dl_pi_waiters_unlock(dl_lockdata);
 }
 
 /*
@@ -271,6 +286,9 @@ int rt_mutex_getprio(struct task_struct *task)
 		   task->normal_prio);
 }
 
+/*
+ * rq->lock of @task must be held.
+ */
 struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
 {
 	if (likely(!task_has_pi_waiters(task)))
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a3048fa..7b8aa93 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -926,6 +926,23 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
 	__dequeue_dl_entity(dl_se);
 }
 
+/*
+ * dl_pi_waiters_lock()/dl_pi_waiters_unlock() are needed by
+ * rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi() to protect
+ * PI waiters accessed by rt_mutex_get_top_task().
+ */
+void *dl_pi_waiters_lock(struct task_struct *p)
+{
+	lockdep_assert_held(&p->pi_lock);
+
+	return __task_rq_lock(p);
+}
+
+void dl_pi_waiters_unlock(void *lockdata)
+{
+	__task_rq_unlock((struct rq *)lockdata);
+}
+
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct task_struct *pi_task = rt_mutex_get_top_task(p);
-- 
1.8.3.1


* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-01 11:00 [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks Xunlei Pang
@ 2016-04-01 11:38 ` Peter Zijlstra
  2016-04-01 12:23   ` Xunlei Pang
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2016-04-01 11:38 UTC (permalink / raw)
  To: Xunlei Pang; +Cc: linux-kernel, Juri Lelli, Ingo Molnar, Steven Rostedt

On Fri, Apr 01, 2016 at 07:00:18PM +0800, Xunlei Pang wrote:
> I found a kernel crash while playing with deadline PI rtmutex.
> 
>     BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
>     IP: [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
>     PGD 232a75067 PUD 230947067 PMD 0
>     Oops: 0000 [#1] SMP
>     CPU: 1 PID: 10994 Comm: a.out Not tainted
> 
>     Call Trace:
>     [<ffffffff810cf8aa>] ? enqueue_task_dl+0x2a/0x320
>     [<ffffffff810b658c>] enqueue_task+0x2c/0x80
>     [<ffffffff810ba763>] activate_task+0x23/0x30
>     [<ffffffff810d0ab5>] pull_dl_task+0x1d5/0x260
>     [<ffffffff810d0be6>] pre_schedule_dl+0x16/0x20
>     [<ffffffff8164e783>] __schedule+0xd3/0x900
>     [<ffffffff8164efd9>] schedule+0x29/0x70
>     [<ffffffff8165035b>] __rt_mutex_slowlock+0x4b/0xc0
>     [<ffffffff81650501>] rt_mutex_slowlock+0xd1/0x190
>     [<ffffffff810eeb33>] rt_mutex_timed_lock+0x53/0x60
>     [<ffffffff810ecbfc>] futex_lock_pi.isra.18+0x28c/0x390
>     [<ffffffff810cfa15>] ? enqueue_task_dl+0x195/0x320
>     [<ffffffff810d0bac>] ? prio_changed_dl+0x6c/0x90
>     [<ffffffff810ed8b0>] do_futex+0x190/0x5b0
>     [<ffffffff810edd50>] SyS_futex+0x80/0x180
>     [<ffffffff8165a089>] system_call_fastpath+0x16/0x1b
>     RIP  [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
> 
> This is because rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi()
> are only protected by pi_lock when operating pi waiters, while
> rt_mutex_get_top_task() will access them with rq lock held but
> not holding pi_lock.
> 
> It's hard for rt_mutex_get_top_task() to hold pi_lock, so the
> patch ensures rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi()
> lock rq when operating "pi_waiters" and "pi_waiters_leftmost".
> We need this iff lock owner has the deadline priority.

How is this deadline specific? Those functions you modify are
deadline/rt agnostic.


* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-01 11:38 ` Peter Zijlstra
@ 2016-04-01 12:23   ` Xunlei Pang
  2016-04-01 13:12     ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: Xunlei Pang @ 2016-04-01 12:23 UTC (permalink / raw)
  To: Peter Zijlstra, Xunlei Pang
  Cc: linux-kernel, Juri Lelli, Ingo Molnar, Steven Rostedt

On 2016/04/01 at 19:38, Peter Zijlstra wrote:
> On Fri, Apr 01, 2016 at 07:00:18PM +0800, Xunlei Pang wrote:
>> I found a kernel crash while playing with deadline PI rtmutex.
>>
>>     BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
>>     IP: [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
>>     PGD 232a75067 PUD 230947067 PMD 0
>>     Oops: 0000 [#1] SMP
>>     CPU: 1 PID: 10994 Comm: a.out Not tainted
>>
>>     Call Trace:
>>     [<ffffffff810cf8aa>] ? enqueue_task_dl+0x2a/0x320
>>     [<ffffffff810b658c>] enqueue_task+0x2c/0x80
>>     [<ffffffff810ba763>] activate_task+0x23/0x30
>>     [<ffffffff810d0ab5>] pull_dl_task+0x1d5/0x260
>>     [<ffffffff810d0be6>] pre_schedule_dl+0x16/0x20
>>     [<ffffffff8164e783>] __schedule+0xd3/0x900
>>     [<ffffffff8164efd9>] schedule+0x29/0x70
>>     [<ffffffff8165035b>] __rt_mutex_slowlock+0x4b/0xc0
>>     [<ffffffff81650501>] rt_mutex_slowlock+0xd1/0x190
>>     [<ffffffff810eeb33>] rt_mutex_timed_lock+0x53/0x60
>>     [<ffffffff810ecbfc>] futex_lock_pi.isra.18+0x28c/0x390
>>     [<ffffffff810cfa15>] ? enqueue_task_dl+0x195/0x320
>>     [<ffffffff810d0bac>] ? prio_changed_dl+0x6c/0x90
>>     [<ffffffff810ed8b0>] do_futex+0x190/0x5b0
>>     [<ffffffff810edd50>] SyS_futex+0x80/0x180
>>     [<ffffffff8165a089>] system_call_fastpath+0x16/0x1b
>>     RIP  [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
>>
>> This is because rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi()
>> are only protected by pi_lock when operating pi waiters, while
>> rt_mutex_get_top_task() will access them with rq lock held but
>> not holding pi_lock.
>>
>> It's hard for rt_mutex_get_top_task() to hold pi_lock, so the
>> patch ensures rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi()
>> lock rq when operating "pi_waiters" and "pi_waiters_leftmost".
>> We need this iff lock owner has the deadline priority.
> How is this deadline specific, those functions you modify are
> deadline/rt agnostic.

I checked the code: currently only deadline accesses pi_waiters/pi_waiters_leftmost
without pi_lock held, via rt_mutex_get_top_task(); all other cases have pi_lock held.

Hence the dl_task() condition.

Regards,
Xunlei


* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-01 12:23   ` Xunlei Pang
@ 2016-04-01 13:12     ` Peter Zijlstra
  2016-04-01 13:34       ` Xunlei Pang
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2016-04-01 13:12 UTC (permalink / raw)
  To: xlpang, Xunlei Pang; +Cc: linux-kernel, Juri Lelli, Ingo Molnar, Steven Rostedt



On 1 April 2016 14:23:58 CEST, Xunlei Pang <xpang@redhat.com> wrote:

>>> We need this iff lock owner has the deadline priority.
>> How is this deadline specific, those functions you modify are
>> deadline/rt agnostic.
>
>I checked the code, currently only deadline accesses the
>pi_waiters/pi_waiters_leftmost
>without pi_lock held via rt_mutex_get_top_task(), other cases all have
>pi_lock held.
>
>So adding the condition.

How does that not suggest fixing the deadline code?
-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.


* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-01 13:12     ` Peter Zijlstra
@ 2016-04-01 13:34       ` Xunlei Pang
  2016-04-01 21:51         ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: Xunlei Pang @ 2016-04-01 13:34 UTC (permalink / raw)
  To: Peter Zijlstra, xlpang
  Cc: linux-kernel, Juri Lelli, Ingo Molnar, Steven Rostedt

On 2016/04/01 at 21:12, Peter Zijlstra wrote:
>
> On 1 April 2016 14:23:58 CEST, Xunlei Pang <xpang@redhat.com> wrote:
>
>>>> We need this iff lock owner has the deadline priority.
>>> How is this deadline specific, those functions you modify are
>>> deadline/rt agnostic.
>> I checked the code, currently only deadline accesses the
>> pi_waiters/pi_waiters_leftmost
>> without pi_lock held via rt_mutex_get_top_task(), other cases all have
>> pi_lock held.
>>
>> So adding the condition.
> How does that not suggest fixing the deadline code?

I did try that at first, but found it very hard when processing
pull_dl_task() (push_dl_task() can crash as well), for example:

   double_lock_balance(this_rq, src_rq);
   p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
   /*
    * For each @p we would have to hold its pi_lock, but taking it
    * once the rq is already locked can deadlock.
    */

Ditto for enqueue_task_dl()->rt_mutex_get_top_task(), as the rq is already locked.

If we unlock the rq first and then take pi_lock, that may cause other problems
due to having dropped the rq lock.
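
The underlying trouble is the lock ordering: the established nesting is
p->pi_lock outside rq->lock, so the pull path would have to invert it.
An illustrative sketch only:

   /* Normal nesting, e.g. task_rq_lock() / rt_mutex_setprio(): */
   raw_spin_lock_irqsave(&p->pi_lock, flags);
   rq = __task_rq_lock(p);                    /* pi_lock, then rq->lock */

   /* What the pull path would need to do: */
   double_lock_balance(this_rq, src_rq);      /* rq locks already held */
   p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
   raw_spin_lock(&p->pi_lock);                /* rq->lock, then pi_lock: inverted */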

Any better ideas are welcome.

Regards,
Xunlei


* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-01 13:34       ` Xunlei Pang
@ 2016-04-01 21:51         ` Peter Zijlstra
  2016-04-02 10:19           ` Xunlei Pang
  2016-04-05  8:38           ` Xunlei Pang
  0 siblings, 2 replies; 25+ messages in thread
From: Peter Zijlstra @ 2016-04-01 21:51 UTC (permalink / raw)
  To: xlpang
  Cc: linux-kernel, Juri Lelli, Ingo Molnar, Steven Rostedt, Thomas Gleixner

On Fri, Apr 01, 2016 at 09:34:24PM +0800, Xunlei Pang wrote:

> >> I checked the code, currently only deadline accesses the
> >> pi_waiters/pi_waiters_leftmost
> >> without pi_lock held via rt_mutex_get_top_task(), other cases all have
> >> pi_lock held.

> Any better ideas is welcome.

Something like the below _might_ work; but it's late and I haven't looked
at the PI code in a while. This basically caches a pointer to the top
waiter task in the running task_struct, under pi_lock and rq->lock, and
therefore we can use it with only rq->lock held.

Since the task is blocked, and cannot unblock without taking itself from
the block chain -- which would cause rt_mutex_setprio() to set another
top waiter task, the lifetime rules should be good.

Having this top waiter pointer around might also be useful to further
implement bandwidth inheritance or such, but I've not thought about that
too much.

Lots of code deleted because we move the entire task->prio mapping muck
into the scheduler, because we now pass a pi_task, not a prio.

---
 include/linux/sched.h    |  1 +
 include/linux/sched/rt.h | 20 +-------------------
 kernel/locking/rtmutex.c | 45 +++++----------------------------------------
 kernel/sched/core.c      | 34 +++++++++++++++++++++++-----------
 kernel/sched/deadline.c  |  2 +-
 5 files changed, 31 insertions(+), 71 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 52c4847b05e2..30169f38cb24 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1615,6 +1615,7 @@ struct task_struct {
 
 	/* Protection of the PI data structures: */
 	raw_spinlock_t pi_lock;
+	struct task_struct *pi_task;
 
 	struct wake_q_node wake_q;
 
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index a30b172df6e1..0a8f071b16e3 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -16,31 +16,13 @@ static inline int rt_task(struct task_struct *p)
 }
 
 #ifdef CONFIG_RT_MUTEXES
-extern int rt_mutex_getprio(struct task_struct *p);
-extern void rt_mutex_setprio(struct task_struct *p, int prio);
-extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
-extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
+extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
 extern void rt_mutex_adjust_pi(struct task_struct *p);
 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
 {
 	return tsk->pi_blocked_on != NULL;
 }
 #else
-static inline int rt_mutex_getprio(struct task_struct *p)
-{
-	return p->normal_prio;
-}
-
-static inline int rt_mutex_get_effective_prio(struct task_struct *task,
-					      int newprio)
-{
-	return newprio;
-}
-
-static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
-{
-	return NULL;
-}
 # define rt_mutex_adjust_pi(p)		do { } while (0)
 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
 {
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 3e746607abe5..13b6b5922d3c 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -257,53 +257,18 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
 }
 
 /*
- * Calculate task priority from the waiter tree priority
- *
- * Return task->normal_prio when the waiter tree is empty or when
- * the waiter is not allowed to do priority boosting
- */
-int rt_mutex_getprio(struct task_struct *task)
-{
-	if (likely(!task_has_pi_waiters(task)))
-		return task->normal_prio;
-
-	return min(task_top_pi_waiter(task)->prio,
-		   task->normal_prio);
-}
-
-struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
-{
-	if (likely(!task_has_pi_waiters(task)))
-		return NULL;
-
-	return task_top_pi_waiter(task)->task;
-}
-
-/*
- * Called by sched_setscheduler() to get the priority which will be
- * effective after the change.
- */
-int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
-{
-	if (!task_has_pi_waiters(task))
-		return newprio;
-
-	if (task_top_pi_waiter(task)->task->prio <= newprio)
-		return task_top_pi_waiter(task)->task->prio;
-	return newprio;
-}
-
-/*
  * Adjust the priority of a task, after its pi_waiters got modified.
  *
  * This can be both boosting and unboosting. task->pi_lock must be held.
  */
 static void __rt_mutex_adjust_prio(struct task_struct *task)
 {
-	int prio = rt_mutex_getprio(task);
+	struct task_struct *pi_task = task;
+
+	if (unlikely(task_has_pi_waiters(task)))
+		pi_task = task_top_pi_waiter(task)->task;
 
-	if (task->prio != prio || dl_prio(prio))
-		rt_mutex_setprio(task, prio);
+	rt_mutex_setprio(task, pi_task);
 }
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8b489fcac37b..7d7e3a0eaeb0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3392,7 +3392,7 @@ EXPORT_SYMBOL(default_wake_function);
 /*
  * rt_mutex_setprio - set the current priority of a task
  * @p: task
- * @prio: prio value (kernel-internal form)
+ * @pi_task: top waiter, donating state
  *
  * This function changes the 'effective' priority of a task. It does
  * not touch ->normal_prio like __setscheduler().
@@ -3400,13 +3400,20 @@ EXPORT_SYMBOL(default_wake_function);
  * Used by the rt_mutex code to implement priority inheritance
  * logic. Call site only calls if the priority of the task changed.
  */
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 {
-	int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
-	struct rq *rq;
+	int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
 	const struct sched_class *prev_class;
+	struct rq *rq;
 
-	BUG_ON(prio > MAX_PRIO);
+	/*
+	 * For FIFO/RR we simply donate prio; for DL things are
+	 * more interesting.
+	 */
+	/* XXX used to be waiter->prio, not waiter->task->prio */
+	prio = min(pi_task->prio, p->normal_prio);
+	if (p->prio == prio && !dl_prio(prio))
+		return;
 
 	rq = __task_rq_lock(p);
 
@@ -3442,6 +3449,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	if (running)
 		put_prev_task(rq, p);
 
+	if (pi_task == p)
+		pi_task = NULL;
+	p->pi_task = pi_task;
+
 	/*
 	 * Boosting condition are:
 	 * 1. -rt task is running and holds mutex A
@@ -3452,7 +3463,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	 *          running task
 	 */
 	if (dl_prio(prio)) {
-		struct task_struct *pi_task = rt_mutex_get_top_task(p);
 		if (!dl_prio(p->normal_prio) ||
 		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
@@ -3727,10 +3737,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
 	 * Keep a potential priority boosting if called from
 	 * sched_setscheduler().
 	 */
-	if (keep_boost)
-		p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
-	else
-		p->prio = normal_prio(p);
+	p->prio = normal_prio(p);
+	if (keep_boost && p->pi_task)
+		p->prio = min(p->prio, p->pi_task->prio);
 
 	if (dl_prio(p->prio))
 		p->sched_class = &dl_sched_class;
@@ -4017,7 +4026,10 @@ static int __sched_setscheduler(struct task_struct *p,
 		 * the runqueue. This will be done when the task deboost
 		 * itself.
 		 */
-		new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+		new_effective_prio = newprio;
+		if (p->pi_task)
+			new_effective_prio = min(new_effective_prio, p->pi_task->prio);
+
 		if (new_effective_prio == oldprio)
 			queue_flags &= ~DEQUEUE_MOVE;
 	}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index affd97ec9f65..6c5aa4612eb6 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -932,7 +932,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
-	struct task_struct *pi_task = rt_mutex_get_top_task(p);
+	struct task_struct *pi_task = p->pi_task;
 	struct sched_dl_entity *pi_se = &p->dl;
 
 	/*


* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-01 21:51         ` Peter Zijlstra
@ 2016-04-02 10:19           ` Xunlei Pang
  2016-04-05  8:38           ` Xunlei Pang
  1 sibling, 0 replies; 25+ messages in thread
From: Xunlei Pang @ 2016-04-02 10:19 UTC (permalink / raw)
  To: Peter Zijlstra, xlpang
  Cc: linux-kernel, Juri Lelli, Ingo Molnar, Steven Rostedt, Thomas Gleixner

On 2016/04/02 at 05:51, Peter Zijlstra wrote:
> On Fri, Apr 01, 2016 at 09:34:24PM +0800, Xunlei Pang wrote:
>
>>>> I checked the code, currently only deadline accesses the
>>>> pi_waiters/pi_waiters_leftmost
>>>> without pi_lock held via rt_mutex_get_top_task(), other cases all have
>>>> pi_lock held.
>> Any better ideas is welcome.
> Something like the below _might_ work; but its late and I haven't looked
> at the PI code in a while. This basically caches a pointer to the top
> waiter task in the running task_struct, under pi_lock and rq->lock, and
> therefore we can use it with only rq->lock held.
>
> Since the task is blocked, and cannot unblock without taking itself from
> the block chain -- which would cause rt_mutex_setprio() to set another
> top waiter task, the lifetime rules should be good.
>
> Having this top waiter pointer around might also be useful to further
> implement bandwidth inheritance or such, but I've not thought about that
> too much.
>
> Lots of code deleted because we move the entire task->prio mapping muck
> into the scheduler, because we now pass a pi_task, not a prio.

I'll try to understand your patch, and do a test later. Thanks!

Regards,
Xunlei

> ---
>  include/linux/sched.h    |  1 +
>  include/linux/sched/rt.h | 20 +-------------------
>  kernel/locking/rtmutex.c | 45 +++++----------------------------------------
>  kernel/sched/core.c      | 34 +++++++++++++++++++++++-----------
>  kernel/sched/deadline.c  |  2 +-
>  5 files changed, 31 insertions(+), 71 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 52c4847b05e2..30169f38cb24 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1615,6 +1615,7 @@ struct task_struct {
>  
>  	/* Protection of the PI data structures: */
>  	raw_spinlock_t pi_lock;
> +	struct task_struct *pi_task;
>  
>  	struct wake_q_node wake_q;
>  
> diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
> index a30b172df6e1..0a8f071b16e3 100644
> --- a/include/linux/sched/rt.h
> +++ b/include/linux/sched/rt.h
> @@ -16,31 +16,13 @@ static inline int rt_task(struct task_struct *p)
>  }
>  
>  #ifdef CONFIG_RT_MUTEXES
> -extern int rt_mutex_getprio(struct task_struct *p);
> -extern void rt_mutex_setprio(struct task_struct *p, int prio);
> -extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
> -extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
> +extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
>  extern void rt_mutex_adjust_pi(struct task_struct *p);
>  static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
>  {
>  	return tsk->pi_blocked_on != NULL;
>  }
>  #else
> -static inline int rt_mutex_getprio(struct task_struct *p)
> -{
> -	return p->normal_prio;
> -}
> -
> -static inline int rt_mutex_get_effective_prio(struct task_struct *task,
> -					      int newprio)
> -{
> -	return newprio;
> -}
> -
> -static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
> -{
> -	return NULL;
> -}
>  # define rt_mutex_adjust_pi(p)		do { } while (0)
>  static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
>  {
> diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
> index 3e746607abe5..13b6b5922d3c 100644
> --- a/kernel/locking/rtmutex.c
> +++ b/kernel/locking/rtmutex.c
> @@ -257,53 +257,18 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
>  }
>  
>  /*
> - * Calculate task priority from the waiter tree priority
> - *
> - * Return task->normal_prio when the waiter tree is empty or when
> - * the waiter is not allowed to do priority boosting
> - */
> -int rt_mutex_getprio(struct task_struct *task)
> -{
> -	if (likely(!task_has_pi_waiters(task)))
> -		return task->normal_prio;
> -
> -	return min(task_top_pi_waiter(task)->prio,
> -		   task->normal_prio);
> -}
> -
> -struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
> -{
> -	if (likely(!task_has_pi_waiters(task)))
> -		return NULL;
> -
> -	return task_top_pi_waiter(task)->task;
> -}
> -
> -/*
> - * Called by sched_setscheduler() to get the priority which will be
> - * effective after the change.
> - */
> -int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
> -{
> -	if (!task_has_pi_waiters(task))
> -		return newprio;
> -
> -	if (task_top_pi_waiter(task)->task->prio <= newprio)
> -		return task_top_pi_waiter(task)->task->prio;
> -	return newprio;
> -}
> -
> -/*
>   * Adjust the priority of a task, after its pi_waiters got modified.
>   *
>   * This can be both boosting and unboosting. task->pi_lock must be held.
>   */
>  static void __rt_mutex_adjust_prio(struct task_struct *task)
>  {
> -	int prio = rt_mutex_getprio(task);
> +	struct task_struct *pi_task = task;
> +
> +	if (unlikely(task_has_pi_waiters(task)))
> +		pi_task = task_top_pi_waiter(task)->task;
>  
> -	if (task->prio != prio || dl_prio(prio))
> -		rt_mutex_setprio(task, prio);
> +	rt_mutex_setprio(task, pi_task);
>  }
>  
>  /*
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 8b489fcac37b..7d7e3a0eaeb0 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3392,7 +3392,7 @@ EXPORT_SYMBOL(default_wake_function);
>  /*
>   * rt_mutex_setprio - set the current priority of a task
>   * @p: task
> - * @prio: prio value (kernel-internal form)
> + * @pi_task: top waiter, donating state
>   *
>   * This function changes the 'effective' priority of a task. It does
>   * not touch ->normal_prio like __setscheduler().
> @@ -3400,13 +3400,20 @@ EXPORT_SYMBOL(default_wake_function);
>   * Used by the rt_mutex code to implement priority inheritance
>   * logic. Call site only calls if the priority of the task changed.
>   */
> -void rt_mutex_setprio(struct task_struct *p, int prio)
> +void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
>  {
> -	int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
> -	struct rq *rq;
> +	int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
>  	const struct sched_class *prev_class;
> +	struct rq *rq;
>  
> -	BUG_ON(prio > MAX_PRIO);
> +	/*
> +	 * For FIFO/RR we simply donate prio; for DL things are
> +	 * more interesting.
> +	 */
> +	/* XXX used to be waiter->prio, not waiter->task->prio */
> +	prio = min(pi_task->prio, p->normal_prio);
> +	if (p->prio == prio && !dl_prio(prio))
> +		return;
>  
>  	rq = __task_rq_lock(p);
>  
> @@ -3442,6 +3449,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
>  	if (running)
>  		put_prev_task(rq, p);
>  
> +	if (pi_task == p)
> +		pi_task = NULL;
> +	p->pi_task = pi_task;
> +
>  	/*
>  	 * Boosting condition are:
>  	 * 1. -rt task is running and holds mutex A
> @@ -3452,7 +3463,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
>  	 *          running task
>  	 */
>  	if (dl_prio(prio)) {
> -		struct task_struct *pi_task = rt_mutex_get_top_task(p);
>  		if (!dl_prio(p->normal_prio) ||
>  		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
>  			p->dl.dl_boosted = 1;
> @@ -3727,10 +3737,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
>  	 * Keep a potential priority boosting if called from
>  	 * sched_setscheduler().
>  	 */
> -	if (keep_boost)
> -		p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
> -	else
> -		p->prio = normal_prio(p);
> +	p->prio = normal_prio(p);
> +	if (keep_boost && p->pi_task)
> +		p->prio = min(p->prio, p->pi_task->prio);
>  
>  	if (dl_prio(p->prio))
>  		p->sched_class = &dl_sched_class;
> @@ -4017,7 +4026,10 @@ static int __sched_setscheduler(struct task_struct *p,
>  		 * the runqueue. This will be done when the task deboost
>  		 * itself.
>  		 */
> -		new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
> +		new_effective_prio = newprio;
> +		if (p->pi_task)
> +			new_effective_prio = min(new_effective_prio, p->pi_task->prio);
> +
>  		if (new_effective_prio == oldprio)
>  			queue_flags &= ~DEQUEUE_MOVE;
>  	}
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index affd97ec9f65..6c5aa4612eb6 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -932,7 +932,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
>  
>  static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
>  {
> -	struct task_struct *pi_task = rt_mutex_get_top_task(p);
> +	struct task_struct *pi_task = p->pi_task;
>  	struct sched_dl_entity *pi_se = &p->dl;
>  
>  	/*


* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-01 21:51         ` Peter Zijlstra
  2016-04-02 10:19           ` Xunlei Pang
@ 2016-04-05  8:38           ` Xunlei Pang
  2016-04-05  9:19             ` Peter Zijlstra
  1 sibling, 1 reply; 25+ messages in thread
From: Xunlei Pang @ 2016-04-05  8:38 UTC (permalink / raw)
  To: Peter Zijlstra, xlpang
  Cc: linux-kernel, Juri Lelli, Ingo Molnar, Steven Rostedt, Thomas Gleixner

On 2016/04/02 at 05:51, Peter Zijlstra wrote:
> On Fri, Apr 01, 2016 at 09:34:24PM +0800, Xunlei Pang wrote:
>
>>>> I checked the code, currently only deadline accesses the
>>>> pi_waiters/pi_waiters_leftmost
>>>> without pi_lock held via rt_mutex_get_top_task(), other cases all have
>>>> pi_lock held.
>> Any better ideas is welcome.
> Something like the below _might_ work; but its late and I haven't looked
> at the PI code in a while. This basically caches a pointer to the top
> waiter task in the running task_struct, under pi_lock and rq->lock, and
> therefore we can use it with only rq->lock held.
>
> Since the task is blocked, and cannot unblock without taking itself from
> the block chain -- which would cause rt_mutex_setprio() to set another
> top waiter task, the lifetime rules should be good.

In rt_mutex_slowunlock(), we release pi_lock and wait_lock first, then
wake up the top waiter, and only then call rt_mutex_adjust_prio(). So there
is a small window, with no lock held and IRQs enabled, between the top
waiter wakeup and rt_mutex_adjust_prio(), which can cause problems.

For example, before rt_mutex_adjust_prio() gets to update the cached pointer:
if current is preempted and the woken top waiter exits, then once current is
back it may enter enqueue_task_dl() before it reaches rt_mutex_adjust_prio()
(which is where the cached pointer would be updated), so it will access a
stale cached pointer.
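
As a rough timeline (illustrative only, paraphrasing the current
rt_mutex_fastunlock() flow together with the proposed cached pointer):

    current (deboosting lock owner)          woken top waiter

    rt_mutex_fastunlock()
      deboost = slowfn(lock, &wake_q);
      wake_up_q(&wake_q);   ------------->   runs, drops its waiter state,
                                             possibly even exits
      <preempted; no lock held>
      ...
      <re-enqueued for whatever reason,
       e.g. a later migration/wakeup>
      enqueue_task_dl()
        uses the cached top-waiter pointer,
        which is stale by now
      rt_mutex_adjust_prio(current)
        only here would the cached pointer
        have been updated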

Regards,
Xunlei

> Having this top waiter pointer around might also be useful to further
> implement bandwidth inheritance or such, but I've not thought about that
> too much.
>
> Lots of code deleted because we move the entire task->prio mapping muck
> into the scheduler, because we now pass a pi_task, not a prio.
>
> ---
>  include/linux/sched.h    |  1 +
>  include/linux/sched/rt.h | 20 +-------------------
>  kernel/locking/rtmutex.c | 45 +++++----------------------------------------
>  kernel/sched/core.c      | 34 +++++++++++++++++++++++-----------
>  kernel/sched/deadline.c  |  2 +-
>  5 files changed, 31 insertions(+), 71 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 52c4847b05e2..30169f38cb24 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1615,6 +1615,7 @@ struct task_struct {
>  
>  	/* Protection of the PI data structures: */
>  	raw_spinlock_t pi_lock;
> +	struct task_struct *pi_task;
>  
>  	struct wake_q_node wake_q;
>  
> diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
> index a30b172df6e1..0a8f071b16e3 100644
> --- a/include/linux/sched/rt.h
> +++ b/include/linux/sched/rt.h
> @@ -16,31 +16,13 @@ static inline int rt_task(struct task_struct *p)
>  }
>  
>  #ifdef CONFIG_RT_MUTEXES
> -extern int rt_mutex_getprio(struct task_struct *p);
> -extern void rt_mutex_setprio(struct task_struct *p, int prio);
> -extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
> -extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
> +extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
>  extern void rt_mutex_adjust_pi(struct task_struct *p);
>  static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
>  {
>  	return tsk->pi_blocked_on != NULL;
>  }
>  #else
> -static inline int rt_mutex_getprio(struct task_struct *p)
> -{
> -	return p->normal_prio;
> -}
> -
> -static inline int rt_mutex_get_effective_prio(struct task_struct *task,
> -					      int newprio)
> -{
> -	return newprio;
> -}
> -
> -static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
> -{
> -	return NULL;
> -}
>  # define rt_mutex_adjust_pi(p)		do { } while (0)
>  static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
>  {
> diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
> index 3e746607abe5..13b6b5922d3c 100644
> --- a/kernel/locking/rtmutex.c
> +++ b/kernel/locking/rtmutex.c
> @@ -257,53 +257,18 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
>  }
>  
>  /*
> - * Calculate task priority from the waiter tree priority
> - *
> - * Return task->normal_prio when the waiter tree is empty or when
> - * the waiter is not allowed to do priority boosting
> - */
> -int rt_mutex_getprio(struct task_struct *task)
> -{
> -	if (likely(!task_has_pi_waiters(task)))
> -		return task->normal_prio;
> -
> -	return min(task_top_pi_waiter(task)->prio,
> -		   task->normal_prio);
> -}
> -
> -struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
> -{
> -	if (likely(!task_has_pi_waiters(task)))
> -		return NULL;
> -
> -	return task_top_pi_waiter(task)->task;
> -}
> -
> -/*
> - * Called by sched_setscheduler() to get the priority which will be
> - * effective after the change.
> - */
> -int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
> -{
> -	if (!task_has_pi_waiters(task))
> -		return newprio;
> -
> -	if (task_top_pi_waiter(task)->task->prio <= newprio)
> -		return task_top_pi_waiter(task)->task->prio;
> -	return newprio;
> -}
> -
> -/*
>   * Adjust the priority of a task, after its pi_waiters got modified.
>   *
>   * This can be both boosting and unboosting. task->pi_lock must be held.
>   */
>  static void __rt_mutex_adjust_prio(struct task_struct *task)
>  {
> -	int prio = rt_mutex_getprio(task);
> +	struct task_struct *pi_task = task;
> +
> +	if (unlikely(task_has_pi_waiters(task)))
> +		pi_task = task_top_pi_waiter(task)->task;
>  
> -	if (task->prio != prio || dl_prio(prio))
> -		rt_mutex_setprio(task, prio);
> +	rt_mutex_setprio(task, pi_task);
>  }
>  
>  /*
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 8b489fcac37b..7d7e3a0eaeb0 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3392,7 +3392,7 @@ EXPORT_SYMBOL(default_wake_function);
>  /*
>   * rt_mutex_setprio - set the current priority of a task
>   * @p: task
> - * @prio: prio value (kernel-internal form)
> + * @pi_task: top waiter, donating state
>   *
>   * This function changes the 'effective' priority of a task. It does
>   * not touch ->normal_prio like __setscheduler().
> @@ -3400,13 +3400,20 @@ EXPORT_SYMBOL(default_wake_function);
>   * Used by the rt_mutex code to implement priority inheritance
>   * logic. Call site only calls if the priority of the task changed.
>   */
> -void rt_mutex_setprio(struct task_struct *p, int prio)
> +void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
>  {
> -	int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
> -	struct rq *rq;
> +	int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
>  	const struct sched_class *prev_class;
> +	struct rq *rq;
>  
> -	BUG_ON(prio > MAX_PRIO);
> +	/*
> +	 * For FIFO/RR we simply donate prio; for DL things are
> +	 * more interesting.
> +	 */
> +	/* XXX used to be waiter->prio, not waiter->task->prio */
> +	prio = min(pi_task->prio, p->normal_prio);
> +	if (p->prio == prio && !dl_prio(prio))
> +		return;
>  
>  	rq = __task_rq_lock(p);
>  
> @@ -3442,6 +3449,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
>  	if (running)
>  		put_prev_task(rq, p);
>  
> +	if (pi_task == p)
> +		pi_task = NULL;
> +	p->pi_task = pi_task;
> +
>  	/*
>  	 * Boosting condition are:
>  	 * 1. -rt task is running and holds mutex A
> @@ -3452,7 +3463,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
>  	 *          running task
>  	 */
>  	if (dl_prio(prio)) {
> -		struct task_struct *pi_task = rt_mutex_get_top_task(p);
>  		if (!dl_prio(p->normal_prio) ||
>  		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
>  			p->dl.dl_boosted = 1;
> @@ -3727,10 +3737,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
>  	 * Keep a potential priority boosting if called from
>  	 * sched_setscheduler().
>  	 */
> -	if (keep_boost)
> -		p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
> -	else
> -		p->prio = normal_prio(p);
> +	p->prio = normal_prio(p);
> +	if (keep_boost && p->pi_task)
> +		p->prio = min(p->prio, p->pi_task->prio);
>  
>  	if (dl_prio(p->prio))
>  		p->sched_class = &dl_sched_class;
> @@ -4017,7 +4026,10 @@ static int __sched_setscheduler(struct task_struct *p,
>  		 * the runqueue. This will be done when the task deboost
>  		 * itself.
>  		 */
> -		new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
> +		new_effective_prio = newprio;
> +		if (p->pi_task)
> +			new_effective_prio = min(new_effective_prio, p->pi_task->prio);
> +
>  		if (new_effective_prio == oldprio)
>  			queue_flags &= ~DEQUEUE_MOVE;
>  	}
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index affd97ec9f65..6c5aa4612eb6 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -932,7 +932,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
>  
>  static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
>  {
> -	struct task_struct *pi_task = rt_mutex_get_top_task(p);
> +	struct task_struct *pi_task = p->pi_task;
>  	struct sched_dl_entity *pi_se = &p->dl;
>  
>  	/*


* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-05  8:38           ` Xunlei Pang
@ 2016-04-05  9:19             ` Peter Zijlstra
  2016-04-05  9:29               ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2016-04-05  9:19 UTC (permalink / raw)
  To: xlpang
  Cc: linux-kernel, Juri Lelli, Ingo Molnar, Steven Rostedt, Thomas Gleixner

On Tue, Apr 05, 2016 at 04:38:12PM +0800, Xunlei Pang wrote:
> On 2016/04/02 at 05:51, Peter Zijlstra wrote:
> > On Fri, Apr 01, 2016 at 09:34:24PM +0800, Xunlei Pang wrote:
> >
> >>>> I checked the code, currently only deadline accesses the
> >>>> pi_waiters/pi_waiters_leftmost
> >>>> without pi_lock held via rt_mutex_get_top_task(), other cases all have
> >>>> pi_lock held.
> >> Any better ideas is welcome.
> > Something like the below _might_ work; but its late and I haven't looked
> > at the PI code in a while. This basically caches a pointer to the top
> > waiter task in the running task_struct, under pi_lock and rq->lock, and
> > therefore we can use it with only rq->lock held.
> >
> > Since the task is blocked, and cannot unblock without taking itself from
> > the block chain -- which would cause rt_mutex_setprio() to set another
> > top waiter task, the lifetime rules should be good.
> 
> In rt_mutex_slowunlock(), we release pi_lock and and wait_lock first, then
> wake up the top waiter, then call rt_mutex_adjust_prio(), so there is a small
> window without any lock or irq disabled between the top waiter wake up
> and rt_mutex_adjust_prio(), which can cause problems.

That is rt_mutex_fastunlock()'s:

	bool deboost = slowfn(lock, &wake_q); /* -> rt_mutex_slowunlock() */

	wake_up_q(&wake_q);

	if (deboost)
		rt_mutex_adjust_prio(current);


(and the IRQs-enabled part is irrelevant; SMP can race regardless)

> For example, before calling rt_mutex_adjust_prio() to adjust the cached pointer,
> if current is preempted and the waken top waiter exited, after that, the task is
> back, and it may enter enqueue_task_dl() before entering rt_mutex_adjust_prio(),
> where the cached pointer is updated, so it will access a stale cached pointer.

Hmm, so I would argue that that is a bug in any case. It's an effective
priority 'leak'; we should deboost before letting the booster run again.

But it looks like a simple fix: simply call wake_up_q() after the
deboost. The wake_q has a reference on the task so it cannot go away,
which ensures any dereferences from within the DL code must still be
valid.

Or did I miss something (again) ? :-)

---
 kernel/locking/rtmutex.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 3e746607abe5..36eb232bd29f 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1390,11 +1390,11 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
 	} else {
 		bool deboost = slowfn(lock, &wake_q);
 
-		wake_up_q(&wake_q);
-
 		/* Undo pi boosting if necessary: */
 		if (deboost)
 			rt_mutex_adjust_prio(current);
+
+		wake_up_q(&wake_q);
 	}
 }
 


* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-05  9:19             ` Peter Zijlstra
@ 2016-04-05  9:29               ` Peter Zijlstra
  2016-04-05 10:48                 ` Xunlei Pang
  2016-04-08 16:25                 ` Steven Rostedt
  0 siblings, 2 replies; 25+ messages in thread
From: Peter Zijlstra @ 2016-04-05  9:29 UTC (permalink / raw)
  To: xlpang
  Cc: linux-kernel, Juri Lelli, Ingo Molnar, Steven Rostedt, Thomas Gleixner

On Tue, Apr 05, 2016 at 11:19:54AM +0200, Peter Zijlstra wrote:
> Or did I miss something (again) ? :-)
> 
> ---
>  kernel/locking/rtmutex.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
> index 3e746607abe5..36eb232bd29f 100644
> --- a/kernel/locking/rtmutex.c
> +++ b/kernel/locking/rtmutex.c
> @@ -1390,11 +1390,11 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
>  	} else {
>  		bool deboost = slowfn(lock, &wake_q);
>  
> -		wake_up_q(&wake_q);
> -
>  		/* Undo pi boosting if necessary: */
>  		if (deboost)
>  			rt_mutex_adjust_prio(current);
> +
> +		wake_up_q(&wake_q);
>  	}
>  }

So one potential issue with this -- and this might be the reason this code
is the way it is -- is that the moment we de-boost we can get preempted,
before having had a chance to wake the higher prio task, getting
ourselves into a prio-inversion.

But again, that should be fairly simple to fix.

--
 kernel/locking/rtmutex.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 3e746607abe5..1896baf28e9c 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1390,11 +1390,21 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
 	} else {
 		bool deboost = slowfn(lock, &wake_q);
 
-		wake_up_q(&wake_q);
-
-		/* Undo pi boosting if necessary: */
+		/*
+		 * Undo pi boosting (if necessary) and wake top waiter.
+		 *
+		 * We should deboost before waking the high-prio task such that
+		 * we don't run two tasks with the 'same' state. This however
+		 * can lead to prio-inversion if we would get preempted after
+		 * the deboost but before waking our high-prio task, hence the
+		 * preempt_disable.
+		 */
+		preempt_disable();
 		if (deboost)
 			rt_mutex_adjust_prio(current);
+
+		wake_up_q(&wake_q);
+		preempt_enable();
 	}
 }
 


* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-05  9:29               ` Peter Zijlstra
@ 2016-04-05 10:48                 ` Xunlei Pang
  2016-04-05 11:32                   ` Peter Zijlstra
  2016-04-08 16:25                 ` Steven Rostedt
  1 sibling, 1 reply; 25+ messages in thread
From: Xunlei Pang @ 2016-04-05 10:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Juri Lelli, Ingo Molnar, Steven Rostedt, Thomas Gleixner

On 2016/04/05 at 17:29, Peter Zijlstra wrote:
> On Tue, Apr 05, 2016 at 11:19:54AM +0200, Peter Zijlstra wrote:
>> Or did I miss something (again) ? :-)
>>
>> ---
>>  kernel/locking/rtmutex.c | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
>> index 3e746607abe5..36eb232bd29f 100644
>> --- a/kernel/locking/rtmutex.c
>> +++ b/kernel/locking/rtmutex.c
>> @@ -1390,11 +1390,11 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
>>  	} else {
>>  		bool deboost = slowfn(lock, &wake_q);
>>  
>> -		wake_up_q(&wake_q);
>> -
>>  		/* Undo pi boosting if necessary: */
>>  		if (deboost)
>>  			rt_mutex_adjust_prio(current);
>> +
>> +		wake_up_q(&wake_q);
>>  	}
>>  }
> So one potential issue with this -- and this might be reason this code
> is the way it is -- is that the moment we de-boost we can get preempted,
> before having had a chance to wake the higher prio task, getting
> ourselves into a prio-inversion.
>
> But again, that should be fairly simply to fix.

This is cool, I think we should also init "pi_task" properly for INIT_MUTEX and fork,
otherwise looks good to me :-)

Besides, do you think we can kill "pi_waiters_leftmost" from task_struct, as we
can easily get it from "pi_waiters"?

I will test it further with these new changes soon.

Regards,
Xunlei

>
> --
>  kernel/locking/rtmutex.c | 16 +++++++++++++---
>  1 file changed, 13 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
> index 3e746607abe5..1896baf28e9c 100644
> --- a/kernel/locking/rtmutex.c
> +++ b/kernel/locking/rtmutex.c
> @@ -1390,11 +1390,21 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
>  	} else {
>  		bool deboost = slowfn(lock, &wake_q);
>  
> -		wake_up_q(&wake_q);
> -
> -		/* Undo pi boosting if necessary: */
> +		/*
> +		 * Undo pi boosting (if necessary) and wake top waiter.
> +		 *
> +		 * We should deboost before waking the high-prio task such that
> +		 * we don't run two tasks with the 'same' state. This however
> +		 * can lead to prio-inversion if we would get preempted after
> +		 * the deboost but before waking our high-prio task, hence the
> +		 * preempt_disable.
> +		 */
> +		preempt_disable();
>  		if (deboost)
>  			rt_mutex_adjust_prio(current);
> +
> +		wake_up_q(&wake_q);
> +		preempt_enable();
>  	}
>  }
>  


* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-05 10:48                 ` Xunlei Pang
@ 2016-04-05 11:32                   ` Peter Zijlstra
  0 siblings, 0 replies; 25+ messages in thread
From: Peter Zijlstra @ 2016-04-05 11:32 UTC (permalink / raw)
  To: xlpang
  Cc: linux-kernel, Juri Lelli, Ingo Molnar, Steven Rostedt, Thomas Gleixner

On Tue, Apr 05, 2016 at 06:48:38PM +0800, Xunlei Pang wrote:
> This is cool, I think we should also init "pi_task" properly for INIT_MUTEX and fork,
> otherwise looks good to me :-)

Indeed..

> Besides, do you think we can kill "pi_waiters_leftmost" from task_struct, as we
> can easily get it from "pi_waiters"?

I don't quickly see a way to do that; different locking rules govern the
two.. which was the whole problem we started with :/

> I will test it further with these new changes soon.

Thanks!


* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-05  9:29               ` Peter Zijlstra
  2016-04-05 10:48                 ` Xunlei Pang
@ 2016-04-08 16:25                 ` Steven Rostedt
  2016-04-08 17:38                   ` Peter Zijlstra
  1 sibling, 1 reply; 25+ messages in thread
From: Steven Rostedt @ 2016-04-08 16:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: xlpang, linux-kernel, Juri Lelli, Ingo Molnar, Thomas Gleixner

On Tue, 5 Apr 2016 11:29:54 +0200
Peter Zijlstra <peterz@infradead.org> wrote:


> --
>  kernel/locking/rtmutex.c | 16 +++++++++++++---
>  1 file changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
> index 3e746607abe5..1896baf28e9c 100644
> --- a/kernel/locking/rtmutex.c
> +++ b/kernel/locking/rtmutex.c
> @@ -1390,11 +1390,21 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
>  	} else {
>  		bool deboost = slowfn(lock, &wake_q);
>  
> -		wake_up_q(&wake_q);
> -
> -		/* Undo pi boosting if necessary: */
> +		/*
> +		 * Undo pi boosting (if necessary) and wake top waiter.
> +		 *
> +		 * We should deboost before waking the high-prio task such that
> +		 * we don't run two tasks with the 'same' state. This however
> +		 * can lead to prio-inversion if we would get preempted after
> +		 * the deboost but before waking our high-prio task, hence the
> +		 * preempt_disable.
> +		 */
> +		preempt_disable();
>  		if (deboost)
>  			rt_mutex_adjust_prio(current);
> +
> +		wake_up_q(&wake_q);
> +		preempt_enable();
>  	}
>  }
>  

So the preempt_disable() is to allow us to set current back to its
normal priority first before waking up the other task because we don't
want two tasks at the same priority?

Just remember, calling preempt_disable() is semantically the same as
setting your priority as the highest task on the CPU. Thus the above
fix is to set the one task to the highest priority so that we can
deboost it when we remove the highest priority (preempt_enable()),
after we wake up the task that will be lower priority than current
until current calls preempt_enable().

This will of course keep a task that is higher in priority than both
current and the waking task from running till this is all completed.

What's the point of swapping deboost and the wake up again?

Maybe I'm missing something, what exactly do you mean by "same state"?

-- Steve


* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-08 16:25                 ` Steven Rostedt
@ 2016-04-08 17:38                   ` Peter Zijlstra
  2016-04-08 18:50                     ` Steven Rostedt
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2016-04-08 17:38 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: xlpang, linux-kernel, Juri Lelli, Ingo Molnar, Thomas Gleixner

On Fri, Apr 08, 2016 at 12:25:10PM -0400, Steven Rostedt wrote:

> So the preempt_disable() is to allow us to set current back to its
> normal priority first before waking up the other task because we don't
> want two tasks at the same priority?

> What's the point of swapping deboost and the wake up again?

In the context of this patch, it ensures the new pi_task pointer points
to something that exists -- this is a rather useful property for a
pointer to have.

It furthermore guarantees that it points to a blocked task, another
useful property.

> Maybe I'm missing something, what exactly do you mean by "same state"?

So the whole point of boosting is to donate the waiter's eligibility to
run to the lock owner. Semantically it is very poor to have two tasks
run based on this one eligibility.

And for pure priority inheritance it doesn't go further than that.

But now consider that we want to implement things like bandwidth inheritance,
where the lock owner actively consumes the runtime budget of the waiter;
then it is very important not to have any overlap in the budget usage.
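
Purely as an illustration of why overlap would matter there (hypothetical
pseudo-code, nothing like this exists in the tree; only the pi_task pointer
from the earlier patch is assumed):

	/*
	 * While boosted, the owner would burn the donor's (top waiter's)
	 * runtime instead of its own, roughly:
	 */
	dl_se = p->pi_task ? &p->pi_task->dl : &p->dl;
	dl_se->runtime -= delta_exec;

If the woken donor starts running while the not-yet-deboosted owner is still
charging against it, the same budget gets consumed twice.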


* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-08 17:38                   ` Peter Zijlstra
@ 2016-04-08 18:50                     ` Steven Rostedt
  2016-04-08 18:59                       ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: Steven Rostedt @ 2016-04-08 18:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: xlpang, linux-kernel, Juri Lelli, Ingo Molnar, Thomas Gleixner

On Fri, 8 Apr 2016 19:38:35 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Apr 08, 2016 at 12:25:10PM -0400, Steven Rostedt wrote:
> 
> > So the preempt_disable() is to allow us to set current back to its
> > normal priority first before waking up the other task because we don't
> > want two tasks at the same priority?  
> 
> > What's the point of swapping deboost and the wake up again?  
> 
> In the context of this patch, it ensures the new pi_task pointer points
> to something that exists -- this is a rather useful property for a
> pointer to have.

It's not clear what would make the new pi_task pointer object no
longer exist from this patch. I take it that waking up the wake_q will
cause something to change in the code of rt_mutex_adjust_prio(current).
If so, it should probably be stated in a comment, because nothing is
obvious here.

> 
> It furthermore guarantees that it points to a blocked task, another
> useful property.

I would think that the slowfn() would have removed anything to do with
what's on the wake_q from current. What task, on what pointer?
I'm only looking at this current patch, not anything to do with the
original patch of this thread. That is, just the swap of waking up
wake_q and calling rt_mutex_adjust_prio().

> 
> > Maybe I'm missing something, what exactly do you mean by "same state"?  
> 
> So the whole point of boosting is to donate the waiters eligibility to
> run to the lock owner. Semantically it is very poor to have two tasks
> run based on this one eligibility.
> 
> And for pure priority inheritance it doesn't go further than that.
> 
> But now consider we want to implement things like bandwidth inheritance,
> where the lock owner actively consumes the runtime budget of the waiter,
> then it is very important to not have any overlap on the budget usage.

If the two tasks (current and the wakee) are on two different CPUs, the
bandwidth charge does make better sense. Again, perhaps a bit more
explanation in the comment would help, as "state" is too generic of a
term to get anything from it.

-- Steve


* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-08 18:50                     ` Steven Rostedt
@ 2016-04-08 18:59                       ` Peter Zijlstra
  2016-04-08 19:15                         ` Steven Rostedt
  2016-04-09  3:25                         ` Xunlei Pang
  0 siblings, 2 replies; 25+ messages in thread
From: Peter Zijlstra @ 2016-04-08 18:59 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: xlpang, linux-kernel, Juri Lelli, Ingo Molnar, Thomas Gleixner

On Fri, Apr 08, 2016 at 02:50:55PM -0400, Steven Rostedt wrote:
> On Fri, 8 Apr 2016 19:38:35 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Fri, Apr 08, 2016 at 12:25:10PM -0400, Steven Rostedt wrote:
> > 
> > > So the preempt_disable() is to allow us to set current back to its
> > > normal priority first before waking up the other task because we don't
> > > want two tasks at the same priority?  
> > 
> > > What's the point of swapping deboost and the wake up again?  
> > 
> > In the context of this patch, it ensures the new pi_task pointer points
> > to something that exists -- this is a rather useful property for a
> > pointer to have.
> 
> It's not clear to what would make the new pi_task pointer object no
> longer exist from this patch. I take it that waking up the wake_q, will
> cause something to change in the code of rt_mutex_adjust_prio(current).
> If so, it should probably be stated in a comment, because nothing is
> obvious here.

It's pretty obvious that a running task can exit :-)

But also, wake_q holds a task ref.
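
(For reference, a minimal sketch of the wake_q reference handling,
abridged from memory rather than copied verbatim from
kernel/sched/core.c: wake_q_add() takes a reference on the task and
wake_up_q() only drops it after the wakeup, so a queued task cannot
disappear under us.)

void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	struct wake_q_node *node = &task->wake_q;

	/* Already queued by someone else? That wakeup covers us too. */
	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
		return;

	get_task_struct(task);		/* pin the task until wake_up_q() */
	*head->lastp = node;
	head->lastp = &node->next;
}

void wake_up_q(struct wake_q_head *head)
{
	struct wake_q_node *node = head->first;

	while (node != WAKE_Q_TAIL) {
		struct task_struct *task;

		task = container_of(node, struct task_struct, wake_q);
		node = node->next;
		task->wake_q.next = NULL;

		wake_up_process(task);	/* task may run (or exit) from here on */
		put_task_struct(task);	/* drop the reference from wake_q_add() */
	}
}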

> > It furthermore guarantees that it points to a blocked task, another
> > useful property.
> 
> I would think that the slowfn() would have removed anything to do with
> what's on the wake_q removed from current.

It cannot.

> What task on what pointer.
> I'm only looking at this current patch, not anything to do with the
> original patch of this thread. That is, just the swap of waking up
> wake_q and calling rt_mutex_adjust_prio().

This whole patch was in the context of the previous patch, as should be
clear from the thread.

In any case, I just realized we do not in fact provide this guarantee
(of pointing to a blocked task); that needs a bit more work.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-08 18:59                       ` Peter Zijlstra
@ 2016-04-08 19:15                         ` Steven Rostedt
  2016-04-08 19:28                           ` Steven Rostedt
  2016-04-09  3:25                         ` Xunlei Pang
  1 sibling, 1 reply; 25+ messages in thread
From: Steven Rostedt @ 2016-04-08 19:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: xlpang, linux-kernel, Juri Lelli, Ingo Molnar, Thomas Gleixner

On Fri, 8 Apr 2016 20:59:16 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Apr 08, 2016 at 02:50:55PM -0400, Steven Rostedt wrote:
> > On Fri, 8 Apr 2016 19:38:35 +0200
> > Peter Zijlstra <peterz@infradead.org> wrote:
> >   
> > > On Fri, Apr 08, 2016 at 12:25:10PM -0400, Steven Rostedt wrote:
> > >   
> > > > So the preempt_disable() is to allow us to set current back to its
> > > > normal priority first before waking up the other task because we don't
> > > > want two tasks at the same priority?    
> > >   
> > > > What's the point of swapping deboost and the wake up again?    
> > > 
> > > In the context of this patch, it ensures the new pi_task pointer points
> > > to something that exists -- this is a rather useful property for a
> > > pointer to have.  
> > 
> > It's not clear to what would make the new pi_task pointer object no
> > longer exist from this patch. I take it that waking up the wake_q, will
> > cause something to change in the code of rt_mutex_adjust_prio(current).
> > If so, it should probably be stated in a comment, because nothing is
> > obvious here.  
> 
> Its pretty obvious that a running task can exit :-)
> 
> But also, wake_q holds a task ref.
> 
> > > It furthermore guarantees that it points to a blocked task, another
> > > useful property.  
> > 
> > I would think that the slowfn() would have removed anything to do with
> > what's on the wake_q removed from current.  
> 
> It cannot.
> 

It cannot?

Maybe I didn't make myself clear in the question.

From what I understand, the slowfn() modifies the task's pi_list (or
rbtree, as it is today). As this is an unlock, the task being woken
(the next one to grab the lock) is removed from the previous task's pi
list.

In rt_mutex_adjust_prio(current) I see it simply grabs current's
pi_lock and calls __rt_mutex_adjust_prio(current). That calls
rt_mutex_getprio(current), which returns current's normal prio if it
doesn't have any pi waiters, or else looks at the top pi waiter on the
task's list and returns that. That waiter wouldn't be the task on the
wake_q, otherwise we wouldn't be deboosting in the first place.
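
(For reference, the pre-patch mainline code I'm describing is roughly
the following; abridged from memory, not a verbatim copy of
kernel/locking/rtmutex.c:)

int rt_mutex_getprio(struct task_struct *task)
{
	if (likely(!task_has_pi_waiters(task)))
		return task->normal_prio;

	return min(task_top_pi_waiter(task)->prio,
		   task->normal_prio);
}

static void __rt_mutex_adjust_prio(struct task_struct *task)
{
	int prio = rt_mutex_getprio(task);

	if (task->prio != prio || dl_prio(prio))
		rt_mutex_setprio(task, prio);
}

static void rt_mutex_adjust_prio(struct task_struct *task)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&task->pi_lock, flags);
	__rt_mutex_adjust_prio(task);
	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
}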

Again, I don't see how the swap has anything to do with pointers
disappearing.

> > What task on what pointer.
> > I'm only looking at this current patch, not anything to do with the
> > original patch of this thread. That is, just the swap of waking up
> > wake_q and calling rt_mutex_adjust_prio().  
> 
> This whole patch was in the context of the previous patch, as should be
> clear from the thread.

OK, so this patch is to be added on top of the previous patch, and
everything should be read in that context. My comment was just about
this change, not about the change in the original patch.

> 
> In any case, I just realized we do not in fact provide this guarantee
> (of pointing to a blocked task) that needs a bit more work.

Looking forward to it ;-)

-- Steve

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-08 19:15                         ` Steven Rostedt
@ 2016-04-08 19:28                           ` Steven Rostedt
  2016-04-09  3:27                             ` Xunlei Pang
  0 siblings, 1 reply; 25+ messages in thread
From: Steven Rostedt @ 2016-04-08 19:28 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: xlpang, linux-kernel, Juri Lelli, Ingo Molnar, Thomas Gleixner

On Fri, 8 Apr 2016 15:15:42 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> From what I understand, the slowfn() modifies the task pi_list (or
> rbtree, as it is today). As this is an unlock, the task being woken
> (the next one to grab the lock) is removed from the previous task's pi
> list.
> 
> In rt_mutex_adjust_prio(current) I see it simply grabs current's
> pi_lock and calls __rt_mutex_adjust_prio(current). This calls
> rt_mutex_getprio(current) which returns current's normal prio if it
> doesn't have any pi waiters, or it looks at the top pi waiter on the
> tasks list and returns that. Which wouldn't be the task on wake_q,
> otherwise we wouldn't be deboosting in the first place.
> 

OK, I now see that your previous patch changes what I'm looking
at :-) This is what happens when you go away, try to catch up on
email, and don't read the emails by thread. I see that
rt_mutex_adjust_prio() is being changed.

I'll go back and look at your previous patch (as I looked at that while
traveling and didn't think too hard about it).

-- Steve

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-08 18:59                       ` Peter Zijlstra
  2016-04-08 19:15                         ` Steven Rostedt
@ 2016-04-09  3:25                         ` Xunlei Pang
  2016-04-09 13:29                           ` Peter Zijlstra
  1 sibling, 1 reply; 25+ messages in thread
From: Xunlei Pang @ 2016-04-09  3:25 UTC (permalink / raw)
  To: Peter Zijlstra, Steven Rostedt
  Cc: linux-kernel, Juri Lelli, Ingo Molnar, Thomas Gleixner

On 2016/04/09 at 02:59, Peter Zijlstra wrote:
> On Fri, Apr 08, 2016 at 02:50:55PM -0400, Steven Rostedt wrote:
>> On Fri, 8 Apr 2016 19:38:35 +0200
>> Peter Zijlstra <peterz@infradead.org> wrote:
>>
>>> On Fri, Apr 08, 2016 at 12:25:10PM -0400, Steven Rostedt wrote:
>>>
>>>> So the preempt_disable() is to allow us to set current back to its
>>>> normal priority first before waking up the other task because we don't
>>>> want two tasks at the same priority?  
>>>> What's the point of swapping deboost and the wake up again?  
>>> In the context of this patch, it ensures the new pi_task pointer points
>>> to something that exists -- this is a rather useful property for a
>>> pointer to have.
>> It's not clear to what would make the new pi_task pointer object no
>> longer exist from this patch. I take it that waking up the wake_q, will
>> cause something to change in the code of rt_mutex_adjust_prio(current).
>> If so, it should probably be stated in a comment, because nothing is
>> obvious here.
> Its pretty obvious that a running task can exit :-)
>
> But also, wake_q holds a task ref.
>
>>> It furthermore guarantees that it points to a blocked task, another
>>> useful property.
>> I would think that the slowfn() would have removed anything to do with
>> what's on the wake_q removed from current.
> It cannot.
>
>> What task on what pointer.
>> I'm only looking at this current patch, not anything to do with the
>> original patch of this thread. That is, just the swap of waking up
>> wake_q and calling rt_mutex_adjust_prio().
> This whole patch was in the context of the previous patch, as should be
> clear from the thread.
>
> In any case, I just realized we do not in fact provide this guarantee
> (of pointing to a blocked task) that needs a bit more work.

The current patch calls rt_mutex_adjust_prio() before wake_up_q() wakes the wakee.
At that moment the wakee has already been removed from current's pi_waiters by
rt_mutex_slowunlock()->mark_wakeup_next_waiter(), so rt_mutex_adjust_prio() won't
see this wakee; I therefore think this should not be a problem.
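
(Roughly the ordering I mean, paraphrased rather than quoted from the
patch:)

	/* in rt_mutex_slowunlock(), with lock->wait_lock held: */
	mark_wakeup_next_waiter(&wake_q, lock);	/* wakee leaves current's pi_waiters */
	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);

	rt_mutex_adjust_prio(current);	/* deboost: top pi waiter is no longer the wakee */
	wake_up_q(&wake_q);		/* only now is the wakee made runnable */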

Regards,
Xunlei

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-08 19:28                           ` Steven Rostedt
@ 2016-04-09  3:27                             ` Xunlei Pang
  0 siblings, 0 replies; 25+ messages in thread
From: Xunlei Pang @ 2016-04-09  3:27 UTC (permalink / raw)
  To: Steven Rostedt, Peter Zijlstra
  Cc: linux-kernel, Juri Lelli, Ingo Molnar, Thomas Gleixner

On 2016/04/09 at 03:28, Steven Rostedt wrote:
> On Fri, 8 Apr 2016 15:15:42 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
>> From what I understand, the slowfn() modifies the task pi_list (or
>> rbtree, as it is today). As this is an unlock, the task being woken
>> (the next one to grab the lock) is removed from the previous task's pi
>> list.
>>
>> In rt_mutex_adjust_prio(current) I see it simply grabs current's
>> pi_lock and calls __rt_mutex_adjust_prio(current). This calls
>> rt_mutex_getprio(current) which returns current's normal prio if it
>> doesn't have any pi waiters, or it looks at the top pi waiter on the
>> tasks list and returns that. Which wouldn't be the task on wake_q,
>> otherwise we wouldn't be deboosting in the first place.
>>
> OK, I now see that the your previous patch is changing what I'm looking
> at :-) This is what happens when you go away and try to catch up on
> email and not read the emails by threads. I see the
> rt_mutex_adjust_prio() is being changed.
>
> I'll go back and look at your previous patch (as I looked at that while
> traveling and didn't think too hard about it).

Sorry for that; I should add more comments about it, and will add more in the next version.

Regards,
Xunlei

>
> -- Steve

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-09  3:25                         ` Xunlei Pang
@ 2016-04-09 13:29                           ` Peter Zijlstra
  2016-04-10  8:22                             ` Xunlei Pang
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2016-04-09 13:29 UTC (permalink / raw)
  To: xlpang
  Cc: Steven Rostedt, linux-kernel, Juri Lelli, Ingo Molnar, Thomas Gleixner

On Sat, Apr 09, 2016 at 11:25:39AM +0800, Xunlei Pang wrote:

> > In any case, I just realized we do not in fact provide this guarantee
> > (of pointing to a blocked task) that needs a bit more work.
> 
> Current patch calls rt_mutex_adjust_prio() before wake_up_q() the
> wakee, at that moment the wakee has been removed by
> rt_mutex_slowunlock()->mark_wakeup_next_waiter(), from current's
> pi_waiters, rt_mutex_adjust_prio() won't see this wakee, so I think
> this should not be problem.

No, any wakeup after mark_wakeup_next_waiter() will make the task run.
And since we must always consider spurious wakeups, we cannot rely on us
(eg. our wake_up_q call) being the one and only.

Therefore it is possible and the only thing that stands between us and
doom is the fact that the wake_q stuff holds a task reference.

But we cannot guarantee that the task we have a pointer to is in fact
blocked.
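
(A spurious wakeup is harmless for the normal blocking idiom, which is
exactly why we cannot assume our wake_up_q() is the only thing that can
make the wakee run; a generic sketch of that idiom, not the rtmutex
code specifically:)

	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (condition)		/* e.g. we became the lock owner */
			break;
		schedule();		/* any wakeup, spurious or not, returns here */
	}
	__set_current_state(TASK_RUNNING);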

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-09 13:29                           ` Peter Zijlstra
@ 2016-04-10  8:22                             ` Xunlei Pang
  2016-04-12  3:08                               ` Xunlei Pang
  0 siblings, 1 reply; 25+ messages in thread
From: Xunlei Pang @ 2016-04-10  8:22 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Steven Rostedt, linux-kernel, Juri Lelli, Ingo Molnar, Thomas Gleixner

On 2016/04/09 at 21:29, Peter Zijlstra wrote:
> On Sat, Apr 09, 2016 at 11:25:39AM +0800, Xunlei Pang wrote:
>
>>> In any case, I just realized we do not in fact provide this guarantee
>>> (of pointing to a blocked task) that needs a bit more work.
>> Current patch calls rt_mutex_adjust_prio() before wake_up_q() the
>> wakee, at that moment the wakee has been removed by
>> rt_mutex_slowunlock()->mark_wakeup_next_waiter(), from current's
>> pi_waiters, rt_mutex_adjust_prio() won't see this wakee, so I think
>> this should not be problem.
> No, any wakeup after mark_wakeup_next_waiter() will make the task run.
> And since we must always consider spurious wakeups, we cannot rely on us
> (eg. our wake_up_q call) being the one and only.
>
> Therefore it is possible and the only thing that stands between us and
> doom is the fact that the wake_q stuff holds a task reference.
>
> But we cannot guarantee that the task we have a pointer to is in fact
> blocked.

Does that really matter? The pointer is accessed on behalf of current, and current
will call rt_mutex_adjust_prio() very soon to update it to the right pointer.

Also, the pointer is used to update current's deadline/runtime, and we could restore
these parameters in rt_mutex_setprio() for the deboost case. I just checked the
current code: it does nothing to restore current's deadline/runtime when deboosting;
maybe we can leave that job to future deadline bandwidth inheritance?

Regards,
Xunlei

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-10  8:22                             ` Xunlei Pang
@ 2016-04-12  3:08                               ` Xunlei Pang
  2016-04-12 15:51                                 ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: Xunlei Pang @ 2016-04-12  3:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Steven Rostedt, linux-kernel, Juri Lelli, Ingo Molnar, Thomas Gleixner

On 2016/04/10 at 16:22, Xunlei Pang wrote:
> On 2016/04/09 at 21:29, Peter Zijlstra wrote:
>> On Sat, Apr 09, 2016 at 11:25:39AM +0800, Xunlei Pang wrote:
>>
>>>> In any case, I just realized we do not in fact provide this guarantee
>>>> (of pointing to a blocked task) that needs a bit more work.
>>> Current patch calls rt_mutex_adjust_prio() before wake_up_q() the
>>> wakee, at that moment the wakee has been removed by
>>> rt_mutex_slowunlock()->mark_wakeup_next_waiter(), from current's
>>> pi_waiters, rt_mutex_adjust_prio() won't see this wakee, so I think
>>> this should not be problem.
>> No, any wakeup after mark_wakeup_next_waiter() will make the task run.
>> And since we must always consider spurious wakeups, we cannot rely on us
>> (eg. our wake_up_q call) being the one and only.
>>
>> Therefore it is possible and the only thing that stands between us and
>> doom is the fact that the wake_q stuff holds a task reference.
>>
>> But we cannot guarantee that the task we have a pointer to is in fact
>> blocked.
> Does that really matters? the pointer is accessed on behalf of current,  and current
> will call rt_mutex_adjust_prio() very soon to update the right pointer.
>
> Also the pointer is used to update current's deadline/runtime, we can restore these
> params in rt_mutex_setprio() for deboost cases. I just checked current code,  it did
> nothing to restore current's deadline/runtime when deboosting, maybe we can leave
> this job to future deadline bandwidth inheritance?
>
> Regards,
> Xunlei
>

I spotted another issue: we access pi_task without holding any of its locks in
enqueue_task_dl(). The "dl_prio(pi_task->normal_prio)" condition we have is not
enough, because pi_task's "dl_period" and "dl_runtime" can change under us. If it
changed to a !deadline class and dl_runtime was cleared to 0, we will hit an endless
loop in replenish_dl_entity() below:
    while (dl_se->runtime <= 0) {
        dl_se->deadline += pi_se->dl_period;
        dl_se->runtime += pi_se->dl_runtime;
    }

or hit "BUG_ON(pi_se->dl_runtime <= 0);".

That's all because without any lock of that task, there is no guarantee.
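
(For context, the surrounding code in replenish_dl_entity() at the time
looked roughly like this; abridged from memory:)

static void replenish_dl_entity(struct sched_dl_entity *dl_se,
				struct sched_dl_entity *pi_se)
{
	...
	BUG_ON(pi_se->dl_runtime <= 0);

	/* A boosted !deadline task: go with full inherited parameters. */
	if (dl_se->dl_deadline == 0) {
		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
		dl_se->runtime = pi_se->dl_runtime;
	}

	/* Never terminates once pi_se->dl_runtime has been cleared to 0. */
	while (dl_se->runtime <= 0) {
		dl_se->deadline += pi_se->dl_period;
		dl_se->runtime += pi_se->dl_runtime;
	}
	...
}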

So I'm thinking of adding more members to rt_mutex_waiter (we don't lose memory,
it's defined on the stack) and using this structure instead of task_struct as the top
waiter (i.e. using get_pi_top_waiter() instead of get_pi_top_task()), like:

Step1:
struct rt_mutex_waiter {
         int prio;
+       /* updated under waiter's pi_lock and rt_mutex lock */
+       u64 dl_runtime, dl_period;
+       /*
+        * under owner's pi_lock, rq lock, and rt_mutex lock, copied
+        * directly from dl_runtime, dl_period(under same rt_mutex lock).
+        */
+       u64 dl_runtime_copy, dl_period_copy;

Similarly, adding the member in task_struct:
#ifdef CONFIG_RT_MUTEXES
    /* PI waiters blocked on a rt_mutex held by this task */
    struct rb_root pi_waiters;
    struct rb_node *pi_waiters_leftmost;
+  struct rb_node *pi_waiters_leftmost_copy; /* updated under pi_lock and rq lock */
    /* Deadlock detection and priority inheritance handling */
    struct rt_mutex_waiter *pi_blocked_on;
#endif

Then we can update "pi_waiters_leftmost_copy" together with "dl_runtime_copy"
and "dl_period_copy" under the rq lock, so enqueue_task_dl() can access them
without any problem.

Step2:
We must update "dl_runtime_copy" and "dl_period_copy" under the rt_mutex lock,
because they are copied from "dl_runtime" and "dl_period" of rt_mutex_waiter. So
we add a new update function, called while holding both the rq lock and the
rt_mutex lock, mainly implemented in rt_mutex_setprio().
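
A rough sketch of such a helper, with purely illustrative names and
assuming the copy fields proposed in Step1:

/*
 * Hypothetical helper, name illustrative only.  Caller holds the
 * owner's pi_lock, its rq lock and the rt_mutex lock.
 */
static void rt_mutex_update_top_waiter_copy(struct task_struct *owner)
{
	struct rt_mutex_waiter *top;

	if (!task_has_pi_waiters(owner)) {
		owner->pi_waiters_leftmost_copy = NULL;
		return;
	}

	top = task_top_pi_waiter(owner);
	top->dl_runtime_copy = top->dl_runtime;
	top->dl_period_copy  = top->dl_period;
	owner->pi_waiters_leftmost_copy = &top->pi_tree_entry;
}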

Step3:
Note that rt_mutex_setprio() can be called without the rtmutex lock held, via
rt_mutex_adjust_prio(); we can add a parameter to indicate that the copy-updating
work should be skipped there (i.e. add a new waiter parameter to rt_mutex_setprio()
and keep the original "prio" parameter). Then we can do the copy-updating work in
mark_wakeup_next_waiter() before unlocking current's pi_lock, as long as we hold
the rq lock, because the rtmutex lock and the owner's pi_lock are already held.

This can also solve the issue you mentioned with only a little overhead involved.

Regards,
Xunlei

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-12  3:08                               ` Xunlei Pang
@ 2016-04-12 15:51                                 ` Peter Zijlstra
  2016-04-13  2:13                                   ` Xunlei Pang
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2016-04-12 15:51 UTC (permalink / raw)
  To: xlpang
  Cc: Steven Rostedt, linux-kernel, Juri Lelli, Ingo Molnar, Thomas Gleixner

On Tue, Apr 12, 2016 at 11:08:04AM +0800, Xunlei Pang wrote:
> I spotted another issue, we access pi_task without any lock in enqueue_task_dl(),

OK, so I'm on the road and entirely jetlagged, but how can
enqueue_task_dl() not have rq->lock held?

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
  2016-04-12 15:51                                 ` Peter Zijlstra
@ 2016-04-13  2:13                                   ` Xunlei Pang
  0 siblings, 0 replies; 25+ messages in thread
From: Xunlei Pang @ 2016-04-13  2:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Steven Rostedt, linux-kernel, Juri Lelli, Ingo Molnar, Thomas Gleixner

On 2016/04/12 at 23:51, Peter Zijlstra wrote:
> On Tue, Apr 12, 2016 at 11:08:04AM +0800, Xunlei Pang wrote:
>> I spotted another issue, we access pi_task without any lock in enqueue_task_dl(),
> OK, so I'm on the road and entirely jetlagged, but how can
> enqueue_task_dl() not have rq->lock held?

It does hold current's rq->lock, but it holds none of pi_task's locks, so pi_task's
attributes can change concurrently with the subsequent pi_se->dl_runtime and
pi_se->dl_period accesses and cause the issues above.
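
(The access in question, abridged from memory from the enqueue_task_dl()
of that era:)

static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
	struct task_struct *pi_task = rt_mutex_get_top_task(p);
	struct sched_dl_entity *pi_se = &p->dl;

	/*
	 * rq->lock here is p's rq lock; nothing pins pi_task's
	 * scheduling parameters, which can change concurrently.
	 */
	if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
		pi_se = &pi_task->dl;
	...
	enqueue_dl_entity(&p->dl, pi_se, flags);
	...
}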

Regards,
Xunlei

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2016-04-13  2:13 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-04-01 11:00 [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks Xunlei Pang
2016-04-01 11:38 ` Peter Zijlstra
2016-04-01 12:23   ` Xunlei Pang
2016-04-01 13:12     ` Peter Zijlstra
2016-04-01 13:34       ` Xunlei Pang
2016-04-01 21:51         ` Peter Zijlstra
2016-04-02 10:19           ` Xunlei Pang
2016-04-05  8:38           ` Xunlei Pang
2016-04-05  9:19             ` Peter Zijlstra
2016-04-05  9:29               ` Peter Zijlstra
2016-04-05 10:48                 ` Xunlei Pang
2016-04-05 11:32                   ` Peter Zijlstra
2016-04-08 16:25                 ` Steven Rostedt
2016-04-08 17:38                   ` Peter Zijlstra
2016-04-08 18:50                     ` Steven Rostedt
2016-04-08 18:59                       ` Peter Zijlstra
2016-04-08 19:15                         ` Steven Rostedt
2016-04-08 19:28                           ` Steven Rostedt
2016-04-09  3:27                             ` Xunlei Pang
2016-04-09  3:25                         ` Xunlei Pang
2016-04-09 13:29                           ` Peter Zijlstra
2016-04-10  8:22                             ` Xunlei Pang
2016-04-12  3:08                               ` Xunlei Pang
2016-04-12 15:51                                 ` Peter Zijlstra
2016-04-13  2:13                                   ` Xunlei Pang
