From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754580AbcDNLhv (ORCPT ); Thu, 14 Apr 2016 07:37:51 -0400 Received: from mx1.redhat.com ([209.132.183.28]:46353 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754426AbcDNLhs (ORCPT ); Thu, 14 Apr 2016 07:37:48 -0400 From: Xunlei Pang To: linux-kernel@vger.kernel.org Cc: Peter Zijlstra , Thomas Gleixner , Juri Lelli , Ingo Molnar , Steven Rostedt , Xunlei Pang Subject: [PATCH v3 5/6] sched/deadline/rtmutex: Fix unprotected PI access in enqueue_task_dl() Date: Thu, 14 Apr 2016 19:37:06 +0800 Message-Id: <1460633827-345-6-git-send-email-xlpang@redhat.com> In-Reply-To: <1460633827-345-1-git-send-email-xlpang@redhat.com> References: <1460633827-345-1-git-send-email-xlpang@redhat.com> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org We access @pi_task's data without any lock in enqueue_task_dl(), though checked "dl_prio(pi_task->normal_prio)" condition, that's not enough. "dl_period" and "dl_runtime" of "pi_task->dl" can change. For example, if it changes to !deadline class, dl_runtime will be cleared to zero, then we will hit an endless loop in replenish_dl_entity() below: while (dl_se->runtime <= 0) { dl_se->deadline += pi_se->dl_period; dl_se->runtime += pi_se->dl_runtime; } or hit "BUG_ON(pi_se->dl_runtime <= 0)" earlier. That's because without any lock of top waiter, there is no guarantee. In order to solve it, we add some members in "rt_mutex_waiter" and use this structure instead of task_struct as the one to be accessed by enqueue_task_dl(), specifically added: struct rt_mutex_waiter { ... ... int prio; + /* Updated under waiter's pi_lock and rt_mutex lock */ + u64 dl_runtime, dl_period; + /* + * Copied directly from above. + * Updated under owner's pi_lock, rq lock, and rt_mutex lock. + */ + u64 dl_runtime_copy, dl_period_copy; }; We must update "dl_runtime_copy" and "dl_period_copy" under rt_mutex lock, because they are copied from rt_mutex_waiter's "dl_runtime" and "dl_period" which are protected by the same rt_mutex lock. We update the copy in rt_mutex_update_copy() introduced perviously, as it is called by rt_mutex_setprio() which held owner's pi_lock, rq lock, and rt_mutex lock. "dl_runtime_copy" and "dl_period_copy" are updated under owner's pi_lock, rq lock, and rt_mutex lock, plus the waiter was dependably blocked on rtmutex, thus they can be safely accessed by enqueue_task_dl() which held rq lock. Note that, now we return a rt_mutex_waiter to enqueue_task_dl(), we add a new "struct sched_dl_entity_fake" to fake as a real sched_dl_entity, this is ok as long as we keep the "dl_runtime" and "dl_period" in it the same order as that in sched_dl_entity. Also adjust the location of "dl_period" in sched_dl_entity to make sched_dl_entity_fake smaller. Signed-off-by: Xunlei Pang --- include/linux/rtmutex.h | 7 +++++++ include/linux/sched.h | 2 +- include/linux/sched/deadline.h | 20 ++++++++++++++++++++ kernel/locking/rtmutex.c | 23 +++++++++++++++++++++++ kernel/sched/deadline.c | 10 +++++++--- 5 files changed, 58 insertions(+), 4 deletions(-) diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index f9bf40a..56e2aaf 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -58,6 +58,13 @@ struct rt_mutex_waiter { struct rt_mutex *deadlock_lock; #endif int prio; + /* Updated under waiter's pi_lock and rt_mutex lock */ + u64 dl_runtime, dl_period; + /* + * Copied directly from above. + * Updated under owner's pi_lock, rq lock, and rt_mutex lock. + */ + u64 dl_runtime_copy, dl_period_copy; }; struct hrtimer_sleeper; diff --git a/include/linux/sched.h b/include/linux/sched.h index 8ad3522..960465c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1323,8 +1323,8 @@ struct sched_dl_entity { * the next sched_setattr(). */ u64 dl_runtime; /* maximum runtime for each instance */ - u64 dl_deadline; /* relative deadline of each instance */ u64 dl_period; /* separation of two instances (period) */ + u64 dl_deadline; /* relative deadline of each instance */ u64 dl_bw; /* dl_runtime / dl_deadline */ /* diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index e8304d4..ca5bae5 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -2,6 +2,16 @@ #define _SCHED_DEADLINE_H /* + * Used by enqueue_task_dl() for PI cases to disguise sched_dl_entity, + * thus must be the same order as the counterparts in sched_dl_entity. + */ +struct sched_dl_entity_fake { + struct rb_node rb_node; + u64 dl_runtime; + u64 dl_period; +}; + +/* * SCHED_DEADLINE tasks has negative priorities, reflecting * the fact that any of them has higher prio than RT and * NORMAL/BATCH tasks. @@ -28,4 +38,14 @@ static inline bool dl_time_before(u64 a, u64 b) extern void rt_mutex_update_copy(struct task_struct *p); +#ifdef CONFIG_RT_MUTEXES +extern struct rt_mutex_waiter *rt_mutex_get_top_waiter(struct task_struct *p); +#else +static inline +struct rt_mutex_waiter *rt_mutex_get_top_waiter(struct task_struct *p) +{ + return NULL; +} +#endif + #endif /* _SCHED_DEADLINE_H */ diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 00c6560..4d14eee 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -280,6 +280,15 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task) struct rt_mutex_waiter, pi_tree_entry)->task; } +struct rt_mutex_waiter *rt_mutex_get_top_waiter(struct task_struct *task) +{ + if (!task->pi_waiters_leftmost_copy) + return NULL; + + return rb_entry(task->pi_waiters_leftmost_copy, + struct rt_mutex_waiter, pi_tree_entry); +} + /* * Called by sched_setscheduler() to get the priority which will be * effective after the change. @@ -299,7 +308,17 @@ int rt_mutex_get_effective_prio(struct task_struct *task, int newprio) */ void rt_mutex_update_copy(struct task_struct *p) { + struct rt_mutex_waiter *top_waiter; + + /* We must always update it, even if NULL */ p->pi_waiters_leftmost_copy = p->pi_waiters_leftmost; + + if (!task_has_pi_waiters(p)) + return; + + top_waiter = task_top_pi_waiter(p); + top_waiter->dl_runtime_copy = top_waiter->dl_runtime; + top_waiter->dl_period_copy = top_waiter->dl_period; } /* @@ -632,6 +651,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* [7] Requeue the waiter in the lock waiter tree. */ rt_mutex_dequeue(lock, waiter); waiter->prio = task->prio; + waiter->dl_runtime = dl_policy(task->policy) ? task->dl.dl_runtime : 0; + waiter->dl_period = dl_policy(task->policy) ? task->dl.dl_period : 0; rt_mutex_enqueue(lock, waiter); /* [8] Release the task */ @@ -902,6 +923,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, waiter->task = task; waiter->lock = lock; waiter->prio = task->prio; + waiter->dl_runtime = dl_policy(task->policy) ? task->dl.dl_runtime : 0; + waiter->dl_period = dl_policy(task->policy) ? task->dl.dl_period : 0; /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index affd97e..224aa64 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -932,8 +932,9 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { - struct task_struct *pi_task = rt_mutex_get_top_task(p); + struct rt_mutex_waiter *top_waiter = rt_mutex_get_top_waiter(p); struct sched_dl_entity *pi_se = &p->dl; + struct sched_dl_entity_fake pi_se_fake; /* * Use the scheduling parameters of the top pi-waiter @@ -941,8 +942,11 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * smaller than our one... OTW we keep our runtime and * deadline. */ - if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { - pi_se = &pi_task->dl; + if (top_waiter && p->dl.dl_boosted && top_waiter->dl_runtime_copy) { + BUG_ON(top_waiter->dl_period_copy == 0); + pi_se_fake.dl_runtime = top_waiter->dl_runtime_copy; + pi_se_fake.dl_period = top_waiter->dl_period_copy; + pi_se = (struct sched_dl_entity *)&pi_se_fake; } else if (!dl_prio(p->normal_prio)) { /* * Special case in which we have a !SCHED_DEADLINE task -- 1.8.3.1