From: Thomas Gleixner <tglx@linutronix.de>
To: Austin Schuh <austin@peloton-tech.com>
Cc: Richard Weinberger <richard.weinberger@gmail.com>,
	Mike Galbraith <umgwanakikbuti@gmail.com>,
	LKML <linux-kernel@vger.kernel.org>,
	rt-users <linux-rt-users@vger.kernel.org>,
	Steven Rostedt <rostedt@goodmis.org>
Subject: Re: Filesystem lockup with CONFIG_PREEMPT_RT
Date: Fri, 27 Jun 2014 16:24:52 +0200 (CEST)
Message-ID: <alpine.DEB.2.10.1406271249510.5170@nanos>
In-Reply-To: <CANGgnMa+qtgJ3wwg_h5Rynw5vEvZpQZ6PvaUfXNQ8+Y3Yu5U0g@mail.gmail.com>

On Thu, 26 Jun 2014, Austin Schuh wrote:
> If I'm reading the rt patch correctly, wq_worker_sleeping was moved
> out of __schedule to sched_submit_work.  It looks like that changes
> the conditions under which wq_worker_sleeping is called.  It used to
> be called whenever a task was going to sleep (I think).  It looks like
> it is called now if the task is going to sleep, and if the task isn't
> blocked on a PI mutex (I think).
> 
> If I try the following experiment
> 
>  static inline void sched_submit_work(struct task_struct *tsk)
>  {
> +   if (tsk->state && tsk->flags & PF_WQ_WORKER) {
> +     wq_worker_sleeping(tsk);
> +     return;
> +   }
> 
> and then remove the call later in the function, I am able to pass my test.
> 
> Unfortunately, I then get a recursive pool spinlock BUG_ON after a
> while (as I would expect), and it all blows up.

Well, that's why the check for !pi_blocked_on is there.
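
To spell out why: on RT, pool->lock is a sleeping spinlock. If
wq_worker_sleeping() is called unconditionally, a worker which blocks
on pool->lock re-enters schedule() and takes the very same path again.
Sketched from your recursive BUG_ON report (the call chain is my
reconstruction, not verified):

  schedule()
    sched_submit_work()
      wq_worker_sleeping()
        spin_lock_irq(&pool->lock);     /* rtmutex on RT: can block */
          schedule()                    /* blocked on the lock */
            sched_submit_work()
              wq_worker_sleeping()      /* pool->lock again -> recursion */

So once the task is pi blocked, calling back into code which takes
pool->lock is exactly what must not happen - unless wq_worker_sleeping()
stops taking that lock on RT, which is what the patch below does.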
 
> I'm not sure where to go from there.  Any changes to the worker pool to
> try to fix that will be hard, or could affect latency significantly.

Completely untested patch below.
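
The approach is twofold: keep the wq_worker_sleeping() notification
unconditional, move only the IO plug flush behind the pi_blocked_on
check, and rework wq_worker_sleeping() so that it takes no sleeping
lock on RT. With the first hunk applied, sched_submit_work() reads
roughly like this (my reconstruction against a 3.x tree, untested
like the rest):

  static inline void sched_submit_work(struct task_struct *tsk)
  {
	if (!tsk->state)
		return;
	/*
	 * If a worker went to sleep, notify and ask workqueue
	 * whether it wants to wake up a task to maintain
	 * concurrency. Safe even when pi blocked, because the
	 * reworked wq_worker_sleeping() below takes no sleeping
	 * lock on RT.
	 */
	if (tsk->flags & PF_WQ_WORKER)
		wq_worker_sleeping(tsk);

	if (tsk_is_pi_blocked(tsk))
		return;

	/*
	 * If we are going to sleep and we have plugged IO queued,
	 * make sure to submit it to avoid deadlocks.
	 */
	if (blk_needs_flush_plug(tsk))
		blk_schedule_flush_plug(tsk);
  }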

Thanks,

	tglx

--------------------->
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8749d20..621329a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2651,9 +2651,8 @@ need_resched:
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
-	if (!tsk->state || tsk_is_pi_blocked(tsk))
+	if (!tsk->state)
 		return;
-
 	/*
 	 * If a worker went to sleep, notify and ask workqueue whether
 	 * it wants to wake up a task to maintain concurrency.
@@ -2661,6 +2660,10 @@ static inline void sched_submit_work(struct task_struct *tsk)
 	if (tsk->flags & PF_WQ_WORKER)
 		wq_worker_sleeping(tsk);
 
+
+	if (tsk_is_pi_blocked(tsk))
+		return;
+
 	/*
 	 * If we are going to sleep and we have plugged IO queued,
 	 * make sure to submit it to avoid deadlocks.
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index be0ef50..0ca9d97 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -126,6 +126,8 @@ enum {
  *    cpu or grabbing pool->lock is enough for read access.  If
  *    POOL_DISASSOCIATED is set, it's identical to L.
  *
+ *    On RT we need the extra protection via rt_lock_idle_list()
+ *
  * MG: pool->manager_mutex and pool->lock protected.  Writes require both
  *     locks.  Reads can happen under either lock.
  *
@@ -409,6 +411,31 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
 		if (({ assert_rcu_or_wq_mutex(wq); false; })) { }	\
 		else
 
+#ifdef CONFIG_PREEMPT_RT_BASE
+static inline void rt_lock_idle_list(struct worker_pool *pool)
+{
+	preempt_disable();
+}
+static inline void rt_unlock_idle_list(struct worker_pool *pool)
+{
+	preempt_enable();
+}
+static inline void sched_lock_idle_list(struct worker_pool *pool) { }
+static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
+#else
+static inline void rt_lock_idle_list(struct worker_pool *pool) { }
+static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
+static inline void sched_lock_idle_list(struct worker_pool *pool)
+{
+	spin_lock_irq(&pool->lock);
+}
+static inline void sched_unlock_idle_list(struct worker_pool *pool)
+{
+	spin_unlock_irq(&pool->lock);
+}
+#endif
+
+
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 
 static struct debug_obj_descr work_debug_descr;
@@ -801,10 +828,16 @@ static struct worker *first_worker(struct worker_pool *pool)
  */
 static void wake_up_worker(struct worker_pool *pool)
 {
-	struct worker *worker = first_worker(pool);
+	struct worker *worker;
+
+	rt_lock_idle_list(pool);
+
+	worker = first_worker(pool);
 
 	if (likely(worker))
 		wake_up_process(worker->task);
+
+	rt_unlock_idle_list(pool);
 }
 
 /**
@@ -832,7 +865,7 @@ void wq_worker_running(struct task_struct *task)
  */
 void wq_worker_sleeping(struct task_struct *task)
 {
-	struct worker *next, *worker = kthread_data(task);
+	struct worker *worker = kthread_data(task);
 	struct worker_pool *pool;
 
 	/*
@@ -849,25 +882,18 @@ void wq_worker_sleeping(struct task_struct *task)
 		return;
 
 	worker->sleeping = 1;
-	spin_lock_irq(&pool->lock);
+
 	/*
 	 * The counterpart of the following dec_and_test, implied mb,
 	 * worklist not empty test sequence is in insert_work().
 	 * Please read comment there.
-	 *
-	 * NOT_RUNNING is clear.  This means that we're bound to and
-	 * running on the local cpu w/ rq lock held and preemption
-	 * disabled, which in turn means that none else could be
-	 * manipulating idle_list, so dereferencing idle_list without pool
-	 * lock is safe.
 	 */
 	if (atomic_dec_and_test(&pool->nr_running) &&
 	    !list_empty(&pool->worklist)) {
-		next = first_worker(pool);
-		if (next)
-			wake_up_process(next->task);
+		sched_lock_idle_list(pool);
+		wake_up_worker(pool);
+		sched_unlock_idle_list(pool);
 	}
-	spin_unlock_irq(&pool->lock);
 }
 
 /**
@@ -1571,7 +1597,9 @@ static void worker_enter_idle(struct worker *worker)
 	worker->last_active = jiffies;
 
 	/* idle_list is LIFO */
+	rt_lock_idle_list(pool);
 	list_add(&worker->entry, &pool->idle_list);
+	rt_unlock_idle_list(pool);
 
 	if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
 		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
@@ -1604,7 +1632,9 @@ static void worker_leave_idle(struct worker *worker)
 		return;
 	worker_clr_flags(worker, WORKER_IDLE);
 	pool->nr_idle--;
+	rt_lock_idle_list(pool);
 	list_del_init(&worker->entry);
+	rt_unlock_idle_list(pool);
 }
 
 /**
@@ -1849,7 +1879,9 @@ static void destroy_worker(struct worker *worker)
 	 */
 	get_task_struct(worker->task);
 
+	rt_lock_idle_list(pool);
 	list_del_init(&worker->entry);
+	rt_unlock_idle_list(pool);
 	worker->flags |= WORKER_DIE;
 
 	idr_remove(&pool->worker_idr, worker->id);
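
A note on the workqueue side: with the hunks applied,
wq_worker_sleeping() no longer takes pool->lock on RT at all. On !RT,
sched_lock_idle_list() still boils down to pool->lock, so behaviour
there is unchanged; on RT it is a no-op and the idle_list access in
wake_up_worker() is covered by the preempt_disable() in
rt_lock_idle_list() instead. Reconstructed result (the pre-existing
early exit checks are elided):

  void wq_worker_sleeping(struct task_struct *task)
  {
	struct worker *worker = kthread_data(task);
	struct worker_pool *pool;

	/*
	 * ... pre-existing checks which set pool and bail out
	 * early for not-running / already sleeping workers ...
	 */

	worker->sleeping = 1;

	/*
	 * The counterpart of the following dec_and_test, implied mb,
	 * worklist not empty test sequence is in insert_work().
	 */
	if (atomic_dec_and_test(&pool->nr_running) &&
	    !list_empty(&pool->worklist)) {
		sched_lock_idle_list(pool);	/* pool->lock on !RT, no-op on RT */
		wake_up_worker(pool);		/* idle_list read under rt_lock_idle_list() */
		sched_unlock_idle_list(pool);
	}
  }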
