All of lore.kernel.org
 help / color / mirror / Atom feed
* Soft lockups in __cancel_work_timer()
@ 2015-02-06 17:11 Rabin Vincent
  2015-02-06 18:01 ` Tejun Heo
  2015-02-09 16:15 ` [PATCH for-3.20-fixes] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE Tejun Heo
  0 siblings, 2 replies; 17+ messages in thread
From: Rabin Vincent @ 2015-02-06 17:11 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jesper Nilsson, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2524 bytes --]

Hi Tejun,

The comment at the top of the try_to_grab_pending() says that the ENOENT
state "may persist for arbitrarily long".

If two threads call cancel_delayed_work_sync() at the same time, one of
them can end up looping in the loop in __cancel_work_timer() without
waiting in the flush_work() which is inside it.

The looping thread will see -ENOENT from try_to_grab_pending() because
the work is being cancelled, and it will see a false return from
flush_work()/start_flush_work() because there is no worker executing the
work.

task1                           task2                           worker

                                                                add to busy hash
                                                                clear pending
                                                                exec work

__cancel_work_timer()
 try_to_grab_pending()
   get pending, return 0
 set work cancelling
 flush_work()
   wait_for_completion()

                                                                remove from busy_hash

                                __cancel_work_timer()
                                while (forever) {
                                  try_to_grab_pending()
                                    return -ENOENT due to cancelling
                                  flush_work
                                    return false due to already gone
                                }


On kernels with CONFIG_PREEMPT disabled, this causes a permanent soft
lockup of the system.

Adding a cond_resched() to the loop in cancel_delayed_work_sync(), like
below, helps the simple !PREEMPT case, but does not completely fix the
problem, because the problem can still be seen if the thread which sees
the ENOENT has for example SCHED_FIFO scheduling class, both on systems
with CONFIG_PREEMPT enabled and on !PREEMPT with the cond_resched().

We've seen this problem with the cancel_delayed_work() call in
jffs2_sync_fs(), but I've attached a testing patch which forces the
problem on current mainline without the need for jffs2.

Suggestions?

Thanks.

/Rabin

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index beeeac9..904289f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2741,6 +2741,9 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 		 */
 		if (unlikely(ret == -ENOENT))
 			flush_work(work);
+
+		if (ret < 0)
+			cond_resched();
 	} while (unlikely(ret < 0));
 
 	/* tell other tasks trying to grab @work to back off */

[-- Attachment #2: force-wq-lockup.patch --]
[-- Type: text/x-diff, Size: 3667 bytes --]

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index beeeac9..c161255 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -49,6 +49,15 @@
 #include <linux/moduleparam.h>
 #include <linux/uaccess.h>
 
+extern void hack_set_state(int newstate);
+extern void hack_wait_for_state(int state);
+
+static bool want(struct work_struct *work)
+{
+	extern void hackfunc(struct work_struct *work);
+	return work->func == hackfunc;
+}
+
 #include "workqueue_internal.h"
 
 enum {
@@ -2006,6 +2015,9 @@ __acquires(&pool->lock)
 	 */
 	set_work_pool_and_clear_pending(work, pool->id);
 
+	if (want(work))
+		hack_set_state(20);
+
 	spin_unlock_irq(&pool->lock);
 
 	lock_map_acquire_read(&pwq->wq->lockdep_map);
@@ -2039,8 +2051,14 @@ __acquires(&pool->lock)
 	 */
 	cond_resched_rcu_qs();
 
+	if (want(work))
+		hack_wait_for_state(30);
+
 	spin_lock_irq(&pool->lock);
 
+	if (want(work))
+		hack_set_state(40);
+
 	/* clear cpu intensive status */
 	if (unlikely(cpu_intensive))
 		worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
@@ -2682,6 +2700,9 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
 	insert_wq_barrier(pwq, barr, work, worker);
 	spin_unlock_irq(&pool->lock);
 
+	if (want(work) && !strcmp(current->comm, "thread1"))
+		hack_set_state(30);
+
 	/*
 	 * If @max_active is 1 or rescuer is in use, flushing another work
 	 * item on the same workqueue may lead to deadlock.  Make sure the
@@ -2733,6 +2754,12 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 	unsigned long flags;
 	int ret;
 
+	if (want(work) && !strcmp(current->comm, "thread1"))
+		hack_wait_for_state(20);
+
+	if (want(work) && !strcmp(current->comm, "thread2"))
+		hack_wait_for_state(40);
+
 	do {
 		ret = try_to_grab_pending(work, is_dwork, &flags);
 		/*
@@ -2741,6 +2768,9 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 		 */
 		if (unlikely(ret == -ENOENT))
 			flush_work(work);
+
+		if (want(work) && !strcmp(current->comm, "thread2"))
+			hack_set_state(50);
 	} while (unlikely(ret < 0));
 
 	/* tell other tasks trying to grab @work to back off */
@@ -2748,6 +2778,8 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 	local_irq_restore(flags);
 
 	flush_work(work);
+	if (want(work) && !strcmp(current->comm, "thread1"))
+		hack_wait_for_state(50);
 	clear_work_data(work);
 	return ret;
 }
@@ -4910,3 +4942,59 @@ static int __init init_workqueues(void)
 	return 0;
 }
 early_initcall(init_workqueues);
+
+static struct completion completions[100];
+
+void hack_set_state(int newstate)
+{
+	trace_printk("caller %pF sets state to %d\n", (void *)_RET_IP_, newstate);
+	complete(&completions[newstate]);
+}
+
+void hack_wait_for_state(int state)
+{
+	trace_printk("caller %pF wants state %d\n", (void *)_RET_IP_, state);
+	wait_for_completion(&completions[state]);
+	trace_printk("caller %pF got state %d\n", (void *)_RET_IP_, state);
+}
+
+void hackfunc(struct work_struct *work)
+{
+	printk("%s\n", __func__);
+}
+
+static DECLARE_DELAYED_WORK(hackwork, hackfunc);
+
+static int thread1(void *data)
+{
+	cancel_delayed_work_sync(&hackwork);
+
+	printk("%s: done\n", __func__);
+
+	return 0;
+}
+
+static int thread2(void *data)
+{
+	cancel_delayed_work_sync(&hackwork);
+
+	printk("%s: done\n", __func__);
+
+	return 0;
+}
+
+static int hack_init(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(completions); i++)
+		init_completion(&completions[i]);
+
+	kthread_run(thread1, NULL, "thread1");
+	kthread_run(thread2, NULL, "thread2");
+
+	queue_delayed_work(system_long_wq, &hackwork, msecs_to_jiffies(5));
+
+	return 0;
+}
+late_initcall(hack_init);

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: Soft lockups in __cancel_work_timer()
  2015-02-06 17:11 Soft lockups in __cancel_work_timer() Rabin Vincent
@ 2015-02-06 18:01 ` Tejun Heo
  2015-02-09 16:15 ` [PATCH for-3.20-fixes] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE Tejun Heo
  1 sibling, 0 replies; 17+ messages in thread
From: Tejun Heo @ 2015-02-06 18:01 UTC (permalink / raw)
  To: Rabin Vincent; +Cc: Jesper Nilsson, linux-kernel

Hello,

On Fri, Feb 06, 2015 at 06:11:56PM +0100, Rabin Vincent wrote:
> task1                           task2                           worker
> 
>                                                                 add to busy hash
>                                                                 clear pending
>                                                                 exec work
> 
> __cancel_work_timer()
>  try_to_grab_pending()
>    get pending, return 0
>  set work cancelling
>  flush_work()
>    wait_for_completion()
> 
>                                                                 remove from busy_hash
> 
>                                 __cancel_work_timer()
>                                 while (forever) {
>                                   try_to_grab_pending()
>                                     return -ENOENT due to cancelling
>                                   flush_work
>                                     return false due to already gone
>                                 }
> 
> 
> On kernels with CONFIG_PREEMPT disabled, this causes a permanent soft
> lockup of the system.

Ah, drat.

> Adding a cond_resched() to the loop in cancel_delayed_work_sync(), like
> below, helps the simple !PREEMPT case, but does not completely fix the
> problem, because the problem can still be seen if the thread which sees
> the ENOENT has for example SCHED_FIFO scheduling class, both on systems
> with CONFIG_PREEMPT enabled and on !PREEMPT with the cond_resched().
> 
> We've seen this problem with the cancel_delayed_work() call in
> jffs2_sync_fs(), but I've attached a testing patch which forces the
> problem on current mainline without the need for jffs2.
...
> @@ -2741,6 +2741,9 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
>  		 */
>  		if (unlikely(ret == -ENOENT))
>  			flush_work(work);
> +
> +		if (ret < 0)
> +			cond_resched();
>  	} while (unlikely(ret < 0));

Well, an obvious thing to do would be

		if (unlikely(ret == -ENOENT)) {
			if (!flush_work(work))
				schedule_timeout(1);
		}

It was gonna block for the work item anyway so I don't think
schedule_timeout() there is a problem.  That said, given that we're
guaranteed to be able to dereference @work there, there probably is a
better way.  I'll think more about it.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH for-3.20-fixes] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE
  2015-02-06 17:11 Soft lockups in __cancel_work_timer() Rabin Vincent
  2015-02-06 18:01 ` Tejun Heo
@ 2015-02-09 16:15 ` Tejun Heo
  2015-02-09 16:29   ` Jesper Nilsson
                     ` (2 more replies)
  1 sibling, 3 replies; 17+ messages in thread
From: Tejun Heo @ 2015-02-09 16:15 UTC (permalink / raw)
  To: Rabin Vincent; +Cc: Jesper Nilsson, linux-kernel

Hello,

Can you please verify that the following patch fixes the issue?

Thanks.

-------- 8< --------
cancel[_delayed]_work_sync() are implemented using
__cancel_work_timer() which grabs the PENDING bit using
try_to_grab_pending() and then flushes the work item with PENDING set
to prevent the on-going execution of the work item from requeueing
itself.

try_to_grab_pending() can always grab PENDING bit without blocking
except when someone else is doing the above flushing during
cancelation.  In that case, try_to_grab_pending() returns -ENOENT.  In
this case, __cancel_work_timer() currently invokes flush_work().  The
assumption is that the completion of the work item is what the other
canceling task would be waiting for too and thus waiting for the same
condition and retrying should allow forward progress without excessive
busy looping.

Unfortunately, this doesn't work if preemption is disabled or the
latter task has real time priority.  Let's say task A just got woken
up from flush_work() by the completion of the target work item.  If,
before task A starts executing, task B gets scheduled and invokes
__cancel_work_timer() on the same work item, its try_to_grab_pending()
will return -ENOENT as the work item is still being canceled by task A
and flush_work() will also immediately return false as the work item
is no longer executing.  This puts task B in a busy loop possibly
preventing task A from executing and clearing the canceling state on
the work item leading to a hang.

task A			task B			worker

						executing work
__cancel_work_timer()
  try_to_grab_pending()
  set work CANCELING
  flush_work()
    block for work completion
						completion, wakes up A
			__cancel_work_timer()
			while (forever) {
			  try_to_grab_pending()
			    -ENOENT as work is being canceled
			  flush_work()
			    false as work is no longer executing
			}

This patch removes the possible hang by updating __cancel_work_timer()
to explicitly wait for clearing of CANCELING rather than invoking
flush_work() after try_to_grab_pending() fails with -ENOENT.  The
explicit wait uses the matching bit waitqueue for the CANCELING bit.

Link: http://lkml.kernel.org/g/20150206171156.GA8942@axis.com

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Rabin Vincent <rabin.vincent@axis.com>
Cc: stable@vger.kernel.org
---
 include/linux/workqueue.h |    3 ++-
 kernel/workqueue.c        |   36 ++++++++++++++++++++++++++++++++----
 2 files changed, 34 insertions(+), 5 deletions(-)

--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -70,7 +70,8 @@ enum {
 	/* data contains off-queue information when !WORK_STRUCT_PWQ */
 	WORK_OFFQ_FLAG_BASE	= WORK_STRUCT_COLOR_SHIFT,
 
-	WORK_OFFQ_CANCELING	= (1 << WORK_OFFQ_FLAG_BASE),
+	__WORK_OFFQ_CANCELING	= WORK_OFFQ_FLAG_BASE,
+	WORK_OFFQ_CANCELING	= (1 << __WORK_OFFQ_CANCELING),
 
 	/*
 	 * When a work item is off queue, its high bits point to the last
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2730,17 +2730,35 @@ EXPORT_SYMBOL_GPL(flush_work);
 
 static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 {
+	wait_queue_head_t *waitq = bit_waitqueue(&work->data,
+						 __WORK_OFFQ_CANCELING);
 	unsigned long flags;
 	int ret;
 
 	do {
 		ret = try_to_grab_pending(work, is_dwork, &flags);
 		/*
-		 * If someone else is canceling, wait for the same event it
-		 * would be waiting for before retrying.
+		 * If someone else is already canceling, wait for it to
+		 * finish.  flush_work() doesn't work for PREEMPT_NONE
+		 * because we may get scheduled between @work's completion
+		 * and the other canceling task resuming and clearing
+		 * CANCELING - flush_work() will return false immediately
+		 * as @work is no longer busy, try_to_grab_pending() will
+		 * return -ENOENT as @work is still being canceled and the
+		 * other canceling task won't be able to clear CANCELING as
+		 * we're hogging the CPU.
+		 *
+		 * Explicitly wait for completion using a bit waitqueue.
+		 * We can't use wait_on_bit() as the CANCELING bit may get
+		 * recycled to point to pwq if @work gets re-queued.
 		 */
-		if (unlikely(ret == -ENOENT))
-			flush_work(work);
+		if (unlikely(ret == -ENOENT)) {
+			DEFINE_WAIT(wait);
+			prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE);
+			if (work_is_canceling(work))
+				schedule();
+			finish_wait(waitq, &wait);
+		}
 	} while (unlikely(ret < 0));
 
 	/* tell other tasks trying to grab @work to back off */
@@ -2749,6 +2767,16 @@ static bool __cancel_work_timer(struct w
 
 	flush_work(work);
 	clear_work_data(work);
+
+	/*
+	 * Paired with prepare_to_wait() above so that either
+	 * waitqueue_active() is visible here or !work_is_canceling() is
+	 * visible there.
+	 */
+	smp_mb();
+	if (waitqueue_active(waitq))
+		wake_up(waitq);
+
 	return ret;
 }
 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH for-3.20-fixes] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE
  2015-02-09 16:15 ` [PATCH for-3.20-fixes] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE Tejun Heo
@ 2015-02-09 16:29   ` Jesper Nilsson
  2015-02-10  8:35     ` Jesper Nilsson
  2015-02-10  9:33   ` Rabin Vincent
  2015-03-02 12:26   ` Jesper Nilsson
  2 siblings, 1 reply; 17+ messages in thread
From: Jesper Nilsson @ 2015-02-09 16:29 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Rabin Vincent, Jesper Nilsson, linux-kernel

On Mon, Feb 09, 2015 at 05:15:27PM +0100, Tejun Heo wrote:
> Hello,

Hi,

> Can you please verify that the following patch fixes the issue?

Rabin will have to report if it fixes it for his synthetic case,
but I'll try it in my "real-world" jffs2 sync problem, and report
after a couple of hours.

> Thanks.

/Jesper

> -------- 8< --------
> cancel[_delayed]_work_sync() are implemented using
> __cancel_work_timer() which grabs the PENDING bit using
> try_to_grab_pending() and then flushes the work item with PENDING set
> to prevent the on-going execution of the work item from requeueing
> itself.
> 
> try_to_grab_pending() can always grab PENDING bit without blocking
> except when someone else is doing the above flushing during
> cancelation.  In that case, try_to_grab_pending() returns -ENOENT.  In
> this case, __cancel_work_timer() currently invokes flush_work().  The
> assumption is that the completion of the work item is what the other
> canceling task would be waiting for too and thus waiting for the same
> condition and retrying should allow forward progress without excessive
> busy looping
> 
> Unfortunately, this doesn't work if preemption is disabled or the
> latter task has real time priority.  Let's say task A just got woken
> up from flush_work() by the completion of the target work item.  If,
> before task A starts executing, task B gets scheduled and invokes
> __cancel_work_timer() on the same work item, its try_to_grab_pending()
> will return -ENOENT as the work item is still being canceled by task A
> and flush_work() will also immediately return false as the work item
> is no longer executing.  This puts task B in a busy loop possibly
> preventing task A from executing and clearing the canceling state on
> the work item leading to a hang.
> 
> task A			task B			worker
> 
> 						executing work
> __cancel_work_timer()
>   try_to_grab_pending()
>   set work CANCELING
>   flush_work()
>     block for work completion
> 						completion, wakes up A
> 			__cancel_work_timer()
> 			while (forever) {
> 			  try_to_grab_pending()
> 			    -ENOENT as work is being canceled
> 			  flush_work()
> 			    false as work is no longer executing
> 			}
> 
> This patch removes the possible hang by updating __cancel_work_timer()
> to explicitly wait for clearing of CANCELING rather than invoking
> flush_work() after try_to_grab_pending() fails with -ENOENT.  The
> explicit wait uses the matching bit waitqueue for the CANCELING bit.
> 
> Link: http://lkml.kernel.org/g/20150206171156.GA8942@axis.com
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-by: Rabin Vincent <rabin.vincent@axis.com>
> Cc: stable@vger.kernel.org
> ---
>  include/linux/workqueue.h |    3 ++-
>  kernel/workqueue.c        |   36 ++++++++++++++++++++++++++++++++----
>  2 files changed, 34 insertions(+), 5 deletions(-)
> 
> --- a/include/linux/workqueue.h
> +++ b/include/linux/workqueue.h
> @@ -70,7 +70,8 @@ enum {
>  	/* data contains off-queue information when !WORK_STRUCT_PWQ */
>  	WORK_OFFQ_FLAG_BASE	= WORK_STRUCT_COLOR_SHIFT,
>  
> -	WORK_OFFQ_CANCELING	= (1 << WORK_OFFQ_FLAG_BASE),
> +	__WORK_OFFQ_CANCELING	= WORK_OFFQ_FLAG_BASE,
> +	WORK_OFFQ_CANCELING	= (1 << __WORK_OFFQ_CANCELING),
>  
>  	/*
>  	 * When a work item is off queue, its high bits point to the last
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -2730,17 +2730,35 @@ EXPORT_SYMBOL_GPL(flush_work);
>  
>  static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
>  {
> +	wait_queue_head_t *waitq = bit_waitqueue(&work->data,
> +						 __WORK_OFFQ_CANCELING);
>  	unsigned long flags;
>  	int ret;
>  
>  	do {
>  		ret = try_to_grab_pending(work, is_dwork, &flags);
>  		/*
> -		 * If someone else is canceling, wait for the same event it
> -		 * would be waiting for before retrying.
> +		 * If someone else is already canceling, wait for it to
> +		 * finish.  flush_work() doesn't work for PREEMPT_NONE
> +		 * because we may get scheduled between @work's completion
> +		 * and the other canceling task resuming and clearing
> +		 * CANCELING - flush_work() will return false immediately
> +		 * as @work is no longer busy, try_to_grab_pending() will
> +		 * return -ENOENT as @work is still being canceled and the
> +		 * other canceling task won't be able to clear CANCELING as
> +		 * we're hogging the CPU.
> +		 *
> +		 * Explicitly wait for completion using a bit waitqueue.
> +		 * We can't use wait_on_bit() as the CANCELING bit may get
> +		 * recycled to point to pwq if @work gets re-queued.
>  		 */
> -		if (unlikely(ret == -ENOENT))
> -			flush_work(work);
> +		if (unlikely(ret == -ENOENT)) {
> +			DEFINE_WAIT(wait);
> +			prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE);
> +			if (work_is_canceling(work))
> +				schedule();
> +			finish_wait(waitq, &wait);
> +		}
>  	} while (unlikely(ret < 0));
>  
>  	/* tell other tasks trying to grab @work to back off */
> @@ -2749,6 +2767,16 @@ static bool __cancel_work_timer(struct w
>  
>  	flush_work(work);
>  	clear_work_data(work);
> +
> +	/*
> +	 * Paired with prepare_to_wait() above so that either
> +	 * waitqueue_active() is visible here or !work_is_canceling() is
> +	 * visible there.
> +	 */
> +	smp_mb();
> +	if (waitqueue_active(waitq))
> +		wake_up(waitq);
> +
>  	return ret;
>  }
>  

/^JN - Jesper Nilsson
-- 
               Jesper Nilsson -- jesper.nilsson@axis.com

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH for-3.20-fixes] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE
  2015-02-09 16:29   ` Jesper Nilsson
@ 2015-02-10  8:35     ` Jesper Nilsson
  0 siblings, 0 replies; 17+ messages in thread
From: Jesper Nilsson @ 2015-02-10  8:35 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Rabin Vincent, Jesper Nilsson, linux-kernel

On Mon, Feb 09, 2015 at 05:29:55PM +0100, Jesper Nilsson wrote:
> On Mon, Feb 09, 2015 at 05:15:27PM +0100, Tejun Heo wrote:
> > Hello,
> 
> Hi,
> 
> > Can you please verify that the following patch fixes the issue?
> 
> Rabin will have to report if it fixes it for his synthetic case,
> but I'll try it in my "real-world" jffs2 sync problem, and report
> after a couple of hours.

I let the test run for the night, and got no failures.
Before I got at most an hour of tests before lockup.

Tested-by: Jesper Nilsson <jesper.nilsson@axis.com>

/^JN - Jesper Nilsson
-- 
               Jesper Nilsson -- jesper.nilsson@axis.com

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH for-3.20-fixes] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE
  2015-02-09 16:15 ` [PATCH for-3.20-fixes] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE Tejun Heo
  2015-02-09 16:29   ` Jesper Nilsson
@ 2015-02-10  9:33   ` Rabin Vincent
  2015-03-02 12:26   ` Jesper Nilsson
  2 siblings, 0 replies; 17+ messages in thread
From: Rabin Vincent @ 2015-02-10  9:33 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jesper Nilsson, linux-kernel

On Mon, Feb 09, 2015 at 05:15:27PM +0100, Tejun Heo wrote:
> This patch removes the possible hang by updating __cancel_work_timer()
> to explicitly wait for clearing of CANCELING rather than invoking
> flush_work() after try_to_grab_pending() fails with -ENOENT.  The
> explicit wait uses the matching bit waitqueue for the CANCELING bit.
> 
> Link: http://lkml.kernel.org/g/20150206171156.GA8942@axis.com
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>

Fixes my synthetic test:

Tested-by: Rabin Vincent <rabin.vincent@axis.com>

Thanks.

/Rabin

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH for-3.20-fixes] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE
  2015-02-09 16:15 ` [PATCH for-3.20-fixes] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE Tejun Heo
  2015-02-09 16:29   ` Jesper Nilsson
  2015-02-10  9:33   ` Rabin Vincent
@ 2015-03-02 12:26   ` Jesper Nilsson
  2015-03-02 16:21     ` Tejun Heo
  2 siblings, 1 reply; 17+ messages in thread
From: Jesper Nilsson @ 2015-03-02 12:26 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Rabin Vincent, Jesper Nilsson, linux-kernel

On Mon, Feb 09, 2015 at 05:15:27PM +0100, Tejun Heo wrote:
> Hello,

Hi!

> This patch removes the possible hang by updating __cancel_work_timer()
> to explicitly wait for clearing of CANCELING rather than invoking
> flush_work() after try_to_grab_pending() fails with -ENOENT.  The
> explicit wait uses the matching bit waitqueue for the CANCELING bit.
> 
> Link: http://lkml.kernel.org/g/20150206171156.GA8942@axis.com
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-by: Rabin Vincent <rabin.vincent@axis.com>
> Cc: stable@vger.kernel.org

What's the status on this patch, it's not in 4.0-rc1 at least?
Is it queued for the 3.18 stable branch?

Best regards,

/^JN - Jesper Nilsson
-- 
               Jesper Nilsson -- jesper.nilsson@axis.com

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH for-3.20-fixes] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE
  2015-03-02 12:26   ` Jesper Nilsson
@ 2015-03-02 16:21     ` Tejun Heo
  2015-03-03  7:52       ` Jesper Nilsson
  2015-03-03 10:00       ` Tomeu Vizoso
  0 siblings, 2 replies; 17+ messages in thread
From: Tejun Heo @ 2015-03-02 16:21 UTC (permalink / raw)
  To: Jesper Nilsson; +Cc: Rabin Vincent, Jesper Nilsson, linux-kernel

On Mon, Mar 02, 2015 at 01:26:15PM +0100, Jesper Nilsson wrote:
> On Mon, Feb 09, 2015 at 05:15:27PM +0100, Tejun Heo wrote:
> > Hello,
> 
> Hi!
> 
> > This patch removes the possible hang by updating __cancel_work_timer()
> > to explicitly wait for clearing of CANCELING rather than invoking
> > flush_work() after try_to_grab_pending() fails with -ENOENT.  The
> > explicit wait uses the matching bit waitqueue for the CANCELING bit.
> > 
> > Link: http://lkml.kernel.org/g/20150206171156.GA8942@axis.com
> > 
> > Signed-off-by: Tejun Heo <tj@kernel.org>
> > Reported-by: Rabin Vincent <rabin.vincent@axis.com>
> > Cc: stable@vger.kernel.org
> 
> What's the status on this patch, it's not in 4.0-rc1 at least?
> Is it queued for the 3.18 stable branch?

Sorry about the delay.  Applied to wq/for-4.0-fixes.  Will push out in
a week or so.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH for-3.20-fixes] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE
  2015-03-02 16:21     ` Tejun Heo
@ 2015-03-03  7:52       ` Jesper Nilsson
  2015-03-03 10:00       ` Tomeu Vizoso
  1 sibling, 0 replies; 17+ messages in thread
From: Jesper Nilsson @ 2015-03-03  7:52 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jesper Nilsson, Rabin Vincent, Jesper Nilsson, linux-kernel

On Mon, Mar 02, 2015 at 11:21:44AM -0500, Tejun Heo wrote:
> On Mon, Mar 02, 2015 at 01:26:15PM +0100, Jesper Nilsson wrote:
> > On Mon, Feb 09, 2015 at 05:15:27PM +0100, Tejun Heo wrote:
> > > Hello,
> > 
> > Hi!
> > 
> > > This patch removes the possible hang by updating __cancel_work_timer()
> > > to explicitly wait for clearing of CANCELING rather than invoking
> > > flush_work() after try_to_grab_pending() fails with -ENOENT.  The
> > > explicit wait uses the matching bit waitqueue for the CANCELING bit.
> > > 
> > > Link: http://lkml.kernel.org/g/20150206171156.GA8942@axis.com
> > > 
> > > Signed-off-by: Tejun Heo <tj@kernel.org>
> > > Reported-by: Rabin Vincent <rabin.vincent@axis.com>
> > > Cc: stable@vger.kernel.org
> > 
> > What's the status on this patch, it's not in 4.0-rc1 at least?
> > Is it queued for the 3.18 stable branch?
> 
> Sorry about the delay.  Applied to wq/for-4.0-fixes.  Will push out in
> a week or so.

Thanks, that works great. Just wanted to know. :-)

> Thanks.
> 
> -- 
> tejun

/^JN - Jesper Nilsson
-- 
               Jesper Nilsson -- jesper.nilsson@axis.com

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH for-3.20-fixes] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE
  2015-03-02 16:21     ` Tejun Heo
  2015-03-03  7:52       ` Jesper Nilsson
@ 2015-03-03 10:00       ` Tomeu Vizoso
  2015-03-03 13:21         ` Tejun Heo
  1 sibling, 1 reply; 17+ messages in thread
From: Tomeu Vizoso @ 2015-03-03 10:00 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jesper Nilsson, Rabin Vincent, Jesper Nilsson, linux-kernel

On 2 March 2015 at 17:21, Tejun Heo <tj@kernel.org> wrote:
> On Mon, Mar 02, 2015 at 01:26:15PM +0100, Jesper Nilsson wrote:
>> On Mon, Feb 09, 2015 at 05:15:27PM +0100, Tejun Heo wrote:
>> > Hello,
>>
>> Hi!
>>
>> > This patch removes the possible hang by updating __cancel_work_timer()
>> > to explicitly wait for clearing of CANCELING rather than invoking
>> > flush_work() after try_to_grab_pending() fails with -ENOENT.  The
>> > explicit wait uses the matching bit waitqueue for the CANCELING bit.
>> >
>> > Link: http://lkml.kernel.org/g/20150206171156.GA8942@axis.com
>> >
>> > Signed-off-by: Tejun Heo <tj@kernel.org>
>> > Reported-by: Rabin Vincent <rabin.vincent@axis.com>
>> > Cc: stable@vger.kernel.org
>>
>> What's the status on this patch, it's not in 4.0-rc1 at least?
>> Is it queued for the 3.18 stable branch?
>
> Sorry about the delay.  Applied to wq/for-4.0-fixes.  Will push out in
> a week or so.

Hello,

I'm getting this during almost every boot this morning, after rebasing
on today's linux-next. Reverting this patch makes the issue go away.
This has been tested on a Tegra 124-based Acer Chromebook 13, running
a Debian derivative (I mention this because I see that in some test
farms the boot succeeded on similar hw, but they probably have a
simpler userspace, e.g. !systemd).

[    7.358239] Unable to handle kernel NULL pointer dereference at
virtual address 00000000
[    7.368225] pgd = c0204000
[    7.372693] [00000000] *pgd=00000000
[    7.378031] Internal error: Oops: 17 [#1] SMP ARM
[    7.384486] Modules linked in: ipv6
[    7.389738] CPU: 1 PID: 110 Comm: kworker/1:2 Not tainted
4.0.0-rc1-next-20150303ccu #568
[    7.399687] Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)
[    7.407736] Workqueue: cgroup_destroy css_free_work_fn
[    7.414645] task: ecfe8e40 ti: eb9e0000 task.ti: eb9e0000
[    7.421803] PC is at wake_bit_function+0x18/0x6c
[    7.428168] LR is at __wake_up_common+0x5c/0x90
[    7.434433] pc : [<c028f54c>]    lr : [<c028ed3c>]    psr: 200f0093
[    7.434433] sp : eb9e1df0  ip : eb9e1e08  fp : eb9e1e04
[    7.449379] r10: 00000001  r9 : 00000003  r8 : 00000000
[    7.456331] r7 : 00000000  r6 : ee837a28  r5 : 00000001  r4 : eeda8ee0
[    7.464580] r3 : 00000000  r2 : 00000000  r1 : 00000003  r0 : ebb19df4
[    7.472825] Flags: nzCv  IRQs off  FIQs on  Mode SVC_32  ISA ARM
Segment kernel
[    7.481952] Control: 10c5387d  Table: abb9406a  DAC: 00000015
[    7.489427] Process kworker/1:2 (pid: 110, stack limit = 0xeb9e0220)
[    7.497524] Stack: (0xeb9e1df0 to 0xeb9e2000)
[    7.503616] 1de0:                                     ee837a1c
00000001 eb9e1e34 eb9e1e08
[    7.513549] 1e00: c028ed3c c028f540 00000000 ee837a24 800f0013
00000000 00000001 00000003
[    7.523490] 1e20: 00000000 00000000 eb9e1e64 eb9e1e38 c028ef88
c028ecec 00000000 c026d274
[    7.533438] 1e40: eb9e1e74 00000011 eb932fb4 00000000 ee837a24
c028f4f0 eb9e1eac eb9e1e68
[    7.543390] 1e60: c026fdfc c028ef4c 600f0013 eb9e1e88 eb9e1e88
eb9e1e74 eb9e1e74 00000006
[    7.553357] 1e80: 00000000 ebaf2a80 eb932f88 eb932f00 eb932f90
c11f5638 00000000 ee7f7005
[    7.563329] 1ea0: eb9e1ebc eb9e1eb0 c026fedc c026fd20 eb9e1ee4
eb9e1ec0 c02ce3bc c026fecc
[    7.573310] 1ec0: eb932f50 ecf81080 c11b0338 ee7f33c0 ee7f7000
00000000 eb9e1f24 eb9e1ee8
[    7.583296] 1ee0: c026f4d0 c02ce254 ee7f33c0 ee7f33d4 eb9e0000
00000000 ecf81080 ee7f33c0
[    7.593289] 1f00: ecf81098 ee7f33d4 eb9e0000 00000008 ecf81080
ee7f33c0 eb9e1f5c eb9e1f28
[    7.603295] 1f20: c026ff6c c026f380 c026ff18 c10ae100 00000000
00000000 eb9c0180 ecf81080
[    7.613305] 1f40: c026ff18 00000000 00000000 00000000 eb9e1fac
eb9e1f60 c02749d8 c026ff24
[    7.623321] 1f60: 00000000 00000000 00000000 ecf81080 00000000
00000000 eb9e1f78 eb9e1f78
[    7.633337] 1f80: 00000000 00000000 eb9e1f88 eb9e1f88 eb9c0180
c02748ec 00000000 00000000
[    7.643330] 1fa0: 00000000 eb9e1fb0 c0210aa0 c02748f8 00000000
00000000 00000000 00000000
[    7.653299] 1fc0: 00000000 00000000 00000000 00000000 00000000
00000000 00000000 00000000
[    7.663248] 1fe0: 00000000 00000000 00000000 00000000 00000013
00000000 00000000 00000000
[    7.673183] [<c028f54c>] (wake_bit_function) from [<c028ed3c>]
(__wake_up_common+0x5c/0x90)
[    7.683300] [<c028ed3c>] (__wake_up_common) from [<c028ef88>]
(__wake_up+0x48/0x5c)
[    7.692744] [<c028ef88>] (__wake_up) from [<c026fdfc>]
(__cancel_work_timer+0xe8/0x1ac)
[    7.702533] [<c026fdfc>] (__cancel_work_timer) from [<c026fedc>]
(cancel_work_sync+0x1c/0x20)
[    7.712854] [<c026fedc>] (cancel_work_sync) from [<c02ce3bc>]
(css_free_work_fn+0x174/0x2ec)
[    7.723099] [<c02ce3bc>] (css_free_work_fn) from [<c026f4d0>]
(process_one_work+0x15c/0x3dc)
[    7.733339] [<c026f4d0>] (process_one_work) from [<c026ff6c>]
(worker_thread+0x54/0x4e8)
[    7.743224] [<c026ff6c>] (worker_thread) from [<c02749d8>]
(kthread+0xec/0x104)
[    7.752339] [<c02749d8>] (kthread) from [<c0210aa0>]
(ret_from_fork+0x14/0x34)
[    7.761366] Code: e24cb004 e52de004 e8bd4000 e510400c (e5935000)
[    7.769273] ---[ end trace f25fc65c3d66034c ]---
[    7.778675] Unable to handle kernel paging request at virtual
address ffffffec
[    7.787735] pgd = c0204000
[    7.792274] [ffffffec] *pgd=af7fd821, *pte=00000000, *ppte=00000000
[    7.800424] Internal error: Oops: 17 [#2] SMP ARM
[    7.806970] Modules linked in: ipv6
[    7.812307] CPU: 1 PID: 110 Comm: kworker/1:2 Tainted: G      D
    4.0.0-rc1-next-20150303ccu #568
[    7.823549] Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)
[    7.831683] task: ecfe8e40 ti: eb9e0000 task.ti: eb9e0000
[    7.838946] PC is at kthread_data+0x18/0x20
[    7.844997] LR is at wq_worker_sleeping+0x1c/0xe0
[    7.851562] pc : [<c02750a4>]    lr : [<c02704ac>]    psr: 00070093
[    7.851562] sp : eb9e1b38  ip : eb9e1b48  fp : eb9e1b44
[    7.866774] r10: 2d74a000  r9 : ecfe90d4  r8 : c10aedd4
[    7.873862] r7 : c10a9840  r6 : c10a9840  r5 : ecfe8e40  r4 : 00000001
[    7.882253] r3 : 00000000  r2 : 00000000  r1 : 00000001  r0 : ecfe8e40
[    7.890642] Flags: nzcv  IRQs off  FIQs on  Mode SVC_32  ISA ARM
Segment user
[    7.899735] Control: 10c5387d  Table: ab82806a  DAC: 00000015
[    7.907359] Process kworker/1:2 (pid: 110, stack limit = 0xeb9e0220)
[    7.915604] Stack: (0xeb9e1b38 to 0xeb9e2000)
[    7.921836] 1b20:
    eb9e1b5c eb9e1b48
[    7.931894] 1b40: c02704ac c0275098 ee7f3840 ecfe8e40 eb9e1ba4
eb9e1b60 c0ad3a64 c027049c
[    7.941971] 1b60: eb9e1bbc eb9e1b70 c025a2cc c02a81ac 00000000
00000001 ecfe6e08 eb9e0000
[    7.952061] 1b80: eb9e199c eb9e1bc8 ecfe9050 00000001 c028f550
ec920000 eb9e1bbc eb9e1ba8
[    7.962152] 1ba0: c0ad3cfc c0ad3700 0420806c ecfe8e40 eb9e1bfc
eb9e1bc0 c025a9b8 c0ad3cbc
[    7.972247] 1bc0: eb9e1bec c10f0ffc eb9e1bc8 eb9e1bc8 eb9e1da8
c11ca184 c10b32e4 eb9e1da8
[    7.982331] 1be0: 600f0193 0000000b c028f550 00000001 eb9e1c84
eb9e1c00 c0215058 c025a368
[    7.992414] 1c00: eb9e0220 0000000b c0d7bc6c c0d7bc64 00000008
00000000 00000000 c10b32e4
[    8.002501] 1c20: 6529b270 62633432 20343030 64323565 34303065
62386520 30303464 35652030
[    8.012600] 1c40: 30343031 28206330 33393565 30303035 c0002029
c0ad17e4 c0e54bcc 00000000
[    8.022715] 1c60: 00000017 eb9e1da8 00000000 00000000 00000003
eb9e1da8 eb9e1c9c eb9e1c88
[    8.032832] 1c80: c0ad0df4 c0214c08 eb9e1da8 ecfe8e40 eb9e1cf4
eb9e1ca0 c0221408 c0ad0d8c
[    8.042955] 1ca0: ecfe8e40 c10aedd4 ec9ab440 ecfe8e88 ee7f3840
c0285c50 ee7f3880 ec9ab490
[    8.053090] 1cc0: eb9e1ce4 eb9e1cd0 c0284f88 c10b3864 00000017
c0221190 00000000 eb9e1da8
[    8.063238] 1ce0: 00000003 00000001 eb9e1da4 eb9e1cf8 c02091e8
c022119c c10a9840 c0275894
[    8.073402] 1d00: ecb5801c c0275894 eb9e1d3c eb9e1d18 c0275894
c0219c3c ecb5801c ecfe8e40
[    8.083560] 1d20: ec9ab440 00000000 eb81ca80 c0ad3904 ee7f3840
ecfe8e40 ec9ab440 00000000
[    8.093711] 1d40: eb81ca80 ecfe90d0 eb9e1d9c eb9e1d58 c0ad3904
c027b70c 00000000 000003ff
[    8.103866] 1d60: 00000000 00000001 2d74a000 ee7f3840 00000000
eb9e0000 eb9e0000 eb9e1e84
[    8.114026] 1d80: 00000002 c028f54c 200f0093 ffffffff eb9e1ddc
00000000 eb9e1e04 eb9e1da8
[    8.124175] 1da0: c02157d8 c02091ac ebb19df4 00000003 00000000
00000000 eeda8ee0 00000001
[    8.134322] 1dc0: ee837a28 00000000 00000000 00000003 00000001
eb9e1e04 eb9e1e08 eb9e1df0
[    8.144464] 1de0: c028ed3c c028f54c 200f0093 ffffffff ee837a1c
00000001 eb9e1e34 eb9e1e08
[    8.154608] 1e00: c028ed3c c028f540 00000000 ee837a24 800f0013
00000000 00000001 00000003
[    8.164744] 1e20: 00000000 00000000 eb9e1e64 eb9e1e38 c028ef88
c028ecec 00000000 c026d274
[    8.174879] 1e40: eb9e1e74 00000011 eb932fb4 00000000 ee837a24
c028f4f0 eb9e1eac eb9e1e68
[    8.185012] 1e60: c026fdfc c028ef4c 600f0013 eb9e1e88 eb9e1e88
eb9e1e74 eb9e1e74 00000006
[    8.195145] 1e80: 00000000 ebaf2a80 eb932f88 eb932f00 eb932f90
c11f5638 00000000 ee7f7005
[    8.205283] 1ea0: eb9e1ebc eb9e1eb0 c026fedc c026fd20 eb9e1ee4
eb9e1ec0 c02ce3bc c026fecc
[    8.215423] 1ec0: eb932f50 ecf81080 c11b0338 ee7f33c0 ee7f7000
00000000 eb9e1f24 eb9e1ee8
[    8.225563] 1ee0: c026f4d0 c02ce254 ee7f33c0 ee7f33d4 eb9e0000
00000000 ecf81080 ee7f33c0
[    8.235696] 1f00: ecf81098 ee7f33d4 eb9e0000 00000008 ecf81080
ee7f33c0 eb9e1f5c eb9e1f28
[    8.245837] 1f20: c026ff6c c026f380 c026ff18 c10ae100 00000000
00000000 eb9c0180 ecf81080
[    8.255986] 1f40: c026ff18 00000000 00000000 00000000 eb9e1fac
eb9e1f60 c02749d8 c026ff24
[    8.266143] 1f60: 00000000 00000000 00000000 ecf81080 00000000
00000000 eb9e1f78 eb9e1f78
[    8.276310] 1f80: 00000001 00010001 eb9e1f88 eb9e1f88 eb9c0180
c02748ec 00000000 00000000
[    8.286498] 1fa0: 00000000 eb9e1fb0 c0210aa0 c02748f8 00000000
00000000 00000000 00000000
[    8.296672] 1fc0: 00000000 00000000 00000000 00000000 00000000
00000000 00000000 00000000
[    8.306828] 1fe0: 00000000 00000000 00000000 00000000 00000013
00000000 00000000 00000000
[    8.316967] [<c02750a4>] (kthread_data) from [<c02704ac>]
(wq_worker_sleeping+0x1c/0xe0)
[    8.327023] [<c02704ac>] (wq_worker_sleeping) from [<c0ad3a64>]
(__schedule+0x370/0x5bc)
[    8.337072] [<c0ad3a64>] (__schedule) from [<c0ad3cfc>] (schedule+0x4c/0xa4)
[    8.346085] [<c0ad3cfc>] (schedule) from [<c025a9b8>] (do_exit+0x65c/0x960)
[    8.355013] [<c025a9b8>] (do_exit) from [<c0215058>] (die+0x45c/0x474)
[    8.363498] [<c0215058>] (die) from [<c0ad0df4>]
(__do_kernel_fault.part.10+0x74/0x84)
[    8.373386] [<c0ad0df4>] (__do_kernel_fault.part.10) from
[<c0221408>] (do_page_fault+0x278/0x388)
[    8.384322] [<c0221408>] (do_page_fault) from [<c02091e8>]
(do_DataAbort+0x48/0xa8)
[    8.393960] [<c02091e8>] (do_DataAbort) from [<c02157d8>]
(__dabt_svc+0x38/0x60)
[    8.403332] Exception stack(0xeb9e1da8 to 0xeb9e1df0)
[    8.410356] 1da0:                   ebb19df4 00000003 00000000
00000000 eeda8ee0 00000001
[    8.420519] 1dc0: ee837a28 00000000 00000000 00000003 00000001
eb9e1e04 eb9e1e08 eb9e1df0
[    8.430686] 1de0: c028ed3c c028f54c 200f0093 ffffffff
[    8.437733] [<c02157d8>] (__dabt_svc) from [<c028f54c>]
(wake_bit_function+0x18/0x6c)
[    8.447569] [<c028f54c>] (wake_bit_function) from [<c028ed3c>]
(__wake_up_common+0x5c/0x90)
[    8.457917] [<c028ed3c>] (__wake_up_common) from [<c028ef88>]
(__wake_up+0x48/0x5c)
[    8.467579] [<c028ef88>] (__wake_up) from [<c026fdfc>]
(__cancel_work_timer+0xe8/0x1ac)
[    8.477591] [<c026fdfc>] (__cancel_work_timer) from [<c026fedc>]
(cancel_work_sync+0x1c/0x20)
[    8.488145] [<c026fedc>] (cancel_work_sync) from [<c02ce3bc>]
(css_free_work_fn+0x174/0x2ec)
[    8.498620] [<c02ce3bc>] (css_free_work_fn) from [<c026f4d0>]
(process_one_work+0x15c/0x3dc)
[    8.509093] [<c026f4d0>] (process_one_work) from [<c026ff6c>]
(worker_thread+0x54/0x4e8)
[    8.519227] [<c026ff6c>] (worker_thread) from [<c02749d8>]
(kthread+0xec/0x104)
[    8.528606] [<c02749d8>] (kthread) from [<c0210aa0>]
(ret_from_fork+0x14/0x34)
[    8.537899] Code: e24cb004 e52de004 e8bd4000 e5903268 (e5130014)
[    8.546045] ---[ end trace f25fc65c3d66034d ]---
[    8.552705] Fixing recursive fault but reboot is needed!

Regards,

Tomeu

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH for-3.20-fixes] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE
  2015-03-03 10:00       ` Tomeu Vizoso
@ 2015-03-03 13:21         ` Tejun Heo
  2015-03-03 13:45           ` [PATCH wq/for-4.0-fixes v2] " Tejun Heo
  0 siblings, 1 reply; 17+ messages in thread
From: Tejun Heo @ 2015-03-03 13:21 UTC (permalink / raw)
  To: Tomeu Vizoso; +Cc: Jesper Nilsson, Rabin Vincent, Jesper Nilsson, linux-kernel

Hello, Tomeu.

On Tue, Mar 03, 2015 at 11:00:43AM +0100, Tomeu Vizoso wrote:
...
> [    7.421803] PC is at wake_bit_function+0x18/0x6c
> [    7.428168] LR is at __wake_up_common+0x5c/0x90
...
> [    7.673183] [<c028f54c>] (wake_bit_function) from [<c028ed3c>] (__wake_up_common+0x5c/0x90)
> [    7.683300] [<c028ed3c>] (__wake_up_common) from [<c028ef88>] (__wake_up+0x48/0x5c)
> [    7.692744] [<c028ef88>] (__wake_up) from [<c026fdfc>] (__cancel_work_timer+0xe8/0x1ac)
> [    7.702533] [<c026fdfc>] (__cancel_work_timer) from [<c026fedc>] (cancel_work_sync+0x1c/0x20)
> [    7.712854] [<c026fedc>] (cancel_work_sync) from [<c02ce3bc>] (css_free_work_fn+0x174/0x2ec)
> [    7.723099] [<c02ce3bc>] (css_free_work_fn) from [<c026f4d0>] (process_one_work+0x15c/0x3dc)
> [    7.733339] [<c026f4d0>] (process_one_work) from [<c026ff6c>] (worker_thread+0x54/0x4e8)
> [    7.743224] [<c026ff6c>] (worker_thread) from [<c02749d8>] (kthread+0xec/0x104)
> [    7.752339] [<c02749d8>] (kthread) from [<c0210aa0>] (ret_from_fork+0x14/0x34)
> [    7.761366] Code: e24cb004 e52de004 e8bd4000 e510400c (e5935000)

Hah, weird.  How is your machine ending up in wake_bit_function() from
there?  Can you please apply the following patch and report the dmesg
after crash?

Thanks.

---
 kernel/workqueue.c |   16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2755,8 +2755,12 @@ static bool __cancel_work_timer(struct w
 		if (unlikely(ret == -ENOENT)) {
 			DEFINE_WAIT(wait);
 			prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE);
-			if (work_is_canceling(work))
+			WARN_ON_ONCE(wait.func != autoremove_wake_function);
+			if (work_is_canceling(work)) {
+				printk("XXX __cancel_work_timer: %d sleeping w/ %pf, wait=%p\n",
+				       task_pid_nr(current), wait.func, &wait);
 				schedule();
+			}
 			finish_wait(waitq, &wait);
 		}
 	} while (unlikely(ret < 0));
@@ -2774,8 +2778,16 @@ static bool __cancel_work_timer(struct w
 	 * visible there.
 	 */
 	smp_mb();
-	if (waitqueue_active(waitq))
+	if (waitqueue_active(waitq)) {
+		wait_queue_t *cur;
+
+		spin_lock_irq(&waitq->lock);
+		list_for_each_entry(cur, &waitq->task_list, task_list)
+			printk("XXX __cancel_work_timer: waking up %pf, wait=%p\n",
+			       cur->func, cur);
+		spin_unlock_irq(&waitq->lock);
 		wake_up(waitq);
+	}
 
 	return ret;
 }


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH wq/for-4.0-fixes v2] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE
  2015-03-03 13:21         ` Tejun Heo
@ 2015-03-03 13:45           ` Tejun Heo
  2015-03-03 13:57             ` Tomeu Vizoso
  0 siblings, 1 reply; 17+ messages in thread
From: Tejun Heo @ 2015-03-03 13:45 UTC (permalink / raw)
  To: Tomeu Vizoso; +Cc: Jesper Nilsson, Rabin Vincent, Jesper Nilsson, linux-kernel

Hello,

Found the culprit.  Plain wake_up() shouldn't be used on
bit_waitqueues.  The following patch should fix the issue.  I replaced
the patch in the wq branches.

Thanks a lot.

----- 8< -----
From bba5a377507efef69214108c0d3790cadfab21d4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 3 Mar 2015 08:43:09 -0500

cancel[_delayed]_work_sync() are implemented using
__cancel_work_timer() which grabs the PENDING bit using
try_to_grab_pending() and then flushes the work item with PENDING set
to prevent the on-going execution of the work item from requeueing
itself.

try_to_grab_pending() can always grab PENDING bit without blocking
except when someone else is doing the above flushing during
cancelation.  In that case, try_to_grab_pending() returns -ENOENT.  In
this case, __cancel_work_timer() currently invokes flush_work().  The
assumption is that the completion of the work item is what the other
canceling task would be waiting for too and thus waiting for the same
condition and retrying should allow forward progress without excessive
busy looping.

Unfortunately, this doesn't work if preemption is disabled or the
latter task has real time priority.  Let's say task A just got woken
up from flush_work() by the completion of the target work item.  If,
before task A starts executing, task B gets scheduled and invokes
__cancel_work_timer() on the same work item, its try_to_grab_pending()
will return -ENOENT as the work item is still being canceled by task A
and flush_work() will also immediately return false as the work item
is no longer executing.  This puts task B in a busy loop possibly
preventing task A from executing and clearing the canceling state on
the work item leading to a hang.

task A			task B			worker

						executing work
__cancel_work_timer()
  try_to_grab_pending()
  set work CANCELING
  flush_work()
    block for work completion
						completion, wakes up A
			__cancel_work_timer()
			while (forever) {
			  try_to_grab_pending()
			    -ENOENT as work is being canceled
			  flush_work()
			    false as work is no longer executing
			}

This patch removes the possible hang by updating __cancel_work_timer()
to explicitly wait for clearing of CANCELING rather than invoking
flush_work() after try_to_grab_pending() fails with -ENOENT.  The
explicit wait uses the matching bit waitqueue for the CANCELING bit.

Link: http://lkml.kernel.org/g/20150206171156.GA8942@axis.com

v2: v1 used wake_up() on bit_waitqueue() which leads to NULL deref if
    the target bit waitqueue has wait_bit_queue's on it.  Use
    DEFINE_WAIT_BIT() and __wake_up_bit() instead.  Reported by Tomeu
    Vizoso.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Rabin Vincent <rabin.vincent@axis.com>
Cc: Tomeu Vizoso <tomeu.vizoso@gmail.com>
Cc: stable@vger.kernel.org
Tested-by: Jesper Nilsson <jesper.nilsson@axis.com>
Tested-by: Rabin Vincent <rabin.vincent@axis.com>
---
 include/linux/workqueue.h |  3 ++-
 kernel/workqueue.c        | 37 +++++++++++++++++++++++++++++++++----
 2 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 74db135..f597846 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -70,7 +70,8 @@ enum {
 	/* data contains off-queue information when !WORK_STRUCT_PWQ */
 	WORK_OFFQ_FLAG_BASE	= WORK_STRUCT_COLOR_SHIFT,
 
-	WORK_OFFQ_CANCELING	= (1 << WORK_OFFQ_FLAG_BASE),
+	__WORK_OFFQ_CANCELING	= WORK_OFFQ_FLAG_BASE,
+	WORK_OFFQ_CANCELING	= (1 << __WORK_OFFQ_CANCELING),
 
 	/*
 	 * When a work item is off queue, its high bits point to the last
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f288493..cfbae1d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2730,17 +2730,37 @@ EXPORT_SYMBOL_GPL(flush_work);
 
 static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 {
+	wait_queue_head_t *waitq = bit_waitqueue(&work->data,
+						 __WORK_OFFQ_CANCELING);
 	unsigned long flags;
 	int ret;
 
 	do {
 		ret = try_to_grab_pending(work, is_dwork, &flags);
 		/*
-		 * If someone else is canceling, wait for the same event it
-		 * would be waiting for before retrying.
+		 * If someone else is already canceling, wait for it to
+		 * finish.  flush_work() doesn't work for PREEMPT_NONE
+		 * because we may get scheduled between @work's completion
+		 * and the other canceling task resuming and clearing
+		 * CANCELING - flush_work() will return false immediately
+		 * as @work is no longer busy, try_to_grab_pending() will
+		 * return -ENOENT as @work is still being canceled and the
+		 * other canceling task won't be able to clear CANCELING as
+		 * we're hogging the CPU.
+		 *
+		 * Explicitly wait for completion using a bit waitqueue.
+		 * We can't use wait_on_bit() as the CANCELING bit may get
+		 * recycled to point to pwq if @work gets re-queued.
 		 */
-		if (unlikely(ret == -ENOENT))
-			flush_work(work);
+		if (unlikely(ret == -ENOENT)) {
+			DEFINE_WAIT_BIT(wait, &work->data,
+					__WORK_OFFQ_CANCELING);
+			prepare_to_wait(waitq, &wait.wait,
+					TASK_UNINTERRUPTIBLE);
+			if (work_is_canceling(work))
+				schedule();
+			finish_wait(waitq, &wait.wait);
+		}
 	} while (unlikely(ret < 0));
 
 	/* tell other tasks trying to grab @work to back off */
@@ -2749,6 +2769,15 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 
 	flush_work(work);
 	clear_work_data(work);
+
+	/*
+	 * Paired with prepare_to_wait() above so that either
+	 * __wake_up_bit() sees busy waitq here or !work_is_canceling() is
+	 * visible there.
+	 */
+	smp_mb();
+	__wake_up_bit(waitq, &work->data, __WORK_OFFQ_CANCELING);
+
 	return ret;
 }
 
-- 
2.1.0


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH wq/for-4.0-fixes v2] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE
  2015-03-03 13:45           ` [PATCH wq/for-4.0-fixes v2] " Tejun Heo
@ 2015-03-03 13:57             ` Tomeu Vizoso
  2015-03-05  9:24               ` Tomeu Vizoso
  0 siblings, 1 reply; 17+ messages in thread
From: Tomeu Vizoso @ 2015-03-03 13:57 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jesper Nilsson, Rabin Vincent, Jesper Nilsson, linux-kernel

On 3 March 2015 at 14:45, Tejun Heo <tj@kernel.org> wrote:
> Hello,
>
> Found the culprit.  Plain wake_up() shouldn't be used on
> bit_waitqueues.  The following patch should fix the issue.  I replaced
> the patch in the wq branches.

Yup, this looks good from here.

Thanks,

Tomeu

> Thanks a lot.
>
> ----- 8< -----
> From bba5a377507efef69214108c0d3790cadfab21d4 Mon Sep 17 00:00:00 2001
> From: Tejun Heo <tj@kernel.org>
> Date: Tue, 3 Mar 2015 08:43:09 -0500
>
> cancel[_delayed]_work_sync() are implemented using
> __cancel_work_timer() which grabs the PENDING bit using
> try_to_grab_pending() and then flushes the work item with PENDING set
> to prevent the on-going execution of the work item from requeueing
> itself.
>
> try_to_grab_pending() can always grab PENDING bit without blocking
> except when someone else is doing the above flushing during
> cancelation.  In that case, try_to_grab_pending() returns -ENOENT.  In
> this case, __cancel_work_timer() currently invokes flush_work().  The
> assumption is that the completion of the work item is what the other
> canceling task would be waiting for too and thus waiting for the same
> condition and retrying should allow forward progress without excessive
> busy looping.
>
> Unfortunately, this doesn't work if preemption is disabled or the
> latter task has real time priority.  Let's say task A just got woken
> up from flush_work() by the completion of the target work item.  If,
> before task A starts executing, task B gets scheduled and invokes
> __cancel_work_timer() on the same work item, its try_to_grab_pending()
> will return -ENOENT as the work item is still being canceled by task A
> and flush_work() will also immediately return false as the work item
> is no longer executing.  This puts task B in a busy loop possibly
> preventing task A from executing and clearing the canceling state on
> the work item leading to a hang.
>
> task A                  task B                  worker
>
>                                                 executing work
> __cancel_work_timer()
>   try_to_grab_pending()
>   set work CANCELING
>   flush_work()
>     block for work completion
>                                                 completion, wakes up A
>                         __cancel_work_timer()
>                         while (forever) {
>                           try_to_grab_pending()
>                             -ENOENT as work is being canceled
>                           flush_work()
>                             false as work is no longer executing
>                         }
>
> This patch removes the possible hang by updating __cancel_work_timer()
> to explicitly wait for clearing of CANCELING rather than invoking
> flush_work() after try_to_grab_pending() fails with -ENOENT.  The
> explicit wait uses the matching bit waitqueue for the CANCELING bit.
>
> Link: http://lkml.kernel.org/g/20150206171156.GA8942@axis.com
>
> v2: v1 used wake_up() on bit_waitqueue() which leads to NULL deref if
>     the target bit waitqueue has wait_bit_queue's on it.  Use
>     DEFINE_WAIT_BIT() and __wake_up_bit() instead.  Reported by Tomeu
>     Vizoso.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-by: Rabin Vincent <rabin.vincent@axis.com>
> Cc: Tomeu Vizoso <tomeu.vizoso@gmail.com>
> Cc: stable@vger.kernel.org
> Tested-by: Jesper Nilsson <jesper.nilsson@axis.com>
> Tested-by: Rabin Vincent <rabin.vincent@axis.com>
> ---
>  include/linux/workqueue.h |  3 ++-
>  kernel/workqueue.c        | 37 +++++++++++++++++++++++++++++++++----
>  2 files changed, 35 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
> index 74db135..f597846 100644
> --- a/include/linux/workqueue.h
> +++ b/include/linux/workqueue.h
> @@ -70,7 +70,8 @@ enum {
>         /* data contains off-queue information when !WORK_STRUCT_PWQ */
>         WORK_OFFQ_FLAG_BASE     = WORK_STRUCT_COLOR_SHIFT,
>
> -       WORK_OFFQ_CANCELING     = (1 << WORK_OFFQ_FLAG_BASE),
> +       __WORK_OFFQ_CANCELING   = WORK_OFFQ_FLAG_BASE,
> +       WORK_OFFQ_CANCELING     = (1 << __WORK_OFFQ_CANCELING),
>
>         /*
>          * When a work item is off queue, its high bits point to the last
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index f288493..cfbae1d 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -2730,17 +2730,37 @@ EXPORT_SYMBOL_GPL(flush_work);
>
>  static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
>  {
> +       wait_queue_head_t *waitq = bit_waitqueue(&work->data,
> +                                                __WORK_OFFQ_CANCELING);
>         unsigned long flags;
>         int ret;
>
>         do {
>                 ret = try_to_grab_pending(work, is_dwork, &flags);
>                 /*
> -                * If someone else is canceling, wait for the same event it
> -                * would be waiting for before retrying.
> +                * If someone else is already canceling, wait for it to
> +                * finish.  flush_work() doesn't work for PREEMPT_NONE
> +                * because we may get scheduled between @work's completion
> +                * and the other canceling task resuming and clearing
> +                * CANCELING - flush_work() will return false immediately
> +                * as @work is no longer busy, try_to_grab_pending() will
> +                * return -ENOENT as @work is still being canceled and the
> +                * other canceling task won't be able to clear CANCELING as
> +                * we're hogging the CPU.
> +                *
> +                * Explicitly wait for completion using a bit waitqueue.
> +                * We can't use wait_on_bit() as the CANCELING bit may get
> +                * recycled to point to pwq if @work gets re-queued.
>                  */
> -               if (unlikely(ret == -ENOENT))
> -                       flush_work(work);
> +               if (unlikely(ret == -ENOENT)) {
> +                       DEFINE_WAIT_BIT(wait, &work->data,
> +                                       __WORK_OFFQ_CANCELING);
> +                       prepare_to_wait(waitq, &wait.wait,
> +                                       TASK_UNINTERRUPTIBLE);
> +                       if (work_is_canceling(work))
> +                               schedule();
> +                       finish_wait(waitq, &wait.wait);
> +               }
>         } while (unlikely(ret < 0));
>
>         /* tell other tasks trying to grab @work to back off */
> @@ -2749,6 +2769,15 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
>
>         flush_work(work);
>         clear_work_data(work);
> +
> +       /*
> +        * Paired with prepare_to_wait() above so that either
> +        * __wake_up_bit() sees busy waitq here or !work_is_canceling() is
> +        * visible there.
> +        */
> +       smp_mb();
> +       __wake_up_bit(waitq, &work->data, __WORK_OFFQ_CANCELING);
> +
>         return ret;
>  }
>
> --
> 2.1.0
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH wq/for-4.0-fixes v2] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE
  2015-03-03 13:57             ` Tomeu Vizoso
@ 2015-03-05  9:24               ` Tomeu Vizoso
  2015-03-05  9:36                 ` Tejun Heo
  2015-03-05 13:03                 ` [PATCH v3 wq/for-4.0-fixes] " Tejun Heo
  0 siblings, 2 replies; 17+ messages in thread
From: Tomeu Vizoso @ 2015-03-05  9:24 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jesper Nilsson, Rabin Vincent, Jesper Nilsson, linux-kernel

On 3 March 2015 at 14:57, Tomeu Vizoso <tomeu.vizoso@gmail.com> wrote:
> On 3 March 2015 at 14:45, Tejun Heo <tj@kernel.org> wrote:
>> Hello,
>>
>> Found the culprit.  Plain wake_up() shouldn't be used on
>> bit_waitqueues.  The following patch should fix the issue.  I replaced
>> the patch in the wq branches.
>
> Yup, this looks good from here.

Actually, I'm getting this when unloading mwifiex_sdio:

[  317.201212] Unable to handle kernel paging request at virtual
address f081a680
[  317.208522] pgd = ebb70000
[  317.211302] [f081a680] *pgd=ab868811, *pte=00000000, *ppte=00000000
[  317.217692] Internal error: Oops: 7 [#1] SMP ARM
[  317.222328] Modules linked in: cfg80211(-) bluetooth ipv6 [last
unloaded: mwifiex]
[  317.230019] CPU: 2 PID: 682 Comm: modprobe Not tainted
4.0.0-rc2-next-20150305ccu-00013-g2b3b1a8 #589
[  317.239271] Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)
[  317.245568] task: eb9cd580 ti: ebcea000 task.ti: ebcea000
[  317.251001] PC is at bit_waitqueue+0x38/0x6c
[  317.255312] LR is at __cancel_work_timer+0x28/0x1b0
[  317.260215] pc : [<c028fe18>]    lr : [<c0270d34>]    psr: 20000113
[  317.260215] sp : ebcebe90  ip : ee838000  fp : ebcebe9c
[  317.271737] r10: 00000000  r9 : ebcea000  r8 : c0210ba4
[  317.276991] r7 : 00000081  r6 : 2a07e3b0  r5 : bf134948  r4 : 00000000
[  317.283552] r3 : 9e370001  r2 : 0000c640  r1 : e2692904  r0 : 000ff134
[  317.290112] Flags: nzCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment user
[  317.297286] Control: 10c5387d  Table: abb7006a  DAC: 00000015
[  317.303060] Process modprobe (pid: 682, stack limit = 0xebcea220)
[  317.309186] Stack: (0xebcebe90 to 0xebcec000)
[  317.313568] be80:                                     ebcebefc
ebcebea0 c0270d34 c028fdec
[  317.321781] bea0: c05062d0 c034e57c c1210824 ebe94be0 ebcebedc
eb8f080c bf134830 2a07e3b0
[  317.329998] bec0: 00000081 c0210ba4 ebcebef4 ebcebed8 c0506428
bf139830 bf1348dc 2a07e3b0
[  317.338220] bee0: 00000081 c0210ba4 ebcea000 00000000 ebcebf0c
ebcebf00 c0270ed8 c0270d18
[  317.346437] bf00: ebcebf34 ebcebf10 bf0ed138 c0270ec8 c11cb4f0
bf139830 bf134800 2a07e3b0
[  317.354655] bf20: 00000081 c0210ba4 ebcebf4c ebcebf38 bf125184
bf0ed120 bf1396b8 2a07f76c
[  317.362872] bf40: ebcebfa4 ebcebf50 c02c8b4c bf125158 ebcebf6c
38676663 31313230 00000000
[  317.372482] bf60: ebcebf8c ebcebf70 c0273e98 c0adcdb0 ebcea010
c0210ba4 ebcebfb0 ebcea000
[  317.382086] bf80: ebcebfac ebcebf90 c0214484 00273de0 2a07f738
2a07f738 00000000 ebcebfa8
[  317.391706] bfa0: c0210a00 c02c89a4 2a07f738 2a07f738 2a07f76c
00000800 0fea3100 00000000
[  317.401319] bfc0: 2a07f738 2a07f738 2a07e3b0 00000081 00000000
00000002 2a07e310 becc9db4
[  317.410948] bfe0: 2a07d12c becc89ac 2a061623 b6eb82e6 60000030
2a07f76c 00000000 00000000
[  317.420658] [<c028fe18>] (bit_waitqueue) from [<c0270d34>]
(__cancel_work_timer+0x28/0x1b0)
[  317.430598] [<c0270d34>] (__cancel_work_timer) from [<c0270ed8>]
(cancel_work_sync+0x1c/0x20)
[  317.440672] [<c0270ed8>] (cancel_work_sync) from [<bf0ed138>]
(regulatory_exit+0x24/0x17c [cfg80211])
[  317.451396] [<bf0ed138>] (regulatory_exit [cfg80211]) from
[<bf125184>] (cfg80211_exit+0x38/0x4c [cfg80211])
[  317.462726] [<bf125184>] (cfg80211_exit [cfg80211]) from
[<c02c8b4c>] (SyS_delete_module+0x1b4/0x1f8)
[  317.473411] [<c02c8b4c>] (SyS_delete_module) from [<c0210a00>]
(ret_fast_syscall+0x0/0x34)

Regards,

Tomeu

> Thanks,
>
> Tomeu
>
>> Thanks a lot.
>>
>> ----- 8< -----
>> From bba5a377507efef69214108c0d3790cadfab21d4 Mon Sep 17 00:00:00 2001
>> From: Tejun Heo <tj@kernel.org>
>> Date: Tue, 3 Mar 2015 08:43:09 -0500
>>
>> cancel[_delayed]_work_sync() are implemented using
>> __cancel_work_timer() which grabs the PENDING bit using
>> try_to_grab_pending() and then flushes the work item with PENDING set
>> to prevent the on-going execution of the work item from requeueing
>> itself.
>>
>> try_to_grab_pending() can always grab PENDING bit without blocking
>> except when someone else is doing the above flushing during
>> cancelation.  In that case, try_to_grab_pending() returns -ENOENT.  In
>> this case, __cancel_work_timer() currently invokes flush_work().  The
>> assumption is that the completion of the work item is what the other
>> canceling task would be waiting for too and thus waiting for the same
>> condition and retrying should allow forward progress without excessive
>> busy looping.
>>
>> Unfortunately, this doesn't work if preemption is disabled or the
>> latter task has real time priority.  Let's say task A just got woken
>> up from flush_work() by the completion of the target work item.  If,
>> before task A starts executing, task B gets scheduled and invokes
>> __cancel_work_timer() on the same work item, its try_to_grab_pending()
>> will return -ENOENT as the work item is still being canceled by task A
>> and flush_work() will also immediately return false as the work item
>> is no longer executing.  This puts task B in a busy loop possibly
>> preventing task A from executing and clearing the canceling state on
>> the work item leading to a hang.
>>
>> task A                  task B                  worker
>>
>>                                                 executing work
>> __cancel_work_timer()
>>   try_to_grab_pending()
>>   set work CANCELING
>>   flush_work()
>>     block for work completion
>>                                                 completion, wakes up A
>>                         __cancel_work_timer()
>>                         while (forever) {
>>                           try_to_grab_pending()
>>                             -ENOENT as work is being canceled
>>                           flush_work()
>>                             false as work is no longer executing
>>                         }
>>
>> This patch removes the possible hang by updating __cancel_work_timer()
>> to explicitly wait for clearing of CANCELING rather than invoking
>> flush_work() after try_to_grab_pending() fails with -ENOENT.  The
>> explicit wait uses the matching bit waitqueue for the CANCELING bit.
>>
>> Link: http://lkml.kernel.org/g/20150206171156.GA8942@axis.com
>>
>> v2: v1 used wake_up() on bit_waitqueue() which leads to NULL deref if
>>     the target bit waitqueue has wait_bit_queue's on it.  Use
>>     DEFINE_WAIT_BIT() and __wake_up_bit() instead.  Reported by Tomeu
>>     Vizoso.
>>
>> Signed-off-by: Tejun Heo <tj@kernel.org>
>> Reported-by: Rabin Vincent <rabin.vincent@axis.com>
>> Cc: Tomeu Vizoso <tomeu.vizoso@gmail.com>
>> Cc: stable@vger.kernel.org
>> Tested-by: Jesper Nilsson <jesper.nilsson@axis.com>
>> Tested-by: Rabin Vincent <rabin.vincent@axis.com>
>> ---
>>  include/linux/workqueue.h |  3 ++-
>>  kernel/workqueue.c        | 37 +++++++++++++++++++++++++++++++++----
>>  2 files changed, 35 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
>> index 74db135..f597846 100644
>> --- a/include/linux/workqueue.h
>> +++ b/include/linux/workqueue.h
>> @@ -70,7 +70,8 @@ enum {
>>         /* data contains off-queue information when !WORK_STRUCT_PWQ */
>>         WORK_OFFQ_FLAG_BASE     = WORK_STRUCT_COLOR_SHIFT,
>>
>> -       WORK_OFFQ_CANCELING     = (1 << WORK_OFFQ_FLAG_BASE),
>> +       __WORK_OFFQ_CANCELING   = WORK_OFFQ_FLAG_BASE,
>> +       WORK_OFFQ_CANCELING     = (1 << __WORK_OFFQ_CANCELING),
>>
>>         /*
>>          * When a work item is off queue, its high bits point to the last
>> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
>> index f288493..cfbae1d 100644
>> --- a/kernel/workqueue.c
>> +++ b/kernel/workqueue.c
>> @@ -2730,17 +2730,37 @@ EXPORT_SYMBOL_GPL(flush_work);
>>
>>  static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
>>  {
>> +       wait_queue_head_t *waitq = bit_waitqueue(&work->data,
>> +                                                __WORK_OFFQ_CANCELING);
>>         unsigned long flags;
>>         int ret;
>>
>>         do {
>>                 ret = try_to_grab_pending(work, is_dwork, &flags);
>>                 /*
>> -                * If someone else is canceling, wait for the same event it
>> -                * would be waiting for before retrying.
>> +                * If someone else is already canceling, wait for it to
>> +                * finish.  flush_work() doesn't work for PREEMPT_NONE
>> +                * because we may get scheduled between @work's completion
>> +                * and the other canceling task resuming and clearing
>> +                * CANCELING - flush_work() will return false immediately
>> +                * as @work is no longer busy, try_to_grab_pending() will
>> +                * return -ENOENT as @work is still being canceled and the
>> +                * other canceling task won't be able to clear CANCELING as
>> +                * we're hogging the CPU.
>> +                *
>> +                * Explicitly wait for completion using a bit waitqueue.
>> +                * We can't use wait_on_bit() as the CANCELING bit may get
>> +                * recycled to point to pwq if @work gets re-queued.
>>                  */
>> -               if (unlikely(ret == -ENOENT))
>> -                       flush_work(work);
>> +               if (unlikely(ret == -ENOENT)) {
>> +                       DEFINE_WAIT_BIT(wait, &work->data,
>> +                                       __WORK_OFFQ_CANCELING);
>> +                       prepare_to_wait(waitq, &wait.wait,
>> +                                       TASK_UNINTERRUPTIBLE);
>> +                       if (work_is_canceling(work))
>> +                               schedule();
>> +                       finish_wait(waitq, &wait.wait);
>> +               }
>>         } while (unlikely(ret < 0));
>>
>>         /* tell other tasks trying to grab @work to back off */
>> @@ -2749,6 +2769,15 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
>>
>>         flush_work(work);
>>         clear_work_data(work);
>> +
>> +       /*
>> +        * Paired with prepare_to_wait() above so that either
>> +        * __wake_up_bit() sees busy waitq here or !work_is_canceling() is
>> +        * visible there.
>> +        */
>> +       smp_mb();
>> +       __wake_up_bit(waitq, &work->data, __WORK_OFFQ_CANCELING);
>> +
>>         return ret;
>>  }
>>
>> --
>> 2.1.0
>>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH wq/for-4.0-fixes v2] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE
  2015-03-05  9:24               ` Tomeu Vizoso
@ 2015-03-05  9:36                 ` Tejun Heo
  2015-03-05 11:46                   ` Tejun Heo
  2015-03-05 13:03                 ` [PATCH v3 wq/for-4.0-fixes] " Tejun Heo
  1 sibling, 1 reply; 17+ messages in thread
From: Tejun Heo @ 2015-03-05  9:36 UTC (permalink / raw)
  To: Tomeu Vizoso; +Cc: Jesper Nilsson, Rabin Vincent, Jesper Nilsson, linux-kernel

On Thu, Mar 05, 2015 at 10:24:50AM +0100, Tomeu Vizoso wrote:
...
> [  317.251001] PC is at bit_waitqueue+0x38/0x6c
...
> [  317.420658] [<c028fe18>] (bit_waitqueue) from [<c0270d34>]
> (__cancel_work_timer+0x28/0x1b0)
> [  317.430598] [<c0270d34>] (__cancel_work_timer) from [<c0270ed8>]
> (cancel_work_sync+0x1c/0x20)
> [  317.440672] [<c0270ed8>] (cancel_work_sync) from [<bf0ed138>]
> (regulatory_exit+0x24/0x17c [cfg80211])
> [  317.451396] [<bf0ed138>] (regulatory_exit [cfg80211]) from
> [<bf125184>] (cfg80211_exit+0x38/0x4c [cfg80211])
> [  317.462726] [<bf125184>] (cfg80211_exit [cfg80211]) from
> [<c02c8b4c>] (SyS_delete_module+0x1b4/0x1f8)
> [  317.473411] [<c02c8b4c>] (SyS_delete_module) from [<c0210a00>]
> (ret_fast_syscall+0x0/0x34)

Ah, I think that's from feeding static address to virt_to_page.  :(
Reverted the patch from the branch.  Will think more about what to do.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH wq/for-4.0-fixes v2] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE
  2015-03-05  9:36                 ` Tejun Heo
@ 2015-03-05 11:46                   ` Tejun Heo
  0 siblings, 0 replies; 17+ messages in thread
From: Tejun Heo @ 2015-03-05 11:46 UTC (permalink / raw)
  To: Tomeu Vizoso; +Cc: Jesper Nilsson, Rabin Vincent, Jesper Nilsson, linux-kernel

On Thu, Mar 05, 2015 at 04:36:38AM -0500, Tejun Heo wrote:
> On Thu, Mar 05, 2015 at 10:24:50AM +0100, Tomeu Vizoso wrote:
> ...
> > [  317.251001] PC is at bit_waitqueue+0x38/0x6c
> ...
> > [  317.420658] [<c028fe18>] (bit_waitqueue) from [<c0270d34>]
> > (__cancel_work_timer+0x28/0x1b0)
> > [  317.430598] [<c0270d34>] (__cancel_work_timer) from [<c0270ed8>]
> > (cancel_work_sync+0x1c/0x20)
> > [  317.440672] [<c0270ed8>] (cancel_work_sync) from [<bf0ed138>]
> > (regulatory_exit+0x24/0x17c [cfg80211])
> > [  317.451396] [<bf0ed138>] (regulatory_exit [cfg80211]) from
> > [<bf125184>] (cfg80211_exit+0x38/0x4c [cfg80211])
> > [  317.462726] [<bf125184>] (cfg80211_exit [cfg80211]) from
> > [<c02c8b4c>] (SyS_delete_module+0x1b4/0x1f8)
> > [  317.473411] [<c02c8b4c>] (SyS_delete_module) from [<c0210a00>]
> > (ret_fast_syscall+0x0/0x34)
> 
> Ah, I think that's from feeding static address to virt_to_page.  :(
> Reverted the patch from the branch.  Will think more about what to do.

So, it's from feeding a static address of a module which is allocated
on the vmalloc space to bit_waitqueue() which then tries to find out
the backing page struct which vmalloc area obviously doesn't have.
Currently testing an alternative patch which uses a single waitqueue
w/ a custom wakeup function which can filter the target work item.
Will soon post the new version.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH v3 wq/for-4.0-fixes] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE
  2015-03-05  9:24               ` Tomeu Vizoso
  2015-03-05  9:36                 ` Tejun Heo
@ 2015-03-05 13:03                 ` Tejun Heo
  1 sibling, 0 replies; 17+ messages in thread
From: Tejun Heo @ 2015-03-05 13:03 UTC (permalink / raw)
  To: Tomeu Vizoso; +Cc: Jesper Nilsson, Rabin Vincent, Jesper Nilsson, linux-kernel

cancel[_delayed]_work_sync() are implemented using
__cancel_work_timer() which grabs the PENDING bit using
try_to_grab_pending() and then flushes the work item with PENDING set
to prevent the on-going execution of the work item from requeueing
itself.

try_to_grab_pending() can always grab PENDING bit without blocking
except when someone else is doing the above flushing during
cancelation.  In that case, try_to_grab_pending() returns -ENOENT.  In
this case, __cancel_work_timer() currently invokes flush_work().  The
assumption is that the completion of the work item is what the other
canceling task would be waiting for too and thus waiting for the same
condition and retrying should allow forward progress without excessive
busy looping.

Unfortunately, this doesn't work if preemption is disabled or the
latter task has real time priority.  Let's say task A just got woken
up from flush_work() by the completion of the target work item.  If,
before task A starts executing, task B gets scheduled and invokes
__cancel_work_timer() on the same work item, its try_to_grab_pending()
will return -ENOENT as the work item is still being canceled by task A
and flush_work() will also immediately return false as the work item
is no longer executing.  This puts task B in a busy loop possibly
preventing task A from executing and clearing the canceling state on
the work item leading to a hang.

task A			task B			worker

						executing work
__cancel_work_timer()
  try_to_grab_pending()
  set work CANCELING
  flush_work()
    block for work completion
						completion, wakes up A
			__cancel_work_timer()
			while (forever) {
			  try_to_grab_pending()
			    -ENOENT as work is being canceled
			  flush_work()
			    false as work is no longer executing
			}

This patch removes the possible hang by updating __cancel_work_timer()
to explicitly wait for clearing of CANCELING rather than invoking
flush_work() after try_to_grab_pending() fails with -ENOENT.

Link: http://lkml.kernel.org/g/20150206171156.GA8942@axis.com

v3: bit_waitqueue() can't be used for work items defined in vmalloc
    area.  Switched to custom wake function which matches the target
    work item and exclusive wait and wakeup.

v2: v1 used wake_up() on bit_waitqueue() which leads to NULL deref if
    the target bit waitqueue has wait_bit_queue's on it.  Use
    DEFINE_WAIT_BIT() and __wake_up_bit() instead.  Reported by Tomeu
    Vizoso.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Rabin Vincent <rabin.vincent@axis.com>
Cc: Tomeu Vizoso <tomeu.vizoso@gmail.com>
Cc: stable@vger.kernel.org
Tested-by: Jesper Nilsson <jesper.nilsson@axis.com>
Tested-by: Rabin Vincent <rabin.vincent@axis.com>
---
Hello,

I'd prefer using hashed waitqueues but this has to do for now at
least.  Tomeu, can you please test this patch?

Thanks.

 include/linux/workqueue.h |    3 +-
 kernel/workqueue.c        |   56 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 54 insertions(+), 5 deletions(-)

--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -70,7 +70,8 @@ enum {
 	/* data contains off-queue information when !WORK_STRUCT_PWQ */
 	WORK_OFFQ_FLAG_BASE	= WORK_STRUCT_COLOR_SHIFT,
 
-	WORK_OFFQ_CANCELING	= (1 << WORK_OFFQ_FLAG_BASE),
+	__WORK_OFFQ_CANCELING	= WORK_OFFQ_FLAG_BASE,
+	WORK_OFFQ_CANCELING	= (1 << __WORK_OFFQ_CANCELING),
 
 	/*
 	 * When a work item is off queue, its high bits point to the last
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2728,19 +2728,57 @@ bool flush_work(struct work_struct *work
 }
 EXPORT_SYMBOL_GPL(flush_work);
 
+struct cwt_wait {
+	wait_queue_t		wait;
+	struct work_struct	*work;
+};
+
+static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
+
+	if (cwait->work != key)
+		return 0;
+	return autoremove_wake_function(wait, mode, sync, key);
+}
+
 static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 {
+	static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);
 	unsigned long flags;
 	int ret;
 
 	do {
 		ret = try_to_grab_pending(work, is_dwork, &flags);
 		/*
-		 * If someone else is canceling, wait for the same event it
-		 * would be waiting for before retrying.
+		 * If someone else is already canceling, wait for it to
+		 * finish.  flush_work() doesn't work for PREEMPT_NONE
+		 * because we may get scheduled between @work's completion
+		 * and the other canceling task resuming and clearing
+		 * CANCELING - flush_work() will return false immediately
+		 * as @work is no longer busy, try_to_grab_pending() will
+		 * return -ENOENT as @work is still being canceled and the
+		 * other canceling task won't be able to clear CANCELING as
+		 * we're hogging the CPU.
+		 *
+		 * Let's wait for completion using a waitqueue.  As this
+		 * may lead to the thundering herd problem, use a custom
+		 * wake function which matches @work along with exclusive
+		 * wait and wakeup.
 		 */
-		if (unlikely(ret == -ENOENT))
-			flush_work(work);
+		if (unlikely(ret == -ENOENT)) {
+			struct cwt_wait cwait;
+
+			init_wait(&cwait.wait);
+			cwait.wait.func = cwt_wakefn;
+			cwait.work = work;
+
+			prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
+						  TASK_UNINTERRUPTIBLE);
+			if (work_is_canceling(work))
+				schedule();
+			finish_wait(&cancel_waitq, &cwait.wait);
+		}
 	} while (unlikely(ret < 0));
 
 	/* tell other tasks trying to grab @work to back off */
@@ -2749,6 +2787,16 @@ static bool __cancel_work_timer(struct w
 
 	flush_work(work);
 	clear_work_data(work);
+
+	/*
+	 * Paired with prepare_to_wait() above so that either
+	 * waitqueue_active() is visible here or !work_is_canceling() is
+	 * visible there.
+	 */
+	smp_mb();
+	if (waitqueue_active(&cancel_waitq))
+		__wake_up(&cancel_waitq, TASK_NORMAL, 1, work);
+
 	return ret;
 }
 

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2015-03-05 13:03 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-02-06 17:11 Soft lockups in __cancel_work_timer() Rabin Vincent
2015-02-06 18:01 ` Tejun Heo
2015-02-09 16:15 ` [PATCH for-3.20-fixes] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE Tejun Heo
2015-02-09 16:29   ` Jesper Nilsson
2015-02-10  8:35     ` Jesper Nilsson
2015-02-10  9:33   ` Rabin Vincent
2015-03-02 12:26   ` Jesper Nilsson
2015-03-02 16:21     ` Tejun Heo
2015-03-03  7:52       ` Jesper Nilsson
2015-03-03 10:00       ` Tomeu Vizoso
2015-03-03 13:21         ` Tejun Heo
2015-03-03 13:45           ` [PATCH wq/for-4.0-fixes v2] " Tejun Heo
2015-03-03 13:57             ` Tomeu Vizoso
2015-03-05  9:24               ` Tomeu Vizoso
2015-03-05  9:36                 ` Tejun Heo
2015-03-05 11:46                   ` Tejun Heo
2015-03-05 13:03                 ` [PATCH v3 wq/for-4.0-fixes] " Tejun Heo

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.